class TargetFasta(object):
    '''detection of sRNA target interaction'''

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Working folder that receives the mutated (target) sequences.
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        """Stage the reference fasta files and (re)create the target folder.

        Copies every file of ``ref_files`` into ``out_folder/tmp_reference``,
        splits them per strain via the multiparser, and creates an empty
        ``tmp_tar`` folder for the mutated output.

        Returns the path of the temporary reference folder.
        """
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Bug fix: the old code tested `"tmp_tar" in os.listdir(out_folder)`,
        # but the folder to clear is tar_folder/tmp (self.folders["tmp_tar"]).
        # That check never matched, so os.mkdir below could fail with
        # FileExistsError on a rerun. Test the actual path instead.
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files,
                         output, out_folder):
        """Apply the mutation table to the references and emit target fastas.

        Each entry of ``output`` has the form
        ``<out_filename>:<strain1>,<strain2>,...``; the per-strain mutated
        sequences are concatenated into ``<out_filename>``. Strains with no
        mutated fasta are reported and skipped. All temporary folders are
        removed afterwards.
        """
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Transfering to target fasta")
        for file_ in output:
            first = True
            datas = file_.split(":")
            filename = datas[0]
            strains = datas[1].split(",")
            # `with` guarantees the output handle is closed even if a
            # read error occurs mid-way (the old code leaked it on error).
            with open(filename, "w") as out:
                for strain in strains:
                    if strain + ".fa" in os.listdir(
                            self.folders["tmp_tar"]):
                        if first:
                            first = False
                        else:
                            # Separate consecutive strain records.
                            out.write("\n")
                        with open(os.path.join(
                                self.folders["tmp_tar"],
                                strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        print("Error: No fasta information of "
                              "{0}.fa".format(strain))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta file to remapping again.")
class Screen(object):
    '''generation of screenshot'''

    def __init__(self, args_sc, out_folder):
        self.helper = Helper()
        args_sc.output_folder = out_folder
        # Strain name = fasta filename without its extension.
        filename = args_sc.fasta.split("/")[-1]
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(os.path.join(args_sc.output_folder,
                                                   self.strain))
        self.forward_file = os.path.join(args_sc.output_folder,
                                         self.strain, "forward")
        self.reverse_file = os.path.join(args_sc.output_folder,
                                         self.strain, "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, lib_dict):
        """Sort TEX+/- wig libraries of one strand into lib_dict.

        Every "tex" library is appended to the tex bucket, and each
        "notex" library sharing its condition (index 2) and replicate
        (index 3) is appended to the matching notex bucket.
        """
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(flib[0])
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(nlib[0])

    def _check_wig_name(self, lib_datas, log):
        """Exit with an error if a library file does not end with .wig."""
        if not lib_datas[0].endswith(".wig"):
            log.write("Wiggle files should end with .wig.\n")
            print("Error: Wiggle files should end with .wig!")
            sys.exit()

    def screenshot(self, args_sc, log):
        """Build the forward/reverse IGV batch scripts for screenshots.

        Raises SystemExit if no libraries are assigned or a wig filename
        is malformed.
        """
        # Bug fix: validate BEFORE generating anything. The original code
        # only checked for missing libraries after calling gen_screenshot,
        # so empty batch scripts were written before the error exit.
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            log.write("No wig files can be found.\n")
            print("Error: There is no wig file assigned!")
            sys.exit()
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [],
                    "ff": [], "rf": []}
        f_texs = []
        r_texs = []
        if args_sc.tlibs is not None:
            for lib in args_sc.tlibs:
                lib_datas = lib.split(":")
                self._check_wig_name(lib_datas, log)
                if lib_datas[-1] == "+":
                    f_texs.append(lib_datas)
                else:
                    r_texs.append(lib_datas)
            # Group tex/notex pairs together (by type, condition, replicate).
            f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
            r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
            self._import_libs(f_texs, "+", lib_dict)
            self._import_libs(r_texs, "-", lib_dict)
        if args_sc.flibs is not None:
            for lib in args_sc.flibs:
                lib_datas = lib.split(":")
                self._check_wig_name(lib_datas, log)
                if lib_datas[-1] == "+":
                    lib_dict["ff"].append(lib_datas[0])
                else:
                    lib_dict["rf"].append(lib_datas[0])
        log.write("Running gen_screenshots.py to generate IGV batch script.\n")
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
        log.write("\t" + self.forward_file + ".txt is generated.\n")
        log.write("\t" + self.reverse_file + ".txt is generated.\n")
class Multiparser(object):
    """Split and merge annotation/sequence/wiggle files per strain.

    The ``parser_*`` methods split multi-record input files into
    one-file-per-strain copies (stored both in a ``<name>_folder``
    directory and in a shared ``tmp`` directory); the ``combine_*``
    methods do the reverse, re-merging per-strain files that belong to
    the same reference genome.
    """

    def __init__(self):
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Scratch filenames used while merging before the final move.
        self.tmp_fa = "tmp.fa"
        self.tmp_gff = "tmp.gff"
        self.tmp_wig_forward = "tmp_forward.wig"
        self.tmp_wig_reverse = "tmp_reverse.wig"

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        '''combine multiple fasta files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        # ref_feature becomes the "_<feature>" suffix used in filenames.
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                # Derive the genome prefix from the "<file>_folder" name by
                # stripping the recognised extension / feature suffix.
                datas = folder.split("_folder")
                if ref_feature == "":
                    prefix = datas[0][:-4]
                elif ref_feature == "_fasta":
                    if datas[0].endswith(".fa"):
                        prefix = datas[0][:-3]
                    elif datas[0].endswith(".fna"):
                        prefix = datas[0][:-4]
                    elif datas[0].endswith(".fasta"):
                        prefix = datas[0][:-6]
                else:
                    datas = datas[0][:-4]
                    datas = datas.split(ref_feature)
                    prefix = datas[0]
                print("Merging fasta files of " + prefix)
                # Collect the strain names belonging to this reference.
                for file_ in os.listdir("/".join([ref_folder, folder])):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                # Append every matching per-strain fasta to the scratch file.
                for tar in os.listdir(tar_folder):
                    if tar.endswith(".fa") or \
                       tar.endswith(".fna") or \
                       tar.endswith(".fasta"):
                        filename = ".".join((tar.split("."))[:-1])
                        for file_ in files:
                            if filename == file_:
                                self.helper.merge_file(
                                    os.path.join(tar_folder, tar),
                                    os.path.join(tar_folder, self.tmp_fa))
                                change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_fa),
                                os.path.join(tar_merge, prefix + ".fa"))
        # Replace the per-strain files with the merged ones.
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def get_prefix(self, folder, ref_feature):
        """Return the genome prefix encoded in a "<file>_folder" name.

        NOTE(review): duplicates the inline prefix logic in combine_fasta
        and combine_gff — candidates for consolidation.
        """
        datas = folder.split("_folder")
        if ref_feature == "":
            prefix = datas[0][:-4]
        elif ref_feature == "_fasta":
            if datas[0].endswith(".fa"):
                prefix = datas[0][:-3]
            elif datas[0].endswith(".fna"):
                prefix = datas[0][:-4]
            elif datas[0].endswith(".fasta"):
                prefix = datas[0][:-6]
        else:
            datas = datas[0][:-4]
            datas = datas.split(ref_feature)
            prefix = datas[0]
        return prefix

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        '''combine multiple wig files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change_f = False
        change_r = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging wig files of " + prefix)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    # Per-strain wigs are named "<track>_STRAIN_<strain>.wig".
                    filename = tar.split("_STRAIN_")
                    for file_ in files:
                        if (tar.endswith(".wig")) and (
                                file_ == filename[-1][:-4]):
                            # Route the track to forward or reverse scratch
                            # file depending on the library's strand flag.
                            for lib in libs:
                                if (filename[0] in lib) and (
                                        lib[-1] == "+"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(
                                            tar_folder,
                                            self.tmp_wig_forward))
                                    change_f = True
                                elif (filename[0] in lib) and (
                                        lib[-1] == "-"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(
                                            tar_folder,
                                            self.tmp_wig_reverse))
                                    change_r = True
                # Both strands must be present, otherwise the inputs are
                # inconsistent and the pipeline aborts.
                if change_f and change_r:
                    change_f = False
                    change_r = False
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_wig_forward),
                        os.path.join(tar_merge, prefix + "_forward.wig"))
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_wig_reverse),
                        os.path.join(tar_merge, prefix + "_reverse.wig"))
                else:
                    print("Error: comparing input files of {0} failed. "
                          "Please check the seq IDs of all gff and fasta "
                          "files, they should be the same.\nPlease "
                          "also check the wiggle files which should contain "
                          "forward and reverse files.".format(prefix))
                    sys.exit()
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        '''combine multiple gff files'''
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        # Both feature names become "_<feature>" filename suffixes.
        if tar_feature is None:
            tar_feature = ""
        else:
            tar_feature = "_" + tar_feature
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                # Same prefix derivation as combine_fasta / get_prefix.
                datas = folder.split("_folder")
                if ref_feature == "":
                    prefix = datas[0][:-4]
                elif ref_feature == "_fasta":
                    if datas[0].endswith(".fa"):
                        prefix = datas[0][:-3]
                    elif datas[0].endswith(".fna"):
                        prefix = datas[0][:-4]
                    elif datas[0].endswith(".fasta"):
                        prefix = datas[0][:-6]
                else:
                    datas = datas[0][:-4]
                    datas = datas.split(ref_feature)
                    prefix = datas[0]
                print("Merging gff files of " + prefix + tar_feature)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    for file_ in files:
                        if (".gff" in tar) and (
                                file_ + tar_feature == tar[:-4]):
                            self.helper.merge_file(
                                os.path.join(tar_folder, tar),
                                os.path.join(tar_folder, self.tmp_gff))
                            change = True
                if change:
                    change = False
                    shutil.move(
                        os.path.join(tar_folder, self.tmp_gff),
                        os.path.join(tar_folder, "merge_tmp",
                                     prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        '''Split every fasta file in `fastas` into per-strain files.

        Each record goes to "<fasta>_folder/<strain>.fa" and to
        "<fastas>/tmp/<strain>.fa". Headers are normalised first via
        SeqEditer.modify_header.
        '''
        par_tmp = os.path.join(fastas, "tmp")
        first = True
        out = None
        out_t = None
        detect = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fasta") or
                    fasta.endswith(".fa") or
                    fasta.endswith(".fna")):
                detect = True
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        if not detect:
            # NOTE(review): "conatin" is a typo in this runtime message.
            print("Error: there are folders which conatin no fasta files! "
                  "The files should end with .fa or .fna or .fasta!")
            sys.exit()
        for fasta in os.listdir(fastas):
            if ("_folder" not in fasta) and ("tmp" != fasta):
                if (fasta.endswith(".fa")) or \
                   (fasta.endswith(".fna")) or \
                   (fasta.endswith(".fasta")):
                    out_path = os.path.join(fastas, fasta + "_folder")
                    print("Parsing " + fasta)
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(fastas, fasta), "r") as f_f:
                        for line in f_f:
                            if line[0] == ">":
                                line = line.strip()
                                # NCBI-style ">gi|...|ref|ACC|..." headers:
                                # the accession is field 3.
                                if ("|" in line) and (
                                        len(line.split("|")) > 4):
                                    strain = line.split("|")
                                    name = strain[3]
                                else:
                                    name = line[1:]
                                # Close the previous record's files before
                                # opening the next pair.
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(
                                    out_path, name + ".fa"), "w")
                                out_t = open(os.path.join(
                                    par_tmp, name + ".fa"), "w")
                                out.write(">" + name + "\n")
                                out_t.write(">" + name + "\n")
                            else:
                                out.write(line)
                                out_t.write(line)
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_gff(self, gff_folder, feature):
        '''Split every gff file in `gff_folder` into per-strain files.

        Rows are grouped by sequence ID (column 0) after sorting; each
        group goes to "<file>_folder/<seqid><feature>.gff" and to
        "<gff_folder>/tmp/<seqid><feature>.gff".
        '''
        par_tmp = os.path.join(gff_folder, "tmp")
        out = None
        out_t = None
        first = True
        detect = False
        if feature is None:
            feature = ""
        else:
            feature = "_" + feature
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            pre_seq_id = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(gff_folder, filename + "_folder")
                if ".gff" in filename:
                    detect = True
                    print("Parsing " + filename)
                    self.helper.check_make_folder(out_path)
                    # Sort so all rows of one seq ID are contiguous.
                    self.helper.sort_gff(
                        os.path.join(gff_folder, filename),
                        os.path.join(gff_folder, "tmp.gff"))
                    f_h = open(os.path.join(gff_folder, "tmp.gff"), "r")
                    for row in csv.reader(f_h, delimiter="\t"):
                        if row[0].startswith("#"):
                            continue
                        else:
                            if pre_seq_id == row[0]:
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                            else:
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(
                                    out_path,
                                    row[0] + feature + ".gff"), "w")
                                out_t = open(os.path.join(
                                    par_tmp,
                                    row[0] + feature + ".gff"), "w")
                                pre_seq_id = row[0]
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                    f_h.close()
        if not detect:
            print("Error: There are folders which contain no gff3 files! "
                  "The files should end with .gff!")
            sys.exit()
        if os.path.exists(os.path.join(gff_folder, "tmp.gff")):
            os.remove(os.path.join(gff_folder, "tmp.gff"))
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_wig(self, wig_folder):
        '''Split every wig file in `wig_folder` into per-strain files.

        A new output pair is opened at each "variableStep" declaration,
        named "<file>_STRAIN_<strain>.wig"; the preceding "track" line is
        repeated at the top of every per-strain file.
        '''
        par_tmp = os.path.join(wig_folder, "tmp")
        first = True
        out = None
        out_t = None
        detect = False
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            track_info = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(wig_folder, filename + "_folder")
                if ".wig" in filename:
                    detect = True
                    print("Parsing {0}".format(filename))
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(wig_folder, filename),
                              "r") as w_f:
                        for line in w_f:
                            line = line.split(" ")
                            if (line[0] == "track"):
                                # Remember the track header to replay it
                                # into each per-strain file.
                                track_info = " ".join(line)
                            if (line[0] == "variableStep"):
                                # "variableStep chrom=<strain> ..."
                                strain = line[1].split("=")
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open("".join([
                                    os.path.join(out_path, filename[:-4]),
                                    "_STRAIN_", strain[1], ".wig"]), "w")
                                out_t = open("".join([
                                    os.path.join(wig_folder, "tmp",
                                                 filename[:-4]),
                                    "_STRAIN_", strain[1], ".wig"]), "w")
                                if track_info != "":
                                    out.write(track_info)
                                    out_t.write(track_info)
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
                            if (line[0] != "track") and (
                                    line[0] != "variableStep"):
                                # Data lines go to the current strain files.
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
        if not detect:
            print("Error: There are folders which contain no wig files! "
                  "The files should end with .wig!")
            sys.exit()
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()
class MEME(object):
    '''detection of promoter'''

    def __init__(self, args_pro):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder,
                                      "fasta_classes")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # Scratch fasta files, one per TSS class. Note "all_no_orph" and
        # "all" are bare filenames (joined with tmp_folder at use sites),
        # unlike the other entries which are full paths.
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder,
                                               "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        """Return out_path/type_, removing a stale `folder` inside it."""
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                # Run MEME in parallel via MPI.
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                meme_folder, folder),
                os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            # "n" selects the nucleotide alphabet for GLAM2.
            log.write(" ".join([args_pro.glam2_path, "-O",
                      os.path.join(glam_folder, folder), "-w",
                      str(width), "-b", str(width), "-r",
                      str(args_pro.num_motif), "-n",
                      str(args_pro.end_run), "n",
                      os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path, "-O",
                  os.path.join(glam_folder, folder),
                  "-w", str(width), "-b", str(width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME with range of width'''
        # width is "<min>-<max>".
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]),
                           "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                meme_folder, folder),
                os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            log.write(" ".join([args_pro.glam2_path, "-O",
                      os.path.join(glam_folder, folder), "-a",
                      str(min_width), "-b", str(max_width), "-r",
                      str(args_pro.num_motif), "-n",
                      str(args_pro.end_run), "n",
                      os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path, "-O",
                  os.path.join(glam_folder, folder),
                  "-a", str(min_width), "-b", str(max_width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        """Return the fasta filename in fasta_path matching `prefix`.

        NOTE(review): if nothing matches, the last listed file (or an
        unbound name on an empty folder) is returned — relies on a match
        always existing.
        """
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        """Validate attribute uniqueness of every gff file in `gffs`."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        """Merge the per-class scratch fastas and move all of them into
        input_path as "<prefix>_allgenome_<class>.fa" files."""
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa = pri+sec+inter+anti; tmp_all = tmp_fa + orphan.
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        # Deduplicate records before publishing.
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"], "_".join([
            out_prefix, "allgenome_primary.fa"]))
        shutil.move(self.fastas["sec"], "_".join([
            out_prefix, "allgenome_secondary.fa"]))
        shutil.move(self.fastas["inter"], "_".join([
            out_prefix, "allgenome_internal.fa"]))
        shutil.move(self.fastas["anti"], "_".join([
            out_prefix, "allgenome_antisense.fa"]))
        shutil.move(self.fastas["orph"], "_".join([
            out_prefix, "allgenome_orphan.fa"]))
        shutil.move(all_type, "_".join([
            out_prefix, "allgenome_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allgenome_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        """Split each "allgenome" fasta into per-strain files.

        Headers look like "<x>_<y>_<strain parts...>"; the strain name is
        everything from the third underscore-field on. Single-strain
        splits are deleted again (the allgenome file already covers them).
        """
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allgenome")
                                if out is not None:
                                    out.close()
                                # "a": a strain may reappear in later
                                # blocks of the same file.
                                out = open(os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain <= 1:
                    os.remove(os.path.join(input_path,
                              "".join([filename[0], strain,
                                       filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        """Run MEME and/or GLAM2 on every selected TSS-class fasta of
        every genome prefix, for every requested motif width."""
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(
                    os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(
                    os.path.join(out_path, "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(
                    os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(
                    os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                # Match the TSS class encoded in the filename tail against
                # the classes the user selected ("all_types" and
                # "without_orphan" span two underscore fields).
                if (names[-1] in input_fastas) or (
                        ("_".join(names[-2:]) == "all_types") and (
                        "all_types" in input_fastas)) or (
                        ("_".join(names[-2:]) == "without_orphan") and (
                        "without_orphan" in input_fastas)):
                    for width in args_pro.widths:
                        print("Computing promoters of {0} - {1}".format(
                            fasta, width))
                        log.write("Computing promoters of {0} - "
                                  "length {1}.\n".format(fasta, width))
                        # "min-max" ranges use the small-motif variant.
                        if "-" in width:
                            self._run_small_motif(input_path, out_path,
                                                  filename, fasta, width,
                                                  args_pro, log)
                        else:
                            self._run_normal_motif(input_path, out_path,
                                                   filename, fasta, width,
                                                   args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 "
                      "are generated "
                      "and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter'''
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        else:
            # TSSs were re-classified into TSS_classes first.
            for tss in os.listdir(os.path.join(
                    args_pro.output_folder, "TSS_classes")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        # The merged TSS file is already classified; treat it as "source".
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        # Clean every temporary folder created during the run.
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        if "tmp" in os.listdir(os.getcwd()):
            shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (
                    program.lower() == "meme"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "MEME")):
                    csv_file = os.path.join(output_folder, strain,
                                            "MEME", folder, "meme.csv")
                    gen_promoter_table(os.path.join(
                        output_folder, strain, "MEME", folder,
                        "meme.txt"), csv_file, tss_file, "meme")
                    log.write("\t" + csv_file + "\n")
            if (program.lower() == "both") or (
                    program.lower() == "glam2"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "GLAM2")):
                    csv_file = os.path.join(output_folder, strain,
                                            "GLAM2", folder, "glam2.csv")
                    gen_promoter_table(os.path.join(
                        output_folder, strain, "GLAM2", folder,
                        "glam2.txt"), csv_file, tss_file, "glam2")
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            # TSS gff already carries class information.
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            # Classification requires a genome annotation.
            if (args_pro.gffs is None):
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder,
                                      "TSS_classes"))
            print("Classifying TSSs and extracting sequence "
                  "of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        """Map the numeric --use_tss codes (1..7) to TSS class names."""
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) == 1:
                input_fastas.append("all_types")
            elif int(tss) == 2:
                input_fastas.append("primary")
            elif int(tss) == 3:
                input_fastas.append("secondary")
            elif int(tss) == 4:
                input_fastas.append("internal")
            elif int(tss) == 5:
                input_fastas.append("antisense")
            elif int(tss) == 6:
                input_fastas.append("orphan")
            elif int(tss) == 7:
                input_fastas.append("without_orphan")
            else:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
        return input_fastas

    def run_meme(self, args_pro, log):
        """Top-level driver: prepare inputs, extract TSS upstream
        sequences, run the motif tools, and emit the promoter tables."""
        # Remove leftovers of a previous combined run.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            # Also build the cross-genome "allfasta" data set.
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(
                    self.out_fasta, "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
class CircRNADetection(object):
    '''Detection of circRNA.

    Orchestrates the circular-RNA pipeline: decompress raw reads, align
    them with segemehl, convert/merge/sort the alignments with samtools,
    detect splice sites with testrealign.x, merge the resulting bed files
    per genome and finally produce statistics, csv tables and gff files.
    All methods perform filesystem and/or subprocess side effects.
    '''

    def __init__(self, args_circ):
        # Output sub-folders (created/used later by the pipeline steps).
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Filename fragments of testrealign's bed outputs.
        self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed", "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                # Process already finished after wait(); kill() is a
                # belt-and-braces cleanup and may raise if it is gone.
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        '''Uncompress .bz2/.gz read files in place.

        Returns a list of {"sample", "files", "zips"} dicts where "zips"
        holds the newly created uncompressed files (for later cleanup).
        '''
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            # NOTE(review): tmp_datas aliases reads["files"] and is
            # appended to while `for read in reads["files"]` iterates the
            # same list object -- the loop also visits the freshly added
            # uncompressed names (harmless here since they no longer end
            # with .bz2/.gz, but confirm this is intended).
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    # Ensure the uncompressed file carries a recognized
                    # sequence-file extension.
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        '''Build the segemehl index (.idx) for one genome fasta.'''
        log.write(" ".join([segemehl_path,
                            "-x", os.path.join(fasta_path, index),
                            "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read, sam_file,
                            log_file, fasta_prefix, log):
        '''Start one segemehl alignment as a background process (Popen).'''
        out = open(os.path.join(self.alignment_path,
                                fasta_prefix, sam_file), "w")
        # NOTE(review): this rebinds the `log` parameter (the pipeline
        # log) to segemehl's stderr file, so the command line below is
        # written into the per-read .log file instead of the main run
        # log -- confirm whether that shadowing is intended.
        log = open(os.path.join(self.alignment_path,
                                fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                            "-i", os.path.join(self.fasta_path, index),
                            "-d", os.path.join(self.fasta_path, fasta),
                            "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''Align the reads against every genome with segemehl.

        Runs up to args_circ.cores alignments in parallel. If the bam
        files are provided instead of raw reads, this step is skipped by
        the caller. Returns (align_files, prefixs).
        '''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write(
            "Please make sure the version of segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    if read_name.endswith(".fa") or \
                            read_name.endswith(".fna") or \
                            read_name.endswith(".fasta") or \
                            read_name.endswith(".fq") or \
                            read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join(
                            [read_prefix, fasta_prefix + ".sam"])
                        log_file = "_".join(
                            [read_prefix, fasta_prefix + ".log"])
                        align_files.append("_".join(
                            [read_prefix, fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        # Throttle: wait once `cores` alignments run.
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
            self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(
                    os.path.join(self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam,
                                  out_bam, log):
        '''samtools view -bS: convert one SAM file to BAM.'''
        log.write(" ".join([samtools_path, "view",
                            "-bS", pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        '''Convert all SAMs in a folder to BAMs; track cleanup lists.

        Returns (bam_files, convert_ones, remove_ones) where
        convert_ones are BAMs created here that should be deleted later
        and remove_ones are the source SAMs to delete.
        '''
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write(
            "Please make sure the version of Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(
                    sub_alignment_path, file_) + "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        '''Merge, sort and SAM-convert the BAMs of every sample.'''
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join(
                [prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                # Single input: nothing to merge, just copy.
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                file_line = " ".join(bam_data["files"])
                log.write(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]) + "\n")
                os.system(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(out_folder, "_".join(
                [prefix, bam_data["sample"] + "_sort.bam"]))
            log.write(" ".join([samtools_path, "sort", "-o",
                                sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                                sort_sample.replace(".bam", ".sam"),
                                sort_sample]) + "\n")
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
        log.write("Done!\n")
        # NOTE(review): only the last sample's .sam is logged here
        # (sort_sample is the loop variable) -- confirm intended.
        log.write("\t" + sort_sample.replace(".bam", ".sam") +
                  " is generated.\n")

    def _merge_sort_aligment_file(self, bam_datas, read_datas,
                                  samtools_path, out_folder, convert_ones,
                                  tmp_reads, remove_ones, prefix, log):
        '''Build the per-sample BAM lists, then merge/sort/convert them.

        Three cases: only reads given, reads + BAMs given, only BAMs
        given. Afterwards deletes the intermediate BAM/SAM files.
        '''
        if bam_datas is None:
            # Only raw reads: derive the BAM names from the read names.
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(
                            read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            # Both given: extend the provided BAM lists with the BAMs
            # produced from the matching sample's reads.
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''Run testrealign.x on every sorted SAM to detect splice sites.'''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write(
            "Please make sure the version of Segemehl is at least 0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. If it does not "
                  "exists, please reinstall your Segemehl via using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(sub_splice_path,
                                       sample_prefix + "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis.

        Groups testrealign's per-sequence bed outputs by genome fasta
        (one fasta can hold several sequence headers) and merges them per
        sample. Returns (samples, fa_prefixs).
        '''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            if self.splices["file"] in splice:
                                # The part left after stripping the header
                                # and the fixed suffix is the sample tag.
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join(
                        [fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join(
                        [fasta_prefix + sample + self.trans["file"]]))
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(
                                os.path.join(bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        # NOTE(review): `samples` here is the list from the LAST fasta
        # iterated (and is unbound if no fasta matched) -- confirm every
        # genome shares the same sample set.
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write(
            "Running circRNA.py to do statistics and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(
                os.path.join(self.gff_folder, prefix))
            self.helper.check_make_folder(
                os.path.join(self.splice_path, prefix))
            # Move the merged (non-tmp) bed files into the splice folder.
            for bed in os.listdir(os.path.join(
                    args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                detect_circrna(splice_file,
                               os.path.join(self.gff_path, prefix + ".gff"),
                               csv_all, args_circ, stat_file)
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample,
                                          "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        '''Parse "sample:file1,file2,..." strings; exit on bad format or
        missing files. Returns a list of {"sample", "files"} dicts.'''
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        '''Attach read-derived BAM paths to the matching sample entries
        (or build the sample entries from the reads if none given).'''
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(self.alignment_path,
                                                  prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        '''Delete temporary parser folders, intermediate BAMs and the
        per-genome working folders.'''
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            # NOTE(review): "emtpy" typo in this log message.
            log.write("--bam_files and --read_files can not be "
                      "both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            # Raw reads given: uncompress them and run the alignment.
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            # Only BAMs given: derive the genome prefixes from the fastas.
            align_files = None
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                if fasta.endswith(".fa"):
                    fasta_prefix = fasta.replace(".fa", "")
                    prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(
                    self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder, convert_ones, tmp_reads,
                remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # Remove the uncompressed copies created by _deal_zip_file.
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)
class CircRNADetection(object):
    '''Older variant of the circRNA-detection pipeline.

    NOTE(review): this module defines `CircRNADetection` twice; being
    the later definition, this one shadows the earlier class at import
    time -- confirm which version is actually meant to be kept.
    This variant works folder-based (read_folder/normal_bams/frag_bams)
    instead of sample-based, and does not take a `log` object.
    '''

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Names of testrealign's bed outputs and of the merged versions.
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all",
                        "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all",
                      "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam",
                     "sort": "whole_reads_sort"}
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            # NOTE(review): both branches assign the same path; the
            # `align` check only matters for the missing-fasta error.
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''Wait for all subprocesses, close their pipes and clean up.'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        '''Uncompress every .bz2/.gz read file in read_folder.

        Returns the list of uncompressed file paths (for later cleanup).
        '''
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        '''Build the segemehl index (.idx) for one genome fasta.'''
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        '''Start one segemehl alignment as a background process.'''
        out = open(os.path.join(self.alignment_path,
                                fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                                fasta_prefix, log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read), "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        '''Align every read file against every genome with segemehl,
        up to args_circ.cores processes in parallel.'''
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(
                os.path.join(self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                num_process += 1
                # NOTE(review): the last test is endswith("fasta")
                # without a dot, unlike ".fa"/".fna" -- confirm intended.
                if read.endswith(".fa") or \
                        read.endswith(".fna") or \
                        read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix,
                                         fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix,
                                         fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix,
                                                 fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                        args_circ, index, fasta, read,
                        sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        '''samtools view -bS: convert one SAM file to BAM.'''
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files):
        '''Convert all SAMs in a folder to BAMs; track cleanup lists.'''
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        '''Merge all BAMs into one whole-reads BAM and sort it.'''
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam"),
              whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path,
                                  sub_alignment_path):
        '''Convert the sorted whole-reads BAM back to SAM (with header).'''
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        '''Merge/sort/convert the BAMs, then delete intermediates and
        any temporarily uncompressed reads.'''
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path, sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        '''Run testrealign.x on the whole-reads SAM of one genome.'''
        self.helper.check_make_folder(
            os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
            os.path.join(segemehl_path, "testrealign.x"),
            "-d", os.path.join(self.fasta_path, prefix + ".fa"),
            "-q", os.path.join(sub_alignment_path,
                               self.bams["sort"] + ".sam"), "-n"])
        os.system(command + " 2>" + err_log)
        # testrealign writes its bed files into the current directory.
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        '''Collect the per-header bed files of each genome fasta and
        merge them into *_all.bed files. Returns the genome prefixes.'''
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                # Working folder in the current directory (cleaned up by
                # the caller).
                self.helper.check_make_folder(
                    os.path.join(os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.splices["file"]),
                        os.path.join(fasta_prefix, "_".join(
                            [self.splices["splice"], header + ".bed"])))
                    shutil.copyfile(
                        os.path.join(splice_path, header,
                                     self.trans["file"]),
                        os.path.join(fasta_prefix, "_".join(
                            [self.trans["trans"], header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix,
                                         self.trans["all_file"])
                if len(headers) > 1:
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"] in file_) and (
                                self.splices["all"] not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_),
                                out_splice)
                        elif (self.trans["trans"] in file_) and (
                                self.trans["all"] not in file_):
                            self.helper.merge_file(
                                os.path.join(fasta_prefix, file_),
                                out_trans)
                else:
                    shutil.move(
                        os.path.join(fasta_prefix, "_".join(
                            [self.splices["splice"],
                             headers[0] + ".bed"])),
                        out_splice)
                    shutil.move(
                        os.path.join(fasta_prefix, "_".join(
                            [self.trans["trans"], headers[0] + ".bed"])),
                        out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        '''Compare circRNA candidates with the annotation and emit the
        csv statistics plus the all/best gff files per genome.'''
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(
                os.path.join(self.gff_folder, prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(
                os.path.join(self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(
                    os.path.join(self.splice_path, prefix)):
                detect_circrna(
                    os.path.join(self.splice_path, prefix,
                                 self.splices["all_file"]),
                    os.path.join(self.gff_path, prefix + ".gff"),
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(args_circ.stat_folder,
                                 "_".join(["stat_circRNA",
                                           prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        '''Pick the folder holding the BAMs to merge.

        If both normal and fragmented BAM folders are given, the
        fragmented ones are copied into the normal folder first (and
        remembered in remove_frags for later cleanup). Exits if neither
        folder is given. Returns (merge_folder, remove_frags, bam_files).
        '''
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(
                        os.path.join(args_circ.frag_bams, frag),
                        os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        '''Entry point: run the whole folder-based circRNA pipeline.'''
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.align:
            # Alignment mode: uncompress reads and run segemehl.
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            # BAM mode: use the provided BAM folders directly.
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(
                    self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(sub_alignment_path,
                                          args_circ.samtools_path,
                                          align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        if (not args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class SNPCalling(object):
    '''Detection of SNPs with samtools/bcftools.

    Merges the input BAM files, runs ``samtools mpileup`` with the chosen
    BAQ mode(s), calls variants with ``bcftools call`` and generates SNP
    tables/statistics per genome.
    '''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Output layout depends on whether reads were mapped to the
        # reference or to the (mutation-corrected) target genome.
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {"table": os.path.join(
                            args_snp.out_folder, file_type, "SNP_table"),
                        "raw": os.path.join(
                            args_snp.out_folder, file_type,
                            "SNP_raw_outputs"),
                        "tmp": os.path.join(args_snp.out_folder, "tmp_bcf")}
        # Remove leftovers of a previous (possibly aborted) run.
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam")}
        self.header = os.path.join(args_snp.out_folder, "header")
        # Maps the program index names to the output folder suffixes.
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _import_bam(self, bam_folder, bams):
        """Append every *.bam path in bam_folder to bams; return the count."""
        num_bam = 0
        for bam in os.listdir(bam_folder):
            if bam.endswith(".bam"):
                num_bam += 1
                bams.append(os.path.join(bam_folder, bam))
        return num_bam

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_, prefix,
                        bam_number, table_path, args_snp):
        """Generate the SNP table/statistics and move plots to stat_path."""
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_file = os.path.join(self.stat_path, "_".join([
            "stat", "_".join([prefix, self.baqs[type_]]), "SNP.csv"]))
        snp_detect(fasta, snp, out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_file, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _run_tools(self, fasta_file, out_bcf, out_raw_prefix, type_,
                   args_snp):
        """Run samtools mpileup (BAQ mode per type_) and bcftools call.

        Returns the path of the produced VCF file.
        """
        if type_ == "with":
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-ugf", fasta_file, self.bams["sort"],
                  "--ignore-RG"], stdout=out_bcf)
        elif type_ == "without":
            # -B disables BAQ computation.
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-B", "-ugf", fasta_file, self.bams["sort"],
                  "--ignore-RG"], stdout=out_bcf)
        elif type_ == "extend":
            # -E recalculates BAQ on the fly (extended BAQ).
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-E", "-ugf", fasta_file, self.bams["sort"],
                  "--ignore-RG"], stdout=out_bcf)
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        # chrom "1" -> haploid calling (--ploidy 1); "2" -> default diploid.
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        """Run one BAQ mode end-to-end for one genome."""
        out_bcf = open(self.outputs["tmp"], "w")
        out_vcf = self._run_tools(fasta_file, out_bcf,
                                  file_prefixs["raw_prefix"], type_, args_snp)
        self.helper.check_make_folder(
            os.path.join(self.seq_path, self.baqs[type_], prefix))
        self._transcript_snp(
            fasta_file, out_vcf,
            "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
            type_, prefix, bam_number, table_path, args_snp)
        out_bcf.close()

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        """Dispatch the requested program indexes to their BAQ modes."""
        for index in args_snp.program:
            if index == "1":
                type_ = "with"
                print("Running SNP calling with BAQ...")
            elif index == "2":
                type_ = "without"
                print("Running SNP calling without BAQ...")
            elif index == "3":
                print("Running SNP calling extend BAQ...")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign 1, 2, 3")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        """Return (is_fasta, prefix_without_extension) for a filename."""
        detect = False
        # FIX: initialize prefix so unknown extensions return (False, None)
        # instead of raising UnboundLocalError.
        prefix = None
        if fasta.endswith(".fa"):
            prefix = fasta[:-3]
            detect = True
        elif fasta.endswith(".fna"):
            prefix = fasta[:-4]
            detect = True
        elif fasta.endswith(".fasta"):
            prefix = fasta[:-6]
            detect = True
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        """Run 'samtools merge' or 'samtools sort' via the shell."""
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                       "-o", bam_file, self.bams["whole"]]))
        os.system(command)

    def _merge_bams(self, args_snp):
        """Merge all input BAMs into one sorted BAM; return the BAM count."""
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.frag_bams is None) and (args_snp.normal_bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            if args_snp.normal_bams is not None:
                num_normal = self._import_bam(args_snp.normal_bams, bams)
            if args_snp.frag_bams is not None:
                num_frag = self._import_bam(args_snp.frag_bams, bams)
        num_bam = num_normal + num_frag
        # FIX: the original did "if num_bam <= 1: copy(bams[0], ...)" which
        # crashed with IndexError when the folders contained no .bam file.
        if num_bam == 0:
            print("Error: There is no BAMs folders!!")
            sys.exit()
        elif num_bam == 1:
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        else:
            print("Merge BAM files now ...")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        return num_bam

    def _modify_header(self, fastas):
        """Normalize the headers of every fasta file in the folder."""
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path):
        """Dump the sorted BAM's SAM header to self.header."""
        command = " ".join([samtools_path, "view", "-H", self.bams["sort"]])
        os.system(">".join([command, self.header]))

    def _get_genome_name(self, samtools_path):
        """Extract the sequence names (@SQ SN:...) from the BAM header."""
        self._get_header(samtools_path)
        fh = open(self.header, "r")
        seq_names = []
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                seq_names.append(row[1].split(":")[1])
        fh.close()
        return seq_names

    def run_snp_calling(self, args_snp):
        """Entry point: run SNP calling for every genome in the BAM header."""
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp.samtools_path)
        if ("1" not in args_snp.program) and (
                "2" not in args_snp.program) and (
                "3" not in args_snp.program):
            # FIX: the message wrongly said "'2' means 'with_BAQ'".
            print("Error:Please assign a correct BAQ type: "
                  "'1' means 'with_BAQ', '2' means 'without_BAQ' or "
                  "'3' means 'extend_BAQ'.")
            sys.exit()
        else:
            for fasta in os.listdir(self.fasta_path):
                # Only process fastas whose genome appears in the BAM header.
                if (fasta.split(".f")[0] in seq_names):
                    fasta_datas = self._detect_fasta(fasta)
                    detect = fasta_datas[0]
                    prefix = fasta_datas[1]
                    if detect:
                        detect = False
                        print("Computing {0} now ...".format(fasta))
                        self.helper.check_make_folder(
                            os.path.join(self.outputs["table"], prefix))
                        self.helper.check_make_folder(
                            os.path.join(self.outputs["raw"], prefix))
                        file_prefixs = {"raw_prefix": os.path.join(
                                            self.outputs["raw"], prefix,
                                            prefix),
                                        "table_prefix": os.path.join(
                                            self.outputs["table"], prefix,
                                            prefix)}
                        fasta_file = os.path.join(self.fasta_path, fasta)
                        table_path = os.path.join(self.outputs["table"],
                                                  prefix)
                        self._run_program(fasta_file, file_prefixs, prefix,
                                          bam_number, table_path, args_snp)
                        os.remove(self.outputs["tmp"])
        # Remove the merged/sorted BAMs and the temporary header file.
        self.helper.remove_tmp(args_snp.fastas)
        os.remove(self.bams["whole"])
        os.remove(self.bams["sort"])
        os.remove(self.header)
class Screen(object):
    '''generation of screenshot'''

    def __init__(self, args_sc):
        self.helper = Helper()
        out_folder = os.path.join(args_sc.output_folder, "screenshots")
        # Refuse to overwrite an existing screenshots folder.
        if os.path.exists(out_folder):
            print("Error: The {0} already exist!".format(out_folder))
            sys.exit()
        else:
            os.mkdir(out_folder)
        args_sc.output_folder = out_folder
        # Strain name = fasta filename without its extension.
        filename = args_sc.fasta.split("/")[-1]
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(
            os.path.join(args_sc.output_folder, self.strain))
        self.forward_file = os.path.join(args_sc.output_folder,
                                         self.strain, "forward")
        self.reverse_file = os.path.join(args_sc.output_folder,
                                         self.strain, "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, lib_dict):
        """Sort TEX+ wigs into lib_dict and pair each with the TEX- wig
        that shares the same condition (index 2) and replicate (index 3)."""
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(flib[0])
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(nlib[0])

    def screenshot(self, args_sc):
        """Collect the wig libraries and generate the IGV batch scripts."""
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [],
                    "ff": [], "rf": []}
        f_texs = []
        r_texs = []
        # FIX: fail fast when no libraries are assigned. The original only
        # performed this check AFTER gen_screenshot had already been called
        # with an empty lib_dict.
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            print("Error: There are no wig file assigned!")
            sys.exit()
        if args_sc.tlibs is not None:
            for lib in args_sc.tlibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    print("Error: Exist a not proper wig files!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        f_texs.append(lib_datas)
                    else:
                        r_texs.append(lib_datas)
            # Sort by (type, condition, replicate) so pairing is stable.
            f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
            r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
            self._import_libs(f_texs, "+", lib_dict)
            self._import_libs(r_texs, "-", lib_dict)
        if args_sc.flibs is not None:
            for lib in args_sc.flibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    print("Error: Exist a not proper wig files!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        lib_dict["ff"].append(lib_datas[0])
                    else:
                        lib_dict["rf"].append(lib_datas[0])
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
class Controller(object):
    """Manage the actions of the subcommands.

    The Controller takes care of providing the arguments like path
    names and the parallel processing of tasks.
    """

    def __init__(self, args):
        """Create an instance."""
        self._args = args
        self._paths = Paths(args.project_path)
        self.args_container = ArgsContainer()
        self.helper = Helper()

    def check_folder(self, folders):
        """Exit with an error unless every folder exists and is non-empty."""
        for folder in folders:
            if folder is None:
                print("Error: There is wrong path of folder assigned, "
                      "please check it!!")
                sys.exit()
            else:
                if os.path.exists(folder):
                    if len(os.listdir(folder)) == 0:
                        print("Error: There is empty folder, "
                              "please check it!!")
                        sys.exit()
                else:
                    print("Error: There is wrong folder, please check it!!")
                    sys.exit()

    def check_parameter(self, paras, names):
        """Exit with an error if any required parameter is None.

        names carries the CLI flag label shown in the error message.
        """
        for i in range(len(paras)):
            if paras[i] is None:
                print("Error: {0} is wrong, "
                      "please check it!!".format(names[i]))
                sys.exit()

    def check_no_require_folder(self, folders):
        """Like check_folder, but a None entry (folder not given) is fine."""
        for folder in folders:
            if folder is not None:
                if os.path.exists(folder):
                    if len(os.listdir(folder)) == 0:
                        print("Error: There is empty folder, "
                              "please check it!!")
                        sys.exit()
                else:
                    print("Error: There is wrong folder, "
                          "please check it!!")
                    sys.exit()

    def check_file(self, files, names, require):
        """Check input files exist; if require is False, None entries pass."""
        for i in range(len(files)):
            if require:
                if files[i] is None:
                    print("Error: {0} is wrong, "
                          "please check it!!".format(names[i]))
                    sys.exit()
                else:
                    if not os.path.isfile(files[i]):
                        print("Error: There is wrong path of {0}, "
                              "please check it!!".format(names[i]))
                        sys.exit()
            else:
                if files[i] is not None:
                    if not os.path.isfile(files[i]):
                        print("Error: There is wrong path of {0}, "
                              "please check it!!".format(names[i]))
                        sys.exit()

    def create_project(self, version):
        """Create a new project."""
        project_creator.create_root_folder(self._args.project_path)
        project_creator.create_subfolders(
            self._paths.required_folders("root"))
        project_creator.create_subfolders(
            self._paths.required_folders("get_target_fasta"))
        project_creator.create_version_file(
            self._paths.version_path, version)
        sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % (
            self._args.project_path))

    def get_input(self):
        """Download required files from website."""
        print("Running get input files...")
        if self._args.FTP_path is None:
            print("Error: Please assign the path for downloading the data!!")
            sys.exit()
        # Reference vs target genomes go into different subfolders.
        if self._args.for_target:
            annotation_folder = self._paths.tar_annotation_folder
            fasta_folder = self._paths.tar_fasta_folder
        else:
            annotation_folder = self._paths.ref_annotation_folder
            fasta_folder = self._paths.ref_fasta_folder
        self.helper.check_make_folder(annotation_folder)
        self.helper.check_make_folder(fasta_folder)
        if self._args.ref_gff is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "gff", self._args.for_target)
            get_file(self._args.FTP_path, annotation_folder,
                     "_genomic.gff.gz", self._args.for_target)
        if self._args.ref_fasta is True:
            get_file(self._args.FTP_path, fasta_folder,
                     "fna", self._args.for_target)
            get_file(self._args.FTP_path, fasta_folder,
                     "_genomic.fna.gz", self._args.for_target)
        if self._args.ref_gbk is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "gbk", self._args.for_target)
            get_file(self._args.FTP_path, annotation_folder,
                     "gbff", self._args.for_target)
            get_file(self._args.FTP_path, annotation_folder,
                     "_genomic.gbff.gz", self._args.for_target)
        if self._args.ref_ptt is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "ptt", self._args.for_target)
        if self._args.ref_rnt is True:
            get_file(self._args.FTP_path, annotation_folder,
                     "rnt", self._args.for_target)
        if self._args.convert_embl is True:
            annotation_files = os.listdir(annotation_folder)
            if len(annotation_files) == 0:
                sys.stdout.write("No gbk files!!\n")
            else:
                Converter().convert_gbk2embl(annotation_folder)

    def get_target_fasta(self):
        """Get target fasta"""
        print("Running get target fasta...")
        self.check_parameter([self._args.output_format], ["--output_format"])
        self.check_folder([self._args.ref_fasta_folder])
        # FIX: names must be a list; the original passed the bare string,
        # so names[i] indexed single characters in the error message.
        self.check_file([self._args.mutation_table],
                        ["--mutation_table"], True)
        project_creator.create_subfolders(
            self._paths.required_folders("get_target_fasta"))
        # FIX: the original "for output in outputs: output = output.strip()"
        # had no effect on the list; strip each entry for real.
        outputs = [output.strip()
                   for output in self._args.output_format.split(",")]
        target = TargetFasta(self._paths.tar_fasta_folder,
                             self._args.ref_fasta_folder)
        target.get_target_fasta(
            self._args.mutation_table, self._paths.tar_fasta_folder,
            self._args.ref_fasta_folder, outputs)

    def ratt(self):
        """Run RATT to transfer annotation file from reference to target."""
        print("Running annotation transfer...")
        if self._args.transfer_type not in (
                "Strain", "Assembly", "Species", "Assembly.Repetitive",
                "Strain.Repetitive", "Species.Repetitive", "Multiple",
                "Free"):
            print("Error: please assign correct --transfer_type!!")
            sys.exit()
        self.check_folder([self._args.ref_embl_gbk, self._args.target_fasta,
                           self._args.ref_fasta])
        self.check_parameter([self._args.element, self._args.compare_pair],
                             ["--element", "--compare_pair"])
        project_creator.create_subfolders(
            self._paths.required_folders("annotation_transfer"))
        args_ratt = self.args_container.container_ratt(
            self._args.RATT_path, self._args.element,
            self._args.transfer_type, self._args.ref_embl_gbk,
            self._args.target_fasta, self._args.ref_fasta,
            self._paths.ratt_folder, self._args.convert_to_gff_rnt_ptt,
            self._paths.tar_annotation_folder, self._args.compare_pair)
        ratt = RATT(args_ratt)
        ratt.annotation_transfer(args_ratt)

    def tsspredator(self):
        """Run TSSpredator for predicting TSS candidates."""
        self.check_folder([self._args.fasta_folder,
                           self._args.annotation_folder,
                           self._args.wig_folder])
        self.check_parameter([self._args.lib, self._args.output_prefix],
                             ["--lib", "--output_prefix"])
        self.check_no_require_folder(
            [self._args.compare_transcript_assembly,
             self._args.reference_gff_folder])
        self.check_file([self._args.merge_manual], ["--merge_manual"], False)
        if self._args.compute_program.lower() == "tss":
            print("Running TSS prediction...")
            project_creator.create_subfolders(
                self._paths.required_folders("TSS"))
            out_folder = self._paths.tsspredator_folder
        elif self._args.compute_program.lower() == "processing_site":
            print("Running processing site prediction...")
            out_folder = self._paths.processing_site_folder
            project_creator.create_subfolders(
                self._paths.required_folders("processing"))
        else:
            print("Error:No such program!!!!")
            sys.exit()
        args_tss = self.args_container.container_tsspredator(
            self._args.TSSpredator_path, self._args.compute_program,
            self._args.fasta_folder, self._args.annotation_folder,
            self._args.wig_folder, self._args.lib,
            self._args.output_prefix, self._args.height,
            self._args.height_reduction, self._args.factor,
            self._args.factor_reduction, self._args.base_height,
            self._args.enrichment_factor, self._args.processing_factor,
            self._args.replicate_match, out_folder, self._args.statistics,
            self._args.validate_gene, self._args.merge_manual,
            self._args.compare_transcript_assembly, self._args.fuzzy,
            self._args.utr_length, self._args.cluster,
            self._args.length, self._args.re_check_orphan,
            self._args.overlap_feature, self._args.reference_gff_folder,
            self._args.remove_low_expression)
        tsspredator = TSSpredator(args_tss)
        tsspredator.run_tsspredator(args_tss)

    def optimize(self):
        """Optimize TSSpredator parameters."""
        self.check_folder([self._args.wig_folder, self._args.fasta_file,
                           self._args.annotation_file])
        self.check_file([self._args.manual], ["--manual"], True)
        self.check_parameter([self._args.strain_name, self._args.lib,
                              self._args.output_prefix],
                             ["--strain_name", "--lib", "--output_prefix"])
        if self._args.program.lower() == "tss":
            print("Running optimization of TSS prediction...")
            project_creator.create_subfolders(
                self._paths.required_folders("TSS"))
            out_folder = self._paths.tsspredator_folder
        elif self._args.program.lower() == "processing_site":
            print("Running optimization of processing site prediction...")
            out_folder = self._paths.processing_site_folder
            project_creator.create_subfolders(
                self._paths.required_folders("processing"))
        else:
            print("Error:No such program!!!!")
            sys.exit()
        args_ops = self.args_container.container_optimize(
            self._args.TSSpredator_path, self._args.fasta_file,
            self._args.annotation_file, self._args.wig_folder,
            self._args.manual, out_folder,
            self._args.strain_name, self._args.max_height,
            self._args.max_height_reduction, self._args.max_factor,
            self._args.max_factor_reduction, self._args.max_base_height,
            self._args.max_enrichment_factor,
            self._args.max_processing_factor, self._args.utr_length,
            self._args.lib, self._args.output_prefix, self._args.cluster,
            self._args.length, self._args.core, self._args.program,
            self._args.replicate_match, self._args.steps)
        optimize_tss(args_ops)

    def color(self):
        """color the screenshots"""
        print("Running png files coloring...")
        # FIX: error label typo ("--track_numer" -> "--track_number").
        self.check_parameter([self._args.track_number], ["--track_number"])
        self.check_folder([self._args.screenshot_folder])
        color = ColorPNG()
        color.generate_color_png(
            self._args.track_number, self._args.screenshot_folder,
            self._args.ImageMagick_covert_path)

    def terminator(self):
        """Run TransTermHP for detecting terminators."""
        print("Running terminator prediction...")
        if self._args.TransTermHP_path is None:
            print("Please assign the folder where you install TransTermHP.")
            # FIX: the original printed the message but kept running with a
            # None path; abort like every other missing-input check.
            sys.exit()
        self.check_folder([self._args.fasta_folder,
                           self._args.annotation_folder,
                           self._args.transcript_folder])
        self.check_no_require_folder([self._args.sRNA])
        project_creator.create_subfolders(
            self._paths.required_folders("terminator"))
        args_term = self.args_container.container_terminator(
            self._args.TransTermHP_path, self._args.expterm_path,
            self._args.RNAfold_path, self._paths.transterm_folder,
            self._args.fasta_folder, self._args.annotation_folder,
            self._args.transcript_folder, self._args.sRNA,
            self._args.statistics, self._args.tex_wig_folder,
            self._args.frag_wig_folder, self._args.decrease,
            self._args.highest_coverage,
            self._args.fuzzy_detect_coverage,
            self._args.fuzzy_within_transcript,
            self._args.fuzzy_downstream_transcript,
            self._args.fuzzy_within_gene,
            self._args.fuzzy_downstream_gene,
            self._paths.transtermhp_folder, self._args.tex_notex_libs,
            self._args.frag_libs, self._args.tex_notex,
            self._args.replicates_tex, self._args.replicates_frag,
            self._args.table_best, self._args.min_loop_length,
            self._args.max_loop_length, self._args.min_stem_length,
            self._args.max_stem_length, self._args.min_U_tail_length,
            self._args.miss_rate, self._args.range_U_tail)
        terminator = Terminator(args_term)
        terminator.run_terminator(args_term)

    def transcript(self):
        """Run Transcriptome assembly."""
        print("Running transcript assembly...")
        self.check_folder([self._args.annotation_folder])
        self.check_no_require_folder([
            self._args.compare_TSS, self._args.compare_genome_annotation,
            self._args.terminator_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("transcript_assembly"))
        args_tran = self.args_container.container_transcript(
            self._args.frag_wig_path, self._args.tex_wig_path,
            self._args.tex_notex, self._args.length,
            self._args.annotation_folder, self._args.height,
            self._args.width, self._args.tolerance,
            self._args.tolerance_coverage, self._args.replicates_tex,
            self._args.replicates_frag,
            self._paths.transcript_assembly_output_folder,
            self._args.compare_TSS, self._args.compare_genome_annotation,
            self._args.TSS_fuzzy, self._args.Tex_treated_libs,
            self._args.fragmented_libs, self._args.compare_feature_genome,
            self._args.table_best, self._args.terminator_folder,
            self._args.fuzzy_term)
        transcript = TranscriptAssembly(args_tran)
        transcript.run_transcript_assembly(args_tran)

    def utr_detection(self):
        """Run UTR detection."""
        print("Running UTR detection...")
        self.check_folder([self._args.annotation_folder,
                           self._args.transcript_assembly_folder,
                           self._args.TSS_folder])
        self.check_no_require_folder([self._args.terminator_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("utr"))
        args_utr = self.args_container.container_utr(
            self._args.TSS_folder, self._args.annotation_folder,
            self._args.transcript_assembly_folder,
            self._args.terminator_folder, self._args.terminator_fuzzy,
            self._paths.utr_folder, self._args.TSS_source,
            self._args.base_5UTR, self._args.UTR_length,
            self._args.base_3UTR)
        utr = UTRDetection(args_utr)
        utr.run_utr_detection(args_utr)

    def srna_detection(self):
        """sRNA_detection."""
        print("Running sRNA prediction...")
        self.check_folder([self._args.annotation_folder,
                           self._args.transcript_assembly_folder])
        self.check_no_require_folder([self._args.fasta_folder,
                                      self._args.sORF,
                                      self._args.terminator_folder])
        self.check_file([self._args.promoter_table],
                        ["--promoter_table"], False)
        # TSS/processing-site folders are only mandatory for UTR-derived
        # sRNA prediction.
        if self._args.UTR_derived_sRNA:
            self.check_folder([self._args.TSS_folder,
                               self._args.processing_site_folder])
        else:
            self.check_no_require_folder([self._args.TSS_folder,
                                          self._args.processing_site_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("srna"))
        args_srna = self.args_container.container_srna(
            self._args.Vienna_folder, self._args.Vienna_utils,
            self._args.blast_plus_folder, self._args.ps2pdf14_path,
            self._paths.srna_folder, self._args.UTR_derived_sRNA,
            self._args.annotation_folder, self._args.TSS_folder,
            self._args.transcript_assembly_folder,
            self._args.TSS_intergenic_fuzzy, self._args.TSS_5UTR_fuzzy,
            self._args.TSS_3UTR_fuzzy, self._args.TSS_interCDS_fuzzy,
            self._args.import_info, self._args.tex_wig_folder,
            self._args.frag_wig_folder, self._args.processing_site_folder,
            self._args.fasta_folder, self._args.mountain_plot,
            self._args.nr_format, self._args.srna_format,
            self._args.sRNA_database_path, self._args.nr_database_path,
            self._args.cutoff_energy,
            self._args.run_intergenic_TEX_coverage,
            self._args.run_intergenic_noTEX_coverage,
            self._args.run_intergenic_fragmented_coverage,
            self._args.run_antisense_TEX_coverage,
            self._args.run_antisense_noTEX_coverage,
            self._args.run_antisense_fragmented_coverage,
            self._args.intergenic_tolerance,
            self._args.run_utr_TEX_coverage,
            self._args.run_utr_noTEX_coverage,
            self._args.run_utr_fragmented_coverage,
            self._args.max_length, self._args.min_length,
            self._args.tex_notex_libs, self._args.frag_libs,
            self._args.replicates_tex, self._args.replicates_frag,
            self._args.tex_notex, self._args.blast_e_nr,
            self._args.blast_e_srna, self._args.detect_sRNA_in_CDS,
            self._args.table_best, self._args.decrease_intergenic,
            self._args.decrease_utr, self._args.fuzzy_intergenic,
            self._args.fuzzy_utr, self._args.cutoff_nr_hit,
            self._args.sORF, self._args.best_with_all_sRNAhit,
            self._args.best_without_sORF_candidate,
            self._args.overlap_percent_CDS,
            self._args.terminator_folder,
            self._args.terminator_fuzzy_in_CDS,
            self._args.terminator_fuzzy_out_CDS,
            self._args.best_with_terminator,
            self._args.ignore_hypothetical_protein,
            self._args.TSS_source, self._args.min_utr_coverage,
            self._args.promoter_table, self._args.best_with_promoter,
            self._args.ranking_time_promoter, self._args.promoter_name)
        srna = sRNADetection(args_srna)
        srna.run_srna_detection(args_srna)

    def sorf_detection(self):
        """sORF_detection."""
        print("Running sORF prediction...")
        self.check_folder([self._args.transcript_assembly_folder,
                           self._args.annotation_folder,
                           self._args.fasta_folder])
        self.check_no_require_folder([
            self._args.sRNA_folder, self._args.TSS_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("sorf"))
        args_sorf = self.args_container.container_sorf(
            self._paths.sorf_folder, self._args.UTR_derived_sORF,
            self._args.transcript_assembly_folder,
            self._args.annotation_folder,
            self._args.TSS_folder, self._args.utr_length,
            self._args.min_length, self._args.max_length,
            self._args.tex_wig_folder, self._args.frag_wig_folder,
            self._args.cutoff_intergenic_coverage,
            self._args.cutoff_antisense_coverage,
            self._args.cutoff_5utr_coverage,
            self._args.cutoff_3utr_coverage,
            self._args.cutoff_interCDS_coverage,
            self._args.fasta_folder, self._args.tex_notex_libs,
            self._args.frag_libs, self._args.tex_notex,
            self._args.replicates_tex, self._args.replicates_frag,
            self._args.table_best, self._args.sRNA_folder,
            self._args.start_codon, self._args.stop_codon,
            self._args.cutoff_background, self._args.fuzzy_rbs,
            self._args.rbs_not_after_TSS,
            self._args.print_all_combination,
            self._args.best_no_sRNA, self._args.best_no_TSS,
            self._args.ignore_hypothetical_protein,
            self._args.min_rbs_distance, self._args.max_rbs_distance)
        sorf = sORFDetection(args_sorf)
        sorf.run_sorf_detection(args_sorf)

    def meme(self):
        """promoter detection"""
        print("Running promoter detection...")
        self.check_folder([self._args.TSS_folder, self._args.fasta_folder])
        # Annotation is only needed when the TSS did not come from this
        # pipeline's TSSpredator run.
        if not self._args.TSS_source:
            self.check_folder([self._args.annotation_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("promoter"))
        args_pro = self.args_container.container_promoter(
            self._args.MEME_path, self._paths.promoter_output_folder,
            self._args.tex_libs, self._args.TSS_folder,
            self._args.fasta_folder, self._args.num_motif,
            self._args.nt_before_TSS, self._args.motif_width,
            self._args.TSS_source, self._args.tex_wig_path,
            self._args.annotation_folder, self._args.combine_all,
            self._args.e_value)
        meme = MEME(args_pro)
        meme.run_meme(args_pro)

    def operon(self):
        """operon detection"""
        print("Running operon detection...")
        self.check_folder([self._args.TSS_folder,
                           self._args.annotation_folder,
                           self._args.transcript_folder,
                           self._args.UTR5_folder,
                           self._args.UTR3_folder])
        self.check_no_require_folder([self._args.term_folder])
        project_creator.create_subfolders(
            self._paths.required_folders("operon"))
        args_op = self.args_container.container_operon(
            self._args.TSS_folder, self._args.annotation_folder,
            self._args.transcript_folder, self._args.UTR5_folder,
            self._args.UTR3_folder, self._args.term_folder,
            self._args.TSS_fuzzy, self._args.term_fuzzy,
            self._args.min_length, self._args.statistics,
            self._paths.operon_output_folder, self._args.combine_gff,
            self._paths.operon_statistics_folder)
        operon = OperonDetection(args_op)
        operon.run_operon(args_op)

    def circrna(self):
        """circRNA detection"""
        print("Running circular RNA prediction...")
        self.check_folder([self._args.fasta_path,
                           self._args.annotation_path])
        self.check_no_require_folder([self._args.tex_bam_path,
                                      self._args.fragmented_bam_path])
        project_creator.create_subfolders(
            self._paths.required_folders("circrna"))
        args_circ = self.args_container.container_circrna(
            self._args.align, self._args.process, self._args.fasta_path,
            self._args.annotation_path, self._args.tex_bam_path,
            self._args.fragmented_bam_path, self._paths.read_folder,
            self._paths.circrna_stat_folder, self._args.support_reads,
            self._args.segemehl_folder, self._args.samtools_path,
            self._args.start_ratio, self._args.end_ratio,
            self._args.ignore_hypothetical_protein,
            self._paths.circrna_output_folder)
        circ = CircRNADetection(args_circ)
        circ.run_circrna(args_circ)

    def goterm(self):
        """Go term discovery"""
        print("Running GO term mapping...")
        self.check_folder([self._args.annotation_path])
        self.check_no_require_folder([self._args.transcript_path])
        self.check_file([self._args.UniProt_id, self._args.go_obo,
                         self._args.goslim_obo],
                        ["--UniProt_id", "--go.obo", "--goslim_obo"], True)
        project_creator.create_subfolders(
            self._paths.required_folders("go_term"))
        args_go = self.args_container.container_goterm(
            self._args.annotation_path,
            self._paths.goterm_output_folder, self._args.UniProt_id,
            self._args.go_obo, self._args.goslim_obo,
            self._args.transcript_path)
        goterm = GoTermFinding(args_go)
        goterm.run_go_term(args_go)

    def srna_target(self):
        """sRNA target prediction"""
        print("Running sRNA target prediction...")
        self.check_folder([self._args.fasta_path, self._args.sRNA_path,
                           self._args.annotation_path])
        project_creator.create_subfolders(
            self._paths.required_folders("srna_target"))
        args_tar = self.args_container.container_srna_target(
            self._args.Vienna_folder, self._args.annotation_path,
            self._args.fasta_path, self._args.sRNA_path,
            self._args.query_sRNA, self._args.program,
            self._args.interaction_length,
            self._args.window_size_target, self._args.span_target,
            self._args.window_size_srna, self._args.span_srna,
            self._args.unstructured_region_RNAplex_target,
            self._args.unstructured_region_RNAplex_srna,
            self._args.unstructured_region_RNAup,
            self._args.energy_threshold, self._args.duplex_distance,
            self._args.top, self._paths.starget_output_folder,
            self._args.process_rnaplex, self._args.process_rnaup,
            self._args.continue_rnaup,
            self._args.potential_target_start,
            self._args.potential_target_end,
            self._args.target_feature)
        srnatarget = sRNATargetPrediction(args_tar)
        srnatarget.run_srna_target_prediction(args_tar)

    def snp(self):
        """SNP transcript detection"""
        print("Running SNP/mutations calling...")
        self.check_folder([self._args.fasta_path])
        if (self._args.bam_type != "target") and (
                self._args.bam_type != "reference"):
            print("Error: please assign \"target\" or"
                  " \"reference\" to --bam_type!!")
            sys.exit()
        if (self._args.ploidy != "haploid") and (
                self._args.ploidy != "diploid"):
            print("Error: please assign \"haploid\" or"
                  " \"diploid\" to --chromosome_type!!")
            # FIX: the original printed the error but fell through and
            # continued; abort like the --bam_type check above.
            sys.exit()
        project_creator.create_subfolders(
            self._paths.required_folders("snp"))
        args_snp = self.args_container.container_snp(
            self._args.samtools_path, self._args.bcftools_path,
            self._args.bam_type,
            self._args.program, self._args.fasta_path,
            self._args.tex_bam_path, self._args.frag_bam_path,
            self._args.quality, self._args.read_depth,
            self._paths.snp_output_folder, self._args.indel_fraction,
            self._args.ploidy)
        snp = SNPCalling(args_snp)
        snp.run_snp_calling(args_snp)

    def ppi(self):
        """PPI network retrieve"""
        print("Running protein-protein interaction networks prediction...")
        self.check_folder([self._args.gff_path])
        self.check_parameter([self._args.proteinID_strains,
                              self._args.species_STRING],
                             ["--proteinID_strains", "--species_STRING"])
        project_creator.create_subfolders(
            self._paths.required_folders("ppi_network"))
        args_ppi = self.args_container.container_ppi(
            self._args.gff_path, self._args.proteinID_strains,
            self._args.without_strain_pubmed, self._args.species_STRING,
            self._args.score, self._paths.ppi_output_folder,
            self._args.node_size, self._args.query)
        ppi = PPINetwork(self._paths.ppi_output_folder)
        ppi.retrieve_ppi_network(args_ppi)

    def sublocal(self):
        """Subcellular Localization prediction"""
        print("Running subcellular localization prediction...")
        self.check_folder([self._args.gff_path, self._args.fasta_path])
        self.check_no_require_folder([self._args.transcript_path])
        if (self._args.bacteria_type != "positive") and (
                self._args.bacteria_type != "negative"):
            print("Error: please assign \"positive\" or"
                  " \"negative\" to --bacteria_type!!")
            sys.exit()
        project_creator.create_subfolders(
            self._paths.required_folders("subcellular_localization"))
        args_sub = self.args_container.container_sublocal(
            self._args.Psortb_path, self._args.gff_path,
            self._args.fasta_path, self._args.bacteria_type,
            self._args.difference_multi, self._args.merge_to_gff,
            self._paths.sublocal_output_folder, self._args.transcript_path)
        sublocal = SubLocal(args_sub)
        sublocal.run_sub_local(args_sub)

    def ribos(self):
        """riboswitch prediction"""
        print("Running riboswitch prediction...")
        self.check_folder([self._args.gff_path, self._args.fasta_path,
                           self._args.tss_path, self._args.transcript_path])
        self.check_file([self._args.riboswitch_ID, self._args.Rfam],
                        ["--riboswitch_ID", "--Rfam"], True)
        project_creator.create_subfolders(
            self._paths.required_folders("riboswitch"))
        args_ribo = self.args_container.container_ribos(
            self._args.infernal_path, self._args.riboswitch_ID,
            self._args.gff_path, self._args.fasta_path,
            self._args.tss_path, self._args.transcript_path,
            self._args.Rfam, self._paths.ribos_output_folder,
            self._args.e_value, self._args.output_all,
            self._paths.database_folder, self._args.fuzzy,
            self._args.start_codon, self._args.min_dist_rbs,
            self._args.max_dist_rbs, self._args.fuzzy_rbs,
            self._args.UTR_length)
        ribos = Ribos(args_ribo)
        ribos.run_ribos(args_ribo)

    def screen(self):
        """generate screenshot"""
        print("Running screenshot generating...")
        self.check_file([self._args.main_gff, self._args.fasta],
                        ["--main_gff", "--fasta"], True)
        if self._args.side_gffs is not None:
            for gff in (self._args.side_gffs.split(",")):
                gff = gff.strip()
                if not os.path.isfile(gff):
                    print("Error: The --side_gffs no exist!!")
                    sys.exit()
        if self._args.output_folder is None:
            print("Error: please assign --output_folder!!")
            sys.exit()
        if self._args.present not in ("expand", "collapse", "squish"):
            print("Error: please assign \"expand\" or "
                  "\"collapse\" or \"squish\" to --present!!")
            sys.exit()
        args_sc = self.args_container.container_screen(
            self._args.main_gff, self._args.side_gffs, self._args.fasta,
            self._args.frag_wig_folder, self._args.tex_wig_folder,
            self._args.height, self._args.tex_libs,
            self._args.frag_libs, self._args.present,
            self._args.output_folder)
        screen = Screen(args_sc)
        screen.screenshot(args_sc)
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        # "tmp" subfolders are produced by Multiparser (one file per genome).
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        # Two parallel output trees: all CDSs vs. only expressed CDSs.
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        # Filename suffixes for per-genome Psortb outputs.
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        # Create the full output folder skeleton up front.
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS'''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        # A CDS counts as expressed if it overlaps any transcript on the
        # same strand/replicon: partial overlap on either end, CDS
        # containing the transcript, or CDS contained in the transcript.
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (cds.seq_id == ta.seq_id):
                    if ((cds.end < ta.end) and (
                            cds.end > ta.start) and (
                            cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                            cds.start < ta.end) and (
                            cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                            cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                            cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break  # one overlapping transcript is enough
        fh.close()
        th.close()
        out.close()
        log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        # Extract CDS DNA sequences (all CDSs, or only expressed ones when
        # tran_path is given), translate them, and fix the EMBOSS output.
        # Returns the genome prefix derived from the GFF filename.
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa", prefix,
                                             None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path,
                             "_".join([prefix, "transcript.gff"])), log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        # fix_emboss normalizes the raw EMBOSS translation output.
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        # Invoke the external psortb binary; stdout is the raw result file,
        # stderr goes to the tmp_log handle.
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path,
                    tmp_result, log):
        # Run Psortb on the protein FASTA of one genome, selecting -p/-n
        # from the configured Gram type; exits on an invalid type.
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(
            tmp_result, "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err, log)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err, log)
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacteria type! "
                  "Please assign positive or negative.".format(args_sub.gram))
            sys.exit()
        log.write("\t" + os.path.join(
            tmp_result, "_".join([prefix, self.endfix_raw])) +
            " is temporary generated.\n")
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file,
                        log):
        '''extract the result of psortb'''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        extract_psortb(
            os.path.join(tmp_psortb_path,
                         "_".join([prefix, self.endfix_raw])),
            os.path.join(tmp_psortb_path,
                         "_".join([prefix, self.endfix_table])),
            None, None, args_sub.fuzzy)
        log.write("\t" + os.path.join(tmp_psortb_path, "_".join(
            [prefix, self.endfix_table])) + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        # Rewrite the merged table with exactly one header row: write a
        # fresh header, copy all non-header rows, then replace the file.
        out = open(out_all + "_tmp", "w")
        fh = open(out_all, "r")
        out.write("\t".join(["#Genome", "Protein", "Strand", "Start", "End",
                             "Location", "Score"]) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[0] != "#Genome":
                out.write("\t".join(row) + "\n")
        out.close()
        fh.close()
        shutil.move(out_all + "_tmp", out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result, log):
        # For each multi-replicon genome folder: copy raw Psortb outputs,
        # merge per-replicon tables into one, de-duplicate the header,
        # then run stat_sublocal and log every produced file.
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                log.write("\t" + merge_table + "\n")
                self._remove_header(merge_table)
                self.helper.check_make_folder(os.path.join(stat_path,
                                                           prefix))
                stat_folder = os.path.join(stat_path, prefix)
                stat_file = os.path.join(
                    stat_folder, "_".join(["stat", prefix, "sublocal.csv"]))
                stat_sublocal(merge_table,
                              os.path.join(stat_folder, prefix), stat_file)
                for file_ in os.listdir(stat_folder):
                    log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        # Clean up all temporary folders/files created during the run.
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        # Entry point: split inputs per genome, run the expressed-CDS
        # pipeline (when transcripts are given) and the all-CDS pipeline,
        # then merge tables, compute statistics, and clean up.
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix,
                                     os.path.join(self.gff_path, gff), log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff), log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result, log)
        self._remove_tmps(args_sub)
class GoTermFinding(object):
    '''Retrieval of GO terms from UniProt for all/expressed CDSs.'''

    def __init__(self, args_go):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Two parallel output trees: all CDSs vs. only expressed CDSs.
        self.out_all = os.path.join(args_go.out_folder, "all_CDS")
        self.out_express = os.path.join(args_go.out_folder, "expressed_CDS")
        self.result_all_path = os.path.join(self.out_all, "Go_term_results")
        self.result_express_path = os.path.join(self.out_express,
                                                "Go_term_results")
        # "tmp" subfolders are produced by Multiparser (one file per genome).
        self.gff_path = os.path.join(args_go.gffs, "tmp")
        if args_go.trans is not None:
            self.tran_path = os.path.join(args_go.trans, "tmp")
        else:
            self.tran_path = None
        self.stat_all_path = os.path.join(self.out_all, "statistics")
        self.stat_express_path = os.path.join(self.out_express,
                                              "statistics")
        # Name of the merged per-genome UniProt table.
        self.all_strain = "all_strains_uniprot.csv"

    def _retrieve_go(self, uniprot, out_path, type_):
        # Extract GO terms for every genome GFF; when transcripts are
        # available the matching transcript GFF restricts the search
        # ("express" mode). Returns nothing; writes per-genome CSVs.
        prefixs = []
        for gff in os.listdir(self.gff_path):
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(out_path, prefix))
            out_file = os.path.join(out_path, prefix,
                                    "_".join([prefix, "uniprot.csv"]))
            print("extracting Go terms of {0} from UniProt...".format(
                prefix))
            if self.tran_path is not None:
                tran_file = os.path.join(
                    self.tran_path, "_".join([prefix, "transcript.gff"]))
            else:
                tran_file = None
            retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff),
                             out_file, tran_file, type_)

    def _merge_files(self, gffs, out_path, out_folder):
        # Merge the per-replicon UniProt CSVs of each genome folder into
        # one all_strains_uniprot.csv, then move the merged folders back
        # under out_path (replacing the per-replicon layout).
        folders = []
        for folder in os.listdir(gffs):
            if folder.endswith("gff_folder"):
                folder_prefix = folder.replace(".gff_folder", "")
                folder_path = os.path.join(out_folder, folder_prefix)
                self.helper.check_make_folder(folder_path)
                folders.append(folder_path)
                filenames = []
                for gff in os.listdir(os.path.join(gffs, folder)):
                    if gff.endswith(".gff"):
                        filenames.append(gff.replace(".gff", ""))
                out_all = os.path.join(folder_path, self.all_strain)
                if len(filenames) > 1:
                    # Remove a stale merged table before appending.
                    if self.all_strain in os.listdir(folder_path):
                        os.remove(out_all)
                    for filename in filenames:
                        csv_file = "_".join([filename, "uniprot.csv"])
                        self.helper.merge_file(os.path.join(
                            out_path, filename, csv_file), out_all)
                        shutil.copy(os.path.join(out_path, filename,
                                                 csv_file), folder_path)
                else:
                    # Single replicon: the merged table is just a copy.
                    shutil.copyfile(os.path.join(
                        out_path, filenames[0],
                        "_".join([filenames[0], "uniprot.csv"])), out_all)
        self.helper.remove_all_content(out_path, None, "dir")
        self.helper.remove_all_content(out_path, None, "file")
        for folder in folders:
            # Fixed: use os.path.basename instead of split("/")[-1] so the
            # prefix is correct on any platform's path separator.
            folder_prefix = os.path.basename(folder)
            shutil.move(folder, os.path.join(out_path, folder_prefix))

    def _stat(self, out_path, stat_path, go, goslim, out_folder):
        # Map the merged GO terms to GOslim, write the stat CSV, and move
        # the generated figures into a per-genome "figs" folder.
        for folder in os.listdir(out_path):
            strain_stat_path = os.path.join(stat_path, folder)
            self.helper.check_make_folder(strain_stat_path)
            fig_path = os.path.join(strain_stat_path, "figs")
            # Fixed: the guard previously tested for "fig" while the
            # directory created is named "figs", so an existing figs/
            # directory was never detected and os.mkdir could raise
            # FileExistsError.
            if "figs" not in os.listdir(strain_stat_path):
                os.mkdir(fig_path)
            print("Computing statistics of {0}".format(folder))
            map2goslim(goslim, go,
                       os.path.join(out_path, folder, self.all_strain),
                       os.path.join(strain_stat_path,
                                    "_".join(["stat", folder + ".csv"])),
                       out_folder)
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_three_roots.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_molecular_function.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_cellular_component.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_biological_process.png"])

    def run_go_term(self, args_go):
        # Entry point: split inputs per genome, retrieve GO terms for all
        # CDSs (and expressed CDSs when transcripts exist), merge tables,
        # compute statistics, and clean up temporary folders.
        for gff in os.listdir(args_go.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_go.gffs, gff))
        self.multiparser.parser_gff(args_go.gffs, None)
        if args_go.trans is not None:
            self.multiparser.parser_gff(args_go.trans, "transcript")
        print("Computing all CDS...")
        self._retrieve_go(args_go.uniprot, self.result_all_path, "all")
        self._merge_files(args_go.gffs, self.result_all_path, self.out_all)
        self._stat(self.result_all_path, self.stat_all_path, args_go.go,
                   args_go.goslim, self.out_all)
        if args_go.trans is not None:
            print("Computing express CDS...")
            self._retrieve_go(args_go.uniprot, self.result_express_path,
                              "express")
            self._merge_files(args_go.gffs, self.result_express_path,
                              self.out_express)
            self._stat(self.result_express_path, self.stat_express_path,
                       args_go.go, args_go.goslim, self.out_express)
        self.helper.remove_tmp(args_go.gffs)
        if args_go.trans is not None:
            self.helper.remove_tmp(args_go.trans)
class TranscriptAssembly(object):
    '''Detection of transcripts from wiggle coverage files.'''

    def __init__(self, args_tran):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # Names/paths of all temporary artifacts used during the run.
        self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                     "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                     "tss_ta": os.path.join(self.gff_outfolder,
                                            "tmp_tss_ta"),
                     "ta_tss": os.path.join(self.gff_outfolder,
                                            "tmp_ta_tss"),
                     "ta_gff": os.path.join(self.gff_outfolder,
                                            "tmp_ta_gff"),
                     "gff_ta": os.path.join(self.gff_outfolder,
                                            "tmp_gff_ta"),
                     "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                     "overlap": os.path.join(self.gff_outfolder,
                                             "tmp_overlap")}
        self.frag = "transcript_assembly_fragment.gff"
        self.tex = "transcript_assembly_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type,
                            strain, libs, args_tran):
        # Run the assembly algorithm for one strain on its +/- wiggles.
        print("Computing transcript assembly for {0}...".format(strain))
        out = os.path.join(args_tran.out_folder,
                           "_".join([strain, wig_type]))
        assembly(wig_f, wig_r, wig_folder, libs, out, wig_type, args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        # Discover strains from the *_forward.wig files and assemble each.
        strains = []
        wig_folder = os.path.join(wigs, "tmp")
        for wig in os.listdir(wig_folder):
            if wig.endswith("_forward.wig"):
                strains.append(wig.replace("_forward.wig", ""))
        for strain in strains:
            f_file = os.path.join(wig_folder,
                                  "_".join([strain, "forward.wig"]))
            r_file = os.path.join(wig_folder,
                                  "_".join([strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran):
        # Compare transcripts with TSSs; both GFFs are re-sorted in place
        # after stat_ta_tss annotates them via the tmp files.
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.compare_tss, "tmp"),
            "transcript", "TSS")
        print("Comaring of Transcript assembly and TSS file...")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                self.stat_path, "".join([
                    "stat_compare_Transcriptome_assembly_TSS_", ta,
                    ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["tss_ta"],
                        os.path.join(args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])

    def _compare_cds(self, tas, args_tran):
        # Compare transcripts with genes/CDSs, mirroring _compare_tss.
        self.multiparser.parser_gff(args_tran.compare_cds, None)
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.compare_cds, "tmp"),
            "transcript", None)
        print("Comaring of Transcript assembly and gene...")
        cds_folder = os.path.join(args_tran.compare_cds, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(self.stat_path, "".join([
                "stat_compare_Transcriptome_assembly_gene_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    # NOTE(review): unlike _compare_tss (which removes the
                    # tmp copy), this removes/rewrites the GFF in the
                    # parent compare_cds folder — confirm intentional.
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.compare_cds, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["gff_ta"],
                        os.path.join(args_tran.compare_cds, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])

    def _compare_tss_cds(self, tas, args_tran):
        # Dispatch the TSS/CDS comparisons depending on which optional
        # inputs were provided.
        if (args_tran.compare_tss is not None) and (
                args_tran.compare_cds is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
            self._compare_tss(tas, args_tran)
        elif (args_tran.compare_cds is not None) and (
                args_tran.compare_tss is None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
        elif (args_tran.compare_cds is None) and (
                args_tran.compare_tss is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_tss(tas, args_tran)

    def _for_one_wig(self, type_, args_tran):
        # Assemble transcripts from one wiggle set ("tex_notex" or
        # "fragment") and sort the resulting per-strain GFFs.
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Computing {0} wig files....".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            out = os.path.join(self.gff_outfolder, "_".join([
                strain, "transcript_assembly", type_ + ".gff"]))
            self.helper.sort_gff(
                os.path.join(args_tran.out_folder,
                             "_".join([strain, type_])), out)
            os.remove(os.path.join(args_tran.out_folder,
                                   "_".join([strain, type_])))
        return strains

    def _for_two_wigs(self, strains, args_tran):
        # Merge fragment and tex/notex assemblies into the final
        # per-strain transcript GFF; with only one wiggle set, just
        # rename its assembly to the final name.
        if (args_tran.frag_wigs is not None) and (
                args_tran.tex_wigs is not None):
            print("merge fragment and tex treat one ....")
            for strain in strains:
                frag_gff = os.path.join(self.gff_outfolder,
                                        "_".join([strain, self.frag]))
                tex_gff = os.path.join(self.gff_outfolder,
                                       "_".join([strain, self.tex]))
                for gff in os.listdir(self.gff_outfolder):
                    if "transcript_assembly" in gff:
                        filename = gff.split("_transcript_assembly_")
                        if (strain == filename[0]) and (
                                "tex_notex.gff" == filename[1]):
                            tex_file = gff
                        elif (strain == filename[0]) and (
                                "fragment.gff" == filename[1]):
                            frag_file = gff
                combine(os.path.join(self.gff_outfolder, frag_file),
                        os.path.join(self.gff_outfolder, tex_file),
                        args_tran.tolerance,
                        os.path.join(self.gff_outfolder,
                                     "_".join([strain,
                                               self.endfix_tran])))
                os.remove(frag_gff)
                os.remove(tex_gff)
        else:
            if args_tran.frag_wigs is not None:
                for strain in strains:
                    frag_gff = os.path.join(
                        self.gff_outfolder, "_".join([strain, self.frag]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(frag_gff, final_gff)
            elif args_tran.tex_wigs is not None:
                for strain in strains:
                    tex_gff = os.path.join(
                        self.gff_outfolder, "_".join([strain, self.tex]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(tex_gff, final_gff)

    def _post_modify(self, tas, args_tran):
        # Refine each transcript GFF against the annotation GFF
        # (fill_gap overlap/uni passes), merge, sort, and extend
        # transcripts (longer_ta), collecting results under tmp_tran
        # which finally replaces gffs/.
        for ta in tas:
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} refering to {1}...".format(ta, gff))
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "overlap", self.tmps["overlap"])
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "uni", self.tmps["uni"])
            tmp_merge = os.path.join(self.gff_outfolder,
                                     self.tmps["merge"])
            # Fixed: the original tested `self.tmps["merge"] in
            # self.gff_outfolder`, a substring test on the folder PATH
            # string, so a stale tmp_merge file was never removed and
            # merge_file appended to it across strains. Check the
            # directory contents instead (same pattern as elsewhere in
            # this file).
            if self.tmps["merge"] in os.listdir(self.gff_outfolder):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder,
                                   "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(final_out,
                        os.path.join(self.tmps["tran"],
                                     "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        # Remove all temporary wiggle/GFF artifacts after the run.
        if args_tran.frag_wigs is not None:
            self.helper.remove_wigs(args_tran.frag_wigs)
        if args_tran.tex_wigs is not None:
            self.helper.remove_wigs(args_tran.tex_wigs)
        if args_tran.gffs is not None:
            self.helper.remove_tmp(args_tran.gffs)
        if args_tran.compare_cds is not None:
            self.helper.remove_tmp(args_tran.compare_cds)
        if args_tran.compare_tss is not None:
            self.helper.remove_tmp(args_tran.compare_tss)
        if args_tran.terms is not None:
            self.helper.remove_tmp(args_tran.terms)
        # The original additionally called remove_tmp on
        # os.path.join(args_tran.out_folder, "gffs"), which is the same
        # path as self.gff_outfolder — the duplicate call was dropped.
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran):
        # Compare terminators with transcripts when terminator GFFs exist.
        if args_tran.terms is not None:
            print("comparing between terminators and transcripts...")
            self.multiparser.parser_gff(args_tran.terms, "term")
            self.multiparser.combine_gff(
                args_tran.gffs, os.path.join(args_tran.terms, "tmp"),
                None, "term")
            # NOTE(review): fuzzy_term is passed twice — presumably the
            # callee takes separate 5'/3' fuzzies; confirm its signature.
            compare_term_tran(self.gff_outfolder,
                              os.path.join(args_tran.terms, "tmp"),
                              args_tran.fuzzy_term, args_tran.fuzzy_term,
                              args_tran.out_folder, "transcript")

    def run_transcript_assembly(self, args_tran):
        # Entry point: assemble from each available wiggle set, merge,
        # refine against annotations, run comparisons, and clean up.
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            print("Error: there is no wigs files!!!!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran)
        tas = []
        if args_tran.gffs is not None:
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(
                        os.path.join(args_tran.gffs, gff),
                        self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(args_tran.gffs, os.path.join(
                args_tran.gffs, "tmp"), None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff"):
                    # Skip empty per-replicon transcript files.
                    if os.path.getsize(os.path.join(self.tran_path,
                                                    ta)) != 0:
                        tas.append(ta.replace("_" + self.endfix_tran, ""))
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran)
        self._compare_term_tran(args_tran)
        gen_table_transcript(self.gff_outfolder, args_tran)
        self._remove_file(args_tran)
class SNPCalling(object):
    '''detection of SNP'''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Output tree differs by comparison type (related genome vs.
        # reference-genome mutations).
        if args_snp.types == "related_genome":
            file_type = "compare_related_and_reference_genomes"
        else:
            file_type = "mutations_of_reference_genomes"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        self.outputs = {"table": os.path.join(
                            args_snp.out_folder, file_type, "SNP_tables"),
                        "raw": os.path.join(
                            args_snp.out_folder, file_type,
                            "SNP_raw_outputs"),
                        "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
                        # "depth" is a path PREFIX; the sample name is
                        # appended directly (no separator) elsewhere.
                        "depth": os.path.join(args_snp.out_folder,
                                              "tmp_depth")}
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam"),
                     "bams": []}
        self.header = os.path.join(args_snp.out_folder, "header")
        # Mapping from internal BAQ mode to user-facing folder names.
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _transcript_snp(self, fasta, out_table_prefix, type_, prefix,
                        bam_datas, table_path, args_snp):
        # Filter raw VCFs per sample, produce stats, SNP tables and
        # mutated sequences, then collect the generated PNGs.
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(self.stat_path, "_".join([
                "stat", "_".join([prefix, self.baqs[type_],
                                  bam["sample"]]), "SNP"]))
            snp_file = os.path.join(self.outputs["raw"], prefix, "_".join(
                [prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            snp_detect(
                fasta, snp_file, self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
        self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        # Select the bcftools calling model: "c" -> consensus caller
        # (-vcO), otherwise multiallelic caller (-vmO). The trailing "O"
        # consumes the separate "v" argument (uncompressed VCF output).
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bcf_para

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas, log):
        # Run `samtools mpileup` (BAQ mode selected by type_) piped to a
        # tmp BCF, then `bcftools call` into a per-sample VCF.
        bcf_para = self._get_para(args_snp)
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if type_ == "with":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
            elif type_ == "without":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP",
                           "-B"]
            elif type_ == "extend":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP",
                           "-E"]
            if args_snp.rg:
                command = command + ["-ugf", fasta_file, bam_file]
            else:
                command = command + ["--ignore-RG", "-ugf", fasta_file,
                                     bam_file]
            # NOTE(review): shell redirection via os.system — unquoted
            # paths break on spaces/metacharacters; consider subprocess
            # with stdout=... in a future change.
            log.write(" ".join(command) + ">" + self.outputs["tmp"] + "\n")
            os.system(" ".join(command) + ">" + self.outputs["tmp"])
            bam["vcf"] = os.path.join(self.outputs["raw"], "_".join(
                [self.baqs[type_], bam["sample"] + ".vcf"]))
            # chrom "1" -> haploid calling (--ploidy 1); "2" -> default.
            if args_snp.chrom == "1":
                log.write(" ".join([
                    args_snp.bcftools_path, "call", "--ploidy",
                    args_snp.chrom, self.outputs["tmp"], bcf_para, "v",
                    "-o", bam["vcf"]]) + "\n")
                call([args_snp.bcftools_path, "call", "--ploidy",
                      args_snp.chrom, self.outputs["tmp"], bcf_para, "v",
                      "-o", bam["vcf"]])
            elif args_snp.chrom == "2":
                log.write(" ".join([args_snp.bcftools_path, "call",
                          self.outputs["tmp"], bcf_para, "v", "-o",
                          bam["vcf"]]) + "\n")
                call([args_snp.bcftools_path, "call",
                      self.outputs["tmp"], bcf_para, "v", "-o",
                      bam["vcf"]])
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(self.outputs["raw"]):
            log.write("\t" + os.path.join(self.outputs["raw"], file_) +
                      "\n")

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog, log):
        # Split each raw VCF by genome: keep only records whose CHROM
        # appears in that FASTA's headers. num_prog == 0 means first BAQ
        # mode, so the per-genome folders are created only once.
        seq_names = []
        fa_prefixs = []
        log.write("Parsing Vcf files by comparing fasta information.\n")
        for fa in os.listdir(args_snp.fastas):
            if (fa != "all.fa") and (not fa.endswith(".fai")):
                with open(os.path.join(args_snp.fastas, fa)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            seq_names.append(line[1:])
                fa_prefix = ".".join(fa.split(".")[:-1])
                fa_prefixs.append(fa_prefix)
                vcf_folder = os.path.join(self.outputs["raw"], fa_prefix)
                if num_prog == 0:
                    self.helper.check_make_folder(vcf_folder)
                    self.helper.check_make_folder(os.path.join(
                        self.outputs["table"], fa_prefix))
                self.helper.check_make_folder(
                    os.path.join(self.seq_path, self.baqs[type_],
                                 fa_prefix))
                for vcf in os.listdir(self.outputs["raw"]):
                    if vcf.endswith(".vcf"):
                        out = open(os.path.join(vcf_folder, "_".join(
                            [fa_prefix, vcf])), "w")
                        with open(os.path.join(
                                self.outputs["raw"], vcf)) as vh:
                            for line in vh:
                                line = line.strip()
                                if line.startswith("#"):
                                    out.write(line + "\n")
                                else:
                                    if line.split("\t")[0] in seq_names:
                                        out.write(line + "\n")
                        out.close()
                        log.write("\t" + os.path.join(
                            vcf_folder, "_".join([fa_prefix, vcf])) +
                            " is generated.\n")
        # Remove the unsplit raw VCFs once every genome got its copy.
        for vcf in os.listdir(self.outputs["raw"]):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(self.outputs["raw"], vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog,
                 log):
        # One BAQ mode end-to-end: mpileup/call, split VCFs per genome,
        # then filter and report per genome.
        self._run_tools(all_fasta, type_, args_snp, bam_datas, log)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog, log)
        log.write("Running transcript_SNP.py to do statistics, filter "
                  "SNPs, and generate potential sequences.\n")
        log.write("The following files are generated:\n")
        for fa_prefix in fa_prefixs:
            for fasta in os.listdir(args_snp.fastas):
                if fa_prefix in fasta:
                    fasta_file = os.path.join(args_snp.fastas, fasta)
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(table_path, "_".join(
                [fa_prefix, self.baqs[type_]]))
            self._transcript_snp(
                fasta_file, table_prefix, type_, fa_prefix,
                bam_datas, table_path, args_snp)
            seq_path = os.path.join(self.seq_path, self.baqs[type_],
                                    fa_prefix)
            for folder in (table_path, self.stat_path, seq_path,
                           self.fig_path):
                for file_ in os.listdir(folder):
                    if os.path.isfile(os.path.join(folder, file_)):
                        log.write("\t" + os.path.join(folder, file_) +
                                  "\n")

    def _run_program(self, all_fasta, bam_datas, args_snp, log):
        # Run every requested BAQ mode in sequence; exits on an unknown
        # program name.
        num_prog = 0
        log.write("Running Samtools to mpileup, and using Bcftools to "
                  "call snp.\n")
        log.write("Please make sure the version of Samtools and Bcftools "
                  "are both at least 1.3.1.\n")
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
                log.write("Running SNP calling with BAQ.\n")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
                log.write("Running SNP calling without BAQ.\n")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                log.write("Running SNP calling extend BAQ.\n")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                log.write("No valid program can be found, please assign"
                          "\"with_BAQ\", \"without_BAQ\", "
                          "\"extend_BAQ\".\n")
                sys.exit()
            self._run_sub(args_snp, all_fasta, type_, bam_datas, num_prog,
                          log)
            num_prog += 1

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file,
                 log):
        # Build and run one `samtools merge` or `samtools sort` command.
        # For "sort", type_file == "all" means sort the merged whole-reads
        # BAM; otherwise sort the single input BAM given in type_file.
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            if type_file == "all":
                command = (" ".join([samtools_path, sub_command, "-o",
                                     bam_file, self.bams["whole"]]))
            else:
                command = (" ".join([samtools_path, sub_command, "-o",
                                     bam_file, type_file]))
        log.write(command + "\n")
        os.system(command)

    def _merge_bams(self, args_snp, bam_datas, log):
        # Per sample: merge (if several BAMs) and sort into
        # <sample>.bam, then index it and dump per-base depth to
        # tmp_depth<sample>.
        bams = []
        num_normal = 0
        num_frag = 0
        log.write("Using Samtools to merge and sort BAM files.\n")
        log.write("Please make sure the version of Samtools is at least "
                  "1.3.1.\n")
        for bam in bam_datas:
            bam["bam_number"] = 0
            out_bam = os.path.join(args_snp.out_folder,
                                   bam["sample"] + ".bam")
            if len(bam["bams"]) == 1:
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam,
                              bam["bams"][0], log)
                bam["bam_number"] = 1
            else:
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all", log)
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort", out_bam,
                              "all", log)
                bam["bam_number"] += 1
                if os.path.exists(self.bams["whole"]):
                    os.remove(self.bams["whole"])
            out_depth = open(self.outputs["depth"] + bam["sample"], "w")
            log.write(" ".join([args_snp.samtools_path, "index",
                                out_bam]) + "\n")
            call([args_snp.samtools_path, "index", out_bam])
            log.write(" ".join([args_snp.samtools_path, "depth",
                                out_bam]) + "\n")
            call([args_snp.samtools_path, "depth", out_bam],
                 stdout=out_depth)
            out_depth.close()
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        log.write("\t" + self.bams["whole"] + " is temporary generated "
                  "(be deleted afterward).\n")
        for file_ in os.listdir(args_snp.out_folder):
            if os.path.isfile(os.path.join(args_snp.out_folder, file_)):
                log.write("\t" + os.path.join(args_snp.out_folder,
                                              file_) + "\n")

    def _modify_header(self, fastas):
        # Normalize FASTA headers in place for every FASTA-like file.
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        # Collect reference sequence names (@SQ SN:...) from a BAM
        # header; appends unique names to seq_names in place.
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        fh = open(self.header, "r")
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                if row[1].split(":")[1] not in seq_names:
                    seq_names.append(row[1].split(":")[1])
        fh.close()

    def _get_genome_name(self, args_snp, bam_datas):
        # Union of reference names across all per-sample BAM headers.
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path, bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        # Delete per-sample BAMs, their indexes, the dumped header, and
        # the per-sample depth files.
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if os.path.exists(bam_file):
                os.remove(bam_file)
            if os.path.exists(bam_file + ".bai"):
                os.remove(bam_file + ".bai")
            if os.path.exists(self.header):
                os.remove(self.header)
            os.remove(self.outputs["depth"] + bam["sample"])

    def _extract_bams(self, bams, log):
        # Parse --bam_files entries of the form "sample:bam1,bam2,..."
        # into dicts with sample name, replicate count, and BAM list.
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                log.write("the format of --bam_files is wrong!\n")
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    log.write(file_ + " is not found.\n")
                    sys.exit()
            bam_datas.append({
                "sample": datas[0],
                "rep": len(datas[-1].split(",")),
                "bams": datas[-1].split(",")})
        return bam_datas

    def _merge_fasta(self, fastas, log):
        # Concatenate all genome FASTAs into all.fa, dropping sequences
        # whose header line was already seen (duplicate records are
        # skipped until the next header flips print_ back on).
        all_fasta = os.path.join(fastas, "all.fa")
        names = []
        out = open(all_fasta, "w")
        print_ = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fa")) or (
                    fasta.endswith(".fasta")) or (
                    fasta.endswith(".fna")):
                with open(os.path.join(fastas, fasta)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            if line not in names:
                                print_ = True
                                names.append(line)
                            else:
                                print_ = False
                        if print_:
                            out.write(line + "\n")
                log.write(os.path.join(fastas, fasta) + " is loaded.\n")
        out.close()
        return all_fasta

    def run_snp_calling(self, args_snp, log):
        # Entry point: normalize FASTA headers, build the merged
        # reference and per-sample BAMs, run each requested BAQ mode,
        # then remove every temporary artifact.
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas, log)
        bam_datas = self._extract_bams(args_snp.bams, log)
        self._merge_bams(args_snp, bam_datas, log)
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        else:
            print("Detecting mutations now")
            self._run_program(all_fasta, bam_datas, args_snp, log)
            os.remove(self.outputs["tmp"])
            os.remove(all_fasta)
            os.remove(all_fasta + ".fai")
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
        log.write("Remove all the temporary files.\n")
class Ribos(object):
    '''Detection of riboswitches by scanning sequences against the
    Rfam riboswitch covariance models with Infernal (cmscan/cmpress).'''

    def __init__(self, args_ribo):
        # Helpers and the "tmp" sub-folders produced by Multiparser.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # Final output locations.
        self.stat_folder = os.path.join(args_ribo.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs")
        self.table_folder = os.path.join(args_ribo.out_folder, "tables")
        self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam")
        # Extracted riboswitch models from the Rfam database.
        self.ribos_rfam = os.path.join(args_ribo.database,
                                       "Rfam_riboswitch.cm")
        # Working folders (deleted again by _remove_tmp).
        self.tmp_files = {"fasta": os.path.join(
                              args_ribo.out_folder, "tmp_fasta"),
                          "scan": os.path.join(
                              args_ribo.out_folder, "tmp_scan"),
                          "table": os.path.join(
                              args_ribo.out_folder, "tmp_table")}
        # Filename suffixes for first-pass and second-pass outputs.
        self.suffixs = {"csv": "riboswitch.csv",
                        "txt": "riboswitch_prescan.txt",
                        "re_txt": "riboswitch_scan.txt",
                        "re_csv": "riboswitch_scan.csv"}

    def _run_infernal(self, args_ribo, seq, type_, prefix):
        '''Run cmscan on seq; type_ picks the output suffix.

        Returns the path of the scan result file.
        '''
        scan_file = os.path.join(self.tmp_files["scan"],
                                 "_".join([prefix, self.suffixs[type_]]))
        scan = open(scan_file, "w")
        call([os.path.join(args_ribo.infernal_path, "cmscan"),
              "--incE", str(args_ribo.e_value), "--acc",
              self.ribos_rfam, seq], stdout=scan)
        scan.close()
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo):
        '''Two-pass scan per genome: extract candidate sequences,
        pre-scan, regenerate a refined sequence set, re-scan, and build
        the per-genome result table. Appends each prefix to prefixs
        and returns it.
        '''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(self.tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("extracting seq of riboswitch candidates of {0}".format(
                      prefix))
                # Candidate regions come from annotation + TSS + transcripts.
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.tss_path, prefix + "_TSS.gff"),
                    os.path.join(self.tran_path, prefix + "_transcript.gff"),
                    first_seq, args_ribo)
                print("pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_infernal(args_ribo, first_seq,
                                                     "txt", prefix)
                sec_seq = os.path.join(self.tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["csv"]]))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                print("scanning of {0}".format(prefix))
                sec_scan_file = self._run_infernal(args_ribo, sec_seq,
                                                   "re_txt", prefix)
                sec_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                # Second-pass table replaces the first-pass one.
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo):
        '''Merge per-sequence tables/scan files into per-genome outputs
        and compute the statistics/GFF for each genome.'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merge results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    self.scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                # Copy/merge once per distinct seq_id in the genome GFF.
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        # First seq_id copies; later ones append.
                        if len(pre_strain) == 0:
                            shutil.copyfile(os.path.join(
                                self.tmp_files["table"],
                                "_".join([entry.seq_id,
                                          self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix,
                                              self.suffixs["csv"]])))
                        else:
                            self.helper.merge_file(os.path.join(
                                self.tmp_files["table"],
                                "_".join([entry.seq_id,
                                          self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix,
                                              self.suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            self.tmp_files["scan"],
                            "_".join([entry.seq_id,
                                      self.suffixs["txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        shutil.copy(os.path.join(
                            self.tmp_files["scan"],
                            "_".join([entry.seq_id,
                                      self.suffixs["re_txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                    self.stat_folder,
                    "_".join(["stat", prefix, "riboswitch.txt"]))
                print("compute statistics of {0}".format(prefix))
                stat_and_covert2gff(os.path.join(
                    self.table_folder,
                    "_".join([prefix, self.suffixs["csv"]])),
                    args_ribo.ribos_id, os.path.join(
                        self.gff_outfolder,
                        "_".join([prefix, "riboswitch.gff"])),
                    args_ribo.fuzzy, out_stat)
                fh.close()

    def _remove_tmp(self, args_ribo):
        '''Drop all temporary folders created during the run.'''
        self.helper.remove_tmp(args_ribo.gffs)
        self.helper.remove_tmp(args_ribo.fastas)
        self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir")

    def _remove_overlap(self, gff_path):
        '''Resolve overlapping riboswitch hits in each per-genome table.'''
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(
                    os.path.join(os.path.join(
                        self.tmp_files["table"],
                        "_".join([gff.replace(".gff", ""),
                                  self.suffixs["csv"]]))),
                    os.path.join(gff_path, gff))

    def run_ribos(self, args_ribo):
        '''Entry point: validate inputs, prepare Rfam models, scan every
        genome and merge/statistic the results. Exits when
        --fuzzy_rbs exceeds 6.
        '''
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_ribo.gffs, gff))
        # Pull the requested riboswitch models out of the full Rfam file.
        rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam, self.ribos_rfam)
        print("compressing Rfam...")
        call([os.path.join(args_ribo.infernal_path, "cmpress"),
              "-F", self.ribos_rfam])
        prefixs = []
        self.helper.check_make_folder(self.tmp_files["fasta"])
        self.helper.check_make_folder(self.tmp_files["scan"])
        self.helper.check_make_folder(self.tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo)
        self._remove_overlap(self.gff_path)
        self._merge_results(args_ribo)
        mapping_ribos(self.table_folder, args_ribo.ribos_id)
        self._remove_tmp(args_ribo)
class TSSpredator(object):
    '''Prediction of TSS / processing sites by driving the external
    TSSpredator jar: builds a config file per genome, runs the jar,
    converts MasterTables to GFF and post-processes the results.'''

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of the temporary files/folders used during the run.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''Turn one "wig:tex:cond:rep:strand" split into a lib dict.'''
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix):
        '''Write "<prefix>_<cond><rep> = <wig path>" config lines,
        grouped by condition and sorted by replicate.'''
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list,
                                    key=lambda k: k['replicate'])
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        '''Invoke the TSSpredator jar, capturing stdout/stderr to files.'''
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        '''Parse the library strings, resolve per-strain wig filenames and
        write the fivePrime/annotation/genome config sections.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id);
        num_id is simply the last loop value, so lib_num must be >= 1.
        '''
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        # NOTE(review): "Runniun" is a typo in this user-facing message;
        # it is runtime text, so it is left untouched here.
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # Replace the generic wig name by the strain-specific file
            # ("<lib>_STRAIN_<strain>.wig") if present in the folder.
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # Bucket by tex/notex and strand.
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # TSS mode uses tex libs as fivePrime; processing mode uses notex.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus")
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus")
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus")
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''Write a complete TSSpredator .ini config for one genome.'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder,
                             project_strain_name, out, gff,
                             args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        out.write("minNumRepMatches = {0}\n".format(args_tss.repmatch))
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" tracks mirror _import_lib: notex for TSS mode,
        # tex for processing mode.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus")
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus")
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus")
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus")
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        '''Convert each genome's MasterTable.tsv into a GFF file.'''
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", args_tss.program, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''Merge manually-curated TSS with the predicted ones and move
        the comparison statistics into the per-genome stat folder.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # Find the annotation GFF matching this prefix.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(args_tss.out_folder,
                        "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''Compare predictions with genome annotation; the updated
        annotation is moved back over the original GFF.'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''Compare TSS predictions with transcript assemblies and
        re-sort both result GFFs.'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_Transcriptome_assembly_",
                    tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        '''Generate classification/library statistics and Venn plots,
        moving the produced PNG/TSV files into the stat folder.'''
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        '''Pair fasta/gff/wig files per genome and emit one config file
        for each detected genome. Returns the list of prefixes.'''
        prefixs = []
        detect = False
        # NOTE(review): detect is never reset to False after a match, so
        # once one genome has wig files, later fasta/gff pairs are
        # accepted without re-checking — confirm whether intended.
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''Concatenate matching forward/reverse wig files into
        tmp/merge_forward.wig and tmp/merge_reverse.wig.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                # A fresh Helper() is used here rather than self.helper.
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder,
                                                    wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder,
                                                    wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''Re-examine orphan TSS against the merged wig coverage and
        replace each prediction GFF in place.'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''Clean up all temporary files and folders.'''
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''Filter TSS against processing sites (or vice versa) unless
        the overlap feature is "both" (keep everything).'''
        if args_tss.overlap_feature.lower() == "both":
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''Remove low-expressed TSS from each prediction GFF and record
        the per-genome coverage cutoff in a statistics CSV.'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # Replace the original GFF with the filtered one.
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
                out.close()

    def run_tsspredator(self, args_tss):
        '''Entry point: prepare inputs, run TSSpredator per genome,
        convert and post-process (orphans, low expression, manual merge,
        overlap filtering, statistics, validation, TA comparison).'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        # From here on "processing_site" is renamed to "processing".
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class MEME(object):
    '''Detection of promoter motifs: extract sequences upstream of TSS,
    split them by TSS class and strain, and run MEME and/or GLAM2 on
    them, finally generating promoter tables.'''

    def __init__(self, args_pro):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # Per-TSS-class fasta files; "all"/"all_no_orph" are bare names
        # (joined with tmp_folder on use), the rest are full paths.
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder,
                                               "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        '''Return out_path/type_, removing a stale result sub-folder.'''
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro):
        '''Run MEME and/or GLAM2 with a single, specific motif width.'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder,
                                                     "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(out_path, folder,
                                                     "GLAM2")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder),
                  "-w", str(width), "-b", str(width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro):
        '''Run MEME and/or GLAM2 with a "min-max" range of widths.'''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]),
                           "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(out_path, folder,
                                                     "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100", "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        # BUGFIX: this used to be "elif", which made program == "both"
        # skip GLAM2 for ranged widths, unlike _run_normal_motif.
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(out_path, folder,
                                                     "GLAM2")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder),
                  "-a", str(min_width), "-b", str(max_width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        '''Return the fasta filename in fasta_path whose basename
        (without .fa/.fna/.fasta) equals prefix.'''
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
                    (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                    (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                    (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        '''Validate attribute uniqueness of every GFF in the folder.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Combine the per-class fastas into "all types" and "without
        orphan" files (duplicates removed) and move all of them into
        input_path with per-prefix names.'''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"], "_".join([
            out_prefix, "allstrain_primary.fa"]))
        shutil.move(self.fastas["sec"], "_".join([
            out_prefix, "allstrain_secondary.fa"]))
        shutil.move(self.fastas["inter"], "_".join([
            out_prefix, "allstrain_internal.fa"]))
        shutil.move(self.fastas["anti"], "_".join([
            out_prefix, "allstrain_antisense.fa"]))
        shutil.move(self.fastas["orph"], "_".join([
            out_prefix, "allstrain_orphan.fa"]))
        shutil.move(all_type, "_".join([
            out_prefix, "allstrain_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allstrain_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        '''Split each "allstrain" fasta into per-strain files (strain is
        parsed from the header as "_".join of fields 2..); single-strain
        split files are removed again as redundant.'''
        for fasta in os.listdir(input_path):
            if "allstrain" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allstrain")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                # NOTE(review): if a file contains no ">" header at all,
                # filename/strain would be stale from a previous file.
                if num_strain <= 1:
                    os.remove(os.path.join(
                        input_path, "".join([filename[0], strain,
                                             filename[-1]])))
        # BUGFIX: guard against out never having been opened (no .fa
        # files / no headers), which previously raised AttributeError.
        if out is not None:
            out.close()

    def _run_program(self, prefixs, args_pro):
        '''Run the selected motif program(s) for every prefix, fasta
        class and requested width.'''
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "MEME"))
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                          fasta, width))
                    # "10-20" style widths mean a min-max range.
                    if "-" in width:
                        self._run_small_motif(input_path, out_path,
                                              filename, fasta, width,
                                              args_pro)
                    else:
                        self._run_normal_motif(input_path, out_path,
                                               filename, fasta, width,
                                               args_pro)

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter'''
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        else:
            for tss in os.listdir(os.path.join(
                    args_pro.output_folder, "TSS_class")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all fasta files")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        # The merged input is already classified, so treat it as source.
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        '''Clean up all temporary folders of the run.'''
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program):
        '''generate the promoter table'''
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (
                    program.lower() == "meme"):
                for folder in os.listdir(os.path.join(
                        output_folder, strain, "MEME")):
                    gen_promoter_table(os.path.join(
                        output_folder, strain, "MEME", folder,
                        "meme.txt"),
                        os.path.join(output_folder, strain, "MEME",
                                     folder, "meme.csv"),
                        tss_file, "meme")
            if (program.lower() == "both") or (
                    program.lower() == "glam2"):
                for folder in os.listdir(os.path.join(
                        output_folder, strain, "GLAM2")):
                    gen_promoter_table(os.path.join(
                        output_folder, strain, "GLAM2", folder,
                        "glam2.txt"),
                        os.path.join(output_folder, strain, "GLAM2",
                                     folder, "glam2.csv"),
                        tss_file, "glam2")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            # Unclassified TSS need annotation + wig libs to classify.
            if (args_pro.gffs is None) or (
                    args_pro.tex_wigs is None) or (
                    args_pro.input_libs is None):
                print("Error: Please assign proper annotation, tex +/- "
                      "wig folder and tex treated libs!!!")
                sys.exit()
            if "TSS_class" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))
            print("Classifying TSS and extracting fasta {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_class",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def run_meme(self, args_pro):
        '''Entry point: prepare inputs, extract upstream sequences per
        genome, optionally build the combined "allfasta" set, run the
        motif program(s) and generate the promoter tables.'''
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("Generating the table")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program)
        self._remove_files(args_pro)
class Screen(object):
    '''Generation of IGV screenshot batch scripts.'''

    def __init__(self, args_sc):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Refuse to overwrite an existing screenshots folder: the batch
        # scripts and images would otherwise be silently mixed with old runs.
        out_folder = os.path.join(args_sc.output_folder, "screenshots")
        if os.path.exists(out_folder):
            print("Error: The {0} already exist!!!".format(
                  out_folder))
            sys.exit()
        else:
            os.mkdir(out_folder)
        args_sc.output_folder = out_folder
        # Strain name = fasta filename without its extension.
        filename = args_sc.fasta.split("/")[-1]
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(os.path.join(args_sc.output_folder,
                                                   self.strain))
        # One subfolder per strand for the generated screenshots.
        self.forward_file = os.path.join(args_sc.output_folder,
                                         self.strain, "forward")
        self.reverse_file = os.path.join(args_sc.output_folder,
                                         self.strain, "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, wig_path, lib_dict):
        """Fill lib_dict with tex (+/-) wig paths and their matching notex mates.

        A notex library is paired with a tex library when condition (field 2)
        and replicate (field 3) agree.
        """
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(os.path.join(wig_path, flib[0]))
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(os.path.join(wig_path, nlib[0]))

    def _collect_tex_libs(self, args_sc, lib_dict):
        """Validate and sort tex +/- libraries, then import them per strand."""
        f_texs = []
        r_texs = []
        for lib in args_sc.tlibs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            else:
                if lib_datas[-1] == "+":
                    f_texs.append(lib_datas)
                else:
                    r_texs.append(lib_datas)
        # Sort by (treatment, condition, replicate) so tex/notex pairing in
        # _import_libs sees a deterministic order.
        f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
        r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
        self._import_libs(f_texs, "+", args_sc.tex_wigs, lib_dict)
        self._import_libs(r_texs, "-", args_sc.tex_wigs, lib_dict)

    def _collect_frag_libs(self, args_sc, lib_dict):
        """Validate fragmented libraries and split them by strand."""
        for lib in args_sc.flibs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            else:
                if lib_datas[-1] == "+":
                    lib_dict["ff"].append(os.path.join(
                        args_sc.frag_wigs, lib_datas[0]))
                else:
                    lib_dict["rf"].append(os.path.join(
                        args_sc.frag_wigs, lib_datas[0]))

    def screenshot(self, args_sc):
        """Build the per-strand IGV batch scripts from the assigned libraries.

        Exits with an error when neither tex nor fragmented libraries are
        assigned.
        """
        # BUGFIX: this guard previously ran AFTER gen_screenshot, so the batch
        # scripts were generated (empty) before the error was reported.
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            print("Error: There are no wig file assigned!!!")
            sys.exit()
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [],
                    "ff": [], "rf": []}
        if args_sc.tlibs is not None:
            self._collect_tex_libs(args_sc, lib_dict)
        if args_sc.flibs is not None:
            self._collect_frag_libs(args_sc, lib_dict)
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
class Ribos(object):
    '''detection of riboswitch and RNA thermometer'''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        if args_ribo.tsss is not None:
            self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # CONSISTENCY FIX: run_ribos() compares args_ribo.program.lower(),
        # but this constructor compared it case-sensitively. A value such as
        # "Both" would skip folder setup here and later crash in run_ribos
        # with an AttributeError on the missing ribos_*/thermo_* attributes.
        program = args_ribo.program.lower()
        if (program == "both") or (program == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                args_ribo.ribos_out_folder, "riboswitch",
                args_ribo.database)
        if (program == "both") or (program == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                args_ribo.thermo_out_folder, "RNA_thermometer",
                args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        """Return the output/tmp paths, Rfam cm file and filename suffixes
        for one feature type ("riboswitch" or "RNA_thermometer")."""
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {"fasta": os.path.join(out_folder, "tmp_fasta"),
                     "scan": os.path.join(out_folder, "tmp_scan"),
                     "table": os.path.join(out_folder, "tmp_table")}
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {"csv": feature + ".csv",
                   "txt": feature + "_prescan.txt",
                   "re_txt": feature + "_scan.txt",
                   "re_csv": feature + "_scan.csv"}
        return (stat_folder, gff_outfolder, table_folder,
                scan_folder, tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files,
                    suffixs, rfam, log):
        """Run Infernal cmscan on seq against rfam; return the scan file path.

        args_ribo.cutoff is "e_<value>" (inclusion E-value, --incE) or
        "s_<value>" (inclusion score, --incT); anything else is fatal.
        """
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        scan = open(scan_file, "w")
        # Parse the cutoff once instead of re-splitting per branch.
        cutoff_type = args_ribo.cutoff.split("_")[0]
        value = args_ribo.cutoff.split("_")[-1]
        if cutoff_type == "e":
            log.write(" ".join([args_ribo.cmscan_path, "--incE",
                                value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incE",
                  value, "--acc", rfam, seq], stdout=scan)
        elif cutoff_type == "s":
            log.write(" ".join([args_ribo.cmscan_path, "--incT",
                                value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incT",
                  value, "--acc", rfam, seq], stdout=scan)
        else:
            print("Error: the --cutoff needs to start from 'e' "
                  "(e value) or 's' (score)!")
            log.write("the --cutoff needs to start from 'e' "
                      "(e value) or 's' (score).\n")
            sys.exit()
        scan.close()
        log.write("Done!\n")
        log.write("\t" + scan_file + " is temporary generated.\n")
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam, log):
        '''extract the seq of candidates and scanning the candidates'''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"], prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for {0}".format(
                      prefix))
                if self.tss_path is not None:
                    tss_file = os.path.join(self.tss_path, prefix + "_TSS.gff")
                else:
                    tss_file = None
                log.write("Running extract_RBS.py to extract potential "
                          "sequences of riboswitches/RNA thermometers for "
                          "{0}.\n".format(prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff), tss_file,
                    os.path.join(self.tran_path, prefix + "_transcript.gff"),
                    first_seq, args_ribo, feature)
                log.write("\t" + first_seq + " is temporary generated.\n")
                # First pass: pre-scan the full candidate set.
                print("Pre-scanning of {0}".format(prefix))
                log.write("Using Infernal to pre-scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is at "
                          "least 1.1.1.\n")
                first_scan_file = self._run_cmscan(
                    args_ribo, first_seq, "txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_seq = os.path.join(tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    tmp_files["table"], "_".join([prefix, suffixs["csv"]]))
                log.write("Running recompute_RBS.py to update the potential "
                          "sequences of riboswitches/RNA thermometers for "
                          "{0} based on the pre-scanning results.\n".format(
                              prefix))
                # Second pass: rescan only the sequences regenerated from the
                # pre-scan hits.
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                log.write("\t" + sec_seq + " is temporary generated.\n")
                print("Scanning of {0}".format(prefix))
                log.write("Using Infernal to scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is at "
                          "least 1.1.1.\n")
                sec_scan_file = self._run_cmscan(
                    args_ribo, sec_seq, "re_txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_table = os.path.join(
                    tmp_files["table"], "_".join([prefix, suffixs["re_csv"]]))
                log.write("Running recompute_RBS.py and modify_rbs_table.py "
                          "to generate tables for {0} "
                          "based on the scanning results.\n".format(prefix))
                reextract_rbs(sec_scan_file, first_table, sec_table,
                              args_ribo.cutoff)
                # The refined table replaces the pre-scan table in place.
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id, gff_outfolder,
                       feature, log):
        '''merge the results from the results of two searching'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                log.write("Merging the results from Infernal to generate "
                          "tables for {0}.\n".format(prefix))
                # Walk the gff once; each new seq_id contributes its
                # per-strain table and scan files to the merged outputs.
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        # First strain: copy; later strains: append.
                        if len(pre_strain) == 0:
                            shutil.copyfile(
                                os.path.join(
                                    tmp_files["table"],
                                    "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            self.helper.merge_file(
                                os.path.join(
                                    tmp_files["table"],
                                    "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(
                            os.path.join(
                                tmp_files["scan"],
                                "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(
                            os.path.join(
                                tmp_files["scan"],
                                "_".join([entry.seq_id, suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                log.write("The following files are generated.\n")
                for folder in (table_folder, scan_folder):
                    for file_ in os.listdir(folder):
                        log.write("\t" + os.path.join(folder, file_) + "\n")
                out_stat = os.path.join(
                    stat_folder,
                    "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                log.write("Running ribo_gff.py to do statistics and generate "
                          "gff files for {0}.\n".format(prefix))
                log.write("The following files are generated:\n")
                out_gff = os.path.join(gff_outfolder,
                                       "_".join([prefix, feature + ".gff"]))
                stat_and_covert2gff(
                    os.path.join(table_folder,
                                 "_".join([prefix, suffixs["csv"]])),
                    feature_id, out_gff, args_ribo.fuzzy, out_stat, feature)
                log.write("\t" + out_gff + "\n")
                log.write("\t" + out_stat + "\n")
                fh.close()

    def _remove_tmp(self, args_ribo):
        # Clean the per-input tmp subfolders created by the multiparser.
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs, type_, fuzzy,
                        log):
        """Drop overlapping riboswitch/thermometer hits from each tmp table."""
        log.write("Running rbs_overlap.py to remove the overlapping "
                  "riboswitches/RNA thermometers.\n")
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                # (Removed a redundant nested os.path.join wrapper.)
                tmp_table = os.path.join(
                    tmp_files["table"],
                    "_".join([gff.replace(".gff", ""), suffixs["csv"]]))
                rbs_overlap(tmp_table, os.path.join(gff_path, gff),
                            type_, fuzzy)
                log.write("\t" + tmp_table + " is updated.\n")

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder, type_, log):
        '''main part of detection'''
        # Fetch and compress the Rfam models for this feature.
        log.write("Running get_Rfam_ribo.py to get the information of "
                  "riboswitches/RNA thermometers from Rfam.\n")
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        log.write("Using Infernal to compress the Rfam data of "
                  "riboswitches/RNA thermometers.\n")
        log.write("Please make sure the version of Infernal is at least "
                  "1.1.1.\n")
        print("Compressing Rfam of " + feature)
        log.write(" ".join([args_ribo.cmpress_path, "-F", rfam]) + "\n")
        call([args_ribo.cmpress_path, "-F", rfam])
        log.write("Done!\n")
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(
            [], args_ribo, tmp_files, suffixs, feature, rfam, log)
        self._remove_overlap(self.gff_path, tmp_files, suffixs, type_,
                             args_ribo.fuzzy, log)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature, log)
        log.write("Running map_ribos.py to extract all the details "
                  "from Rfam.\n")
        mapping_ribos(table_folder, feature_id, feature)
        log.write("The following files are updated:\n")
        for file_ in os.listdir(table_folder):
            log.write("\t" + os.path.join(table_folder, file_) + "\n")
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo, log_t, log_r):
        """Entry point: validate inputs, then run riboswitch and/or
        RNA-thermometer prediction depending on args_ribo.program."""
        if args_ribo.fuzzy_rbs > 6:
            if log_t is not None:
                log_t.write("--fuzzy_rbs should be equal or less than 6!\n")
            if log_r is not None:
                log_r.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        if args_ribo.tsss is not None:
            self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_ribo.gffs, gff))
        program = args_ribo.program.lower()
        if (program == "both") or (program == "riboswitch"):
            print("Detecting riboswitches now")
            self._core_prediction(
                args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                self.ribos_tmp_files, self.ribos_table_folder,
                "riboswitch", self.ribos_scan_folder, self.ribos_suffixs,
                self.ribos_stat_folder, self.ribos_gff_outfolder,
                args_ribo.ribos_out_folder, "riboswitch", log_r)
        if (program == "both") or (program == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(
                args_ribo, args_ribo.thermo_id, self.thermo_rfam,
                self.thermo_tmp_files, self.thermo_table_folder,
                "RNA_thermometer", self.thermo_scan_folder,
                self.thermo_suffixs, self.thermo_stat_folder,
                self.thermo_gff_outfolder, args_ribo.thermo_out_folder,
                "thermometer", log_t)
        self._remove_tmp(args_ribo)
class TargetFasta(object):
    '''detection of sRNA target interaction'''

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Working folder for the per-strain updated sequences.
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        """Stage the reference fasta files and (re)create the output and
        working folders; return the staged reference folder path."""
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Start each run with empty fasta_files and tmp_tar folders.
        if os.path.exists(os.path.join(out_folder, "fasta_files")):
            shutil.rmtree(os.path.join(out_folder, "fasta_files"))
        os.mkdir(os.path.join(out_folder, "fasta_files"))
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def _collect_updated_strains(self, mut_table, out_name, out_folder):
        """Copy each mutated strain's fasta from tmp_tar into fasta_files.

        Returns the number of distinct strains found in the mutation table.
        Rows starting with "#" are comments.
        """
        pre_strain = None
        out = None
        strain_num = 0
        # BUGFIX: the mutation-table handle was never closed; use `with`.
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                if not row[0].startswith("#"):
                    if (pre_strain != row[0]):
                        strain_num = strain_num + 1
                        tmp_tar_name = "_".join([out_name, row[0]]) + ".fa"
                        fasta = os.path.join(out_folder, "fasta_files",
                                             tmp_tar_name)
                        if out is not None:
                            out.close()
                        out = open(fasta, "w")
                        if tmp_tar_name in os.listdir(
                                self.folders["tmp_tar"]):
                            with open(os.path.join(
                                    self.folders["tmp_tar"],
                                    tmp_tar_name)) as f_h:
                                for line in f_h:
                                    out.write(line)
                        else:
                            print("Error: No updated information of "
                                  "{0}.fa".format(row[0]))
                    pre_strain = row[0]
        # BUGFIX: guard against a table with no data rows, where `out`
        # stayed None and out.close() raised AttributeError.
        if out is not None:
            out.close()
        return strain_num

    def get_target_fasta(self, mut_table, tar_folder, ref_files, out_name,
                         out_folder, log):
        """Apply the mutation table to the reference sequences and write the
        merged target fasta into <out_folder>/fasta_files/<out_name>.fa."""
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        log.write("Running seq_editor.py for updating sequence.\n")
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"], out_name)
        print("Updating the reference sequences")
        strain_num = self._collect_updated_strains(
            mut_table, out_name, out_folder)
        out_seq = out_name + ".fa"
        if os.path.exists(out_seq):
            os.remove(out_seq)
        fasta_folder = os.path.join(out_folder, "fasta_files")
        if strain_num == 1:
            # Single strain: rewrite the header to the requested output name.
            o_s = open(out_seq, "w")
            for seq in os.listdir(fasta_folder):
                if seq.endswith(".fa"):
                    with open(os.path.join(fasta_folder, seq)) as t_h:
                        for line in t_h:
                            if len(line) != 0:
                                if line.startswith(">"):
                                    o_s.write(">" + out_name + "\n")
                                else:
                                    o_s.write(line)
                    os.remove(os.path.join(fasta_folder, seq))
            o_s.close()
        else:
            # Multiple strains: concatenate the per-strain fasta files.
            # BUGFIX: replaced os.system("cat ... >> ...") with a stdlib
            # copy — no shell, and paths with spaces no longer break.
            with open(out_seq, "ab") as merged:
                for seq in os.listdir(fasta_folder):
                    if seq.endswith(".fa"):
                        with open(os.path.join(fasta_folder, seq),
                                  "rb") as part:
                            shutil.copyfileobj(part, merged)
                        os.remove(os.path.join(fasta_folder, seq))
        shutil.move(out_seq, os.path.join(fasta_folder, out_seq))
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        log.write("\t" + os.path.join(fasta_folder, out_seq) +
                  " is generated.\n")
        print("Please use the new fasta files to remapping again.")
class sRNADetection(object): def __init__(self, args_srna): self.args_container = ArgsContainer() self.helper = Helper() self.multiparser = Multiparser() self.gff_output = os.path.join(args_srna.out_folder, "gffs") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.tss_path = self._check_folder_exist(args_srna.tss_folder) self.pro_path = self._check_folder_exist(args_srna.pro_folder) self.sorf_path = self._check_folder_exist(args_srna.sorf_file) self.fasta_path = os.path.join(args_srna.fastas, "tmp") self.tran_path = os.path.join(args_srna.trans, "tmp") self.term_path = self._check_folder_exist(args_srna.terms) self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs") self.prefixs = {"merge": os.path.join( args_srna.out_folder, "tmp_merge"), "utr": os.path.join( args_srna.out_folder, "tmp_utrsrna"), "normal": os.path.join( args_srna.out_folder, "tmp_normal"), "in_cds": os.path.join( args_srna.out_folder, "tmp_incds"), "merge_table": os.path.join( args_srna.out_folder, "tmp_merge_table"), "utr_table": os.path.join( args_srna.out_folder, "tmp_utrsrna_table"), "normal_table": os.path.join( args_srna.out_folder, "tmp_normal_table"), "in_cds_table": os.path.join( args_srna.out_folder, "tmp_incds_table"), "basic": os.path.join( args_srna.out_folder, "tmp_basic"), "energy": os.path.join( args_srna.out_folder, "tmp_energy")} self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"), "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")} self.best_table = os.path.join(self.table_output, "best") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.all_best = {"all_gff": os.path.join( self.gff_output, "all_candidates"), "best_gff": os.path.join(self.gff_output, "best"), "all_table": os.path.join( self.table_output, "all_candidates"), "best_table": os.path.join(self.table_output, "best")} def 
_check_folder_exist(self, folder): if folder is not None: path = os.path.join(folder, "tmp") else: path = None return path def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_format(self, blast_path, database, type_, db_file, err): call([os.path.join(blast_path, "makeblastdb"), "-in", database, "-dbtype", type_, "-out", db_file], stderr=err) def _formatdb(self, database, type_, out_folder, blast_path, database_type): err = open(os.path.join(out_folder, "log.txt"), "w") if (database.endswith(".fa")) or ( database.endswith(".fna")) or ( database.endswith(".fasta")): pass else: folders = database.split("/") filename = folders[-1] folder = "/".join(folders[:-1]) for fasta in os.listdir(folder): if (fasta.endswith(".fa")) or ( fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): if ".".join(fasta.split(".")[:-1]) == filename: database = os.path.join(folder, fasta) if database_type == "sRNA": change_format(database, "tmp_srna_database") os.remove(database) shutil.move("tmp_srna_database", database) db_file = ".".join(database.split(".")[:-1]) self._run_format(blast_path, database, type_, db_file, err) err.close() def _merge_frag_tex_file(self, files, args_srna): if (args_srna.frag_wigs is not None) and ( args_srna.tex_wigs is not None): self.helper.merge_file(files["frag_gff"], files["tex_gff"]) self.helper.merge_file(files["frag_csv"], files["tex_csv"]) shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) os.remove(files["frag_csv"]) os.remove(files["frag_gff"]) os.remove(files["tex_gff"]) elif (args_srna.frag_wigs is not None): shutil.move(files["frag_csv"], files["merge_csv"]) self.helper.sort_gff(files["frag_gff"], files["merge_gff"]) os.remove(files["frag_gff"]) elif (args_srna.tex_wigs is not None): shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) def 
_run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna): if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter")) files = {"frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None} if ("tss" in args_srna.import_info): tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff", prefix, None, None) else: tss = None if self.pro_path is not None: pro = self.helper.get_correct_file( self.pro_path, "_processing.gff", prefix, None, None) else: pro = None if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag_table", prefix])) args_srna = self.args_container.container_intersrna( "frag", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) intergenic_srna(args_srna) if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex_table", prefix])) args_srna = self.args_container.container_intersrna( "tex", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) intergenic_srna(args_srna) files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["normal"], prefix]) self._merge_frag_tex_file(files, args_srna) if "TSS_class" in os.listdir(args_srna.out_folder): tss = os.path.join(args_srna.out_folder, "TSS_class", prefix + "_TSS.gff") return tss def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna): if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) files = {"frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None} if args_srna.tex_wigs is not 
None: files["tex_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "tex", prefix, args_srna) utr_derived_srna(args_srna) if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "frag", prefix, args_srna) utr_derived_srna(args_srna) files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["utr"], prefix]) self._merge_frag_tex_file(files, args_srna) filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr) def _check_necessary_file(self, args_srna): if (args_srna.gffs is None) or (args_srna.trans is None) or ( (args_srna.tex_wigs is None) and ( args_srna.frag_wigs is None)): print("Error: lack required files!!!!") sys.exit() if args_srna.utr_srna: if (args_srna.tss_folder is None): print("Error: lack required TSS files for UTR " "derived sRNA detection!!!!") sys.exit() if (args_srna.pro_folder is None): print("Warning: lack Processing site files for UTR " "derived sRNA detection!!!") print("it may effect the results!!!!") self._check_gff(args_srna.gffs) self._check_gff(args_srna.trans) if args_srna.tss_folder is not None: self._check_gff(args_srna.tss_folder) self.multiparser.parser_gff(args_srna.tss_folder, "TSS") self.multiparser.combine_gff(args_srna.gffs, self.tss_path, None, "TSS") if args_srna.pro_folder is not None: self._check_gff(args_srna.pro_folder) 
self.multiparser.parser_gff(args_srna.pro_folder, "processing") self.multiparser.combine_gff(args_srna.gffs, self.pro_path, None, "processing") if args_srna.sorf_file is not None: self._check_gff(args_srna.sorf_file) self.multiparser.parser_gff(args_srna.sorf_file, "sORF") self.multiparser.combine_gff(args_srna.gffs, self.sorf_path, None, "sORF") if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or ( "blast_nr" in args_srna.import_info) or ( "blast_srna" in args_srna.import_info): if args_srna.fastas is None: print("Error: lack required fasta files for UTR " "derived sRNA detection!!!!") sys.exit() self.multiparser.parser_fasta(args_srna.fastas) self.multiparser.combine_fasta(args_srna.gffs, self.fasta_path, None) if args_srna.terms is not None: self._check_gff(args_srna.terms) self.multiparser.parser_gff(args_srna.terms, "term") self.multiparser.combine_gff(args_srna.gffs, self.term_path, None, "term") else: self.term_path = None def _run_program(self, args_srna): prefixs = [] tss = None for gff in os.listdir(args_srna.gffs): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") prefixs.append(prefix) print("Running sRNA detection of {0}....".format(prefix)) tran = self.helper.get_correct_file( self.tran_path, "_transcript.gff", prefix, None, None) gffs = {"merge": "_".join([self.prefixs["merge"], prefix]), "utr": "_".join([self.prefixs["utr"], prefix]), "normal": "_".join([self.prefixs["normal"], prefix])} csvs = {"merge": "_".join([ self.prefixs["merge_table"], prefix]), "utr": "_".join([self.prefixs["utr_table"], prefix]), "normal": "_".join([ self.prefixs["normal_table"], prefix])} tss = self._run_normal( prefix, gff, tran, args_srna.fuzzy_tsss["inter"], args_srna) if args_srna.utr_srna: print("Running UTR derived sRNA detection of {0}".format( prefix)) if tss is None: tss = self.helper.get_correct_file( self.tss_path, "_TSS.gff", prefix, None, None) if self.pro_path is not None: pro = self.helper.get_correct_file( self.pro_path, 
"_processing.gff", prefix, None, None) else: pro = None if tss is not None: self._run_utrsrna(gff, tran, prefix, tss, pro, args_srna) self._merge_srna(args_srna, gffs, csvs, prefix, os.path.join(args_srna.gffs, gff), tss) filter_frag(csvs["merge"], gffs["merge"]) self.helper.sort_gff(gffs["merge"], "_".join([self.prefixs["basic"], prefix])) return prefixs def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss): print("merging data of intergenic and UTR_derived sRNA...") merge_srna_gff(gffs, args_srna.in_cds, args_srna.cutoff_overlap, gff_file) merge_srna_table(gffs["merge"], csvs, os.path.join(args_srna.wig_path, "_".join([prefix, "forward.wig"])), os.path.join(args_srna.wig_path, "_".join([prefix, "reverse.wig"])), tss, args_srna) def _run_RNAfold(self, seq_file, vienna_path, sec_file): os.system(" ".join(["cat", seq_file, "|", os.path.join(vienna_path, "RNAfold"), "-p", ">", sec_file])) def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path, dot_path, vienna_path): detect = False for fasta in os.listdir(fasta_path): if fasta.endswith(".fa") and ( fasta.replace(".fa", "") == prefix): detect = True break if detect: detect = False seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix])) sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix])) self.helper.get_seq("_".join([self.prefixs["basic"], prefix]), os.path.join(fasta_path, fasta), seq_file) else: print("Error:There is not fasta file of {0}".format(prefix)) print("please check your imported information") sys.exit() tmp_path = os.path.join(out_folder, "tmp_srna") self.helper.check_make_folder(tmp_path) main_path = os.getcwd() os.chdir(tmp_path) sec_file = os.path.join(main_path, sec_file) seq_file = os.path.join(main_path, seq_file) tmp_sec_path = os.path.join(main_path, sec_path) tmp_dot_path = os.path.join(main_path, dot_path) self._run_RNAfold(seq_file, vienna_path, sec_file) extract_energy(os.path.join(main_path, "_".join([self.prefixs["basic"], prefix])), sec_file, 
os.path.join(main_path, "_".join([self.prefixs["energy"], prefix])))
        # NOTE(review): the line above continues a statement begun before this
        # chunk (tail of the secondary-structure/energy setup method).
        # ViennaRNA names its PostScript output after sequence IDs; replace
        # "|" so the filenames are safe for downstream shell/file handling.
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path,
                "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file):
        '''Run ViennaRNA relplot.pl to annotate a secondary-structure plot
        (file_) with pair probabilities from the dot plot (dot_file),
        writing the result to rel_file inside tmp_paths["tmp"].'''
        os.system(" ".join([os.path.join(vienna_util, "relplot.pl"),
                            os.path.join(tmp_paths["tmp"], file_),
                            os.path.join(tmp_paths["tmp"], dot_file),
                            ">", os.path.join(tmp_paths["tmp"], rel_file)]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        '''Convert one PostScript plot in tmp_paths["tmp"] to PDF.'''
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_), pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths, ps2pdf14_path,
                           prefix):
        '''Re-plot every *ss.ps structure with pair-probability coloring,
        convert the resulting rss.ps/dp.ps files to PDF, and sort the PDFs
        into per-genome sec/dot output folders.'''
        # ViennaRNA wrote the *ss.ps / *dp.ps files into the CWD.
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths,
                                  file_, pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
            tmp_paths["tmp"],
            os.path.join(tmp_paths["sec"], prefix), ["rss.pdf"])
        self.helper.move_all_content(
            tmp_paths["tmp"],
            os.path.join(tmp_paths["dot"], prefix), ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        '''Run ViennaRNA mountain.pl on a dot plot, piping the mountain-plot
        coordinates into the open file handle `out`.'''
        call([os.path.join(vienna_util, "mountain.pl"),
              os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix,
                       vienna_util):
        '''If `mountain` is truthy, generate a mountain plot PDF for every
        dp.ps dot plot of this genome prefix and move it into the
        per-genome mountain-plot folder.'''
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            # NOTE(review): txt_path is created but never used afterwards
            # in this method — presumably a leftover; confirm before removal.
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    moun_txt = os.path.join(tmp_paths["tmp"],
                                            "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths,
                                       dot_file, out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix,
                                             moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        '''Compute secondary structure and folding energy for every genome
        prefix: clean the output folders, fold the candidate sRNAs with
        ViennaRNA, and regenerate the sec/dot/mountain plots.'''
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(
                self.fasta_path, args_srna.out_folder, prefix, sec_path,
                dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path, tmp_paths,
                                prefix, args_srna.vienna_util)
            # Clean the stray .ps files ViennaRNA left in the CWD, then
            # fold the energy-annotated table back into the basic one.
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
        shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        '''Invoke a BLAST+ program (blastn/blastx) against `database` with
        e-value cutoff `e`, restricted to `strand` ("plus"/"minus"/"both"),
        writing hits to blast_file.'''
        call([os.path.join(blast_path, program), "-db", database,
              "-evalue", str(e), "-strand", strand, "-query", seq_file,
              "-out", blast_file])

    def _get_strand_fasta(self, seq_file, out_folder):
        '''Split a FASTA of sRNA candidates into two files by strand.

        Headers are expected to end with "+" or "-"; sequence lines are
        routed to the file of the most recent header's strand. Returns
        the (plus, minus) temp-file paths.'''
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        out_p = open(tmp_plus, "w")
        out_m = open(tmp_minus, "w")
        strand = ""
        with open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        out_p.close()
        out_m.close()
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        '''Run BLAST of the candidate sRNAs of every prefix against
        `database` (formatting it first if requested) and merge the hits
        back into the basic sRNA files.

        For the "nr" database the query is split by strand and the two
        strand-specific runs are merged into one result file.'''
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                    args_srna.out_folder, "blast_result_and_misc",
                    "_".join([database_type, "blast",
                              prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0}".format(prefix))
                seq_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["sRNA_seq", prefix]))
                # Extract the candidate sequences only once per prefix.
                if seq_file not in os.listdir(args_srna.out_folder):
                    self.helper.get_seq(
                        srna_file,
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        seq_file)
                if database_type == "nr":
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                        seq_file, args_srna.out_folder)
                    # NOTE(review): single-argument os.path.join — the file
                    # lands in the CWD; presumably intentional, verify.
                    tmp_blast = os.path.join("tmp_blast.txt")
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_plus, tmp_blast,
                                    "plus")
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_minus, blast_file,
                                    "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, seq_file, blast_file,
                                    "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''Classify detected sRNAs per genome prefix into per-class GFF
        files and generate a table for each class.'''
        # NOTE(review): this condition is a tautology — a length can never
        # equal both 1 and 0, so the "or" is always True. Probably "and"
        # was intended; left unchanged here to preserve behavior.
        if (len(args_srna.import_info) != 1) or (
                len(args_srna.import_info) != 0):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(class_table,
                                                           prefix))
                self.helper.check_make_folder(os.path.join(class_gff,
                                                           prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path, "_".join([
                        "stat_sRNA_class", prefix + ".csv"]))
                classify_srna(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])),
                              class_gff, out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                        class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table)

    def _get_best_result(self, prefixs, args_srna):
        '''Select the best sRNA candidates of every prefix and emit the
        corresponding best GFF and CSV table.'''
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(os.path.join(self.all_best["all_gff"],
                                       "_".join([prefix, "sRNA.gff"])),
                          best_gff, args_srna)
            gen_srna_table(os.path.join(self.all_best["best_gff"],
                                        "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, best_table)

    def _remove_file(self, args_srna):
        '''Remove all temporary files and folders produced by the run.'''
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_",
                                       "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        # merge_wigs only exists when both wig sets were merged.
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        '''Apply the optional filters selected via import_info: secondary
        structure/energy, BLAST vs nr, BLAST vs sRNA database, and
        overlap with sORF predictions.'''
        if "sec_str" in args_srna.import_info:
            self._compute_2d_and_energy(args_srna, prefixs)
        if "blast_nr" in args_srna.import_info:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr",
                        args_srna.e_nr)
        if "blast_srna" in args_srna.import_info:
            self._blast(args_srna.srna_database, args_srna.srna_format,
                        "nucl", args_srna, prefixs, "blastn", "sRNA",
                        args_srna.e_srna)
        if "sorf" in args_srna.import_info:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])),
                        tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))

    def _import_info_format(self, import_info):
        '''Return import_info with every entry lower-cased.'''
        new_info = []
        for info in import_info:
            info = info.lower()
            new_info.append(info)
        return new_info

    def _gen_table(self, prefixs, args_srna):
        '''Generate the "all candidates" CSV table for every prefix.'''
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(os.path.join(self.all_best["all_gff"],
                                        "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table)

    def _print_rank_all(self, prefixs):
        '''Write ranking information of all candidates into the tables.'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        '''Filter UTR-derived sRNAs shorter than min_utr.'''
        for prefix in prefixs:
            filter_utr(os.path.join(self.all_best["all_gff"],
                                    "_".join([prefix, "sRNA.gff"])),
                       os.path.join(self.all_best["all_table"],
                                    "_".join([prefix, "sRNA.csv"])),
                       min_utr)

    def _antisense(self, gffs, prefixs):
        '''Annotate antisense relationships of all/best sRNAs against the
        genome annotation of each prefix.'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        '''Produce per-genome statistics of the BLAST classification from
        the "best" tables.'''
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path,
                "stat_" + srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        '''Associate sRNAs with terminators and (via TSS) promoters when
        the corresponding inputs were provided.'''
        if ("term" in args_srna.import_info) and (
                self.term_path is not None):
            compare_srna_term(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])),
                              out_table, os.path.join(self.term_path,
                              "_".join([prefix, "term.gff"])),
                              args_srna.fuzzy_b, args_srna.fuzzy_a)
        if ("promoter" in args_srna.import_info) and (
                args_srna.promoter_table is not None) and (
                "tss" in args_srna.import_info):
            compare_srna_promoter(os.path.join(self.all_best["all_gff"],
                                  "_".join([prefix, "sRNA.gff"])),
                                  out_table, args_srna)

    def run_srna_detection(self, args_srna):
        '''Entry point: detect sRNAs, apply the selected filters, compare
        with terminators/promoters, classify, pick the best candidates,
        and clean up temporary data.'''
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path,
                                     None, "transcript")
        args_srna.import_info = self._import_info_format(
            args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile("_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.all_best["all_gff"],
                                         "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]),
                prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if "blast_srna" in args_srna.import_info:
            self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class TSSpredator(object):
    '''Wrapper around the TSSpredator tool: generates its config files,
    runs it per genome, converts the MasterTable to GFF, and produces
    comparison statistics against annotations/transcripts.'''

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # All per-genome TSSpredator MasterTables live under this folder.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of the temporary files/folders used during merging/sorting.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        # NOTE(review): manual_path is only defined when args_tss.manual is
        # given; later code must not touch it otherwise.
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder,
                                           "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        '''Map one split library string (wig:tex:cond:rep:strand) to a dict.'''
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix,
                   rep_set):
        '''Write the per-condition library lines of a TSSpredator config.

        For every condition 1..lib_num the matching libraries are sorted by
        replicate and written; replicates missing for a condition get an
        empty assignment so TSSpredator sees a complete grid.'''
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list,
                                    key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): `cond` below is the last loop variable; if
            # cond_list is empty for this condition it is unbound (or stale
            # from the previous condition) — verify inputs always cover
            # every condition.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path,
                      prefix, log):
        '''Run the TSSpredator jar for one genome, capturing stdout/stderr
        into log.txt/err.txt inside its MasterTable folder.'''
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(
            out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name, out,
                    gff, program, fasta):
        '''Parse the library strings, resolve each wig filename to the
        strain-specific file in wig_folder, bucket libraries by
        tex/notex and strand, and write the five-prime config lines.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id).'''
        # fp/fm: tex-treated +/- strand; np/nm: untreated +/- strand.
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            # Replace the generic wig name by the strain-specific file
            # (multiparser splits wigs into <name>_STRAIN_<strain>.wig).
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # TSS uses tex-treated libs as the "fivePrime" signal; PS (processing
        # sites) uses the untreated libs instead.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            # NOTE(review): "assing" is a typo in this runtime message
            # (should be "assign"); left untouched in this doc-only pass.
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match

        Write the minNumRepMatches config entries. An "all_<n>" entry wins
        globally; otherwise the most common per-library value becomes the
        global default and deviating libraries get their own entry.'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            # NOTE(review): max_rep is only bound inside this loop; nums is
            # non-empty whenever repmatch is, so the break always runs.
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file, log):
        '''generation of config files

        Write the full TSSpredator .ini for one genome, including library
        assignments, thresholds from args_tss, and the output folder.'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder,
                             project_strain_name, out, gff,
                             args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # "normal" background libraries are the complement of the ones used
        # as the fivePrime signal in _import_lib.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        '''Convert every genome's MasterTable.tsv to a GFF file in the
        output folder (feature TSS or processing_site).'''
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                    out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                    out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual
        detected TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            # Find the annotation GFF matching this genome prefix; the loop
            # variable `gff` is reused after the break below.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(
                tss)
            if os.path.exists(manual):
                # (typo "classiflying" is part of the runtime message.)
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual,
                    fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder,
                    "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            # The annotation file is replaced by the validated version.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss, "".join([
                    "stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta),
                            compare_file, stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                # Re-sort the updated TSS and transcript GFFs in place.
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        '''Produce class/library statistics and Venn plots for every
        genome's TSS/PS GFF, moving the images into the per-genome
        statistics folder.'''
        # (typo "statistaics" is part of the runtime message.)
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        '''For every fasta with a matching annotation GFF and wig files,
        generate a TSSpredator config file; return the genome prefixes.'''
        prefixs = []
        detect = False
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            # NOTE(review): `run` is assigned but never used, and `detect`
            # is never reset to False between fastas — once a wig matched
            # any genome, later genomes pass the check too. Verify intent.
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff),
                            self.wig_path,
                            os.path.join(self.fasta_path, fasta),
                            config, log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        '''Concatenate all wig files of `prefix` into tmp/merge_forward.wig
        and tmp/merge_reverse.wig, split by library strand.'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder,
                                                    wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder,
                                                    wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        '''Delete the temporary folders and merged wig files of the run.'''
        # (typo "temperary" is part of the runtime message.)
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            # Filter each output GFF against its opposite-feature reference
            # (TSS vs processing site) within the cluster distance.
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.overlap_gffs, "_TSS.gff",
                            tss.replace("_processing.gff", ""),
                            None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix,
                        "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): `out` is only bound inside the loop; if no GFF
        # matched, this raises NameError, and only the last handle is
        # closed when several matched. Verify whether close belongs inside.
        out.close()

    def run_tsspredator(self, args_tss, log):
        '''Entry point: prepare inputs, run TSSpredator per genome,
        convert/post-process its output (orphan check, low-expression
        filter, manual merge, TSS-vs-PS overlap), and do statistics.'''
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path,
                                           "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder,
                                "TSSstatistics.tsv"))
        # From here on "ps" is referred to as "processing" in filenames.
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check "
                      "orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join([
                    "_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs,
                                         self.manual_path, None, None)
            self.multiparser.combine_fasta(args_tss.gffs,
                                           self.fasta_path, None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge "
                      "the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
class Crispr(object):
    '''Detection of CRISPR

    Runs the CRT tool on every genome fasta, converts its text output to
    GFF (all candidates plus a "best" set that does not overlap gene
    annotation), and produces repeat-unit statistics.'''

    def __init__(self, args_cris):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        self.gff_path = os.path.join(args_cris.gffs, "tmp")
        self.fasta_path = os.path.join(args_cris.fastas, "tmp")
        self.stat_folder = os.path.join(args_cris.out_folder,
                                        "statistics")
        self.gff_out = os.path.join(args_cris.out_folder, "gffs")
        self.all_out = os.path.join(args_cris.out_folder, "gffs",
                                    "all_candidates")
        self.best_out = os.path.join(args_cris.out_folder, "gffs",
                                     "best")
        self.helper.check_make_folder(self.all_out)
        self.helper.check_make_folder(self.best_out)
        # Raw CRT text output is kept here before conversion to GFF.
        self.data_folder = os.path.join(args_cris.out_folder,
                                        "CRT_output")
        self.helper.check_make_folder(self.data_folder)
        self.helper.check_make_folder(self.stat_folder)

    def _run_crt(self, args_cris):
        '''Running CRT

        Invoke the CRT jar once per genome fasta with the repeat/spacer
        length bounds and search window from args_cris.'''
        print("Running CRT")
        for seq in os.listdir(self.fasta_path):
            prefix = ".".join(seq.split(".")[:-1])
            call(["java", "-cp", args_cris.crt_path, "crt",
                  "-minNR", str(args_cris.min_num_r),
                  "-minRL", str(args_cris.min_len_r),
                  "-maxRL", str(args_cris.max_len_r),
                  "-minSL", str(args_cris.min_len_s),
                  "-maxSL", str(args_cris.max_len_s),
                  "-searchWL", str(args_cris.win_size),
                  os.path.join(self.fasta_path, seq),
                  os.path.join(self.data_folder, prefix + ".txt")])

    def _read_gff(self, txt):
        '''Read the annotation GFF matching a CRT output file and return
        its gene/CDS/tRNA/rRNA entries.'''
        gffs = []
        gh = open(os.path.join(
            self.gff_path, txt.replace(".txt", ".gff")), "r")
        for entry in Gff3Parser().entries(gh):
            if (entry.feature == "gene") or (
                    entry.feature == "CDS") or (
                    entry.feature == "tRNA") or (
                    entry.feature == "rRNA"):
                gffs.append(entry)
        gh.close()
        return gffs

    def _compare_gff(self, strain, start, end, gffs, bh, indexs,
                     ignore_hypo):
        '''Compare CRISPR and genome annotation to remove
        the false positives

        Writes the CRISPR to the "best" GFF handle `bh` only when it does
        not overlap an annotated feature (hypothetical proteins may be
        ignored via ignore_hypo). Returns (overlap, id_); id_ is None
        when the candidate overlapped.'''
        overlap = False
        id_ = None
        for gff in gffs:
            if (gff.seq_id == strain):
                # Any containment or partial overlap counts.
                if ((gff.start <= start) and (gff.end >= end)) or (
                        (gff.start >= start) and (gff.end <= end)) or (
                        (gff.start <= start) and (gff.end > start) and (
                        gff.end <= end)) or (
                        (gff.start >= start) and (gff.start < end) and (
                        gff.end >= end)):
                    if "product" in gff.attributes.keys():
                        if ((not ignore_hypo) and (
                                "hypothetical protein" in
                                gff.attributes["product"])) or (
                                "hypothetical protein" not in
                                gff.attributes["product"]):
                            overlap = True
        if not overlap:
            id_ = "CRISPR_" + str(indexs["best"])
            attribute = ";".join(["ID=" + strain + "_" + id_,
                                  "method=CRT"])
            bh.write("\t".join([strain, "ANNOgesic", "CRISPR",
                                str(start), str(end),
                                ".", ".", ".", attribute]) + "\n")
            indexs["best"] += 1
        return overlap, id_

    def _print_repeat(self, row, strain, file_h, indexs, id_, best):
        '''Print the repeat units

        Write one repeat_unit GFF line for a CRT data row (skipping CRT's
        header/separator rows); the repeat end is derived from the repeat
        sequence length in column 2. Returns the updated repeat counter.'''
        if best:
            num = indexs["re_best"]
        else:
            num = indexs["re_all"]
        if (not row[0].startswith("-")) and (
                not row[0].startswith("Repeats:")) and (
                not row[0].startswith("CRISPR")) and (
                not row[0].startswith("POSITION")):
            start = row[0].strip()
            end = str(int(start) + len(row[2].strip()) - 1)
            attribute = ";".join([
                "ID=" + strain + "_Repeat_" + str(num),
                "method=CRT", "Parent=" + id_])
            file_h.write("\t".join([strain, "ANNOgesic", "repeat_unit",
                                    start, end, ".", ".", ".",
                                    attribute]) + "\n")
            num += 1
        # "Repeats:" marks the end of one CRISPR block in CRT output.
        if row[0].startswith("Repeats:"):
            indexs["run"] = False
        return num

    def _convert_gff(self, ignore_hypo):
        '''Convert the final CRT output to gff format'''
        for txt in os.listdir(self.data_folder):
            gffs = self._read_gff(txt)
            fh = open(os.path.join(self.data_folder, txt), "r")
            oh = open(os.path.join(
                self.all_out, txt.replace(".txt", "_CRISPR.gff")), "w")
            bh = open(os.path.join(
                self.best_out, txt.replace(".txt", "_CRISPR.gff")), "w")
            indexs = {"all": 0, "re_all": 0, "best": 0, "re_best": 0,
                      "run": False}
            # NOTE(review): strain/overlap/id_ are bound only when their
            # header rows appear first in the CRT file; parsing relies on
            # CRT's fixed output order (ORGANISM, then CRISPR blocks).
            for row in csv.reader(fh, delimiter='\t'):
                if len(row) != 0:
                    if row[0].startswith("ORGANISM:"):
                        strain = row[0].split(" ")[-1]
                    elif row[0].startswith("CRISPR"):
                        end = row[0].split("-")[-1].strip()
                        start = row[0].split("-")[0].split(
                            ":")[-1].strip()
                        id_ = "CRISPR_" + str(indexs["all"])
                        attribute = ";".join(
                            ["ID=" + strain + "_" + id_, "method=CRT"])
                        oh.write("\t".join([
                            strain, "ANNOgesic", "CRISPR", start, end,
                            ".", ".", ".", attribute]) + "\n")
                        overlap, over_id = self._compare_gff(
                            strain, int(start), int(end), gffs, bh,
                            indexs, ignore_hypo)
                        indexs["all"] += 1
                        indexs["run"] = True
                    # Within a CRISPR block, emit its repeat-unit rows.
                    if indexs["run"]:
                        indexs["re_all"] = self._print_repeat(
                            row, strain, oh, indexs, id_, False)
                        if not overlap:
                            indexs["re_best"] = self._print_repeat(
                                row, strain, bh, indexs, over_id, True)
            fh.close()
            oh.close()
            bh.close()

    def _stat_and_correct(self, stats, folder):
        '''do statistics and print the final gff file

        Re-number CRISPR/repeat IDs sequentially per file and count, per
        strain and overall, how many CRISPRs have how many repeat units.'''
        for gff in os.listdir(folder):
            prefix = gff.replace("_CRISPR.gff", "")
            stats[prefix] = {"all": {"cri": 0, "re": {}}}
            gh = open(os.path.join(folder, gff), "r")
            oh = open("tmp_cri.gff", "w")
            oh.write("##gff-version 3\n")
            cr_num = 0
            re_num = 0
            first = True
            # NOTE(review): `repeat` is the running repeat count of the
            # current CRISPR; it is flushed into the stats when the next
            # CRISPR starts and once more after the loop. A repeat_unit
            # appearing before any CRISPR entry would hit an unbound
            # `repeat`/`id_` — relies on well-formed input GFFs.
            for entry in Gff3Parser().entries(gh):
                if entry.seq_id not in stats[prefix].keys():
                    stats[prefix][entry.seq_id] = {"cri": 0, "re": {}}
                if entry.feature == "CRISPR":
                    id_ = "CRISPR_" + str(cr_num)
                    attribute = ";".join(
                        ["ID=" + entry.seq_id + "_" + id_,
                         "method=CRT"])
                    cr_num += 1
                    if first:
                        first = False
                    else:
                        if repeat not in stats[prefix][
                                entry.seq_id]["re"].keys():
                            stats[prefix][entry.seq_id]["re"][
                                repeat] = 1
                        else:
                            stats[prefix][entry.seq_id]["re"][
                                repeat] += 1
                        if repeat not in stats[prefix]["all"][
                                "re"].keys():
                            stats[prefix]["all"]["re"][repeat] = 1
                        else:
                            stats[prefix]["all"]["re"][repeat] += 1
                    repeat = 0
                    stats[prefix][entry.seq_id]["cri"] += 1
                    stats[prefix]["all"]["cri"] += 1
                elif entry.feature == "repeat_unit":
                    attribute = ";".join([
                        "ID=" + entry.seq_id + "_Repeat_" + str(re_num),
                        "method=CRT", "Parent=" + id_])
                    re_num += 1
                    repeat += 1
                oh.write("\t".join([entry.info_without_attributes,
                                    attribute]) + "\n")
            if not first:
                if repeat not in stats[prefix][
                        entry.seq_id]["re"].keys():
                    stats[prefix][entry.seq_id]["re"][repeat] = 1
                else:
                    stats[prefix][entry.seq_id]["re"][repeat] += 1
                if repeat not in stats[prefix]["all"]["re"].keys():
                    stats[prefix]["all"]["re"][repeat] = 1
                else:
                    stats[prefix]["all"]["re"][repeat] += 1
            gh.close()
            oh.close()
            os.remove(os.path.join(folder, gff))
            shutil.move("tmp_cri.gff", os.path.join(folder, gff))

    def _print_file(self, sh, cri_res_all, cri_res_best):
        '''Write one strain's CRISPR counts (all vs non-overlapping) and
        its repeat-unit histogram to the stats handle.'''
        sh.write("\tthe number of CRISPR - {0}\n".format(
            cri_res_all["cri"]))
        for index, num in cri_res_all["re"].items():
            sh.write("\t\tCRISPR with {0} repeat units - {1}\n".format(
                index, num))
        sh.write("\tthe number of CRISPR which not overlap "
                 "with genome annotation - {0}\n".format(
                     cri_res_best["cri"]))
        for index, num in cri_res_best["re"].items():
            sh.write("\t\tCRISPR with {0} repeat units - {1}\n".format(
                index, num))

    def _print_stat(self, stats):
        '''print the statistics file

        One CSV per prefix; with more than one strain an aggregated
        "All strains" section precedes the per-strain sections.'''
        for prefix, strains in stats["all"].items():
            sh = open(os.path.join(self.stat_folder, prefix + ".csv"),
                      "w")
            # strains always contains the synthetic "all" key, so length 1
            # means no real strain had a hit.
            if len(strains) == 1:
                sh.write("No CRISPR can be detected")
            elif len(strains) <= 2:
                for strain, cri_res in strains.items():
                    if strain != "all":
                        sh.write(strain + ":\n")
                        self._print_file(sh, cri_res,
                                         stats["best"][prefix][strain])
            else:
                sh.write("All strains:\n")
                self._print_file(sh, stats["all"][prefix]["all"],
                                 stats["best"][prefix]["all"])
                for strain, cri_res in strains.items():
                    if strain != "all":
                        sh.write(strain + ":\n")
                        if strain not in stats["best"][prefix].keys():
                            stats["best"][prefix][strain] = {
                                "cri": 0, "re": {}}
                        self._print_file(sh, cri_res,
                                         stats["best"][prefix][strain])
            sh.close()

    def run_crispr(self, args_cris):
        '''detection of CRISPR

        Entry point: run CRT, convert its output to all/best GFFs,
        combine them per genome, compute statistics, and clean up.'''
        self.multiparser.parser_fasta(args_cris.fastas)
        self.multiparser.parser_gff(args_cris.gffs, None)
        self._run_crt(args_cris)
        self._convert_gff(args_cris.ignore_hypo)
        print("All candidates:")
        self.multiparser.combine_gff(args_cris.gffs, self.all_out, None,
                                     "CRISPR")
        print("Best candidates:")
        self.multiparser.combine_gff(args_cris.gffs, self.best_out, None,
                                     "CRISPR")
        stats = {"all": {}, "best": {}}
        self._stat_and_correct(stats["all"], self.all_out)
        self._stat_and_correct(stats["best"], self.best_out)
        self._print_stat(stats)
        self.helper.remove_tmp_dir(args_cris.gffs)
        self.helper.remove_tmp_dir(args_cris.fastas)
class CircRNADetection(object):
    '''Detection of circular RNAs: align reads with segemehl, detect splice
    sites with testrealign.x, then compare candidates with the annotation.'''

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output layout under args_circ.output_folder.
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_align")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Fixed file/folder names produced by testrealign.x.
        self.splices = {"all_file": "splicesites_all.bed",
                        "file": "splicesites.bed",
                        "all": "splicesites_all", "splice": "splicesites"}
        self.trans = {"all_file": "transrealigned_all.bed",
                      "file": "transrealigned.bed",
                      "all": "transrealigned_all", "trans": "transrealigned"}
        self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
        # Genome fasta folder is mandatory when alignment is requested.
        # NOTE(review): both branches assign the same fasta_path; the split
        # exists only for the missing-fasta error in align mode.
        if args_circ.align:
            if args_circ.fastas is None:
                print("Error: There is no genome fasta file!!!")
                sys.exit()
            else:
                self.fasta_path = os.path.join(args_circ.fastas, "tmp")
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        # Block until every child exits, close its pipes, and kill leftovers
        # (kill on an exited process raises OSError, which is ignored).
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_folder):
        # Decompress .bz2/.gz read files into plain fasta files; returns the
        # list of created (temporary) files so they can be removed later.
        tmp_reads = []
        for read in os.listdir(read_folder):
            if read.endswith(".bz2"):
                mod_read = read.replace(".bz2", "")
                # Ensure the unpacked file carries a fasta extension.
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read), "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["bzcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
            elif read.endswith(".gz"):
                mod_read = read.replace(".gz", "")
                if (".fa" not in mod_read) and (
                        ".fasta" not in mod_read) and (
                        ".fna" not in mod_read):
                    mod_read = mod_read + ".fa"
                read_out = open(os.path.join(read_folder, mod_read),
                                "w")
                tmp_reads.append(os.path.join(read_folder, mod_read))
                print(" ".join(["unzip", read]))
                call(["zcat", os.path.join(read_folder, read)],
                     stdout=read_out)
                read_out.close()
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta):
        # Build the segemehl index (-x) for one genome fasta (-d).
        call([os.path.join(segemehl_path, "segemehl.x"),
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix):
        # Start one asynchronous segemehl mapping of `read` against `fasta`;
        # -S enables split-read (splice) alignment. Caller waits on the Popen.
        out = open(os.path.join(self.alignment_path,
                   fasta_prefix, sam_file), "w")
        log = open(os.path.join(self.alignment_path,
                   fasta_prefix, log_file), "w")
        p = Popen([os.path.join(args_circ.segemehl_path, "segemehl.x"),
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", os.path.join(args_circ.read_folder, read), "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ):
        # Map every read file against every genome, at most args_circ.cores
        # mappings in flight at once. Returns ("read_genome" basenames,
        # genome prefixes).
        prefixs = []
        align_files = []
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path, index, fasta)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                self.alignment_path, fasta_prefix))
            for read in os.listdir(args_circ.read_folder):
                num_process += 1
                if read.endswith(".fa") or \
                   read.endswith(".fna") or \
                   read.endswith("fasta"):
                    filename = read.split(".")
                    read_prefix = ".".join(filename[:-1])
                    sam_file = "_".join([read_prefix, fasta_prefix + ".sam"])
                    log_file = "_".join([read_prefix, fasta_prefix + ".log"])
                    align_files.append("_".join([read_prefix, fasta_prefix]))
                    print("mapping {0}".format(sam_file))
                    p = self._run_segemehl_align(
                        args_circ, index, fasta, read,
                        sam_file, log_file, fasta_prefix)
                    processes.append(p)
                    # Throttle: drain the batch once `cores` jobs are running.
                    if num_process == args_circ.cores:
                        self._wait_process(processes)
                        num_process = 0
            self._wait_process(processes)
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam, out_bam):
        # SAM -> BAM via `samtools view -bS`.
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path, align_files):
        # Convert every .sam in the folder to .bam. Returns (all bam files,
        # bams created here that were not in align_files, sams to delete).
        bam_files = []
        convert_ones = []
        remove_ones = []
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Convert {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam, out_bam)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path,
                                 sub_alignment_path, bam_files):
        # Merge all bams into whole_reads.bam (copy when only one), sort it
        # into whole_reads_sort.bam, and drop the unsorted merge product.
        print("Merge all bam files....")
        whole_bam = os.path.join(sub_alignment_path, self.bams["whole"])
        if len(bam_files) <= 1:
            shutil.copyfile(bam_files[0], whole_bam)
        else:
            file_line = " ".join(bam_files)
            os.system(" ".join([samtools_path, "merge",
                                whole_bam, file_line]))
        print("Sort bam files....")
        call([samtools_path, "sort", "-o",
              os.path.join(sub_alignment_path,
                           self.bams["sort"] + ".bam"), whole_bam])
        os.remove(os.path.join(sub_alignment_path, self.bams["whole"]))

    def _run_samtools_convert_sam(self, samtools_path, sub_alignment_path):
        # BAM -> SAM (with header) for testrealign.x, which reads SAM.
        print("Convert whole reads bam file to sam file....")
        call([samtools_path, "view", "-h", "-o",
              os.path.join(sub_alignment_path, self.bams["sort"] + ".sam"),
              os.path.join(sub_alignment_path, self.bams["sort"] + ".bam")])

    def _merge_sort_aligment_file(self, bam_files, samtools_path,
                                  sub_alignment_path, convert_ones,
                                  tmp_reads, remove_ones):
        # Merge+sort all alignments, then clean up intermediate bams, sams,
        # and any reads that were unpacked by _deal_zip_file.
        self._run_samtools_merge_sort(samtools_path,
                                      sub_alignment_path, bam_files)
        self._run_samtools_convert_sam(samtools_path,
                                       sub_alignment_path)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)
        if len(tmp_reads) != 0:
            for read in tmp_reads:
                os.remove(read)

    def _run_testrealign(self, prefix, segemehl_path, sub_alignment_path):
        '''detection of splice sites with segemehl's testrealign.x'''
        self.helper.check_make_folder(os.path.join(self.splice_path, prefix))
        sub_splice_path = os.path.join(self.splice_path, prefix)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        command = " ".join([
            os.path.join(segemehl_path, "testrealign.x"),
            "-d", os.path.join(self.fasta_path, prefix + ".fa"),
            "-q", os.path.join(sub_alignment_path,
                               self.bams["sort"] + ".sam"), "-n"])
        # Run via shell so stderr can be redirected into the log file.
        os.system(command + " 2>" + err_log)
        # testrealign writes its .bed outputs into the current directory.
        self.helper.move_all_content(os.getcwd(), sub_splice_path, [".bed"])
        self.helper.remove_all_content(sub_alignment_path,
                                       self.bams["sort"], "file")

    def _merge_bed(self, fastas, splice_path):
        # Per multi-entry fasta, collect the per-header splice/trans bed files
        # into one folder and merge (or move, for a single header) them into
        # splicesites_all.bed / transrealigned_all.bed. Returns the prefixes.
        tmp_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                tmp_prefixs.append(fasta_prefix)
                self.helper.check_make_folder(os.path.join(
                    os.getcwd(), fasta_prefix))
                for header in headers:
                    shutil.copyfile(os.path.join(splice_path, header,
                                                 self.splices["file"]),
                                    os.path.join(fasta_prefix,
                                                 "_".join([self.splices["splice"],
                                                           header + ".bed"])))
                    shutil.copyfile(os.path.join(splice_path, header,
                                                 self.trans["file"]),
                                    os.path.join(fasta_prefix,
                                                 "_".join([self.trans["trans"],
                                                           header + ".bed"])))
                out_splice = os.path.join(fasta_prefix,
                                          self.splices["all_file"])
                out_trans = os.path.join(fasta_prefix,
                                         self.trans["all_file"])
                if len(headers) > 1:
                    for file_ in os.listdir(fasta_prefix):
                        if (self.splices["splice"] in file_) and (
                                self.splices["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                fasta_prefix, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                self.trans["all"] not in file_):
                            self.helper.merge_file(os.path.join(
                                fasta_prefix, file_), out_trans)
                else:
                    shutil.move(os.path.join(
                        fasta_prefix,
                        "_".join([self.splices["splice"],
                                  headers[0] + ".bed"])), out_splice)
                    shutil.move(os.path.join(
                        fasta_prefix,
                        "_".join([self.trans["trans"],
                                  headers[0] + ".bed"])), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        return tmp_prefixs

    def _stat_and_gen_gff(self, tmp_prefixs, args_circ):
        '''compare splice sites with annotation; generate gffs and stats'''
        for prefix in tmp_prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            shutil.copytree(prefix, os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                self.candidate_path, prefix))
            print("comparing with annotation of {0}".format(prefix))
            if self.splices["all_file"] in os.listdir(os.path.join(
                    self.splice_path, prefix)):
                detect_circrna(os.path.join(self.splice_path, prefix,
                                            self.splices["all_file"]),
                               os.path.join(self.gff_path, prefix + ".gff"),
                               os.path.join(self.candidate_path, prefix,
                                            "_".join(["circRNA",
                                                      prefix + "_all.csv"])),
                               args_circ,
                               os.path.join(args_circ.stat_folder,
                                            "_".join(["stat_circRNA",
                                                      prefix + ".csv"])))
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "_".join(["circRNA", prefix + "_all.csv"])),
                    args_circ,
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_all.gff"])),
                    os.path.join(self.gff_folder, prefix,
                                 "_".join([prefix, "circRNA_best.gff"])))

    def _assign_merge_bam(self, args_circ):
        # Decide which folder holds the bams to merge. When both normal and
        # fragmented bam folders are given, fragmented bams are copied into
        # the normal folder (and remembered so they can be removed later).
        remove_frags = []
        bam_files = []
        if (args_circ.normal_bams is not None) and (
                args_circ.frag_bams is not None):
            for frag in os.listdir(args_circ.frag_bams):
                if frag.endswith(".bam"):
                    shutil.copyfile(os.path.join(args_circ.frag_bams, frag),
                                    os.path.join(args_circ.normal_bams, frag))
                    remove_frags.append(frag)
            merge_folder = args_circ.normal_bams
        elif (args_circ.normal_bams is not None):
            merge_folder = args_circ.normal_bams
        elif (args_circ.frag_bams is not None):
            merge_folder = args_circ.frag_bams
        else:
            print("Error: please assign bam folder or do alignment!!")
            sys.exit()
        for bam in os.listdir(merge_folder):
            if bam.endswith(".bam"):
                bam_files.append(os.path.join(merge_folder, bam))
        return merge_folder, remove_frags, bam_files

    def run_circrna(self, args_circ):
        '''whole circRNA pipeline: (align ->) merge -> testrealign -> stats'''
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            print("Error: please assign segemehl folder!!")
            sys.exit()
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.align:
            # Fresh mapping of the read files against the genomes.
            self.multiparser.parser_fasta(args_circ.fastas)
            tmp_reads = self._deal_zip_file(args_circ.read_folder)
            align_files, prefixs = self._align(args_circ)
        else:
            # Reuse externally produced bam files.
            self.multiparser.parser_fasta(args_circ.fastas)
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                fasta_prefix = fasta.replace(".fa", "")
                prefixs.append(fasta_prefix)
            merge_folder, remove_frag, bam_files = self._assign_merge_bam(
                args_circ)
            align_files = None
        for prefix in prefixs:
            if args_circ.align:
                sub_alignment_path = os.path.join(self.alignment_path, prefix)
                bam_files, convert_ones, remove_ones = self._convert_sam2bam(
                    sub_alignment_path, args_circ.samtools_path, align_files)
            else:
                sub_alignment_path = merge_folder
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_files, args_circ.samtools_path, sub_alignment_path,
                convert_ones, tmp_reads, remove_ones)
            self._run_testrealign(prefix, args_circ.segemehl_path,
                                  sub_alignment_path)
        tmp_prefixs = self._merge_bed(args_circ.fastas, self.splice_path)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        self._stat_and_gen_gff(tmp_prefixs, args_circ)
        self.helper.remove_tmp(args_circ.fastas)
        self.helper.remove_tmp(args_circ.gffs)
        for tmp_prefix in tmp_prefixs:
            shutil.rmtree(tmp_prefix)
        # Copied fragmented bams only exist in the no-alignment path.
        if (not
                args_circ.align) and (len(remove_frag) != 0):
            for frag in remove_frag:
                os.remove(os.path.join(merge_folder, frag))
class Terminator(object):
    '''Detection of rho-independent terminators: combines TransTermHP results
    with RNAfold-based secondary-structure/poly-T scanning and expression
    (coverage) evidence.'''

    def __init__(self, args_term):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are produced by Multiparser per-genome splitting.
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # Candidate classes: all / expressed / best / not expressed.
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "express"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_express")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "express"),
                     "best": os.path.join(self.outfolder["csv"], "best"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_express")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Scratch locations in the current working directory.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        # Concatenate the data sections of several ptt/rnt files; lines are
        # only copied after the "Location" header line of each file.
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        # Create the gff/csv output folders for every candidate class.
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        # Convert each genome gff (+ optional sRNA gff) to the rnt/ptt files
        # TransTermHP needs. Returns ({prefix: "srna"|"normal"}, prefixes).
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                    fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        # Build one combined .ptt per genome out of its ptt + rnt (+ sRNA rnt)
        # as the annotation input for TransTermHP.
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        # One TransTermHP run; -p is the expterm energy table.
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        '''Running TransTermHP for each genome'''
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                    self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path,
                                  prefix, out, args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        # Convert each TransTermHP .bag output to gff and combine per genome.
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        # Pool tex and fragmented wig files into one folder (or pick whichever
        # exists). Exits when neither is given.
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: no proper wig files!!!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        # When sRNA gffs are given, merge them with the genome annotation into
        # sorted per-genome gffs; otherwise use the annotation as-is.
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path,
                                                    prefix + ".gff"), tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        # Sort each per-genome *_term.gff, append it to the all-candidates
        # *_term_all.gff, and build the matching csv from the raw term tables.
        # NOTE(review): csv_outfolder is currently unused; csv paths come
        # from self.csvs — TODO confirm intent.
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                    os.path.join(self.csvs["all"], "_".join([
                        prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(
                        self.terms["all"], "_".join([
                            prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["strain", "name", "start", "end",
                                         "strand", "detect",
                                         "coverage_detail"]) + "\n")
                out_csv.close()
                # Append the raw table of every strain that appears in the
                # merged gff (strains are contiguous after sorting).
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                        pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        # Fold the intergenic sequences with RNAfold inside a scratch folder
        # (RNAfold drops .ps files into the cwd), then remove the folder.
        print("Computing secondray structure of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                            ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        # Per genome: extract intergenic sequences, fold them, scan for
        # poly-T terminator candidates, then validate with coverage and the
        # TransTermHP predictions.
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            print("Extracting seq of {0}".format(prefix))
            intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                           tran_file, gff_file, tmp_seq)
            self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                              prefix)
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term)
            print("detection of terminator")
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        # Drop every scratch folder/file created during the run.
        self.helper.remove_tmp(args_term.gffs)
        self.helper.remove_tmp(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        # merge_wigs was created by this run only when both wig sets exist.
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp(args_term.trans)
        self.helper.remove_tmp(args_term.tex_wigs)
        self.helper.remove_tmp(args_term.frag_wigs)
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        # Renumber terminators with stable IDs/names in the final gff, then
        # optionally classify candidates (best/express/non) and write stats.
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = "term" + str(num)
                    entry.attributes["Name"] = "_".join(["Terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items)
                        for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                             entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"],
                            os.path.join(self.terms["all"],
                                         "_".join([new_prefix,
                                                   self.suffixs["gff"]])))
        if args_term.stat:
            stat_path = os.path.join(args_term.out_folder, "statistics")
            for prefix in new_prefixs:
                stat_term(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["gff"]])),
                          os.path.join(self.csvs["all"],
                          "_".join([prefix, self.suffixs["csv"]])),
                          os.path.join(stat_path,
                          "_".join(["stat", prefix + ".csv"])),
                          os.path.join(self.terms["best"],
                                       "_".join([prefix, "term"])),
                          os.path.join(self.terms["express"],
                                       "_".join([prefix, "term"])),
                          os.path.join(self.terms["non"],
                                       "_".join([prefix, "term"])))
                # stat_term writes csvs next to the gffs; move them to tables.
                shutil.move(os.path.join(self.terms["best"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["best"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["express"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["express"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["non"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["non"],
                            "_".join([prefix, self.suffixs["csv"]])))
                os.remove(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        # Validate that every gff has unique/consistent attributes.
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term):
        '''Comparison of terminators and transcripts'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        for type_ in ("best", "express", "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator")
            # Rename the generic stat file per candidate class.
            shutil.move(
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript.csv"),
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript_" +
                    type_ + ".csv"))

    def run_terminator(self, args_term):
        '''entry point: run the whole terminator-detection pipeline'''
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: please assign gff annotation folder "
                  "and fasta folder!!!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs, self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class MEME(object):
    """Promoter-motif detection around classified TSSs using the MEME suite.

    The workflow (driven by ``run_meme``) extracts upstream sequences for
    each TSS class (primary/secondary/internal/antisense/orphan), merges and
    de-duplicates them per genome, optionally combines all genomes, runs
    ``meme`` per FASTA/width combination, and tabulates the motifs found.
    All intermediate files live in a ``tmp`` folder under the CWD.
    """

    def __init__(self, args_pro):
        # Project helpers (declared elsewhere in this package).
        self.multiparser = Multiparser()
        self.helper = Helper()
        # "tmp" subfolders are produced by Multiparser.parser_* calls.
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_class")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # Per-TSS-class FASTA targets. NOTE: "all_no_orph" and "all" are
        # bare file names (joined with tmp_folder on use), unlike the rest
        # which are already absolute paths.
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro):
        """Run MEME for one fixed motif width; skip if output folder exists."""
        print(os.path.join(input_path, fasta))
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        # Existing output folder is treated as "already computed".
        if folder not in os.listdir(out_path):
            call([args_pro.meme_path, "-maxsize", "1000000",
                  "-dna", "-nmotifs", str(args_pro.num_motif),
                  "-w", str(width), "-maxiter", "100",
                  "-evt", str(args_pro.e_value),
                  "-oc", os.path.join(out_path, folder),
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro):
        """Run MEME for a 'min-max' width range (width given as "MIN-MAX")."""
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        if folder not in os.listdir(out_path):
            call([args_pro.meme_path, "-maxsize", "1000000",
                  "-dna", "-nmotifs", str(args_pro.num_motif),
                  "-minsites", "0", "-maxsites", "2",
                  "-minw", str(min_width), "-maxw", str(max_width),
                  "-maxiter", "100",
                  "-evt", str(args_pro.e_value),
                  "-oc", os.path.join(out_path, folder),
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        """Return the FASTA file name in fasta_path whose stem equals prefix.

        NOTE(review): if no entry matches, the loop falls through and the
        LAST listed file name is returned (and an empty directory would
        raise NameError on ``fasta``) — presumably callers guarantee a
        match; verify against run_meme's prefix derivation.
        """
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        """Validate attribute uniqueness of every .gff file in a folder."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        """Build merged per-class FASTAs ("all types" and "without orphan"),
        de-duplicate them, and move every class file into input_path with
        an "<prefix>_allstrain_*" name."""
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"])
        # Remove leftovers from a previous run before re-merging.
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa = pri + sec + inter + anti; tmp_all = tmp_fa + orph.
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        # Drop duplicated sequences (helper from this package).
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"], "_".join([
            out_prefix, "allstrain_primary.fa"]))
        shutil.move(self.fastas["sec"], "_".join([
            out_prefix, "allstrain_secondary.fa"]))
        shutil.move(self.fastas["inter"], "_".join([
            out_prefix, "allstrain_internal.fa"]))
        shutil.move(self.fastas["anti"], "_".join([
            out_prefix, "allstrain_antisense.fa"]))
        shutil.move(self.fastas["orph"], "_".join([
            out_prefix, "allstrain_orphan.fa"]))
        shutil.move(all_type, "_".join([
            out_prefix, "allstrain_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allstrain_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        """Split each "allstrain" FASTA into one file per strain.

        Strain is taken from the header: fields after the second "_" of a
        ">"-line. Single-strain splits are deleted again (the allstrain
        file already covers them).

        NOTE(review): the final ``out.close()`` runs unconditionally; if no
        ">"-line was ever seen, ``out`` is still None and this would raise
        AttributeError — presumably inputs always contain headers; confirm.
        """
        # First discard non-"allstrain" leftovers from previous runs.
        for fasta in os.listdir(input_path):
            if "allstrain" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                # New strain: open (append) its split file,
                                # named by replacing "allstrain" with it.
                                filename = fasta.split("allstrain")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain <= 1:
                    os.remove(os.path.join(input_path,
                              "".join([filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro):
        """Run MEME for every prefix x FASTA x requested width."""
        for prefix in prefixs:
            print(prefix)
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                for width in args_pro.widths:
                    print("Computing promoters of {0} - {1}".format(
                          fasta, width))
                    # "MIN-MAX" widths use the ranged-motif invocation.
                    if "-" in width:
                        self._run_small_motif(input_path, out_path, filename,
                                              fasta, width, args_pro)
                    else:
                        self._run_normal_motif(input_path, out_path, filename,
                                               fasta, width, args_pro)

    def _combine_file(self, prefixs, args_pro):
        """Merge all per-genome TSS gffs and FASTAs into "allfasta" inputs,
        extract their upstream sequences, and register the combined prefix."""
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        else:
            # Non-source mode: TSSs were re-classified into TSS_class first.
            for tss in os.listdir(os.path.join(
                    args_pro.output_folder, "TSS_class")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        print("generating fasta file of all fasta files")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        # Combined gff is already classified, so force source mode.
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        """Delete temporary folders/files produced during the run."""
        self.helper.remove_tmp(args_pro.fastas)
        self.helper.remove_tmp(args_pro.tsss)
        self.helper.remove_tmp(args_pro.gffs)
        self.helper.remove_tmp(args_pro.wigs)
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        # The working tmp folder created in run_meme.
        shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine):
        """Convert every meme.txt result into a meme.csv promoter table."""
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            for folder in os.listdir(os.path.join(output_folder, strain)):
                tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
                gen_promoter_table(os.path.join(output_folder, strain,
                                   folder, "meme.txt"),
                                   os.path.join(output_folder, strain,
                                   folder, "meme.csv"), tss_file)

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        """Extract upstream sequences for one genome; in non-source mode the
        TSSs are first (re-)classified against the annotation into
        TSS_class before extraction."""
        if args_pro.source:
            print("generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro)
        else:
            # Classification needs annotation + coverage libs.
            if (args_pro.gffs is None) or (
                    args_pro.wigs is None) or (
                    args_pro.input_libs is None):
                print("Error:please assign proper annotation, tex +/- "
                      "wig folder and tex treated libs!!!")
                sys.exit()
            if "TSS_class" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_class"))
            print("classifying TSS and extracting fasta {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_class",
                                  "_".join([prefix, "TSS.gff"])), args_pro)

    def run_meme(self, args_pro):
        """Entry point: prepare inputs, extract upstream FASTAs per genome,
        optionally combine all genomes, run MEME, tabulate, clean up."""
        # Clear combined leftovers from a previous run.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
        if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
            shutil.rmtree(os.path.join(args_pro.fastas,
                                       "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
        self._run_program(prefixs, args_pro)
        print("generating the table...")
        self._gen_table(args_pro.output_folder, prefixs, args_pro.combine)
        self._remove_files(args_pro)
class RATT(object):
    '''annotation transfer'''
    # Drives the RATT tool to transfer reference annotations (GenBank/EMBL)
    # onto target genomes, then converts results to gff/ptt/rnt and tidies
    # up the temporary folders.

    def __init__(self, args_ratt):
        # Project helpers (declared elsewhere in this package).
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # NOTE(review): self.gbk/self.gbk_tmp (and self.embl) are only set
        # when ref_gbk/ref_embls are given; methods using them assume the
        # corresponding argument was provided — confirm against CLI checks.
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        """Generate .ptt/.rnt tables next to each transferred .gff that has
        a matching target FASTA."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix,
                                                     None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Replace per-chromosome outputs by the merged per-genome files and
        delete every temporary folder."""
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT *.final.embl result to gff, normalize it, and
        record the produced file name in ``files``."""
        # RATT names results like "chromosome.<name>.final.embl";
        # name[1:-2] recovers "<name>".
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record GenBank files into one .gbk per record under
        self.gbk; the record name comes from the LOCUS (or VERSION) line.

        NOTE(review): ``new_filename.find(filename)`` is used as a boolean;
        str.find returns -1 (truthy) when absent and 0 (falsy) at position
        0, so the VERSION-derived name replaces the LOCUS-derived one
        exactly when the latter is NOT a prefix of the former. This looks
        like a truthiness bug around str.find — confirm intended predicate
        before changing. Also ``out``/``filename`` are only bound once a
        LOCUS line has been seen; files not starting with LOCUS would raise.
        """
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    # "//" terminates a GenBank record.
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''convert gbk to embl'''
        # Collect GenBank inputs, split them per record, then convert each
        # to EMBL (RATT's expected reference format) under self.embl.
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Invoke the external RATT script for one ref/target pair; stdout
        goes to the log handle, stderr is discarded."""
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for every "ref:target" pair and sweep its CWD litter:
        keep "final" results (moved to output_path), delete the rest."""
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # NOTE(review): "w+" truncates — the log only keeps the last
            # pair's output; presumably acceptable, confirm.
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out)
            # RATT writes into the current working directory.
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path,
                                             filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or ("Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
            out.close()

    def annotation_transfer(self, args_ratt):
        """Entry point: prepare references, run RATT, optionally convert
        results to gff/ptt/rnt, merge per-chromosome outputs per genome,
        and clean up."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # No EMBL references supplied: derive them from GenBank input.
            out_gbk = self._convert_embl(args_ratt.ref_gbk)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            # Merge per-chromosome gff/ptt/rnt into one file per genome
            # (Multiparser created one "<genome>_folder" per input FASTA).
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = ".".join(datas[0].split(".")[:-1])
                    for file_ in os.listdir(os.path.join(
                            args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder,
                                                 gff), self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder,
                                                 gff), self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder,
                                                 gff), self.tmp_files["rnt"])
                    if os.path.exists(self.tmp_files["gff"]):
                        shutil.move(self.tmp_files["gff"], os.path.join(
                            self.tmp_files["out_gff"], prefix + ".gff"))
                        shutil.move(self.tmp_files["ptt"], os.path.join(
                            self.tmp_files["out_gff"], prefix + ".ptt"))
                        shutil.move(self.tmp_files["rnt"], os.path.join(
                            self.tmp_files["out_gff"], prefix + ".rnt"))
                    else:
                        print("Error: Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).")
        self._remove_files(args_ratt, out_gbk)
class sRNATargetPrediction(object):
    """Predict sRNA-mRNA interactions with ViennaRNA's RNAplex and/or RNAup.

    ``run_srna_target_prediction`` extracts sRNA and candidate-target
    FASTAs, runs RNAplfold/RNAplex and/or RNAup in parallel batches, then
    ranks and merges the results per genome.
    """

    def __init__(self, args_tar):
        # Project helpers (declared elsewhere in this package).
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder,
                                            "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup")
        self.merge_path = os.path.join(args_tar.out_folder, "merge")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # "all_fa"/"all_txt" are shell glob patterns used with ``rm``.
        self.tmps = {"tmp": "tmp", "rnaup": "tmp_rnaup", "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        """Validate attribute uniqueness of every .gff file in a folder."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, vienna_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path):
        """Run RNAplfold in out_path (it writes to CWD), feeding the sRNA or
        target FASTA on stdin via shell redirection."""
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([os.path.join(vienna_path, "RNAplfold"),
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        # sRNA FASTAs carry the "tmp_" prefix from _sort_srna_fasta.
        if file_type == "sRNA":
            os.system("<".join([command, os.path.join(
                current, seq_path,
                "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))]))
        else:
            os.system("<".join([command, os.path.join(
                current, seq_path,
                "_".join([prefix, file_type + ".fa"]))]))
        os.chdir(current)

    def _wait_process(self, processes):
        """Wait for every child process, close its pipes, and reap it."""
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            # Process already exited; kill() may raise on some platforms.
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        """Rewrite an sRNA FASTA sorted by sequence length into
        tmp_<prefix>_sRNA.fa, keeping only the first "|"-field of each
        header. Assumes one sequence line per record."""
        out = open(os.path.join(path,
                   "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line,
                                  "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        """Return the concatenated sequence of a FASTA file (headers
        skipped; multiple records are concatenated into one string)."""
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        """Append to srna_out the sequences of the sRNAs matching each
        query "seq_id:strand:start:end" against the sRNA gff."""
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[1],
                    "start": int(srna_datas[2]), "end": int(srna_datas[3])}
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                        entry.start == srna["start"]) and (
                        entry.end == srna["end"]):
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                        id_, entry.seq_id, entry.start,
                        entry.end, entry.strand, gene))
                    num += 1
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        """Build sRNA FASTAs (all sRNAs or only the queried ones) and
        candidate-target FASTAs, splitting targets into 100-sequence
        chunks (<prefix>_target_<n>.fa) for parallel RNAplex runs.
        Also fills ``prefixs`` with the genome prefixes found."""
        print("Generating sRNA fasta files...")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                            self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files...")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path,
                                              prefix + ".fa"),
                                 os.path.join(self.target_seq_path),
                                 args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
                            num += 1
                            # Start a new chunk every 100 headers.
                            if (num == 100):
                                num = 0
                                file_num += 1
                                sub_out.close()
                                sub_out = open("_".join([
                                    sub_prefix,
                                    str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_path, args_tar):
        """Launch one RNAplex per target chunk (batched by core_plex);
        return the number of chunk outputs produced."""
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex",
                        str(num_process) + ".txt"])), "w")
                num_process += 1
                p = Popen([os.path.join(args_tar.vienna_path, "RNAplex"),
                           "-q", os.path.join(
                               self.srna_seq_path, "_".join([
                                   self.tmps["tmp"], prefix, "sRNA.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_path], stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        return num_process

    def _rna_plex(self, prefixs, args_tar):
        """Full RNAplex pipeline per genome: RNAplfold accessibility
        profiles, chunked RNAplex runs, merge and normalize results."""
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_path = os.path.join(self.rnaplex_path, prefix,
                                          "RNAplfold")
            os.mkdir(rnaplfold_path)
            self._run_rnaplfold(
                args_tar.vienna_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_path)
            self._run_rnaplfold(
                args_tar.vienna_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_path)
            num_process = self._run_rnaplex(prefix, rnaplfold_path,
                                            args_tar)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            self.helper.remove_all_content(os.path.join(
                self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)

    def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar):
        """Launch RNAup on each prepared tmp<i>.fa batch file, wait, then
        merge the batch outputs and delete the batch files."""
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            p = Popen([os.path.join(args_tar.vienna_path, "RNAup"),
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            # NOTE(review): shell "rm" with a glob — relies on /bin/sh
            # expansion and silently fails on odd paths; works here
            # because tmps["all_fa"]/["all_txt"] are glob patterns.
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log,
                            args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        """Append every batch result/log file to the per-genome RNAup
        output and log files."""
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        """Resume support: drop the (possibly truncated) last sRNA block
        from a previous RNAup output and return the sRNA names that are
        already done."""
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        # Discard the last block — it may have been interrupted mid-run.
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar):
        """RNAup pipeline per genome: write one tmp<i>.fa per sRNA (sRNA
        followed by all targets), batch them by core_up, and run RNAup.

        NOTE(review): ``start``/``out_up`` are first assigned inside the
        header branch; a FASTA starting with a sequence line would raise
        NameError — assumes well-formed input from _sort_srna_fasta.
        Assumes one sequence line per record (guaranteed upstream).
        """
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            # NOTE(review): "_".join on a one-element list is a no-op —
            # kept as-is for byte-identical paths.
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    srnas = self._get_continue(out_rnaup)
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        # Skip sRNAs already finished in a previous run.
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(
                            args_tar.out_folder, "".join([
                                self.tmps["tmp"], str(num_up),
                                ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            # RNAup reads the sRNA first, then all targets.
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(args_tar.out_folder,
                                             "".join([self.tmps["tmp"],
                                                      str(num_up),
                                                      ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes,
                                                out_rnaup, out_log,
                                                args_tar)
                                processes = []
                                num_up = 0
            # Flush the final (partial) batch.
            self._run_rnaup(num_up, processes, out_rnaup, out_log,
                            args_tar)

    def _merge_rnaplex_rnaup(self, prefixs, args_tar):
        """Rank RNAplex/RNAup results per genome and write the merged and
        overlap tables."""
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            self.helper.check_make_folder(os.path.join(
                self.merge_path, prefix))
            print("Ranking {0} now...".format(prefix))
            if (args_tar.program == "both") or (
                    args_tar.program == "RNAplex"):
                rnaplex_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
            if (args_tar.program == "both") or (
                    args_tar.program == "RNAup"):
                rnaup_file = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup_rank.csv"]))
            merge_srna_target(rnaplex_file, rnaup_file, args_tar,
                              out_rnaplex, out_rnaup,
                              os.path.join(self.merge_path, prefix,
                                           "_".join([prefix,
                                                     "merge.csv"])),
                              os.path.join(self.merge_path, prefix,
                                           "_".join([prefix,
                                                     "overlap.csv"])),
                              os.path.join(self.srna_path,
                                           "_".join([prefix,
                                                     "sRNA.gff"])),
                              os.path.join(self.gff_path,
                                           prefix + ".gff"))

    def run_srna_target_prediction(self, args_tar):
        """Entry point: validate inputs, build sequences, run the selected
        predictor(s), merge/rank results, and clean temporary files."""
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if (args_tar.program == "both") or (
                args_tar.program == "RNAplex"):
            self._rna_plex(prefixs, args_tar)
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        if (args_tar.program == "both") or (
                args_tar.program == "RNAup"):
            self._rnaup(prefixs, args_tar)
        self._merge_rnaplex_rnaup(prefixs, args_tar)
        if (args_tar.program == "RNAplex") or (
                args_tar.program == "both"):
            for strain in os.listdir(os.path.join(
                    args_tar.out_folder, "RNAplex")):
                shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex",
                                           strain, "RNAplfold"))
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp(args_tar.gffs)
        self.helper.remove_tmp(args_tar.srnas)
        self.helper.remove_tmp(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
class TSSpredator(object):
    '''Driver for TSS / processing-site prediction via the external
    TSSpredator Java tool: builds per-strain config files, runs the jar,
    converts MasterTables to gff, and post-processes the results.'''

    def __init__(self, args_tss):
        # Project helpers for splitting/merging multi-strain inputs.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names of temporary files/folders created in the working directory.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        # "tmp" subfolders are produced by the multiparser for each input type.
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        """Map one colon-split library spec to a named dict.

        Expected order: wig filename, tex/notex flag, condition number,
        replicate letter, strand.
        """
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]), "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        """Write one config line per library/condition/replicate to *out*;
        replicates missing for a condition get an empty assignment so the
        config stays rectangular."""
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): `cond` is reused from the loop above; if a
            # condition has no libraries at all this raises NameError —
            # presumably every condition always has at least one lib; verify.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix):
        """Invoke the TSSpredator jar for one strain, capturing its
        stdout/stderr into log.txt / err.txt under *out_path*."""
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        """Parse the library definitions, write the annotation/library/genome
        sections of the config file, and return the parsed library layout.

        Returns (lib_num, num_id, rep_set, lib_dict, list_num_id).
        NOTE(review): if lib_num stays 0 the final `num_id` is unbound and
        the return raises NameError — assumes at least one valid library.
        """
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        # FIX: message typo "Runniun" -> "Running".
        print("Running {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # Resolve the user-supplied wig name to the per-strain split file.
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # Bucket libraries by tex-treatment and strand.
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        # TSS prediction uses the tex-treated libs as "fivePrime" tracks;
        # processing-site prediction uses the untreated ones.
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match and write the minNumRepMatches lines:
        the most common replicate count becomes the global default, the
        rest get per-library overrides.'''
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''generation of the TSSpredator config file for one strain'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # The "normal" tracks are the counterpart libraries of whatever was
        # used as the fivePrime tracks in _import_lib.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        """Convert each strain's MasterTable.tsv to a gff in gff_outfolder."""
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program]) + ".gff")
            # Opening with "w" truncates any previous result before the
            # converter (re)writes out_file.
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected
        TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            # Find the annotation gff matching this strain; `gff` keeps the
            # value at `break`. NOTE(review): if no file matches, `gff` holds
            # the last directory entry — assumes a match always exists.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSS with genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            # The validated annotation replaces the original gff in place.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSS with transcript'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        """Generate per-strain classification/library statistics and venn
        plots for the predicted features."""
        # FIX: message typo "statistaics" -> "statistics".
        print("Running statistics.....")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            # Plot helpers drop their png files into the cwd; collect them.
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        """Generate one config file per strain that has fasta, gff AND wig
        data; return the list of strain prefixes."""
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        # FIX: reset the flag for the next strain; previously
                        # it stayed True after the first hit, so strains
                        # without any wig data were also accepted (sibling
                        # _compare_ta resets the same flag correctly).
                        detect = False
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        """Concatenate all matching wig files into tmp/merge_forward.wig and
        tmp/merge_reverse.wig for downstream orphan/low-expression checks."""
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        """Delete the temporary split folders and merged wig leftovers."""
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            # Keep both feature types untouched.
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix,
                    "_".join(["stat", prefix,
                              "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        # NOTE(review): `out` is unbound if no gff matched above — assumes
        # at least one prediction file exists; verify against callers.
        out.close()

    def run_tsspredator(self, args_tss):
        """Entry point: prepare inputs, run TSSpredator per strain, then
        convert, filter, merge and validate the predictions."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"))
        # Downstream naming uses "processing" rather than "processing_site".
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(
                prefixs, os.path.join(args_tss.wig_folder, "tmp"), args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class RATT(object):
    '''annotation transfer: run RATT (PAGIT) to transfer a reference
    annotation onto the target genome, then convert the resulting embl
    files to gff/ptt/rnt.'''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            # Working folders used while splitting multi-record gbk files.
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        """For each transferred gff, derive ptt/rnt tables when the matching
        target fasta is available."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(
                    self.tmp_files["tar"], ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        """Move the merged outputs into place and delete all scratch data."""
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        """Convert one RATT *.final.embl result to gff and register the
        filename in *files*."""
        name = ratt_result.split(".")
        # RATT names results <element>.<strain...>.final.embl; keep the
        # middle part as the strain name.
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file,
                    os.path.join(args_ratt.gff_outfolder, filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record genbank files into one <accession>.gbk per
        record under self.gbk; return that folder."""
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        # A new record begins; name it after the LOCUS id.
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        # NOTE(review): str.find returns -1 (truthy) when not
                        # found and 0 (falsy) when found at the start, so this
                        # prefers the VERSION id whenever it does not prefix
                        # the LOCUS id — confirm that is the intent.
                        if new_filename.find(filename):
                            filename = new_filename
                    # NOTE(review): `out` is unbound until the first LOCUS
                    # line — assumes well-formed genbank input.
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        # End of record: flush it to its final name.
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (
                    embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write("--related_gbk_files is assigned, but not gbk files "
                      "are detected.\n"
                      "The gbk file names need to be ended at .gbk, .gb, "
                      "or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write("Running converter.py to convert gbk file to embl "
                      "format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl + " is generated and the embl "
                      "files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        """Run the RATT executable for one target/reference strain pair."""
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["tar"], tar + ".fa"))) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the same "
                      "as the strain names in fasta, genbank, or embl "
                      "files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([args_ratt.ratt_path, self.embl,
                            os.path.join(self.tmp_files["tar"], tar + ".fa"),
                            args_ratt.element, args_ratt.transfer_type,
                            os.path.join(self.tmp_files["ref"],
                                         ref + ".fa")]) + "\n")
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        """Run RATT for every configured pair and tidy its working files
        (RATT drops them into the current directory)."""
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename, os.path.join(
                        args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    # Intermediate RATT artifacts: discard.
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
            out.close()

    def annotation_transfer(self, args_ratt, log):
        """Entry point: run RATT and assemble per-strain gff/ptt/rnt files
        in gff_outfolder."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # FIX: was args_ratt.ref_gbki, which raised AttributeError —
            # __init__ reads the attribute as args_ratt.ref_gbk.
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt format.\n".format(
                              data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        # NOTE(review): `data` is the last entry of the loop above — unbound
        # if output_path is empty; assumes RATT produced output.
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(
                        args_ratt.tar_fastas, folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
class sRNATargetPrediction(object): '''detection of sRNA-target interaction''' def __init__(self, args_tar): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_parser = Gff3Parser() self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs") self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs") self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results") self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results") self.intarna_path = os.path.join(args_tar.out_folder, "IntaRNA_results") self.merge_path = os.path.join(args_tar.out_folder, "merged_results") self.srna_path = os.path.join(args_tar.srnas, "tmp") self.fasta_path = os.path.join(args_tar.fastas, "tmp") self.gff_path = os.path.join(args_tar.gffs, "tmp") self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup", "log": "tmp_log", "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"} def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span, unstr_region, seq_path, prefix, out_path, log): current = os.getcwd() os.chdir(out_path) command = " ".join([rnaplfold_path, "-W", str(win_size), "-L", str(span), "-u", str(unstr_region), "-O"]) if file_type == "sRNA": log.write("<".join([command, os.path.join(current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))]) + "\n") os.system("<".join([command, os.path.join(current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))])) else: log.write("<".join([command, os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"]))]) + "\n") os.system("<".join([command, os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"]))])) os.chdir(current) def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() 
try: p.kill() except OSError: pass time.sleep(5) def _sort_srna_fasta(self, fasta, prefix, path): out = open(os.path.join(path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w") srnas = [] with open(fasta) as f_h: for line in f_h: line = line.strip() if line.startswith(">"): name = line[1:] else: srnas.append({"name": name, "seq": line, "len": len(line)}) srnas = sorted(srnas, key=lambda x: (x["len"])) for srna in srnas: out.write(">" + srna["name"].split("|")[0] + "\n") out.write(srna["seq"] + "\n") out.close() def _read_fasta(self, fasta_file): seq = "" with open(fasta_file, "r") as seq_f: for line in seq_f: line = line.strip() if line.startswith(">"): continue else: seq = seq + line return seq def _get_specific_seq(self, srna_file, seq_file, srna_out, querys): for query in querys: srna_datas = query.split(":") srna = {"seq_id": srna_datas[0], "strand": srna_datas[3], "start": int(srna_datas[1]), "end": int(srna_datas[2])} gff_f = open(srna_file, "r") out = open(srna_out, "a") seq = self._read_fasta(seq_file) num = 0 detect = False for entry in self.gff_parser.entries(gff_f): if (entry.seq_id == srna["seq_id"]) and ( entry.strand == srna["strand"]) and ( entry.start == srna["start"]) and ( entry.end == srna["end"]): detect = True if "ID" in entry.attributes.keys(): id_ = entry.attributes["ID"] else: id_ = entry.feature + str(num) gene = self.helper.extract_gene(seq, entry.start, entry.end, entry.strand) out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format( id_, entry.seq_id, entry.start, entry.end, entry.strand, gene)) num += 1 if not detect: print("Error: Some of the query sRNAs do not exist!") sys.exit() gff_f.close() out.close() def _gen_seq(self, prefixs, args_tar): print("Generating sRNA fasta files") for srna in os.listdir(self.srna_path): if srna.endswith("_sRNA.gff"): prefix = srna.replace("_sRNA.gff", "") prefixs.append(prefix) srna_out = os.path.join(self.srna_seq_path, "_".join([prefix, "sRNA.fa"])) if "all" in args_tar.query: self.helper.get_seq( 
                    os.path.join(self.srna_path, srna),
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    srna_out)
                else:
                    # A specific query list was given: drop any stale per-genome
                    # sRNA fasta before extracting only the requested sRNAs.
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                            self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path,
                                              prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                # Split the (potentially huge) target fasta into chunks of
                # 100 sequences so RNAplex jobs can run on them in parallel.
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
#                            line = line.replace("|", "_")
                            num += 1
                            # Start a new chunk file every 100 headers.
                            if (num == 100):
                                num = 0
                                file_num += 1
                                sub_out.close()
                                sub_out = open("_".join([
                                    sub_prefix,
                                    str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        """Run RNAplex on every target chunk of ``prefix``.

        Launches one RNAplex subprocess per ``*_target_*`` chunk file,
        throttled to ``args_tar.core_plex`` concurrent processes, and
        returns the number of processes launched (= number of partial
        result files written under rnaplex_path/prefix).
        """
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                # One numbered output file per chunk; merged later by
                # _rna_plex.
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"])), "w")
                num_process += 1
                log.write(" ".join([args_tar.rnaplex_path,
                          "-q", os.path.join(
                              self.srna_seq_path, "_".join([
                                  self.tmps["tmp"], prefix, "sRNA.fa"])),
                          "-t", os.path.join(self.target_seq_path, seq),
                          "-l", str(args_tar.inter_length),
                          "-e", str(args_tar.energy),
                          "-z", str(args_tar.duplex_dist),
                          "-a", rnaplfold_folder]) + "\n")
                p = Popen([args_tar.rnaplex_path,
                           "-q", os.path.join(
                               self.srna_seq_path, "_".join([
                                   self.tmps["tmp"], prefix, "sRNA.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_folder], stdout=out_rnaplex)
                processes.append(p)
                # Throttle: wait whenever a full batch has been launched.
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write("The following temporary files for storing results of "
                  "{0} are generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(self.rnaplex_path, prefix,
                                          file_) + "\n")
        return num_process

    def _rna_plex(self, prefixs, args_tar, log):
        """RNAplfold + RNAplex pipeline for each genome prefix.

        For every prefix: compute accessibility profiles with RNAplfold
        (for both sRNAs and targets), run RNAplex over all target chunks,
        merge the numbered partial outputs into one ``<prefix>_RNAplex.txt``,
        fix its format, and clean up the temporary folders.
        """
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            os.mkdir(rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_folder, log)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_folder, log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder,
                                            args_tar, log)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            # Remove a leftover merged file from a previous run before
            # re-merging, otherwise merge_file would append to it.
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary files.\n")
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(os.path.join(
                self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        """Launch one RNAup subprocess per pending numbered input file.

        Inputs are the ``tmp<i>.fa`` files (1..num_up) previously written
        by _rnaup; outputs/errors go to matching numbered files which are
        merged into ``out_rnaup``/``out_log`` once all processes finish.
        """
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            log.write(" ".join([args_tar.rnaup_path,
                      "-u", str(args_tar.unstr_region_rnaup),
                      "-o", "--interaction_first"]) + "\n")
            # RNAup reads the query/target pair from stdin.
            p = Popen([args_tar.rnaup_path,
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write("The following temporary files for storing results "
                      "of {0} are generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(args_tar.out_folder,
                                              file_) + "\n")
            # NOTE(review): shell "rm" with a glob-style tmps pattern —
            # presumably self.tmps["all_fa"] contains a wildcard; confirm.
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        """Append each numbered RNAup result/log file to the merged files."""
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        # Parse the partially-written RNAup output, drop the last (possibly
        # truncated) sRNA record, rewrite the file without it, and return
        # the list of sRNA names that are already done.
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        # The last record may be incomplete (run was interrupted): redo it.
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar, log):
        """Run RNAup per genome, batching sRNAs ``core_up`` at a time.

        For each sRNA sequence a numbered input fasta is written (sRNA
        followed by the whole target fasta, as RNAup expects query first),
        and batches of ``core_up`` inputs are handed to _run_rnaup.
        Supports resuming a previous run via _get_continue.
        """
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            # The sorted sRNA fasta has one header line followed by one
            # sequence line per sRNA.
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        # Skip sRNAs already finished in a previous run.
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(
                            args_tar.out_folder,
                            "".join([self.tmps["tmp"],
                                     str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            # Append all target sequences after the sRNA so
                            # RNAup pairs the query against each target.
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(args_tar.out_folder,
                                             "".join([self.tmps["tmp"],
                                                      str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, prefix,
                                                out_rnaup, out_log,
                                                args_tar, log)
                                processes = []
                                num_up = 0
            # Flush the final, possibly partial batch.
            self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                            args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup + " is complete generated and "
                      "updated.\n")

    def _intarna(self, prefixs, args_tar, log):
        """Run IntaRNA (CSV output mode) for every genome prefix."""
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write("Please make sure the version of IntaRNA is at least "
                  "2.0.4.\n")
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                os.path.join(self.intarna_path, prefix))
            call([args_tar.intarna_path,
                  "-q", os.path.join(
                      self.srna_seq_path, "_".join([
                          self.tmps["tmp"], prefix, "sRNA.fa"])),
                  "-t", os.path.join(self.target_seq_path,
                                     prefix + "_target.fa"),
                  "--qAccW", str(args_tar.slide_win_srna),
                  "--qAccL", str(args_tar.max_loop_srna),
                  "--tAccW", str(args_tar.slide_win_target),
                  "--tAccL", str(args_tar.max_loop_target),
                  "--outMode", "C", "-m", args_tar.mode_intarna,
                  "--threads", str(args_tar.core_inta),
                  "--out", intarna_file])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, args_tar, log):
        '''merge the result of IntaRNA, RNAup and RNAplex'''
        # Deduplicate each tool's raw output, then merge/rank everything
        # into per-genome overlap.csv and merge.csv tables. Tools not
        # selected in args_tar.program stay None and are skipped downstream.
        log.write("Running merge_rnaplex_rnaup.py to merge the results from "
                  "RNAplex, RNAup, and IntaRNA for generating finanl "
                  "output.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(
                self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(rnaplex_file, rnaup_file, intarna_file,
                              args_tar, out_rnaplex, out_rnaup, out_intarna,
                              os.path.join(self.fasta_path, prefix + ".fa"),
                              merge_file, overlap_file,
                              os.path.join(self.srna_path,
                                           "_".join([prefix, "sRNA.gff"])),
                              os.path.join(self.gff_path, prefix + ".gff"))
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks,
                        out_tmp, print_):
        """Streaming dedup step for one line of RNAplex output.

        RNAplex records are header pairs (sRNA ">" line then target ">"
        line, tracked by num's parity) followed by interaction lines;
        ``checks`` maps the first header to the seen second headers and
        ``print_`` gates whether interaction lines are written out.
        Returns the updated (num, pre_num, print_, pre) state.
        """
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                # Re-emit the header pair only once per record.
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info,
                      checks, out_tmp, print_, tar):
        """Streaming dedup step for one line of RNAup output.

        Consecutive ">" lines form an sRNA/target header pair; ``checks``
        maps each sRNA header to its seen target headers and the other
        state variables decide when headers/interactions are re-emitted.
        Returns the updated (num, pre_num, print_, pre, tar, srna_info).
        """
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                # Previous line was also a header: pre is the sRNA,
                # this line is its target.
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                # New target for the current sRNA block.
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        """Streaming dedup step for one line of IntaRNA CSV output.

        Data rows are ';'-separated (target, ..., sRNA at index 3, ...,
        sequence at index 7); structure rows start with '.', '(' or ')'.
        ``checks[target][srna]`` records the sequences already written.
        Returns the updated (tar, srna_info, seq) state.
        """
        if (line.startswith(".")) or (
                line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        """Rewrite ``interact_file`` in place with duplicate records removed.

        Dispatches line-by-line to the tool-specific dedup helper selected
        by ``type_`` ("RNAplex", "RNAup" or "IntaRNA").
        """
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        out_tmp = open(interact_file + "tmp", "w")
        with open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                        line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                        self._remove_rnaup(
                            line, pre, num, pre_num, srna_info, checks,
                            out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                        line, checks, tar, srna_info, seq, out_tmp)
        out_tmp.close()
        shutil.move(interact_file + "tmp", interact_file)

    def run_srna_target_prediction(self, args_tar, log):
        """Entry point: run the selected predictors and merge their results.

        Parses the input GFF/fasta/sRNA folders, generates sRNA and target
        fasta files, runs whichever of RNAplex/RNAup/IntaRNA are listed in
        ``args_tar.program``, merges+ranks the results, and removes all
        temporary files and folders.
        """
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, args_tar, log)
            self.helper.remove_all_content(self.target_seq_path,
                                           "_target_", "file")
            log.write("The temporary files for running RNAplex are "
                      "deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        # "tmp" subfolders are produced by the multiparser per-genome split.
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        # Two parallel output trees: all CDSs vs only transcript-covered CDSs.
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        # Filename suffixes for PSORTb raw output and extracted tables.
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        # Create (or reset) the fixed output folder structure.
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS'''
        # Writes every CDS that overlaps a same-strand, same-replicon
        # transcript (any of the four overlap orientations) to tmp_cds.gff.
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    # Partial overlap on either side, CDS containing the
                    # transcript, or CDS contained in the transcript.
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()
        log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        """Extract CDS DNA sequences and translate them to protein fasta.

        If ``tran_path`` is given, only transcript-covered CDSs are used.
        Returns the genome prefix (gff filename without extension).
        """
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])), log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.out_all,
                                                 "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            log.write("Predicting subcellular localization for all CDSs "
                      "for {0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        # Translate via a temporary file, then normalize the EMBOSS-style
        # output into the final protein fasta.
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        # Invoke the external PSORTb binary; raw stdout/stderr are captured
        # by the caller-provided file handles.
        log.write(" ".join([psortb_path, strain_type,
                            prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path,
                    tmp_result, log):
        """Run PSORTb on the protein fasta of ``prefix``.

        ``args_sub.gram`` selects gram-positive ("-p") or gram-negative
        ("-n") mode; anything else aborts the program.
        """
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err, log)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err, log)
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacteria type! "
                  "Please assign positive or negative.".format(
                      args_sub.gram))
            sys.exit()
        log.write("\t" + os.path.join(tmp_result, "_".join([
            prefix, self.endfix_raw])) + " is temporary generated.\n")
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix,
                        gff_file, log):
        '''extract the result of psortb'''
        # Converts the raw PSORTb text output into a tab-separated table.
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        extract_psortb(os.path.join(
            tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
            os.path.join(tmp_psortb_path, "_".join([
                prefix, self.endfix_table])), None, None, args_sub.fuzzy)
        log.write("\t" + os.path.join(tmp_psortb_path, "_".join([
            prefix, self.endfix_table])) + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        # Rewrite the merged table with exactly one header row
        # (merging per-genome tables duplicates the "#Genome" header).
        out = open(out_all + "_tmp", "w")
        fh = open(out_all, "r")
        out.write("\t".join(["#Genome", "Protein", "Strand", "Start",
                             "End", "Location", "Score"]) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[0] != "#Genome":
                out.write("\t".join(row) + "\n")
        out.close()
        fh.close()
        shutil.move(out_all + "_tmp", out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result, log):
        """Merge per-genome PSORTb tables and compute statistics/figures.

        Genomes are grouped by the ``.gff_folder`` directories created by
        the multiparser; each group gets a merged table plus stat files.
        """
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                log.write("\t" + merge_table + "\n")
                self._remove_header(merge_table)
                self.helper.check_make_folder(os.path.join(stat_path,
                                                           prefix))
                stat_folder = os.path.join(stat_path, prefix)
                stat_file = os.path.join(stat_folder, "_".join([
                    "stat", prefix, "sublocal.csv"]))
                stat_sublocal(merge_table,
                              os.path.join(stat_folder, prefix),
                              stat_file)
                for file_ in os.listdir(stat_folder):
                    log.write("\t" + os.path.join(stat_folder,
                                                  file_) + "\n")

    def _remove_tmps(self, args_sub):
        # Remove all temporary folders/files created during the run.
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        """Entry point: predict subcellular localization with PSORTb.

        Always processes all CDSs; when transcripts are supplied, also
        processes the expressed (transcript-covered) subset in parallel
        output folders.
        """
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
        self.helper.check_make_folder(self.express_tmp_path)
        self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub,
                                               log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix,
                                     os.path.join(self.gff_path, gff), log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff), log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result, log)
        self._remove_tmps(args_sub)
class GoTermFinding(object):
    '''Retrieving the GO term'''

    def __init__(self, args_go):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Parallel output trees: all CDSs vs transcript-covered CDSs.
        self.out_all = os.path.join(args_go.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_go.out_folder,
                                        "expressed_CDSs")
        self.result_all_path = os.path.join(self.out_all, "GO_term_results")
        self.result_express_path = os.path.join(self.out_express,
                                                "GO_term_results")
        # "tmp" subfolders are produced by the multiparser per-genome split.
        self.gff_path = os.path.join(args_go.gffs, "tmp")
        if args_go.trans is not None:
            self.tran_path = os.path.join(args_go.trans, "tmp")
        else:
            self.tran_path = None
        self.stat_all_path = os.path.join(self.out_all, "statistics")
        self.stat_express_path = os.path.join(self.out_express,
                                              "statistics")
        # Name of the merged per-genome-group UniProt table.
        self.all_strain = "all_genomes_uniprot.csv"

    def _retrieve_go(self, uniprot, out_path, type_):
        """Extract GO terms from the UniProt mapping for every genome.

        ``type_`` ("all" or "express") controls whether retrieve_uniprot
        filters CDSs by the transcript file.
        """
        for gff in os.listdir(self.gff_path):
            prefix = gff.replace(".gff", "")
            self.helper.check_make_folder(os.path.join(out_path, prefix))
            out_file = os.path.join(out_path, prefix,
                                    "_".join([prefix, "uniprot.csv"]))
            print("Extracting GO terms of {0} from UniProt".format(prefix))
            if self.tran_path is not None:
                tran_file = os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"]))
            else:
                tran_file = None
            retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff),
                             out_file, tran_file, type_)

    def _merge_files(self, gffs, out_path, out_folder):
        '''merge the files according to the input genome folder'''
        # Build one folder per ".gff_folder" group: merge its per-genome
        # uniprot.csv files into self.all_strain, then replace the flat
        # per-genome layout under out_path with the grouped layout.
        folders = []
        for folder in os.listdir(gffs):
            if folder.endswith("gff_folder"):
                folder_prefix = folder.replace(".gff_folder", "")
                folder_path = os.path.join(out_folder, folder_prefix)
                self.helper.check_make_folder(folder_path)
                folders.append(folder_path)
                filenames = []
                for gff in os.listdir(os.path.join(gffs, folder)):
                    if gff.endswith(".gff"):
                        filenames.append(gff.replace(".gff", ""))
                out_all = os.path.join(folder_path, self.all_strain)
                if len(filenames) > 1:
                    # Remove a stale merged table before appending.
                    if self.all_strain in os.listdir(folder_path):
                        os.remove(out_all)
                    for filename in filenames:
                        csv_file = "_".join([filename, "uniprot.csv"])
                        self.helper.merge_file(os.path.join(
                            out_path, filename, csv_file), out_all)
                        shutil.copy(os.path.join(out_path, filename,
                                                 csv_file), folder_path)
                else:
                    shutil.copyfile(os.path.join(
                        out_path, filenames[0],
                        "_".join([filenames[0], "uniprot.csv"])), out_all)
        self.helper.remove_all_content(out_path, None, "dir")
        self.helper.remove_all_content(out_path, None, "file")
        for folder in folders:
            folder_prefix = folder.split("/")[-1]
            shutil.move(folder, os.path.join(out_path, folder_prefix))

    def _stat(self, out_path, stat_path, go, goslim, out_folder):
        """Map GO terms to GOslim, write stat tables and move figures."""
        for folder in os.listdir(out_path):
            strain_stat_path = os.path.join(stat_path, folder)
            self.helper.check_make_folder(strain_stat_path)
            fig_path = os.path.join(strain_stat_path, "figs")
            # Fix: the old guard checked for "fig" in the listing although
            # the directory is named "figs", so os.mkdir() raised
            # FileExistsError on re-runs. Check the real path instead.
            if not os.path.exists(fig_path):
                os.mkdir(fig_path)
            map2goslim(goslim, go,
                       os.path.join(out_path, folder, self.all_strain),
                       os.path.join(strain_stat_path,
                                    "_".join(["stat", folder + ".csv"])),
                       out_folder)
            # map2goslim drops its plots in out_folder; collect them.
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_three_roots.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_molecular_function.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_cellular_component.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_biological_process.png"])

    def run_go_term(self, args_go):
        """Entry point: retrieve, merge and summarize GO terms.

        Always processes all CDSs; when transcripts are supplied, also
        processes the expressed subset, then removes temporary folders.
        """
        for gff in os.listdir(args_go.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_go.gffs, gff))
        self.multiparser.parser_gff(args_go.gffs, None)
        if args_go.trans is not None:
            self.multiparser.parser_gff(args_go.trans, "transcript")
        print("Computing all CDSs")
        self._retrieve_go(args_go.uniprot, self.result_all_path, "all")
        self._merge_files(args_go.gffs, self.result_all_path, self.out_all)
        self._stat(self.result_all_path, self.stat_all_path, args_go.go,
                   args_go.goslim, self.out_all)
        if args_go.trans is not None:
            print("Computing express CDSs")
            self._retrieve_go(args_go.uniprot, self.result_express_path,
                              "express")
            self._merge_files(args_go.gffs, self.result_express_path,
                              self.out_express)
            self._stat(self.result_express_path, self.stat_express_path,
                       args_go.go, args_go.goslim, self.out_express)
        self.helper.remove_tmp_dir(args_go.gffs)
        if args_go.trans is not None:
            self.helper.remove_tmp_dir(args_go.trans)
# NOTE(review): this second definition of SubLocal redefines (and shadows)
# the SubLocal class declared earlier in this file — it looks like an older
# variant (no log parameter, "all_CDS"/"expressed_CDS" folder names,
# helper.remove_tmp instead of remove_tmp_dir) left behind by a
# concatenation/merge; confirm which version callers expect.
class SubLocal(object):
    # detection of subcellular localization (legacy variant without logging)

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        # "tmp" subfolders are produced by the multiparser per-genome split.
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        # Two parallel output trees: all CDSs vs transcript-covered CDSs.
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder,
                                        "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express,
                                              "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        # Filename suffixes for PSORTb raw output and extracted tables.
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        # Create (or reset) the fixed output folder structure.
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file):
        """Write CDSs overlapping a same-strand transcript to tmp_cds.gff."""
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    # Partial overlap on either side, CDS containing the
                    # transcript, or CDS contained in the transcript.
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        """Extract CDS DNA sequences and translate them to a protein fasta.

        Returns the genome prefix (gff filename without extension).
        """
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generate CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])))
            self.helper.get_cds_seq(os.path.join(self.out_all,
                                                 "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("transfer DNA seq to protein seq of {0}".format(prefix))
        # NOTE(review): translates via a literal "tmp" file in the current
        # working directory (the newer variant uses out_folder/tmp) —
        # fragile if CWD is not writable; confirm intended.
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        # Invoke the external PSORTb binary; raw stdout/stderr are captured
        # by the caller-provided file handles.
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path,
                    tmp_result):
        """Run PSORTb ("-p" gram-positive / "-n" gram-negative)."""
        print("Running psortb of {0}".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err)
        else:
            print("Error:It is not a proper bacteria type - {0}!!".format(
                args_sub.gram))
            sys.exit()
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        """Convert raw PSORTb output into a table; optionally merge to gff."""
        if args_sub.merge:
            print("Merge to gff...")
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])), gff_file,
                os.path.join(prefix + ".gff"), args_sub.fuzzy)
            # Replace the input gff with the annotated copy.
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])), None, None,
                args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result):
        """Merge per-genome PSORTb tables per gff_folder group and stat."""
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result,
                                                     prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                self.helper.check_make_folder(os.path.join(stat_path,
                                                           prefix))
                stat_sublocal(merge_table,
                              os.path.join(stat_path, prefix, prefix),
                              os.path.join(stat_path, prefix, "_".join([
                                  "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        # Remove all temporary folders/files created during the run.
        self.helper.remove_tmp(args_sub.fastas)
        self.helper.remove_tmp(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))

    def run_sub_local(self, args_sub):
        """Entry point: predict subcellular localization with PSORTb.

        Always processes all CDSs; when transcripts are supplied, also
        processes the expressed (transcript-covered) subset.
        """
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
        self.helper.check_make_folder(self.express_tmp_path)
        self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed gene now...")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix,
                                     os.path.join(self.gff_path, gff))
            print("Running all gene now...")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff))
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result)
        self._remove_tmps(args_sub)
class TranscriptDetection(object):
    '''Detection of transcripts from RNA-Seq coverage (wiggle) files.

    Workflow (see run_transcript): compute per-strain transcripts from
    fragmented and/or dRNA-Seq (tex/notex) libraries, merge the two result
    sets, post-modify the transcripts using the genome annotation, and
    compare them with TSSs, genome annotations and terminators to produce
    statistics, tables and plots.
    '''

    def __init__(self, args_tran):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output layout: gffs/ holds the result GFFs, gffs/tmp the
        # per-strain pieces, statistics/ the comparison reports.
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # Names/paths of intermediate files shared by the steps below.
        self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                     "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                     "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
                     "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
                     "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
                     "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
                     "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                     "overlap": os.path.join(
                         self.gff_outfolder, "tmp_overlap")}
        # Filename suffixes of the per-library and final transcript GFFs.
        self.frag = "transcript_fragment.gff"
        self.tex = "transcript_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type, strain,
                            libs, args_tran):
        '''Run transcript detection for one strain (both strands).'''
        print("Computing transcripts for {0}".format(strain))
        out = os.path.join(args_tran.out_folder, "_".join([strain, wig_type]))
        detect_transcript(wig_f, wig_r, wig_folder, libs, out, wig_type,
                          args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        '''Detect transcripts for every strain found in the wig folder.

        Strain names are derived from the "*_forward.wig" files in
        <wigs>/tmp. Returns the list of strain names processed.
        '''
        strains = []
        wig_folder = os.path.join(wigs, "tmp")
        for wig in os.listdir(wig_folder):
            if wig.endswith("_forward.wig"):
                strains.append(wig.replace("_forward.wig", ""))
        for strain in strains:
            f_file = os.path.join(wig_folder, "_".join(
                [strain, "forward.wig"]))
            r_file = os.path.join(wig_folder, "_".join(
                [strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran, log):
        '''Compare each strain's transcripts with its TSS file.

        The sorted, updated GFFs replace the originals in place; the
        per-strain statistics go to stat_compare_transcript_TSS_*.csv.
        '''
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.compare_tss, "tmp"),
            "transcript", "TSS")
        print("Comaring of transcripts and TSSs")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with TSSs.\n")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                self.stat_path, "".join([
                    "stat_compare_transcript_TSS_", ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    # Replace the inputs with the sorted, annotated versions.
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["tss_ta"], os.path.join(
                            args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])
            log.write("\t" + stat_tss_out + "\n")

    def _compare_cds(self, tas, args_tran, log):
        '''Compare each strain's transcripts with its genome annotation.'''
        self.multiparser.parser_gff(args_tran.gffs, None)
        self.multiparser.combine_gff(
            self.gff_outfolder, os.path.join(args_tran.gffs, "tmp"),
            "transcript", None)
        print("Comaring of transcripts and genome annotations")
        cds_folder = os.path.join(args_tran.gffs, "tmp")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with genome annotations.\n")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(self.stat_path, "".join([
                "stat_compare_transcript_genome_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    # Replace the originals with the sorted outputs.
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.gffs, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(self.tmps["gff_ta"], os.path.join(
                        args_tran.gffs, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])
            log.write("\t" + stat_gff_out + ".\n")

    def _compare_tss_cds(self, tas, args_tran, log):
        '''Compare transcripts with CDSs and/or TSSs, as requested.'''
        if (args_tran.compare_tss is not None) and (
                args_tran.c_feature is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran, log)
            self._compare_tss(tas, args_tran, log)
        elif (args_tran.c_feature is not None) and (
                args_tran.compare_tss is None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran, log)
        elif (args_tran.c_feature is None) and (
                args_tran.compare_tss is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_tss(tas, args_tran, log)

    def _for_one_wig(self, type_, args_tran):
        '''Run transcript detection for one library type.

        type_ is "tex_notex" (dRNA-Seq) or "fragment" (fragmented libs).
        Produces <strain>_transcript_<type_>.gff in the gff out folder and
        returns the strain names.
        '''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Importing {0} wig files".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            out = os.path.join(self.gff_outfolder, "_".join([
                strain, "transcript", type_ + ".gff"]))
            print(os.path.join(args_tran.out_folder,
                               "_".join([strain, type_])))
            self.helper.sort_gff(os.path.join(args_tran.out_folder,
                                              "_".join([strain, type_])), out)
            os.remove(os.path.join(args_tran.out_folder,
                                   "_".join([strain, type_])))
        return strains

    def _for_two_wigs(self, strains, args_tran, log):
        '''Merge the results of fragmented and tex-treated libs.

        If both library types were run, combine the two per-strain GFFs;
        otherwise just rename the single result to the final name.
        '''
        if (args_tran.frag_wigs is not None) and (
                args_tran.tex_wigs is not None):
            log.write("Running combine_frag_tex.py to merge the results from "
                      "fragmented libs and dRNA-Seq libs.\n")
            print("Merging fragmented and tex treated ones")
            for strain in strains:
                frag_gff = os.path.join(self.gff_outfolder,
                                        "_".join([strain, self.frag]))
                tex_gff = os.path.join(self.gff_outfolder,
                                       "_".join([strain, self.tex]))
                final_gff = os.path.join(self.gff_outfolder,
                                         "_".join([strain, self.endfix_tran]))
                # NOTE(review): if either per-strain file is missing from
                # the folder, frag_file/tex_file stay unbound and the
                # combine() call below raises NameError — relies on
                # _for_one_wig having produced both files. TODO confirm.
                for gff in os.listdir(self.gff_outfolder):
                    if "_transcript_" in gff:
                        filename = gff.split("_transcript_")
                        if (strain == filename[0]) and (
                                "tex_notex.gff" == filename[1]):
                            tex_file = gff
                        elif (strain == filename[0]) and (
                                "fragment.gff" == filename[1]):
                            frag_file = gff
                combine(os.path.join(self.gff_outfolder, frag_file),
                        os.path.join(self.gff_outfolder, tex_file),
                        args_tran.tolerance,
                        os.path.join(self.gff_outfolder,
                                     "_".join([strain, self.endfix_tran])))
                os.remove(frag_gff)
                os.remove(tex_gff)
                log.write("\t" + final_gff + " is generated.\n")
        else:
            if args_tran.frag_wigs is not None:
                for strain in strains:
                    frag_gff = os.path.join(
                        self.gff_outfolder, "_".join([strain, self.frag]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(frag_gff, final_gff)
                    log.write("\t" + final_gff + " is generated.\n")
            elif args_tran.tex_wigs is not None:
                for strain in strains:
                    tex_gff = os.path.join(
                        self.gff_outfolder, "_".join([strain, self.tex]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(tex_gff, final_gff)
                    log.write("\t" + final_gff + " is generated.\n")

    def _post_modify(self, tas, args_tran):
        '''Modify the transcripts by comparing with the genome annotation.

        For each strain: fill gaps ("overlap" and "uni" modes), merge and
        sort the two outputs, extend transcripts (longer_ta), and finally
        replace the gff out folder with the modified set.
        '''
        for ta in tas:
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} by refering to {1}".format(ta, gff))
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "overlap", self.tmps["overlap"], args_tran.modify)
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "uni", self.tmps["uni"], args_tran.modify)
            tmp_merge = os.path.join(self.gff_outfolder, self.tmps["merge"])
            # Remove a stale tmp_merge before appending, otherwise
            # merge_file would extend leftovers from a previous run.
            # (Was a substring test against the folder path string, which
            # never matched; membership in os.listdir is the intended
            # check, consistent with _remove_file below.)
            if self.tmps["merge"] in os.listdir(self.gff_outfolder):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder, "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(final_out,
                        os.path.join(self.tmps["tran"],
                                     "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        # Swap the modified transcript set in as the new gff out folder.
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        '''Clean up temporary folders and files of the whole run.'''
        if "tmp_wig" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "tmp_wig"))
        if "merge_wigs" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "merge_wigs"))
        self.helper.remove_tmp_dir(args_tran.gffs)
        self.helper.remove_tmp_dir(args_tran.compare_tss)
        self.helper.remove_tmp_dir(args_tran.terms)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran, log):
        '''Search the associated terminator for each transcript.'''
        if args_tran.terms is not None:
            print("Comparing between terminators and transcripts")
            self.multiparser.parser_gff(args_tran.terms, "term")
            if args_tran.gffs is not None:
                self.multiparser.combine_gff(
                    args_tran.gffs,
                    os.path.join(args_tran.terms, "tmp"),
                    None, "term")
            log.write("Running compare_tran_term.py to compare transcripts "
                      "with terminators.\n")
            compare_term_tran(self.gff_outfolder,
                              os.path.join(args_tran.terms, "tmp"),
                              args_tran.fuzzy_term, args_tran.fuzzy_term,
                              args_tran.out_folder, "transcript",
                              args_tran.terms, self.gff_outfolder)
            for file_ in os.listdir(os.path.join(args_tran.out_folder,
                                                 "statistics")):
                if file_.startswith("stat_compare_transcript_terminator_"):
                    log.write("\t" + file_ + " is generated.\n")

    def _re_table(self, args_tran, log):
        '''Regenerate the coverage-detail column of every transcript table.'''
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for gff in os.listdir(self.gff_outfolder):
            if os.path.isfile(os.path.join(self.gff_outfolder, gff)):
                tran_table = os.path.join(args_tran.out_folder, "tables",
                                          gff.replace(".gff", ".csv"))
                reorganize_table(args_tran.libs, args_tran.merge_wigs,
                                 "Coverage_details", tran_table)
                log.write("\t" + tran_table + "\n")

    def _list_files(self, folder, log, end):
        '''Log the files in folder; if end is given, only those ending in it.'''
        log.write("The following files in {0} are generated:\n".format(folder))
        for file_ in os.listdir(folder):
            if (end is not None) and (file_.endswith(end)):
                log.write("\t" + file_ + "\n")
            elif end is None:
                log.write("\t" + file_ + "\n")

    def run_transcript(self, args_tran, log):
        '''Entry point: detect, merge, modify and compare transcripts.'''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            log.write("No wig file is assigned.\n")
            print("Error: There is no wiggle file!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on fragmented libs.\n")
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on dRNA-Seq libs.\n")
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran, log)
        tas = []
        # Unless modification is disabled, sort the annotation GFFs in
        # place before they are used as references.
        if "none" not in args_tran.modify:
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(os.path.join(args_tran.gffs, gff),
                                         self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
        self.multiparser.combine_gff(args_tran.gffs, os.path.join(
            args_tran.gffs, "tmp"), None, None)
        self.multiparser.parser_gff(self.gff_outfolder, "transcript")
        self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                     None, "transcript")
        self.helper.check_make_folder(self.tmps["tran"])
        # Collect the strains that actually produced transcripts.
        for ta in os.listdir(self.tran_path):
            if ta.endswith(".gff"):
                if os.path.getsize(os.path.join(self.tran_path, ta)) != 0:
                    tas.append(ta.replace("_" + self.endfix_tran, ""))
        log.write("Running fill_gap.py to modify transcripts "
                  "based on genome annotations.\n")
        self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran, log)
        self._compare_term_tran(args_tran, log)
        print("Generating tables for the details")
        log.write("Running gen_table_tran.py to generate the table of transcripts.\n")
        gen_table_transcript(self.gff_outfolder, args_tran)
        self._list_files(os.path.join(args_tran.out_folder, "tables"),
                         log, None)
        log.write("Running plot_tran to plot the distribution of the length of "
                  "the transcripts.\n")
        plot_tran(self.gff_outfolder, self.stat_path, args_tran.max_dist)
        self._list_files(self.stat_path, log, ".png")
        self._re_table(args_tran, log)
        self._remove_file(args_tran)
class Ribos(object):
    '''Detection of riboswitches and RNA thermometers.

    For each requested feature type ("riboswitch", "thermometer" or
    "both"), candidate sequences are extracted around genes, scanned
    against the corresponding Rfam covariance models with cmscan, the hits
    re-scanned, merged per strain, and converted to GFF plus statistics.
    '''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # Per-input tmp folders produced by the multiparser.
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # Compare case-insensitively, consistent with run_ribos below;
        # otherwise a mixed-case program value would pass run_ribos' gate
        # but hit an AttributeError on the never-created attributes here.
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                args_ribo.ribos_out_folder, "riboswitch",
                args_ribo.database)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                args_ribo.thermo_out_folder, "RNA_thermometer",
                args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        '''Build the output/tmp path set and filename suffixes for a feature.

        Only computes paths; nothing is created on disk here.
        '''
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {"fasta": os.path.join(out_folder, "tmp_fasta"),
                     "scan": os.path.join(out_folder, "tmp_scan"),
                     "table": os.path.join(out_folder, "tmp_table")}
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {"csv": feature + ".csv",
                   "txt": feature + "_prescan.txt",
                   "re_txt": feature + "_scan.txt",
                   "re_csv": feature + "_scan.csv"}
        return (stat_folder, gff_outfolder, table_folder, scan_folder,
                tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files,
                    suffixs, rfam):
        '''Run cmscan of seq against rfam; return the output file path.

        type_ selects the suffix ("txt" for the pre-scan, "re_txt" for
        the re-scan).
        '''
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        # Context manager closes the output file even if call() raises.
        with open(scan_file, "w") as scan:
            call([args_ribo.cmscan_path, "--incE",
                  str(args_ribo.e_value), "--acc", rfam, seq], stdout=scan)
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam):
        '''Extract candidate sequences and scan them (two passes).

        Pass 1 (pre-scan) finds hits; the hit regions are regenerated as a
        second fasta and scanned again; the final table replaces the first.
        Appends each processed prefix to prefixs and returns it.
        '''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"], prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for {0}".format(
                    prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.tss_path, prefix + "_TSS.gff"),
                    os.path.join(self.tran_path, prefix + "_transcript.gff"),
                    first_seq, args_ribo, feature)
                print("Pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_cmscan(
                    args_ribo, first_seq, "txt", prefix,
                    tmp_files, suffixs, rfam)
                sec_seq = os.path.join(tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["csv"]]))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                print("Scanning of {0}".format(prefix))
                sec_scan_file = self._run_cmscan(
                    args_ribo, sec_seq, "re_txt", prefix,
                    tmp_files, suffixs, rfam)
                sec_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id,
                       gff_outfolder, feature):
        '''Merge per-seq-id results into per-file outputs and compute stats.

        A gff file may contain several seq_ids; the first one is copied,
        the rest appended, so one table/scan set results per input file.
        '''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            # First seq_id of this file: start the table.
                            shutil.copyfile(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            # Subsequent seq_ids: append to the table.
                            self.helper.merge_file(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                    stat_folder,
                    "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                stat_and_covert2gff(os.path.join(
                    table_folder, "_".join([prefix, suffixs["csv"]])),
                    feature_id, os.path.join(gff_outfolder, "_".join([
                        prefix, feature + ".gff"])),
                    args_ribo.fuzzy, out_stat, feature)
                fh.close()

    def _remove_tmp(self, args_ribo):
        '''Remove the multiparser tmp folders of all inputs.'''
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs):
        '''Drop overlapping candidates from each strain's result table.'''
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(
                    os.path.join(os.path.join(
                        tmp_files["table"],
                        "_".join([gff.replace(".gff", ""),
                                  suffixs["csv"]]))),
                    os.path.join(gff_path, gff))

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder):
        '''Main detection pipeline for one feature type.'''
        # Build and compress the feature-specific Rfam model subset.
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        print("Compressing Rfam of " + feature)
        call([args_ribo.cmpress_path, "-F", rfam])
        prefixs = []
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(
            prefixs, args_ribo, tmp_files, suffixs, feature, rfam)
        self._remove_overlap(self.gff_path, tmp_files, suffixs)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature)
        mapping_ribos(table_folder, feature_id, feature)
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo):
        '''Entry point: validate input, parse data, run the prediction(s).'''
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_ribo.gffs, gff))
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            print("Detecting riboswtiches now")
            self._core_prediction(
                args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                self.ribos_tmp_files, self.ribos_table_folder,
                "riboswitch", self.ribos_scan_folder, self.ribos_suffixs,
                self.ribos_stat_folder, self.ribos_gff_outfolder,
                args_ribo.ribos_out_folder)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(
                args_ribo, args_ribo.thermo_id, self.thermo_rfam,
                self.thermo_tmp_files, self.thermo_table_folder,
                "RNA_thermometer", self.thermo_scan_folder,
                self.thermo_suffixs, self.thermo_stat_folder,
                self.thermo_gff_outfolder, args_ribo.thermo_out_folder)
        self._remove_tmp(args_ribo)
class TSSpredator(object): def __init__(self, args_tss): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.master = os.path.join(args_tss.out_folder, "MasterTables") self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta": "tmp_tss", "tmp": "tmp"} if args_tss.ta_files is not None: self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp") else: self.tmps["ta"] = None self.gff_path = os.path.join(args_tss.gffs, "tmp") if args_tss.manual is not None: self.manual_path = os.path.join(args_tss.manual, "tmp") self.wig_path = os.path.join(args_tss.wig_folder, "tmp") self.fasta_path = os.path.join(args_tss.fastas, "tmp") self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics") self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs") def _assign_dict(self, lib_datas): return {"wig": lib_datas[0], "tex": lib_datas[1], "condition": int(lib_datas[2]), "replicate": lib_datas[3], "strand": lib_datas[4]} def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set): for num_id in range(1, lib_num+1): cond_list = [] for lib in lib_list: if num_id == lib["condition"]: cond_list.append(lib) cond_sort_list = sorted(cond_list, key=lambda k: k['replicate']) reps = [] for cond in cond_sort_list: out.write("{0}_{1}{2} = {3}\n".format( prefix, cond["condition"], cond["replicate"], os.path.join(wig_folder, cond["wig"]))) reps.append(cond["replicate"]) for rep in sorted(rep_set): if rep not in reps: out.write("{0}_{1}{2} = \n".format( prefix, cond["condition"], rep)) def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log): print("Running TSSpredator for " + prefix) log.write("Make sure the version of TSSpredator is at least 1.06.\n") out = open(os.path.join(out_path, "log.txt"), "w") err = open(os.path.join(out_path, "err.txt"), "w") log.write(" ".join(["java", "-jar", tsspredator_path, config_file]) + "\n") call(["java", "-jar", tsspredator_path, config_file], stdout=out, stderr=err) 
out.close() err.close() log.write("Done!\n") log.write("The following files are generated in {0}:\n".format(out_path)) for file_ in os.listdir(out_path): log.write("\t" + file_ + "\n") def _import_lib(self, libs, wig_folder, project_strain_name, out, gff, program, fasta): lib_dict = {"fp": [], "fm": [], "nm": [], "np": []} lib_num = 0 rep_set = set() list_num_id = [] for lib in libs: lib_datas = lib.split(":") if not lib_datas[0].endswith(".wig"): print("Error: Wiggle files are not end with .wig!") sys.exit() for wig in os.listdir(wig_folder): filename = wig.split("_STRAIN_") if (filename[0] == lib_datas[0][:-4]) and ( filename[1][:-4] == project_strain_name): lib_datas[0] = wig if int(lib_datas[2]) > lib_num: lib_num = int(lib_datas[2]) if lib_datas[3] not in rep_set: rep_set.add(lib_datas[3]) if (lib_datas[1] == "tex") and (lib_datas[4] == "+"): lib_dict["fp"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"): lib_dict["fm"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"): lib_dict["np"].append(self._assign_dict(lib_datas)) elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"): lib_dict["nm"].append(self._assign_dict(lib_datas)) for num_id in range(1, lib_num+1): out.write("annotation_{0} = {1}\n".format(num_id, gff)) if program.lower() == "tss": self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "fivePrimePlus", rep_set) elif program.lower() == "ps": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "fivePrimeMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "fivePrimePlus", rep_set) else: print("Error: Wrong program name! 
Please assing tss " "or processing_site.") sys.exit() for num_id in range(1, lib_num+1): out.write("genome_{0} = {1}\n".format(num_id, fasta)) for num_id in range(1, lib_num+1): list_num_id.append(str(num_id)) return lib_num, num_id, rep_set, lib_dict, list_num_id def _print_repmatch(self, args_tss, out): '''check replicate match''' detect_all = False for rep in args_tss.repmatch: if "all" in rep: detect_all = True match = rep.split("_")[-1] out.write("minNumRepMatches = {0}\n".format(match)) break if not detect_all: nums = {} matchs = {} for match in args_tss.repmatch: lib = match.split("_")[0] rep = match.split("_")[-1] matchs[lib] = rep if rep not in nums.keys(): nums[rep] = 1 else: nums[rep] += 1 for rep, num in nums.items(): if num == max(nums.values()): out.write("minNumRepMatches = {0}\n".format(rep)) max_rep = rep break for lib, rep in matchs.items(): if rep != max_rep: out.write("minNumRepMatches_{0} = {1}\n".format( lib, rep)) def _extract_best_para(self, args_tss, prefix, log): detect = False for best_file in os.listdir(args_tss.auto_load): if best_file == "_".join(["best", prefix + ".csv"]): bh = open(os.path.join(args_tss.auto_load, best_file),"r" ) lines = bh.readlines() bh.close() if len(lines[len(lines)-1].split("\t")) < 8: print("Error: some information in {0} is missing. " "It may be due to that \"optimize_tss_ps\" did " "not finish successfully.".format(best_file)) log.write("Error: some information in {0} is missing. " "It may be due to that \"optimize_tss_ps\" did " "not finish successfully.\n".format(best_file)) sys.exit() else: para_info = lines[len(lines)-1].split("\t")[1].split("_") detect_all = all(elem in para_info for elem in ["he", "rh", "fa", "rf", "bh", "ef", "pf"]) if (not detect_all) or (len(para_info) != 14): print("Error: {0} is complete. Some parameters are " "missing!".format(best_file)) log.write("Error: {0} is complete. 
Some parameters " "are missing!\n".format(best_file)) sys.exit() else: detect = True height = para_info[para_info.index("he") + 1] height_reduction = para_info[ para_info.index("rh") + 1] factor = para_info[para_info.index("fa") + 1] factor_reduction = para_info[ para_info.index("rf") + 1] base_height = para_info[ para_info.index("bh") + 1] enrichment_factor = para_info[ para_info.index("ef") + 1] processing_factor = para_info[ para_info.index("pf") + 1] if detect: return height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor else: print("Error: No best_{0}.csv can be found in {1}! ".format( prefix, args_tss.auto_load)) log.write("Error: No best_{0}.csv can be found in {1}\n".format( prefix, args_tss.auto_load)) sys.exit() def _get_input_para(self, args_tss, prefix, log): if args_tss.genome_order is None: height = args_tss.height[0] height_reduction = args_tss.height_reduction[0] factor = args_tss.factor[0] factor_reduction = args_tss.factor_reduction[0] base_height = args_tss.base_height[0] enrichment_factor = args_tss.enrichment_factor[0] processing_factor = args_tss.processing_factor[0] else: if prefix not in args_tss.genome_order: print("Error: the parameters for {0} were not assigned!".format( prefix)) log.write("Error: the parameters for {0} were not assigned!\n".format( prefix)) sys.exit() else: index = args_tss.genome_order.index(prefix) height = args_tss.height[index] height_reduction = args_tss.height_reduction[index] factor = args_tss.factor[index] factor_reduction = args_tss.factor_reduction[index] base_height = args_tss.base_height[index] enrichment_factor = args_tss.enrichment_factor[index] processing_factor = args_tss.processing_factor[index] return height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor def _gen_config(self, project_strain_name, args_tss, gff, wig_folder, fasta, config_file, log): '''generation of config files''' log.write("Generating 
config files for TSSpredator.\n") if args_tss.auto_load is not None: height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor = \ self._extract_best_para(args_tss, project_strain_name, log) else: height, height_reduction, factor, factor_reduction, \ base_height, enrichment_factor, processing_factor = \ self._get_input_para(args_tss, project_strain_name, log) master_folder = "MasterTable_" + project_strain_name out_path = os.path.join(self.master, master_folder) self.helper.check_make_folder(out_path) out = open(config_file, "w") out.write("TSSinClusterSelectionMethod = HIGHEST\n") out.write("allowedCompareShift = 1\n") out.write("allowedRepCompareShift = 1\n") lib_num, num_id, rep_set, lib_dict, list_num_id = \ self._import_lib(args_tss.libs, wig_folder, project_strain_name, out, gff, args_tss.program, fasta) out.write("idList = ") out.write(",".join(list_num_id) + "\n") out.write("maxASutrLength = 100\n") out.write("maxGapLengthInGene = 500\n") out.write("maxNormalTo5primeFactor = {0}\n".format( processing_factor)) out.write("maxTSSinClusterDistance = {0}\n".format( args_tss.cluster + 1)) out.write("maxUTRlength = {0}\n".format(args_tss.utr_length)) out.write("min5primeToNormalFactor = {0}\n".format( enrichment_factor)) out.write("minCliffFactor = {0}\n".format(factor)) out.write("minCliffFactorDiscount = {0}\n".format( factor_reduction)) out.write("minCliffHeight = {0}\n".format(height)) out.write("minCliffHeightDiscount = {0}\n".format( height_reduction)) out.write("minNormalHeight = {0}\n".format(base_height)) self._print_repmatch(args_tss, out) out.write("minPlateauLength = 0\n") out.write("mode = cond\n") out.write("normPercentile = 0.9\n") if args_tss.program.lower() == "tss": self._print_lib(lib_num, lib_dict["nm"], out, wig_folder, "normalMinus", rep_set) self._print_lib(lib_num, lib_dict["np"], out, wig_folder, "normalPlus", rep_set) else: self._print_lib(lib_num, lib_dict["fm"], out, wig_folder, 
"normalMinus", rep_set) self._print_lib(lib_num, lib_dict["fp"], out, wig_folder, "normalPlus", rep_set) out.write("numReplicates = {0}\n".format(len(rep_set))) out.write("numberOfDatasets = {0}\n".format(lib_num)) out.write("outputDirectory = {0}\n".format(out_path)) for prefix_id in range(len(args_tss.output_prefixs)): out.write("outputPrefix_{0} = {1}\n".format( prefix_id + 1, args_tss.output_prefixs[prefix_id])) out.write("projectName = {0}\n".format(project_strain_name)) out.write("superGraphCompatibility = igb\n") out.write("texNormPercentile = 0.5\n") out.write("writeGraphs = 0\n") out.write("writeNocornacFiles = 0\n") log.write("\t" + config_file + " is generated.\n") out.close() def _convert_gff(self, prefixs, args_tss, log): for prefix in prefixs: out_file = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program]) + ".gff") gff_f = open(out_file, "w") out_path = os.path.join(self.master, "_".join([ "MasterTable", prefix])) if "MasterTable.tsv" not in os.listdir(out_path): print("Error: There is not MasterTable file in {0} ".format( out_path)) print("Please check configuration file.") log.write("not MasterTable file is found in {0}\n".format( out_path)) else: if args_tss.program.lower() == "processing": feature = "processing_site" elif args_tss.program.lower() == "tss": feature = "TSS" self.converter.convert_mastertable2gff( os.path.join(out_path, "MasterTable.tsv"), "ANNOgesic", feature, prefix, out_file) log.write("\t" + out_file + "is generated.\n") gff_f.close() def _merge_manual(self, tsss, args_tss): '''if manual detected TSS is provided, it can merge manual detected TSS and TSSpredator predicted TSS''' self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tss"])) for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break filename = "_".join([tss, args_tss.program]) + ".gff" predict = os.path.join(self.gff_outfolder, filename) manual = os.path.join(self.manual_path, tss + ".gff") 
fasta = os.path.join(self.fasta_path, tss + ".fa") stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss) if os.path.exists(manual): print("Merging and classiflying manually-detected " "TSSs for {0}".format(tss)) merge_manual_predict_tss( predict, stat_file, os.path.join(self.tmps["tss"], filename), os.path.join(args_tss.gffs, gff), args_tss, manual, fasta) if os.path.exists(stat_file): shutil.move(stat_file, os.path.join( args_tss.out_folder, "statistics", tss, stat_file)) self.helper.move_all_content(self.tmps["tss"], self.gff_outfolder, [".gff"]) shutil.rmtree(self.tmps["tss"]) def _validate(self, tsss, args_tss, log): '''validate TSS with genome annotation''' print("Validating TSSs with genome annotations") log.write("Running validate_gene.py to compare genome " "annotations and TSSs/PSs.\n") for tss in tsss: for gff in os.listdir(args_tss.gffs): if (gff[:-4] == tss) and (".gff" in gff): break stat_file = os.path.join( self.stat_outfolder, tss, "".join(["stat_gene_vali_", tss, ".csv"])) out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff") if args_tss.program.lower() == "tss": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) elif args_tss.program.lower() == "processing": compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "processing.gff"])) validate_gff(compare_file, os.path.join(args_tss.gffs, gff), stat_file, out_cds_file, args_tss.utr_length, args_tss.program.lower()) log.write("\t" + stat_file + " is generated.\n") shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff)) def _compare_ta(self, tsss, args_tss, log): '''compare TSS with transcript''' detect = False log.write("Running stat_TA_comparison to compare transcripts " "and TSSs/PSs.\n") print("Comparing transcripts and TSSs") self.multiparser.parser_gff(args_tss.ta_files, "transcript") self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"], None, "transcript") for tss in tsss: stat_out = os.path.join( self.stat_outfolder, tss, "".join([ 
"stat_compare_TSS_transcript_", tss, ".csv"])) for ta in os.listdir(self.tmps["ta"]): filename = ta.split("_transcript") if (filename[0] == tss) and (filename[1] == ".gff"): detect = True break compare_file = os.path.join(self.gff_outfolder, "_".join([tss, "TSS.gff"])) if detect: stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file, stat_out, self.tmps["ta_tss"], self.tmps["tss_ta"], args_tss.fuzzy) self.helper.sort_gff(self.tmps["tss_ta"], compare_file) self.helper.sort_gff(self.tmps["ta_tss"], os.path.join(args_tss.ta_files, ta)) os.remove(self.tmps["tss_ta"]) os.remove(self.tmps["ta_tss"]) detect = False log.write("\t" + stat_out + " is generated.\n") def _stat_tss(self, tsss, feature, log): print("Running statistaics") for tss in tsss: compare_file = os.path.join(self.gff_outfolder, "_".join([tss, feature]) + ".gff") stat_tsspredator( compare_file, feature, os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "class", tss]) + ".csv"), os.path.join(self.stat_outfolder, tss, "_".join([ "stat", feature, "libs", tss]) + ".csv")) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_class", ".png"]) if os.path.exists(os.path.join( self.stat_outfolder, "TSSstatistics.tsv")): shutil.move( os.path.join( self.stat_outfolder, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, tss, "TSSstatistics.tsv")) plot_venn(compare_file, feature) self.helper.move_all_content(os.getcwd(), os.path.join( self.stat_outfolder, tss), ["_venn", ".png"]) log.write("The following files in {0} are generated:\n".format( (os.path.join(self.stat_outfolder, tss)))) for file_ in os.listdir(os.path.join( self.stat_outfolder, tss)): log.write("\t" + file_ + "\n") def _get_prefixs(self, args_tss): prefixs = [] detect = False for fasta in os.listdir(self.fasta_path): run = False for gff in os.listdir(self.gff_path): if fasta[:-3] == gff[:-4]: prefix = fasta[:-3] for wig in os.listdir(self.wig_path): filename = wig.split("_STRAIN_") if 
filename[1][:-4] == prefix: detect = True break if detect: prefixs.append(prefix) return prefixs def _merge_wigs(self, wig_folder, prefix, libs): self.helper.check_make_folder(os.path.join(os.getcwd(), self.tmps["tmp"])) for wig_file in os.listdir(wig_folder): for lib in libs: info = lib.split(":") if (info[0][:-4] in wig_file) and (info[-1] == "+") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_forward.wig")) if (info[0][:-4] in wig_file) and (info[-1] == "-") and ( prefix in wig_file) and ( os.path.isfile(os.path.join(wig_folder, wig_file))): Helper().merge_file( os.path.join(wig_folder, wig_file), os.path.join("tmp", "merge_reverse.wig")) def _check_orphan(self, prefixs, wig_folder, args_tss): '''if genome has no locus tag, it can use for classify the TSS''' for prefix in prefixs: self._merge_wigs(wig_folder, prefix, args_tss.libs) tmp_tss = os.path.join(self.tmps["tmp"], "_".join([ prefix, args_tss.program + ".gff"])) pre_tss = os.path.join(self.gff_outfolder, "_".join([ prefix, args_tss.program + ".gff"])) check_orphan(pre_tss, os.path.join( args_tss.gffs, prefix + ".gff"), "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss) shutil.move(tmp_tss, pre_tss) shutil.rmtree("tmp") def _remove_files(self, args_tss): print("Remove temperary files and folders") self.helper.remove_tmp_dir(args_tss.fastas) self.helper.remove_tmp_dir(args_tss.gffs) self.helper.remove_tmp_dir(args_tss.ta_files) if "merge_forward.wig" in os.listdir(os.getcwd()): os.remove("merge_forward.wig") if "merge_reverse.wig" in os.listdir(os.getcwd()): os.remove("merge_reverse.wig") shutil.rmtree(args_tss.wig_folder) if args_tss.manual is not None: shutil.rmtree(args_tss.manual) def _deal_with_overlap(self, out_folder, args_tss): '''deal with the situation that TSS and processing site at the same position''' if not args_tss.overlap_feature: pass else: print("Comparing TSSs 
and Processing sites") if args_tss.program.lower() == "tss": for tss in os.listdir(out_folder): if tss.endswith("_TSS.gff"): ref = self.helper.get_correct_file( args_tss.overlap_gffs, "_processing.gff", tss.replace("_TSS.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.program, args_tss.cluster) elif args_tss.program.lower() == "processing": for tss in os.listdir(out_folder): if tss.endswith("_processing.gff"): ref = self.helper.get_correct_file( args_tss.overlap_gffs, "_TSS.gff", tss.replace("_processing.gff", ""), None, None) filter_tss_pro(os.path.join(out_folder, tss), ref, args_tss.program, args_tss.cluster) def _low_expression(self, args_tss, gff_folder): '''deal with the low expressed TSS''' prefix = None self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs) for gff in os.listdir(gff_folder): if (args_tss.program.lower() == "tss") and ( gff.endswith("_TSS.gff")): prefix = gff.replace("_TSS.gff", "") elif (args_tss.program.lower() == "processing") and ( gff.endswith("_processing.gff")): prefix = gff.replace("_processing.gff", "") if prefix: out = open(os.path.join( self.stat_outfolder, prefix, "_".join([ "stat", prefix, "low_expression_cutoff.csv"])), "w") out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n") cutoff = filter_low_expression( os.path.join(gff_folder, gff), args_tss, "tmp/merge_forward.wig", "tmp/merge_reverse.wig", "tmp/without_low_expression.gff") out.write("\t".join([prefix, str(cutoff)]) + "\n") os.remove(os.path.join(gff_folder, gff)) shutil.move("tmp/without_low_expression.gff", os.path.join(gff_folder, gff)) prefix = None out.close() def run_tsspredator(self, args_tss, log): input_folder = os.path.join(args_tss.out_folder, "configs") for gff in os.listdir(args_tss.gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join( args_tss.gffs, gff)) self.helper.check_make_folder(self.gff_outfolder) self.multiparser.parser_fasta(args_tss.fastas) 
self.multiparser.parser_gff(args_tss.gffs, None) self.multiparser.parser_wig(args_tss.wig_folder) prefixs = self._get_prefixs(args_tss) for prefix in prefixs: config = os.path.join(input_folder, "_".join(["config", prefix]) + ".ini") self._gen_config( prefix, args_tss, os.path.join(self.gff_path, prefix + ".gff"), self.wig_path, os.path.join(self.fasta_path, prefix + ".fa"), config, log) out_path = os.path.join( self.master, "_".join(["MasterTable", prefix])) config_file = os.path.join( input_folder, "_".join(["config", prefix]) + ".ini") self._start_to_run(args_tss.tsspredator_path, config_file, out_path, prefix, log) if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")): shutil.move(os.path.join(out_path, "TSSstatistics.tsv"), os.path.join( self.stat_outfolder, "TSSstatistics.tsv")) if args_tss.program.lower() == "ps": args_tss.program = "processing" self._convert_gff(prefixs, args_tss, log) if args_tss.check_orphan: print("checking the orphan TSSs") log.write("Running check_orphan.py to re-check orphan TSSs.\n") self._check_orphan(prefixs, os.path.join(args_tss.wig_folder, "tmp"), args_tss) self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder, None, args_tss.program) datas = [] for gff in os.listdir(self.gff_outfolder): if gff.endswith(".gff"): gff_folder = gff.replace("".join(["_", args_tss.program, ".gff"]), "") self.helper.check_make_folder( os.path.join(self.stat_outfolder, gff_folder)) datas.append(gff_folder) if args_tss.remove_low_expression is not None: log.write("Running filter_low_expression.py to filter out " "low expressed TSS/PS.\n") self._low_expression(args_tss, self.gff_outfolder) if args_tss.manual is not None: self.multiparser.parser_gff(args_tss.manual, None) self.multiparser.combine_gff(args_tss.gffs, self.manual_path, None, None) self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path, None) self.multiparser.combine_wig(args_tss.gffs, self.wig_path, None, args_tss.libs) log.write("Running merge_manual.py to merge the 
manual TSSs.\n") self._merge_manual(datas, args_tss) log.write("Running filter_TSS_pro.py to deal with the overlap " "position between TSS and PS.\n") self._deal_with_overlap(self.gff_outfolder, args_tss) log.write("Running stat_TSSpredator.py to do statistics.\n") self._stat_tss(datas, args_tss.program, log) if args_tss.validate: self._validate(datas, args_tss, log) if args_tss.ta_files is not None: self._compare_ta(datas, args_tss, log) self._remove_files(args_tss)
class TargetFasta(object):
    '''Generation of the updated ("target") genome fasta files.

    Applies a mutation table to the reference sequences so the corrected
    genomes can be used for re-mapping.
    '''

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # scratch folder that receives the mutated (target) sequences
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        """Stage the reference fasta files in a temporary working folder.

        Copies every file of ``ref_files`` into ``out_folder/tmp_reference``,
        lets the multiparser split them, and (re)creates the empty
        ``tmp_tar`` scratch folder. Returns the new reference folder path.
        """
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        if os.path.exists(self.folders["tmp_tar"]):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files, combine,
                         out_folder):
        """Generate updated genome fasta files driven by ``mut_table``.

        mut_table: tab-separated table; rows whose first column starts
            with ``#`` are comments, column 2 holds the genome name.
        combine: if True, concatenate all updated fasta files into a
            single ``updated_genomes.fa``.
        """
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Updating the reference sequences")
        pre_strain = None
        out = None
        # Use a context manager so the mutation table handle is always
        # closed (the original never closed it). Comment rows are skipped
        # before indexing row[1] to avoid IndexError on short rows.
        with open(mut_table, "r") as mh:
            for row in csv.reader(mh, delimiter='\t'):
                if row[0].startswith("#"):
                    continue
                strain = row[1]
                # Only (re)open the per-genome file when the strain changes.
                # The original never updated pre_strain, so it re-truncated
                # and rewrote the same file for every row of a strain.
                if pre_strain != strain:
                    pre_strain = strain
                    fasta = os.path.join(out_folder, "fasta_files",
                                         strain + ".fa")
                    if out is not None:
                        out.close()
                    out = open(fasta, "w")
                    src = os.path.join(self.folders["tmp_tar"],
                                       strain + ".fa")
                    if os.path.exists(src):
                        with open(src) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        print("Error: No fasta information of {0}.fa".format(
                            strain))
        # Guard: out stays None when the table has no data rows; the
        # original unconditionally called out.close() and crashed here.
        if out is not None:
            out.close()
        if combine:
            self._combine_fasta(out_folder)
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta files to remapping again.")

    def _combine_fasta(self, out_folder):
        """Merge every per-genome ``.fa`` under ``out_folder/fasta_files``
        into one ``updated_genomes.fa`` (removing the per-genome files).

        Pure-Python replacement for the original ``os.system("cat … >> …")``
        shell concatenation; the merged file is assembled in the current
        working directory (as before) and then moved into ``fasta_files``.
        """
        fasta_folder = os.path.join(out_folder, "fasta_files")
        out_seq = "updated_genomes.fa"
        if os.path.exists(out_seq):
            os.remove(out_seq)
        with open(out_seq, "a") as merged:
            for seq in os.listdir(fasta_folder):
                if seq.endswith(".fa"):
                    with open(os.path.join(fasta_folder, seq)) as f_h:
                        shutil.copyfileobj(f_h, merged)
                    os.remove(os.path.join(fasta_folder, seq))
        shutil.move(out_seq, os.path.join(fasta_folder, out_seq))
class RATT(object):
    '''Annotation transfer via RATT: run RATT on reference/target genome
    pairs, then convert the transferred embl results to gff/ptt/rnt.'''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # scratch folder for splitting multi-record Genbank files
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # temporary files/folders used while merging the per-genome outputs
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        """Generate .rnt and .ptt companions for every .gff in ``files``
        that has a matching target fasta file."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                # gff[:-3] keeps the trailing dot, so only the extension
                # letters are swapped
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix,
                                                     None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Clean up: drop intermediate gff/ptt/rnt files, promote the merged
        results from the tmp output folder, and remove scratch folders."""
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas,
                                       "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas,
                                       "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT embl result to gff, fix its format, and record
        the produced filename in ``files``."""
        name = ratt_result.split(".")
        # RATT result names look like <tag>.<genome...>.final.embl; the
        # middle parts form the genome name
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record Genbank files into one .gbk per record.

        Records are delimited by a leading ``LOCUS`` line and a closing
        ``//`` line; the output filename is taken from the LOCUS field and
        overridden by the VERSION field when that differs.
        Returns the folder holding the split files.
        """
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        # NOTE(review): ``out`` is first assigned here; if a
                        # file does not start with a LOCUS line, the
                        # ``if out:`` below would hit an unbound name —
                        # confirm input files always begin with LOCUS.
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        # NOTE(review): str.find used as a boolean — this is
                        # False only when ``filename`` occurs at index 0 of
                        # ``new_filename``; presumably the intent is "use the
                        # VERSION-based name when it differs". Verify.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        # end of record: flush it out under its final name
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        """Collect the .gbk files in ``ref_embls``, split them per record,
        and convert them to embl format (moved into ``self.embl``).

        Exits the program when no .gbk file is found.
        Returns the folder of split gbk files (or None).
        """
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if embl.endswith(".gbk"):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Invoke the external RATT executable for one ref/target pair,
        sending stdout to ``out`` and discarding stderr."""
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for every ``ref:target`` pair and tidy the files RATT
        drops into the current working directory."""
        print("Running RATT...")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # per-pair log handle; "w+" truncates the shared log each time
            out = open(self.ratt_log, "w+")
            print(tar)
            self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    # keep the transferred annotation results
                    shutil.move(filename, os.path.join(
                        args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    # RATT side products — remove whether file or folder
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
            out.close()

    def annotation_transfer(self, args_ratt):
        """Entry point: run RATT for all pairs, optionally convert the
        results to gff/ptt/rnt, merge them per target genome, and clean up.
        """
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            # merge the per-record outputs back into one gff/ptt/rnt per
            # target genome (the "_folder" dirs track which records belong
            # to which input fasta)
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = datas[0][:-3]
                    for file_ in os.listdir(os.path.join(
                            args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                    args_ratt.gff_outfolder, gff),
                                    self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                    args_ratt.gff_outfolder, gff),
                                    self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                    args_ratt.gff_outfolder, gff),
                                    self.tmp_files["rnt"])
                    shutil.move(self.tmp_files["gff"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".rnt"))
        self._remove_files(args_ratt, out_gbk)
class Terminator(object): '''detection of terminator''' def __init__(self, args_term): self.multiparser = Multiparser() self.helper = Helper() self.converter = Converter() self.gff_parser = Gff3Parser() self.gff_path = os.path.join(args_term.gffs, "tmp") self.fasta_path = os.path.join(args_term.fastas, "tmp") self.tran_path = os.path.join(args_term.trans, "tmp") self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"), "csv": os.path.join(args_term.out_folder, "tables")} self.terms = {"all": os.path.join(self.outfolder["term"], "all_candidates"), "express": os.path.join(self.outfolder["term"], "expressed_candidates"), "best": os.path.join(self.outfolder["term"], "best_candidates"), "non": os.path.join(self.outfolder["term"], "non_expressed_candidates")} self.csvs = {"all": os.path.join(self.outfolder["csv"], "all_candidates"), "express": os.path.join(self.outfolder["csv"], "expressed_candidates"), "best": os.path.join(self.outfolder["csv"], "best_candidates"), "non": os.path.join(self.outfolder["csv"], "non_expressed_candidates")} self.combine_path = os.path.join(self.gff_path, "combine") self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"), "hp": "transtermhp", "hp_gff": "transtermhp.gff", "hp_path": "tmp_transterm/tmp", "term_table": os.path.join(os.getcwd(), "tmp_term_table"), "merge": os.path.join(os.getcwd(), "tmp_merge_gff"), "gff": "tmp.gff", "folder": os.path.join(os.getcwd(), "tmp")} self.suffixs = {"gff": "term.gff", "csv": "term.csv", "allgff": "term_all.gff"} if args_term.srnas: self.srna_path = os.path.join(args_term.srnas, "tmp") else: self.srna_path = None self._make_gff_folder() def _combine_annotation(self, combine_file, files): with open(combine_file, 'w') as result: for file_ in files: if (file_.endswith(".ptt")) and (os.stat(file_).st_size == 0): print("Warning: No CDS information, " "TransTermHP can not work!") return "NO_CDS" if os.path.exists(file_) and ( os.stat(file_).st_size != 0): check_start = False fh = 
open(file_, 'r') for line in fh: if check_start: result.write(line) if "Location" in line: check_start = True if "\n" not in line: result.write("\n") fh.close() return "Normal" def _make_gff_folder(self): self.helper.check_make_folder(self.terms["all"]) self.helper.check_make_folder(self.csvs["all"]) self.helper.check_make_folder(self.terms["best"]) self.helper.check_make_folder(self.csvs["best"]) self.helper.check_make_folder(self.terms["express"]) self.helper.check_make_folder(self.csvs["express"]) self.helper.check_make_folder(self.terms["non"]) self.helper.check_make_folder(self.csvs["non"]) def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs, log): file_types = {} prefixs = [] for gff in os.listdir(gff_path): if gff.endswith(".gff"): filename = gff.split("/") prefix = filename[-1][:-4] prefixs.append(prefix) gff_file = os.path.join(gff_path, gff) rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt")) ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt")) fasta = self.helper.get_correct_file( fasta_path, ".fa", prefix, None, None) if not fasta: log.write("{0}.fa can not be found.\n".format(prefix)) print("Error: {0}.fa can not be found!".format(prefix)) sys.exit() if sRNAs: self.multiparser.parser_gff(sRNAs, "sRNA") srna = self.helper.get_correct_file( self.srna_path, "_sRNA.gff", prefix, None, None) if (srna) and (fasta): log.write("Running converter.py to convert {0} and " "{1} to {2}, {3}, and {4}.\n".format( gff_file, srna, ptt_file, rnt_file, srna.replace(".gff", ".rnt"))) self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, srna, srna.replace(".gff", ".rnt")) file_types[prefix] = "srna" log.write("The following files are generated:\n") log.write("\t{0}\n\t{1}\n\t{2}\n".format( ptt_file, rnt_file, srna.replace(".gff", ".rnt"))) if (not srna) and (fasta): log.write("Running converter.py to convert {0} " "to {1}, and {2}.\n".format( gff_file, ptt_file, rnt_file)) self.converter.convert_gff2rntptt( gff_file, fasta, 
ptt_file, rnt_file, None, None) file_types[prefix] = "normal" log.write("The following files are generated:\n") log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file)) else: log.write("Running converter.py to convert {0} " "to {1}, and {2}.\n".format( gff_file, ptt_file, rnt_file)) self.converter.convert_gff2rntptt( gff_file, fasta, ptt_file, rnt_file, None, None) file_types[prefix] = "normal" log.write("The following files are generated:\n") log.write("\t{0}\n\t{1}\n".format(ptt_file, rnt_file)) return file_types, prefixs def _combine_ptt_rnt(self, gff_path, file_types, srna_path): self.helper.check_make_folder(self.combine_path) for prefix, file_type in file_types.items(): combine_file = os.path.join(self.combine_path, prefix + '.ptt') if file_type == "normal": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt")] check = self._combine_annotation(combine_file, files) elif file_type == "srna": files = [os.path.join(gff_path, prefix + ".ptt"), os.path.join(gff_path, prefix + ".rnt"), os.path.join(srna_path, "_".join([prefix, "sRNA.rnt"]))] check = self._combine_annotation(combine_file, files) return check def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term, log): call([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join(out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t"])), "--bag-output", os.path.join(out_path, "_".join([ prefix, "best_terminator_after_gene.bag"]))], stdout=out) log.write(" ".join([args_term.TransTermHP_path, "-p", args_term.expterm_path, fasta, os.path.join(self.combine_path, file_), "--t2t-perf", os.path.join(out_path, "_".join([ prefix, "terminators_within_robust_tail-to-tail_regions.t2t"])), "--bag-output", os.path.join(out_path, "_".join([ prefix, "best_terminator_after_gene.bag"]))]) + "\n") def _run_TransTermHP(self, args_term, log): 
self.helper.check_make_folder(self.tmps["transterm"]) log.write("Running TransTermHP.\n") log.write("Make sure the version is at least 2.09.\n") for file_ in os.listdir(self.combine_path): if ".ptt" in file_: prefix = file_.replace(".ptt", "") fasta = self.helper.get_correct_file( self.fasta_path, ".fa", prefix, None, None) if not fasta: log.write("{0}.fa can not be found!.\n".format(prefix)) print("Error: {0}.fa can not be found!".format(prefix)) sys.exit() out_path = os.path.join(args_term.hp_folder, prefix) self.helper.check_make_folder(out_path) out = open(os.path.join(out_path, "_".join([prefix, "terminators.txt"])), "w") self._TransTermHP(fasta, file_, out_path, prefix, out, args_term, log) log.write("Done!\n") log.write("The following files are generated in {0}.\n".format( out_path)) for file_ in os.listdir(out_path): log.write("\t" + file_ + "\n") out.close() shutil.rmtree(self.combine_path) def _convert_to_gff(self, prefixs, args_term, log): log.write("Running coverter.py to convert the results of TransTermHP " "to gff3 format.\n") for prefix in prefixs: for folder in os.listdir(args_term.hp_folder): if prefix == folder: out_path = os.path.join(args_term.hp_folder, folder) for file_ in os.listdir(out_path): if file_.endswith(".bag"): out_file = os.path.join( self.tmps["transterm"], "_".join([prefix, self.tmps["hp_gff"]])) self.converter.convert_transtermhp2gff( os.path.join(out_path, file_), out_file) log.write("\t" + out_file + " is generated.\n") self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"], None, self.tmps["hp"]) def _combine_wigs(self, args_term): if (args_term.tex_wigs is not None) and ( args_term.frag_wigs is not None): folder = args_term.tex_wigs.split("/") folder = "/".join(folder[:-1]) merge_wigs = os.path.join(folder, "merge_wigs") self.helper.check_make_folder(merge_wigs) for wig in os.listdir(args_term.tex_wigs): if os.path.isdir(os.path.join(args_term.tex_wigs, wig)): pass else: 
shutil.copy(os.path.join(args_term.tex_wigs, wig), merge_wigs) for wig in os.listdir(args_term.frag_wigs): if os.path.isdir(os.path.join(args_term.frag_wigs, wig)): pass else: shutil.copy(os.path.join(args_term.frag_wigs, wig), merge_wigs) elif (args_term.tex_wigs is not None): merge_wigs = args_term.tex_wigs elif (args_term.frag_wigs is not None): merge_wigs = args_term.frag_wigs else: print("Error: Wiggle files are not assigned!") sys.exit() return merge_wigs def _merge_sRNA(self, sRNAs, prefixs, gff_path): '''searching the terminator with sRNA information''' if sRNAs is not None: self.multiparser.parser_gff(sRNAs, "sRNA") self.helper.check_make_folder(self.tmps["merge"]) for prefix in prefixs: tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"]) if self.tmps["gff"] in os.listdir(self.tmps["merge"]): os.remove(tmp_gff) self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"), tmp_gff) self.helper.merge_file(os.path.join( self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff) self.helper.sort_gff(tmp_gff, os.path.join( self.tmps["merge"], prefix + ".gff")) os.remove(tmp_gff) merge_path = self.tmps["merge"] else: merge_path = gff_path return merge_path def _move_file(self, term_outfolder, csv_outfolder): for gff in os.listdir(term_outfolder): if gff.endswith("_term.gff"): self.helper.sort_gff(os.path.join(term_outfolder, gff), self.tmps["gff"]) shutil.move(self.tmps["gff"], os.path.join(term_outfolder, gff)) prefix = gff.replace("_term.gff", "") new_gff = os.path.join(self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]])) csv_file = os.path.join( os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) out = open(new_gff, "w") out.write("##gff-version 3\n") out.close() self.helper.merge_file( os.path.join(term_outfolder, gff), os.path.join( self.terms["all"], "_".join([ prefix, self.suffixs["allgff"]]))) os.remove(os.path.join(term_outfolder, gff)) pre_strain = "" if ("_".join([prefix, self.suffixs["csv"]]) in 
os.listdir(self.csvs["all"])): os.remove(csv_file) out_csv = open(csv_file, "w") out_csv.write("\t".join(["Genome", "Name", "Start", "End", "Strand", "Detect", "Coverage_decrease", "Coverage_detail"]) + "\n") out_csv.close() fh = open(new_gff) for entry in self.gff_parser.entries(fh): if entry.seq_id != pre_strain: self.helper.merge_file(os.path.join( self.tmps["term_table"], "_".join([ entry.seq_id, "term_raw.csv"])), os.path.join(self.csvs["all"], "_".join([ prefix, self.suffixs["csv"]]))) pre_strain = entry.seq_id fh.close() def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix, log): log.write("Computing secondray structures of {0}.\n".format(prefix)) log.write("Make sure the version of Vienna RNA package is at least 2.3.2.\n") print("Computing secondray structures of {0}".format(prefix)) self.helper.check_make_folder(self.tmps["folder"]) pre_cwd = os.getcwd() os.chdir(self.tmps["folder"]) log.write(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec)]) + "\n") os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq), ">", os.path.join("..", tmp_sec)])) log.write("Done!\n") log.write("\t" + tmp_sec + " is generated for storing secondary " "structure.\n") os.chdir(pre_cwd) shutil.rmtree(self.tmps["folder"]) def _compute_intersection_forward_reverse( self, prefixs, merge_path, wig_path, merge_wigs, args_term, log): '''the approach for searching gene converged region terminator''' log.write("Searching terminators which located in gene converged " "region.\n") for prefix in prefixs: tmp_seq = os.path.join(args_term.out_folder, "_".join(["inter_seq", prefix])) tmp_index = os.path.join(args_term.out_folder, "_".join(["inter_index", prefix])) tmp_sec = os.path.join(args_term.out_folder, "_".join(["inter_sec", prefix])) tran_file = os.path.join(self.tran_path, "_".join([prefix, "transcript.gff"])) gff_file = os.path.join(merge_path, prefix + ".gff") tmp_cand = tmp_cand = os.path.join(args_term.out_folder, 
"_".join(["term_candidates", prefix])) if os.path.exists(tran_file): print("Extracting sequences of {0}".format(prefix)) log.write("Running get_inter_seq.py to extract the potential " "sequences from {0}.\n".format(prefix)) intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"), tran_file, gff_file, tmp_seq, tmp_index, args_term) log.write("\t" + tmp_seq + " is generated for storing the " "potential sequences.\n") self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec, prefix, log) log.write("Running extract_sec_info.py to extract the " "information of secondary structure from {0}.\n".format( prefix)) extract_info_sec(tmp_sec, tmp_seq, tmp_index) os.remove(tmp_index) log.write("Running get_polyT.py to detect the " "terminator candidates for {0}.\n".format(prefix)) poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand, args_term) log.write("\t" + tmp_cand + " which temporary stores terminator " "candidates is generated.\n") print("Detecting terminators for " + prefix) log.write("Running detect_coverage_term.py to gain " "high-confidence terminators for {0}.\n".format(prefix)) detect_coverage( tmp_cand, os.path.join(merge_path, prefix + ".gff"), os.path.join(self.tran_path, "_".join([ prefix, "transcript.gff"])), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(wig_path, "_".join([prefix, "forward.wig"])), os.path.join(wig_path, "_".join([prefix, "reverse.wig"])), os.path.join(self.tmps["hp_path"], "_".join([ prefix, self.tmps["hp_gff"]])), merge_wigs, os.path.join(self.outfolder["term"], "_".join([ prefix, self.suffixs["gff"]])), os.path.join(self.tmps["term_table"], "_".join([ prefix, "term_raw.csv"])), args_term) self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"], None, "term") self._move_file(self.outfolder["term"], self.outfolder["csv"]) def _remove_tmp_file(self, merge_wigs, args_term): self.helper.remove_tmp_dir(args_term.gffs) self.helper.remove_tmp_dir(args_term.fastas) if args_term.srnas is not None: 
            # --- continuation of _remove_tmp_file (its `def` and the guard
            # `if args_term.srnas is not None:` are on the previous line) ---
            # sRNA tmp data and the merged sRNA/gff folder only exist when
            # sRNA input was provided.
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        # the merged wiggle folder is only created when both tex-treated and
        # fragmented libraries were given
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        # sweep the intermediate per-strain files out of the output folder
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term, log):
        """Renumber terminator IDs in every *_term_all.gff, then run
        stat_term to split the results into best/express/non categories,
        moving the produced gff/csv files into their final folders and
        logging every generated path."""
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # zero-padded 5-digit serial used in the Name attribute
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                        entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "_".join(["terminator_" + name])
                    # rebuild the raw attribute column from the updated dict
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in
                        entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                             entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                # replace the *_term_all.gff content with the renumbered gff
                shutil.move(self.tmps["gff"], os.path.join(
                    self.terms["all"],
                    "_".join([new_prefix, self.suffixs["gff"]])))
        log.write("Running stat_term.py to do statistics.\n")
        stat_path = os.path.join(args_term.out_folder, "statistics")
        log.write("The following files are generated:\n")
        for prefix in new_prefixs:
            stat_term(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])),
                      os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                      "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                      "_".join([prefix, "term"])))
            # stat_term writes the per-category csv next to the gff; move
            # each one into its dedicated tables folder
            shutil.move(os.path.join(self.terms["best"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["best"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["express"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["express"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["non"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["non"],
                        "_".join([prefix, self.suffixs["csv"]])))
            os.remove(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["allgff"]])))
            log.write("\t" + os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["best"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["express"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.terms["non"],
                      "_".join([prefix, self.suffixs["gff"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])) + "\n")
            log.write("\t" + os.path.join(self.csvs["best"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["express"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")
            log.write("\t" + os.path.join(self.csvs["non"],
                      "_".join([prefix, self.suffixs["csv"]])) + "\n")

    def _check_gff_file(self, folder):
        # validate that every gff in the folder carries unique attributes
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs, log):
        '''searching the associated terminator to transcript'''
        # NOTE(review): the `prefixs` parameter is immediately rebuilt from
        # the transcript folder below, so the passed-in value is unused.
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        log.write("Running compare_tran_term.py for comparing transcripts "
                  "and terminators.\n")
        log.write("The following files are generated:\n")
        # run the comparison once per candidate category, then rename the
        # statistics file so each category keeps its own copy
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" +
                        prefix + ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript",
                                  prefix, type_ + ".csv"])))
                log.write("\t" + os.path.join(
                    args_term.out_folder, "statistics",
                    "_".join(["stat_compare_terminator_transcript",
                              prefix, type_ + ".csv"])) + "\n")

    def _re_table(self, args_term, prefixs, log):
        """Rewrite every output table with per-library coverage detail."""
        log.write("Running re_table.py to generate coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates",
                      "expressed_candidates", "non_expressed_candidates"]:
            for table in os.listdir(os.path.join(
                    args_term.out_folder, "tables", type_)):
                term_table = os.path.join(args_term.out_folder, "tables",
                                          type_, table)
                reorganize_table(args_term.libs, args_term.merge_wigs,
                                 "Coverage_detail", term_table)
                log.write("\t" + term_table + "\n")

    def run_terminator(self, args_term, log):
        """Entry point of the terminator pipeline: validate inputs, run
        TransTermHP, intersect with expression data, then compute
        statistics, compare against transcripts and clean up."""
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas, log)
        check = self._combine_ptt_rnt(self.gff_path, file_types,
                                      self.srna_path)
        self._run_TransTermHP(args_term, log)
        self._convert_to_gff(prefixs, args_term, log)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        # "NO_CDS" signals _combine_ptt_rnt found nothing for TransTermHP,
        # so there is no hairpin output to parse
        if check != "NO_CDS":
            self.multiparser.parser_gff(self.tmps["transterm"],
                                        self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                      self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term, log)
        self._compute_stat(args_term, log)
        self._compare_term_tran(args_term, prefixs, log)
        self._re_table(args_term, prefixs, log)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        # helpers shared by all steps of the pipeline
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        # "tmp" subfolders are produced by the multiparser split step
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        # two parallel output trees: every CDS vs only expressed CDS
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        # filename suffixes of the psortb outputs
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        # create the permanent output folders (tmp folders are made later,
        # in run_sub_local)
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file):
        '''compare CDS and transcript to find the expressed CDS'''
        # writes the CDS entries that overlap any transcript (same strand
        # and seq_id) to <out_all>/tmp_cds.gff
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    # the four clauses cover: CDS overlapping the
                    # transcript start, overlapping its end, containing it,
                    # and contained in it — i.e. any positional overlap
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        """Extract CDS DNA sequences for one genome and translate them to
        protein fasta; returns the genome prefix. When tran_path is given,
        only CDS overlapping a transcript are used."""
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])))
            self.helper.get_cds_seq(os.path.join(self.out_all,
                                                 "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("Transfering DNA seq to protein seq of {0}".format(prefix))
        # translation writes to a cwd-relative "tmp" file which is fixed up
        # and removed below
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        # invoke the external psortb binary; raw output is captured in
        # out_raw, diagnostics in out_err
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path,
                    tmp_result):
        """Run psortb on the protein fasta of one genome, selecting the
        gram flag (-p/-n) from args_sub.gram."""
        print("Running psortb of {0}".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err)
        else:
            print("Error: It is not a proper bacteria type - {0}!!".format(
                args_sub.gram))
            sys.exit()
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        '''extract the result of psortb'''
        if args_sub.merge:
            # merge the localization back into the gff via a temporary
            # file in the cwd, then overwrite the original gff
            print("Merging gff")
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                gff_file, os.path.join(prefix + ".gff"),
                args_sub.fuzzy)
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result):
        """Collect the per-genome psortb outputs into one folder per input
        gff_folder, merge the tables and run stat_sublocal on the result."""
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                self.helper.check_make_folder(os.path.join(stat_path,
                                                           prefix))
                stat_sublocal(merge_table,
                              os.path.join(stat_path, prefix, prefix),
                              os.path.join(stat_path, prefix, "_".join([
                                  "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        # drop every temporary folder/file created during the run; the
        # express tmp_log only exists when transcripts were provided
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub):
        """Entry point: validate and split inputs, run psortb per genome
        (for all CDS and, if transcripts are given, for expressed CDS),
        then merge, compute statistics and clean up."""
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed gene now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix,
                                     os.path.join(self.gff_path, gff))
            print("Running all gene now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff))
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result)
        self._remove_tmps(args_sub)
class SNPCalling(object):
    '''detection of SNP'''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # output tree differs for "related genome" vs "reference genome"
        # comparisons
        if args_snp.types == "related_genome":
            file_type = "compare_related_and_reference_genomes"
        else:
            file_type = "mutations_of_reference_genomes"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fig_path = os.path.join(self.stat_path, "figs")
        self.helper.check_make_folder(self.fig_path)
        self.outputs = {
            "table": os.path.join(args_snp.out_folder, file_type,
                                  "SNP_tables"),
            "raw": os.path.join(args_snp.out_folder, file_type,
                                "SNP_raw_outputs"),
            # intermediate bcf output of mpileup and per-sample depth files
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
            "depth": os.path.join(args_snp.out_folder, "tmp_depth")}
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam"),
                     "bams": []}
        self.header = os.path.join(args_snp.out_folder, "header")
        # maps the internal program type to the folder/file label
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _transcript_snp(self, fasta, out_table_prefix, type_, prefix,
                        bam_datas, table_path, args_snp):
        """Run snp_detect for every sample of one genome and move the
        generated figures (.png) to the statistics fig folder."""
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        for bam in bam_datas:
            stat_prefix = os.path.join(self.stat_path, "_".join([
                "stat", "_".join([prefix, self.baqs[type_],
                                  bam["sample"]]), "SNP"]))
            snp_file = os.path.join(self.outputs["raw"], prefix, "_".join([
                prefix, self.baqs[type_], bam["sample"] + ".vcf"]))
            snp_detect(
                fasta, snp_file,
                self.outputs["depth"] + bam["sample"],
                "_".join([out_table_prefix, bam["sample"]]),
                os.path.join(seq_path, "_".join([prefix, bam["sample"]])),
                bam["bam_number"], stat_prefix, args_snp, bam["rep"])
        self.helper.move_all_content(table_path, self.fig_path, [".png"])

    def _get_para(self, args_snp):
        # bcftools calling model: "c" = consensus caller, otherwise the
        # multiallelic caller
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bcf_para

    def _run_tools(self, fasta_file, type_, args_snp, bam_datas):
        """Run samtools mpileup (with/without/extended BAQ) and bcftools
        call for every sample, producing one raw vcf per sample."""
        bcf_para = self._get_para(args_snp)
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            # -B disables BAQ, -E recomputes extended BAQ
            if type_ == "with":
                command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
            elif type_ == "without":
                command = [args_snp.samtools_path, "mpileup",
                           "-t", "DP", "-B"]
            elif type_ == "extend":
                command = [args_snp.samtools_path, "mpileup",
                           "-t", "DP", "-E"]
            if args_snp.rg:
                command = command + ["-ugf", fasta_file, bam_file]
            else:
                command = command + [
                    "--ignore-RG", "-ugf", fasta_file, bam_file]
            # mpileup output is redirected into the tmp bcf file
            os.system(" ".join(command) + ">" + self.outputs["tmp"])
            bam["vcf"] = os.path.join(self.outputs["raw"], "_".join([
                self.baqs[type_], bam["sample"] + ".vcf"]))
            # chrom "1" = haploid calling (explicit --ploidy), "2" = default
            if args_snp.chrom == "1":
                call([args_snp.bcftools_path, "call", "--ploidy",
                      args_snp.chrom, self.outputs["tmp"], bcf_para,
                      "v", "-o", bam["vcf"]])
            elif args_snp.chrom == "2":
                call([args_snp.bcftools_path, "call",
                      self.outputs["tmp"], bcf_para,
                      "v", "-o", bam["vcf"]])

    def _parse_vcf_by_fa(self, args_snp, type_, num_prog):
        """Split the combined raw vcf files per input fasta: keep only the
        records whose seq name appears in that fasta's headers. Returns the
        list of fasta prefixes."""
        seq_names = []
        fa_prefixs = []
        for fa in os.listdir(args_snp.fastas):
            if (fa != "all.fa") and (not fa.endswith(".fai")):
                with open(os.path.join(args_snp.fastas, fa)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            seq_names.append(line[1:])
                fa_prefix = ".".join(fa.split(".")[:-1])
                fa_prefixs.append(fa_prefix)
                vcf_folder = os.path.join(self.outputs["raw"], fa_prefix)
                # per-fasta output folders are only created on the first
                # program pass
                if num_prog == 0:
                    self.helper.check_make_folder(vcf_folder)
                    self.helper.check_make_folder(
                        os.path.join(self.outputs["table"], fa_prefix))
                self.helper.check_make_folder(
                    os.path.join(self.seq_path, self.baqs[type_],
                                 fa_prefix))
                for vcf in os.listdir(self.outputs["raw"]):
                    if vcf.endswith(".vcf"):
                        out = open(os.path.join(
                            vcf_folder, "_".join([fa_prefix, vcf])), "w")
                        with open(os.path.join(self.outputs["raw"],
                                               vcf)) as vh:
                            for line in vh:
                                line = line.strip()
                                if line.startswith("#"):
                                    out.write(line + "\n")
                                else:
                                    if line.split("\t")[0] in seq_names:
                                        out.write(line + "\n")
                        out.close()
        # the combined vcfs have been distributed; drop the originals
        for vcf in os.listdir(self.outputs["raw"]):
            if vcf.endswith(".vcf"):
                os.remove(os.path.join(self.outputs["raw"], vcf))
        return fa_prefixs

    def _run_sub(self, args_snp, all_fasta, type_, bam_datas, num_prog):
        # one full pass for a single BAQ mode: call SNPs on the merged
        # fasta, split the vcfs per genome, then post-process each genome
        self._run_tools(all_fasta, type_, args_snp, bam_datas)
        fa_prefixs = self._parse_vcf_by_fa(args_snp, type_, num_prog)
        for fa_prefix in fa_prefixs:
            for fasta in os.listdir(args_snp.fastas):
                if fa_prefix in fasta:
                    fasta_file = os.path.join(args_snp.fastas, fasta)
            table_path = os.path.join(self.outputs["table"], fa_prefix)
            table_prefix = os.path.join(
                table_path, "_".join([fa_prefix, self.baqs[type_]]))
            self._transcript_snp(fasta_file, table_prefix, type_,
                                 fa_prefix, bam_datas, table_path,
                                 args_snp)

    def _run_program(self, all_fasta, bam_datas, args_snp):
        # iterate over the requested BAQ modes; num_prog tells the first
        # pass apart so folders are only created once
        num_prog = 0
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            self._run_sub(args_snp, all_fasta, type_, bam_datas, num_prog)
            num_prog += 1

    def _run_bam(self, samtools_path, sub_command, bam_file, type_file):
        """Build and run a samtools merge/sort command line. For "sort",
        type_file "all" means sort the merged whole_reads.bam, otherwise
        sort the given single input file."""
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                                 self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            if type_file == "all":
                command = (" ".join([samtools_path, sub_command, "-o",
                                     bam_file, self.bams["whole"]]))
            else:
                command = (" ".join([samtools_path, sub_command, "-o",
                                     bam_file, type_file]))
        os.system(command)

    def _merge_bams(self, args_snp, bam_datas):
        """Merge/sort the bam files of every sample into
        <out_folder>/<sample>.bam, index it and dump per-position depth."""
        # NOTE(review): bams/num_normal/num_frag are assigned but never
        # used in this method.
        bams = []
        num_normal = 0
        num_frag = 0
        for bam in bam_datas:
            bam["bam_number"] = 0
            out_bam = os.path.join(args_snp.out_folder,
                                   bam["sample"] + ".bam")
            if len(bam["bams"]) == 1:
                # single input: sort it directly into place
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort",
                              out_bam, bam["bams"][0])
                bam["bam_number"] = 1
            else:
                print("Merging BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "merge",
                              " ".join(bam["bams"]), "all")
                print("Sorting BAM files of " + bam["sample"])
                self._run_bam(args_snp.samtools_path, "sort",
                              out_bam, "all")
                bam["bam_number"] += 1
            if os.path.exists(self.bams["whole"]):
                os.remove(self.bams["whole"])
            out_depth = open(self.outputs["depth"] + bam["sample"], "w")
            call([args_snp.samtools_path, "index", out_bam])
            call([args_snp.samtools_path, "depth", out_bam],
                 stdout=out_depth)
            out_depth.close()

    def _modify_header(self, fastas):
        # normalize fasta headers in place for every fasta-like file
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        """Append to seq_names every @SQ sequence name found in the bam
        header (via `samtools view -H`)."""
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        fh = open(self.header, "r")
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                # @SQ lines carry "SN:<name>" in the second column
                if row[1].split(":")[1] not in seq_names:
                    seq_names.append(row[1].split(":")[1])
        fh.close()

    def _get_genome_name(self, args_snp, bam_datas):
        # collect the sequence names across all sample bams
        seq_names = []
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            self._get_header(args_snp.samtools_path,
                             bam_file, seq_names)
        return seq_names

    def _remove_bams(self, bam_datas, args_snp):
        # remove the per-sample merged bam, its index, the header dump and
        # the depth files
        for bam in bam_datas:
            bam_file = os.path.join(args_snp.out_folder,
                                    bam["sample"] + ".bam")
            if os.path.exists(bam_file):
                os.remove(bam_file)
            if os.path.exists(bam_file + ".bai"):
                os.remove(bam_file + ".bai")
            if os.path.exists(self.header):
                os.remove(self.header)
            os.remove(self.outputs["depth"] + bam["sample"])

    def _extract_bams(self, bams):
        """Parse --bam_files entries of the form "sample:bam1,bam2,...";
        returns a list of {"sample", "rep", "bams"} dicts."""
        bam_datas = []
        for bam in bams:
            datas = bam.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: there are some Bam files "
                          "which do not exist!")
                    sys.exit()
            bam_datas.append({
                "sample": datas[0],
                "rep": len(datas[-1].split(",")),
                "bams": datas[-1].split(",")})
        return bam_datas

    def _merge_fasta(self, fastas):
        """Concatenate every fasta in the folder into all.fa, skipping
        sequences whose header was already written (dedup by header)."""
        all_fasta = os.path.join(fastas, "all.fa")
        names = []
        out = open(all_fasta, "w")
        print_ = False
        for fasta in os.listdir(fastas):
            if (fasta.endswith(".fa")) or (
                    fasta.endswith(".fasta")) or (
                    fasta.endswith(".fna")):
                with open(os.path.join(fastas, fasta)) as fh:
                    for line in fh:
                        line = line.strip()
                        if line.startswith(">"):
                            # print_ stays set/unset until the next header,
                            # so a duplicate header drops its sequence too
                            if line not in names:
                                print_ = True
                                names.append(line)
                            else:
                                print_ = False
                        if print_:
                            out.write(line + "\n")
        out.close()
        return all_fasta

    def run_snp_calling(self, args_snp):
        """Entry point: normalize fasta headers, merge inputs, run the
        requested BAQ programs and clean up the intermediates."""
        self._modify_header(args_snp.fastas)
        all_fasta = self._merge_fasta(args_snp.fastas)
        bam_datas = self._extract_bams(args_snp.bams)
        self._merge_bams(args_snp, bam_datas)
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        else:
            print("Detecting mutations now")
            self._run_program(all_fasta, bam_datas, args_snp)
            os.remove(self.outputs["tmp"])
            os.remove(all_fasta)
            os.remove(all_fasta + ".fai")
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams(bam_datas, args_snp)
class Ribos(object):
    '''detection of riboswitch and RNA thermometer'''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are produced by the multiparser split step
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        if args_ribo.tsss is not None:
            self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # BUG FIX: run_ribos() compares args_ribo.program with .lower(),
        # but this constructor compared it case-sensitively. A value such
        # as "Both" therefore skipped the folder/attribute setup here and
        # run_ribos later crashed with AttributeError. Normalize once and
        # use the normalized value for both feature setups.
        program = args_ribo.program.lower()
        if (program == "both") or (program == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                args_ribo.ribos_out_folder, "riboswitch",
                args_ribo.database)
        if (program == "both") or (program == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                args_ribo.thermo_out_folder, "RNA_thermometer",
                args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        """Build the output/tmp paths, the Rfam covariance-model path and
        the filename suffixes for one feature ("riboswitch" or
        "RNA_thermometer"). Pure path construction; creates nothing on
        disk."""
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {"fasta": os.path.join(out_folder, "tmp_fasta"),
                     "scan": os.path.join(out_folder, "tmp_scan"),
                     "table": os.path.join(out_folder, "tmp_table")}
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {"csv": feature + ".csv",
                   "txt": feature + "_prescan.txt",
                   "re_txt": feature + "_scan.txt",
                   "re_csv": feature + "_scan.csv"}
        return (stat_folder, gff_outfolder, table_folder, scan_folder,
                tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files,
                    suffixs, rfam, log):
        """Run Infernal cmscan on one sequence file. The cutoff string
        "e_<value>" selects an inclusion E-value (--incE), "s_<value>" a
        score threshold (--incT). Returns the scan output path."""
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        scan = open(scan_file, "w")
        if args_ribo.cutoff.split("_")[0] == "e":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incE",
                                value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incE",
                  value, "--acc", rfam, seq], stdout=scan)
        elif args_ribo.cutoff.split("_")[0] == "s":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incT",
                                value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incT",
                  value, "--acc", rfam, seq], stdout=scan)
        else:
            print("Error: the --cutoff needs to start from 'e' "
                  "(e value) or 's' (score)!")
            log.write("the --cutoff needs to start from 'e' "
                      "(e value) or 's' (score).\n")
            sys.exit()
        scan.close()
        log.write("Done!\n")
        log.write("\t" + scan_file + " is temporary generated.\n")
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam, log):
        '''extract the seq of candidates and scanning the candidates'''
        # two-stage search per genome: extract candidate sequences and
        # pre-scan them, then regenerate refined sequences from the hits
        # and scan again for the final table
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for {0}".format(
                    prefix))
                if self.tss_path is not None:
                    tss_file = os.path.join(self.tss_path,
                                            prefix + "_TSS.gff")
                else:
                    tss_file = None
                log.write("Running extract_RBS.py to extract potential "
                          "sequences of riboswitches/RNA thermometers for "
                          "{0}.\n".format(prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff), tss_file,
                    os.path.join(self.tran_path,
                                 prefix + "_transcript.gff"),
                    first_seq, args_ribo, feature)
                log.write("\t" + first_seq +
                          " is temporary generated.\n")
                print("Pre-scanning of {0}".format(prefix))
                log.write("Using Infernal to pre-scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is "
                          "at least 1.1.1.\n")
                first_scan_file = self._run_cmscan(
                    args_ribo, first_seq, "txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_seq = os.path.join(
                    tmp_files["fasta"],
                    "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["csv"]]))
                log.write("Running recompute_RBS.py to update the "
                          "potential sequences of riboswitches/RNA "
                          "thermometers for {0} based on the pre-scanning "
                          "results.\n".format(prefix))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                log.write("\t" + sec_seq + " is temporary generated.\n")
                print("Scanning of {0}".format(prefix))
                log.write("Using Infernal to scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is "
                          "at least 1.1.1.\n")
                sec_scan_file = self._run_cmscan(
                    args_ribo, sec_seq, "re_txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["re_csv"]]))
                log.write("Running recompute_RBS.py and "
                          "modify_rbs_table.py to generate tables for "
                          "{0} based on the scanning "
                          "results.\n".format(prefix))
                reextract_rbs(sec_scan_file, first_table, sec_table,
                              args_ribo.cutoff)
                # the refined table replaces the pre-scan table
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id,
                       gff_outfolder, feature, log):
        '''merge the results from the results of two searching'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                log.write("Merging the results from Infernal to generate "
                          "tables for {0}.\n".format(prefix))
                # one input gff may contain several strains; copy the
                # first strain's table, append the following ones
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            self.helper.merge_file(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                log.write("The following files are generated.\n")
                for folder in (table_folder, scan_folder):
                    for file_ in os.listdir(folder):
                        log.write("\t" + os.path.join(folder, file_) +
                                  "\n")
                out_stat = os.path.join(
                    stat_folder,
                    "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                log.write("Running ribo_gff.py to do statistics and "
                          "generate gff files for {0}.\n".format(prefix))
                log.write("The following files are generated:\n")
                out_gff = os.path.join(gff_outfolder, "_".join([
                    prefix, feature + ".gff"]))
                stat_and_covert2gff(os.path.join(
                    table_folder, "_".join([prefix, suffixs["csv"]])),
                    feature_id, out_gff, args_ribo.fuzzy, out_stat,
                    feature)
                log.write("\t" + out_gff + "\n")
                log.write("\t" + out_stat + "\n")
                fh.close()

    def _remove_tmp(self, args_ribo):
        # NOTE(review): args_ribo.tsss may be None here — presumably
        # helper.remove_tmp_dir tolerates that; confirm before changing.
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs, type_, fuzzy,
                        log):
        """Drop overlapping riboswitch/thermometer candidates from every
        per-strain tmp table."""
        log.write("Running rbs_overlap.py to remove the overlapping "
                  "riboswitches/RNA thermometers.\n")
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                # (original wrapped this in a redundant extra
                # os.path.join call; single join is equivalent)
                tmp_table = os.path.join(
                    tmp_files["table"], "_".join([
                        gff.replace(".gff", ""), suffixs["csv"]]))
                rbs_overlap(tmp_table, os.path.join(gff_path, gff),
                            type_, fuzzy)
                log.write("\t" + tmp_table + " is updated.\n")

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder, type_,
                         log):
        '''main part of detection'''
        # fetch and compress the Rfam models, run the two-stage scan,
        # filter overlaps, merge per-strain results and annotate tables
        log.write("Running get_Rfam_ribo.py to get the information of "
                  "riboswitches/RNA thermometers from Rfam.\n")
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        log.write("Using Infernal to compress the Rfam data of "
                  "riboswitches/RNA thermometers.\n")
        log.write("Please make sure the version of Infernal is at least "
                  "1.1.1.\n")
        print("Compressing Rfam of " + feature)
        log.write(" ".join([args_ribo.cmpress_path, "-F", rfam]) + "\n")
        call([args_ribo.cmpress_path, "-F", rfam])
        log.write("Done!\n")
        prefixs = []
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(
            prefixs, args_ribo, tmp_files, suffixs, feature, rfam, log)
        self._remove_overlap(self.gff_path, tmp_files, suffixs, type_,
                             args_ribo.fuzzy, log)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature, log)
        log.write("Running map_ribos.py to extract all the details from "
                  "Rfam.\n")
        mapping_ribos(table_folder, feature_id, feature)
        log.write("The following files are updated:\n")
        for file_ in os.listdir(table_folder):
            log.write("\t" + os.path.join(table_folder, file_) + "\n")
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo, log_t, log_r):
        """Entry point: validate inputs, split gff/fasta/transcript files
        and run the core prediction for riboswitches and/or RNA
        thermometers depending on args_ribo.program."""
        if args_ribo.fuzzy_rbs > 6:
            if log_t is not None:
                log_t.write("--fuzzy_rbs should be equal or less "
                            "than 6!\n")
            if log_r is not None:
                log_r.write("--fuzzy_rbs should be equal or less "
                            "than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        if args_ribo.tsss is not None:
            self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_ribo.gffs, gff))
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            # (typo "riboswtiches" in the original message fixed)
            print("Detecting riboswitches now")
            self._core_prediction(
                args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                self.ribos_tmp_files, self.ribos_table_folder,
                "riboswitch", self.ribos_scan_folder, self.ribos_suffixs,
                self.ribos_stat_folder, self.ribos_gff_outfolder,
                args_ribo.ribos_out_folder, "riboswitch", log_r)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(
                args_ribo, args_ribo.thermo_id, self.thermo_rfam,
                self.thermo_tmp_files, self.thermo_table_folder,
                "RNA_thermometer", self.thermo_scan_folder,
                self.thermo_suffixs, self.thermo_stat_folder,
                self.thermo_gff_outfolder, args_ribo.thermo_out_folder,
                "thermometer", log_t)
        self._remove_tmp(args_ribo)
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction

    Runs one or more of RNAplex(+RNAplfold), RNAup and IntaRNA on the
    sRNA and potential-target sequences extracted from the input
    genomes, then merges and ranks the predictions.
    '''

    def __init__(self, args_tar):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        # Output sub-folders, one per prediction tool plus merged results.
        self.target_seq_path = os.path.join(args_tar.out_folder,
                                            "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder,
                                         "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.intarna_path = os.path.join(args_tar.out_folder,
                                         "IntaRNA_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        # "tmp" sub-folders are produced by Multiparser (one file/genome).
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # Names/patterns of the temporary files used below; the "all_*"
        # entries are shell glob patterns passed to "rm" via os.system.
        self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        # Sanity-check every GFF file for unique attribute values.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_long_id(self, seq_file, long_ids, type_):
        '''Shorten fasta headers longer than 40 characters.

        RNAplfold/RNAplex truncate long names; headers are replaced by
        ">TMP<type_>_<n>" and the originals stored in long_ids[type_] so
        _restore_long_ids can put them back afterwards.  Returns the path
        of the rewritten file (seq_file + "_tmp.fa").
        '''
        out_file = seq_file + "_tmp.fa"
        out = open(out_file, "w")
        with open(seq_file) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    if len(line) > 40:
                        long_ids[type_].append(line[1:])
                        out.write(">TMP" + type_ + "_" +
                                  str(len(long_ids[type_])) + "\n")
                    else:
                        out.write(line + "\n")
                else:
                    out.write(line + "\n")
        out.close()
        return out_file

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, long_ids, seq_path, prefix, out_path,
                       log):
        # Run RNAplfold in out_path (it writes into the cwd); the command
        # is joined with "<" so os.system feeds the fasta via shell stdin
        # redirection.
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([rnaplfold_path,
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        if file_type == "sRNA":
            srna_seq_file = os.path.join(current, seq_path,
                                         "_".join([self.tmps["tmp"], prefix,
                                                   file_type + ".fa"]))
            out_file = self._check_long_id(srna_seq_file, long_ids, "srna")
            log.write("<".join([command, out_file]) + "\n")
            os.system("<".join([command, out_file]))
        else:
            tar_seq_file = os.path.join(current, seq_path,
                                        "_".join([prefix,
                                                  file_type + ".fa"]))
            # Target sequences are split into numbered chunks by _gen_seq;
            # fold every chunk that matches this genome prefix.
            for tar_seq_file in os.listdir(os.path.join(current, seq_path)):
                if (prefix + "_" + file_type + "_") in tar_seq_file:
                    out_file = self._check_long_id(os.path.join(
                        current, seq_path, tar_seq_file), long_ids, "tar")
                    log.write("<".join([command, out_file]) + "\n")
                    os.system("<".join([command, out_file]))
        os.chdir(current)

    def _wait_process(self, processes):
        # Wait for all child processes, close their pipes, and make sure
        # none is left running (kill is best-effort after wait).
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        # Rewrite the sRNA fasta sorted by sequence length, one sequence
        # per line; only the part of the header before "|" is kept.
        out = open(os.path.join(path,
                   "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line,
                                  "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        # Concatenate all sequence lines of a (single-genome) fasta file.
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        '''Extract only the queried sRNAs (genome:start:end:strand).'''
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                        entry.start == srna["start"]) and (
                        entry.end == srna["end"]):
                    detect = True
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                        id_, entry.seq_id, entry.start,
                        entry.end, entry.strand, gene))
                    num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, target_prefixs, args_tar):
        '''Generate target and sRNA fasta files for all genomes.

        Fills prefixs (sRNA gff prefixes) and target_prefixs (genome gff
        prefixes) in place.  Target fastas are split into chunks of 100
        sequences ("<prefix>_target_<n>.fa") to keep tool runs small.
        '''
        print("Generating sRNA fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                target_prefixs.append(prefix)
        detect = False
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path,
                                              prefix + ".fa"),
                                 os.path.join(self.target_seq_path),
                                 args_tar, target_prefixs)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                if os.path.exists(sub_prefix + ".fa"):
                    sub_out = open("_".join([sub_prefix,
                                             str(file_num) + ".fa"]), "w")
                    with open((sub_prefix + ".fa"), "r") as t_f:
                        for line in t_f:
                            line = line.strip()
                            if line.startswith(">"):
#                                line = line.replace("|", "_")
                                num += 1
                            # Start a new chunk every 100 headers.
                            if (num == 100):
                                num = 0
                                file_num += 1
                                sub_out.close()
                                sub_out = open("_".join([
                                    sub_prefix,
                                    str(file_num) + ".fa"]), "w")
                            detect = True
                            sub_out.write(line + "\n")
                    sub_out.close()
                else:
                    open(sub_prefix + ".fa", "w").close()
        if not detect:
            print("No assigned features can be found. "
                  "Please check your genome annotation. "
                  "And assign correct features to --target_feature.")
            sys.exit()
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    if "_".join([prefix, "sRNA.fa"]) in \
                            os.listdir(self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        '''Run RNAplex on every target chunk; returns the chunk count.'''
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if ("_target_" in seq) and (".fa_tmp.fa" in seq):
                print("Running RNAplex with {0}".format(
                    seq.replace(".fa_tmp.fa", "")))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex",
                              str(num_process) + ".txt"])), "w")
                num_process += 1
                log.write(" ".join([args_tar.rnaplex_path,
                          "-q", os.path.join(
                              self.srna_seq_path,
                              "_".join([self.tmps["tmp"], prefix,
                                        "sRNA.fa_tmp.fa"])),
                          "-t", os.path.join(self.target_seq_path, seq),
                          "-l", str(args_tar.inter_length),
                          "-e", str(args_tar.energy),
                          "-z", str(args_tar.duplex_dist),
                          "-a", rnaplfold_folder]) + "\n")
                p = Popen([args_tar.rnaplex_path,
                           "-q", os.path.join(
                               self.srna_seq_path,
                               "_".join([self.tmps["tmp"], prefix,
                                         "sRNA.fa_tmp.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_folder], stdout=out_rnaplex)
                processes.append(p)
                # Throttle: wait whenever a full batch of workers is out.
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write(
            "The following temporary files for storing results of {0} are "
            "generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(
                self.rnaplex_path, prefix, file_) + "\n")
        return num_process

    def _restore_long_ids(self, rnaplex_file, long_ids):
        # Undo the ">TMPtar_n"/">TMPsrna_n" renaming done by
        # _check_long_id (n is 1-based).
        out = open(rnaplex_file + "tmp", "w")
        with open(rnaplex_file, "r") as t_f:
            for line in t_f:
                line = line.strip()
                if (line.startswith(">")):
                    if (line.startswith(">TMPtar_")):
                        header = long_ids["tar"][
                            int(line.split("_")[1]) - 1]
                    elif (line.startswith(">TMPsrna_")):
                        header = long_ids["srna"][
                            int(line.split("_")[1]) - 1]
                    else:
                        header = line[1:]
                    out.write(">" + header + "\n")
                else:
                    out.write(line + "\n")
        out.close()
        shutil.move(rnaplex_file + "tmp", rnaplex_file)

    def _rna_plex(self, prefixs, target_prefixs, args_tar, log):
        '''Full RNAplfold + RNAplex pipeline for every sRNA prefix.'''
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        # Fold the targets once and copy the profiles per sRNA prefix.
        tmp_rnaplfold_folder = os.path.join(self.rnaplex_path,
                                            "tmp_RNAplfold")
        if os.path.exists(tmp_rnaplfold_folder):
            shutil.rmtree(tmp_rnaplfold_folder)
        os.mkdir(tmp_rnaplfold_folder)
        long_ids = {"tar": [], "srna": []}
        for prefix in target_prefixs:
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                long_ids, self.target_seq_path, prefix,
                tmp_rnaplfold_folder, log)
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            shutil.copytree(tmp_rnaplfold_folder, rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                long_ids, self.srna_seq_path, prefix,
                rnaplfold_folder, log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder,
                                            args_tar, log)
            # Merge the per-chunk outputs into one RNAplex.txt.
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary "
                          "files.\n")
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            if (len(long_ids["tar"]) != 0) or (len(long_ids["srna"]) != 0):
                self._restore_long_ids(rnaplex_file, long_ids)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(os.path.join(
                self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        # Launch one RNAup process per prepared tmp<i>.fa (sRNA followed
        # by all target sequences on stdin), then collect and merge the
        # per-process outputs/logs and delete the temporaries.
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["rnaup"], str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["log"], str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["tmp"], str(index), ".fa"])), "r")
            log.write(" ".join([args_tar.rnaup_path,
                                "-u", str(args_tar.unstr_region_rnaup),
                                "-o", "--interaction_first"]) + "\n")
            p = Popen([args_tar.rnaup_path,
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write(
                "The following temporary files for storing results of "
                "{0} are generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(
                    args_tar.out_folder, file_) + "\n")
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log,
                            args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        # Append every per-process result/log file to the final files.
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        # Returns the sRNA headers already finished; the last (possibly
        # partial) record is dropped from the kept output.
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, target_prefixs, args_tar, log):
        '''Run RNAup for every sRNA prefix, batching core_up sRNAs.'''
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is "
                              "found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            # The sorted sRNA fasta has one header + one sequence line per
            # record; sRNAs already in "srnas" (previous run) are skipped.
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(
                            args_tar.out_folder,
                            "".join([self.tmps["tmp"], str(num_up),
                                     ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            # NOTE(review): this loop variable shadows the
                            # outer "prefix"; subsequent uses of prefix in
                            # this iteration see the last target prefix —
                            # looks unintended, confirm before changing.
                            for prefix in target_prefixs:
                                self.helper.merge_file(os.path.join(
                                    self.target_seq_path,
                                    "_".join([prefix, "target.fa"])),
                                    os.path.join(
                                        args_tar.out_folder,
                                        "".join([self.tmps["tmp"],
                                                 str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, prefix,
                                                out_rnaup, out_log,
                                                args_tar, log)
                                processes = []
                                num_up = 0
            self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                            args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup +
                      " is complete generated and updated.\n")

    def _intarna(self, prefixs, target_prefixs, args_tar, log):
        '''Run IntaRNA for every sRNA prefix against all targets.'''
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write("Please make sure the version of IntaRNA is at least 2.0.4.\n")
        all_target = os.path.join(self.target_seq_path, "all_target.fa")
        if os.path.exists(all_target):
            os.remove(all_target)
        for prefix in target_prefixs:
            self.helper.merge_file(os.path.join(
                self.target_seq_path, prefix + "_target.fa"), all_target)
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                os.path.join(self.intarna_path, prefix))
            call([args_tar.intarna_path,
                  "-q", os.path.join(
                      self.srna_seq_path,
                      "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])),
                  "-t", all_target,
                  "--qAccW", str(args_tar.slide_win_srna),
                  "--qAccL", str(args_tar.max_loop_srna),
                  "--tAccW", str(args_tar.slide_win_target),
                  "--tAccL", str(args_tar.max_loop_target),
                  "--outMode", "C", "-m", args_tar.mode_intarna,
                  "--threads", str(args_tar.core_inta),
                  "--out", intarna_file])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, target_prefixs, args_tar, log):
        '''merge the result of IntaRNA, RNAup and RNAplex'''
        log.write("Running merge_rnaplex_rnaup.py to merge the results from "
                  "RNAplex, RNAup, and IntaRNA for generating finanl output.\n")
        log.write("The following files are generated:\n")
        all_gff = os.path.join(self.gff_path, "all.gff")
        if os.path.exists(all_gff):
            os.remove(all_gff)
        for prefix in target_prefixs:
            self.helper.merge_file(os.path.join(
                self.gff_path, prefix + ".gff"), all_gff)
        for prefix in prefixs:
            # Files stay None for tools that were not requested.
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(
                os.path.join(self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, intarna_file, args_tar,
                out_rnaplex, out_rnaup, out_intarna,
                os.path.join(self.fasta_path, prefix + ".fa"),
                merge_file, overlap_file,
                os.path.join(self.srna_path,
                             "_".join([prefix, "sRNA.gff"])),
                all_gff, target_prefixs)
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks,
                        out_tmp, print_):
        # De-duplicate one line of RNAplex output (header pairs alternate
        # with interaction lines); state is threaded through the returns.
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
            pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info,
                      checks, out_tmp, print_, tar):
        # De-duplicate one line of RNAup output (sRNA header, target
        # header, then result lines); state threaded through the returns.
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        # De-duplicate IntaRNA CSV (";"-separated) output rows keyed by
        # (target, sRNA, sequence).
        if (line.startswith(".")) or (line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        '''Rewrite interact_file with duplicated entries removed.'''
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        out_tmp = open(interact_file + "tmp", "w")
        with open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                        line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                        self._remove_rnaup(line, pre, num, pre_num,
                                           srna_info, checks, out_tmp,
                                           print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                        line, checks, tar, srna_info, seq, out_tmp)
        out_tmp.close()
        shutil.move(interact_file + "tmp", interact_file)

    def run_srna_target_prediction(self, args_tar, log):
        '''Entry point: prepare sequences, run the requested tools,
        merge/rank the predictions, and clean the temporaries.'''
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        target_prefixs = []
        self._gen_seq(prefixs, target_prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, target_prefixs, args_tar, log)
            self.helper.remove_all_content(self.target_seq_path,
                                           "_target_", "file")
            shutil.rmtree(os.path.join(self.rnaplex_path,
                                       "tmp_RNAplfold"))
            log.write("The temporary files for running RNAplex are deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, target_prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, target_prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, target_prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
        os.remove(os.path.join(self.target_seq_path, "all_target.fa"))
class PPINetwork(object):
    '''Retrieval of a protein-protein interaction network.

    Resolves locus tags to STRING IDs, downloads STRING actions and
    PIE/PubMed literature scores via wget, and writes per-strain result
    tables plus figures.
    '''

    def __init__(self, out_folder):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        # Result flavors: with the literature restricted to the strain
        # ("specific") or not ("nospecific").
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        self.tmp_files = {"log": "tmp_log",
                          "action": "tmp_action.log",
                          "pubmed": "tmp_pubmed.log",
                          "specific": os.path.join(
                              out_folder, "tmp_specific"),
                          "nospecific": os.path.join(
                              out_folder, "tmp_nospecific"),
                          "wget_action": os.path.join(
                              out_folder, "tmp_action")}

    def _make_folder_no_exist(self, path, folder):
        # mkdir only when the sub-folder is not there yet.
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        # Download "source" to file "folder"; wget stderr goes to log.
        call(["wget", source, "-O", folder], stderr=log)
        time.sleep(1)

    def _wget_id(self, strain, locus, strain_id, files):
        '''Resolve one locus tag to a STRING ID (downloaded file).'''
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                             locus, strain_id["string"])
            self._run_wget(id_source,
                           os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                    gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        '''Look up the preferred protein name for a STRING ID.'''
        prefername = ""
        filename = row_a.split(".")
        # Resolve on demand unless everything was queried up front.
        if (filename[1] not in os.listdir(files["id_list"])) and (
                "all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        # Write the per-protein table header (preferred name taken from
        # the last row of the resolved-ID file).
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\t"
                  "a_is_acting\tSTRING_action_score\tpubmed_id\t"
                  "pubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file,
                    first_output, ptt, files, paths, args_ppi):
        '''Fetch PIE/PubMed scores for one interaction and merge them.'''
        prefer1 = self._get_prefer_name(row[0], strain_id, files,
                                        args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id, files,
                                        args_ppi.querys)
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                # Query without restricting to the strain.
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source,
                               self.tmp_files["nospecific"],
                               files["pubmed_log"])
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = (
                "http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                    prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            # Rewrite the action row with merged mode/actor and the
            # preferred names before merging.
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                    first_output, self.tmp_files["nospecific"],
                    files["all_nospecific"], files["best_nospecific"],
                    row, args_ppi.score, id_file, files["id_list"],
                    "nospecific",
                    os.path.join(paths["all"], self.without_strain),
                    os.path.join(paths["best"], self.without_strain),
                    ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        # row == "NA" marks "no PubMed data" rows.
        if row == "NA":
            out_single.write("\t".join(
                [ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
        else:
            out_single.write("\t".join(
                [ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        '''Merge one downloaded PubMed result into the all/best tables.'''
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(os.path.join(
                all_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(os.path.join(
                best_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                # Write the aggregated-table header once per file type.
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write("\t".join([ptt, "\t".join(row_a),
                                         "\t".join(row)]) + "\n")
                # "best" keeps only rows at/above the score cutoff.
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a,
                                            ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type,
                                               "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write("\t".join([ptt, "\t".join(row_a),
                                              "\t".join(row)]) + "\n")
            f_h.close()
            if not detect:
                # No row passed the cutoff: drop the empty "best" file.
                os.remove(os.path.join(
                    best_folder, ptt,
                    "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            # Empty download: record the interaction with NA scores.
            out_all_single = open(os.path.join(
                all_folder, ptt,
                "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join([ptt, "\t".join(row_a),
                                     "NA", "NA"]) + "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        '''Collect the queried locus tags from the .ptt table.'''
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            # A single-column "name - range" row introduces a replicon.
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1) and (row[0] != "Location") and (
                            name == strain) and (
                            start == row[0].split("..")[0]) and (
                            end == row[0].split("..")[1]) and (
                            strand == row[1]):
                        genes.append({"strain": name,
                                      "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        # Folders and aggregated output files for the strain-unrestricted
        # ("without_strain") results.
        self._make_subfolder(
            paths["all"], self.without_strain, strain_id["ptt"])
        self._make_subfolder(
            paths["best"], self.without_strain, strain_id["ptt"])
        self._make_subfolder(
            paths["fig"], self.without_strain, strain_id["ptt"])
        filename_nostrain = "_".join(
            [strain_id["file"].replace(".ptt", ""),
             self.without_strain + ".csv"])
        files["all_nospecific"] = open(os.path.join(
            paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(os.path.join(
            paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file,
                                    files, paths, args_ppi):
        '''Create the per-strain output tree and read the query genes.'''
        if strain_id["file"].endswith(".ptt"):
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder(
                    "_".join([self.tmp_id, strain_id["file"]]))
                paths["all"] = os.path.join(
                    self.all_result, strain_id["file"][:-4])
                paths["best"] = os.path.join(
                    self.best_result, strain_id["file"][:-4])
                paths["fig"] = os.path.join(
                    self.fig, strain_id["file"][:-4])
                self.helper.check_make_folder(
                    os.path.join(self.all_result,
                                 strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.best_result,
                                 strain_id["file"][:-4]))
                self.helper.check_make_folder(
                    os.path.join(self.fig, strain_id["file"][:-4]))
                self._make_subfolder(
                    paths["all"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                    paths["best"], self.with_strain, strain_id["ptt"])
                self._make_subfolder(
                    paths["fig"], self.with_strain, strain_id["ptt"])
                filename_strain = "_".join(
                    [strain_id["file"].replace(".ptt", ""),
                     self.with_strain + ".csv"])
                files["all_specific"] = open(os.path.join(
                    paths["all"], filename_strain), "w")
                files["best_specific"] = open(os.path.join(
                    paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id,
                                             strain_id["file"]])
                files["id_log"] = open(os.path.join(
                    files["id_list"], self.tmp_files["log"]), "w")
                files["action_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["pubmed"]), "w")
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                # Same .ptt file as before: only add the missing
                # per-replicon sub-folders.
                self._make_folder_no_exist(os.path.join(
                    paths["all"], self.with_strain), strain_id["ptt"])
                self._make_folder_no_exist(os.path.join(
                    paths["best"], self.with_strain), strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        # NOTE(review): "genes" is only bound on the first branch with an
        # existing .ptt file; other paths would raise here — confirm the
        # callers guarantee that case.
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        '''Download the STRING actions for one resolved ID file.'''
        detect = False
        t_h = open(os.path.join(files["id_list"], id_file), "r")
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
            id_file, strain_id["string"], strain_id["file"]))
        for row in csv.reader(t_h, delimiter="\t"):
            if row[0].startswith("stringId"):
                continue
            else:
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = ("http://string-db.org/api/tsv/"
                                     "actions?identifier={0}&species="
                                     "{1}").format(row[0], row[1])
                    self._run_wget(
                        action_source, self.tmp_files["wget_action"],
                        files["action_log"])
                    break
        t_h.close()
        if not detect:
            print("Warning: " + id_file +
                  " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        '''Group consecutive action rows per interaction pair and fetch
        the PubMed evidence for each pair.'''
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if detect_id:
                    a_h = open(self.tmp_files["wget_action"], "r")
                    pre_row = []
                    first = True
                    detect = False
                    first_output = {"specific_all": True,
                                    "specific_best": True,
                                    "nospecific_all": True,
                                    "nospecific_best": True}
                    print("Retrieving Pubmed for {0} of {1} -- "
                          "{2}".format(id_file, strain_id["string"],
                                       strain_id["file"]))
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            continue
                        else:
                            detect = True
                            if first:
                                first = False
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                # New pair: flush the accumulated pair.
                                if (row_a[0] != pre_row[0]) or (
                                        row_a[1] != pre_row[1]):
                                    self._get_pubmed(
                                        pre_row, strain_id, mode, actor,
                                        id_file, first_output,
                                        strain_id["ptt"], files, paths,
                                        args_ppi)
                                    mode = row_a[2]
                                    actor = row_a[4]
                                else:
                                    # Same pair: concatenate modes/actors.
                                    mode = mode + ";" + row_a[2]
                                    actor = actor + ";" + row_a[4]
                            pre_row = row_a
                    if detect:
                        detect = False
                        # Flush the final pair.
                        self._get_pubmed(
                            row_a, strain_id, mode, actor, id_file,
                            first_output, strain_id["ptt"], files, paths,
                            args_ppi)
                if detect_id:
                    a_h.close()

    def _plot(self, args_ppi, files):
        # Close the aggregated tables, then draw a figure per strain.
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(os.path.join(
                             self.all_result, folder,
                             "_".join([folder,
                                       self.with_strain + ".csv"])),
                         args_ppi.score,
                         os.path.join(self.fig, folder,
                                      self.with_strain),
                         args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(os.path.join(
                                 self.all_result, folder,
                                 "_".join([folder,
                                           self.without_strain +
                                           ".csv"])),
                             args_ppi.score,
                             os.path.join(self.fig, folder,
                                          self.without_strain),
                             args_ppi.size)

    def _remove_tmps(self, args_ppi):
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        '''Entry point: convert annotations, resolve IDs, download the
        interaction data, plot, and clean up.'''
        strain_ids = []
        paths = {}
        files = {}
        # Each strain argument is "gff:replicon:string_species:pie_terms".
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                os.path.join(args_ppi.ptts, datas[0]), "0",
                os.path.join(args_ppi.ptts, ptt_file),
                os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({"file": ptt_file,
                               "ptt": datas[1],
                               "string": datas[2],
                               "pie": datas[3]})
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths,
                                                     args_ppi)
            # Translate a species name/alias to its STRING taxon ID.
            s_h = open(args_ppi.species, "r")
            for row in csv.reader(s_h, delimiter="\t"):
                if row[0] != "##":
                    if row[0] == strain_id["string"]:
                        break
                    elif row[2] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
                    elif row[3] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
class Screen(object):
    '''Generation of IGV screenshot batch scripts.'''

    def __init__(self, args_sc, out_folder):
        self.helper = Helper()
        args_sc.output_folder = out_folder
        # The strain name is the fasta filename without its extension.
        # (os.path.basename replaces the non-portable split("/")[-1].)
        filename = os.path.basename(args_sc.fasta)
        self.strain = ".".join(filename.split(".")[0:-1])
        self.helper.check_make_folder(
            os.path.join(args_sc.output_folder, self.strain))
        self.forward_file = os.path.join(args_sc.output_folder,
                                         self.strain, "forward")
        self.reverse_file = os.path.join(args_sc.output_folder,
                                         self.strain, "reverse")
        os.mkdir(self.forward_file)
        os.mkdir(self.reverse_file)

    def _import_libs(self, texs, strand, lib_dict):
        '''Sort one strand's TEX libraries into lib_dict.

        Each entry of ``texs`` is a ":"-split library spec:
        [wig_file, tex_or_notex, condition, replicate, strand].
        Every "tex" wig is appended to the tex bucket, and each "notex"
        wig sharing the same condition and replicate is appended to the
        matching notex bucket.
        '''
        if strand == "+":
            tex = "ft"
            notex = "fn"
        else:
            tex = "rt"
            notex = "rn"
        for flib in texs:
            if (flib[1] == "tex"):
                lib_dict[tex].append(flib[0])
                for nlib in texs:
                    if (nlib[1] == "notex") and \
                       (flib[2] == nlib[2]) and \
                       (flib[3] == nlib[3]):
                        lib_dict[notex].append(nlib[0])

    def screenshot(self, args_sc, log):
        '''Generate the forward/reverse IGV batch files.

        Fix: the "no wig file" check now runs *before* gen_screenshot,
        so the run fails fast instead of first generating batch scripts
        from an empty library dict and only then exiting with an error.
        '''
        if (args_sc.tlibs is None) and (args_sc.flibs is None):
            log.write("No wig files can be found.\n")
            print("Error: There is no wig file assigned!")
            sys.exit()
        lib_dict = {"ft": [], "fn": [], "rt": [], "rn": [],
                    "ff": [], "rf": []}
        f_texs = []
        r_texs = []
        if args_sc.tlibs is not None:
            for lib in args_sc.tlibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    log.write("Wiggle files should end with .wig.\n")
                    print("Error: Wiggle files should end with .wig!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        f_texs.append(lib_datas)
                    else:
                        r_texs.append(lib_datas)
            # Sort by (tex/notex, condition, replicate) so pairing in
            # _import_libs sees a deterministic order.
            f_texs = sorted(f_texs, key=lambda x: (x[1], x[2], x[3]))
            r_texs = sorted(r_texs, key=lambda x: (x[1], x[2], x[3]))
            self._import_libs(f_texs, "+", lib_dict)
            self._import_libs(r_texs, "-", lib_dict)
        if args_sc.flibs is not None:
            for lib in args_sc.flibs:
                lib_datas = lib.split(":")
                if not lib_datas[0].endswith(".wig"):
                    log.write("Wiggle files should end with .wig.\n")
                    print("Error: Wiggle files should end with .wig!")
                    sys.exit()
                else:
                    if lib_datas[-1] == "+":
                        lib_dict["ff"].append(lib_datas[0])
                    else:
                        lib_dict["rf"].append(lib_datas[0])
        log.write("Running gen_screenshots.py to generate IGV batch script.\n")
        gen_screenshot(args_sc, lib_dict, self.forward_file + ".txt",
                       self.reverse_file + ".txt", self.strain)
        log.write("\t" + self.forward_file + ".txt is generated.\n")
        log.write("\t" + self.reverse_file + ".txt is generated.\n")
class TranscriptDetection(object):
    '''Transcript detection from wig coverage files.

    Detects transcripts per strain, merges fragmented/TEX results,
    compares them with TSS and genome annotation, and writes the final
    gff/statistics output.
    '''

    def __init__(self, args_tran):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # Scratch file/folder names used while merging and comparing.
        self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                     "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                     "tss_ta": os.path.join(self.gff_outfolder, "tmp_tss_ta"),
                     "ta_tss": os.path.join(self.gff_outfolder, "tmp_ta_tss"),
                     "ta_gff": os.path.join(self.gff_outfolder, "tmp_ta_gff"),
                     "gff_ta": os.path.join(self.gff_outfolder, "tmp_gff_ta"),
                     "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                     "overlap": os.path.join(self.gff_outfolder,
                                             "tmp_overlap")}
        self.frag = "transcript_fragment.gff"
        self.tex = "transcript_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type,
                            strain, libs, args_tran):
        # Run the detection for one strain and one wig type.
        print("Computing transcript for {0}".format(strain))
        out = os.path.join(args_tran.out_folder,
                           "_".join([strain, wig_type]))
        detect_transcript(wig_f, wig_r, wig_folder, libs, out,
                          wig_type, args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        # Detect transcripts for every strain found in the wig folder;
        # returns the list of strain names.
        strains = []
        wig_folder = os.path.join(wigs, "tmp")
        for wig in os.listdir(wig_folder):
            if wig.endswith("_forward.wig"):
                strains.append(wig.replace("_forward.wig", ""))
        for strain in strains:
            f_file = os.path.join(wig_folder,
                                  "_".join([strain, "forward.wig"]))
            r_file = os.path.join(wig_folder,
                                  "_".join([strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran):
        # Compare each transcript gff with the matching TSS gff and
        # regenerate both files sorted.
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.compare_tss, "tmp"),
            "transcript", "TSS")
        print("Comparing of Transcript and TSS file")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_TSS_", ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["tss_ta"],
                        os.path.join(args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])

    def _compare_cds(self, tas, args_tran):
        # Compare each transcript gff with the genome annotation gff.
        self.multiparser.parser_gff(args_tran.gffs, None)
        self.multiparser.combine_gff(self.gff_outfolder,
                                     os.path.join(args_tran.gffs, "tmp"),
                                     "transcript", None)
        print("Comparing of Transcript and genome annotation")
        cds_folder = os.path.join(args_tran.gffs, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_genome_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.gffs, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["gff_ta"],
                        os.path.join(args_tran.gffs, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])

    def _compare_tss_cds(self, tas, args_tran):
        '''compare transcript with CDS and TSS'''
        if (args_tran.compare_tss is not None) and (
                args_tran.c_feature is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
            self._compare_tss(tas, args_tran)
        elif (args_tran.c_feature is not None) and (
                args_tran.compare_tss is None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran)
        elif (args_tran.c_feature is None) and (
                args_tran.compare_tss is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_tss(tas, args_tran)

    def _for_one_wig(self, type_, args_tran):
        '''running transcript detection to one type of wig files'''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Computing {0} wig files".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        for strain in strains:
            out = os.path.join(
                self.gff_outfolder,
                "_".join([strain, "transcript", type_ + ".gff"]))
            self.helper.sort_gff(
                os.path.join(args_tran.out_folder,
                             "_".join([strain, type_])), out)
            os.remove(os.path.join(args_tran.out_folder,
                                   "_".join([strain, type_])))
        return strains

    def _for_two_wigs(self, strains, args_tran):
        '''merge the results of fragemented and tex treated libs'''
        if (args_tran.frag_wigs is not None) and (
                args_tran.tex_wigs is not None):
            print("Merging fragment and tex treat one")
            for strain in strains:
                frag_gff = os.path.join(self.gff_outfolder,
                                        "_".join([strain, self.frag]))
                tex_gff = os.path.join(self.gff_outfolder,
                                       "_".join([strain, self.tex]))
                final_gff = os.path.join(
                    self.gff_outfolder,
                    "_".join([strain, self.endfix_tran]))
                for gff in os.listdir(self.gff_outfolder):
                    if "_transcript_" in gff:
                        filename = gff.split("_transcript_")
                        if (strain == filename[0]) and (
                                "tex_notex.gff" == filename[1]):
                            tex_file = gff
                        elif (strain == filename[0]) and (
                                "fragment.gff" == filename[1]):
                            frag_file = gff
                combine(os.path.join(self.gff_outfolder, frag_file),
                        os.path.join(self.gff_outfolder, tex_file),
                        args_tran.tolerance, final_gff)
                os.remove(frag_gff)
                os.remove(tex_gff)
        else:
            # Only one library type: the single result becomes final.
            if args_tran.frag_wigs is not None:
                for strain in strains:
                    frag_gff = os.path.join(self.gff_outfolder,
                                            "_".join([strain, self.frag]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(frag_gff, final_gff)
            elif args_tran.tex_wigs is not None:
                for strain in strains:
                    tex_gff = os.path.join(self.gff_outfolder,
                                           "_".join([strain, self.tex]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(tex_gff, final_gff)

    def _post_modify(self, tas, args_tran):
        '''modify the transcript by comparing with genome annotation'''
        for ta in tas:
            # Relies on the loop variable leaking: `gff` keeps the
            # matching annotation filename after `break`.
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} referring to {1}".format(ta, gff))
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "overlap", self.tmps["overlap"])
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "uni", self.tmps["uni"])
            tmp_merge = os.path.join(self.gff_outfolder,
                                     self.tmps["merge"])
            # BUG FIX: original tested `self.tmps["merge"] in
            # self.gff_outfolder` -- a substring test on the folder
            # *path string* -- so a stale tmp_merge file was never
            # removed before merge_file appended to it.
            if self.tmps["merge"] in os.listdir(self.gff_outfolder):
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder,
                                   "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(final_out,
                        os.path.join(self.tmps["tran"],
                                     "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        # Clean up every scratch folder/file produced by the run.
        if "tmp_wig" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "tmp_wig"))
        self.helper.remove_tmp_dir(args_tran.gffs)
        self.helper.remove_tmp_dir(args_tran.compare_tss)
        self.helper.remove_tmp_dir(args_tran.terms)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran):
        '''searching the associated terminator to transcript'''
        if args_tran.terms is not None:
            print("Comparing between terminators and transcripts")
            self.multiparser.parser_gff(args_tran.terms, "term")
            if args_tran.gffs is not None:
                self.multiparser.combine_gff(
                    args_tran.gffs,
                    os.path.join(args_tran.terms, "tmp"), None, "term")
            compare_term_tran(self.gff_outfolder,
                              os.path.join(args_tran.terms, "tmp"),
                              args_tran.fuzzy_term, args_tran.fuzzy_term,
                              args_tran.out_folder, "transcript",
                              args_tran.terms, self.gff_outfolder)

    def run_transcript(self, args_tran):
        '''Entry point: detect, merge, post-process and compare.'''
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            print("Error: There is no wigs files!!!!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran)
        tas = []
        if args_tran.gffs is not None:
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(
                        os.path.join(args_tran.gffs, gff),
                        self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(
                args_tran.gffs, os.path.join(args_tran.gffs, "tmp"),
                None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff"):
                    if os.path.getsize(
                            os.path.join(self.tran_path, ta)) != 0:
                        tas.append(ta.replace("_" + self.endfix_tran, ""))
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran)
        self._compare_term_tran(args_tran)
        print("Generating table for the details")
        gen_table_transcript(self.gff_outfolder, args_tran)
        plot_tran(self.gff_outfolder, self.stat_path, args_tran.max_dist)
        self._remove_file(args_tran)
class sRNADetection(object): '''detection of sRNA''' def __init__(self, args_srna): self.args_container = ArgsContainer() self.helper = Helper() self.multiparser = Multiparser() self.gff_output = os.path.join(args_srna.out_folder, "gffs") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.tss_path = self._check_folder_exist(args_srna.tss_folder) self.pro_path = self._check_folder_exist(args_srna.pro_folder) self.sorf_path = self._check_folder_exist(args_srna.sorf_file) self.fasta_path = os.path.join(args_srna.fastas, "tmp") self.tran_path = os.path.join(args_srna.trans, "tmp") self.term_path = self._check_folder_exist(args_srna.terms) self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs") self.prefixs = { "merge": os.path.join(args_srna.out_folder, "tmp_merge"), "utr": os.path.join(args_srna.out_folder, "tmp_utrsrna"), "normal": os.path.join(args_srna.out_folder, "tmp_normal"), "in_cds": os.path.join(args_srna.out_folder, "tmp_incds"), "merge_table": os.path.join(args_srna.out_folder, "tmp_merge_table"), "utr_table": os.path.join(args_srna.out_folder, "tmp_utrsrna_table"), "normal_table": os.path.join(args_srna.out_folder, "tmp_normal_table"), "in_cds_table": os.path.join(args_srna.out_folder, "tmp_incds_table"), "basic": os.path.join(args_srna.out_folder, "tmp_basic"), "energy": os.path.join(args_srna.out_folder, "tmp_energy") } self.tmps = { "nr": os.path.join(args_srna.out_folder, "tmp_nr"), "srna": os.path.join(args_srna.out_folder, "tmp_sRNA") } self.best_table = os.path.join(self.table_output, "best") self.table_output = os.path.join(args_srna.out_folder, "tables") self.stat_path = os.path.join(args_srna.out_folder, "statistics") self.all_best = { "all_gff": os.path.join(self.gff_output, "all_candidates"), "best_gff": os.path.join(self.gff_output, "best"), "all_table": os.path.join(self.table_output, "all_candidates"), "best_table": os.path.join(self.table_output, 
"best") } def _check_folder_exist(self, folder): if folder is not None: path = os.path.join(folder, "tmp") else: path = None return path def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_format(self, blast_path, database, type_, db_file, err): call([ os.path.join(blast_path, "makeblastdb"), "-in", database, "-dbtype", type_, "-out", db_file ], stderr=err) def _formatdb(self, database, type_, out_folder, blast_path, database_type): err = open(os.path.join(out_folder, "log.txt"), "w") if (database.endswith(".fa")) or (database.endswith(".fna")) or ( database.endswith(".fasta")): pass else: folders = database.split("/") filename = folders[-1] folder = "/".join(folders[:-1]) for fasta in os.listdir(folder): if (fasta.endswith(".fa")) or (fasta.endswith(".fna")) or ( fasta.endswith(".fasta")): if ".".join(fasta.split(".")[:-1]) == filename: database = os.path.join(folder, fasta) if database_type == "sRNA": change_format(database, "tmp_srna_database") os.remove(database) shutil.move("tmp_srna_database", database) db_file = ".".join(database.split(".")[:-1]) self._run_format(blast_path, database, type_, db_file, err) err.close() def _merge_frag_tex_file(self, files, args_srna): '''merge the results of fragmented and tex treated libs''' if (args_srna.frag_wigs is not None) and (args_srna.tex_wigs is not None): self.helper.merge_file(files["frag_gff"], files["tex_gff"]) self.helper.merge_file(files["frag_csv"], files["tex_csv"]) shutil.move(files["tex_csv"], files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) os.remove(files["frag_csv"]) os.remove(files["frag_gff"]) os.remove(files["tex_gff"]) elif (args_srna.frag_wigs is not None): shutil.move(files["frag_csv"], files["merge_csv"]) self.helper.sort_gff(files["frag_gff"], files["merge_gff"]) os.remove(files["frag_gff"]) elif (args_srna.tex_wigs is not None): shutil.move(files["tex_csv"], 
files["merge_csv"]) self.helper.sort_gff(files["tex_gff"], files["merge_gff"]) def _read_lib_wig(self, args_srna): libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder) wigs_f = read_wig(args_srna.wig_f_file, "+", libs) wigs_r = read_wig(args_srna.wig_r_file, "-", libs) return [libs, texs, wigs_f, wigs_r] def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna): '''detection of intergenic and antisense sRNA''' tex_datas = None frag_datas = None if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter")) files = { "frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None } if self.tss_path is not None: tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff", prefix, None, None) else: tss = None if self.pro_path is not None: pro = self.helper.get_correct_file(self.pro_path, "_processing.gff", prefix, None, None) else: pro = None if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join(args_srna.out_folder, "_".join(["tmp_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_frag_table", prefix])) args_srna = self.args_container.container_intersrna( "frag", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) frag_datas = self._read_lib_wig(args_srna) intergenic_srna(args_srna, frag_datas[0], frag_datas[1], frag_datas[2], frag_datas[3]) if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join(args_srna.out_folder, "_".join(["tmp_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_tex_table", prefix])) args_srna = self.args_container.container_intersrna( "tex", files, args_srna, prefix, os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss) tex_datas = self._read_lib_wig(args_srna) intergenic_srna(args_srna, tex_datas[0], tex_datas[1], tex_datas[2], tex_datas[3]) files["merge_csv"] = 
"_".join([self.prefixs["normal_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["normal"], prefix]) self._merge_frag_tex_file(files, args_srna) if ("TSS_class" in os.listdir( args_srna.out_folder)) and (not args_srna.tss_source): tss = os.path.join(args_srna.out_folder, "TSS_class", prefix + "_TSS.gff") return tss, frag_datas, tex_datas def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna, frag_datas, tex_datas): '''detection of UTR-derived sRNA''' if "tmp_median" in os.listdir(args_srna.out_folder): os.remove(os.path.join(args_srna.out_folder, "tmp_median")) files = { "frag_gff": None, "frag_csv": None, "tex_gff": None, "tex_csv": None, "merge_gff": None, "merge_csv": None } if args_srna.tex_wigs is not None: files["tex_gff"] = os.path.join(args_srna.out_folder, "_".join(["tmp_utr_tex", prefix])) files["tex_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "tex", prefix, args_srna) utr_derived_srna(args_srna, tex_datas[0], tex_datas[1], tex_datas[2], tex_datas[3]) if args_srna.frag_wigs is not None: files["frag_gff"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag", prefix])) files["frag_csv"] = os.path.join( args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix])) args_srna = self.args_container.container_utrsrna( os.path.join(args_srna.gffs, gff), tran, tss, files, pro, os.path.join(self.fasta_path, prefix + ".fa"), "frag", prefix, args_srna) utr_derived_srna(args_srna, frag_datas[0], frag_datas[1], frag_datas[2], frag_datas[3]) files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix]) files["merge_gff"] = "_".join([self.prefixs["utr"], prefix]) self._merge_frag_tex_file(files, args_srna) filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr) def _check_necessary_file(self, args_srna): if (args_srna.gffs is 
None) or (args_srna.trans is None) or ( (args_srna.tex_wigs is None) and (args_srna.frag_wigs is None)): print("Error: lack required files!!!!") sys.exit() if args_srna.utr_srna: if (args_srna.tss_folder is None): print("Error: lack required TSS files for UTR " "derived sRNA detection!!!!") sys.exit() if (args_srna.pro_folder is None): print("Warning: lack Processing site files for UTR " "derived sRNA detection!!!") print("it may effect the results!!!!") self._check_gff(args_srna.gffs) self._check_gff(args_srna.trans) if args_srna.tss_folder is not None: self._check_gff(args_srna.tss_folder) self.multiparser.parser_gff(args_srna.tss_folder, "TSS") self.multiparser.combine_gff(args_srna.gffs, self.tss_path, None, "TSS") if args_srna.pro_folder is not None: self._check_gff(args_srna.pro_folder) self.multiparser.parser_gff(args_srna.pro_folder, "processing") self.multiparser.combine_gff(args_srna.gffs, self.pro_path, None, "processing") if args_srna.sorf_file is not None: self._check_gff(args_srna.sorf_file) self.multiparser.parser_gff(args_srna.sorf_file, "sORF") self.multiparser.combine_gff(args_srna.gffs, self.sorf_path, None, "sORF") if args_srna.import_info is not None: if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or ( args_srna.nr_database is not None) or (args_srna.srna_database is not None): if args_srna.fastas is None: print("Error: lack required fasta files for UTR " "derived sRNA detection!!!!") sys.exit() self.multiparser.parser_fasta(args_srna.fastas) self.multiparser.combine_fasta(args_srna.gffs, self.fasta_path, None) if args_srna.terms is not None: self._check_gff(args_srna.terms) self.multiparser.parser_gff(args_srna.terms, "term") self.multiparser.combine_gff(args_srna.gffs, self.term_path, None, "term") else: self.term_path = None def _merge_tex_frag_datas(self, tex_datas, frag_datas): if (tex_datas is not None) and (frag_datas is not None): for index in [2, 3]: for strain, conds in frag_datas[index].items(): if strain not in 
tex_datas[index].keys(): tex_datas[index][strain] = conds else: for cond, tracks in conds.items(): tex_datas[index][strain][cond] = tracks elif (tex_datas is None) and (frag_datas is not None): tex_datas = frag_datas return tex_datas def _run_program(self, args_srna): prefixs = [] tss = None for gff in os.listdir(args_srna.gffs): if gff.endswith(".gff"): prefix = gff.replace(".gff", "") prefixs.append(prefix) print("Running sRNA detection of {0}....".format(prefix)) tran = self.helper.get_correct_file(self.tran_path, "_transcript.gff", prefix, None, None) gffs = { "merge": "_".join([self.prefixs["merge"], prefix]), "utr": "_".join([self.prefixs["utr"], prefix]), "normal": "_".join([self.prefixs["normal"], prefix]) } csvs = { "merge": "_".join([self.prefixs["merge_table"], prefix]), "utr": "_".join([self.prefixs["utr_table"], prefix]), "normal": "_".join([self.prefixs["normal_table"], prefix]) } tss, frag_datas, tex_datas = self._run_normal( prefix, gff, tran, args_srna.fuzzy_tsss["inter"], args_srna) if args_srna.utr_srna: print("Running UTR derived sRNA detection of {0}".format( prefix)) if tss is None: tss = self.helper.get_correct_file( self.tss_path, "_TSS.gff", prefix, None, None) if self.pro_path is not None: pro = self.helper.get_correct_file( self.pro_path, "_processing.gff", prefix, None, None) else: pro = None if tss is not None: self._run_utrsrna(gff, tran, prefix, tss, pro, args_srna, frag_datas, tex_datas) tex_datas = self._merge_tex_frag_datas(tex_datas, frag_datas) del frag_datas gc.collect() self._merge_srna(args_srna, gffs, csvs, prefix, os.path.join(args_srna.gffs, gff), tss, tex_datas) del tex_datas filter_frag(csvs["merge"], gffs["merge"]) self.helper.sort_gff(gffs["merge"], "_".join([self.prefixs["basic"], prefix])) return prefixs def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss, tex_datas): print("merging data of sRNA...") merge_srna_gff(gffs, args_srna.in_cds, args_srna.cutoff_overlap, gff_file) 
merge_srna_table(gffs["merge"], csvs, tex_datas[2], tex_datas[3], tss, args_srna) def _run_RNAfold(self, seq_file, vienna_path, sec_file): os.system(" ".join([ "cat", seq_file, "|", os.path.join(vienna_path, "RNAfold"), "-p", ">", sec_file ])) def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path, dot_path, vienna_path): '''extract the sec str energy''' detect = False for fasta in os.listdir(fasta_path): if fasta.endswith(".fa") and (fasta.replace(".fa", "") == prefix): detect = True break if detect: detect = False seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix])) sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix])) self.helper.get_seq("_".join([self.prefixs["basic"], prefix]), os.path.join(fasta_path, fasta), seq_file) else: print("Error:There is not fasta file of {0}".format(prefix)) print("please check your imported information") sys.exit() tmp_path = os.path.join(out_folder, "tmp_srna") self.helper.check_make_folder(tmp_path) main_path = os.getcwd() os.chdir(tmp_path) sec_file = os.path.join(main_path, sec_file) seq_file = os.path.join(main_path, seq_file) tmp_sec_path = os.path.join(main_path, sec_path) tmp_dot_path = os.path.join(main_path, dot_path) self._run_RNAfold(seq_file, vienna_path, sec_file) extract_energy( os.path.join(main_path, "_".join([self.prefixs["basic"], prefix])), sec_file, os.path.join(main_path, "_".join([self.prefixs["energy"], prefix]))) for ps in os.listdir(os.getcwd()): new_ps = ps.replace("|", "_") shutil.move(ps, new_ps) return { "sec": tmp_sec_path, "dot": tmp_dot_path, "main": main_path, "tmp": os.path.join(main_path, tmp_path) } def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file): os.system(" ".join([ os.path.join(vienna_util, "relplot.pl"), os.path.join(tmp_paths["tmp"], file_), os.path.join(tmp_paths["tmp"], dot_file), ">", os.path.join(tmp_paths["tmp"], rel_file) ])) def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file): call([ps2pdf14_path, 
os.path.join(tmp_paths["tmp"], file_), pdf_file]) def _replot_sec_to_pdf(self, vienna_util, tmp_paths, ps2pdf14_path, prefix): for file_ in os.listdir(os.getcwd()): if file_.endswith("ss.ps"): dot_file = file_.replace("ss.ps", "dp.ps") rel_file = file_.replace("ss.ps", "rss.ps") print("replot {0}".format(file_)) self._run_replot(vienna_util, tmp_paths, file_, dot_file, rel_file) for file_ in os.listdir(tmp_paths["tmp"]): if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")): pdf_file = file_.replace(".ps", ".pdf") print("convert {0} to pdf".format(file_)) self._convert_pdf(ps2pdf14_path, tmp_paths, file_, pdf_file) os.mkdir(os.path.join(tmp_paths["sec"], prefix)) os.mkdir(os.path.join(tmp_paths["dot"], prefix)) self.helper.move_all_content(tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix), ["rss.pdf"]) self.helper.move_all_content(tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix), ["dp.pdf"]) def _run_mountain(self, vienna_util, tmp_paths, dot_file, out): call([ os.path.join(vienna_util, "mountain.pl"), os.path.join(tmp_paths["tmp"], dot_file) ], stdout=out) def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix, vienna_util): if mountain: tmp_moun_path = os.path.join(tmp_paths["main"], moun_path) os.mkdir(os.path.join(tmp_moun_path, prefix)) txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt") self.helper.check_make_folder(txt_path) print("Generating mountain plot of {0}....".format(prefix)) for dot_file in os.listdir(tmp_paths["tmp"]): if dot_file.endswith("dp.ps"): moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt") out = open(moun_txt, "w") moun_file = dot_file.replace("dp.ps", "mountain.pdf") print("Generating {0}".format(moun_file)) self._run_mountain(vienna_util, tmp_paths, dot_file, out) plot_mountain_plot(moun_txt, moun_file) shutil.move(moun_file, os.path.join(tmp_moun_path, prefix, moun_file)) out.close() os.remove(moun_txt) def _compute_2d_and_energy(self, args_srna, prefixs): print("Running energy calculation....") 
moun_path = os.path.join(args_srna.out_folder, "mountain_plot") sec_path = os.path.join(args_srna.out_folder, "sec_structure", "sec_plot") dot_path = os.path.join(args_srna.out_folder, "sec_structure", "dot_plot") self.helper.remove_all_content(sec_path, None, "dir") self.helper.remove_all_content(dot_path, None, "dir") self.helper.remove_all_content(moun_path, None, "dir") for prefix in prefixs: tmp_paths = self._get_seq_sec(self.fasta_path, args_srna.out_folder, prefix, sec_path, dot_path, args_srna.vienna_path) self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths, args_srna.ps2pdf14_path, prefix) self._plot_mountain(args_srna.mountain, moun_path, tmp_paths, prefix, args_srna.vienna_util) self.helper.remove_all_content(os.getcwd(), ".ps", "file") os.chdir(tmp_paths["main"]) shutil.move("_".join([self.prefixs["energy"], prefix]), "_".join([self.prefixs["basic"], prefix])) shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna")) def _run_blast(self, blast_path, program, database, e, seq_file, blast_file, strand): call([ os.path.join(blast_path, program), "-db", database, "-evalue", str(e), "-strand", strand, "-query", seq_file, "-out", blast_file ]) def _get_strand_fasta(self, seq_file, out_folder): tmp_plus = os.path.join(out_folder, "tmp_plus.fa") tmp_minus = os.path.join(out_folder, "tmp_minus.fa") out_p = open(tmp_plus, "w") out_m = open(tmp_minus, "w") strand = "" with open(seq_file) as sh: for line in sh: line = line.strip() if line.startswith(">"): if line[-1] == "+": out_p.write(line + "\n") strand = "plus" elif line[-1] == "-": out_m.write(line + "\n") strand = "minus" else: if strand == "plus": out_p.write(line + "\n") elif strand == "minus": out_m.write(line + "\n") out_p.close() out_m.close() return tmp_plus, tmp_minus def _blast(self, database, database_format, data_type, args_srna, prefixs, program, database_type, e): if (database is None): print("Error: No database assigned!") else: if database_format: self._formatdb(database, data_type, 
            # NOTE(review): this fragment continues a self._formatdb(...)
            # call whose opening tokens lie before this chunk — indentation
            # reconstructed; confirm against the full file.
            args_srna.out_folder, args_srna.blast_path, database_type)
        # Run BLAST per genome prefix and post-process the hits.
        for prefix in prefixs:
            blast_file = os.path.join(
                args_srna.out_folder, "blast_result_and_misc",
                "_".join([database_type, "blast", prefix + ".txt"]))
            srna_file = "_".join([self.prefixs["basic"], prefix])
            out_file = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp", database_type, prefix]))
            print("Running Blast of {0} in {1}".format(prefix, database))
            seq_file = os.path.join(args_srna.out_folder,
                                    "_".join(["sRNA_seq", prefix]))
            # Extract sRNA sequences only once per prefix.
            if seq_file not in os.listdir(args_srna.out_folder):
                self.helper.get_seq(
                    srna_file,
                    os.path.join(self.fasta_path, prefix + ".fa"), seq_file)
            if database_type == "nr":
                # For nr, blast plus and minus strands separately, then
                # merge the plus-strand hits into the minus-strand file.
                tmp_plus, tmp_minus = self._get_strand_fasta(
                    seq_file, args_srna.out_folder)
                tmp_blast = os.path.join("tmp_blast.txt")
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_plus, tmp_blast, "plus")
                self._run_blast(args_srna.blast_path, program, database, e,
                                tmp_minus, blast_file, "minus")
                self.helper.merge_file(tmp_blast, blast_file)
                os.remove(tmp_blast)
                os.remove(tmp_plus)
                os.remove(tmp_minus)
            else:
                self._run_blast(args_srna.blast_path, program, database, e,
                                seq_file, blast_file, "both")
            extract_blast(blast_file, srna_file, out_file,
                          out_file + ".csv", database_type)
            # Replace the working sRNA file with the blast-annotated one.
            shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''classify the sRNA based on the filters'''
        # Classification only makes sense when at least one filter source
        # (database, sORF, TSS, terminator or promoter) is available.
        if (args_srna.import_info is not None) or (
                args_srna.srna_database is not None) or (
                args_srna.nr_database is not None) or (
                self.sorf_path is not None) or (
                self.tss_path is not None) or (
                self.term_path is not None) or (
                args_srna.promoter_table is not None):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(
                    class_table, prefix))
                self.helper.check_make_folder(os.path.join(
                    class_gff, prefix))
                # Rebind to the per-prefix subfolders created above.
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path,
                    "_".join(["stat_sRNA_class", prefix + ".csv"]))
                classify_srna(os.path.join(self.all_best["all_gff"],
                                           "_".join([prefix, "sRNA.gff"])),
                              class_gff, out_stat, args_srna)
                # One table per generated class gff.
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                        class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table, self.term_path)

    def _get_best_result(self, prefixs, args_srna):
        '''get the best results based on the filters'''
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(os.path.join(self.all_best["all_gff"],
                                       "_".join([prefix, "sRNA.gff"])),
                          best_gff, args_srna)
            gen_srna_table(os.path.join(self.all_best["best_gff"],
                                        "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, best_table, self.term_path)

    def _remove_file(self, args_srna):
        # Clean up every temporary folder/file produced during detection.
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        self.helper.remove_tmp(self.gff_output)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        # merge_wigs only exists when both wig sets were merged.
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        '''set the filter of sRNA'''
        # Optional secondary-structure / energy filter.
        if args_srna.import_info is not None:
            if "sec_str" in args_srna.import_info:
                self._compute_2d_and_energy(args_srna, prefixs)
        # Optional nr-protein blast filter.
        if args_srna.nr_database is not None:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        # Optional sORF-overlap filter.
        if self.sorf_path is not None:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])),
                        tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))
        # Optional known-sRNA blast filter.
        if args_srna.srna_database is not None:
            self._blast(args_srna.srna_database, args_srna.srna_format,
                        "nucl", args_srna, prefixs, "blastn", "sRNA",
                        args_srna.e_srna)

    def _import_info_format(self, import_info):
        # Normalize the user-supplied filter keywords to lower case.
        new_info = []
        for info in import_info:
            info = info.lower()
            new_info.append(info)
        return new_info

    def _gen_table(self, prefixs, args_srna):
        # Generate the "all candidates" table for every genome prefix.
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(os.path.join(self.all_best["all_gff"],
                                        "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table, self.term_path)

    def _print_rank_all(self, prefixs):
        # Rank all candidates against the best set, per prefix.
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        '''filter out the low expressed UTR-derived sRNA'''
        for prefix in prefixs:
            filter_utr(os.path.join(self.all_best["all_gff"],
                                    "_".join([prefix, "sRNA.gff"])),
                       os.path.join(self.all_best["all_table"],
                                    "_".join([prefix, "sRNA.csv"])),
                       min_utr)

    def _antisense(self, gffs, prefixs):
        '''detection of antisense'''
        # Annotate antisense relationships in both the "all" and "best" sets.
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        '''do statistics for blast result'''
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path,
                "stat_" + srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        '''compare sRNA with terminator and promoter'''
        if self.term_path is not None:
            compare_srna_term(os.path.join(self.all_best["all_gff"],
                                           "_".join([prefix, "sRNA.gff"])),
                              out_table,
                              os.path.join(self.term_path,
                                           "_".join([prefix, "term.gff"])),
                              args_srna.fuzzy_b, args_srna.fuzzy_a)
        if (args_srna.promoter_table is not None):
            compare_srna_promoter(os.path.join(self.all_best["all_gff"],
                                               "_".join([prefix,
                                                         "sRNA.gff"])),
                                  out_table, args_srna)

    def run_srna_detection(self, args_srna):
        '''Entry point: detect sRNAs, filter, classify and rank them.'''
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path,
                                     None, "transcript")
        if args_srna.import_info is not None:
            args_srna.import_info = self._import_info_format(
                args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            # Promote the basic candidate gff to the "all" result folder.
            shutil.copyfile("_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.all_best["all_gff"],
                                         "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]),
                prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if args_srna.srna_database is not None:
            # NOTE(review): import_info may still be None here while
            # srna_database is set — the "in" test would raise; confirm
            # the CLI guarantees import_info when a database is given.
            if "blast_srna" in args_srna.import_info:
                self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class PPINetwork(object):
    '''detection of PPI'''

    def __init__(self, out_folder):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gffparser = Gff3Parser()
        # Folder layout of all outputs under out_folder.
        self.tmp_id = os.path.join(out_folder, "tmp_id_list")
        self.all_result = os.path.join(out_folder, "all_results")
        self.best_result = os.path.join(out_folder, "best_results")
        self.fig = os.path.join(out_folder, "figures")
        self.with_strain = "with_strain"
        self.without_strain = "without_strain"
        # Scratch files used while querying STRING / PIE.
        self.tmp_files = {"log": "tmp_log",
                          "action": "tmp_action.log",
                          "pubmed": "tmp_pubmed.log",
                          "specific": os.path.join(
                              out_folder, "tmp_specific"),
                          "nospecific": os.path.join(
                              out_folder, "tmp_nospecific"),
                          "wget_action": os.path.join(
                              out_folder, "tmp_action")}

    def _make_folder_no_exist(self, path, folder):
        # mkdir only when the subfolder is not there yet.
        if folder not in os.listdir(path):
            os.mkdir(os.path.join(path, folder))

    def _make_subfolder(self, path, strain, ptt):
        os.mkdir(os.path.join(path, strain))
        os.mkdir(os.path.join(path, strain, ptt))

    def _run_wget(self, source, folder, log):
        # Fetch a URL into `folder`; the sleep throttles remote requests.
        call(["wget", source, "-O", folder], stderr=log)
        time.sleep(2)

    def _wget_id(self, strain, locus, strain_id, files):
        '''Download the STRING ID mapping for one locus tag.'''
        detect_id = False
        if strain == strain_id["ptt"]:
            print("Retrieving STRING ID for {0} of {1} -- {2}".format(
                locus, strain_id["string"], strain_id["file"]))
            id_source = ("http://string-db.org/api/tsv/resolve?"
                         "identifier={0}&species={1}").format(
                             locus, strain_id["string"])
            self._run_wget(id_source,
                           os.path.join(files["id_list"], locus),
                           files["id_log"])
            detect_id = True
        return detect_id

    def _retrieve_id(self, strain_id, genes, files):
        # Resolve every detected gene to a STRING identifier.
        for gene in genes:
            detect_id = self._wget_id(gene["strain"], gene["locus_tag"],
                                      strain_id, files)
            if not detect_id:
                print("Error:there is no {0} in {1}".format(
                    gene, strain_id["file"]))

    def _get_prefer_name(self, row_a, strain_id, files, querys):
        '''Map a STRING item id (e.g. "taxid.locus") to its preferred name.'''
        prefername = ""
        filename = row_a.split(".")
        # Fetch the id list on demand unless an "all" query pre-fetched it.
        if (filename[1] not in os.listdir(
                files["id_list"])) and ("all" not in querys):
            self._wget_id(strain_id["ptt"], filename[1], strain_id, files)
        if filename[1] in os.listdir(files["id_list"]):
            id_h = open(os.path.join(files["id_list"], filename[1]), "r")
            for row_i in csv.reader(id_h, delimiter="\t"):
                if row_a == row_i[0]:
                    prefername = row_i[3]
            id_h.close()
        return prefername

    def _print_title(self, out, id_file, id_folder):
        # Write the interaction header line plus the column names.
        id_h = open(os.path.join(id_folder, id_file), "r")
        prefername = id_file
        for row_i in csv.reader(id_h, delimiter="\t"):
            prefername = row_i[3]
        id_h.close()
        out.write("Interaction of {0} | {1}\n".format(
            id_file, prefername))
        out.write("strain\titem_id_a\titem_id_b\tmode\taction\t"
                  "a_is_acting\tSTRING_action_score\t"
                  "pubmed_id\tpubmed_score\n")

    def _get_pubmed(self, row, strain_id, mode, actor, id_file,
                    first_output, ptt, files, paths, args_ppi):
        '''Query PIE for Pubmed support of one interaction and merge it.'''
        prefer1 = self._get_prefer_name(row[0], strain_id, files,
                                        args_ppi.querys)
        prefer2 = self._get_prefer_name(row[1], strain_id, files,
                                        args_ppi.querys)
        # Only proceed when both partners resolved to preferred names.
        if (len(prefer1) > 0) and (len(prefer2) > 0):
            if args_ppi.no_specific:
                # Strain-unspecific PIE query.
                pubmed_source = (
                    "http://www.ncbi.nlm.nih.gov/CBBresearch/"
                    "Wilbur/IRET/PIE/getppi.cgi?term={0}+{1}").format(
                        prefer1, prefer2)
                self._run_wget(pubmed_source,
                               self.tmp_files["nospecific"],
                               files["pubmed_log"])
            # Strain-specific query: spaces in the PIE term become '+'.
            strain_id["pie"] = "+".join(strain_id["pie"].split(" "))
            pubmed_source = (
                "http://www.ncbi.nlm.nih.gov/CBBresearch/Wilbur"
                "/IRET/PIE/getppi.cgi?term={0}+{1}+{2}").format(
                    prefer1, prefer2, strain_id["pie"])
            self._run_wget(pubmed_source, self.tmp_files["specific"],
                           files["pubmed_log"])
            # Rewrite the action row in place with merged mode/actor and
            # the preferred names before merging into the result tables.
            row[2] = mode
            row[4] = actor
            row[0] = prefer1
            row[1] = prefer2
            self._merge_information(
                first_output, self.tmp_files["specific"],
                files["all_specific"], files["best_specific"], row,
                args_ppi.score, id_file, files["id_list"], "specific",
                os.path.join(paths["all"], self.with_strain),
                os.path.join(paths["best"], self.with_strain), ptt)
            if args_ppi.no_specific:
                self._merge_information(
                    first_output, self.tmp_files["nospecific"],
                    files["all_nospecific"], files["best_nospecific"],
                    row, args_ppi.score, id_file, files["id_list"],
                    "nospecific",
                    os.path.join(paths["all"], self.without_strain),
                    os.path.join(paths["best"], self.without_strain), ptt)

    def _print_single_file(self, out_single, row_a, ptt, row):
        # row == "NA" marks an interaction with no Pubmed evidence.
        if row == "NA":
            out_single.write(
                "\t".join([ptt, "\t".join(row_a), "NA", "NA"]) + "\n")
        else:
            out_single.write(
                "\t".join([ptt, "\t".join(row_a), "\t".join(row)]) + "\n")

    def _merge_information(self, first_output, filename, out_all, out_best,
                           row_a, score, id_file, id_folder, file_type,
                           all_folder, best_folder, ptt):
        '''Merge one PIE result file into the per-pair, all and best tables.'''
        if os.path.getsize(filename) != 0:
            f_h = open(filename, "r")
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            out_best_single = open(
                os.path.join(best_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_title(out_best_single, id_file, id_folder)
            detect = False
            for row in csv.reader(f_h, delimiter="\t"):
                self._print_single_file(out_all_single, row_a, ptt, row)
                # Lazily write the shared-table header exactly once.
                if first_output["_".join([file_type, "all"])]:
                    first_output["_".join([file_type, "all"])] = False
                    self._print_title(out_all, id_file, id_folder)
                out_all.write("\t".join([ptt, "\t".join(row_a),
                                         "\t".join(row)]) + "\n")
                # Keep only rows at or above the score cutoff in "best".
                if (float(row[1]) >= score):
                    detect = True
                    self._print_single_file(out_best_single, row_a,
                                            ptt, row)
                    if first_output["_".join([file_type, "best"])]:
                        first_output["_".join([file_type, "best"])] = False
                        self._print_title(out_best, id_file, id_folder)
                    out_best.write("\t".join([ptt, "\t".join(row_a),
                                              "\t".join(row)]) + "\n")
            f_h.close()
            # Drop the per-pair best file when nothing passed the cutoff.
            if not detect:
                os.remove(os.path.join(
                    best_folder, ptt,
                    "_".join([row_a[0], row_a[1] + ".csv"])))
            out_all_single.close()
            out_best_single.close()
        else:
            # Empty PIE result: record the pair with NA evidence.
            out_all_single = open(
                os.path.join(all_folder, ptt,
                             "_".join([row_a[0], row_a[1] + ".csv"])), "w")
            self._print_title(out_all_single, id_file, id_folder)
            self._print_single_file(out_all_single, row_a, ptt, "NA")
            if first_output["_".join([file_type, "all"])]:
                first_output["_".join([file_type, "all"])] = False
                self._print_title(out_all, id_file, id_folder)
            out_all.write("\t".join([ptt, "\t".join(row_a),
                                     "NA", "NA"]) + "\n")
            out_all_single.close()

    def _detect_protein(self, strain_id, args_ppi):
        '''Collect the queried proteins from the .ptt annotation file.'''
        fh = open(os.path.join(args_ppi.ptts, strain_id["file"]), "r")
        genes = []
        for row in csv.reader(fh, delimiter="\t"):
            # Single-column rows like "name - 1..100" carry the strain name.
            if (len(row) == 1) and ("-" in row[0]) and (".." in row[0]):
                name = (row[0].split("-"))[0].strip().split(",")[0].strip()
            if ("all" in args_ppi.querys):
                if (len(row) > 1) and (row[0] != "Location"):
                    genes.append({"strain": name, "locus_tag": row[5]})
            else:
                # Match each query "strain:start:end:strand" to a ptt row.
                for query in args_ppi.querys:
                    datas = query.split(":")
                    strain = datas[0]
                    start = datas[1]
                    end = datas[2]
                    strand = datas[3]
                    if (len(row) > 1) and (
                            row[0] != "Location") and (
                            name == strain) and (
                            start == row[0].split("..")[0]) and (
                            end == row[0].split("..")[1]) and (
                            strand == row[1]):
                        genes.append({"strain": name,
                                      "locus_tag": row[5]})
        fh.close()
        return genes

    def _setup_nospecific(self, paths, strain_id, files):
        # Mirror the with_strain folder layout for strain-unspecific output.
        self._make_subfolder(paths["all"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["best"], self.without_strain,
                             strain_id["ptt"])
        self._make_subfolder(paths["fig"], self.without_strain,
                             strain_id["ptt"])
        filename_nostrain = "_".join([
            strain_id["file"].replace(".ptt", ""),
            self.without_strain + ".csv"])
        files["all_nospecific"] = open(
            os.path.join(paths["all"], filename_nostrain), "w")
        files["best_nospecific"] = open(
            os.path.join(paths["best"], filename_nostrain), "w")

    def _setup_folder_and_read_file(self, strain_id, pre_file,
                                    files, paths, args_ppi):
        '''Create the per-strain output tree and read the query proteins.'''
        if strain_id["file"].endswith(".ptt"):
            # Only rebuild the folder tree for a new ptt file.
            if strain_id["file"] != pre_file:
                self.helper.check_make_folder(
                    "_".join([self.tmp_id, strain_id["file"]]))
                paths["all"] = os.path.join(
                    self.all_result, strain_id["file"][:-4])
                paths["best"] = os.path.join(
                    self.best_result, strain_id["file"][:-4])
                paths["fig"] = os.path.join(
                    self.fig, strain_id["file"][:-4])
                self.helper.check_make_folder(os.path.join(
                    self.all_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(os.path.join(
                    self.best_result, strain_id["file"][:-4]))
                self.helper.check_make_folder(os.path.join(
                    self.fig, strain_id["file"][:-4]))
                self._make_subfolder(paths["all"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["best"], self.with_strain,
                                     strain_id["ptt"])
                self._make_subfolder(paths["fig"], self.with_strain,
                                     strain_id["ptt"])
                filename_strain = "_".join([
                    strain_id["file"].replace(".ptt", ""),
                    self.with_strain + ".csv"])
                files["all_specific"] = open(
                    os.path.join(paths["all"], filename_strain), "w")
                files["best_specific"] = open(
                    os.path.join(paths["best"], filename_strain), "w")
                if args_ppi.no_specific:
                    self._setup_nospecific(paths, strain_id, files)
                files["id_list"] = "_".join([self.tmp_id,
                                             strain_id["file"]])
                files["id_log"] = open(os.path.join(
                    files["id_list"], self.tmp_files["log"]), "w")
                files["action_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["action"]), "w")
                files["pubmed_log"] = open(os.path.join(
                    args_ppi.out_folder, self.tmp_files["pubmed"]), "w")
                pre_file = strain_id["file"]
                if strain_id["file"] in os.listdir(args_ppi.ptts):
                    genes = self._detect_protein(strain_id, args_ppi)
            else:
                # Same ptt file as before: only ensure subfolders exist.
                # NOTE(review): `genes` is not assigned on this path (and
                # when the ptt file is missing above) — the final return
                # would raise NameError; confirm callers never hit this.
                self._make_folder_no_exist(
                    os.path.join(paths["all"], self.with_strain),
                    strain_id["ptt"])
                self._make_folder_no_exist(
                    os.path.join(paths["best"], self.with_strain),
                    strain_id["ptt"])
                if args_ppi.no_specific:
                    self._make_folder_no_exist(
                        os.path.join(paths["all"], self.without_strain),
                        strain_id["ptt"])
                    self._make_folder_no_exist(
                        os.path.join(paths["best"], self.without_strain),
                        strain_id["ptt"])
        else:
            print("Error:wrong .ptt file!!")
            sys.exit()
        return genes

    def _wget_actions(self, files, id_file, strain_id, out_folder):
        '''Download the STRING action (interaction) table for one id.'''
        detect = False
        t_h = open(os.path.join(files["id_list"], id_file), "r")
        print("Retrieving STRING actions for {0} of {1} -- {2}".format(
            id_file, strain_id["string"], strain_id["file"]))
        for row in csv.reader(t_h, delimiter="\t"):
            if row[0].startswith("stringId"):
                # Skip the header line.
                continue
            else:
                detect = True
                if row[1] == strain_id["string"]:
                    action_source = (
                        "http://string-db.org/api/tsv/actions?"
                        "identifier={0}&species={1}").format(
                            row[0], row[1])
                    self._run_wget(action_source,
                                   self.tmp_files["wget_action"],
                                   files["action_log"])
                    break
        t_h.close()
        if not detect:
            print("Warning: " + id_file + " can not be found in STRING...")
        return detect

    def _retrieve_actions(self, files, strain_id, paths, args_ppi):
        '''get the interaction of proteins'''
        for id_file in os.listdir(files["id_list"]):
            if id_file != self.tmp_files["log"]:
                detect_id = self._wget_actions(files, id_file, strain_id,
                                               args_ppi.out_folder)
                if detect_id:
                    a_h = open(self.tmp_files["wget_action"], "r")
                    pre_row = []
                    first = True
                    detect = False
                    # Tracks whether the shared-table headers were written.
                    first_output = {"specific_all": True,
                                    "specific_best": True,
                                    "nospecific_all": True,
                                    "nospecific_best": True}
                    print("Retrieving Pubmed for {0} of {1} -- {2}".format(
                        id_file, strain_id["string"], strain_id["file"]))
                    for row_a in csv.reader(a_h, delimiter="\t"):
                        if row_a == []:
                            print("No interaction can be detected...")
                            break
                        if row_a[0].startswith("item_id_a"):
                            continue
                        else:
                            detect = True
                            if first:
                                first = False
                                mode = row_a[2]
                                actor = row_a[4]
                            else:
                                # New pair: flush the accumulated modes of
                                # the previous pair, then restart.
                                if (row_a[0] != pre_row[0]) or (
                                        row_a[1] != pre_row[1]):
                                    self._get_pubmed(
                                        pre_row, strain_id, mode, actor,
                                        id_file, first_output,
                                        strain_id["ptt"], files, paths,
                                        args_ppi)
                                    mode = row_a[2]
                                    actor = row_a[4]
                                else:
                                    # Same pair: accumulate mode/actor.
                                    mode = mode + ";" + row_a[2]
                                    actor = actor + ";" + row_a[4]
                            pre_row = row_a
                    if detect:
                        detect = False
                        # Flush the final pair.
                        self._get_pubmed(row_a, strain_id, mode, actor,
                                         id_file, first_output,
                                         strain_id["ptt"], files, paths,
                                         args_ppi)
                if detect_id:
                    a_h.close()

    def _plot(self, args_ppi, files):
        '''Close the result tables and plot one network per strain.'''
        if args_ppi.no_specific:
            files["all_nospecific"].close()
            files["best_nospecific"].close()
        files["all_specific"].close()
        files["best_specific"].close()
        for folder in os.listdir(self.all_result):
            if folder in os.listdir(self.fig):
                print("plotting {0}".format(folder))
                plot_ppi(os.path.join(
                             self.all_result, folder,
                             "_".join([folder, self.with_strain + ".csv"])),
                         args_ppi.score,
                         os.path.join(self.fig, folder, self.with_strain),
                         args_ppi.size)
                if args_ppi.no_specific:
                    plot_ppi(os.path.join(
                                 self.all_result, folder,
                                 "_".join([folder,
                                           self.without_strain + ".csv"])),
                             args_ppi.score,
                             os.path.join(self.fig, folder,
                                          self.without_strain),
                             args_ppi.size)

    def _remove_tmps(self, args_ppi):
        # Delete scratch files/folders and the generated PPI_*.ptt files.
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "file")
        self.helper.remove_all_content(os.path.join(args_ppi.out_folder),
                                       "tmp", "dir")
        for file_ in os.listdir(args_ppi.ptts):
            if file_.startswith("PPI_"):
                os.remove(os.path.join(args_ppi.ptts, file_))

    def retrieve_ppi_network(self, args_ppi):
        '''retrieve PPI from STRING with PIE and draw network'''
        strain_ids = []
        paths = {}
        files = {}
        # Convert each input gff to ptt/rnt and collect the strain spec
        # given as "gff:ptt_name:string_id:pie_term".
        for strain in args_ppi.strains:
            datas = strain.split(":")
            ptt_file = "PPI_" + datas[0].replace(".gff", ".ptt")
            rnt_file = "PPI_" + datas[0].replace(".gff", ".rnt")
            self.converter.convert_gff2rntptt(
                os.path.join(args_ppi.ptts, datas[0]), "0",
                os.path.join(args_ppi.ptts, ptt_file),
                os.path.join(args_ppi.ptts, rnt_file), None, None)
            strain_ids.append({"file": ptt_file,
                               "ptt": datas[1],
                               "string": datas[2],
                               "pie": datas[3]})
        strain_ids.sort(key=lambda x: x["file"])
        pre_file = ""
        for strain_id in strain_ids:
            genes = self._setup_folder_and_read_file(strain_id, pre_file,
                                                     files, paths, args_ppi)
            # Normalize the user's STRING species id to the taxon id in
            # column 0 of the species table (it may be given as a name).
            # NOTE(review): s_h is never closed and the loop re-reads the
            # species file for every strain.
            s_h = open(args_ppi.species, "r")
            for row in csv.reader(s_h, delimiter="\t"):
                if row[0] != "##":
                    if row[0] == strain_id["string"]:
                        break
                    elif row[2] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
                    elif row[3] == strain_id["string"]:
                        strain_id["string"] = row[0]
                        break
            self._retrieve_id(strain_id, genes, files)
            self._retrieve_actions(files, strain_id, paths, args_ppi)
        self._plot(args_ppi, files)
        self._remove_tmps(args_ppi)
class MEME(object):
    '''detection of promoter'''

    def __init__(self, args_pro):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder,
                                      "fasta_classes")
        # Scratch folder in the current working directory.
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # Per-TSS-class fasta files; "all"/"all_no_orph" are bare names
        # (joined with tmp_folder when used).
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder,
                                               "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        # Clear any previous run's output for this motif folder.
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename,
                           str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                # MPI parallel mode.
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                meme_folder, folder),
                os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            # Trailing "n" is glam2's alphabet argument (nucleotide).
            log.write(" ".join([args_pro.glam2_path,
                                "-O", os.path.join(glam_folder, folder),
                                "-w", str(width), "-b", str(width),
                                "-r", str(args_pro.num_motif),
                                "-n", str(args_pro.end_run), "n",
                                os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder),
                  "-w", str(width), "-b", str(width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME with range of width'''
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]),
                           "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                meme_folder, folder),
                os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            log.write(" ".join([args_pro.glam2_path,
                                "-O", os.path.join(glam_folder, folder),
                                "-a", str(min_width), "-b", str(max_width),
                                "-r", str(args_pro.num_motif),
                                "-n", str(args_pro.end_run), "n",
                                os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path,
                  "-O", os.path.join(glam_folder, folder),
                  "-a", str(min_width), "-b", str(max_width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        # NOTE(review): if no file matches the prefix, this returns the
        # last file listed (or raises if the folder is empty).
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        '''Merge the per-class fastas, dedupe, and move them into place.'''
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa = pri+sec+inter+anti; tmp_all additionally has orphan.
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"], "_".join([
            out_prefix, "allgenome_primary.fa"]))
        shutil.move(self.fastas["sec"], "_".join([
            out_prefix, "allgenome_secondary.fa"]))
        shutil.move(self.fastas["inter"], "_".join([
            out_prefix, "allgenome_internal.fa"]))
        shutil.move(self.fastas["anti"], "_".join([
            out_prefix, "allgenome_antisense.fa"]))
        shutil.move(self.fastas["orph"], "_".join([
            out_prefix, "allgenome_orphan.fa"]))
        shutil.move(all_type, "_".join([
            out_prefix, "allgenome_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allgenome_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        '''Split the allgenome fastas into one file per strain.'''
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            # Header format: >class_pos_strainname...
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if (pre_strain != strain):
                                num_strain += 1
                                filename = fasta.split("allgenome")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                # A single-strain file duplicates allgenome — drop it.
                if num_strain == 1:
                    os.remove(os.path.join(
                        input_path,
                        "".join([filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        '''Run MEME and/or GLAM2 for each selected TSS class and width.'''
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(
                    os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(
                    os.path.join(out_path, "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(
                    os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(
                    os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                # Only run on the TSS classes the user selected.
                if (names[-1] in input_fastas) or (
                        ("_".join(names[-2:]) == "all_types") and (
                            "all_types" in input_fastas)) or (
                        ("_".join(names[-2:]) == "without_orphan") and (
                            "without_orphan" in input_fastas)):
                    for width in args_pro.widths:
                        print("Computing promoters of {0} - {1}".format(
                            fasta, width))
                        log.write("Computing promoters of {0} - "
                                  "length {1}.\n".format(fasta, width))
                        if "-" in width:
                            # "min-max" range → glam2-style width range.
                            self._run_small_motif(input_path, out_path,
                                                  filename, fasta, width,
                                                  args_pro, log)
                        else:
                            self._run_normal_motif(input_path, out_path,
                                                   filename, fasta, width,
                                                   args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are "
                      "generated and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter'''
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        else:
            # Non-source mode: TSSs were re-classified into TSS_classes.
            for tss in os.listdir(os.path.join(
                    args_pro.output_folder, "TSS_classes")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        # Remove all temporary folders from this run.
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        if "tmp" in os.listdir(os.getcwd()):
            shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (
                    program.lower() == "meme"):
                for folder in os.listdir(os.path.join(
                        output_folder, strain, "MEME")):
                    csv_file = os.path.join(output_folder, strain,
                                            "MEME", folder, "meme.csv")
                    gen_promoter_table(
                        os.path.join(output_folder, strain, "MEME",
                                     folder, "meme.txt"),
                        csv_file, tss_file, "meme")
                    log.write("\t" + csv_file + "\n")
            if (program.lower() == "both") or (
                    program.lower() == "glam2"):
                for folder in os.listdir(os.path.join(
                        output_folder, strain, "GLAM2")):
                    csv_file = os.path.join(output_folder, strain,
                                            "GLAM2", folder, "glam2.csv")
                    gen_promoter_table(
                        os.path.join(output_folder, strain, "GLAM2",
                                     folder, "glam2.txt"),
                        csv_file, tss_file, "glam2")
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            # Without source info the TSSs must first be classified
            # against the annotation.
            if (args_pro.gffs is None):
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder,
                                      "TSS_classes"))
            print("Classifying TSSs and extracting sequence "
                  "of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        # Map the numeric --use_tss codes to TSS-class fasta names.
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) == 1:
                input_fastas.append("all_types")
            elif int(tss) == 2:
                input_fastas.append("primary")
            elif int(tss) == 3:
                input_fastas.append("secondary")
            elif int(tss) == 4:
                input_fastas.append("internal")
            elif int(tss) == 5:
                input_fastas.append("antisense")
            elif int(tss) == 6:
                input_fastas.append("orphan")
            elif int(tss) == 7:
                input_fastas.append("without_orphan")
            else:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
        return input_fastas

    def run_meme(self, args_pro, log):
        '''Entry point: extract TSS upstream sequences and run the
        motif search.'''
        # Drop leftovers of a previous combined run.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(
                    self.out_fasta, "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''

    def __init__(self, args_tar):
        # Helper/parsing utilities shared by all steps.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        # Output layout under args_tar.out_folder.
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        # "tmp" subfolders are produced by Multiparser.parser_* calls.
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # Temporary file name stems; "all_fa"/"all_txt" are shell globs used
        # with "rm" in _run_rnaup.
        self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        # Validate that every .gff file in the folder has unique attributes.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path):
        '''Run RNAplfold on the sRNA or target fasta of one genome.

        RNAplfold writes into the current working directory, so we chdir
        into out_path for the duration of the call and feed the fasta via
        shell stdin redirection ("<").
        '''
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([rnaplfold_path,
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        if file_type == "sRNA":
            # sRNA fasta carries the tmp_srna_target_ prefix from
            # _sort_srna_fasta; target fasta does not.
            os.system("<".join([command,
                                os.path.join(current, seq_path,
                                             "_".join([self.tmps["tmp"], prefix,
                                                       file_type + ".fa"]))]))
        else:
            os.system("<".join([command,
                                os.path.join(current, seq_path,
                                             "_".join([prefix,
                                                       file_type + ".fa"]))]))
        os.chdir(current)

    def _wait_process(self, processes):
        # Block until all child processes finish, close their pipes and
        # make sure they are gone (kill is a no-op on an exited process
        # except for the OSError we swallow).
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        '''Rewrite the sRNA fasta sorted by sequence length (ascending).

        Assumes single-line sequences (one header line, one sequence line).
        Only the part of the header before the first "|" is kept.
        '''
        out = open(os.path.join(path,
                   "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line,
                                  "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        # Concatenate all sequence lines of the fasta into one string,
        # ignoring header lines.
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        '''Extract only the queried sRNAs from the sRNA gff.

        Each query has the form "seq_id:start:end:strand".  Exits the
        program if any query matches no gff entry.
        '''
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                        entry.start == srna["start"]) and (
                        entry.end == srna["end"]):
                    detect = True
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                        id_, entry.seq_id, entry.start,
                        entry.end, entry.strand, gene))
                    num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        '''Generate sRNA and target fasta files for every genome.

        Appends each detected genome prefix to `prefixs` (mutated in
        place).  Target fastas are additionally split into chunks of 100
        sequences (prefix_target_N.fa) for parallel RNAplex runs.
        '''
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    # All sRNAs requested: dump every sequence.
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    # Specific queries: start from an empty file because
                    # _get_specific_seq appends.
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                            self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                # Split prefix_target.fa into 100-sequence chunks.
                sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]),
                               "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
                            num += 1
                        if (num == 100):
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open("_".join([sub_prefix,
                                           str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar):
        '''Launch RNAplex on every target chunk of one genome.

        Runs up to args_tar.core_plex processes concurrently; returns the
        number of chunk result files written (prefix_RNAplex_N.txt).
        '''
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"])), "w")
                num_process += 1
                p = Popen([args_tar.rnaplex_path, "-q", os.path.join(
                    self.srna_seq_path, "_".join([
                        self.tmps["tmp"], prefix, "sRNA.fa"])),
                    "-t", os.path.join(self.target_seq_path, seq),
                    "-l", str(args_tar.inter_length),
                    "-e", str(args_tar.energy),
                    "-z", str(args_tar.duplex_dist),
                    "-a", rnaplfold_folder], stdout=out_rnaplex)
                processes.append(p)
                # Throttle: wait for the current batch before spawning more.
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        return num_process

    def _run_rnaplex_pipeline(self):
        # (no-op placeholder intentionally absent — see _rna_plex below)
        pass

    def _rna_plex(self, prefixs, args_tar):
        '''Full RNAplex workflow per genome: RNAplfold accessibility
        profiles, parallel RNAplex runs, merge of the chunk results, and
        cleanup of intermediates.'''
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            os.mkdir(rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_folder)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            # Remove a stale merged file from a previous run before merging.
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            self.helper.remove_all_content(os.path.join(
                self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar):
        '''Launch one RNAup process per prepared tmp fasta (1..num_up),
        wait for them, merge their outputs/logs, and delete the temp
        files via shell globs.'''
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            p = Popen([args_tar.rnaup_path, "-u",
                       str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        # Append every per-process RNAup output/log into the final files.
        for index in range(1, num_up + 1):
            self.helper.merge_file(os.path.join(
                out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(os.path.join(
                out_folder, "".join([self.tmps["log"], str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run.

        Parses the previous result file, drops the last (possibly
        incomplete) sRNA record, rewrites the result without it, and
        returns the list of sRNA names already done so they are skipped.
        '''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        # Last record may be truncated by an interrupted run; redo it.
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar):
        '''RNAup workflow per genome: for each sRNA build a tmp fasta of
        sRNA + all targets, batch the RNAup runs core_up at a time.

        Assumes single-line sequences in the sorted sRNA fasta (each
        record is exactly a header line plus one sequence line).
        '''
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    # Resume mode: skip sRNAs already finished.
                    srnas = self._get_continue(out_rnaup)
            with open(os.path.join(
                    self.srna_seq_path, "_".join([
                        self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(
                            args_tar.out_folder, "".join([
                                self.tmps["tmp"], str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            # Append all targets after the sRNA sequence;
                            # RNAup (--interaction_first) reads them from
                            # this combined fasta.
                            self.helper.merge_file(
                                os.path.join(self.target_seq_path,
                                             "_".join([prefix, "target.fa"])),
                                os.path.join(
                                    args_tar.out_folder,
                                    "".join([self.tmps["tmp"],
                                             str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, out_rnaup,
                                                out_log, args_tar)
                                processes = []
                                num_up = 0
            # Flush the final (partial) batch.
            self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar)

    def _merge_rnaplex_rnaup(self, prefixs, args_tar):
        '''merge the result of RNAup and RNAplex'''
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            self.helper.check_make_folder(os.path.join(
                self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
            if (args_tar.program == "both") or (args_tar.program == "RNAup"):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
            # merge_srna_target tolerates None for the program not run.
            merge_srna_target(
                rnaplex_file, rnaup_file, args_tar, out_rnaplex, out_rnaup,
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "merge.csv"])),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "overlap.csv"])),
                os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.gff_path, prefix + ".gff"))

    def run_srna_target_prediction(self, args_tar):
        '''Entry point: validate inputs, generate sequences, run the
        selected prediction program(s), merge/rank, and clean up.'''
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
            self._rna_plex(prefixs, args_tar)
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
#        if (args_tar.program == "RNAplex") or (
#                args_tar.program == "both"):
#            for strain in os.listdir(os.path.join(
#                    args_tar.out_folder, "RNAplex_results")):
#                shutil.rmtree(os.path.join(args_tar.out_folder,
#                                           "RNAplex_results",
#                                           strain, "RNAplfold"))
        if (args_tar.program == "both") or (args_tar.program == "RNAup"):
            self._rnaup(prefixs, args_tar)
        self._merge_rnaplex_rnaup(prefixs, args_tar)
        # Remove every temp dir/file created under the output folder.
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
class RATT(object):
    '''annotation transfer (via the RATT tool)'''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # Either gbk files (converted to embl below) or ready embl files
        # provide the reference annotation for RATT.
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        # Temporary working locations; "tar"/"ref" are Multiparser tmp dirs.
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        '''Generate .ptt/.rnt files next to each transferred .gff,
        using the matching target fasta when one exists.'''
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix,
                                                     None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        '''Move the merged outputs into gff_outfolder and delete every
        temporary folder.  out_gbk is currently unused here.'''
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        '''Convert one RATT "*.final.embl" result to gff, fix its format
        and copy it into gff_outfolder.  Appends the filename to files.'''
        name = ratt_result.split(".")
        # RATT result names look like <tag>.<genome...>.final.embl;
        # keep the middle part as the genome name.
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split multi-record gbk files into one gbk per LOCUS/VERSION
        under self.gbk; returns that folder.'''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        # NOTE(review): str.find() returns an index, so this
                        # is truthy whenever the LOCUS name is not a prefix
                        # of the VERSION name — looks like it intends "if
                        # the names differ, prefer VERSION"; confirm.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (
                    embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write("--related_gbk_files is assigned, but not gbk "
                      "files are detected.\n"
                      "The gbk file names need to be ended at .gbk, "
                      ".gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write("Running converter.py to convert gbk file "
                      "to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl + " is generated and the embl "
                      "files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        '''Run RATT for one reference:target pair; exits if the pair
        names do not match the prepared fasta/embl files.'''
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["tar"], tar + ".fa"))) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be "
                      "the same as the strain names in fasta, genbank, "
                      "or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([args_ratt.ratt_path, self.embl,
                            os.path.join(self.tmp_files["tar"], tar + ".fa"),
                            args_ratt.element, args_ratt.transfer_type,
                            os.path.join(self.tmp_files["ref"],
                                         ref + ".fa")]) + "\n")
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        '''Run RATT on every pair and sort the files RATT drops into the
        current working directory (os.listdir() with no argument).'''
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    # RATT side products we do not keep.
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt, log):
        '''Entry point: prepare inputs, run RATT, convert results to
        gff/ptt/rnt and merge them per target genome.'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # BUG FIX: was args_ratt.ref_gbki (nonexistent attribute,
            # AttributeError on the gbk path); __init__ reads ref_gbk.
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt format.\n".format(
                              data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                        self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
class SNPCalling(object):
    '''detection of SNP'''

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # Two analysis modes share the same machinery but write to
        # different output subtrees.
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {"table": os.path.join(
            args_snp.out_folder, file_type, "SNP_table"),
            "raw": os.path.join(args_snp.out_folder, file_type,
                                "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
            "depth": os.path.join(args_snp.out_folder, "tmp_depth")}
        # Drop leftovers of an interrupted previous run.
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam"),
                     "bams": []}
        self.header = os.path.join(args_snp.out_folder, "header")
        # BAQ handling variants supported by samtools mpileup.
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_, prefix,
                        bam_number, table_path, args_snp):
        '''Post-process one vcf: build SNP tables, mutated sequences and
        statistics, then collect the plots into stat_path.'''
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_prefix = os.path.join(self.stat_path, "_".join([
            "stat", "_".join([prefix, self.baqs[type_]]), "SNP"]))
        snp_detect(fasta, snp, self.outputs["depth"],
                   out_table_prefix, os.path.join(seq_path, prefix),
                   bam_number, stat_prefix, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _get_para(self, args_snp):
        # Select bcftools calling model: -c (consensus) or -m (multiallelic).
        bams = self.bams["sort"]
        if args_snp.caller == "c":
            bcf_para = "-vcO"
        else:
            bcf_para = "-vmO"
        return bams, bcf_para

    def _run_tools(self, fasta_file, out_raw_prefix, type_, args_snp):
        '''Run samtools mpileup (with/without/extended BAQ) into tmp_bcf,
        then bcftools call to produce the raw vcf; returns its path.'''
        bams, bcf_para = self._get_para(args_snp)
        if type_ == "with":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP"]
        elif type_ == "without":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-B"]
        elif type_ == "extend":
            command = [args_snp.samtools_path, "mpileup", "-t", "DP", "-E"]
        if args_snp.rg:
            command = command + ["-ugf", fasta_file, bams]
        else:
            command = command + ["--ignore-RG", "-ugf", fasta_file, bams]
        # Shell redirection writes the bcf stream to the tmp file.
        os.system(" ".join(command) + ">" + self.outputs["tmp"])
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        # NOTE(review): --ploidy is only passed when chrom == "1";
        # presumably "2" relies on the bcftools default — confirm.
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                  self.outputs["tmp"], bcf_para, "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], bcf_para, "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        # One BAQ variant end-to-end: call SNPs, then build tables/seqs.
        out_vcf = self._run_tools(fasta_file, file_prefixs["raw_prefix"],
                                  type_, args_snp)
        self.helper.check_make_folder(
            os.path.join(self.seq_path, self.baqs[type_], prefix))
        self._transcript_snp(
            fasta_file, out_vcf,
            "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
            type_, prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        '''Dispatch each requested BAQ mode; exits on an unknown name.'''
        for index in args_snp.program:
            if index == "with_BAQ":
                type_ = "with"
                print("Running SNP calling with BAQ")
            elif index == "without_BAQ":
                type_ = "without"
                print("Running SNP calling without BAQ")
            elif index == "extend_BAQ":
                print("Running SNP calling extend BAQ")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign "
                      "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\"!")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        # Strip the fasta extension; returns (matched, prefix).
        # NOTE(review): `prefix` is unbound (NameError on return) for any
        # other suffix — callers appear to pre-filter, but confirm.
        detect = False
        if fasta.endswith(".fa"):
            prefix = fasta[:-3]
            detect = True
        elif fasta.endswith(".fna"):
            prefix = fasta[:-4]
            detect = True
        elif fasta.endswith(".fasta"):
            prefix = fasta[:-6]
            detect = True
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        '''Run "samtools merge" or "samtools sort" on the whole-reads BAM.'''
        if sub_command == "merge":
            command = (" ".join([samtools_path, sub_command,
                       self.bams["whole"], bam_file]))
        elif sub_command == "sort":
            command = (" ".join([samtools_path, sub_command,
                       "-o", bam_file, self.bams["whole"]]))
        os.system(command)
        self.bams["bams"].append(bam_file.replace(".bam", "_sort.bam"))

    def _merge_bams(self, args_snp):
        '''Collect all BAMs (glob patterns), merge+sort them into one
        file, index it and dump per-base depth; returns the BAM count.'''
        bams = []
        # num_normal/num_frag are currently unused.
        num_normal = 0
        num_frag = 0
        if (args_snp.bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            num_bam = 0
            for files in args_snp.bams:
                for bam in glob(files):
                    bams.append(bam)
                    num_bam += 1
        if num_bam <= 1:
            # Single input: no merge needed, just copy then sort.
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sorting BAM file now")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        else:
            print("Merging BAM files now")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
            print("Sorting BAM file now")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        out_depth = open(self.outputs["depth"], "w")
        call([args_snp.samtools_path, "index", self.bams["sort"]])
        call([args_snp.samtools_path, "depth", self.bams["sort"]],
             stdout=out_depth)
        return num_bam

    def _modify_header(self, fastas):
        # Normalize fasta headers so they match the BAM sequence names.
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
               fasta.endswith("fa") or \
               fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path, bam, seq_names):
        '''Append the @SQ sequence names of the BAM header to seq_names.'''
        command = " ".join([samtools_path, "view", "-H", bam])
        os.system(">".join([command, self.header]))
        fh = open(self.header, "r")
        for row in csv.reader(fh, delimiter="\t"):
            if row[0] == "@SQ":
                # @SQ lines look like: @SQ  SN:<name>  LN:<len>
                seq_names.append(row[1].split(":")[1])
        fh.close()

    def _get_genome_name(self, args_snp):
        # Genome names come from the sorted BAM header.
        seq_names = []
        self._get_header(args_snp.samtools_path,
                         self.bams["sort"], seq_names)
        return seq_names

    def _remove_bams(self):
        # Delete all whole-BAM intermediates, indices, header and depth.
        if os.path.exists(self.bams["whole"]):
            os.remove(self.bams["whole"])
        if os.path.exists(self.bams["whole"] + ".bai"):
            os.remove(self.bams["whole"] + ".bai")
        if os.path.exists(self.bams["sort"]):
            os.remove(self.bams["sort"])
        if os.path.exists(self.bams["sort"] + ".bai"):
            os.remove(self.bams["sort"] + ".bai")
        if os.path.exists(self.header):
            os.remove(self.header)
        os.remove(self.outputs["depth"])

    def run_snp_calling(self, args_snp):
        '''Entry point: prepare fastas and BAMs, then run the selected
        BAQ program(s) for every genome present in the BAM header.'''
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp)
        if ("with_BAQ" not in args_snp.program) and (
                "without_BAQ" not in args_snp.program) and (
                "extend_BAQ" not in args_snp.program):
            print("Error: Please assign a correct programs: "
                  "\"with_BAQ\", \"without_BAQ\", \"extend_BAQ\".")
            sys.exit()
        else:
            for fasta in os.listdir(self.fasta_path):
                # Only process fastas whose name appears in the BAM header.
                if (fasta.split(".f")[0] in seq_names):
                    fasta_datas = self._detect_fasta(fasta)
                    detect = fasta_datas[0]
                    prefix = fasta_datas[1]
                    if detect:
                        detect = False
                        print("Computing {0} now".format(fasta))
                        self.helper.check_make_folder(
                            os.path.join(self.outputs["table"], prefix))
                        self.helper.check_make_folder(
                            os.path.join(self.outputs["raw"], prefix))
                        file_prefixs = {"raw_prefix": os.path.join(
                            self.outputs["raw"], prefix, prefix),
                            "table_prefix": os.path.join(
                                self.outputs["table"], prefix, prefix)}
                        fasta_file = os.path.join(self.fasta_path, fasta)
                        table_path = os.path.join(self.outputs["table"],
                                                  prefix)
                        self._run_program(fasta_file, file_prefixs, prefix,
                                          bam_number, table_path, args_snp)
                        os.remove(self.outputs["tmp"])
        self.helper.remove_tmp_dir(args_snp.fastas)
        self._remove_bams()
class Multiparser(object):
    """Split multi-record fasta/gff/wig files per genome and merge them back.

    The ``parser_*`` methods split a multi-record input file into one file
    per sequence, stored both in a ``tmp`` sub-folder and in one
    ``<file>_folder`` per input file.  The ``combine_*`` methods do the
    reverse: they merge per-sequence target files back into one file per
    reference prefix.

    Fixes over the previous revision: the trailing ``out.close()`` /
    ``out_t.close()`` calls in ``parser_fasta``, ``parser_gff`` and
    ``parser_wig`` are now guarded, so an input folder with no matching
    file no longer raises ``AttributeError`` on ``None``.  The inline
    prefix-derivation code in ``combine_fasta`` / ``combine_gff`` now
    calls :meth:`get_prefix` instead of duplicating it.
    """

    def __init__(self):
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # temporary filenames used while merging
        self.tmp_fa = "tmp.fa"
        self.tmp_gff = "tmp.gff"
        self.tmp_wig_forward = "tmp_forward.wig"
        self.tmp_wig_reverse = "tmp_reverse.wig"

    def combine_fasta(self, ref_folder, tar_folder, ref_feature):
        """Merge per-sequence ``.fa`` files in tar_folder by reference prefix.

        ref_folder contains one ``<name>_folder`` per reference file; every
        target fasta whose basename matches a per-sequence file of that
        reference is appended into ``<prefix>.fa``.
        """
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                # derive the output prefix from the reference folder name
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging fasta file of " + prefix)
                # collect the per-sequence basenames of this reference
                for file_ in os.listdir("/".join([ref_folder, folder])):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    if tar.endswith(".fa") or \
                       tar.endswith(".fna") or \
                       tar.endswith(".fasta"):
                        filename = ".".join((tar.split("."))[:-1])
                        for file_ in files:
                            if filename == file_:
                                self.helper.merge_file(
                                    os.path.join(tar_folder, tar),
                                    os.path.join(tar_folder, self.tmp_fa))
                                change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_fa),
                                os.path.join(tar_merge, prefix + ".fa"))
        self.helper.remove_all_content(tar_folder, ".fa", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def get_prefix(self, folder, ref_feature):
        """Strip the "_folder" suffix, extension and feature tag from folder.

        ref_feature is "" (plain 4-char extension), "_fasta" (strip
        .fa/.fna/.fasta) or any "_<feature>" tag embedded in the name.
        """
        datas = folder.split("_folder")
        if ref_feature == "":
            prefix = datas[0][:-4]
        elif ref_feature == "_fasta":
            # NOTE(review): if the name ends with none of .fa/.fna/.fasta,
            # prefix stays unassigned and this raises UnboundLocalError --
            # callers presumably always pass fasta-named folders.
            if datas[0].endswith(".fa"):
                prefix = datas[0][:-3]
            elif datas[0].endswith(".fna"):
                prefix = datas[0][:-4]
            elif datas[0].endswith(".fasta"):
                prefix = datas[0][:-6]
        else:
            datas = datas[0][:-4]
            datas = datas.split(ref_feature)
            prefix = datas[0]
        return prefix

    def combine_wig(self, ref_folder, tar_folder, ref_feature, libs):
        """Merge per-sequence wig files into one forward and one reverse
        wig per reference prefix, using libs to decide the strand."""
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change_f = False
        change_r = False
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging wig file of " + prefix)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    # target wigs are named <track>_STRAIN_<seq>.wig
                    filename = tar.split("_STRAIN_")
                    for file_ in files:
                        if (tar.endswith(".wig")) and (
                                file_ == filename[-1][:-4]):
                            for lib in libs:
                                if (filename[0] in lib) and (
                                        lib[-1] == "+"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(
                                            tar_folder,
                                            self.tmp_wig_forward))
                                    change_f = True
                                elif (filename[0] in lib) and (
                                        lib[-1] == "-"):
                                    self.helper.merge_file(
                                        os.path.join(tar_folder, tar),
                                        os.path.join(
                                            tar_folder,
                                            self.tmp_wig_reverse))
                                    change_r = True
                # only publish when both strands were collected
                if change_f and change_r:
                    change_f = False
                    change_r = False
                    shutil.move(os.path.join(tar_folder,
                                             self.tmp_wig_forward),
                                os.path.join(tar_merge,
                                             prefix + "_forward.wig"))
                    shutil.move(os.path.join(tar_folder,
                                             self.tmp_wig_reverse),
                                os.path.join(tar_merge,
                                             prefix + "_reverse.wig"))
        self.helper.remove_all_content(tar_folder, ".wig", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def combine_gff(self, ref_folder, tar_folder, ref_feature, tar_feature):
        """Merge per-sequence gff files in tar_folder by reference prefix,
        matching only targets tagged with tar_feature."""
        tar_merge = os.path.join(tar_folder, "merge_tmp")
        change = False
        if tar_feature is None:
            tar_feature = ""
        else:
            tar_feature = "_" + tar_feature
        if ref_feature is None:
            ref_feature = ""
        else:
            ref_feature = "_" + ref_feature
        self.helper.check_make_folder(tar_merge)
        for folder in os.listdir(ref_folder):
            files = []
            if "_folder" in folder:
                prefix = self.get_prefix(folder, ref_feature)
                print("Merging gff file of " + prefix + tar_feature)
                for file_ in os.listdir(os.path.join(ref_folder, folder)):
                    if ref_feature == "":
                        files.append(file_[:-4])
                    elif ref_feature == "_fasta":
                        files.append(file_[:-3])
                    else:
                        filename = file_.split(ref_feature)
                        files.append(filename[0])
                for tar in os.listdir(tar_folder):
                    for file_ in files:
                        if (".gff" in tar) and (
                                file_ + tar_feature == tar[:-4]):
                            self.helper.merge_file(
                                os.path.join(tar_folder, tar),
                                os.path.join(tar_folder, self.tmp_gff))
                            change = True
                if change:
                    change = False
                    shutil.move(os.path.join(tar_folder, self.tmp_gff),
                                os.path.join(
                                    tar_merge,
                                    prefix + tar_feature + ".gff"))
        self.helper.remove_all_content(tar_folder, ".gff", "file")
        self.helper.move_all_content(tar_merge, tar_folder, None)
        shutil.rmtree(tar_merge)

    def parser_fasta(self, fastas):
        """Split every multi-record fasta in fastas into one file per
        sequence (written to both "<file>_folder" and "tmp")."""
        par_tmp = os.path.join(fastas, "tmp")
        first = True
        out = None
        out_t = None
        # normalize headers before splitting
        for fasta in os.listdir(fastas):
            if (fasta.endswith("fasta") or
                    fasta.endswith("fa") or
                    fasta.endswith("fna")):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))
        self.helper.check_make_folder(par_tmp)
        for fasta in os.listdir(fastas):
            if ("_folder" not in fasta) and ("tmp" != fasta):
                if (fasta.endswith(".fa")) or \
                   (fasta.endswith(".fna")) or \
                   (fasta.endswith(".fasta")):
                    out_path = os.path.join(fastas, fasta + "_folder")
                    print("Parser " + fasta + "...")
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(fastas, fasta), "r") as f_f:
                        for line in f_f:
                            if line[0] == ">":
                                line = line.strip()
                                # NCBI-style pipe headers: use field 3 as
                                # the sequence name
                                if ("|" in line) and (
                                        len(line.split("|")) > 4):
                                    strain = line.split("|")
                                    name = strain[3]
                                else:
                                    name = line[1:]
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(
                                    out_path, name + ".fa"), "w")
                                out_t = open(os.path.join(
                                    par_tmp, name + ".fa"), "w")
                                out.write(">" + name + "\n")
                                out_t.write(">" + name + "\n")
                            else:
                                out.write(line)
                                out_t.write(line)
        # guard: no fasta record at all leaves out/out_t as None
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_gff(self, gff_folder, feature):
        """Split every gff in gff_folder into one file per sequence id,
        tagged with feature (e.g. "_TSS")."""
        par_tmp = os.path.join(gff_folder, "tmp")
        out = None
        out_t = None
        first = True
        if feature is None:
            feature = ""
        else:
            feature = "_" + feature
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(gff_folder):
            pre_seq_id = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(gff_folder, filename + "_folder")
                if ".gff" in filename:
                    print("Parser " + filename + "...")
                    self.helper.check_make_folder(out_path)
                    # sort first so rows of one sequence are contiguous
                    self.helper.sort_gff(
                        os.path.join(gff_folder, filename),
                        os.path.join(gff_folder, "tmp.gff"))
                    f_h = open(os.path.join(gff_folder, "tmp.gff"), "r")
                    for row in csv.reader(f_h, delimiter="\t"):
                        if row[0].startswith("#"):
                            continue
                        else:
                            if pre_seq_id == row[0]:
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                            else:
                                # new sequence id: rotate output files
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open(os.path.join(
                                    out_path,
                                    row[0] + feature + ".gff"), "w")
                                out_t = open(os.path.join(
                                    par_tmp,
                                    row[0] + feature + ".gff"), "w")
                                pre_seq_id = row[0]
                                out.write("\t".join(row) + "\n")
                                out_t.write("\t".join(row) + "\n")
                    f_h.close()
        if os.path.exists(os.path.join(gff_folder, "tmp.gff")):
            os.remove(os.path.join(gff_folder, "tmp.gff"))
        # guard: no gff row at all leaves out/out_t as None
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()

    def parser_wig(self, wig_folder):
        """Split every wig in wig_folder into one file per variableStep
        sequence, named "<track>_STRAIN_<seq>.wig"."""
        par_tmp = os.path.join(wig_folder, "tmp")
        first = True
        out = None
        out_t = None
        self.helper.check_make_folder(par_tmp)
        for filename in os.listdir(wig_folder):
            track_info = ""
            if ("_folder" not in filename) and ("tmp" != filename):
                out_path = os.path.join(wig_folder, filename + "_folder")
                if ".wig" in filename:
                    print("Parser {0}...".format(filename))
                    self.helper.check_make_folder(out_path)
                    with open(os.path.join(wig_folder, filename),
                              "r") as w_f:
                        for line in w_f:
                            line = line.split(" ")
                            if (line[0] == "track"):
                                # remember the track line to replay it at
                                # the top of every split file
                                track_info = " ".join(line)
                            if (line[0] == "variableStep"):
                                # "variableStep chrom=<seq> ..." starts a
                                # new per-sequence file
                                strain = line[1].split("=")
                                if first:
                                    first = False
                                else:
                                    out.close()
                                    out_t.close()
                                out = open("".join([
                                    os.path.join(out_path, filename[:-4]),
                                    "_STRAIN_", strain[1],
                                    ".wig"]), "w")
                                out_t = open("".join([
                                    os.path.join(wig_folder, "tmp",
                                                 filename[:-4]),
                                    "_STRAIN_", strain[1],
                                    ".wig"]), "w")
                                if track_info != "":
                                    out.write(track_info)
                                    out_t.write(track_info)
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
                            if (line[0] != "track") and (
                                    line[0] != "variableStep"):
                                out.write(" ".join(line))
                                out_t.write(" ".join(line))
        # guard: no wig data at all leaves out/out_t as None
        if out is not None:
            out.close()
        if out_t is not None:
            out_t.close()
class ArgsContainer(object):
    """Bundle the command-line arguments of each subcommand into one object.

    Each ``container_*`` method copies its parameters onto ``self`` (with
    light parsing through :meth:`_deal_multi_inputs`) and returns ``self``.

    Fixes over the previous revision: ``_check_libs`` now exits after
    reporting that no libraries were supplied (it previously fell through
    and crashed with ``UnboundLocalError``), and the error label for
    ``run_utr_noTEX_coverage`` in :meth:`container_srna` no longer claims
    to be ``--run_utr_TEX_coverage``.
    """

    def __init__(self):
        self.multiparser = Multiparser()
        self.helper = Helper()

    def _check_replicates(self, replicates_tex, replicates_frag):
        """Return {"tex": n, "frag": n} with -1 for an absent category.

        Exits when neither replicate count is given.
        """
        if (replicates_tex is not None) and (replicates_frag is not None):
            replicates = {"tex": int(replicates_tex),
                          "frag": int(replicates_frag)}
        elif replicates_tex is not None:
            replicates = {"tex": int(replicates_tex), "frag": -1}
        elif replicates_frag is not None:
            replicates = {"tex": -1, "frag": int(replicates_frag)}
        else:
            print("Error:No replicates number assign!!!")
            sys.exit()
        return replicates

    def _check_libs(self, tex_notex_libs, frag_libs):
        """Return the concatenation of the supplied library lists.

        Exits when neither list is given (previously this only printed the
        error and then crashed on an unbound local).
        """
        if (tex_notex_libs is None) and (frag_libs is None):
            print("Error: please input proper libraries!!")
            sys.exit()
        if (tex_notex_libs is not None) and (frag_libs is not None):
            libs = tex_notex_libs + frag_libs
        elif (tex_notex_libs is not None):
            libs = tex_notex_libs
        elif (frag_libs is not None):
            libs = frag_libs
        return libs

    def _parser_combine_wigs(self, subcommand):
        """Split the input wigs per genome and combine them against the gffs.

        Sets self.tex_path / self.frag_path / self.wig_path /
        self.merge_wigs and returns self.  Exits if no wig folder is given.
        """
        self.tex_path = None
        self.frag_path = None
        self.multiparser.parser_gff(self.gffs, None)
        if subcommand == "terminator":
            # terminator works on the already-split gffs in gffs/tmp
            gff_path = os.path.join(self.gffs, "tmp")
            self.multiparser.parser_gff(gff_path, None)
        else:
            gff_path = self.gffs
        if self.tex_wigs is not None:
            self.tex_path = os.path.join(self.tex_wigs, "tmp")
            self.multiparser.parser_wig(self.tex_wigs)
            self.multiparser.combine_wig(gff_path, self.tex_path,
                                         None, self.libs)
            self.merge_wigs = self.tex_wigs
            self.wig_path = self.tex_path
        if self.frag_wigs is not None:
            self.frag_path = os.path.join(self.frag_wigs, "tmp")
            self.multiparser.parser_wig(self.frag_wigs)
            self.multiparser.combine_wig(gff_path, self.frag_path,
                                         None, self.libs)
            self.merge_wigs = self.frag_wigs
            self.wig_path = self.frag_path
        if (self.tex_path is not None) and (
                self.frag_path is not None):
            # both kinds present: merge them into one folder
            self = self._merge_wig()
        if (self.tex_path is None) and (
                self.frag_path is None):
            print("Error: There is no proper wig files assigned!!")
            sys.exit()
        return self

    def _merge_wig(self):
        """Copy tex and frag wigs into a common merge_wigs folder."""
        self.merge_wigs = os.path.join(self.out_folder, "merge_wigs")
        if (self.tex_wigs is not None) and (
                self.frag_wigs is not None):
            self.helper.check_make_folder(self.merge_wigs)
            self.wig_path = os.path.join(self.merge_wigs, "tmp")
            self.helper.check_make_folder(self.wig_path)
            for wig in os.listdir(self.tex_wigs):
                if os.path.isfile(os.path.join(self.tex_wigs, wig)):
                    shutil.copy(os.path.join(self.tex_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.frag_wigs):
                if os.path.isfile(os.path.join(self.frag_wigs, wig)):
                    shutil.copy(os.path.join(self.frag_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.tex_path):
                if os.path.isfile(os.path.join(self.tex_path, wig)):
                    shutil.copy(os.path.join(self.tex_path, wig),
                                self.wig_path)
            for wig in os.listdir(self.frag_path):
                if os.path.isfile(os.path.join(self.frag_path, wig)):
                    # frag tracks are appended to the per-genome tex wig
                    self.helper.merge_file(
                        os.path.join(self.frag_path, wig),
                        os.path.join(self.wig_path, wig))
        elif (self.tex_wigs is not None):
            self.merge_wigs = self.tex_wigs
        elif (self.frag_wigs is not None):
            self.merge_wigs = self.frag_wigs
        return self

    def _deal_multi_inputs(self, inputs, file_type, num, command):
        """Split a comma-separated option into a list, cast per file_type.

        num, when given, is the expected element count; command is the
        option name used in the warning.  Returns None for None input.
        NOTE(review): a wrong element count only prints a warning and
        continues -- confirm whether it should abort.
        """
        if inputs is not None:
            datas = inputs.split(",")
            if num is not None:
                if (len(datas) != num):
                    print("Error: the amount of {0} is not correct!!".format(
                        command))
            new_inputs = []
            for data in datas:
                if file_type == "float":
                    new_inputs.append(float(data.strip()))
                elif file_type == "int":
                    new_inputs.append(int(data.strip()))
                else:
                    new_inputs.append(data)
            return new_inputs
        else:
            return inputs

    def container_ratt(self, ratt_path, element, transfer_type,
                       ref_embl_gbk, target_fasta, ref_fasta, ratt_folder,
                       convert_to_gff_rnt_ptt, tar_annotation_folder,
                       compare_pair):
        """Collect the arguments of the annotation-transfer subcommand."""
        self.ratt_path = ratt_path
        self.element = element
        self.transfer_type = transfer_type
        self.ref_embls = ref_embl_gbk
        self.tar_fastas = target_fasta
        self.ref_fastas = ref_fasta
        self.output_path = ratt_folder
        self.convert = convert_to_gff_rnt_ptt
        self.gff_outfolder = tar_annotation_folder
        self.pairs = self._deal_multi_inputs(compare_pair, "str",
                                             None, None)
        return self

    def container_tsspredator(self, TSSpredator_path, compute_program,
                              fasta_folder, annotation_folder, wig_folder,
                              lib, output_prefix, height, height_reduction,
                              factor, factor_reduction, base_height,
                              enrichment_factor, processing_factor,
                              replicate_match, out_folder, statistics,
                              validate_gene, merge_manual,
                              compare_transcript_assembly, fuzzy,
                              utr_length, cluster, length, re_check_orphan,
                              overlap_feature, reference_gff_folder,
                              remove_low_expression):
        """Collect the arguments of the TSS/processing-site subcommand."""
        self.tsspredator_path = TSSpredator_path
        self.program = compute_program
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.wig_folder = wig_folder
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.output_prefixs = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.height = height
        self.height_reduction = height_reduction
        self.factor = factor
        self.factor_reduction = factor_reduction
        self.base_height = base_height
        self.enrichment_factor = enrichment_factor
        self.processing_factor = processing_factor
        self.repmatch = replicate_match
        self.out_folder = out_folder
        self.stat = statistics
        self.validate = validate_gene
        self.manual = merge_manual
        self.ta_files = compare_transcript_assembly
        self.fuzzy = fuzzy
        self.utr_length = utr_length
        self.cluster = cluster
        self.nt_length = length
        self.check_orphan = re_check_orphan
        self.overlap_feature = overlap_feature
        self.references = reference_gff_folder
        self.remove_low_expression = remove_low_expression
        return self

    def container_optimize(self, TSSpredator_path, fasta_file,
                           annotation_file, wig_folder, manual, out_folder,
                           strain_name, max_height, max_height_reduction,
                           max_factor, max_factor_reduction,
                           max_base_height, max_enrichment_factor,
                           max_processing_factor, utr_length, lib,
                           output_prefix, cluster, length, core, program,
                           replicate_match, steps):
        """Collect the arguments of the TSSpredator optimization."""
        self.tsspredator_path = TSSpredator_path
        self.fastas = fasta_file
        self.gffs = annotation_file
        self.wigs = wig_folder
        self.manual = manual
        self.output_folder = out_folder
        self.project_strain = strain_name
        self.height = max_height
        self.height_reduction = max_height_reduction
        self.factor = max_factor
        self.factor_reduction = max_factor_reduction
        self.base_height = max_base_height
        self.enrichment = max_enrichment_factor
        self.processing = max_processing_factor
        self.utr = utr_length
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.replicate_name = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.cluster = cluster
        self.length = length
        self.cores = core
        self.program = program
        self.replicate = replicate_match
        self.steps = steps
        return self

    def container_terminator(
            self, TransTermHP_path, expterm_path, RNAfold_path, out_folder,
            fasta_folder, annotation_folder, transcript_folder, srna,
            statistics, tex_wig_folder, frag_wig_folder, decrease,
            highest_coverage, fuzzy_detect_coverage,
            fuzzy_within_transcript, fuzzy_downstream_transcript,
            fuzzy_within_gene, fuzzy_downstream_gene, transtermhp_folder,
            tex_notex_libs, frag_libs, tex_notex, replicates_tex,
            replicates_frag, table_best, min_loop_length, max_loop_length,
            min_stem_length, max_stem_length, min_AT_tail_length,
            miss_rate, range_u):
        """Collect the arguments of the terminator subcommand."""
        self.TransTermHP_path = TransTermHP_path
        self.expterm_path = expterm_path
        self.RNAfold_path = RNAfold_path
        self.out_folder = out_folder
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.srnas = srna
        self.stat = statistics
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.decrease = decrease
        self.cutoff_coverage = highest_coverage
        self.fuzzy = fuzzy_detect_coverage
        self.fuzzy_up_ta = fuzzy_within_transcript
        self.fuzzy_down_ta = fuzzy_downstream_transcript
        self.fuzzy_up_gene = fuzzy_within_gene
        self.fuzzy_down_gene = fuzzy_downstream_gene
        self.hp_folder = transtermhp_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.table_best = table_best
        self.min_loop = min_loop_length
        self.max_loop = max_loop_length
        self.min_stem = min_stem_length
        self.max_stem = max_stem_length
        self.at_tail = min_AT_tail_length
        self.miss_rate = miss_rate
        self.range_u = range_u
        self = self._parser_combine_wigs("terminator")
        return self

    def container_transcript(
            self, frag_wig_path, tex_wig_path, tex_notex, length,
            annotation_folder, height, width, tolerance,
            tolerance_coverage, replicates_tex, replicates_frag,
            transcript_assembly_output_folder, compare_TSS,
            compare_genome_annotation, TSS_fuzzy, tex_treated_libs,
            fragmented_libs, compare_feature_genome, table_best,
            terminator_folder, fuzzy_term):
        """Collect the arguments of the transcript-assembly subcommand."""
        self.frag_wigs = frag_wig_path
        self.tex_wigs = tex_wig_path
        self.tex = tex_notex
        self.length = length
        self.gffs = annotation_folder
        self.height = height
        self.width = width
        self.tolerance = tolerance
        self.low_cutoff = tolerance_coverage
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.out_folder = transcript_assembly_output_folder
        self.compare_tss = compare_TSS
        self.compare_cds = compare_genome_annotation
        self.fuzzy = TSS_fuzzy
        self.tlibs = self._deal_multi_inputs(tex_treated_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(fragmented_libs, "str",
                                             None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.c_feature = self._deal_multi_inputs(compare_feature_genome,
                                                 "str", None, None)
        self.table_best = table_best
        self.terms = terminator_folder
        self.fuzzy_term = fuzzy_term
        self = self._parser_combine_wigs("transcript")
        return self

    def container_utr(self, tss_folder, annotation_folder,
                      transcript_assembly_folder, terminator_folder,
                      terminator_fuzzy, utr_folder, tss_source, base_5utr,
                      length, base_3utr):
        """Collect the arguments of the UTR-detection subcommand."""
        self.tsss = tss_folder
        self.gffs = annotation_folder
        self.trans = transcript_assembly_folder
        self.terms = terminator_folder
        self.fuzzy = terminator_fuzzy
        self.out_folder = utr_folder
        self.source = tss_source
        self.base_5utr = base_5utr
        self.base_3utr = base_3utr
        self.length = length
        return self

    def container_srna(
            self, Vienna_folder, Vienna_utils, blast_plus_folder,
            ps2pdf14_path, srna_folder, UTR_derived_sRNA,
            annotation_folder, TSS_folder, transcript_assembly_folder,
            TSS_intergenic_fuzzy, TSS_5UTR_fuzzy, TSS_3UTR_fuzzy,
            TSS_interCDS_fuzzy, import_info, tex_wig_folder,
            frag_wig_folder, processing_site_folder, fasta_folder,
            mountain_plot, nr_format, srna_format, sRNA_database_path,
            nr_database_path, cutoff_energy, run_intergenic_TEX_coverage,
            run_intergenic_noTEX_coverage,
            run_intergenic_fragmented_coverage, run_antisense_TEX_coverage,
            run_antisense_noTEX_coverage,
            run_antisense_fragmented_coverage, intergenic_tolerance,
            run_utr_TEX_coverage, run_utr_noTEX_coverage,
            run_utr_fragmented_coverage, max_length, min_length,
            tex_notex_libs, frag_libs, replicates_tex, replicates_frag,
            tex_notex, blast_e_nr, blast_e_srna, detect_sRNA_in_CDS,
            table_best, decrease_intergenic, decrease_utr, fuzzy_intergenic,
            fuzzy_utr, cutoff_nr_hit, sORF, best_with_all_sRNAhit,
            best_without_sORF_candidate, overlap_percent_CDS,
            terminator_folder, terminator_fuzzy_in_CDS,
            terminator_fuzzy_out_CDS, best_with_terminator,
            ignore_hypothetical_protein, TSS_source, min_utr_coverage,
            promoter_table, best_with_promoter, ranking_promoter,
            promoter_name):
        """Collect the arguments of the sRNA-detection subcommand."""
        self.vienna_path = Vienna_folder
        self.vienna_util = Vienna_utils
        self.blast_path = blast_plus_folder
        self.ps2pdf14_path = ps2pdf14_path
        self.out_folder = srna_folder
        self.utr_srna = UTR_derived_sRNA
        self.gffs = annotation_folder
        self.tss_folder = TSS_folder
        self.trans = transcript_assembly_folder
        self.fuzzy_inter_tss = TSS_intergenic_fuzzy
        self.fuzzy_5utr_tss = TSS_5UTR_fuzzy
        self.fuzzy_3utr_tss = TSS_3UTR_fuzzy
        self.fuzzy_intercds_tss = TSS_interCDS_fuzzy
        self.fuzzy_tsss = {"5utr": self.fuzzy_5utr_tss,
                           "3utr": self.fuzzy_3utr_tss,
                           "interCDS": self.fuzzy_intercds_tss,
                           "inter": self.fuzzy_inter_tss}
        self.import_info = self._deal_multi_inputs(import_info, "str",
                                                   None, None)
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.pro_folder = processing_site_folder
        self.fastas = fasta_folder
        self.mountain = mountain_plot
        self.nr_format = nr_format
        self.srna_format = srna_format
        self.srna_database = sRNA_database_path
        self.nr_database = nr_database_path
        self.energy = cutoff_energy
        self.coverage_tex = self._deal_multi_inputs(
            run_intergenic_TEX_coverage, "float", 5,
            "--run_intergenic_TEX_coverage")
        self.coverage_notex = self._deal_multi_inputs(
            run_intergenic_noTEX_coverage, "float", 5,
            "--run_intergenic_noTEX_coverage")
        self.coverage_frag = self._deal_multi_inputs(
            run_intergenic_fragmented_coverage, "float", 5,
            "--run_intergenic_fragmented_coverage")
        self.anti_cover_tex = self._deal_multi_inputs(
            run_antisense_TEX_coverage, "float", 5,
            "--run_antisense_TEX_coverage")
        self.anti_cover_notex = self._deal_multi_inputs(
            run_antisense_noTEX_coverage, "float", 5,
            "--run_antisense_noTEX_coverage")
        self.anti_cover_frag = self._deal_multi_inputs(
            run_antisense_fragmented_coverage, "float", 5,
            "--run_antisense_fragmented_coverage")
        self.tolerance = intergenic_tolerance
        self.utr_tex_cover = self._deal_multi_inputs(
            run_utr_TEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        # fixed error label: previously reported --run_utr_TEX_coverage
        # for the noTEX option
        self.utr_notex_cover = self._deal_multi_inputs(
            run_utr_noTEX_coverage, "str", 3, "--run_utr_noTEX_coverage")
        self.utr_frag_cover = self._deal_multi_inputs(
            run_utr_fragmented_coverage, "str", 3,
            "--run_utr_fragmented_coverage")
        self.max_len = max_length
        self.min_len = min_length
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.tex_notex = tex_notex
        self.e_nr = blast_e_nr
        self.e_srna = blast_e_srna
        self.in_cds = detect_sRNA_in_CDS
        self.table_best = table_best
        self.decrease_inter = decrease_intergenic
        self.decrease_utr = decrease_utr
        self.fuzzy_inter = fuzzy_intergenic
        self.fuzzy_utr = fuzzy_utr
        self.nr_hits_num = cutoff_nr_hit
        self.sorf_file = sORF
        self.all_hit = best_with_all_sRNAhit
        self.best_sorf = best_without_sORF_candidate
        self.cutoff_overlap = overlap_percent_CDS
        self.terms = terminator_folder
        self.fuzzy_b = terminator_fuzzy_in_CDS
        self.fuzzy_a = terminator_fuzzy_out_CDS
        self.best_term = best_with_terminator
        self.hypo = ignore_hypothetical_protein
        self.tss_source = TSS_source
        self.min_utr = min_utr_coverage
        self.promoter_table = promoter_table
        self.best_promoter = best_with_promoter
        if ranking_promoter < 1:
            print("Error: --ranking_time_promoter must larger than 1...")
            sys.exit()
        self.rank_promoter = ranking_promoter
        self.promoter_name = self._deal_multi_inputs(promoter_name, "str",
                                                     None, None)
        self = self._parser_combine_wigs("srna")
        return self

    def container_intersrna(self, file_type, files, args_srna, prefix,
                            gff_file, tran_file, tss_file, pro_file,
                            fuzzy):
        """Attach per-genome file paths for intergenic sRNA detection onto
        args_srna ("frag" or tex/notex run) and return it."""
        args_srna.file_type = file_type
        args_srna.gff_file = gff_file
        args_srna.tran_file = tran_file
        args_srna.tss_file = tss_file
        args_srna.pro_file = pro_file
        args_srna.fuzzy = fuzzy
        args_srna.prefix = prefix
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.cutoffs = args_srna.coverage_frag
            args_srna.tss_source = True
            args_srna.cut_notex = None
            args_srna.anti_notex_cutoff = None
        else:
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.cutoffs = args_srna.coverage_tex
            # no-op self-assignment kept from the original: it also
            # asserts that tss_source already exists on args_srna
            args_srna.tss_source = args_srna.tss_source
            args_srna.cut_notex = args_srna.coverage_notex
            args_srna.anti_notex_cutoff = args_srna.anti_cover_notex
        return args_srna

    def container_utrsrna(self, gff, tran, tss, files, pro, fasta,
                          file_type, prefix, args_srna):
        """Attach per-genome file paths for UTR-derived sRNA detection onto
        args_srna and return it."""
        args_srna.file_type = file_type
        args_srna.gff_file = gff
        args_srna.ta_file = tran
        args_srna.tss_file = tss
        args_srna.pro_file = pro
        args_srna.prefix = prefix
        args_srna.seq_file = fasta
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.utr_coverages = args_srna.utr_frag_cover
            args_srna.notex = None
        else:
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.utr_coverages = args_srna.utr_tex_cover
            args_srna.notex = args_srna.utr_notex_cover
        # index the three UTR coverage cutoffs by region name
        args_srna.coverages = {"5utr": args_srna.utr_coverages[0],
                               "3utr": args_srna.utr_coverages[1],
                               "interCDS": args_srna.utr_coverages[2]}
        if args_srna.notex is not None:
            args_srna.cover_notex = {"5utr": args_srna.notex[0],
                                     "3utr": args_srna.notex[1],
                                     "interCDS": args_srna.notex[2]}
        else:
            args_srna.cover_notex = None
        return args_srna

    def extend_inter_container(self, args_srna, tsss, pros, wigs_f,
                               wigs_r, nums, output, out_table, texs,
                               detects, cutoff_coverage, notex):
        """Attach intergenic-run working data onto args_srna."""
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wigs_f = wigs_f
        args_srna.wigs_r = wigs_r
        args_srna.nums = nums
        args_srna.output = output
        args_srna.out_table = out_table
        args_srna.texs = texs
        args_srna.detects = detects
        args_srna.cutoff_coverage = cutoff_coverage
        args_srna.notex = notex
        return args_srna

    def extend_utr_container(self, args_srna, cdss, tsss, pros, wig_fs,
                             wig_rs, out, out_t, texs):
        """Attach UTR-run working data onto args_srna."""
        args_srna.cdss = cdss
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wig_fs = wig_fs
        args_srna.wig_rs = wig_rs
        args_srna.out = out
        args_srna.out_t = out_t
        args_srna.texs = texs
        args_srna.utrs = []
        args_srna.srnas = []
        return args_srna

    def container_sorf(self, sorf_folder, UTR_derived_sORF,
                       transcript_folder, annotation_folder, TSS_folder,
                       utr_length, min_length, max_length, tex_wig_folder,
                       frag_wig_folder, cutoff_intergenic_coverage,
                       cutoff_antisense_coverage, cutoff_5utr_coverage,
                       cutoff_3utr_coverage, cutoff_interCDS_coverage,
                       fasta_folder, tex_notex_libs, frag_libs, tex_notex,
                       replicates_tex, replicates_frag, table_best,
                       sRNA_folder, start_codon, stop_codon,
                       cutoff_background, fuzzy_rbs, rbs_not_after_TSS,
                       print_all_combination, best_no_sRNA, best_no_TSS,
                       ignore_hypothetical_protein, min_rbs_distance,
                       max_rbs_distance):
        """Collect the arguments of the sORF-detection subcommand."""
        self.out_folder = sorf_folder
        self.utr_detect = UTR_derived_sORF
        self.trans = transcript_folder
        self.gffs = annotation_folder
        self.tsss = TSS_folder
        self.utr_length = utr_length
        self.min_len = min_length
        self.max_len = max_length
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.cutoff_inter = cutoff_intergenic_coverage
        self.cutoff_anti = cutoff_antisense_coverage
        self.cutoff_5utr = cutoff_5utr_coverage
        self.cutoff_3utr = cutoff_3utr_coverage
        self.cutoff_intercds = cutoff_interCDS_coverage
        self.fastas = fasta_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str",
                                             None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.table_best = table_best
        self.srnas = sRNA_folder
        self.start_codon = self._deal_multi_inputs(start_codon, "str",
                                                   None, None)
        self.stop_codon = self._deal_multi_inputs(stop_codon, "str",
                                                  None, None)
        self.background = cutoff_background
        self.fuzzy_rbs = fuzzy_rbs
        self.noafter_tss = rbs_not_after_TSS
        self.print_all = print_all_combination
        self.no_srna = best_no_sRNA
        self.no_tss = best_no_TSS
        self.hypo = ignore_hypothetical_protein
        self.min_rbs = min_rbs_distance
        self.max_rbs = max_rbs_distance
        self = self._parser_combine_wigs("sorf")
        return self

    def container_srna_target(
            self, Vienna_folder, annotation_path, fasta_path, sRNA_path,
            query_sRNA, program, interaction_length, window_size_target,
            span_target, window_size_srna, span_srna,
            unstructured_region_RNAplex_target,
            unstructured_region_RNAplex_srna, unstructured_region_RNAup,
            energy_threshold, duplex_distance, top, starget_output_folder,
            process_rnaplex, process_rnaup, continue_rnaup,
            potential_target_start, potential_target_end, target_feature):
        """Collect the arguments of the sRNA-target subcommand."""
        self.vienna_path = Vienna_folder
        self.gffs = annotation_path
        self.fastas = fasta_path
        self.srnas = sRNA_path
        self.query = self._deal_multi_inputs(query_sRNA, "str", None, None)
        self.program = program
        self.inter_length = interaction_length
        self.win_size_t = window_size_target
        self.span_t = span_target
        self.win_size_s = window_size_srna
        self.span_s = span_srna
        self.unstr_region_rnaplex_t = unstructured_region_RNAplex_target
        self.unstr_region_rnaplex_s = unstructured_region_RNAplex_srna
        self.unstr_region_rnaup = unstructured_region_RNAup
        self.energy = energy_threshold
        self.duplex_dist = duplex_distance
        self.top = top
        self.out_folder = starget_output_folder
        self.core_plex = process_rnaplex
        self.core_up = process_rnaup
        self.continue_rnaup = continue_rnaup
        self.tar_start = potential_target_start
        self.tar_end = potential_target_end
        self.features = self._deal_multi_inputs(target_feature, "str",
                                                None, None)
        return self

    def container_goterm(self, annotation_path, goterm_output_folder,
                         UniProt_id, go_obo, goslim_obo, transcript_path):
        """Collect the arguments of the GO-term subcommand."""
        self.gffs = annotation_path
        self.out_folder = goterm_output_folder
        self.uniprot = UniProt_id
        self.go = go_obo
        self.goslim = goslim_obo
        self.trans = transcript_path
        return self

    def container_sublocal(self, Psortb_path, gff_path, fasta_path,
                           bacteria_type, difference_multi, merge_to_gff,
                           sublocal_output_folder, transcript_path):
        """Collect the arguments of the subcellular-localization
        subcommand."""
        self.psortb_path = Psortb_path
        self.gffs = gff_path
        self.fastas = fasta_path
        self.gram = bacteria_type
        self.fuzzy = difference_multi
        self.merge = merge_to_gff
        self.out_folder = sublocal_output_folder
        self.trans = transcript_path
        return self

    def container_ppi(self, gff_path, proteinID_strains,
                      without_strain_pubmed, species_STRING, score,
                      ppi_output_folder, node_size, query):
        """Collect the arguments of the protein-protein-interaction
        subcommand."""
        self.ptts = gff_path
        self.strains = self._deal_multi_inputs(proteinID_strains, "str",
                                               None, None)
        self.no_specific = without_strain_pubmed
        self.species = species_STRING
        self.score = score
        self.out_folder = ppi_output_folder
        self.size = node_size
        self.querys = self._deal_multi_inputs(query, "str", None, None)
        return self

    def container_promoter(self, MEME_path, promoter_output_folder,
                           tex_libs, TSS_folder, fasta_folder, num_motif,
                           nt_before_TSS, motif_width, TSS_source,
                           tex_wig_path, annotation_folder, combine_all,
                           e_value):
        """Collect the arguments of the promoter-motif subcommand."""
        self.meme_path = MEME_path
        self.output_folder = promoter_output_folder
        self.input_libs = self._deal_multi_inputs(tex_libs, "str",
                                                  None, None)
        self.tsss = TSS_folder
        self.fastas = fasta_folder
        self.num_motif = num_motif
        self.nt_before = nt_before_TSS
        self.widths = self._deal_multi_inputs(motif_width, "str",
                                              None, None)
        self.source = TSS_source
        self.wigs = tex_wig_path
        self.gffs = annotation_folder
        self.combine = combine_all
        self.e_value = e_value
        return self

    def container_operon(self, TSS_folder, annotation_folder,
                         transcript_folder, UTR5_folder, UTR3_folder,
                         term_folder, TSS_fuzzy, term_fuzzy, min_length,
                         statistics, operon_output_folder, combine_gff,
                         operon_statistics_folder):
        """Collect the arguments of the operon-detection subcommand."""
        self.tsss = TSS_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.utr5s = UTR5_folder
        self.utr3s = UTR3_folder
        self.terms = term_folder
        self.tss_fuzzy = TSS_fuzzy
        self.term_fuzzy = term_fuzzy
        self.length = min_length
        self.statistics = statistics
        self.output_folder = operon_output_folder
        self.combine = combine_gff
        self.stat_folder = operon_statistics_folder
        return self

    def container_snp(self, samtools_path, bcftools_path, bam_type,
                      program, fasta_path, tex_bam_path, frag_bam_path,
                      quality, read_depth, snp_output_folder,
                      indel_fraction, chrom):
        """Collect the arguments of the SNP-calling subcommand."""
        self.samtools_path = samtools_path
        self.bcftools_path = bcftools_path
        self.types = bam_type
        self.program = self._deal_multi_inputs(program, "str", None, None)
        self.fastas = fasta_path
        self.normal_bams = tex_bam_path
        self.frag_bams = frag_bam_path
        self.quality = quality
        self.depth = read_depth
        self.out_folder = snp_output_folder
        self.fraction = indel_fraction
        # map ploidy keywords to the numeric form bcftools expects
        if chrom == "haploid":
            chrom = "1"
        elif chrom == "diploid":
            chrom = "2"
        self.chrom = chrom
        return self

    def container_circrna(self, align, process, fasta_path,
                          annotation_path, tex_bam_path,
                          fragmented_bam_path, read_folder,
                          circrna_stat_folder, support_reads,
                          segemehl_folder, samtools_path, start_ratio,
                          end_ratio, ignore_hypothetical_protein,
                          out_folder):
        """Collect the arguments of the circular-RNA subcommand."""
        self.align = align
        self.cores = process
        self.fastas = fasta_path
        self.gffs = annotation_path
        self.normal_bams = tex_bam_path
        self.frag_bams = fragmented_bam_path
        self.read_folder = read_folder
        self.stat_folder = circrna_stat_folder
        self.support = support_reads
        self.segemehl_path = segemehl_folder
        self.samtools_path = samtools_path
        self.start_ratio = start_ratio
        self.end_ratio = end_ratio
        self.hypo = ignore_hypothetical_protein
        self.output_folder = out_folder
        return self

    def container_ribos(self, infernal_path, riboswitch_ID, gff_path,
                        fasta_path, tss_path, transcript_path, Rfam,
                        ribos_output_folder, e_value, output_all,
                        database_folder, fuzzy, start_codon, min_dist_rbs,
                        max_dist_rbs, fuzzy_rbs, UTR_length):
        """Collect the arguments of the riboswitch subcommand."""
        self.infernal_path = infernal_path
        self.ribos_id = riboswitch_ID
        self.gffs = gff_path
        self.fastas = fasta_path
        self.tsss = tss_path
        self.trans = transcript_path
        self.rfam = Rfam
        self.out_folder = ribos_output_folder
        self.e_value = e_value
        self.output_all = output_all
        self.database = database_folder
        self.fuzzy = fuzzy
        self.start_codons = self._deal_multi_inputs(start_codon, "str",
                                                    None, None)
        self.start_rbs = min_dist_rbs
        self.end_rbs = max_dist_rbs
        self.fuzzy_rbs = fuzzy_rbs
        self.utr = UTR_length
        return self

    def container_screen(self, main_gff, side_gffs, fasta, frag_wig_folder,
                         tex_wig_folder, height, tex_libs, frag_libs,
                         present, output_folder):
        """Collect the arguments of the screenshot subcommand."""
        self.main_gff = main_gff
        self.side_gffs = self._deal_multi_inputs(side_gffs, "str",
                                                 None, None)
        self.fasta = fasta
        self.frag_wigs = frag_wig_folder
        self.tex_wigs = tex_wig_folder
        self.height = height
        self.tlibs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.present = present
        self.output_folder = output_folder
        return self
class Controller(object):
    """Manage the actions of the subcommands.

    The Controller takes care of providing the arguments like path
    names and the parallel processing of tasks.
    """

    def __init__(self, args):
        """Create an instance."""
        self._args = args
        # Only validate --project_path for real subcommands (more than
        # the few global arguments); "create" builds the folder itself.
        if (len(args.__dict__) > 3):
            if not os.path.exists(args.project_path):
                print("Error: --project_path does not exists!")
                sys.exit()
        self._paths = Paths(args.project_path)
        self.args_container = ArgsContainer()
        self.helper = Helper()

    def check_folder(self, folders, flags):
        '''Check the empty or wrongly assigned folder'''
        for folder, flag in zip(folders, flags):
            if folder is None:
                print("Error: {0} is wrong. Please check it!".format(flag))
                sys.exit()
            else:
                if os.path.exists(folder):
                    if len(os.listdir(folder)) == 0:
                        print("Error: {0} is a empty folder!".format(flag))
                        sys.exit()
                else:
                    print("Error: {0} is wrong. Please check it!".format(
                        flag))
                    sys.exit()

    def check_multi_files(self, input_files, flags):
        # Each entry of input_files is itself a list of paths; every path
        # must exist or the run aborts with the offending flag name.
        if input_files is not None:
            for files, flag in zip(input_files, flags):
                if files is not None:
                    for file_ in files:
                        if not os.path.exists(file_):
                            print("Error: Some files in {0} do "
                                  "not exist!".format(flag))
                            sys.exit()

    def check_parameter(self, paras, names):
        '''Check the parameter is assigned correct or not'''
        for i in range(len(paras)):
            if paras[i] is None:
                print("Error: {0} is wrong. "
                      "Please check it!".format(names[i]))
                sys.exit()

    def check_no_require_folder(self, folders):
        '''Check the folders which are not necessary.
        It should not be assigned a empty or wrong folder'''
        for folder in folders:
            if folder is not None:
                if os.path.exists(folder):
                    if len(os.listdir(folder)) == 0:
                        print("Error: There is a empty folder. "
                              "Please check it!")
                        sys.exit()
                else:
                    print("Error: There is a wrong folder. "
                          "Please check it!")
                    sys.exit()

    def check_execute_file(self, exe):
        '''Resolve an executable: accept a direct path or a name on PATH.'''
        detect = False
        if os.path.exists(exe):
            detect = True
            full_exe = os.path.realpath(exe)
        # A bare name found on PATH is kept as-is so the shell resolves it.
        for folder in os.environ["PATH"].split(":"):
            if os.path.exists(os.path.join(folder, exe)):
                detect = True
                full_exe = exe
        if not detect:
            if os.path.exists(os.path.realpath(exe)):
                full_exe = os.path.realpath(exe)
            else:
                print("Error: {0} can't be found!".format(exe))
                print("Please assign the correct path!")
                sys.exit()
        return full_exe

    def check_file(self, files, names, require):
        '''Check the path of file'''
        for i in range(len(files)):
            if require:
                if files[i] is None:
                    print("Error: {0} is wrong. "
                          "Please check it!".format(names[i]))
                    sys.exit()
                else:
                    if not os.path.isfile(files[i]):
                        print("Error: There is a wrong path of {0}. "
                              "Please check it!".format(names[i]))
                        sys.exit()
            else:
                if files[i] is not None:
                    if not os.path.isfile(files[i]):
                        print("Error: There is a wrong path of {0}. "
                              "Please check it!".format(names[i]))
                        sys.exit()

    def create_project(self, version):
        """Create a new project."""
        project_creator.create_root_folder(self._args.project_path)
        project_creator.create_subfolders(self._paths.required_folders("root"))
        project_creator.create_version_file(
            self._paths.version_path, version)
        sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % (
            self._args.project_path))

    def get_input(self):
        """Download required files from website."""
        print("Running get input files")
        if self._args.ftp_path is None:
            print("Error: Please assign the path for downloading the data!")
            sys.exit()
        # NOTE(review): these two locals appear unused in the visible code.
        annotation_folder = self._paths.ref_annotation_folder
        fasta_folder = self._paths.ref_fasta_folder
        self.helper.check_make_folder(self._paths.ref_annotation_folder)
        self.helper.check_make_folder(self._paths.ref_fasta_folder)
        # Each requested file type is fetched under both old and new
        # NCBI naming conventions.
        if self._args.ref_gff is True:
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "gff")
            get_file(self._args.ftp_path, self._paths.ref_annotation_folder,
                     "_genomic.gff.gz")
        if self._args.ref_fasta is \
True: get_file(self._args.ftp_path, self._paths.ref_fasta_folder, "fna") get_file(self._args.ftp_path, self._paths.ref_fasta_folder, "_genomic.fna.gz") if self._args.ref_gbk is True: get_file(self._args.ftp_path, self._paths.ref_annotation_folder, "gbk") get_file(self._args.ftp_path, self._paths.ref_annotation_folder, "gbff") get_file(self._args.ftp_path, self._paths.ref_annotation_folder, "_genomic.gbff.gz") if self._args.ref_ptt is True: get_file(self._args.ftp_path, self._paths.ref_annotation_folder, "ptt") if self._args.ref_rnt is True: get_file(self._args.ftp_path, self._paths.ref_annotation_folder, "rnt") if self._args.convert_embl is True: annotation_files = os.listdir(self._paths.ref_annotation_folder) if len(annotation_files) == 0: sys.stdout.write("No gff files!!\n") else: Converter().convert_gbk2embl(self._paths.ref_annotation_folder) def get_target_fasta(self): """Get target fasta""" print("Running update genome fasta") self.check_multi_files([self._args.related_fasta_files], ["--related_fasta_files"]) self.check_file([self._args.mutation_table], "--mutation_table", True) project_creator.create_subfolders( self._paths.required_folders("get_target_fasta")) target = TargetFasta(self._paths.tar_fasta_folder, self._args.related_fasta_files) target.get_target_fasta( self._args.mutation_table, self._paths.tar_fasta_folder, self._args.related_fasta_files, self._args.combine_to_one_fasta, self._paths.target_base_folder) def ratt(self): """Run RATT to transfer annotation file from reference to target.""" print("Running annotation transfer") if (self._args.transfer_type != "Strain") and ( self._args.transfer_type != "Assembly") and ( self._args.transfer_type != "Species") and ( self._args.transfer_type != "Assembly.Repetitive") and ( self._args.transfer_type != "Strain.Repetitive") and ( self._args.transfer_type != "Species.Repetitive") and ( self._args.transfer_type != "Multiple") and ( self._args.transfer_type != "Free"): print("Error: please assign correct 
--transfer_type!") sys.exit() if (self._args.related_embl_files is None) and ( self._args.related_gbk_files is None): print("Error: please assign proper embl or genbank files") sys.exit() elif (self._args.related_embl_files is not None) and ( self._args.related_gbk_files is not None): print("Error: please choose embl as input or genbank as input") sys.exit() self._args.ratt_path = self.check_execute_file(self._args.ratt_path) self.check_multi_files( [self._args.target_fasta_files, self._args.related_fasta_files], ["--target_fasta_files", "--closed_fasta_files"]) self.check_parameter([self._args.element, self._args.compare_pair], ["--element", "--compare_pair"]) project_creator.create_subfolders( self._paths.required_folders("get_target_fasta")) project_creator.create_subfolders( self._paths.required_folders("annotation_transfer")) args_ratt = self.args_container.container_ratt( self._args.ratt_path, self._args.element, self._args.transfer_type, self._args.related_embl_files, self._args.related_gbk_files, self._args.target_fasta_files, self._args.related_fasta_files, self._paths.ratt_folder, self._args.convert_to_gff_rnt_ptt, self._paths.tar_annotation_folder, self._args.compare_pair) ratt = RATT(args_ratt) ratt.annotation_transfer(args_ratt) def tsspredator(self): """Run TSSpredator for predicting TSS candidates.""" self.check_multi_files( [self._args.fasta_files, self._args.annotation_files, self._args.compare_overlap_gff, self._args.manual_files, self._args.compare_transcript_files], ["--fasta_files", "--annotation_files", "--compare_overlap_gff", "--manual_files","--compare_transcript_files"]) self.check_parameter([self._args.tex_notex_libs, self._args.condition_names], ["--tex_notex_libs", "--condition_names"]) self._args.tsspredator_path = self.check_execute_file( self._args.tsspredator_path) if self._args.program.lower() == "tss": print("Running TSS prediction") project_creator.create_subfolders( self._paths.required_folders("TSS")) out_folder = 
self._paths.tsspredator_folder elif self._args.program.lower() == "ps": print("Running processing site prediction") out_folder = self._paths.processing_site_folder project_creator.create_subfolders( self._paths.required_folders("processing")) else: print("Error: No such program!") sys.exit() args_tss = self.args_container.container_tsspredator( self._args.tsspredator_path, self._args.program, self._args.fasta_files, self._args.annotation_files, self._args.tex_notex_libs, self._args.condition_names, self._args.height, self._args.height_reduction, self._args.factor, self._args.factor_reduction, self._args.base_height, self._args.enrichment_factor, self._args.processing_factor, self._args.replicate_tex, out_folder, self._args.validate_gene, self._args.manual_files, self._args.curated_sequence_length, self._args.compare_transcript_files, self._args.tolerance, self._args.utr_length, self._args.cluster, self._args.re_check_orphan, self._args.remove_overlap_feature, self._args.compare_overlap_gff, self._args.remove_low_expression) tsspredator = TSSpredator(args_tss) tsspredator.run_tsspredator(args_tss) def optimize(self): """opimize TSSpredator""" self.check_multi_files( [self._args.fasta_files, self._args.annotation_files, self._args.manual_files], ["--fasta_files", "--annotation_files", "--manual_files"]) self._args.tsspredator_path = self.check_execute_file( self._args.tsspredator_path) self.check_parameter([self._args.tex_notex_libs, self._args.condition_names], ["--tex_notex_lib", "--condition_names"]) if self._args.program.lower() == "tss": print("Running optimization of TSS prediction") project_creator.create_subfolders( self._paths.required_folders("TSS")) out_folder = self._paths.tsspredator_folder elif self._args.program.lower() == "ps": print("Running optimization of processing site prediction") out_folder = self._paths.processing_site_folder project_creator.create_subfolders( self._paths.required_folders("processing")) else: print("Error: No such program!") 
sys.exit() args_ops = self.args_container.container_optimize( self._args.tsspredator_path, self._args.fasta_files, self._args.annotation_files, self._args.manual_files, out_folder, self._args.max_height, self._args.max_height_reduction, self._args.max_factor, self._args.max_factor_reduction, self._args.max_base_height, self._args.max_enrichment_factor, self._args.max_processing_factor, self._args.utr_length, self._args.tex_notex_libs, self._args.condition_names, self._args.cluster, self._args.curated_sequence_length, self._args.parallels, self._args.program, self._args.replicate_tex, self._args.steps) optimize_tss(args_ops) def color(self): """color the screenshots""" print("Running png files coloring") self.check_parameter([self._args.track_number], ["--track_numer"]) self.check_folder([self._args.screenshot_folder], ["--screenshot_folder"]) self._args.imagemagick_covert_path = self.check_execute_file( self._args.imagemagick_covert_path) color = ColorPNG() color.generate_color_png( self._args.track_number, self._args.screenshot_folder, self._args.imagemagick_covert_path) def terminator(self): """Run TransTermHP and Gene converaged for detecting terminators""" print("Running terminator prediction") if self._args.transterm_path is None: print("Please assign the path of transterm in TransTermHP.") self.check_multi_files( [self._args.fasta_files, self._args.annotation_files, self._args.transcript_files, self._args.srna_files], ["--fasta_files", "--annotation_files", "--transcript_files", "--srna_files"]) for prop in ("transterm_path", "expterm_path", "rnafold_path"): setattr(self._args, prop, self.check_execute_file(getattr(self._args, prop))) project_creator.create_subfolders( self._paths.required_folders("terminator")) args_term = self.args_container.container_terminator( self._args.transterm_path, self._args.expterm_path, self._args.rnafold_path, self._paths.transterm_folder, self._args.fasta_files, self._args.annotation_files, self._args.transcript_files, 
self._args.srna_files, self._args.decrease, self._args.highest_coverage, self._args.tolerance_detect_coverage, self._args.tolerance_within_transcript, self._args.tolerance_downstream_transcript, self._args.tolerance_within_gene, self._args.tolerance_downstream_gene, self._paths.transtermhp_folder, self._args.tex_notex_libs, self._args.frag_libs, self._args.tex_notex, self._args.replicate_tex, self._args.replicate_frag, self._args.table_best, self._args.min_loop_length, self._args.max_loop_length, self._args.min_stem_length, self._args.max_stem_length, self._args.min_u_tail, self._args.miss_rate, self._args.mutation_u_tail, self._args.keep_multi_term, self._args.window_size, self._args.window_shift) terminator = Terminator(args_term) terminator.run_terminator(args_term) def transcript(self): """Run Transcript detection""" print("Running transcript detection") self.check_multi_files( [self._args.annotation_files, self._args.tss_files, self._args.terminator_files], ["--annotation_files", "--tss_files", "--terminator_files"]) project_creator.create_subfolders( self._paths.required_folders("transcript")) args_tran = self.args_container.container_transcript( self._args.tex_notex, self._args.modify_transcript, self._args.length, self._args.annotation_files, self._args.height, self._args.width, self._args.tolerance, self._args.tolerance_coverage, self._args.replicate_tex, self._args.replicate_frag, self._paths.transcript_output_folder, self._args.tss_files, self._args.tss_tolerance, self._args.tex_notex_libs, self._args.frag_libs, self._args.compare_feature_genome, self._args.table_best, self._args.terminator_files, self._args.terminator_tolerance, self._args.max_length_distribution) transcript = TranscriptDetection(args_tran) transcript.run_transcript(args_tran) def utr_detection(self): """Run UTR detection.""" print("Running UTR detection") self.check_multi_files( [self._args.annotation_files, self._args.terminator_files, self._args.transcript_files, 
self._args.tss_files], ["--annotation_files", "--terminator_files", "--transcript_files", "--tss_files"]) project_creator.create_subfolders(self._paths.required_folders("utr")) args_utr = self.args_container.container_utr( self._args.tss_files, self._args.annotation_files, self._args.transcript_files, self._args.terminator_files, self._args.terminator_tolerance, self._paths.utr_folder, self._args.tss_source, self._args.base_5utr, self._args.utr_length, self._args.base_3utr, self._args.tolerance_3utr, self._args.tolerance_5utr) utr = UTRDetection(args_utr) utr.run_utr_detection(args_utr) def _check_filter_input(self, files, info, filters): if files is None: print("Error: The {0} has to be provided " "if \"{1}\" in --filter_info!".format(info, filters)) sys.exit() def _check_database(self, database, flag, info): wrong = False if database is None: wrong = True elif not os.path.isfile(database): if (os.path.isfile(database + ".fa")) or ( os.path.isfile(database + ".fna")) or ( os.path.isfile(database + ".fasta")): wrong = False else: wrong = True if wrong: print("Error: {0} is required if {1} is in --filter_info. " "But the assignment of {0} is empty or wrong. 
" "Please check the {0} or remove {1} from " "--filter_info!".format(flag, info)) sys.exit() def srna_detection(self): """sRNA_detection.""" print("Running sRNA prediction") self.check_multi_files( [self._args.annotation_files, self._args.transcript_files, self._args.fasta_files, self._args.sorf_files, self._args.terminator_files, self._args.promoter_tables, self._args.processing_site_files], ["--annotation_files", "--transcript_files", "--fasta_files", "--sorf_files", "--terminator_files", "--promoter_tables", "--processing_site_files"]) for info in self._args.filter_info: if "sec_str" == info: if not self._args.compute_sec_structures: print("Error: --compute_sec_structures is not switch on, " "but sec_str is still in --filter_info.") sys.exit() self._check_filter_input( self._args.fasta_files, "fasta file", "sec_str") for prop in ("rnafold_path", "relplot_path", "mountain_path"): setattr(self._args, prop, self.check_execute_file(getattr(self._args, prop))) elif ("blast_nr" == info) or ( "blast_srna"== info): for prop in ("blastn_path", "blastx_path", "makeblastdb_path"): setattr(self._args, prop, self.check_execute_file(getattr(self._args, prop))) if ("blast_nr" == info): self._check_database(self._args.nr_database_path, "--nr_database_path", "blast_nr") if ("blast_srna" == info): self._check_database(self._args.srna_database_path, "--srna_database_path", "blast_srna") elif "sorf" == info: self._check_filter_input( self._args.sorf_files, "sORF", "sorf") elif "term" == info: self._check_filter_input(self._args.terminator_files, "terminator", "term") elif "promoter" == info: self._check_filter_input(self._args.promoter_tables, "Promoter", "promoter") elif "tss" == info: self._check_filter_input(self._args.tss_files, "TSS", "tss") else: if "none" != info.lower(): print("Error: Please check the --filter_info, " "invalid value was assigned!") sys.exit() if self._args.utr_derived_srna: if self._args.tss_files is None: print("Error: The TSS has to be provided " "if you 
want to compute UTR-derived sRNA!") sys.exit() if self._args.search_poly_u != 0: if self._args.fasta_files is None: print("Error: The fasta files have to be provided " "if you want to extend 3'end of sRNA by " "searching poly U tail!") sys.exit() project_creator.create_subfolders(self._paths.required_folders("srna")) args_srna = self.args_container.container_srna( self._args.rnafold_path, self._args.relplot_path, self._args.mountain_path, self._args.blastn_path, self._args.blastx_path, self._args.makeblastdb_path, self._paths.srna_folder, self._args.utr_derived_srna, self._args.annotation_files, self._args.tss_files, self._args.transcript_files, self._args.tss_intergenic_antisense_tolerance, self._args.tss_5utr_tolerance, self._args.tss_3utr_tolerance, self._args.tss_intercds_tolerance, self._args.filter_info, self._args.processing_site_files, self._args.fasta_files, self._args.mountain_plot, self._args.nr_format, self._args.srna_format, self._args.srna_database_path, self._args.nr_database_path, self._args.cutoff_energy, self._args.parallel_blast, self._args.min_intergenic_tex_coverage, self._args.min_intergenic_notex_coverage, self._args.min_intergenic_fragmented_coverage, self._args.min_complete_5utr_transcript_coverage, self._args.min_antisense_tex_coverage, self._args.min_antisense_notex_coverage, self._args.min_antisense_fragmented_coverage, self._args.min_utr_tex_coverage, self._args.min_utr_notex_coverage, self._args.min_utr_fragmented_coverage, self._args.max_length, self._args.min_length, self._args.tex_notex_libs, self._args.frag_libs, self._args.replicate_tex, self._args.replicate_frag, self._args.tex_notex, self._args.blast_e_nr, self._args.blast_e_srna, self._args.detect_srna_in_cds, self._args.table_best, self._args.decrease_intergenic_antisense, self._args.decrease_utr, self._args.tolerance_intergenic_antisense, self._args.tolerance_utr, self._args.cutoff_nr_hit, self._args.sorf_files, self._args.overlap_percent_cds, self._args.terminator_files, 
self._args.terminator_tolerance_in_srna, self._args.terminator_tolerance_out_srna, self._args.ignore_hypothetical_protein, self._args.tss_source, self._args.min_all_utr_coverage, self._args.promoter_tables, self._args.ranking_time_promoter, self._args.promoter_names, self._args.compute_sec_structures, self._args.search_poly_u, self._args.min_u_poly_u, self._args.mutation_poly_u) srna = sRNADetection(args_srna) srna.run_srna_detection(args_srna) def sorf_detection(self): """sORF_detection.""" print("Running sORF prediction") self.check_multi_files( [self._args.transcript_files, self._args.annotation_files, self._args.fasta_files, self._args.srna_files, self._args.tss_files], ["--transcript_files", "--annotation_files", "--fasta_files", "--srna_files", "--tss_files"]) project_creator.create_subfolders( self._paths.required_folders("sorf")) args_sorf = self.args_container.container_sorf( self._paths.sorf_folder, self._args.utr_derived_sorf, self._args.transcript_files, self._args.annotation_files, self._args.tss_files, self._args.utr_length, self._args.min_length, self._args.max_length, self._args.cutoff_intergenic_coverage, self._args.cutoff_antisense_coverage, self._args.cutoff_5utr_coverage, self._args.cutoff_3utr_coverage, self._args.cutoff_intercds_coverage, self._args.fasta_files, self._args.tex_notex_libs, self._args.frag_libs, self._args.tex_notex, self._args.replicate_tex, self._args.replicate_frag, self._args.table_best, self._args.srna_files, self._args.start_codon, self._args.stop_codon, self._args.cutoff_base_coverage, self._args.tolerance_rbs, self._args.rbs_not_after_tss, self._args.print_all_combination, self._args.best_no_srna, self._args.best_no_tss, self._args.ignore_hypothetical_protein, self._args.min_rbs_distance, self._args.max_rbs_distance, self._args.tolerance_3end, self._args.tolerance_5end) sorf = sORFDetection(args_sorf) sorf.run_sorf_detection(args_sorf) def meme(self): """promoter detectopn""" print("Running promoter detection") 
self.check_multi_files( [self._args.tss_files, self._args.fasta_files], ["--tss_files", "--fasta_files"]) if not self._args.tss_source: self.check_multi_files([self._args.annotation_files], ["--annotation_files"]) if (self._args.program == "both") or ( self._args.program == "meme"): self._args.meme_path = self.check_execute_file(self._args.meme_path) elif (self._args.program == "both") or ( self._args.program == "glam2"): self._args.glam2_path = self.check_execute_file(self._args.glam2_path) project_creator.create_subfolders( self._paths.required_folders("promoter")) args_pro = self.args_container.container_promoter( self._args.meme_path, self._args.glam2_path, self._paths.promoter_output_folder, self._args.tex_libs, self._args.tss_files, self._args.fasta_files, self._args.num_motifs, self._args.nt_before_tss, self._args.motif_width, self._args.tss_source, self._args.annotation_files, self._args.end_run, self._args.combine_all, self._args.e_value, self._args.parallels, self._args.program) meme = MEME(args_pro) meme.run_meme(args_pro) def operon(self): """operon detection""" print("Running operon detection") self.check_multi_files( [self._args.tss_files, self._args.annotation_files, self._args.transcript_files, self._args.utr5_files, self._args.utr3_files, self._args.terminator_files], ["--tss_files", "--annotation_files", "--transcript_files", "--utr5_files", "--utr3_files", "--terminator_files"]) project_creator.create_subfolders( self._paths.required_folders("operon")) args_op = self.args_container.container_operon( self._args.tss_files, self._args.annotation_files, self._args.transcript_files, self._args.utr5_files, self._args.utr3_files, self._args.terminator_files, self._args.tss_tolerance, self._args.terminator_tolerance, self._args.min_length, self._paths.operon_output_folder, self._paths.operon_statistics_folder) operon = OperonDetection(args_op) operon.run_operon(args_op) def circrna(self): """circRNA detection""" print("Running circular RNA prediction") 
if self._args.read_files: self._args.segemehl_path = self.check_execute_file( self._args.segemehl_path) for prop in ("testrealign_path", "samtools_path"): setattr(self._args, prop, self.check_execute_file(getattr(self._args, prop))) self.check_multi_files( [self._args.fasta_files, self._args.annotation_files], ["--fasta_files", "--annotation_files"]) project_creator.create_subfolders( self._paths.required_folders("circrna")) args_circ = self.args_container.container_circrna( self._args.parallels, self._args.fasta_files, self._args.annotation_files, self._args.bam_files, self._args.read_files, self._paths.circrna_stat_folder, self._args.support_reads, self._args.segemehl_path, self._args.testrealign_path, self._args.samtools_path, self._args.start_ratio, self._args.end_ratio, self._args.ignore_hypothetical_protein, self._paths.circrna_output_folder) circ = CircRNADetection(args_circ) circ.run_circrna(args_circ) def goterm(self): """Go term discovery""" print("Running GO term mapping") self.check_multi_files( [self._args.annotation_files, self._args.transcript_files], ["--annotation_files", "--transcript_files"]) self.check_file([self._args.uniprot_id, self._args.go_obo, self._args.goslim_obo], ["--uniprot_id", "--go.obo", "--goslim_obo"], True) project_creator.create_subfolders( self._paths.required_folders("go_term")) args_go = self.args_container.container_goterm( self._args.annotation_files, self._paths.goterm_output_folder, self._args.uniprot_id, self._args.go_obo, self._args.goslim_obo, self._args.transcript_files) goterm = GoTermFinding(args_go) goterm.run_go_term(args_go) def srna_target(self): """sRNA target prediction""" print("Running sRNA target prediction") self.check_multi_files( [self._args.fasta_files, self._args.srna_files, self._args.annotation_files], ["--fasta_files", "--srna_files", "--annotation_files"]) if "RNAup" in self._args.program: self._args.rnaup_path = self.check_execute_file( self._args.rnaup_path) if "RNAplex" in self._args.program: 
for prop in ("rnaplfold_path", "rnaplex_path"): setattr(self._args, prop, self.check_execute_file(getattr(self._args, prop))) if "IntaRNA" in self._args.program: self._args.intarna_path = self.check_execute_file( self._args.intarna_path) if self._args.mode_intarna is None: print("Error: --mode_IntaRNA need to be assigned!") sys.exit() project_creator.create_subfolders( self._paths.required_folders("srna_target")) args_tar = self.args_container.container_srna_target( self._args.rnaplfold_path, self._args.rnaplex_path, self._args.rnaup_path, self._args.intarna_path, self._args.annotation_files, self._args.fasta_files, self._args.srna_files, self._args.query_srnas, self._args.program, self._args.interaction_length, self._args.window_size_target_rnaplex, self._args.span_target_rnaplex, self._args.window_size_srna_rnaplfold, self._args.span_srna_rnaplfold, self._args.unstructured_region_rnaplex_target, self._args.unstructured_region_rnaplex_srna, self._args.unstructured_region_rnaup, self._args.energy_threshold_rnaplex, self._args.duplex_distance_rnaplex, self._args.top, self._paths.starget_output_folder, self._args.parallels_rnaplex, self._args.parallels_rnaup, self._args.parallels_intarna, self._args.continue_rnaup, self._args.slide_window_size_srna_intarna, self._args.max_loop_length_srna_intarna, self._args.slide_window_size_target_intarna, self._args.max_loop_length_target_intarna, self._args.mode_intarna, self._args.potential_target_start, self._args.potential_target_end, self._args.target_feature) srnatarget = sRNATargetPrediction(args_tar) srnatarget.run_srna_target_prediction(args_tar) def snp(self): """SNP transcript detection""" print("Running SNP/mutations calling") self.check_multi_files( [self._args.fasta_files], ["--fasta_files"]) if (self._args.bam_type != "related_genome") and ( self._args.bam_type != "reference_genome"): print("Error: Please assign \"related_genome\" or" " \"reference_genome\" to --bam_type!") sys.exit() if (self._args.ploidy != 
"haploid") and ( self._args.ploidy != "diploid"): print("Error: Please assign \"haploid\" or" " \"diploid\" to --chromosome_type!") if (self._args.caller != "c") and ( self._args.caller != "m"): print("Error: Please assign \"c\" or" " \"m\" to --caller!") for prop in ("bcftools_path", "samtools_path"): setattr(self._args, prop, self.check_execute_file(getattr(self._args, prop))) project_creator.create_subfolders(self._paths.required_folders("snp")) args_snp = self.args_container.container_snp( self._args.samtools_path, self._args.bcftools_path, self._args.bam_type, self._args.program, self._args.fasta_files, self._args.bam_files, self._args.quality, self._args.read_depth_range, self._paths.snp_output_folder, self._args.indel_fraction, self._args.ploidy, self._args.rg_tag, self._args.caller, self._args.filter_tag_info, self._args.dp4_cutoff) snp = SNPCalling(args_snp) snp.run_snp_calling(args_snp) def ppi(self): """PPI network retrieve""" print("Running protein-protein interaction networks prediction") self.check_multi_files([self._args.annotation_files], ["--annotation_files"]) self.check_parameter([self._args.query_strains, self._args.species_string], ["--query_strains", "--species_string"]) project_creator.create_subfolders( self._paths.required_folders("ppi_network")) args_ppi = self.args_container.container_ppi( self._args.annotation_files, self._args.query_strains, self._args.without_strain_pubmed, self._args.species_string, self._args.score, self._paths.ppi_output_folder, self._args.node_size, self._args.query) ppi = PPINetwork(self._paths.ppi_output_folder) ppi.retrieve_ppi_network(args_ppi) def sublocal(self): """Subcellular Localization prediction""" print("Running subcellular localization prediction") self.check_multi_files( [self._args.annotation_files, self._args.fasta_files, self._args.transcript_files], ["--annotation_files", "--fasta_files", "--transcript_files"]) if (self._args.bacteria_type != "positive") and ( self._args.bacteria_type != 
"negative"): print("Error: Please assign \"positive\" or" " \"negative\" to --bacteria_type!") sys.exit() self._args.psortb_path = self.check_execute_file(self._args.psortb_path) project_creator.create_subfolders( self._paths.required_folders("subcellular_localization")) args_sub = self.args_container.container_sublocal( self._args.psortb_path, self._args.annotation_files, self._args.fasta_files, self._args.bacteria_type, self._args.difference_multi, self._paths.sublocal_output_folder, self._args.transcript_files) sublocal = SubLocal(args_sub) sublocal.run_sub_local(args_sub) def ribos(self): """riboswitch and RNA thermometer prediction""" print("Running riboswitch and RNA thermometer prediction") self.check_multi_files( [self._args.annotation_files, self._args.fasta_files, self._args.tss_files, self._args.transcript_files], ["--annotation_files", "--fasta_files", "--tss_files", "--transcript_files"]) if (self._args.program == "both"): self.check_file([self._args.riboswitch_id_file, self._args.rfam_path], ["--riboswitch_id_file", "--rfam_path"], True) self.check_file([self._args.rna_thermometer_id_file, self._args.rfam_path], ["--rna_thermometer_id_file", "--rfam_path"], True) project_creator.create_subfolders( self._paths.required_folders("riboswitch")) project_creator.create_subfolders( self._paths.required_folders("thermometer")) ribos_path = self._paths.ribos_output_folder thermo_path = self._paths.thermo_output_folder elif (self._args.program == "thermometer"): self.check_file([self._args.rna_thermometer_id_file, self._args.rfam_path], ["--thermometer_id_file", "--rfam_path"], True) project_creator.create_subfolders( self._paths.required_folders("thermometer")) ribos_path = None thermo_path = self._paths.thermo_output_folder elif (self._args.program == "riboswitch"): self.check_file([self._args.riboswitch_id_file, self._args.rfam_path], ["--riboswitch_id_file", "--rfam_path"], True) project_creator.create_subfolders( self._paths.required_folders("riboswitch")) 
ribos_path = self._paths.ribos_output_folder thermo_path = None else: print("Error: Please assign \"thermometer\", \"riboswitch\" " "or \"both\" in --program!") sys.exit() self._args.cmscan_path = self.check_execute_file(self._args.cmscan_path) self._args.cmpress_path = self.check_execute_file(self._args.cmpress_path) args_ribo = self.args_container.container_ribos( self._args.program, self._args.rna_thermometer_id_file, self._args.cmscan_path, self._args.cmpress_path, self._args.riboswitch_id_file, self._args.annotation_files, self._args.fasta_files, self._args.tss_files, self._args.transcript_files, self._args.rfam_path, ribos_path, thermo_path, self._args.e_value, self._args.output_all, self._paths.database_folder, self._args.tolerance, self._args.tolerance_rbs, self._args.utr_length) ribos = Ribos(args_ribo) ribos.run_ribos(args_ribo) def crispr(self): """CRISPR prediction""" print("Running CRISPR prediction") self.check_multi_files( [self._args.fasta_files, self._args.annotation_files], ["--fasta_files", "--annotation_files"]) self._args.crt_path = self.check_execute_file(self._args.crt_path) project_creator.create_subfolders( self._paths.required_folders("crispr")) args_cris = self.args_container.container_cris( self._args.fasta_files, self._args.annotation_files, self._args.crt_path, self._args.window_size, self._args.min_number_repeats, self._args.min_length_repeat, self._args.Max_length_repeat, self._args.min_length_spacer, self._args.Max_length_spacer, self._paths.crispr_output_folder, self._args.ignore_hypothetical_protein) cris = Crispr(args_cris) cris.run_crispr(args_cris) def merge(self): """Merge all features""" print("Merging all features to one gff file") merge_folder = os.path.join(self._paths.output_folder, "merge_all_features") self.helper.check_make_folder(merge_folder) other_features = self._args.other_features_files self.check_file([self._args.transcript_file] + other_features, ["--transcript_file", "--other_features_files"], False) 
self.check_parameter([self._args.output_prefix], ["--output_prefix"]) run_merge(merge_folder, self._args.transcript_file, self._args.other_features_files, self._args.terminator_tolerance, self._args.tss_tolerance, os.path.join(merge_folder, self._args.output_prefix)) def screen(self): """generate screenshot""" print("Running screenshot generation") self.check_file([self._args.main_gff, self._args.fasta_file], ["--main_gff", "--fasta_file"], True) if self._args.side_gffs is not None: for gff in (self._args.side_gffs): gff = gff.strip() if not os.path.isfile(gff): print("Error: The --side_gffs do not exist!") sys.exit() if self._args.output_folder is None: print("Error: Please assign --output_folder!") sys.exit() if (self._args.present != "expand") and ( self._args.present != "collapse") and ( self._args.present != "squish"): print("Error: Please assign \"expand\" or " "\"collapse\" or \"squish\" to --present!") sys.exit() args_sc = self.args_container.container_screen( self._args.main_gff, self._args.side_gffs, self._args.fasta_file, self._args.height, self._args.tex_notex_libs, self._args.frag_libs, self._args.present, self._args.output_folder) screen = Screen(args_sc) screen.screenshot(args_sc)
class Terminator(object):
    '''detection of terminator'''

    def __init__(self, args_term):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are produced by Multiparser per-genome splitting.
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # Final gff output folders, one per candidate category.
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        # Matching table (csv) output folders.
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Scratch files/folders; several live in the current working dir.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        """Concatenate the data sections of ptt/rnt files into one file.

        Everything up to and including the "Location" header line of each
        input is skipped; a trailing newline is appended when an input's
        last line lacks one.
        """
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        """Create the per-category gff and table output folders."""
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        """Convert each gff to .rnt/.ptt (TransTermHP input format).

        Returns (file_types, prefixs): per-genome "srna"/"normal" flag
        and the list of genome prefixes.
        """
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(fasta_path, ".fa",
                                                     prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(gff_file, fasta,
                                                      ptt_file, rnt_file,
                                                      None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        """Merge each genome's ptt/rnt (and sRNA rnt) into one .ptt file."""
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        """Invoke the TransTermHP binary for one genome."""
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join(
                  [prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        """Run TransTermHP for every combined .ptt file."""
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                                     prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        """Convert TransTermHP .bag outputs to gff and combine them."""
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        """Collect TEX and fragmented wig files into one folder.

        Returns the folder holding all wigs; exits if none were given.
        """
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        """Merge sRNA annotations into the genome gffs, when provided.

        Returns the folder holding the (possibly merged) gff files.
        """
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path,
                                                    prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        """Sort per-genome term gffs and build the all_candidates outputs."""
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                # Fix: dropped a redundant nested os.path.join wrapper.
                csv_file = os.path.join(
                    self.csvs["all"], "_".join([prefix,
                                                self.suffixs["csv"]]))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                              "Strand", "Detect", "Coverage_decrease",
                              "Coverage_detail"]) + "\n")
                out_csv.close()
                # Append each strain's raw table once, in gff order.
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        """Fold the extracted intergenic sequences with RNAfold.

        Runs inside a scratch folder so RNAfold's auxiliary outputs do
        not litter the working directory.
        """
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                            ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        '''the approach for searching gene converged region terminator'''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            # Fix: removed a duplicated "tmp_cand = tmp_cand =" assignment.
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                intergenic_seq(os.path.join(
                    self.fasta_path, prefix + ".fa"), tran_file, gff_file,
                    tmp_seq, tmp_index, args_term)
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix)
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
            print("Detecting terminators for " + prefix)
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete all scratch folders/files created during the run."""
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        """Assign IDs/Names to terminators and split into categories."""
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (
                        entry.seq_id + "_terminator" + str(num))
                    entry.attributes["Name"] = "_".join(
                        ["terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items)
                        for items in entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                  entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"],
                            os.path.join(self.terms["all"], "_".join([
                                new_prefix, self.suffixs["gff"]])))
        stat_path = os.path.join(args_term.out_folder, "statistics")
        for prefix in new_prefixs:
            stat_term(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                      "_".join([prefix, self.suffixs["csv"]])),
                      os.path.join(stat_path,
                      "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                                   "_".join([prefix, "term"])))
            shutil.move(os.path.join(self.terms["best"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["best"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["express"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["express"],
                        "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["non"],
                        "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["non"],
                        "_".join([prefix, self.suffixs["csv"]])))
            os.remove(os.path.join(self.terms["all"],
                      "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        """Validate gff attribute uniqueness for every gff in folder."""
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs):
        '''searching the associated terminator to transcript'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        # The prefixs parameter is intentionally ignored and rebuilt from
        # the transcript folder (kept for interface compatibility).
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" +
                        prefix + ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join([
                            "stat_compare_terminator_transcript", prefix,
                            type_ + ".csv"])))

    def run_terminator(self, args_term):
        """Entry point: run the full terminator detection pipeline."""
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                      self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term, prefixs)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class GoTermFinding(object):
    '''Retrieving the GO term'''

    def __init__(self, args_go):
        self.multiparser = Multiparser()
        self.helper = Helper()
        # Two parallel result trees: every CDS, and expressed CDSs only.
        self.out_all = os.path.join(args_go.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_go.out_folder,
                                        "expressed_CDSs")
        self.result_all_path = os.path.join(self.out_all, "GO_term_results")
        self.result_express_path = os.path.join(self.out_express,
                                                "GO_term_results")
        self.gff_path = os.path.join(args_go.gffs, "tmp")
        if args_go.trans is not None:
            self.tran_path = os.path.join(args_go.trans, "tmp")
        else:
            self.tran_path = None
        self.stat_all_path = os.path.join(self.out_all, "statistics")
        self.stat_express_path = os.path.join(self.out_express,
                                              "statistics")
        # Name of the merged per-genome-folder result file.
        self.all_strain = "all_genomes_uniprot.csv"

    def _retrieve_go(self, uniprot, out_path, type_, log):
        """Extract GO terms for every genome gff from the UniProt table."""
        prefixs = []
        log.write("Running gene_ontology.py to retrieve GO terms.\n")
        for gff in os.listdir(self.gff_path):
            prefix = gff.replace(".gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(out_path, prefix))
            out_file = os.path.join(out_path, prefix,
                                    "_".join([prefix, "uniprot.csv"]))
            print("Extracting GO terms of {0} from UniProt".format(prefix))
            if self.tran_path is not None:
                tran_file = os.path.join(
                    self.tran_path, "_".join([prefix, "transcript.gff"]))
            else:
                tran_file = None
            retrieve_uniprot(uniprot, os.path.join(self.gff_path, gff),
                             out_file, tran_file, type_)
            log.write("\t" + out_file + " is generated.\n")

    def _remove_header(self, out_all):
        """Deduplicate header rows after merging, keeping one on top."""
        out = open(out_all + "_tmp", "w")
        fh = open(out_all, "r")
        out.write("\t".join(["Genome", "Strand", "Start", "End",
                             "Protein_id", "Go_term"]) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[0] != "Genome":
                out.write("\t".join(row) + "\n")
        out.close()
        fh.close()
        shutil.move(out_all + "_tmp", out_all)

    def _merge_files(self, gffs, out_path, out_folder, log):
        '''merge the files according to the input genome folder'''
        folders = []
        log.write("Merging the output files based on the input genome "
                  "information.\n")
        for folder in os.listdir(gffs):
            if folder.endswith("gff_folder"):
                folder_prefix = folder.replace(".gff_folder", "")
                folder_path = os.path.join(out_folder, folder_prefix)
                self.helper.check_make_folder(folder_path)
                folders.append(folder_path)
                filenames = []
                for gff in os.listdir(os.path.join(gffs, folder)):
                    if gff.endswith(".gff"):
                        filenames.append(gff.replace(".gff", ""))
                out_all = os.path.join(folder_path, self.all_strain)
                if len(filenames) > 1:
                    if self.all_strain in os.listdir(folder_path):
                        os.remove(out_all)
                    for filename in filenames:
                        csv_file = "_".join([filename, "uniprot.csv"])
                        self.helper.merge_file(os.path.join(
                            out_path, filename, csv_file), out_all)
                        self._remove_header(out_all)
                        shutil.copy(os.path.join(out_path, filename,
                                                 csv_file),
                                    folder_path)
                else:
                    # Single-genome folder: the merged file is just a copy.
                    shutil.copyfile(os.path.join(
                        out_path, filenames[0],
                        "_".join([filenames[0], "uniprot.csv"])), out_all)
        self.helper.remove_all_content(out_path, None, "dir")
        self.helper.remove_all_content(out_path, None, "file")
        for folder in folders:
            folder_prefix = folder.split("/")[-1]
            shutil.move(folder, os.path.join(out_path, folder_prefix))
            for file_ in os.listdir(os.path.join(out_path, folder_prefix)):
                log.write("\t" + os.path.join(out_path, folder_prefix,
                                              file_) +
                          " is generated.\n")

    def _stat(self, out_path, stat_path, go, goslim, out_folder, log):
        """Map GO terms to GOslim, write stats and move figures."""
        log.write("Running gene_ontology.py to Retrieve GOslim terms and "
                  "do statistics.\n")
        log.write("The following files are generated:\n")
        for folder in os.listdir(out_path):
            strain_stat_path = os.path.join(stat_path, folder)
            self.helper.check_make_folder(strain_stat_path)
            fig_path = os.path.join(strain_stat_path, "figs")
            # Fix: the guard previously tested "fig", so an existing
            # "figs" folder made os.mkdir raise FileExistsError.
            if "figs" not in os.listdir(strain_stat_path):
                os.mkdir(fig_path)
            stat_file = os.path.join(strain_stat_path,
                                     "_".join(["stat", folder + ".csv"]))
            map2goslim(goslim, go,
                       os.path.join(out_path, folder, self.all_strain),
                       stat_file, out_folder)
            log.write("\t" + stat_file + "\n")
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_three_roots.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_molecular_function.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_cellular_component.png"])
            self.helper.move_all_content(out_folder, fig_path,
                                         ["_biological_process.png"])
            for file_ in os.listdir(fig_path):
                log.write("\t" + os.path.join(fig_path, file_) + "\n")

    def run_go_term(self, args_go, log):
        """Entry point: retrieve, merge and summarize GO terms."""
        for gff in os.listdir(args_go.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_go.gffs, gff))
        self.multiparser.parser_gff(args_go.gffs, None)
        if args_go.trans is not None:
            self.multiparser.parser_gff(args_go.trans, "transcript")
        print("Computing all CDSs")
        log.write("Retrieving GO terms for all CDSs.\n")
        self._retrieve_go(args_go.uniprot, self.result_all_path, "all", log)
        self._merge_files(args_go.gffs, self.result_all_path, self.out_all,
                          log)
        self._stat(self.result_all_path, self.stat_all_path, args_go.go,
                   args_go.goslim, self.out_all, log)
        if args_go.trans is not None:
            log.write("Retrieving GO terms only for expressed CDSs.\n")
            print("Computing express CDSs")
            self._retrieve_go(args_go.uniprot, self.result_express_path,
                              "express", log)
            self._merge_files(args_go.gffs, self.result_express_path,
                              self.out_express, log)
            self._stat(self.result_express_path, self.stat_express_path,
                       args_go.go, args_go.goslim, self.out_express, log)
        self.helper.remove_tmp_dir(args_go.gffs)
        if args_go.trans is not None:
            self.helper.remove_tmp_dir(args_go.trans)
class Ribos(object):
    '''Riboswitch detection: extract candidate regions, scan them against
    an Rfam riboswitch model set with Infernal (cmscan), and merge the
    per-strain results into tables, gffs and statistics.'''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are produced by Multiparser per-genome splitting.
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        self.stat_folder = os.path.join(args_ribo.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs")
        self.table_folder = os.path.join(args_ribo.out_folder, "tables")
        self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam")
        # Covariance-model file built from Rfam for the requested IDs.
        self.ribos_rfam = os.path.join(args_ribo.database,
                                       "Rfam_riboswitch.cm")
        # Scratch folders for intermediate fasta/scan/table files.
        self.tmp_files = {"fasta": os.path.join(
                                   args_ribo.out_folder, "tmp_fasta"),
                          "scan": os.path.join(
                                  args_ribo.out_folder, "tmp_scan"),
                          "table": os.path.join(
                                   args_ribo.out_folder, "tmp_table")}
        # File-name suffixes keyed by stage (prescan vs. rescan).
        self.suffixs = {"csv": "riboswitch.csv",
                        "txt": "riboswitch_prescan.txt",
                        "re_txt": "riboswitch_scan.txt",
                        "re_csv": "riboswitch_scan.csv"}

    def _run_infernal(self, args_ribo, seq, type_, prefix):
        """Run cmscan on seq; return the path of the scan output file.

        type_ selects the output suffix ("txt" for the pre-scan,
        "re_txt" for the second scan).
        """
        scan_file = os.path.join(self.tmp_files["scan"],
                                 "_".join([prefix, self.suffixs[type_]]))
        scan = open(scan_file, "w")
        call([os.path.join(args_ribo.infernal_path, "cmscan"), "--incE",
              str(args_ribo.e_value), "--acc", self.ribos_rfam, seq],
             stdout=scan)
        scan.close()
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo):
        """Two-pass scan per genome: pre-scan candidate regions, regenerate
        the hit sequences, rescan them, and build the result table.

        Appends each genome prefix to prefixs and returns it.
        """
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(self.tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("extracting seq of riboswitch candidates of "
                      "{0}".format(prefix))
                # Candidate regions are taken near TSSs/transcripts.
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.tss_path, prefix + "_TSS.gff"),
                    os.path.join(self.tran_path,
                                 prefix + "_transcript.gff"),
                    first_seq, args_ribo)
                print("pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_infernal(args_ribo, first_seq,
                                                     "txt", prefix)
                sec_seq = os.path.join(self.tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["csv"]]))
                # Rebuild a fasta holding only the pre-scan hits.
                regenerate_seq(first_scan_file, first_seq, first_table,
                               sec_seq)
                print("scanning of {0}".format(prefix))
                sec_scan_file = self._run_infernal(args_ribo, sec_seq,
                                                   "re_txt", prefix)
                sec_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                # The refined table replaces the pre-scan table.
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo):
        """Merge per-strain tables/scan files into per-genome outputs and
        convert them to gff plus a statistics file."""
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merge results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    self.scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                # Walk the gff; act once per distinct strain (seq_id).
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        # First strain starts the table; later strains
                        # are appended to it.
                        if len(pre_strain) == 0:
                            shutil.copyfile(os.path.join(
                                self.tmp_files["table"],
                                "_".join([entry.seq_id,
                                          self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix,
                                              self.suffixs["csv"]])))
                        else:
                            self.helper.merge_file(os.path.join(
                                self.tmp_files["table"],
                                "_".join([entry.seq_id,
                                          self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix,
                                              self.suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            self.tmp_files["scan"],
                            "_".join([entry.seq_id, self.suffixs["txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        shutil.copy(os.path.join(
                            self.tmp_files["scan"],
                            "_".join([entry.seq_id,
                                      self.suffixs["re_txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                    self.stat_folder,
                    "_".join(["stat", prefix, "riboswitch.txt"]))
                print("compute statistics of {0}".format(prefix))
                stat_and_covert2gff(os.path.join(
                    self.table_folder,
                    "_".join([prefix, self.suffixs["csv"]])),
                    args_ribo.ribos_id, os.path.join(
                        self.gff_outfolder,
                        "_".join([prefix, "riboswitch.gff"])),
                    args_ribo.fuzzy, out_stat)
                fh.close()

    def _remove_tmp(self, args_ribo):
        """Delete the scratch folders created during the run."""
        self.helper.remove_tmp(args_ribo.gffs)
        self.helper.remove_tmp(args_ribo.fastas)
        self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir")

    def _remove_overlap(self, gff_path):
        """Drop table entries that overlap annotated features."""
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(os.path.join(os.path.join(
                    self.tmp_files["table"],
                    "_".join([gff.replace(".gff", ""),
                              self.suffixs["csv"]]))),
                    os.path.join(gff_path, gff))

    def run_ribos(self, args_ribo):
        """Entry point: validate inputs, build/compress the Rfam model,
        scan every genome and merge the results."""
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_ribo.gffs, gff))
        # Extract the requested riboswitch models from the Rfam database.
        rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam, self.ribos_rfam)
        print("compressing Rfam...")
        # cmpress -F overwrites any stale index files.
        call([os.path.join(args_ribo.infernal_path, "cmpress"),
              "-F", self.ribos_rfam])
        prefixs = []
        self.helper.check_make_folder(self.tmp_files["fasta"])
        self.helper.check_make_folder(self.tmp_files["scan"])
        self.helper.check_make_folder(self.tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo)
        self._remove_overlap(self.gff_path)
        self._merge_results(args_ribo)
        mapping_ribos(self.table_folder, args_ribo.ribos_id)
        self._remove_tmp(args_ribo)
class CircRNADetection(object):
    '''Detection of circRNA via segemehl split-read alignment
    (testrealign.x) and comparison against the genome annotation.'''

    def __init__(self, args_circ):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # Output sub-folders for alignments, splice-site results, candidate
        # tables and gff files.
        self.alignment_path = os.path.join(args_circ.output_folder,
                                           "segemehl_alignment_files")
        self.splice_path = os.path.join(args_circ.output_folder,
                                        "segemehl_splice_results")
        self.candidate_path = os.path.join(args_circ.output_folder,
                                           "circRNA_tables")
        self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
        # "tmp" folders are produced by the multiparser (per-strain files).
        self.gff_path = os.path.join(args_circ.gffs, "tmp")
        # Fixed filename fragments of testrealign.x output bed files.
        self.splices = {"file": "splicesites.bed",
                        "splice": "splicesites"}
        self.trans = {"file": "transrealigned.bed",
                      "trans": "transrealigned"}
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")

    def _wait_process(self, processes):
        '''wait for the parallels to finish the process'''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            # The process already exited (wait() returned); kill() is a
            # belt-and-braces cleanup and may raise if it is already gone.
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _deal_zip_file(self, read_files, log):
        '''Uncompress .bz2/.gz read files; returns the sample dicts with
        the uncompressed filenames added ("files") and the list of files
        created here ("zips") so they can be deleted afterwards.'''
        tmp_datas = []
        tmp_reads = []
        for reads in read_files:
            zips = []
            # NOTE(review): tmp_datas aliases reads["files"] and is appended
            # to while the same list is being iterated below. It terminates
            # because the appended names never end with .bz2/.gz, but the
            # original sample dict is mutated in place — confirm intended.
            tmp_datas = reads["files"]
            for read in reads["files"]:
                if read.endswith(".bz2"):
                    mod_read = read.replace(".bz2", "")
                    # If the stripped name carries no fasta/fastq suffix,
                    # default to .fa.
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["bzcat", read]) + "\n")
                    call(["bzcat", read], stdout=read_out)
                    log.write("\t" + mod_read + " is generated.\n")
                    read_out.close()
                elif read.endswith(".gz"):
                    mod_read = read.replace(".gz", "")
                    if (".fa" not in mod_read) and (
                            ".fasta" not in mod_read) and (
                            ".fna" not in mod_read) and (
                            ".fq" not in mod_read) and (
                            ".fastq" not in mod_read):
                        mod_read = mod_read + ".fa"
                    read_out = open(mod_read, "w")
                    tmp_datas.append(mod_read)
                    zips.append(mod_read)
                    print(" ".join(["Uncompressing", read]))
                    log.write(" ".join(["zcat", read]) + "\n")
                    call(["zcat", read], stdout=read_out)
                    read_out.close()
                    log.write("\t" + mod_read + " is generated.\n")
            tmp_reads.append({"sample": reads["sample"],
                              "files": tmp_datas, "zips": zips})
        return tmp_reads

    def _run_segemehl_fasta_index(self, segemehl_path, fasta_path,
                                  index, fasta, log):
        '''Build the segemehl index (.idx) for one fasta file.'''
        log.write(" ".join([segemehl_path,
                            "-x", os.path.join(fasta_path, index),
                            "-d", os.path.join(fasta_path, fasta)]) + "\n")
        call([segemehl_path,
              "-x", os.path.join(fasta_path, index),
              "-d", os.path.join(fasta_path, fasta)])

    def _run_segemehl_align(self, args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log):
        '''Start one segemehl alignment (split-read mode, -S) as a
        background process; returns the Popen handle.'''
        out = open(os.path.join(self.alignment_path,
                                fasta_prefix, sam_file), "w")
        # NOTE(review): this rebinds the "log" parameter to the per-read
        # segemehl log file, so the command line below is written to that
        # file (and to segemehl's stderr target), not to the main pipeline
        # log passed in — confirm this shadowing is intended.
        log = open(os.path.join(self.alignment_path,
                                fasta_prefix, log_file), "w")
        log.write(" ".join([args_circ.segemehl_path,
                            "-i", os.path.join(self.fasta_path, index),
                            "-d", os.path.join(self.fasta_path, fasta),
                            "-q", read, "-S"]) + "\n")
        p = Popen([args_circ.segemehl_path,
                   "-i", os.path.join(self.fasta_path, index),
                   "-d", os.path.join(self.fasta_path, fasta),
                   "-q", read, "-S"],
                  stdout=out, stderr=log)
        return p

    def _align(self, args_circ, read_datas, log):
        '''align the read.
        if the bam files are provided, it can be skipped.'''
        prefixs = []
        align_files = []
        log.write("Using segemehl to align the read.\n")
        log.write("Please make sure the version of "
                  "segemehl is at least 0.1.9.\n")
        for fasta in os.listdir(self.fasta_path):
            index = fasta.replace(".fa", ".idx")
            self._run_segemehl_fasta_index(args_circ.segemehl_path,
                                           self.fasta_path,
                                           index, fasta, log)
            processes = []
            num_process = 0
            fasta_prefix = fasta.replace(".fa", "")
            prefixs.append(fasta_prefix)
            self.helper.check_make_folder(os.path.join(
                self.alignment_path, fasta_prefix))
            log.write("Running for {0}.\n".format(fasta_prefix))
            for reads in read_datas:
                for read in reads["files"]:
                    num_process += 1
                    read_name = read.split("/")[-1]
                    # Only plain fasta/fastq files are aligned; compressed
                    # files were already expanded by _deal_zip_file.
                    if read_name.endswith(".fa") or \
                       read_name.endswith(".fna") or \
                       read_name.endswith(".fasta") or \
                       read_name.endswith(".fq") or \
                       read_name.endswith(".fastq"):
                        filename = read_name.split(".")
                        read_prefix = ".".join(filename[:-1])
                        sam_file = "_".join([read_prefix,
                                             fasta_prefix + ".sam"])
                        log_file = "_".join([read_prefix,
                                             fasta_prefix + ".log"])
                        align_files.append("_".join([read_prefix,
                                                     fasta_prefix]))
                        print("Mapping {0}".format(sam_file))
                        p = self._run_segemehl_align(
                            args_circ, index, fasta, read,
                            sam_file, log_file, fasta_prefix, log)
                        processes.append(p)
                        # Throttle: run at most args_circ.cores alignments
                        # in parallel.
                        if num_process == args_circ.cores:
                            self._wait_process(processes)
                            num_process = 0
            self._wait_process(processes)
            log.write("Done!\n")
            log.write("The following files are generated in {0}:\n".format(
                os.path.join(self.alignment_path, fasta_prefix)))
            for file_ in os.listdir(os.path.join(
                    self.alignment_path, fasta_prefix)):
                log.write("\t" + file_ + "\n")
        return align_files, prefixs

    def _run_samtools_convert_bam(self, samtools_path, pre_sam,
                                  out_bam, log):
        '''Convert one SAM file to BAM via "samtools view -bS".'''
        log.write(" ".join([samtools_path, "view",
                            "-bS", pre_sam, "-o", out_bam]) + "\n")
        call([samtools_path, "view", "-bS", pre_sam, "-o", out_bam])

    def _convert_sam2bam(self, sub_alignment_path, samtools_path,
                         align_files, log):
        '''Convert all SAM files of one genome to BAM. Returns the BAM
        list plus bookkeeping lists: convert_ones (BAMs created here from
        pre-existing SAMs, to delete later) and remove_ones (SAMs that
        came from our own alignment, to delete later).'''
        bam_files = []
        convert_ones = []
        remove_ones = []
        log.write("Using Samtools to convert SAM files to BAM files.\n")
        log.write("Please make sure the version of "
                  "Samtools is at least 1.3.1.\n")
        for sam in os.listdir(sub_alignment_path):
            pre_sam = os.path.join(sub_alignment_path, sam)
            if sam.endswith(".sam"):
                bam_file = sam.replace(".sam", ".bam")
                print("Converting {0} to {1}".format(sam, bam_file))
                out_bam = os.path.join(sub_alignment_path, bam_file)
                self._run_samtools_convert_bam(samtools_path, pre_sam,
                                               out_bam, log)
                bam_files.append(out_bam)
                if align_files:
                    if bam_file.replace(".bam", "") not in align_files:
                        convert_ones.append(out_bam)
                    else:
                        remove_ones.append(pre_sam)
            elif sam.endswith(".bam"):
                if (pre_sam not in convert_ones) and (
                        pre_sam not in remove_ones):
                    bam_files.append(pre_sam)
            elif sam.endswith(".log"):
                os.remove(pre_sam)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_alignment_path):
            if file_.endswith(".bam"):
                log.write("\t" + os.path.join(
                    sub_alignment_path, file_) + "\n")
        return bam_files, convert_ones, remove_ones

    def _run_samtools_merge_sort(self, samtools_path, prefix,
                                 out_folder, bam_datas, log):
        '''For each sample: merge its BAMs, sort the result, and convert
        the sorted BAM back to SAM (testrealign.x consumes SAM).'''
        log.write("Using Samtools for merging, sorting and converting "
                  "the BAM files.\n")
        log.write("Make sure the version Samtools is at least 1.3.1.\n")
        for bam_data in bam_datas:
            print("Merging bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sample_bam = os.path.join(out_folder, "_".join([
                prefix, bam_data["sample"] + ".bam"]))
            if len(bam_data["files"]) <= 1:
                # Nothing to merge; just stage a copy under the merged name.
                shutil.copyfile(bam_data["files"][0], sample_bam)
            else:
                # NOTE(review): merge is launched through os.system with a
                # whitespace-joined command string — paths containing spaces
                # or shell metacharacters would break/misbehave here.
                file_line = " ".join(bam_data["files"])
                log.write(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]) + "\n")
                os.system(" ".join([samtools_path, "merge",
                                    sample_bam, file_line]))
            print("Sorting bam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            sort_sample = os.path.join(out_folder,
                                       "_".join([prefix,
                                                 bam_data["sample"] +
                                                 "_sort.bam"]))
            log.write(" ".join([samtools_path, "sort",
                                "-o", sort_sample, sample_bam]) + "\n")
            call([samtools_path, "sort", "-o", sort_sample, sample_bam])
            os.remove(sample_bam)
            print("Converting bam files to sam files for {0} of {1}".format(
                prefix, bam_data["sample"]))
            log.write(" ".join([samtools_path, "view", "-h", "-o",
                                sort_sample.replace(".bam", ".sam"),
                                sort_sample]) + "\n")
            call([samtools_path, "view", "-h", "-o",
                  sort_sample.replace(".bam", ".sam"), sort_sample])
        log.write("Done!\n")
        # NOTE(review): only the last sample's SAM is logged here because
        # sort_sample leaks out of the loop — confirm whether per-sample
        # logging was intended.
        log.write("\t" + sort_sample.replace(".bam", ".sam") +
                  " is generated.\n")

    def _merge_sort_aligment_file(
            self, bam_datas, read_datas, samtools_path,
            out_folder, convert_ones, tmp_reads, remove_ones, prefix, log):
        '''Assemble the per-sample BAM lists (from --bam_files and/or the
        alignments produced here), merge/sort/convert them, then delete
        the intermediate files recorded in convert_ones/remove_ones.'''
        if bam_datas is None:
            # Reads only: every BAM comes from our own alignment step.
            merge_bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    if read.endswith(".gz") or read.endswith(".bz2"):
                        read = ".".join(
                            read.split("/")[-1].split(".")[:-1])
                    read_prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix,
                        "_".join([read_prefix, prefix + ".bam"])))
                merge_bam_datas.append({"sample": read_data["sample"],
                                        "files": bam_files})
        elif (bam_datas is not None) and (read_datas is not None):
            # Both provided: extend the user BAM lists with the BAMs we
            # aligned for the matching sample.
            merge_bam_datas = copy.deepcopy(bam_datas)
            for bam_data in merge_bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            read_prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(
                                self.alignment_path, prefix,
                                "_".join([read_prefix, prefix + ".bam"]))
                            if (bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            # BAMs only.
            merge_bam_datas = copy.deepcopy(bam_datas)
        self._run_samtools_merge_sort(samtools_path, prefix,
                                      out_folder, merge_bam_datas, log)
        for bam in convert_ones:
            os.remove(bam)
        for sam in remove_ones:
            os.remove(sam)

    def _run_testrealign(self, prefix, testrealign_path, out_folder, log):
        '''Run testrealign.x on every sorted SAM of this genome to produce
        the splicesites/transrealigned bed files, then remove the SAMs.'''
        log.write("Using Segemehl to detect circular RNAs.\n")
        log.write("Please make sure the version of Segemehl "
                  "is at least 0.1.9.\n")
        log.write("Please make sure your testrealign.x exists. "
                  "If it does not "
                  "exists, please reinstall your Segemehl via "
                  "using make all.\n")
        sub_splice_path = os.path.join(self.splice_path, prefix)
        if not os.path.exists(sub_splice_path):
            os.mkdir(sub_splice_path)
        err_log = os.path.join(sub_splice_path, prefix + ".log")
        print("Running testrealign.x for {0}".format(prefix))
        for sam_file in os.listdir(out_folder):
            if sam_file.endswith("sort.sam"):
                sample_prefix = sam_file.replace("_sort.sam", "")
                # Command assembled as one shell string; stderr goes to
                # the per-genome err_log.
                command = " ".join([
                    testrealign_path,
                    "-d", os.path.join(self.fasta_path, prefix + ".fa"),
                    "-q", os.path.join(out_folder, sam_file), "-n",
                    "-U", os.path.join(sub_splice_path,
                                       sample_prefix + "_splicesites.bed"),
                    "-T", os.path.join(
                        sub_splice_path,
                        sample_prefix + "_transrealigned.bed")])
                log.write(command + " 2>" + err_log + "\n")
                os.system(command + " 2>" + err_log)
        log.write("Done!\n")
        log.write("The following files are generated:\n")
        for file_ in os.listdir(sub_splice_path):
            log.write("\t" + os.path.join(sub_splice_path, file_) + "\n")
        self.helper.remove_all_content(out_folder, ".sam", "file")

    def _merge_bed(self, fastas, splice_path, output_folder):
        '''Merge the bed files for analysis'''
        fa_prefixs = []
        for fasta in os.listdir(fastas):
            headers = []
            if (fasta.endswith(".fa") or fasta.endswith(".fna") or
                    fasta.endswith(".fasta")):
                # Collect every header (strain) of this fasta so the
                # per-strain bed files can be gathered together.
                with open(os.path.join(fastas, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            headers.append(line[1:])
                filename = fasta.split(".")
                fasta_prefix = ".".join(filename[:-1])
                fa_prefixs.append(fasta_prefix)
                bed_folder = os.path.join(
                    output_folder, fasta_prefix)
                self.helper.check_make_folder(bed_folder)
                samples = []
                for header in headers:
                    for splice in os.listdir(os.path.join(
                            splice_path, header)):
                        if splice.endswith(".bed"):
                            # Derive the sample tag from the splicesites
                            # filename (strip strain name and fixed suffix).
                            if self.splices["file"] in splice:
                                sample = splice.replace(header, "")
                                sample = sample.replace(
                                    self.splices["file"], "")
                                if sample not in samples:
                                    samples.append(sample)
                            shutil.copyfile(
                                os.path.join(splice_path, header, splice),
                                os.path.join(bed_folder, "tmp_" + splice))
                for sample in samples:
                    out_splice = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.splices["file"]]))
                    out_trans = os.path.join(bed_folder, "".join([
                        fasta_prefix + sample + self.trans["file"]]))
                    # Start fresh so reruns do not append to stale output.
                    if os.path.exists(out_splice):
                        os.remove(out_splice)
                    if os.path.exists(out_trans):
                        os.remove(out_trans)
                    for file_ in os.listdir(bed_folder):
                        if (self.splices["splice"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                bed_folder, file_), out_splice)
                        elif (self.trans["trans"] in file_) and (
                                sample in file_):
                            self.helper.merge_file(os.path.join(
                                bed_folder, file_), out_trans)
        self.helper.remove_all_content(splice_path, None, "dir")
        # NOTE(review): "samples" is the list from the last fasta iterated;
        # callers appear to assume all fastas share the same sample set.
        return samples, fa_prefixs

    def _stat_and_gen_gff(self, prefixs, samples, args_circ, log):
        '''do statistics and print the result to gff file'''
        log.write("Running circRNA.py to do statistics "
                  "and generate gff files.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            self.helper.check_make_folder(os.path.join(self.gff_folder,
                                                       prefix))
            self.helper.check_make_folder(os.path.join(self.splice_path,
                                                       prefix))
            # Re-stage the merged (non-tmp) bed files under splice_path.
            for bed in os.listdir(os.path.join(
                    args_circ.output_folder, prefix)):
                if (bed.split("_")[0] != "tmp") and (bed.endswith(".bed")):
                    shutil.copy(
                        os.path.join(args_circ.output_folder, prefix, bed),
                        os.path.join(self.splice_path, prefix))
            self.helper.check_make_folder(os.path.join(
                self.candidate_path, prefix))
            print("Comparing circular RNAs with annotations of {0}".format(
                prefix))
            for sample in samples:
                splice_file = os.path.join(
                    self.splice_path, prefix,
                    "".join([prefix, sample, self.splices["file"]]))
                stat_file = os.path.join(
                    args_circ.stat_folder,
                    "".join(["stat_", prefix, sample, "circRNA.csv"]))
                csv_all = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_all.csv"]))
                csv_best = os.path.join(
                    self.candidate_path, prefix,
                    "".join([prefix, sample, "circRNA_best.csv"]))
                gff_all = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_all.gff"]))
                gff_best = os.path.join(
                    self.gff_folder, prefix,
                    "".join([prefix, sample, "circRNA_best.gff"]))
                # Compare splice candidates with the annotation and write
                # the full table, then convert it to gff (all + best).
                detect_circrna(splice_file, os.path.join(
                    self.gff_path, prefix + ".gff"), csv_all,
                    args_circ, stat_file)
                self.converter.convert_circ2gff(
                    os.path.join(self.candidate_path, prefix,
                                 "".join([prefix, sample,
                                          "circRNA_all.csv"])),
                    args_circ, gff_all, gff_best)
                log.write("\t" + stat_file + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")

    def _extract_input_files(self, inputs):
        '''Parse "sample:file1,file2" strings from --bam_files/--read_files
        into [{"sample": ..., "files": [...]}], validating that every file
        exists.'''
        input_datas = []
        for input_ in inputs:
            datas = input_.split(":")
            if len(datas) != 2:
                print("Error: the format of --bam_files or "
                      "--read_files is wrong!")
                sys.exit()
            for file_ in datas[-1].split(","):
                if not os.path.exists(file_):
                    print("Error: some files in --bam_files or "
                          "--read_files do not exist!")
                    sys.exit()
            input_datas.append({"sample": datas[0],
                                "files": datas[-1].split(",")})
        return input_datas

    def _combine_read_bam(self, bam_files, bam_datas, read_datas):
        '''Attach the BAMs aligned from raw reads to the matching sample's
        BAM list (or build the lists from scratch if no BAMs were given).'''
        if bam_datas is not None:
            for bam_data in bam_datas:
                for read_data in read_datas:
                    if bam_data["sample"] == read_data["sample"]:
                        for read in read_data["files"]:
                            prefix = ".".join(
                                read.split("/")[-1].split(".")[:-1])
                            bam = os.path.join(self.alignment_path,
                                               prefix + ".bam")
                            if (bam in bam_files) and (
                                    bam not in bam_data["files"]):
                                bam_data["files"].append(bam)
        else:
            bam_datas = []
            for read_data in read_datas:
                bam_files = []
                for read in read_data["files"]:
                    prefix = ".".join(
                        read.split("/")[-1].split(".")[:-1])
                    bam_files.append(os.path.join(
                        self.alignment_path, prefix + ".bam"))
                bam_datas.append({"sample": read_data["sample"],
                                  "files": bam_files})
        return bam_datas

    def _remove_tmp_files(self, args_circ, fa_prefixs):
        '''Delete multiparser tmp dirs, intermediate BAMs and the
        per-genome working folders.'''
        self.helper.remove_tmp_dir(args_circ.fastas)
        self.helper.remove_tmp_dir(args_circ.gffs)
        self.helper.remove_all_content(args_circ.output_folder,
                                       ".bam", "file")
        for prefix in fa_prefixs:
            shutil.rmtree(os.path.join(args_circ.output_folder, prefix))

    def run_circrna(self, args_circ, log):
        '''detection of circRNA'''
        bam_datas = None
        read_datas = None
        if (args_circ.bams is None) and (args_circ.read_files is None):
            log.write("--bam_files and --read_files can not be "
                      "both emtpy.\n")
            print("Error: --bam_files or --read_files should be assigned.")
            sys.exit()
        if args_circ.bams is not None:
            bam_datas = self._extract_input_files(args_circ.bams)
        if args_circ.read_files is not None:
            read_datas = self._extract_input_files(args_circ.read_files)
        for gff in os.listdir(args_circ.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_circ.gffs, gff))
        if args_circ.segemehl_path is None:
            log.write("segemehl does not exists.\n")
            print("Error: please assign segemehl path!!")
            sys.exit()
        # Split multi-record inputs into per-strain tmp files.
        self.multiparser.parser_fasta(args_circ.fastas)
        self.multiparser.parser_gff(args_circ.gffs, None)
        self.multiparser.combine_gff(args_circ.fastas, self.gff_path,
                                     "fasta", None)
        tmp_reads = []
        if args_circ.read_files:
            # Raw reads: uncompress if needed and align with segemehl.
            log.write("Raw read files are found.\n")
            tmp_reads = self._deal_zip_file(read_datas, log)
            align_files, prefixs = self._align(args_circ, tmp_reads, log)
        else:
            # BAM-only mode: derive the genome prefixes from the fastas.
            align_files = None
            prefixs = []
            for fasta in os.listdir(self.fasta_path):
                if fasta.endswith(".fa"):
                    fasta_prefix = fasta.replace(".fa", "")
                    prefixs.append(fasta_prefix)
        for prefix in prefixs:
            if args_circ.read_files:
                sub_alignment_path = os.path.join(self.alignment_path,
                                                  prefix)
                bam_files, convert_ones, remove_ones = \
                    self._convert_sam2bam(
                        sub_alignment_path, args_circ.samtools_path,
                        align_files, log)
            else:
                convert_ones = []
                remove_ones = []
            self._merge_sort_aligment_file(
                bam_datas, read_datas, args_circ.samtools_path,
                args_circ.output_folder,
                convert_ones, tmp_reads, remove_ones, prefix, log)
            self._run_testrealign(prefix, args_circ.testrealign_path,
                                  args_circ.output_folder, log)
        samples, fa_prefixs = self._merge_bed(
            args_circ.fastas, self.splice_path, args_circ.output_folder)
        self._stat_and_gen_gff(fa_prefixs, samples, args_circ, log)
        # Delete the uncompressed read copies created by _deal_zip_file.
        if len(tmp_reads) != 0:
            for reads in tmp_reads:
                for read in reads["zips"]:
                    os.remove(read)
        self._remove_tmp_files(args_circ, fa_prefixs)