def extract_inter_seq(inter, cds, seq, fuzzy, inters):
    """Extract the sequence of an interaction region, extended by *fuzzy*
    nucleotides on both sides and clamped to the genome boundaries.

    The extracted record is appended to *inters* via import_data().
    """
    helper = Helper()
    start = inter["start"] - fuzzy
    end = inter["end"] + fuzzy
    if inter["start"] - fuzzy <= 0:
        start = 1
    if inter["end"] + fuzzy >= len(seq[cds.seq_id]):
        # BUG FIX: clamp to the length of this genome's sequence. The
        # original used len(seq), which is the number of genomes in the
        # dict, not the sequence length.
        end = len(seq[cds.seq_id])
    if cds.strand == "+":
        inter_seq = helper.extract_gene(seq[cds.seq_id], start, end, "+")
    else:
        inter_seq = helper.extract_gene(seq[cds.seq_id], start, end, "-")
    inters.append(import_data(inter_seq, cds, inter["start"], inter["end"]))
class TargetFasta(object):
    """Produce the "target" genome fasta files by applying a mutation
    table to the reference genomes."""

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # temporary working folders inside the target/reference trees
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp"),
                        "tmp_ref": os.path.join(ref_folder, "tmp")}

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        """Apply *mut_table* to the reference fastas and write the mutated
        sequences into *tar_folder*.

        If *output* is given, each element has the form
        "filename:strainA_and_strainB..." and the listed strains are
        concatenated into filename.fa; otherwise every generated .fa file
        is moved to *tar_folder* unchanged.
        """
        self.multiparser.parser_fasta(ref_folder)
        # start from a clean temporary target folder
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("transfer to target fasta...")
        if output is not None:
            for file_ in output:
                first = True
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                out = open(os.path.join(tar_folder, filename + ".fa"), "w")
                for strain in strains:
                    if strain + ".fa" in os.listdir(self.folders["tmp_tar"]):
                        # separate concatenated records with a newline
                        if first:
                            first = False
                        else:
                            out.write("\n")
                        with open(os.path.join(
                                self.folders["tmp_tar"],
                                strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        # missing strains are reported but do not abort
                        print("Error:no fasta information of {0}.fa".format(
                            strain))
                out.close()
        else:
            self.helper.move_all_content(self.folders["tmp_tar"],
                                         tar_folder, [".fa"])
        # clean up all temporary folders
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
def __init__(self):
    """Create the sequence/helper objects and name the temporary files
    produced while processing one genome."""
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    self.tmp_fa, self.tmp_gff = "tmp.fa", "tmp.gff"
    self.tmp_wig_forward, self.tmp_wig_reverse = (
        "tmp_forward.wig", "tmp_reverse.wig")
def _print_tssfile(self, nums, tss_features, tss, tss_pro, strain,
                   method, out, tss_libs):
    '''print gff file of TSS'''
    strand = Helper().get_strand_name(tss.super_strand)
    pos = str(tss.super_pos)
    # attribute column: ordered key=value pairs joined with ";"
    pairs = [
        ("Name", tss_pro + ":" + pos + "_" + strand),
        ("ID", tss_pro.lower() + str(nums["tss_uni"])),
        ("type", ",".join(tss_features["tss_types"])),
        ("utr_length", str(",".join(tss_features["utr_lengths"]))),
        ("associated_gene", ",".join(tss_features["locus_tags"])),
        ("libs", ",".join(tss_libs)),
        ("method", "TSSpredator")]
    attribute_string = ";".join(
        [key + "=" + value for key, value in pairs])
    # one-nucleotide feature: start and end are both the TSS position
    fields = [strain, method, tss_pro, pos, pos, ".",
              tss.super_strand, ".", attribute_string]
    out.write("\t".join(fields) + "\n")
def get_upstream(seq, tss, out, name, nt_before):
    """Write the *nt_before*-nt region upstream of a TSS to *out* as a
    FASTA record (header *name*), clamped to the genome boundaries."""
    if tss.strand == "+":
        # clamp to the first base of the genome
        start = max(tss.start - nt_before + 1, 1)
        fasta = Helper().extract_gene(seq, start, tss.start, tss.strand)
    else:
        # clamp to the last base of the genome
        end = min(tss.start + nt_before - 1, len(seq))
        fasta = Helper().extract_gene(seq, tss.start, end, tss.strand)
    out.write("{0}\n{1}\n".format(name, fasta))
def __init__(self, args_go):
    """Resolve the folder layout for the GO-term analysis."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    out = args_go.out_folder
    self.out_all = os.path.join(out, "all_CDSs")
    self.out_express = os.path.join(out, "expressed_CDSs")
    self.result_all_path = os.path.join(self.out_all, "GO_term_results")
    self.result_express_path = os.path.join(self.out_express,
                                            "GO_term_results")
    self.stat_all_path = os.path.join(self.out_all, "statistics")
    self.stat_express_path = os.path.join(self.out_express, "statistics")
    self.gff_path = os.path.join(args_go.gffs, "tmp")
    # transcript input is optional
    self.tran_path = (os.path.join(args_go.trans, "tmp")
                      if args_go.trans is not None else None)
    self.all_strain = "all_genomes_uniprot.csv"
def __init__(self, args_circ):
    """Resolve paths and file names for circRNA detection (segemehl)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.alignment_path = os.path.join(args_circ.output_folder,
                                       "segemehl_align")
    self.splice_path = os.path.join(args_circ.output_folder,
                                    "segemehl_splice")
    self.candidate_path = os.path.join(args_circ.output_folder,
                                       "circRNA_tables")
    self.gff_folder = os.path.join(args_circ.output_folder, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    # file-name fragments of segemehl's splice-site / realignment output
    self.splices = {"all_file": "splicesites_all.bed",
                    "file": "splicesites.bed",
                    "all": "splicesites_all", "splice": "splicesites"}
    self.trans = {"all_file": "transrealigned_all.bed",
                  "file": "transrealigned.bed",
                  "all": "transrealigned_all", "trans": "transrealigned"}
    self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
    if args_circ.align:
        # alignment requires the genome fasta files
        if args_circ.fastas is None:
            print("Error: There is no genome fasta file!!!")
            sys.exit()
        else:
            self.fasta_path = os.path.join(args_circ.fastas, "tmp")
    else:
        # NOTE(review): fastas is not checked for None on this branch;
        # os.path.join would raise TypeError if it is missing — confirm
        # callers always supply fastas when align is False.
        self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def __init__(self, args_snp):
    """Resolve the output layout for SNP calling.

    The top-level folder name depends on whether the run compares
    related genomes to the reference or reports mutations of the
    reference itself.
    """
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    if args_snp.types == "related_genome":
        file_type = "compare_related_and_reference_genomes"
    else:
        file_type = "mutations_of_reference_genomes"
    self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
    self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                  "statistics")
    self.fig_path = os.path.join(self.stat_path, "figs")
    # presumably creates the folder if it does not exist — confirm in Helper
    self.helper.check_make_folder(self.fig_path)
    self.outputs = {"table": os.path.join(
                        args_snp.out_folder, file_type, "SNP_tables"),
                    "raw": os.path.join(
                        args_snp.out_folder, file_type, "SNP_raw_outputs"),
                    "tmp": os.path.join(args_snp.out_folder, "tmp_bcf"),
                    "depth": os.path.join(args_snp.out_folder, "tmp_depth")}
    self.bams = {"whole": os.path.join(args_snp.out_folder,
                                       "whole_reads.bam"),
                 "sort": os.path.join(args_snp.out_folder,
                                      "whole_reads_sorted.bam"),
                 "bams": []}
    self.header = os.path.join(args_snp.out_folder, "header")
    # suffixes for the three BAQ modes of the variant caller
    self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                 "extend": "extend_BAQ"}
def deal_cds_reverse(cdss_r, target_folder, fasta, genes, tar_start, tar_end):
    '''for the reverse strand

    Extract the target region around the start (3' coordinate) of every
    reverse-strand CDS and append it to <seq_id>_target.fa (append mode:
    the forward-strand pass creates the files first).
    '''
    pre_id = ""
    out = None
    for cds in cdss_r:
        if cds.seq_id != pre_id:
            # FIX: close the previous genome's handle; the original
            # leaked one open file per genome.
            if out is not None:
                out.close()
            out = open(os.path.join(target_folder,
                                    "_".join([cds.seq_id, "target.fa"])),
                       "a")
            pre_id = cds.seq_id
        # downstream edge of the window, clamped to the genome end
        if (len(fasta) - cds.end > tar_start):
            end = cds.end + tar_start
        else:
            end = len(fasta)
        # upstream edge of the window, clamped to the genome start
        if ((cds.end - tar_end) > 1) and ((cds.end - cds.start) >= tar_end):
            start = cds.end - tar_end - 1
        elif cds.end - tar_end < 1:
            start = 1
        elif (cds.end - cds.start) < tar_end:
            start = cds.start
        else:
            # FIX: cds.end - tar_end == 1 with a long-enough CDS fell
            # through every branch in the original and left "start"
            # unbound; clamp to the genome start instead.
            start = 1
        seq = Helper().extract_gene(fasta, start, end, cds.strand)
        target = cds
        target_gene = check_parent_gene(cds, genes)
        print_fasta(target, seq, out, target_gene)
    if out is not None:
        out.close()
def deal_cds_forward(cdss_f, target_folder, fasta, genes, tar_start, tar_end):
    '''for forward strand

    Extract the target region around the start of every forward-strand
    CDS and write it to <seq_id>_target.fa (truncate mode: this pass
    creates the per-genome files).
    '''
    pre_id = ""
    out = None
    for cds in cdss_f:
        if cds.seq_id != pre_id:
            # FIX: close the previous genome's handle; the original
            # leaked one open file per genome.
            if out is not None:
                out.close()
            out = open(os.path.join(target_folder,
                                    "_".join([cds.seq_id, "target.fa"])),
                       "w")
            pre_id = cds.seq_id
        # upstream edge of the window, clamped to the genome start
        if (cds.start > tar_start):
            start = cds.start - tar_start
        else:
            start = 1
        # downstream edge of the window, clamped to the genome end and
        # to the CDS itself when the CDS is shorter than tar_end
        if ((cds.start + tar_end) < len(fasta)) and (
                (cds.end - cds.start) >= tar_end):
            end = cds.start + tar_end - 1
        elif cds.start + tar_end >= len(fasta):
            end = len(fasta)
        elif (cds.end - cds.start) < tar_end:
            end = cds.end
        seq = Helper().extract_gene(fasta, start, end, cds.strand)
        target = cds
        target_gene = check_parent_gene(cds, genes)
        print_fasta(target, seq, out, target_gene)
    if out is not None:
        out.close()
def __init__(self, args_snp):
    """Resolve the output layout for SNP calling against the reference
    or the (validated) target genome."""
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    if args_snp.types == "reference":
        file_type = "compare_reference"
    else:
        file_type = "validate_target"
    self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
    self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                  "statistics")
    self.fasta_path = os.path.join(args_snp.fastas, "tmp")
    self.outputs = {"table": os.path.join(
                        args_snp.out_folder, file_type, "SNP_table"),
                    "raw": os.path.join(
                        args_snp.out_folder, file_type, "SNP_raw_outputs"),
                    "tmp": os.path.join(args_snp.out_folder, "tmp_bcf")}
    # drop stale merged-read BAMs left over from a previous run
    if "whole_reads.bam" in os.listdir(args_snp.out_folder):
        self.helper.remove_all_content(args_snp.out_folder,
                                       "whole_read", "file")
    self.bams = {"whole": os.path.join(args_snp.out_folder,
                                       "whole_reads.bam"),
                 "sort": os.path.join(args_snp.out_folder,
                                      "whole_reads_sorted.bam")}
    self.header = os.path.join(args_snp.out_folder, "header")
    # suffixes for the three BAQ modes of the variant caller
    self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                 "extend": "extend_BAQ"}
def __init__(self, args_ribo):
    """Resolve input paths and create the output folder tuples for
    riboswitch and/or RNA-thermometer prediction."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    # TSS input is optional
    self.tss_path = (None if args_ribo.tsss is None
                     else os.path.join(args_ribo.tsss, "tmp"))
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
    program = args_ribo.program
    if program in ("both", "riboswitch"):
        (self.ribos_stat_folder, self.ribos_gff_outfolder,
         self.ribos_table_folder, self.ribos_scan_folder,
         self.ribos_tmp_files, self.ribos_rfam,
         self.ribos_suffixs) = self._create_out_folders(
            args_ribo.ribos_out_folder, "riboswitch",
            args_ribo.database)
    if program in ("both", "thermometer"):
        (self.thermo_stat_folder, self.thermo_gff_outfolder,
         self.thermo_table_folder, self.thermo_scan_folder,
         self.thermo_tmp_files, self.thermo_rfam,
         self.thermo_suffixs) = self._create_out_folders(
            args_ribo.thermo_out_folder, "RNA_thermometer",
            args_ribo.database)
def compare_sorf_srna(sorfs, srnas, srna_gff):
    """Record in each sORF dict the overlapping sRNAs under key "srna".

    When no sRNA annotation was supplied (srna_gff is None) or nothing
    overlaps, the field is set to ["NA"].
    """
    if srna_gff is None:
        for sorf in sorfs:
            sorf["srna"] = ["NA"]
        return
    for sorf in sorfs:
        hits = []
        for srna in srnas:
            same_place = (sorf["strain"] == srna.seq_id) and (
                sorf["strand"] == srna.strand)
            # plain interval-overlap test; equivalent to enumerating the
            # four containment / partial-overlap cases
            if same_place and (srna.start <= sorf["end"]) and (
                    srna.end >= sorf["start"]):
                strand = Helper().get_strand_name(srna.strand)
                hits.append(srna.attributes["ID"] + ":" +
                            str(srna.start) + "-" + str(srna.end) +
                            "_" + strand)
        sorf["srna"] = hits if hits else ["NA"]
def assign_sorf(sorf, starts, ends, fasta):
    """Fill in the coordinate and sequence fields of an sORF dict from
    its candidate start/stop positions."""
    sorf["starts"] = starts
    sorf["ends"] = ends
    # overall span: first possible start to last possible stop
    sorf["start"] = min(int(pos) for pos in starts)
    sorf["end"] = max(int(pos) for pos in ends)
    sorf["seq"] = Helper().extract_gene(
        fasta[sorf["strain"]], sorf["start"], sorf["end"], sorf["strand"])
def __init__(self, args_tar):
    """Resolve the input/output folder layout for sRNA target
    prediction (RNAplex / RNAup / IntaRNA)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_parser = Gff3Parser()
    out = args_tar.out_folder
    self.target_seq_path = os.path.join(out, "target_seqs")
    self.srna_seq_path = os.path.join(out, "sRNA_seqs")
    self.rnaplex_path = os.path.join(out, "RNAplex_results")
    self.rnaup_path = os.path.join(out, "RNAup_results")
    self.intarna_path = os.path.join(out, "IntaRNA_results")
    self.merge_path = os.path.join(out, "merged_results")
    self.srna_path = os.path.join(args_tar.srnas, "tmp")
    self.fasta_path = os.path.join(args_tar.fastas, "tmp")
    self.gff_path = os.path.join(args_tar.gffs, "tmp")
    # temporary file names/patterns used during the run
    self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                 "log": "tmp_log",
                 "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}
def get_feature(cds):
    '''get proper feature name

    Priority: locus_tag, then protein_id, then "ID:start-end_strand",
    finally "feature:start-end_strand".
    '''
    attrs = cds.attributes
    if "locus_tag" in attrs:
        return attrs["locus_tag"]
    if "protein_id" in attrs:
        return attrs["protein_id"]
    strand = Helper().get_strand_name(cds.strand)
    position = str(cds.start) + "-" + str(cds.end) + "_" + strand
    if "ID" in attrs:
        return attrs["ID"] + ":" + position
    return cds.feature + ":" + position
def merge_libs(input_libs, wig_folder, program):
    """Concatenate the wig tracks of the proper library type into
    merge_forward.wig / merge_reverse.wig in the current directory.

    Library strings look like "file:type:...:strand"; TSS mode uses the
    "tex" libraries, processing-site mode the "notex" ones.
    """
    cwd = os.getcwd()
    # start from a clean slate: drop leftovers of a previous merge
    for merged in ("merge_forward.wig", "merge_reverse.wig"):
        if merged in os.listdir(cwd):
            os.remove(merged)
    if program == "TSS":
        type_ = "tex"
    elif program == "processing":
        type_ = "notex"
    for lib in input_libs:
        datas = lib.split(":")
        if datas[1] != type_:
            continue
        if datas[4] == "+":
            target = os.path.join(cwd, "merge_forward.wig")
        elif datas[4] == "-":
            target = os.path.join(cwd, "merge_reverse.wig")
        else:
            continue
        Helper().merge_file(os.path.join(wig_folder, datas[0]), target)
def srna_sorf_comparison(sRNA_file, sORF_file, sRNA_out, sORF_out):
    """Cross-annotate overlapping sRNAs and sORFs.

    Reads both GFF files, removes any pre-existing "sORF"/"sRNA"
    cross-references, and for every overlapping pair appends the
    partner's identifier to the respective attribute before writing the
    updated GFFs to *sRNA_out* / *sORF_out*.
    """
    sorfs = []
    srnas = []
    out_r = open(sRNA_out, "w")
    out_o = open(sORF_out, "w")
    out_r.write("##gff-version 3\n")
    out_o.write("##gff-version 3\n")
    # NOTE(review): the two open(...) input handles below are never
    # closed explicitly — consider using "with".
    for entry in Gff3Parser().entries(open(sRNA_file)):
        entry.attributes = del_attributes("sORF", entry)
        srnas.append(entry)
    srnas = sorted(srnas, key=lambda k: (k.seq_id, k.start,
                                         k.end, k.strand))
    for entry in Gff3Parser().entries(open(sORF_file)):
        entry.attributes = del_attributes("sRNA", entry)
        sorfs.append(entry)
    sorfs = sorted(sorfs, key=lambda k: (k.seq_id, k.start,
                                         k.end, k.strand))
    for srna in srnas:
        for sorf in sorfs:
            if (srna.seq_id == sorf.seq_id) and (
                    srna.strand == sorf.strand):
                # overlap: sRNA covers sORF, sORF covers sRNA, or a
                # partial overlap on either side
                if ((srna.start <= sorf.start) and (
                        srna.end >= sorf.end)) or (
                        (srna.start >= sorf.start) and (
                        srna.end <= sorf.end)) or (
                        (srna.start <= sorf.start) and (
                        srna.end >= sorf.start) and (
                        srna.end <= sorf.end)) or (
                        (srna.start >= sorf.start) and (
                        srna.start <= sorf.end) and (
                        srna.end >= sorf.end)):
                    if "sORF" not in srna.attributes.keys():
                        srna.attributes["sORF"] = []
                    strand = Helper().get_strand_name(sorf.strand)
                    srna.attributes["sORF"].append("".join([
                        sorf.attributes["ID"], ":", str(sorf.start),
                        "-", str(sorf.end), "_", strand]))
                    if "sRNA" not in sorf.attributes.keys():
                        sorf.attributes["sRNA"] = []
                    strand = Helper().get_strand_name(srna.strand)
                    sorf.attributes["sRNA"].append("".join([
                        srna.attributes["ID"], ":", str(srna.start),
                        "-", str(srna.end), "_", strand]))
    print_file(sorfs, out_o, "sRNA")
    print_file(srnas, out_r, "sORF")
    out_r.close()
    out_o.close()
def __init__(self, args_sorf):
    """Resolve the folder layout used during sORF detection."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    # TSS and sRNA inputs are both optional
    self.tss_path = (None if args_sorf.tsss is None
                     else os.path.join(args_sorf.tsss, "tmp"))
    self.srna_path = (None if args_sorf.srnas is None
                      else os.path.join(args_sorf.srnas, "tmp"))
    self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
    self.table_output = os.path.join(args_sorf.out_folder, "tables")
    self.tran_path = os.path.join(args_sorf.trans, "tmp")
    self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
    self.all_cand = "all_candidates"
    self.best = "best"
def __init__(self, args_ratt):
    """Resolve paths for the RATT annotation-transfer run."""
    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()
    self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
    self.gbk_tmp = os.path.join(self.gbk, "tmp")
    self.embl = os.path.join(args_ratt.ref_embls, "embls")
    self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
    gff_out = args_ratt.gff_outfolder
    # temporary files/folders created while converting RATT output
    self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                      "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                      "out_gff": os.path.join(gff_out, "tmp"),
                      "gff": os.path.join(gff_out, "tmp.gff"),
                      "ptt": os.path.join(gff_out, "tmp.ptt"),
                      "rnt": os.path.join(gff_out, "tmp.rnt")}
def get_gene_info(cds):
    """Return the locus_tag of *cds* if present, otherwise the fallback
    name "feature:start-end_strand"."""
    attrs = cds.attributes
    if "locus_tag" in attrs:
        return attrs["locus_tag"]
    strand = Helper().get_strand_name(cds.strand)
    return (cds.feature + ":" + str(cds.start) + "-" +
            str(cds.end) + "_" + strand)
def __init__(self, args_cris):
    """Build the output tree for CRISPR detection (CRT)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_cris.gffs, "tmp")
    self.fasta_path = os.path.join(args_cris.fastas, "tmp")
    self.stat_folder = os.path.join(args_cris.out_folder, "statistics")
    self.gff_out = os.path.join(args_cris.out_folder, "gffs")
    self.all_out = os.path.join(args_cris.out_folder, "gffs",
                                "all_candidates")
    self.best_out = os.path.join(args_cris.out_folder, "gffs",
                                 "best_candidates")
    # presumably creates each folder if missing — confirm in Helper
    self.helper.check_make_folder(self.all_out)
    self.helper.check_make_folder(self.best_out)
    self.data_folder = os.path.join(args_cris.out_folder, "CRT_results")
    self.helper.check_make_folder(self.data_folder)
    self.helper.check_make_folder(self.stat_folder)
def __init__(self, args_term):
    """Lay out every folder and temporary file used by terminator
    detection, then create the gff output folders."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_term.gffs, "tmp")
    self.fasta_path = os.path.join(args_term.fastas, "tmp")
    self.tran_path = os.path.join(args_term.trans, "tmp")
    self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                      "csv": os.path.join(args_term.out_folder, "tables")}
    # gff and table outputs are split into the same four categories
    categories = (("all", "all_candidates"), ("express", "express"),
                  ("best", "best"), ("non", "non_express"))
    self.terms = {key: os.path.join(self.outfolder["term"], sub)
                  for key, sub in categories}
    self.csvs = {key: os.path.join(self.outfolder["csv"], sub)
                 for key, sub in categories}
    self.combine_path = os.path.join(self.gff_path, "combine")
    cwd = os.getcwd()
    # scratch locations for TransTermHP and the merge steps
    self.tmps = {"transterm": os.path.join(cwd, "tmp_transterm"),
                 "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                 "hp_path": "tmp_transterm/tmp",
                 "term_table": os.path.join(cwd, "tmp_term_table"),
                 "merge": os.path.join(cwd, "tmp_merge_gff"),
                 "gff": "tmp.gff",
                 "folder": os.path.join(cwd, "tmp")}
    self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                    "allgff": "term_all.gff"}
    # sRNA input is optional
    self.srna_path = (os.path.join(args_term.srnas, "tmp")
                      if args_term.srnas else None)
    self._make_gff_folder()
def import_to_tss(tss_type, cds_pos, tss, locus_tag, tss_entry):
    """Merge a newly classified TSS type into a (possibly existing)
    TSS attribute entry.

    cds_pos is the position of the associated CDS, or "NA".
    tss_entry is empty for a fresh TSS, otherwise the
    (attribute_string, attribute_dict) pair from a previous call.
    Returns the updated (attribute_string, attribute_dict).
    """
    # UTR tag is "<type>_<distance to the CDS>" or "<type>_NA"
    if cds_pos == "NA":
        utr = "_".join([tss_type, "NA"])
    else:
        utr = "_".join([tss_type, str(int(math.fabs(cds_pos - tss.start)))])
    if len(tss_entry) != 0:
        tss_dict = tss_entry[1]
        # multi-valued attributes are stored "&"-joined in parallel lists
        tss_dict_types = tss_dict["type"].split("&")
        tss_dict_utrs = tss_dict["UTR_length"].split("&")
        tss_dict_tags = tss_dict["associated_gene"].split("&")
        if tss_type == "Primary" and ("Primary" in tss_dict["type"]):
            # keep only the closest Primary gene: replace the stored
            # Primary annotation when this CDS is nearer to the TSS
            index = 0
            for tss_dict_type in tss_dict_types:
                if "Primary" in tss_dict_type:
                    utr_length = tss_dict_utrs[index].split("_")
                    if math.fabs(cds_pos - tss.start) < int(utr_length[1]):
                        tss_dict_utrs[index] = utr
                        tss_dict_tags[index] = locus_tag
                index += 1
        else:
            # any other classification is appended alongside the
            # existing ones
            tss_dict_types.append(tss_type)
            tss_dict_utrs.append(utr)
            tss_dict_tags.append(locus_tag)
        strand = Helper().get_strand_name(tss.strand)
        tss_dict = {"Name": "_".join(["TSS:" + str(tss.start), strand]),
                    "type": "&".join(tss_dict_types),
                    "UTR_length": "&".join(tss_dict_utrs),
                    "associated_gene": "&".join(tss_dict_tags)}
    else:
        # first classification for this TSS
        strand = Helper().get_strand_name(tss.strand)
        tss_dict = {"Name": "_".join(["TSS:" + str(tss.start), strand]),
                    "type": tss_type,
                    "UTR_length": utr,
                    "associated_gene": locus_tag}
    tss_string = ";".join(["=".join(["UTR_length", tss_dict["UTR_length"]]),
                           "=".join(["associated_gene",
                                     tss_dict["associated_gene"]]),
                           "=".join(["type", tss_dict["type"]]),
                           "=".join(["Name", tss_dict["Name"]])])
    return (tss_string, tss_dict)
def detect_start_stop(inters, seq, args_sorf):
    '''check the length is 3 -times or not

    Scan every intergenic/UTR region for ORFs: collect all start and
    stop codon positions in the three reading frames of the extracted
    (strand-oriented) sequence, then keep in-frame start/stop pairs
    whose length lies within [min_len, max_len] and which have an RBS.
    check_terminal_seq() converts the positions back to genome
    coordinates (mirrored for the minus strand) and appends accepted
    candidates to the returned list.
    '''
    sorfs = []
    for inter in inters:
        # clamp the region to the genome boundaries
        if inter.start <= 0:
            inter.start = 1
        if inter.end >= len(seq[inter.seq_id]):
            inter.end = len(seq[inter.seq_id])
        # strand-oriented sequence of the region
        fasta = Helper().extract_gene(seq[inter.seq_id], inter.start,
                                      inter.end, inter.strand)
        starts = []
        stops = []
        # scan all three reading frames
        for frame in range(0, 3):
            for index in range(frame, len(fasta), 3):
                if fasta[index:index + 3] in args_sorf.start_codon:
                    starts.append(index)
                elif fasta[index:index + 3] in args_sorf.stop_codon:
                    stops.append(index)
        for start in starts:
            for stop in stops:
                # in-frame pair with an acceptable ORF length
                if ((stop - start) > 0) and \
                        (((stop - start) % 3) == 0) and \
                        ((stop - start) <= args_sorf.max_len) and \
                        ((stop - start) >= args_sorf.min_len):
                    rbs = detect_rbs_site(fasta, start, inter, args_sorf)
                    # candidates without a ribosome binding site are
                    # discarded
                    if (len(rbs) == 1) and (rbs[0] == "NA"):
                        pass
                    else:
                        if (inter.source == "intergenic") or (
                                inter.source == "antisense"):
                            if inter.strand == "+":
                                check_terminal_seq(
                                    seq[inter.seq_id], inter.start + start,
                                    inter.start + stop + 2, args_sorf,
                                    inter.source, inter, sorfs, rbs)
                            else:
                                # minus strand: mirror the offsets back
                                # onto the forward coordinates
                                check_terminal_seq(
                                    seq[inter.seq_id],
                                    inter.start + (len(fasta) - stop - 3),
                                    inter.start + (len(fasta) - start - 1),
                                    args_sorf, inter.source, inter,
                                    sorfs, rbs)
                        elif inter.source == "UTR_derived":
                            if inter.strand == "+":
                                check_terminal_seq(
                                    seq[inter.seq_id], inter.start + start,
                                    inter.start + stop + 2, args_sorf,
                                    inter.attributes["UTR_type"], inter,
                                    sorfs, rbs)
                            else:
                                check_terminal_seq(
                                    seq[inter.seq_id],
                                    inter.start + (len(fasta) - stop - 3),
                                    inter.start + (len(fasta) - start - 1),
                                    args_sorf,
                                    inter.attributes["UTR_type"], inter,
                                    sorfs, rbs)
    return sorfs
def __init__(self, args_utr):
    """Resolve input and output folders for UTR detection."""
    self.helper = Helper()
    self.multiparser = Multiparser()
    self.tss_path = os.path.join(args_utr.tsss, "tmp")
    self.tran_path = os.path.join(args_utr.trans, "tmp")
    out = args_utr.out_folder
    self.utr5_path = os.path.join(out, "5UTR")
    self.utr3_path = os.path.join(out, "3UTR")
    self.utr5_stat_path = os.path.join(self.utr5_path, "statistics")
    self.utr3_stat_path = os.path.join(self.utr3_path, "statistics")
def _merge_wigs(self, wig_folder, prefix, libs):
    """Concatenate the per-library wig files matching *prefix* into
    merged forward/reverse tracks.

    Each lib string is ":"-separated; info[0][:-4] strips the file
    extension from the library file name and info[-1] is its strand.
    """
    self.helper.check_make_folder(
        os.path.join(os.getcwd(), self.tmps["tmp"]))
    # NOTE(review): output is written into the literal "tmp" directory
    # while the folder created above is self.tmps["tmp"] — confirm these
    # refer to the same location.
    for wig_file in os.listdir(wig_folder):
        for lib in libs:
            info = lib.split(":")
            if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                    prefix in wig_file) and (os.path.isfile(
                        os.path.join(wig_folder, wig_file))):
                Helper().merge_file(
                    os.path.join(wig_folder, wig_file),
                    os.path.join("tmp", "merge_forward.wig"))
            if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                    prefix in wig_file) and (os.path.isfile(
                        os.path.join(wig_folder, wig_file))):
                Helper().merge_file(
                    os.path.join(wig_folder, wig_file),
                    os.path.join("tmp", "merge_reverse.wig"))
def __init__(self, args_tss):
    """Resolve the folder layout for TSS prediction (TSSpredator)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.master = os.path.join(args_tss.out_folder, "MasterTables")
    # temporary file name fragments used while comparing TSSs and
    # transcripts
    self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                 "tss_ta": "tmp_tss", "tmp": "tmp"}
    if args_tss.ta_files is not None:
        self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
    else:
        self.tmps["ta"] = None
    self.gff_path = os.path.join(args_tss.gffs, "tmp")
    # NOTE(review): when args_tss.manual is None, self.manual_path is
    # never assigned — confirm no caller reads it in that case.
    if args_tss.manual is not None:
        self.manual_path = os.path.join(args_tss.manual, "tmp")
    self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
    self.fasta_path = os.path.join(args_tss.fastas, "tmp")
    self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
    self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
def __init__(self, args_sc):
    """Create the screenshot output tree; refuses to overwrite an
    existing one and exits in that case."""
    self.helper = Helper()
    out_folder = os.path.join(args_sc.output_folder, "screenshots")
    if os.path.exists(out_folder):
        print("Error: The {0} already exist!".format(out_folder))
        sys.exit()
    else:
        os.mkdir(out_folder)
    args_sc.output_folder = out_folder
    # strain name = fasta file name without its extension
    filename = args_sc.fasta.split("/")[-1]
    self.strain = ".".join(filename.split(".")[0:-1])
    self.helper.check_make_folder(
        os.path.join(args_sc.output_folder, self.strain))
    # per-strand subfolders for the generated screenshots
    self.forward_file = os.path.join(args_sc.output_folder,
                                     self.strain, "forward")
    self.reverse_file = os.path.join(args_sc.output_folder,
                                     self.strain, "reverse")
    os.mkdir(self.forward_file)
    os.mkdir(self.reverse_file)
def __init__(self, out_folder):
    """Resolve result folders and temporary files for the
    protein-protein-interaction analysis."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gffparser = Gff3Parser()
    self.tmp_id = os.path.join(out_folder, "tmp_id_list")
    self.all_result = os.path.join(out_folder, "all_results")
    self.best_result = os.path.join(out_folder, "best_results")
    self.fig = os.path.join(out_folder, "figures")
    self.with_strain = "with_strain"
    self.without_strain = "without_strain"
    # scratch files; the last three live inside the output folder
    tmp_files = {"log": "tmp_log", "action": "tmp_action.log",
                 "pubmed": "tmp_pubmed.log"}
    for key, name in (("specific", "tmp_specific"),
                      ("nospecific", "tmp_nospecific"),
                      ("wget_action", "tmp_action")):
        tmp_files[key] = os.path.join(out_folder, name)
    self.tmp_files = tmp_files
def get_feature(cds):
    """Return a display name for *cds*: protein_id first, then
    locus_tag, then the fallback "feature:start-end_strand"."""
    attrs = cds.attributes
    if "protein_id" in attrs:
        return attrs["protein_id"]
    if "locus_tag" in attrs:
        return attrs["locus_tag"]
    strand = Helper().get_strand_name(cds.strand)
    return "".join([cds.feature, ":", str(cds.start), "-",
                    str(cds.end), "_", strand])
def get_attributes(tss, cds):
    """Attach the gene represented by *cds* to the TSS's
    "associated_gene" attribute.

    An "orphan" TSS is replaced by the gene name; otherwise the name is
    "&"-appended to the existing value. Mutates *tss* in place.
    """
    if "locus_tag" in cds.attributes.keys():
        gene_name = cds.attributes["locus_tag"]
    else:
        # no locus_tag: fall back to "feature:start-end_strand"
        strand = Helper().get_strand_name(cds.strand)
        gene_name = (cds.feature + ":" + str(cds.start) + "-" +
                     str(cds.end) + "_" + strand)
    if tss.attributes["associated_gene"] == "orphan":
        tss.attributes["associated_gene"] = gene_name
    else:
        tss.attributes["associated_gene"] = "&".join(
            [tss.attributes["associated_gene"], gene_name])
def __init__(self, args_tran):
    """Resolve output folders and temporary names for transcript
    assembly."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out = args_tran.out_folder
    self.gff_outfolder = os.path.join(out, "gffs")
    self.tran_path = os.path.join(self.gff_outfolder, "tmp")
    self.stat_path = os.path.join(out, "statistics")
    # scratch files used while merging/comparing transcripts
    tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
            "tran": os.path.join(out, "tmp_tran")}
    for key, name in (("tss_ta", "tmp_tss_ta"), ("ta_tss", "tmp_ta_tss"),
                      ("ta_gff", "tmp_ta_gff"), ("gff_ta", "tmp_gff_ta"),
                      ("uni", "tmp_uni"), ("overlap", "tmp_overlap")):
        tmps[key] = os.path.join(self.gff_outfolder, name)
    self.tmps = tmps
    self.frag = "transcript_fragment.gff"
    self.tex = "transcript_tex_notex.gff"
    self.endfix_tran = "transcript.gff"
def __init__(self, args_srna):
    """Resolve every folder and temporary file used by sRNA detection."""
    self.args_container = ArgsContainer()
    self.helper = Helper()
    self.multiparser = Multiparser()
    out = args_srna.out_folder
    self.gff_output = os.path.join(out, "gffs")
    # FIX: table_output and stat_path were each assigned twice with
    # identical values in the original; the redundant duplicates were
    # removed.
    self.table_output = os.path.join(out, "tables")
    self.stat_path = os.path.join(out, "statistics")
    # optional inputs resolve to None when their folder is absent
    self.tss_path = self._check_folder_exist(args_srna.tss_folder)
    self.pro_path = self._check_folder_exist(args_srna.pro_folder)
    self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
    self.fasta_path = os.path.join(args_srna.fastas, "tmp")
    self.tran_path = os.path.join(args_srna.trans, "tmp")
    self.term_path = self._check_folder_exist(args_srna.terms)
    self.merge_wigs = os.path.join(out, "merge_wigs")
    # per-category scratch files for candidates and their tables
    self.prefixs = {"merge": os.path.join(out, "tmp_merge"),
                    "utr": os.path.join(out, "tmp_utrsrna"),
                    "normal": os.path.join(out, "tmp_normal"),
                    "in_cds": os.path.join(out, "tmp_incds"),
                    "merge_table": os.path.join(out, "tmp_merge_table"),
                    "utr_table": os.path.join(out, "tmp_utrsrna_table"),
                    "normal_table": os.path.join(out, "tmp_normal_table"),
                    "in_cds_table": os.path.join(out, "tmp_incds_table"),
                    "basic": os.path.join(out, "tmp_basic"),
                    "energy": os.path.join(out, "tmp_energy")}
    self.tmps = {"nr": os.path.join(out, "tmp_nr"),
                 "srna": os.path.join(out, "tmp_sRNA")}
    self.best_table = os.path.join(self.table_output, "best")
    self.all_best = {
        "all_gff": os.path.join(self.gff_output, "all_candidates"),
        "best_gff": os.path.join(self.gff_output, "best"),
        "all_table": os.path.join(self.table_output, "all_candidates"),
        "best_table": os.path.join(self.table_output, "best")}
def check_overlap(table_file, gff_file):
    """Rewrite an sRNA table, recomputing the overlap-with-CDS columns.

    For each sRNA row, the overlapping gene-like features on each strand
    are collected (with overlap length and percentage) and written in
    place of the old overlap columns; the table is then replaced via a
    temporary copy.
    """
    out = open(table_file + "tmp", "w")
    gffs = []
    gff_f = open(gff_file, "r")
    for entry in Gff3Parser().entries(gff_f):
        # keep only gene-like features
        if Helper().feature_without_notgene(entry):
            gffs.append(entry)
    fh = open(table_file, "r")
    out.write("\t".join([
        "Rank", "Genome", "Name", "Start", "End", "Strand",
        "Start_with_TSS/Cleavage_site", "End_with_cleavage", "Candidates",
        "Lib_type", "Best_avg_coverage", "Track/Coverage",
        "Normalized_secondary_energy_change(by_length)", "sRNA_types",
        "Conflict_sORF", "nr_hit_number", "sRNA_hit_number",
        "nr_hit_top3|ID|e-value|score", "sRNA_hit|e-value|score",
        "Overlap_CDS_forward", "Overlap_nts_forward",
        "Overlap_CDS_reverse", "Overlap_nts_reverse",
        "End_with_terminator", "Associated_promoter",
        "sRNA_length"]) + "\n")
    for row in csv.reader(fh, delimiter='\t'):
        # skip the original header row
        if row[3] != "Start":
            overlaps = {"forward": [], "reverse": [],
                        "CDS_f": [], "CDS_r": []}
            start = int(row[3])
            end = int(row[4])
            for gff in gffs:
                # any overlap between the sRNA span and the feature
                if ((gff.end < end) and (
                        gff.end > start) and (
                        gff.start <= start)) or (
                        (gff.start > start) and (
                        gff.start < end) and (
                        gff.end >= end)) or ((gff.end >= end) and (
                        gff.start <= start)) or (
                        (gff.end <= end) and (gff.start >= start)):
                    overlap = min(gff.end, end) - max(gff.start, start) + 1
                    percent = "{0:.0f}%".format(
                        (float(overlap) / float(end - start + 1)) * 100)
                    if gff.strand == "+":
                        overlaps["forward"].append(
                            str(overlap) + "(" + str(percent) + ")")
                        overlaps["CDS_f"].append(import_cds(gff))
                    else:
                        overlaps["reverse"].append(
                            str(overlap) + "(" + str(percent) + ")")
                        overlaps["CDS_r"].append(import_cds(gff))
            if len(overlaps["forward"]) == 0:
                overlaps["forward"] = ["NA"]
                overlaps["CDS_f"] = ["NA"]
            if len(overlaps["reverse"]) == 0:
                overlaps["reverse"] = ["NA"]
                overlaps["CDS_r"] = ["NA"]
            # the recomputed four columns replace old columns 19-20
            out.write("\t".join(row[0:19] + [
                ";".join(overlaps["CDS_f"]),
                ";".join(overlaps["forward"]),
                ";".join(overlaps["CDS_r"]),
                ";".join(overlaps["reverse"])] + row[21:]) + "\n")
    # FIX: close all handles before replacing the table. The original
    # moved the file while "out" was still open, so buffered rows were
    # not guaranteed to be flushed, and the input handles were leaked.
    gff_f.close()
    fh.close()
    out.close()
    shutil.move(table_file + "tmp", table_file)
def check_terminal_seq(seq, start, end, args_sorf, source, inter, sorfs, rbs):
    """Try small frame shifts around [start, end] until the extracted
    sequence begins with a start codon and ends with a stop codon, then
    import the sORF at the shifted coordinates.

    Note: the loop intentionally keeps the LAST matching offset.
    """
    detect = None
    for offset in (0, 1, -1, 2, -2):
        fasta = Helper().extract_gene(seq, start + offset, end + offset,
                                      inter.strand)
        has_start = fasta[:3] in args_sorf.start_codon
        has_stop = fasta[-3:] in args_sorf.stop_codon
        if has_start and has_stop:
            detect = offset
    if detect is not None:
        import_sorf(inter, sorfs, start + detect, end + detect,
                    source, seq, rbs)
def _feature_name(gff):
    """Best available display name for *gff*: locus_tag, then
    protein_id, then Name, finally "feature:start-end_strand"."""
    if "locus_tag" in gff.attributes.keys():
        return gff.attributes["locus_tag"]
    elif "protein_id" in gff.attributes.keys():
        return gff.attributes["protein_id"]
    elif "Name" in gff.attributes.keys():
        return gff.attributes["Name"]
    strand = Helper().get_strand_name(gff.strand)
    return "".join([gff.feature, ":", str(gff.start), "-",
                    str(gff.end), "_", strand])


def assign_parent(gff, tran, feature):
    """Cross-link *gff* and transcript *tran*.

    Appends the transcript ID to the gff's "Parent" attribute and the
    gff's display name to the transcript's "associated_<feature>"
    attribute (both ","-joined when already present). Mutates both
    objects in place.

    The original duplicated the four-way name-priority chain in both
    branches; it is factored out into _feature_name().
    """
    if "Parent" not in gff.attributes.keys():
        gff.attributes["Parent"] = tran.attributes["ID"]
    else:
        gff.attributes["Parent"] = (
            ",".join([gff.attributes["Parent"], tran.attributes["ID"]]))
    key = "_".join(["associated", feature])
    name = _feature_name(gff)
    if key not in tran.attributes.keys():
        tran.attributes[key] = name
    else:
        tran.attributes[key] = ",".join([tran.attributes[key], name])
def read_gff(gff_file):
    """Parse *gff_file* and return (cdss, genes), each position-sorted.

    cdss collects every entry accepted by Helper().feature_without_notgene();
    genes collects entries whose feature is "gene".  Both lists are sorted
    by (seq_id, start, end, strand).

    Fix: the original opened the file and never closed it; the handle is
    now managed by a "with" block.
    """
    cdss = []
    genes = []
    with open(gff_file, "r") as g_f:
        for entry in Gff3Parser().entries(g_f):
            if Helper().feature_without_notgene(entry):
                cdss.append(entry)
            if entry.feature == "gene":
                genes.append(entry)

    def position(entry):
        # sort key shared by both result lists
        return (entry.seq_id, entry.start, entry.end, entry.strand)

    cdss.sort(key=position)
    genes.sort(key=position)
    return cdss, genes
def __init__(self, args_op):
    """Cache helper objects plus the "tmp" sub-folder of each input."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    join = os.path.join
    self.tss_path = join(args_op.tsss, "tmp")
    self.tran_path = join(args_op.trans, "tmp")
    self.utr5_path = join(args_op.utr5s, "tmp")
    self.utr3_path = join(args_op.utr3s, "tmp")
    self.table_path = join(args_op.output_folder, "tables")
    if args_op.terms is None:
        self.term_path = None
    else:
        # terminator input is optional and validated before use
        self._check_gff(args_op.terms, "term")
        self.term_path = join(args_op.terms, "tmp")
def __init__(self, args_sc, out_folder):
    """Create <out_folder>/<strain>/forward and .../reverse folders."""
    self.helper = Helper()
    args_sc.output_folder = out_folder
    fasta_name = args_sc.fasta.split("/")[-1]
    # strain name = fasta file name without its final extension
    self.strain = ".".join(fasta_name.split(".")[0:-1])
    strain_dir = os.path.join(args_sc.output_folder, self.strain)
    self.helper.check_make_folder(strain_dir)
    self.forward_file = os.path.join(strain_dir, "forward")
    self.reverse_file = os.path.join(strain_dir, "reverse")
    os.mkdir(self.forward_file)
    os.mkdir(self.reverse_file)
def setUp(self):
    """Create the fixture folder plus a test GFF and FASTA file."""
    self.example = ExampleData()
    self.helper = Helper()
    self.gff_out = self.example.gff_out
    self.rev_seq = self.example.rev_seq.replace("\n", "")
    self.test_folder = "test_folder"
    if not os.path.exists(self.test_folder):
        os.mkdir(self.test_folder)
    self.gff_file = os.path.join(self.test_folder, "test.gff")
    with open(self.gff_file, "w") as out:
        out.write(self.example.gff_file)
    self.seq_file = os.path.join(self.test_folder, "test.fa")
    with open(self.seq_file, "w") as out:
        out.write(self.example.seq)
def __init__(self, gffs):
    """Build a clean "for_libs" output tree under *gffs*."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    out = os.path.join(gffs, "for_libs")
    # start from an empty output tree on every run
    if os.path.exists(out):
        shutil.rmtree(out)
    os.mkdir(out)
    self.out_folder = out
    self.stat = os.path.join(out, "statistics")
    os.mkdir(self.stat)
    self.gff_folder = os.path.join(out, "gffs")
    os.mkdir(self.gff_folder)
    # a stale merge_wigs folder from a previous run is discarded
    self.merge_wigs = os.path.join(gffs, "merge_wigs")
    if os.path.exists(self.merge_wigs):
        shutil.rmtree(self.merge_wigs)
def __init__(self, args_tar):
    """Record the working paths for sRNA-target prediction."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_parser = Gff3Parser()
    out = args_tar.out_folder
    self.target_seq_path = os.path.join(out, "target_seqs")
    self.srna_seq_path = os.path.join(out, "sRNA_seqs")
    self.rnaplex_path = os.path.join(out, "RNAplex")
    self.rnaup_path = os.path.join(out, "RNAup")
    self.merge_path = os.path.join(out, "merge")
    self.srna_path = os.path.join(args_tar.srnas, "tmp")
    self.fasta_path = os.path.join(args_tar.fastas, "tmp")
    self.gff_path = os.path.join(args_tar.gffs, "tmp")
    # name patterns of the temporary artefacts produced along the way
    self.tmps = dict(tmp="tmp", rnaup="tmp_rnaup", log="tmp_log",
                     all_fa="tmp*.fa", all_txt="tmp*.txt")
def __init__(self, args_tss):
    """Store helper objects and output paths for TSS prediction."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out = args_tss.out_folder
    self.master = os.path.join(out, "MasterTables")
    self.tmps = dict(tss="tmp_TSS", ta_tss="tmp_ta_tss",
                     tss_ta="tmp_tss", tmp="tmp")
    # transcript input is optional
    if args_tss.ta_files is None:
        self.tmps["ta"] = None
    else:
        self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
    self.gff_path = os.path.join(args_tss.gffs, "tmp")
    self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
    self.fasta_path = os.path.join(args_tss.fastas, "tmp")
    self.stat_outfolder = os.path.join(out, "statistics")
    self.gff_outfolder = os.path.join(out, "gffs")
def __init__(self, args_go):
    """Lay out the GO-term result and statistics paths."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.out_all = os.path.join(args_go.out_folder, "all_CDSs")
    self.out_express = os.path.join(args_go.out_folder, "expressed_CDSs")
    self.result_all_path = os.path.join(self.out_all, "GO_term_results")
    self.result_express_path = os.path.join(self.out_express,
                                            "GO_term_results")
    self.gff_path = os.path.join(args_go.gffs, "tmp")
    # transcript input is optional
    self.tran_path = (None if args_go.trans is None
                      else os.path.join(args_go.trans, "tmp"))
    self.stat_all_path = os.path.join(self.out_all, "statistics")
    self.stat_express_path = os.path.join(self.out_express, "statistics")
    self.all_strain = "all_genomes_uniprot.csv"
def __init__(self, args_circ):
    """Remember the segemehl / circRNA working folders."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out = args_circ.output_folder
    self.alignment_path = os.path.join(out, "segemehl_alignment_files")
    self.splice_path = os.path.join(out, "segemehl_splice_results")
    self.candidate_path = os.path.join(out, "circRNA_tables")
    self.gff_folder = os.path.join(out, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    # fixed names of the segemehl result files
    self.splices = dict(file="splicesites.bed", splice="splicesites")
    self.trans = dict(file="transrealigned.bed", trans="transrealigned")
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def __init__(self, args_sorf):
    """Resolve the optional and required input folders for sORF detection."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    # TSS and sRNA inputs are optional
    self.tss_path = (None if args_sorf.tsss is None
                     else os.path.join(args_sorf.tsss, "tmp"))
    self.srna_path = (None if args_sorf.srnas is None
                      else os.path.join(args_sorf.srnas, "tmp"))
    self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
    self.table_output = os.path.join(args_sorf.out_folder, "tables")
    self.tran_path = os.path.join(args_sorf.trans, "tmp")
    self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
    self.all_cand = "all_candidates"
    self.best = "best_candidates"
def __init__(self, args_srna):
    """Collect every working path used by the sRNA detection module.

    Fix: the original assigned self.table_output and self.stat_path a
    second time (with identical values) after building the tmp-prefix
    dictionaries; the redundant re-assignments are removed.
    """
    self.args_container = ArgsContainer()
    self.helper = Helper()
    self.multiparser = Multiparser()
    out = args_srna.out_folder
    self.gff_output = os.path.join(out, "gffs")
    self.table_output = os.path.join(out, "tables")
    self.stat_path = os.path.join(out, "statistics")
    # optional inputs resolve to None when their folder is absent
    self.tss_path = self._check_folder_exist(args_srna.tss_folder)
    self.pro_path = self._check_folder_exist(args_srna.pro_folder)
    self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
    self.fasta_path = os.path.join(args_srna.fastas, "tmp")
    self.tran_path = os.path.join(args_srna.trans, "tmp")
    self.term_path = self._check_folder_exist(args_srna.terms)
    self.merge_wigs = os.path.join(out, "merge_wigs")
    # per-stage temporary output prefixes
    self.prefixs = {
        "merge": os.path.join(out, "tmp_merge"),
        "utr": os.path.join(out, "tmp_utrsrna"),
        "normal": os.path.join(out, "tmp_normal"),
        "in_cds": os.path.join(out, "tmp_incds"),
        "merge_table": os.path.join(out, "tmp_merge_table"),
        "utr_table": os.path.join(out, "tmp_utrsrna_table"),
        "normal_table": os.path.join(out, "tmp_normal_table"),
        "in_cds_table": os.path.join(out, "tmp_incds_table"),
        "basic": os.path.join(out, "tmp_basic"),
        "energy": os.path.join(out, "tmp_energy")}
    self.tmps = {"nr": os.path.join(out, "tmp_nr"),
                 "srna": os.path.join(out, "tmp_sRNA")}
    self.best_table = os.path.join(self.table_output, "best")
    self.all_best = {
        "all_gff": os.path.join(self.gff_output, "all_candidates"),
        "best_gff": os.path.join(self.gff_output, "best"),
        "all_table": os.path.join(self.table_output, "all_candidates"),
        "best_table": os.path.join(self.table_output, "best")}
def __init__(self, out_folder):
    """Prepare the result folders and temporary file names."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gffparser = Gff3Parser()
    self.tmp_id = os.path.join(out_folder, "tmp_id_list")
    self.all_result = os.path.join(out_folder, "all_results")
    self.best_result = os.path.join(out_folder, "best_results")
    self.fig = os.path.join(out_folder, "figures")
    self.with_strain = "with_strain"
    self.without_strain = "without_strain"
    # some tmp names are bare file names, others live under out_folder
    self.tmp_files = dict(
        log="tmp_log",
        action="tmp_action.log",
        pubmed="tmp_pubmed.log",
        specific=os.path.join(out_folder, "tmp_specific"),
        nospecific=os.path.join(out_folder, "tmp_nospecific"),
        wget_action=os.path.join(out_folder, "tmp_action"))
def __init__(self, args_ratt):
    """Set up the paths used by RATT annotation transfer."""
    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()
    self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
    self.gbk_tmp = os.path.join(self.gbk, "tmp")
    self.embl = os.path.join(args_ratt.ref_embls, "embls")
    self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
    gff_out = args_ratt.gff_outfolder
    self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                      "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                      "out_gff": os.path.join(gff_out, "tmp"),
                      "gff": os.path.join(gff_out, "tmp.gff"),
                      "ptt": os.path.join(gff_out, "tmp.ptt"),
                      "rnt": os.path.join(gff_out, "tmp.rnt")}
def __init__(self, args_tran):
    """Record output folders and tmp names for transcript assembly."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    gff_out = os.path.join(args_tran.out_folder, "gffs")
    self.gff_outfolder = gff_out
    self.tran_path = os.path.join(gff_out, "tmp")
    self.stat_path = os.path.join(args_tran.out_folder, "statistics")
    self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                 "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                 "tss_ta": os.path.join(gff_out, "tmp_tss_ta"),
                 "ta_tss": os.path.join(gff_out, "tmp_ta_tss"),
                 "ta_gff": os.path.join(gff_out, "tmp_ta_gff"),
                 "gff_ta": os.path.join(gff_out, "tmp_gff_ta"),
                 "uni": os.path.join(gff_out, "tmp_uni"),
                 "overlap": os.path.join(gff_out, "tmp_overlap")}
    # final output file name suffixes
    self.frag = "transcript_fragment.gff"
    self.tex = "transcript_tex_notex.gff"
    self.endfix_tran = "transcript.gff"
def __init__(self, args_term):
    """Build the folder layout for terminator prediction."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_term.gffs, "tmp")
    self.fasta_path = os.path.join(args_term.fastas, "tmp")
    self.tran_path = os.path.join(args_term.trans, "tmp")
    term_out = os.path.join(args_term.out_folder, "gffs")
    csv_out = os.path.join(args_term.out_folder, "tables")
    self.outfolder = {"term": term_out, "csv": csv_out}
    # the four candidate classes are mirrored in gff and csv form
    self.terms = {
        "all": os.path.join(term_out, "all_candidates"),
        "express": os.path.join(term_out, "expressed_candidates"),
        "best": os.path.join(term_out, "best_candidates"),
        "non": os.path.join(term_out, "non_expressed_candidates")}
    self.csvs = {
        "all": os.path.join(csv_out, "all_candidates"),
        "express": os.path.join(csv_out, "expressed_candidates"),
        "best": os.path.join(csv_out, "best_candidates"),
        "non": os.path.join(csv_out, "non_expressed_candidates")}
    self.combine_path = os.path.join(self.gff_path, "combine")
    cwd = os.getcwd()
    self.tmps = {"transterm": os.path.join(cwd, "tmp_transterm"),
                 "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                 "hp_path": "tmp_transterm/tmp",
                 "term_table": os.path.join(cwd, "tmp_term_table"),
                 "merge": os.path.join(cwd, "tmp_merge_gff"),
                 "gff": "tmp.gff",
                 "folder": os.path.join(cwd, "tmp")}
    self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                    "allgff": "term_all.gff"}
    # sRNA input is optional
    if args_term.srnas:
        self.srna_path = os.path.join(args_term.srnas, "tmp")
    else:
        self.srna_path = None
    self._make_gff_folder()
def __init__(self, args_sc):
    """Create the screenshots output tree; abort if it already exists."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    out_folder = os.path.join(args_sc.output_folder, "screenshots")
    if os.path.exists(out_folder):
        print("Error: The {0} already exist!!!".format(
              out_folder))
        sys.exit()
    os.mkdir(out_folder)
    args_sc.output_folder = out_folder
    fasta_name = args_sc.fasta.split("/")[-1]
    # strain name = fasta file name without its final extension
    self.strain = ".".join(fasta_name.split(".")[0:-1])
    strain_dir = os.path.join(out_folder, self.strain)
    self.helper.check_make_folder(strain_dir)
    self.forward_file = os.path.join(strain_dir, "forward")
    self.reverse_file = os.path.join(strain_dir, "reverse")
    os.mkdir(self.forward_file)
    os.mkdir(self.reverse_file)
def __init__(self, args_pro):
    """Set up the fasta-class output paths for promoter analysis."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.tss_path = os.path.join(args_pro.tsss, "tmp")
    # annotation input is optional
    if args_pro.gffs is None:
        self.gff_path = None
    else:
        self.gff_path = os.path.join(args_pro.gffs, "tmp")
    self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
    tmp = os.path.join(os.getcwd(), "tmp")
    self.tmp_folder = tmp
    # "all_no_orph" and "all" are bare names; the rest live under tmp/
    self.fastas = {"pri": os.path.join(tmp, "primary.fa"),
                   "sec": os.path.join(tmp, "secondary.fa"),
                   "inter": os.path.join(tmp, "internal.fa"),
                   "anti": os.path.join(tmp, "antisense.fa"),
                   "orph": os.path.join(tmp, "orphan.fa"),
                   "all_no_orph": "without_orphan.fa",
                   "all": "all_type.fa",
                   "tmp_fa": os.path.join(tmp, "tmp.fa"),
                   "tmp_all": os.path.join(tmp, "tmp_all.fa")}
    self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
    self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")
def __init__(self, args_sub):
    """Lay out the folder tree for subcellular-localization results."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_path = os.path.join(args_sub.gffs, "tmp")
    self.fasta_path = os.path.join(args_sub.fastas, "tmp")
    # transcript input is optional
    self.tran_path = (None if args_sub.trans is None
                      else os.path.join(args_sub.trans, "tmp"))
    self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
    self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
    # both CDS sets share the same sub-folder layout
    self.all_tmp_path = os.path.join(self.out_all, "tmp")
    self.express_tmp_path = os.path.join(self.out_express, "tmp")
    self.all_stat_path = os.path.join(self.out_all, "statistics")
    self.express_stat_path = os.path.join(self.out_express, "statistics")
    self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
    self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
    self.all_result = os.path.join(self.out_all, "psortb_results")
    self.express_result = os.path.join(self.out_express, "psortb_results")
    self.endfix_table = "table.csv"
    self.endfix_raw = "raw.txt"
    self._make_folder()
def __init__(self, args_ribo):
    """Store riboswitch-scan paths (Rfam model, tmp files, outputs)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    self.tss_path = os.path.join(args_ribo.tsss, "tmp")
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
    out = args_ribo.out_folder
    self.stat_folder = os.path.join(out, "statistics")
    self.gff_outfolder = os.path.join(out, "gffs")
    self.table_folder = os.path.join(out, "tables")
    self.scan_folder = os.path.join(out, "scan_Rfam")
    self.ribos_rfam = os.path.join(args_ribo.database,
                                   "Rfam_riboswitch.cm")
    self.tmp_files = {"fasta": os.path.join(out, "tmp_fasta"),
                      "scan": os.path.join(out, "tmp_scan"),
                      "table": os.path.join(out, "tmp_table")}
    # file-name suffixes of the scan outputs
    self.suffixs = {"csv": "riboswitch.csv",
                    "txt": "riboswitch_prescan.txt",
                    "re_txt": "riboswitch_scan.txt",
                    "re_csv": "riboswitch_scan.csv"}
def __init__(self):
    """Instantiate the shared parsing and helper utilities."""
    self.helper = Helper()
    self.multiparser = Multiparser()
class ArgsContainer(object): def __init__(self): self.multiparser = Multiparser() self.helper = Helper() def _check_replicates(self, replicates_tex, replicates_frag): if (replicates_tex is not None) and (replicates_frag is not None): replicates = {"tex": int(replicates_tex), "frag": int(replicates_frag)} elif replicates_tex is not None: replicates = {"tex": int(replicates_tex), "frag": -1} elif replicates_frag is not None: replicates = {"tex": -1, "frag": int(replicates_frag)} else: print("Error:No replicates number assign!!!") sys.exit() return replicates def _check_libs(self, tex_notex_libs, frag_libs): if (tex_notex_libs is None) and (frag_libs is None): print("Error: please input proper libraries!!") if (tex_notex_libs is not None) and (frag_libs is not None): libs = tex_notex_libs + frag_libs elif (tex_notex_libs is not None): libs = tex_notex_libs elif (frag_libs is not None): libs = frag_libs return libs def _parser_combine_wigs(self, subcommand): self.tex_path = None self.frag_path = None self.multiparser.parser_gff(self.gffs, None) if subcommand == "terminator": gff_path = os.path.join(self.gffs, "tmp") self.multiparser.parser_gff(gff_path, None) else: gff_path = self.gffs if self.tex_wigs is not None: self.tex_path = os.path.join(self.tex_wigs, "tmp") self.multiparser.parser_wig(self.tex_wigs) self.multiparser.combine_wig(gff_path, self.tex_path, None, self.libs) self.merge_wigs = self.tex_wigs self.wig_path = self.tex_path if self.frag_wigs is not None: self.frag_path = os.path.join(self.frag_wigs, "tmp") self.multiparser.parser_wig(self.frag_wigs) self.multiparser.combine_wig(gff_path, self.frag_path, None, self.libs) self.merge_wigs = self.frag_wigs self.wig_path = self.frag_path if (self.tex_path is not None) and ( self.frag_path is not None): self = self._merge_wig() if (self.tex_path is None) and ( self.frag_path is None): print("Error: There is no proper wig files assigned!!") sys.exit() return self def _merge_wig(self): self.merge_wigs = 
os.path.join(self.out_folder, "merge_wigs") if (self.tex_wigs is not None) and ( self.frag_wigs is not None): self.helper.check_make_folder(self.merge_wigs) self.wig_path = os.path.join(self.merge_wigs, "tmp") self.helper.check_make_folder(self.wig_path) for wig in os.listdir(self.tex_wigs): if os.path.isfile(os.path.join(self.tex_wigs, wig)): shutil.copy(os.path.join(self.tex_wigs, wig), self.merge_wigs) for wig in os.listdir(self.frag_wigs): if os.path.isfile(os.path.join(self.frag_wigs, wig)): shutil.copy(os.path.join(self.frag_wigs, wig), self.merge_wigs) for wig in os.listdir(self.tex_path): if os.path.isfile(os.path.join(self.tex_path, wig)): shutil.copy(os.path.join(self.tex_path, wig), self.wig_path) for wig in os.listdir(self.frag_path): if os.path.isfile(os.path.join(self.frag_path, wig)): self.helper.merge_file(os.path.join(self.frag_path, wig), os.path.join(self.wig_path, wig)) elif (self.tex_wigs is not None): self.merge_wigs = self.tex_wigs elif (self.frag_wigs is not None): self.merge_wigs = self.frag_wigs return self def _deal_multi_inputs(self, inputs, file_type, num, command): if inputs is not None: datas = inputs.split(",") if num is not None: if (len(datas) != num): print("Error: the amount of {0} is not correct!!".format( command)) new_inputs = [] for data in datas: if file_type == "float": new_inputs.append(float(data.strip())) elif file_type == "int": new_inputs.append(int(data.strip())) else: new_inputs.append(data) return new_inputs else: return inputs def container_ratt(self, ratt_path, element, transfer_type, ref_embl_gbk, target_fasta, ref_fasta, ratt_folder, convert_to_gff_rnt_ptt, tar_annotation_folder, compare_pair): self.ratt_path = ratt_path self.element = element self.transfer_type = transfer_type self.ref_embls = ref_embl_gbk self.tar_fastas = target_fasta self.ref_fastas = ref_fasta self.output_path = ratt_folder self.convert = convert_to_gff_rnt_ptt self.gff_outfolder = tar_annotation_folder self.pairs = 
self._deal_multi_inputs(compare_pair, "str", None, None) return self def container_tsspredator(self, TSSpredator_path, compute_program, fasta_folder, annotation_folder, wig_folder, lib, output_prefix, height, height_reduction, factor, factor_reduction, base_height, enrichment_factor, processing_factor, replicate_match, out_folder, statistics, validate_gene, merge_manual, compare_transcript_assembly, fuzzy, utr_length, cluster, length, re_check_orphan, overlap_feature, reference_gff_folder, remove_low_expression): self.tsspredator_path = TSSpredator_path self.program = compute_program self.fastas = fasta_folder self.gffs = annotation_folder self.wig_folder = wig_folder self.libs = self._deal_multi_inputs(lib, "str", None, None) self.output_prefixs = self._deal_multi_inputs(output_prefix, "str", None, None) self.height = height self.height_reduction = height_reduction self.factor = factor self.factor_reduction = factor_reduction self.base_height = base_height self.enrichment_factor = enrichment_factor self.processing_factor = processing_factor self.repmatch = replicate_match self.out_folder = out_folder self.stat = statistics self.validate = validate_gene self.manual = merge_manual self.ta_files = compare_transcript_assembly self.fuzzy = fuzzy self.utr_length = utr_length self.cluster = cluster self.nt_length = length self.check_orphan = re_check_orphan self.overlap_feature = overlap_feature self.references = reference_gff_folder self.remove_low_expression = remove_low_expression return self def container_optimize(self, TSSpredator_path, fasta_file, annotation_file, wig_folder, manual, out_folder, strain_name, max_height, max_height_reduction, max_factor, max_factor_reduction, max_base_height, max_enrichment_factor, max_processing_factor, utr_length, lib, output_prefix, cluster, length, core, program, replicate_match, steps): self.tsspredator_path = TSSpredator_path self.fastas = fasta_file self.gffs = annotation_file self.wigs = wig_folder self.manual = manual 
self.output_folder = out_folder self.project_strain = strain_name self.height = max_height self.height_reduction = max_height_reduction self.factor = max_factor self.factor_reduction = max_factor_reduction self.base_height = max_base_height self.enrichment = max_enrichment_factor self.processing = max_processing_factor self.utr = utr_length self.libs = self._deal_multi_inputs(lib, "str", None, None) self.replicate_name = self._deal_multi_inputs(output_prefix, "str", None, None) self.cluster = cluster self.length = length self.cores = core self.program = program self.replicate = replicate_match self.steps = steps return self def container_terminator( self, TransTermHP_path, expterm_path, RNAfold_path, out_folder, fasta_folder, annotation_folder, transcript_folder, srna, statistics, tex_wig_folder, frag_wig_folder, decrease, highest_coverage, fuzzy_detect_coverage, fuzzy_within_transcript, fuzzy_downstream_transcript, fuzzy_within_gene, fuzzy_downstream_gene, transtermhp_folder, tex_notex_libs, frag_libs, tex_notex, replicates_tex, replicates_frag, table_best, min_loop_length, max_loop_length, min_stem_length, max_stem_length, min_AT_tail_length, miss_rate, range_u): self.TransTermHP_path = TransTermHP_path self.expterm_path = expterm_path self.RNAfold_path = RNAfold_path self.out_folder = out_folder self.fastas = fasta_folder self.gffs = annotation_folder self.trans = transcript_folder self.srnas = srna self.stat = statistics self.tex_wigs = tex_wig_folder self.frag_wigs = frag_wig_folder self.decrease = decrease self.cutoff_coverage = highest_coverage self.fuzzy = fuzzy_detect_coverage self.fuzzy_up_ta = fuzzy_within_transcript self.fuzzy_down_ta = fuzzy_downstream_transcript self.fuzzy_up_gene = fuzzy_within_gene self.fuzzy_down_gene = fuzzy_downstream_gene self.hp_folder = transtermhp_folder self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None) self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None) self.libs = 
self._check_libs(self.tlibs, self.flibs) self.tex_notex = tex_notex self.replicates_tex = replicates_tex self.replicates_frag = replicates_frag self.replicates = self._check_replicates( replicates_tex, replicates_frag) self.table_best = table_best self.min_loop = min_loop_length self.max_loop = max_loop_length self.min_stem = min_stem_length self.max_stem = max_stem_length self.at_tail = min_AT_tail_length self.miss_rate = miss_rate self.range_u = range_u self = self._parser_combine_wigs("terminator") return self def container_transcript( self, frag_wig_path, tex_wig_path, tex_notex, length, annotation_folder, height, width, tolerance, tolerance_coverage, replicates_tex, replicates_frag, transcript_assembly_output_folder, compare_TSS, compare_genome_annotation, TSS_fuzzy, tex_treated_libs, fragmented_libs, compare_feature_genome, table_best, terminator_folder, fuzzy_term): self.frag_wigs = frag_wig_path self.tex_wigs = tex_wig_path self.tex = tex_notex self.length = length self.gffs = annotation_folder self.height = height self.width = width self.tolerance = tolerance self.low_cutoff = tolerance_coverage self.replicates_tex = replicates_tex self.replicates_frag = replicates_frag self.replicates = self._check_replicates( replicates_tex, replicates_frag) self.out_folder = transcript_assembly_output_folder self.compare_tss = compare_TSS self.compare_cds = compare_genome_annotation self.fuzzy = TSS_fuzzy self.tlibs = self._deal_multi_inputs(tex_treated_libs, "str", None, None) self.flibs = self._deal_multi_inputs(fragmented_libs, "str", None, None) self.libs = self._check_libs(self.tlibs, self.flibs) self.c_feature = self._deal_multi_inputs(compare_feature_genome, "str", None, None) self.table_best = table_best self.terms = terminator_folder self.fuzzy_term = fuzzy_term self = self._parser_combine_wigs("transcript") return self def container_utr(self, tss_folder, annotation_folder, transcript_assembly_folder, terminator_folder, terminator_fuzzy, utr_folder, tss_source, 
base_5utr, length, base_3utr): self.tsss = tss_folder self.gffs = annotation_folder self.trans = transcript_assembly_folder self.terms = terminator_folder self.fuzzy = terminator_fuzzy self.out_folder = utr_folder self.source = tss_source self.base_5utr = base_5utr self.base_3utr = base_3utr self.length = length return self def container_srna( self, Vienna_folder, Vienna_utils, blast_plus_folder, ps2pdf14_path, srna_folder, UTR_derived_sRNA, annotation_folder, TSS_folder, transcript_assembly_folder, TSS_intergenic_fuzzy, TSS_5UTR_fuzzy, TSS_3UTR_fuzzy, TSS_interCDS_fuzzy, import_info, tex_wig_folder, frag_wig_folder, processing_site_folder, fasta_folder, mountain_plot, nr_format, srna_format, sRNA_database_path, nr_database_path, cutoff_energy, run_intergenic_TEX_coverage, run_intergenic_noTEX_coverage, run_intergenic_fragmented_coverage, run_antisense_TEX_coverage, run_antisense_noTEX_coverage, run_antisense_fragmented_coverage, intergenic_tolerance, run_utr_TEX_coverage, run_utr_noTEX_coverage, run_utr_fragmented_coverage, max_length, min_length, tex_notex_libs, frag_libs, replicates_tex, replicates_frag, tex_notex, blast_e_nr, blast_e_srna, detect_sRNA_in_CDS, table_best, decrease_intergenic, decrease_utr, fuzzy_intergenic, fuzzy_utr, cutoff_nr_hit, sORF, best_with_all_sRNAhit, best_without_sORF_candidate, overlap_percent_CDS, terminator_folder, terminator_fuzzy_in_CDS, terminator_fuzzy_out_CDS, best_with_terminator, ignore_hypothetical_protein, TSS_source, min_utr_coverage, promoter_table, best_with_promoter, ranking_promoter, promoter_name): self.vienna_path = Vienna_folder self.vienna_util = Vienna_utils self.blast_path = blast_plus_folder self.ps2pdf14_path = ps2pdf14_path self.out_folder = srna_folder self.utr_srna = UTR_derived_sRNA self.gffs = annotation_folder self.tss_folder = TSS_folder self.trans = transcript_assembly_folder self.fuzzy_inter_tss = TSS_intergenic_fuzzy self.fuzzy_5utr_tss = TSS_5UTR_fuzzy self.fuzzy_3utr_tss = TSS_3UTR_fuzzy 
self.fuzzy_intercds_tss = TSS_interCDS_fuzzy self.fuzzy_tsss = {"5utr": self.fuzzy_5utr_tss, "3utr": self.fuzzy_3utr_tss, "interCDS": self.fuzzy_intercds_tss, "inter": self.fuzzy_inter_tss} self.import_info = self._deal_multi_inputs(import_info, "str", None, None) self.tex_wigs = tex_wig_folder self.frag_wigs = frag_wig_folder self.pro_folder = processing_site_folder self.fastas = fasta_folder self.mountain = mountain_plot self.nr_format = nr_format self.srna_format = srna_format self.srna_database = sRNA_database_path self.nr_database = nr_database_path self.energy = cutoff_energy self.coverage_tex = self._deal_multi_inputs( run_intergenic_TEX_coverage, "float", 5, "--run_intergenic_TEX_coverage") self.coverage_notex = self._deal_multi_inputs( run_intergenic_noTEX_coverage, "float", 5, "--run_intergenic_noTEX_coverage") self.coverage_frag = self._deal_multi_inputs( run_intergenic_fragmented_coverage, "float", 5, "--run_intergenic_fragmented_coverage") self.anti_cover_tex = self._deal_multi_inputs( run_antisense_TEX_coverage, "float", 5, "--run_antisense_TEX_coverage") self.anti_cover_notex = self._deal_multi_inputs( run_antisense_noTEX_coverage, "float", 5, "--run_antisense_noTEX_coverage") self.anti_cover_frag = self._deal_multi_inputs( run_antisense_fragmented_coverage, "float", 5, "--run_antisense_fragmented_coverage") self.tolerance = intergenic_tolerance self.utr_tex_cover = self._deal_multi_inputs( run_utr_TEX_coverage, "str", 3, "--run_utr_TEX_coverage") self.utr_notex_cover = self._deal_multi_inputs( run_utr_noTEX_coverage, "str", 3, "--run_utr_TEX_coverage") self.utr_frag_cover = self._deal_multi_inputs( run_utr_fragmented_coverage, "str", 3, "--run_utr_fragmented_coverage") self.max_len = max_length self.min_len = min_length self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None) self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None) self.libs = self._check_libs(self.tlibs, self.flibs) self.replicates_tex = replicates_tex 
self.replicates_frag = replicates_frag self.replicates = self._check_replicates( replicates_tex, replicates_frag) self.tex_notex = tex_notex self.e_nr = blast_e_nr self.e_srna = blast_e_srna self.in_cds = detect_sRNA_in_CDS self.table_best = table_best self.decrease_inter = decrease_intergenic self.decrease_utr = decrease_utr self.fuzzy_inter = fuzzy_intergenic self.fuzzy_utr = fuzzy_utr self.nr_hits_num = cutoff_nr_hit self.sorf_file = sORF self.all_hit = best_with_all_sRNAhit self.best_sorf = best_without_sORF_candidate self.cutoff_overlap = overlap_percent_CDS self.terms = terminator_folder self.fuzzy_b = terminator_fuzzy_in_CDS self.fuzzy_a = terminator_fuzzy_out_CDS self.best_term = best_with_terminator self.hypo = ignore_hypothetical_protein self.tss_source = TSS_source self.min_utr = min_utr_coverage self.promoter_table = promoter_table self.best_promoter = best_with_promoter if ranking_promoter < 1: print("Error: --ranking_time_promoter must larger than 1...") sys.exit() self.rank_promoter = ranking_promoter self.promoter_name = self._deal_multi_inputs(promoter_name, "str", None, None) self = self._parser_combine_wigs("srna") return self def container_intersrna(self, file_type, files, args_srna, prefix, gff_file, tran_file, tss_file, pro_file, fuzzy): args_srna.file_type = file_type args_srna.gff_file = gff_file args_srna.tran_file = tran_file args_srna.tss_file = tss_file args_srna.pro_file = pro_file args_srna.fuzzy = fuzzy args_srna.prefix = prefix if file_type == "frag": args_srna.wig_f_file = os.path.join( args_srna.frag_path, "_".join([prefix, "forward.wig"])) args_srna.wig_r_file = os.path.join( args_srna.frag_path, "_".join([prefix, "reverse.wig"])) args_srna.wig_folder = args_srna.frag_wigs args_srna.input_libs = args_srna.flibs args_srna.output_file = files["frag_gff"] args_srna.output_table = files["frag_csv"] args_srna.cutoffs = args_srna.coverage_frag args_srna.tss_source = True args_srna.cut_notex = None args_srna.anti_notex_cutoff = None 
        else:
            # Tex-treated libraries: wiggle files, libs and outputs come
            # from the tex-specific settings.
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.cutoffs = args_srna.coverage_tex
            # NOTE(review): self-assignment below is a no-op — possibly a
            # leftover; confirm whether a different source attribute was meant.
            args_srna.tss_source = args_srna.tss_source
            args_srna.cut_notex = args_srna.coverage_notex
            args_srna.anti_notex_cutoff = args_srna.anti_cover_notex
        return args_srna

    def container_utrsrna(self, gff, tran, tss, files, pro, fasta,
                          file_type, prefix, args_srna):
        """Populate args_srna with the inputs for UTR-derived sRNA detection.

        Selects fragmented ("frag") or tex-treated wiggle files, libraries
        and output paths based on file_type, then derives per-UTR-class
        coverage cutoff dicts (5utr/3utr/interCDS). Returns args_srna.
        """
        args_srna.file_type = file_type
        args_srna.gff_file = gff
        args_srna.ta_file = tran
        args_srna.tss_file = tss
        args_srna.pro_file = pro
        args_srna.prefix = prefix
        args_srna.seq_file = fasta
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.utr_coverages = args_srna.utr_frag_cover
            # Fragmented libraries have no notex counterpart.
            args_srna.notex = None
        else:
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.utr_coverages = args_srna.utr_tex_cover
            args_srna.notex = args_srna.utr_notex_cover
        # Expand the 3-element coverage lists into named per-class cutoffs.
        args_srna.coverages = {"5utr": args_srna.utr_coverages[0],
                               "3utr": args_srna.utr_coverages[1],
                               "interCDS": args_srna.utr_coverages[2]}
        if args_srna.notex is not None:
            args_srna.cover_notex = {"5utr": args_srna.notex[0],
                                     "3utr": args_srna.notex[1],
                                     "interCDS": args_srna.notex[2]}
        else:
            args_srna.cover_notex = None
        return args_srna

    def extend_inter_container(self, args_srna, tsss, pros, wigs_f, wigs_r,
                               nums, output, out_table, texs, detects,
                               cutoff_coverage, notex):
        """Attach intergenic-sRNA working state (wiggles, TSS/processing
        sites, output handles, cutoffs) to args_srna and return it."""
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wigs_f = wigs_f
        args_srna.wigs_r = wigs_r
        args_srna.nums = nums
        args_srna.output = output
        args_srna.out_table = out_table
        args_srna.texs = texs
        args_srna.detects = detects
        args_srna.cutoff_coverage = cutoff_coverage
        args_srna.notex = notex
        return args_srna

    def extend_utr_container(self, args_srna, cdss, tsss, pros, wig_fs,
                             wig_rs, out, out_t, texs):
        """Attach UTR-sRNA working state to args_srna; also resets the
        utrs/srnas accumulator lists. Returns args_srna."""
        args_srna.cdss = cdss
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wig_fs = wig_fs
        args_srna.wig_rs = wig_rs
        args_srna.out = out
        args_srna.out_t = out_t
        args_srna.texs = texs
        args_srna.utrs = []
        args_srna.srnas = []
        return args_srna

    def container_sorf(self, sorf_folder, UTR_derived_sORF, transcript_folder,
                       annotation_folder, TSS_folder, utr_length, min_length,
                       max_length, tex_wig_folder, frag_wig_folder,
                       cutoff_intergenic_coverage, cutoff_antisense_coverage,
                       cutoff_5utr_coverage, cutoff_3utr_coverage,
                       cutoff_interCDS_coverage, fasta_folder, tex_notex_libs,
                       frag_libs, tex_notex, replicates_tex, replicates_frag,
                       table_best, sRNA_folder, start_codon, stop_codon,
                       cutoff_background, fuzzy_rbs, rbs_not_after_TSS,
                       print_all_combination, best_no_sRNA, best_no_TSS,
                       ignore_hypothetical_protein, min_rbs_distance,
                       max_rbs_distance):
        """Store all sORF-detection parameters on self and return self.

        Library/codon arguments are normalized through _deal_multi_inputs;
        replicate settings are validated through _check_replicates.
        """
        self.out_folder = sorf_folder
        self.utr_detect = UTR_derived_sORF
        self.trans = transcript_folder
        self.gffs = annotation_folder
        self.tsss = TSS_folder
        self.utr_length = utr_length
        self.min_len = min_length
        self.max_len = max_length
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.cutoff_inter = cutoff_intergenic_coverage
        self.cutoff_anti = cutoff_antisense_coverage
        self.cutoff_5utr = cutoff_5utr_coverage
        self.cutoff_3utr = cutoff_3utr_coverage
        self.cutoff_intercds = cutoff_interCDS_coverage
        self.fastas = fasta_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.table_best = table_best
        self.srnas = sRNA_folder
        self.start_codon = self._deal_multi_inputs(start_codon, "str",
                                                   None, None)
        self.stop_codon = self._deal_multi_inputs(stop_codon, "str",
                                                  None, None)
        self.background = cutoff_background
        self.fuzzy_rbs = fuzzy_rbs
        self.noafter_tss = rbs_not_after_TSS
        self.print_all = print_all_combination
        self.no_srna = best_no_sRNA
        self.no_tss = best_no_TSS
        self.hypo = ignore_hypothetical_protein
        self.min_rbs = min_rbs_distance
        self.max_rbs = max_rbs_distance
        # NOTE(review): rebinding `self` assumes _parser_combine_wigs
        # returns the (possibly same) container object — confirm.
        self = self._parser_combine_wigs("sorf")
        return self

    def container_srna_target(self, Vienna_folder, annotation_path,
                              fasta_path, sRNA_path, query_sRNA, program,
                              interaction_length, window_size_target,
                              span_target, window_size_srna, span_srna,
                              unstructured_region_RNAplex_target,
                              unstructured_region_RNAplex_srna,
                              unstructured_region_RNAup, energy_threshold,
                              duplex_distance, top, starget_output_folder,
                              process_rnaplex, process_rnaup, continue_rnaup,
                              potential_target_start, potential_target_end,
                              target_feature):
        """Store all sRNA-target prediction (RNAplex/RNAup) parameters on
        self and return self."""
        self.vienna_path = Vienna_folder
        self.gffs = annotation_path
        self.fastas = fasta_path
        self.srnas = sRNA_path
        self.query = self._deal_multi_inputs(query_sRNA, "str", None, None)
        self.program = program
        self.inter_length = interaction_length
        self.win_size_t = window_size_target
        self.span_t = span_target
        self.win_size_s = window_size_srna
        self.span_s = span_srna
        self.unstr_region_rnaplex_t = unstructured_region_RNAplex_target
        self.unstr_region_rnaplex_s = unstructured_region_RNAplex_srna
        self.unstr_region_rnaup = unstructured_region_RNAup
        self.energy = energy_threshold
        self.duplex_dist = duplex_distance
        self.top = top
        self.out_folder = starget_output_folder
        self.core_plex = process_rnaplex
        self.core_up = process_rnaup
        self.continue_rnaup = continue_rnaup
        self.tar_start = potential_target_start
        self.tar_end = potential_target_end
        self.features = self._deal_multi_inputs(target_feature, "str",
                                                None, None)
        return self

    def container_goterm(self, annotation_path, goterm_output_folder,
                         UniProt_id, go_obo, goslim_obo, transcript_path):
        """Store GO-term analysis parameters on self and return self."""
        self.gffs = annotation_path
        self.out_folder = goterm_output_folder
        self.uniprot = UniProt_id
        self.go = go_obo
        self.goslim = goslim_obo
        self.trans = transcript_path
        return self

    def container_sublocal(self, Psortb_path, gff_path, fasta_path,
                           bacteria_type, difference_multi, merge_to_gff,
                           sublocal_output_folder, transcript_path):
        """Store subcellular-localization (PSORTb) parameters on self and
        return self."""
        self.psortb_path = Psortb_path
        self.gffs = gff_path
        self.fastas = fasta_path
        self.gram = bacteria_type
        self.fuzzy = difference_multi
        self.merge = merge_to_gff
        self.out_folder = sublocal_output_folder
        self.trans = transcript_path
        return self

    def container_ppi(self, gff_path, proteinID_strains,
                      without_strain_pubmed, species_STRING, score,
                      ppi_output_folder, node_size, query):
        """Store protein-protein interaction (STRING/PubMed) parameters on
        self and return self."""
        self.ptts = gff_path
        self.strains = self._deal_multi_inputs(proteinID_strains, "str",
                                               None, None)
        self.no_specific = without_strain_pubmed
        self.species = species_STRING
        self.score = score
        self.out_folder = ppi_output_folder
        self.size = node_size
        self.querys = self._deal_multi_inputs(query, "str", None, None)
        return self

    def container_promoter(self, MEME_path, promoter_output_folder, tex_libs,
                           TSS_folder, fasta_folder, num_motif, nt_before_TSS,
                           motif_width, TSS_source, tex_wig_path,
                           annotation_folder, combine_all, e_value):
        """Store promoter-motif detection (MEME) parameters on self and
        return self."""
        self.meme_path = MEME_path
        self.output_folder = promoter_output_folder
        self.input_libs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.tsss = TSS_folder
        self.fastas = fasta_folder
        self.num_motif = num_motif
        self.nt_before = nt_before_TSS
        self.widths = self._deal_multi_inputs(motif_width, "str", None, None)
        self.source = TSS_source
        self.wigs = tex_wig_path
        self.gffs = annotation_folder
        self.combine = combine_all
        self.e_value = e_value
        return self

    def container_operon(self, TSS_folder, annotation_folder,
                         transcript_folder, UTR5_folder, UTR3_folder,
                         term_folder, TSS_fuzzy, term_fuzzy, min_length,
                         statistics, operon_output_folder, combine_gff,
                         operon_statistics_folder):
        """Store operon-detection parameters on self and return self."""
        self.tsss = TSS_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.utr5s = UTR5_folder
        self.utr3s = UTR3_folder
        self.terms = term_folder
        self.tss_fuzzy = TSS_fuzzy
        self.term_fuzzy = term_fuzzy
        self.length = min_length
        self.statistics = statistics
        self.output_folder = operon_output_folder
        self.combine = combine_gff
        self.stat_folder = operon_statistics_folder
        return self

    def container_snp(self, samtools_path, bcftools_path, bam_type, program,
                      fasta_path, tex_bam_path, frag_bam_path, quality,
                      read_depth, snp_output_folder, indel_fraction, chrom):
        """Store SNP-calling (samtools/bcftools) parameters on self and
        return self.

        "haploid"/"diploid" ploidy names are mapped to the "1"/"2" codes
        expected downstream; any other value is passed through unchanged.
        """
        self.samtools_path = samtools_path
        self.bcftools_path = bcftools_path
        self.types = bam_type
        self.program = self._deal_multi_inputs(program, "str", None, None)
        self.fastas = fasta_path
        self.normal_bams = tex_bam_path
        self.frag_bams = frag_bam_path
        self.quality = quality
        self.depth = read_depth
        self.out_folder = snp_output_folder
        self.fraction = indel_fraction
        if chrom == "haploid":
            chrom = "1"
        elif chrom == "diploid":
            chrom = "2"
        self.chrom = chrom
        return self

    def container_circrna(self, align, process, fasta_path, annotation_path,
                          tex_bam_path, fragmented_bam_path, read_folder,
                          circrna_stat_folder, support_reads, segemehl_folder,
                          samtools_path, start_ratio, end_ratio,
                          ignore_hypothetical_protein, out_folder):
        """Store circular-RNA detection (segemehl) parameters on self and
        return self."""
        self.align = align
        self.cores = process
        self.fastas = fasta_path
        self.gffs = annotation_path
        self.normal_bams = tex_bam_path
        self.frag_bams = fragmented_bam_path
        self.read_folder = read_folder
        self.stat_folder = circrna_stat_folder
        self.support = support_reads
        self.segemehl_path = segemehl_folder
        self.samtools_path = samtools_path
        self.start_ratio = start_ratio
        self.end_ratio = end_ratio
        self.hypo = ignore_hypothetical_protein
        self.output_folder = out_folder
        return self

    def container_ribos(self, infernal_path, riboswitch_ID, gff_path,
                        fasta_path, tss_path, transcript_path, Rfam,
                        ribos_output_folder, e_value, output_all,
                        database_folder, fuzzy, start_codon, min_dist_rbs,
                        max_dist_rbs, fuzzy_rbs, UTR_length):
        """Store riboswitch detection (Infernal/Rfam) parameters on self
        and return self."""
        self.infernal_path = infernal_path
        self.ribos_id = riboswitch_ID
        self.gffs = gff_path
        self.fastas = fasta_path
        self.tsss = tss_path
        self.trans = transcript_path
        self.rfam = Rfam
        self.out_folder = ribos_output_folder
        self.e_value = e_value
        self.output_all = output_all
        self.database = database_folder
        self.fuzzy = fuzzy
        self.start_codons = self._deal_multi_inputs(start_codon, "str",
                                                    None, None)
        self.start_rbs = min_dist_rbs
        self.end_rbs = max_dist_rbs
        self.fuzzy_rbs = fuzzy_rbs
        self.utr = UTR_length
        return self

    def container_screen(self, main_gff, side_gffs, fasta, frag_wig_folder,
                         tex_wig_folder, height, tex_libs, frag_libs,
                         present, output_folder):
        """Store screenshot-generation parameters on self and return self."""
        self.main_gff = main_gff
        self.side_gffs = self._deal_multi_inputs(side_gffs, "str", None, None)
        self.fasta = fasta
        self.frag_wigs = frag_wig_folder
        self.tex_wigs = tex_wig_folder
        self.height = height
        self.tlibs = self._deal_multi_inputs(tex_libs, "str", None, None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.present = present
        self.output_folder = output_folder
        return self
class MEME(object):
    '''detection of promoter'''

    def __init__(self, args_pro):
        # Helper objects and all working paths (tmp dirs, classified fasta
        # outputs, per-TSS-class fasta file names) are fixed here.
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.tss_path = os.path.join(args_pro.tsss, "tmp")
        if args_pro.gffs is not None:
            self.gff_path = os.path.join(args_pro.gffs, "tmp")
        else:
            self.gff_path = None
        self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
        self.tmp_folder = os.path.join(os.getcwd(), "tmp")
        # One fasta per TSS class; "all"/"all_no_orph" are bare file names
        # (joined with tmp_folder later), the rest are full paths.
        self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"),
                       "sec": os.path.join(self.tmp_folder, "secondary.fa"),
                       "inter": os.path.join(self.tmp_folder, "internal.fa"),
                       "anti": os.path.join(self.tmp_folder, "antisense.fa"),
                       "orph": os.path.join(self.tmp_folder, "orphan.fa"),
                       "all_no_orph": "without_orphan.fa",
                       "all": "all_type.fa",
                       "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"),
                       "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")}
        self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
        self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")

    def _gen_and_check_folder(self, out_path, folder, type_):
        # Return out_path/type_, removing a stale result folder of the same
        # name so the external tool starts clean.
        sub_out_folder = os.path.join(out_path, type_)
        if folder in os.listdir(sub_out_folder):
            shutil.rmtree(os.path.join(sub_out_folder, folder))
        return sub_out_folder

    def _run_normal_motif(self, input_path, out_path, filename,
                          fasta, width, args_pro, log):
        '''run MEME with specific width'''
        folder = "_".join(["promoter_motifs", filename, str(width), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-w", str(width), "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                # Parallel MEME via MPI (-p).
                command = command + ["-p", args_pro.para]
            # Log the exact command line before invoking it.
            log.write(" ".join(command + ["-oc", os.path.join(
                meme_folder, folder),
                os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            # "n" is GLAM2's positional alphabet argument (nucleotide).
            log.write(" ".join([args_pro.glam2_path, "-O",
                      os.path.join(glam_folder, folder), "-w",
                      str(width), "-b", str(width), "-r",
                      str(args_pro.num_motif), "-n",
                      str(args_pro.end_run), "n",
                      os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path, "-O",
                  os.path.join(glam_folder, folder),
                  "-w", str(width), "-b", str(width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _run_small_motif(self, input_path, out_path, filename,
                         fasta, width, args_pro, log):
        '''run MEME with range of width'''
        # width is "MIN-MAX"; split into the two bounds.
        data = width.split("-")
        min_width = data[0]
        max_width = data[1]
        folder = "_".join(["promoter_motifs", filename,
                           "-".join([str(min_width), str(max_width)]), "nt"])
        if (args_pro.program.lower() == "meme") or (
                args_pro.program.lower() == "both"):
            meme_folder = self._gen_and_check_folder(
                out_path, folder, "MEME")
            command = [args_pro.meme_path, "-maxsize", "1000000",
                       "-dna", "-nmotifs", str(args_pro.num_motif),
                       "-minsites", "0", "-maxsites", "2",
                       "-minw", str(min_width), "-maxw", str(max_width),
                       "-maxiter", "100",
                       "-evt", str(args_pro.e_value)]
            if args_pro.para is not None:
                command = command + ["-p", args_pro.para]
            log.write(" ".join(command + ["-oc", os.path.join(
                meme_folder, folder),
                os.path.join(input_path, fasta)]) + "\n")
            call(command + ["-oc", os.path.join(meme_folder, folder),
                            os.path.join(input_path, fasta)])
        if (args_pro.program.lower() == "glam2") or (
                args_pro.program.lower() == "both"):
            glam_folder = self._gen_and_check_folder(
                out_path, folder, "GLAM2")
            log.write(" ".join([args_pro.glam2_path, "-O",
                      os.path.join(glam_folder, folder),
                      "-a", str(min_width), "-b", str(max_width),
                      "-r", str(args_pro.num_motif),
                      "-n", str(args_pro.end_run), "n",
                      os.path.join(input_path, fasta)]) + "\n")
            call([args_pro.glam2_path, "-O",
                  os.path.join(glam_folder, folder),
                  "-a", str(min_width), "-b", str(max_width),
                  "-r", str(args_pro.num_motif),
                  "-n", str(args_pro.end_run), "n",
                  os.path.join(input_path, fasta)])

    def _get_fasta_file(self, fasta_path, prefix):
        # Find the fasta file (.fa/.fna/.fasta) whose stem equals prefix.
        # NOTE(review): if nothing matches, the loop falls through and the
        # LAST listed file is returned (NameError on an empty directory) —
        # confirm callers guarantee a match.
        for fasta in os.listdir(fasta_path):
            if (fasta.endswith(".fa")) and \
               (prefix == fasta.replace(".fa", "")):
                break
            elif (fasta.endswith(".fna")) and \
                 (prefix == fasta.replace(".fna", "")):
                break
            elif (fasta.endswith(".fasta")) and \
                 (prefix == fasta.replace(".fasta", "")):
                break
        return fasta

    def _check_gff(self, gffs):
        # Validate attribute uniqueness of every .gff file in the folder.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _move_and_merge_fasta(self, input_path, prefix):
        # Merge the per-class fastas into "all types" and "all without
        # orphan" files (deduplicated via del_repeat_fasta), then move every
        # class file into input_path under "<prefix>_allgenome_*" names.
        all_type = os.path.join(self.tmp_folder, self.fastas["all"])
        all_no_orph = os.path.join(self.tmp_folder,
                                   self.fastas["all_no_orph"])
        if self.fastas["all"] in os.listdir(self.tmp_folder):
            os.remove(all_type)
        if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder):
            os.remove(all_no_orph)
        # tmp_fa = pri + sec + inter + anti; tmp_all = tmp_fa + orph.
        shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"])
        self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"])
        shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"])
        self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"])
        del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph)
        del_repeat_fasta(self.fastas["tmp_all"], all_type)
        os.remove(self.fastas["tmp_fa"])
        os.remove(self.fastas["tmp_all"])
        out_prefix = os.path.join(input_path, prefix)
        shutil.move(self.fastas["pri"], "_".join([
            out_prefix, "allgenome_primary.fa"]))
        shutil.move(self.fastas["sec"], "_".join([
            out_prefix, "allgenome_secondary.fa"]))
        shutil.move(self.fastas["inter"], "_".join([
            out_prefix, "allgenome_internal.fa"]))
        shutil.move(self.fastas["anti"], "_".join([
            out_prefix, "allgenome_antisense.fa"]))
        shutil.move(self.fastas["orph"], "_".join([
            out_prefix, "allgenome_orphan.fa"]))
        shutil.move(all_type, "_".join([
            out_prefix, "allgenome_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allgenome_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        # Split each "allgenome" fasta into one file per strain, keyed on
        # the strain name embedded after the second "_" in each ">" header.
        # Files for a single-strain genome are deleted again (the allgenome
        # file already covers that case).
        # NOTE(review): `out.close()` at the end raises AttributeError when
        # no ">" header was ever seen (out stays None) — confirm inputs
        # always contain headers.
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                num_strain += 1
                                filename = fasta.split("allgenome")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain <= 1:
                    os.remove(os.path.join(input_path, "".join([
                        filename[0], strain, filename[-1]])))
        out.close()

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        # For every genome prefix, create MEME/GLAM2 result folders and run
        # the selected program(s) on each requested TSS-class fasta, once
        # per configured motif width ("MIN-MAX" widths use the ranged run).
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(os.path.join(out_path,
                                                           "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                # Match the fasta's TSS-class suffix against the requested
                # classes; "all_types"/"without_orphan" are two-word
                # suffixes and need the joined check.
                if (names[-1] in input_fastas) or (
                        ("_".join(names[-2:]) == "all_types") and (
                        "all_types" in input_fastas)) or (
                        ("_".join(names[-2:]) == "without_orphan") and (
                        "without_orphan" in input_fastas)):
                    for width in args_pro.widths:
                        print("Computing promoters of {0} - {1}".format(
                              fasta, width))
                        log.write("Computing promoters of "
                                  "{0} - length {1}.\n".format(fasta, width))
                        if "-" in width:
                            self._run_small_motif(input_path, out_path,
                                                  filename, fasta, width,
                                                  args_pro, log)
                        else:
                            self._run_normal_motif(input_path, out_path,
                                                   filename, fasta, width,
                                                   args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are "
                      "generated and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter'''
        # NOTE(review): in the non-source branch the loop lists
        # output_folder/TSS_classes but merges files from self.tss_path —
        # confirm this mismatch is intentional.
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        else:
            for tss in os.listdir(os.path.join(
                    args_pro.output_folder, "TSS_classes")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        # The merged TSS file is already classified, so treat it as
        # TSS-source input for the upstream extraction.
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        # Clean all temporary folders created during the run (input tmp
        # dirs, wig tmp dir, and the cwd-level allfasta/tmp folders).
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        if "tmp" in os.listdir(os.getcwd()):
            shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        # Convert each meme.txt / glam2.txt result into a csv table,
        # per strain (plus "allfasta" when results were combined).
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (
                    program.lower() == "meme"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "MEME")):
                    csv_file = os.path.join(output_folder, strain,
                                            "MEME", folder, "meme.csv")
                    gen_promoter_table(os.path.join(output_folder, strain,
                                       "MEME", folder, "meme.txt"),
                                       csv_file, tss_file, "meme")
                    log.write("\t" + csv_file + "\n")
            if (program.lower() == "both") or (
                    program.lower() == "glam2"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "GLAM2")):
                    csv_file = os.path.join(output_folder, strain,
                                            "GLAM2", folder, "glam2.csv")
                    gen_promoter_table(os.path.join(output_folder, strain,
                                       "GLAM2", folder, "glam2.txt"),
                                       csv_file, tss_file, "glam2")
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            # TSS file already carries class annotations.
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            # Need the genome annotation to classify TSSs first.
            if (args_pro.gffs is None):
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder,
                                      "TSS_classes"))
            print("Classifying TSSs and extracting sequence "
                  "of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        # Map numeric --use_tss codes (1-7) to TSS class names; any other
        # value aborts the run.
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) == 1:
                input_fastas.append("all_types")
            elif int(tss) == 2:
                input_fastas.append("primary")
            elif int(tss) == 3:
                input_fastas.append("secondary")
            elif int(tss) == 4:
                input_fastas.append("internal")
            elif int(tss) == 5:
                input_fastas.append("antisense")
            elif int(tss) == 6:
                input_fastas.append("orphan")
            elif int(tss) == 7:
                input_fastas.append("without_orphan")
            else:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
        return input_fastas

    def run_meme(self, args_pro, log):
        # Entry point: prepare inputs, extract TSS upstream sequences per
        # genome, optionally combine all genomes, run MEME/GLAM2, generate
        # tables, then clean up.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                                           "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(self.out_fasta,
                                                       prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(self.out_fasta,
                                                 "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
class sORFDetection(object):
    '''detection of sORF'''

    def __init__(self, args_sorf):
        # TSS and sRNA inputs are optional; their tmp paths stay None when
        # the corresponding folder was not supplied.
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        # Sub-folder names for the two result tiers.
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        # Validate attribute uniqueness of every .gff file in the folder.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        # Abort unless annotation, transcripts and at least one wiggle set
        # are present (TSSs additionally required for UTR-derived mode);
        # then validate and pre-parse all provided gff inputs.
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
                (args_sorf.tex_wigs is None) and (
                args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: TSS files are required for UTR derived"
                      " sORF detection!")
                log.write("TSS files are required for UTR derived"
                          " sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon
        and ribosome binding site'''
        # Run sorf_detection per genome prefix, then rename the
        # "<prefix>_sORF_all/_best" outputs into the all_candidates /
        # best_candidates result folders.
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(os.path.join(self.fasta_path, prefix + ".fa"),
                           srna_file, os.path.join(args_sorf.out_folder,
                           "_".join([prefix, "inter.gff"])), tss_file,
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "forward.wig"])),
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "reverse.wig"])),
                           os.path.join(self.gff_output, self.all_cand,
                           "_".join([prefix, "sORF"])), args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                    os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best = os.path.join(self.table_output, self.best,
                                        "_".join([prefix, "sORF.csv"]))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.gff"])), gff_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.gff"])), gff_best)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.csv"])), csv_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.csv"])), csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        # Remove intermediate gffs from the output folder and all tmp
        # folders created from the input directories.
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region'''
        # Returns the list of genome prefixes (one per annotation .gff);
        # writes a temporary "<prefix>_inter.gff" per genome.
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(
                      prefix))
                get_intergenic(os.path.join(args_sorf.gffs, gff),
                               os.path.join(self.tran_path,
                               "_".join([prefix, "transcript.gff"])),
                               os.path.join(args_sorf.out_folder,
                               "_".join([prefix, "inter.gff"])),
                               args_sorf.utr_detect, args_sorf.hypo,
                               args_sorf.extend_5, args_sorf.extend_3)
                log.write("\t" + os.path.join(args_sorf.out_folder,
                          "_".join([prefix, "inter.gff"])) +
                          " is generated to temporary store the "
                          "sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        # Rewrite every result table with expanded per-track coverage
        # columns.
        log.write("Running re_table.py for generating coverage "
                  "information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(args_sorf.out_folder, "tables",
                                          type_, "_".join([
                                              prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        # Entry point: validate inputs, prepare transcripts and fastas,
        # find intergenic regions, detect sORFs, compute statistics,
        # rebuild tables, and clean temporary files.
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path,
                                       None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output,
                                            self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat_file = os.path.join(
                    args_sorf.out_folder, "statistics",
                    "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     stat_file, args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)