def intergenic_seq(seq_file, tran_file, gff_file, out_file):
    '''Write the intergenic regions of the genome to a FASTA file.

    Intergenic regions are computed separately from the transcripts and from
    the genes returned by read_file, combined with merge_inter and then
    passed through detect_confliction. Every resulting region whose start
    is smaller than its end becomes one FASTA record with the header
    ">inter_<n>|start|end|strain|parent_p|parent_m|strand".

    Args:
        seq_file, tran_file, gff_file: forwarded unchanged to read_file.
        out_file: path of the FASTA file to write (opened in "w" mode).
    '''
    # The context manager closes (and flushes) the output even when one of
    # the helpers raises; the file used to be left open.
    with open(out_file, "w") as out:
        # read_file returns a third value as well; it is not needed because
        # the merged regions are recomputed by merge_inter below.
        seq, tas, _, genes = read_file(seq_file, tran_file, gff_file)
        inter_tas = get_inter(tas, seq, "tran")
        inter_genes = get_inter(genes, seq, "gene")
        num = 0
        for tmp_merge in merge_inter(inter_tas, inter_genes):
            for merge in detect_confliction(tmp_merge, genes, seq):
                if merge["start"] >= merge["end"]:
                    continue
                # Anything that is not explicitly "+" is handled as "-".
                strand = "+" if merge["strand"] == "+" else "-"
                inter_seq = Helper().extract_gene(
                    seq[merge["strain"]], merge["start"], merge["end"],
                    strand)
                out.write(">" + "|".join([
                    "inter_" + str(num), str(merge["start"]),
                    str(merge["end"]), merge["strain"],
                    merge["parent_p"], merge["parent_m"], strand]) + "\n")
                out.write(inter_seq + "\n")
                num += 1
def assign_tss(tss, tran):
    '''Cross-link a TSS and a transcript through their attributes.

    The transcript identifier is stored in (or appended, comma-separated,
    to) the "Parent" attribute of the TSS, and the TSS name is stored in
    (or appended to) the "associated_tss" attribute of the transcript.
    A transcript without "ID" is identified as
    "<feature>:<start>-<end>_<strand>" and a TSS without "Name" as
    "TSS:<start>_<strand>", where <strand> comes from
    Helper().get_strand_name.
    '''
    tran_attrs = tran.attributes
    tss_attrs = tss.attributes

    if "ID" in tran_attrs:
        tran_id = tran_attrs["ID"]
    else:
        strand = Helper().get_strand_name(tran.strand)
        tran_id = (tran.feature + ":" + str(tran.start) + "-" +
                   str(tran.end) + "_" + strand)
    if "Parent" in tss_attrs:
        tss_attrs["Parent"] = ",".join([tss_attrs["Parent"], tran_id])
    else:
        tss_attrs["Parent"] = tran_id

    if "Name" in tss_attrs:
        tss_name = tss_attrs["Name"]
    else:
        strand = Helper().get_strand_name(tss.strand)
        tss_name = "TSS:" + str(tss.start) + "_" + strand
    if "associated_tss" in tran_attrs:
        tran_attrs["associated_tss"] = ",".join(
            [tran_attrs["associated_tss"], tss_name])
    else:
        tran_attrs["associated_tss"] = tss_name
def get_fasta(seq, merge, num, strand, args_term, out, out_i):
    '''Write one intergenic region as one or several FASTA records.

    A region longer than args_term.window is cut into windows that start
    every args_term.shift positions; the last window is stretched to the
    region end as soon as fewer than args_term.shift positions would remain
    behind it. Shorter regions are written as a single record. For every
    record the detailed header goes to out_i, while out receives the short
    ">inter_<n>" header followed by the sequence.

    Returns:
        The record counter after the last written record.
    '''
    def emit(begin, stop, index):
        # One record: full header to the index file, sequence to the FASTA.
        fragment = Helper().extract_gene(seq[merge["strain"]],
                                         begin, stop, strand)
        header = "|".join([
            "inter_" + str(index), str(begin), str(stop), merge["strain"],
            merge["parent_p"], merge["parent_m"], merge["p_pos"],
            merge["m_pos"], strand])
        out_i.write(">" + header + "\n")
        out.write(">inter_" + str(index) + "\n")
        out.write(fragment + "\n")

    if (merge["end"] - merge["start"]) <= args_term.window:
        emit(merge["start"], merge["end"], num)
        return num + 1

    for begin in range(merge["start"], merge["end"] + 1, args_term.shift):
        # Too little left for another shifted window: finish at the end.
        is_last = ((merge["end"] - (begin + args_term.window)) <
                   args_term.shift)
        stop = merge["end"] if is_last else begin + args_term.window
        emit(begin, stop, num)
        num += 1
        if is_last:
            break
    return num
def print_file(final_tsss, program, out_gff):
    '''Write the final TSS / processing-site entries to a GFF3 file.

    Each entry loses its temporary "print" attribute, receives a fresh
    "ID" ("<seq_id>_<program in lower case><running number>") and a "Name"
    ("TSS:<start>_<strand>" when program is "TSS", otherwise
    "processing:<start>_<strand>"). Its attribute_string is rebuilt from the
    attributes and the entry is written as one tab-separated line with
    "ANNOgesic" as source.

    Args:
        final_tsss: iterable of entries; they are modified in place.
        program: "TSS" selects the TSS naming, any other value the
            processing-site naming.
        out_gff: path of the GFF3 file to write (opened in "w" mode).
    '''
    name_prefix = "TSS:" if program == "TSS" else "processing:"
    # The with-block closes the file; it used to be left open.
    with open(out_gff, "w") as out:
        out.write("##gff-version 3\n")
        for num_final, tss in enumerate(final_tsss):
            tss.attributes.pop("print", None)
            tss.attributes["ID"] = "_".join(
                [tss.seq_id, program.lower() + str(num_final)])
            strand = Helper().get_strand_name(tss.strand)
            tss.attributes["Name"] = name_prefix + "_".join(
                [str(tss.start), strand])
            tss.attribute_string = ";".join(
                ["=".join(items) for items in tss.attributes.items()])
            out.write("\t".join([str(field) for field in [
                tss.seq_id, "ANNOgesic", tss.feature, tss.start, tss.end,
                tss.score, tss.strand, tss.phase,
                tss.attribute_string]]) + "\n")
def get_upstream(seq, tss, out, name, nt_before):
    '''Write the nt_before positions upstream of a TSS as a two-line record.

    For a "+" strand TSS the window ends at the TSS, for any other strand
    it starts at the TSS. The window is clamped to the range 1..len(seq),
    so a TSS close to either end of the sequence yields a shorter window
    instead of out-of-range coordinates.

    Args:
        seq: the sequence; must support len().
        tss: entry providing start and strand.
        out: writable handle receiving "<name>\\n<sequence>\\n".
        name: header line written before the sequence.
        nt_before: requested window length.
    '''
    if tss.strand == "+":
        start = tss.start - nt_before + 1
        if start <= 0:
            # window would begin before the first position
            start = 1
        fasta = Helper().extract_gene(seq, start, tss.start, tss.strand)
    else:
        end = tss.start + nt_before - 1
        if end > len(seq):
            # window would run past the last position
            end = len(seq)
        fasta = Helper().extract_gene(seq, tss.start, end, tss.strand)
    out.write("{0}\n{1}\n".format(name, fasta))
def compare_cds_check_orphan(tsss, cdss):
    '''Re-classify orphan TSSs by comparing them with the given CDSs.

    Only TSSs whose "type" attribute is "Orphan" on entry are processed.
    Each of them is compared with every CDS on the same sequence and
    strand; depending on is_primary / is_internal / is_antisense the
    attributes of the TSS are updated in place.
    '''
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source line; in particular the position of get_attributes() and the
    # fact that the antisense test sits under the same-strand condition
    # should be confirmed against the upstream file.
    for tss in tsss:
        if tss.attributes["type"] == "Orphan":
            for cds in cdss:
                if (tss.seq_id == cds.seq_id) and \
                   (tss.strand == cds.strand):
                    if is_primary(cds.start, cds.end, tss.start, tss.strand):
                        # "type" may already have been changed by an earlier
                        # CDS of this loop; in that case "Primary" and its
                        # UTR length are appended with "&".
                        if tss.attributes["type"] != "Orphan":
                            tss.attributes["type"] = "&".join(
                                [tss.attributes["type"], "Primary"])
                            if tss.strand == "+":
                                tss.attributes["UTR_length"] = "&".join([
                                    tss.attributes["UTR_length"],
                                    "Primary_" + str(cds.start - tss.start)
                                ])
                            else:
                                tss.attributes["UTR_length"] = "&".join([
                                    tss.attributes["UTR_length"],
                                    "Primary_" + str(tss.start - cds.end)
                                ])
                        else:
                            # First match: replace the orphan classification.
                            tss.attributes["type"] = "Primary"
                            if tss.strand == "+":
                                tss.attributes["UTR_length"] = (
                                    "Primary_" + str(cds.start - tss.start))
                            else:
                                tss.attributes["UTR_length"] = (
                                    "Primary_" + str(tss.start - cds.end))
                        get_attributes(tss, cds)
                    if is_internal(cds.start, cds.end, tss.start, tss.strand):
                        # Skip CDSs that are already listed in
                        # "associated_gene", either by locus_tag or by the
                        # positional label "<feature>:<start>-<end>_<strand>".
                        if "locus_tag" in cds.attributes.keys():
                            if (cds.attributes["locus_tag"] not in
                                    tss.attributes["associated_gene"]):
                                get_attributes_int_anti(tss, cds, "Internal")
                        else:
                            strand = Helper().get_strand_name(cds.strand)
                            if ("".join([
                                    cds.feature, ":", str(cds.start), "-",
                                    str(cds.end), "_", strand
                            ]) not in tss.attributes["associated_gene"]):
                                get_attributes_int_anti(tss, cds, "Internal")
                    if is_antisense(cds.start, cds.end, tss.start, tss.strand):
                        # Same duplicate check as for internal TSSs.
                        if "locus_tag" in cds.attributes.keys():
                            if (cds.attributes["locus_tag"] not in
                                    tss.attributes["associated_gene"]):
                                get_attributes_int_anti(tss, cds, "Antisense")
                        else:
                            strand = Helper().get_strand_name(cds.strand)
                            if ("".join([
                                    cds.feature, ":", str(cds.start), "-",
                                    str(cds.end), "_", strand
                            ]) not in tss.attributes["associated_gene"]):
                                get_attributes_int_anti(tss, cds, "Antisense")
def get_upstream(seq, tss, out, name, nt_before):
    '''Write the sequence upstream of a TSS as "<name>\\n<sequence>\\n".

    The requested window of nt_before positions ends at the TSS on the "+"
    strand and starts at the TSS otherwise; it is cut at position 1 and at
    len(seq) when the TSS lies too close to an end of the sequence.
    '''
    if tss.strand == "+":
        first = max(1, tss.start - nt_before + 1)
        last = tss.start
    else:
        first = tss.start
        last = min(len(seq), tss.start + nt_before - 1)
    fasta = Helper().extract_gene(seq, first, last, tss.strand)
    out.write("{0}\n{1}\n".format(name, fasta))
def srna_sorf_comparison(sRNA_file, sORF_file, sRNA_out, sORF_out):
    '''Comparison of sRNA and sORF. It can be a filter of sRNA detection.

    Both GFF files are read (after del_attributes has produced fresh
    attributes without the stale "sORF" / "sRNA" key) and sorted by
    sequence id, start, end and strand. Every sRNA/sORF pair on the same
    sequence and strand that overlaps is cross-referenced: the sRNA gets
    "<sORF ID>:<start>-<end>_<strand>" appended to its "sORF" list and
    vice versa. The annotated entries are finally written with print_file.

    Args:
        sRNA_file, sORF_file: paths of the input GFF files.
        sRNA_out, sORF_out: paths of the GFF files to write.
    '''
    def sort_key(entry):
        return (entry.seq_id, entry.start, entry.end, entry.strand)

    def overlap(srna, sorf):
        # containment in either direction, or a partial overlap at one end
        return (((srna.start <= sorf.start) and (srna.end >= sorf.end)) or
                ((srna.start >= sorf.start) and (srna.end <= sorf.end)) or
                ((srna.start <= sorf.start) and (srna.end >= sorf.start) and
                 (srna.end <= sorf.end)) or
                ((srna.start >= sorf.start) and (srna.start <= sorf.end) and
                 (srna.end >= sorf.end)))

    # All four files are handled by context managers so that they are
    # closed even if parsing or printing raises; the inputs used to be
    # opened inline and never closed.
    with open(sRNA_out, "w") as out_r, open(sORF_out, "w") as out_o:
        out_r.write("##gff-version 3\n")
        out_o.write("##gff-version 3\n")
        srnas = []
        with open(sRNA_file) as srna_fh:
            for entry in Gff3Parser().entries(srna_fh):
                entry.attributes = del_attributes("sORF", entry)
                srnas.append(entry)
        srnas.sort(key=sort_key)
        sorfs = []
        with open(sORF_file) as sorf_fh:
            for entry in Gff3Parser().entries(sorf_fh):
                entry.attributes = del_attributes("sRNA", entry)
                sorfs.append(entry)
        sorfs.sort(key=sort_key)
        for srna in srnas:
            for sorf in sorfs:
                if (srna.seq_id != sorf.seq_id) or (
                        srna.strand != sorf.strand):
                    continue
                if not overlap(srna, sorf):
                    continue
                strand = Helper().get_strand_name(sorf.strand)
                srna.attributes.setdefault("sORF", []).append("".join(
                    [sorf.attributes["ID"], ":", str(sorf.start), "-",
                     str(sorf.end), "_", strand]))
                strand = Helper().get_strand_name(srna.strand)
                sorf.attributes.setdefault("sRNA", []).append("".join(
                    [srna.attributes["ID"], ":", str(srna.start), "-",
                     str(srna.end), "_", strand]))
        print_file(sorfs, out_o, "sRNA")
        print_file(srnas, out_r, "sORF")
def read_libs(input_libs, wig_folder):
    '''Merge the "tex" wiggle files of each strand into tmp/merge_*.wig.

    Existing tmp/merge_forward.wig and tmp/merge_reverse.wig are removed
    first. Each library string is split on ":"; field 0 is the file name
    inside wig_folder, field 1 the library type and field 4 the strand.
    Only "tex" libraries are merged ("+" into the forward file, "-" into
    the reverse file).

    Returns:
        An empty dict (nothing is ever stored in it).
    '''
    libs = {}
    stale_files = (("merge_forward.wig", "tmp/merge_forward.wig"),
                   ("merge_reverse.wig", "tmp/merge_reverse.wig"))
    for file_name, file_path in stale_files:
        if file_name in os.listdir(os.path.join(os.getcwd(), "tmp")):
            os.remove(file_path)

    merged_by_strand = {
        "+": os.path.join("tmp", "merge_forward.wig"),
        "-": os.path.join("tmp", "merge_reverse.wig"),
    }
    for lib in input_libs:
        datas = lib.split(":")
        if datas[1] != "tex":
            continue
        merged = merged_by_strand.get(datas[4])
        if merged is not None:
            Helper().merge_file(os.path.join(wig_folder, datas[0]), merged)
    return libs
def assign_parent(gff, tran, feature):
    '''Link an annotation entry to the transcript that contains it.

    The transcript ID is stored in (or appended, comma-separated, to) the
    "Parent" attribute of gff. In return the transcript attribute
    "associated_<feature>" receives a label of gff: its "locus_tag",
    else its "protein_id", else its "Name", else the positional label
    "<feature>:<start>-<end>_<strand>" built with Helper().get_strand_name.
    '''
    gff_attrs = gff.attributes
    tran_attrs = tran.attributes

    if "Parent" in gff_attrs:
        gff_attrs["Parent"] = ",".join(
            [gff_attrs["Parent"], tran_attrs["ID"]])
    else:
        gff_attrs["Parent"] = tran_attrs["ID"]

    # Pick the first available identifier in order of preference.
    for candidate in ("locus_tag", "protein_id", "Name"):
        if candidate in gff_attrs:
            label = gff_attrs[candidate]
            break
    else:
        strand = Helper().get_strand_name(gff.strand)
        label = "".join([gff.feature, ":", str(gff.start), "-",
                         str(gff.end), "_", strand])

    key = "_".join(["associated", feature])
    if key in tran_attrs:
        tran_attrs[key] = ",".join([tran_attrs[key], label])
    else:
        tran_attrs[key] = label
def __init__(self, args_snp):
    '''Create the helper objects and derive every output path.

    The result sub-folder is "compare_related_and_reference_genomes" when
    args_snp.types is "related_genome" and "mutations_of_reference_genomes"
    otherwise. The figure folder below "statistics" is created through
    Helper.check_make_folder; all other paths are only stored.
    '''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    file_type = ("compare_related_and_reference_genomes"
                 if args_snp.types == "related_genome"
                 else "mutations_of_reference_genomes")
    out_folder = args_snp.out_folder
    result_folder = os.path.join(out_folder, file_type)
    self.seq_path = os.path.join(result_folder, "seqs")
    self.stat_path = os.path.join(result_folder, "statistics")
    self.fig_path = os.path.join(self.stat_path, "figs")
    self.helper.check_make_folder(self.fig_path)
    # final tables / raw outputs live below the result folder, temporary
    # data directly below the output folder
    self.outputs = {
        "table": os.path.join(result_folder, "SNP_tables"),
        "raw": os.path.join(result_folder, "SNP_raw_outputs"),
        "tmp": os.path.join(out_folder, "tmp_bcf"),
        "depth": os.path.join(out_folder, "tmp_depth"),
    }
    self.bams = {
        "whole": os.path.join(out_folder, "whole_reads.bam"),
        "sort": os.path.join(out_folder, "whole_reads_sorted.bam"),
        "bams": [],
    }
    self.header = os.path.join(out_folder, "header")
    self.baqs = {
        "with": "with_BAQ",
        "without": "without_BAQ",
        "extend": "extend_BAQ",
    }
def __init__(self, args_ribo):
    '''Create the helper objects and derive the working paths.

    The "tmp" sub-folders of the gff, transcript and fasta inputs are always
    stored; the TSS one only when args_ribo.tsss is given (None otherwise).
    Depending on args_ribo.program ("both", "riboswitch", "thermometer")
    the riboswitch and/or RNA-thermometer output settings are obtained from
    self._create_out_folders.
    '''
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    self.tss_path = (os.path.join(args_ribo.tsss, "tmp")
                     if args_ribo.tsss is not None else None)
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")

    wants_ribos = ((args_ribo.program == "both") or
                   (args_ribo.program == "riboswitch"))
    if wants_ribos:
        ribos_settings = self._create_out_folders(
            args_ribo.ribos_out_folder, "riboswitch", args_ribo.database)
        (self.ribos_stat_folder, self.ribos_gff_outfolder,
         self.ribos_table_folder, self.ribos_scan_folder,
         self.ribos_tmp_files, self.ribos_rfam,
         self.ribos_suffixs) = ribos_settings

    wants_thermo = ((args_ribo.program == "both") or
                    (args_ribo.program == "thermometer"))
    if wants_thermo:
        thermo_settings = self._create_out_folders(
            args_ribo.thermo_out_folder, "RNA_thermometer",
            args_ribo.database)
        (self.thermo_stat_folder, self.thermo_gff_outfolder,
         self.thermo_table_folder, self.thermo_scan_folder,
         self.thermo_tmp_files, self.thermo_rfam,
         self.thermo_suffixs) = thermo_settings
def deal_cds_forward(cdss_f, target_folder, fasta, genes, tar_start, tar_end):
    '''Write the target region of every forward-strand CDS to FASTA.

    A file "<seq_id>_target.fa" is opened in target_folder (mode "w")
    whenever the sequence id differs from the one of the previous CDS. The
    extracted region starts tar_start positions before the CDS start (or at
    1) and ends at cds.start + tar_end - 1, limited to the end of the CDS
    when the CDS is shorter than tar_end and to len(fasta) when it would
    reach the end of the sequence. The record is printed for the parent
    gene of the CDS when one of the IDs in its comma-separated "Parent"
    attribute matches a gene, otherwise for the CDS itself.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_f:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until the interpreter collected it.
                if out is not None:
                    out.close()
                out = open(os.path.join(
                    target_folder, "_".join([cds.seq_id, "target.fa"])), "w")
                pre_id = cds.seq_id
            if cds.start > tar_start:
                start = cds.start - tar_start
            else:
                start = 1
            if ((cds.start + tar_end) < len(fasta)) and (
                    (cds.end - cds.start) >= tar_end):
                end = cds.start + tar_end - 1
            elif cds.start + tar_end >= len(fasta):
                end = len(fasta)
            else:
                # only case left: the CDS is shorter than tar_end
                end = cds.end
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target = cds
            if "Parent" in cds.attributes:
                parents = cds.attributes["Parent"].split(",")
                for gene in genes:
                    if gene.attributes["ID"] in parents:
                        target = gene
                        break
            print_fasta(target, seq, out)
    finally:
        # also reached when extraction or printing raises
        if out is not None:
            out.close()
def deal_cds_forward(cdss_f, target_folder, fasta, genes, tar_start, tar_end):
    '''Write the target region of every forward-strand CDS to FASTA.

    A file "<seq_id>_target.fa" is opened in target_folder (mode "w")
    whenever the sequence id differs from the one of the previous CDS. The
    extracted region starts tar_start positions before the CDS start (or at
    1) and ends at cds.start + tar_end - 1, limited to the end of the CDS
    when the CDS is shorter than tar_end and to len(fasta) when it would
    reach the end of the sequence. The result of check_parent_gene is
    handed to print_fasta together with the CDS and its sequence.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_f:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until the interpreter collected it.
                if out is not None:
                    out.close()
                out = open(os.path.join(
                    target_folder, "_".join([cds.seq_id, "target.fa"])), "w")
                pre_id = cds.seq_id
            if cds.start > tar_start:
                start = cds.start - tar_start
            else:
                start = 1
            if ((cds.start + tar_end) < len(fasta)) and (
                    (cds.end - cds.start) >= tar_end):
                end = cds.start + tar_end - 1
            elif cds.start + tar_end >= len(fasta):
                end = len(fasta)
            else:
                # only case left: the CDS is shorter than tar_end
                end = cds.end
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target_gene = check_parent_gene(cds, genes)
            print_fasta(cds, seq, out, target_gene)
    finally:
        # also reached when extraction or printing raises
        if out is not None:
            out.close()
def deal_cds_reverse(cdss_r, target_folder, fasta, genes, tar_start, tar_end):
    '''Write the target region of every reverse-strand CDS to FASTA.

    Records are appended (mode "a") to "<seq_id>_target.fa" in
    target_folder; the file is switched whenever the sequence id differs
    from the one of the previous CDS. The extracted region ends tar_start
    positions behind the CDS end (or at len(fasta)) and starts tar_end
    positions into the CDS, limited to the CDS start when the CDS is
    shorter than tar_end and to position 1 at the beginning of the
    sequence. The result of check_parent_gene is handed to print_fasta
    together with the CDS and its sequence.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_r:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until the interpreter collected it.
                if out is not None:
                    out.close()
                out = open(os.path.join(
                    target_folder, "_".join([cds.seq_id, "target.fa"])), "a")
                pre_id = cds.seq_id
            if len(fasta) - cds.end > tar_start:
                end = cds.end + tar_start
            else:
                end = len(fasta)
            # NOTE(review): the forward variant uses "+ tar_end - 1";
            # confirm that "- tar_end - 1" (not "+ 1") is intended here.
            if ((cds.end - tar_end) > 1) and (
                    (cds.end - cds.start) >= tar_end):
                start = cds.end - tar_end - 1
            elif cds.end - tar_end < 1:
                start = 1
            elif (cds.end - cds.start) < tar_end:
                start = cds.start
            else:
                # cds.end - tar_end == 1 with a CDS of at least tar_end:
                # no branch used to match, leaving start unassigned (or
                # stale from the previous CDS). The window begins at the
                # first position in that case.
                start = 1
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target_gene = check_parent_gene(cds, genes)
            print_fasta(cds, seq, out, target_gene)
    finally:
        # also reached when extraction or printing raises
        if out is not None:
            out.close()
def __init__(self, args_snp):
    '''Create the helper objects and derive every working path.

    The result sub-folder is "compare_reference" when args_snp.types is
    "reference" and "validate_target" otherwise. If the output folder
    already holds "whole_reads.bam", Helper.remove_all_content is called
    for the output folder with the pattern "whole_read" and kind "file".
    '''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    file_type = ("compare_reference" if args_snp.types == "reference"
                 else "validate_target")
    out_folder = args_snp.out_folder
    result_folder = os.path.join(out_folder, file_type)
    self.seq_path = os.path.join(result_folder, "seqs")
    self.stat_path = os.path.join(result_folder, "statistics")
    self.fasta_path = os.path.join(args_snp.fastas, "tmp")
    self.outputs = {
        "table": os.path.join(result_folder, "SNP_table"),
        "raw": os.path.join(result_folder, "SNP_raw_outputs"),
        "tmp": os.path.join(out_folder, "tmp_bcf"),
        "depth": os.path.join(out_folder, "tmp_depth"),
    }
    # left-overs of an earlier run
    if "whole_reads.bam" in os.listdir(out_folder):
        self.helper.remove_all_content(out_folder, "whole_read", "file")
    self.bams = {
        "whole": os.path.join(out_folder, "whole_reads.bam"),
        "sort": os.path.join(out_folder, "whole_reads_sorted.bam"),
        "bams": [],
    }
    self.header = os.path.join(out_folder, "header")
    self.baqs = {
        "with": "with_BAQ",
        "without": "without_BAQ",
        "extend": "extend_BAQ",
    }
def deal_cds_reverse(cdss_r, target_folder, fasta, genes, tar_start, tar_end):
    '''Write the target region of every reverse-strand CDS to FASTA.

    Records are appended (mode "a") to "<seq_id>_target.fa" in
    target_folder; the file is switched whenever the sequence id differs
    from the one of the previous CDS. The extracted region ends tar_start
    positions behind the CDS end (or at len(fasta)) and starts tar_end
    positions into the CDS, limited to the CDS start when the CDS is
    shorter than tar_end and to position 1 at the beginning of the
    sequence. The record is printed for the parent gene of the CDS when one
    of the IDs in its comma-separated "Parent" attribute matches a gene,
    otherwise for the CDS itself.
    '''
    pre_id = ""
    out = None
    try:
        for cds in cdss_r:
            if cds.seq_id != pre_id:
                # Close the file of the previous sequence before switching;
                # it used to stay open until the interpreter collected it.
                if out is not None:
                    out.close()
                out = open(os.path.join(
                    target_folder, "_".join([cds.seq_id, "target.fa"])), "a")
                pre_id = cds.seq_id
            if len(fasta) - cds.end > tar_start:
                end = cds.end + tar_start
            else:
                end = len(fasta)
            # NOTE(review): the forward variant uses "+ tar_end - 1";
            # confirm that "- tar_end - 1" (not "+ 1") is intended here.
            if ((cds.end - tar_end) > 1) and (
                    (cds.end - cds.start) >= tar_end):
                start = cds.end - tar_end - 1
            elif cds.end - tar_end < 1:
                start = 1
            elif (cds.end - cds.start) < tar_end:
                start = cds.start
            else:
                # cds.end - tar_end == 1 with a CDS of at least tar_end:
                # no branch used to match, leaving start unassigned (or
                # stale from the previous CDS). The window begins at the
                # first position in that case.
                start = 1
            seq = Helper().extract_gene(fasta, start, end, cds.strand)
            target = cds
            if "Parent" in cds.attributes:
                # "Parent" may list several IDs separated by commas; a
                # whole-string comparison missed the gene in that case.
                parents = cds.attributes["Parent"].split(",")
                for gene in genes:
                    if gene.attributes["ID"] in parents:
                        target = gene
                        break
            print_fasta(target, seq, out)
    finally:
        # also reached when extraction or printing raises
        if out is not None:
            out.close()
def compare_sorf_srna(sorfs, srnas, srna_gff):
    '''Store the overlapping sRNAs of every sORF in sorf["srna"].

    Without an sRNA gff file (srna_gff is None) every sORF gets ["NA"].
    Otherwise each sORF gets the list of labels
    "<sRNA ID>:<start>-<end>_<strand>" of all sRNAs on the same strain and
    strand that overlap it, or ["NA"] when there is none.
    '''
    if srna_gff is None:
        for sorf in sorfs:
            sorf["srna"] = ["NA"]
        return

    for sorf in sorfs:
        hits = []
        sorf["srna"] = hits
        for srna in srnas:
            if (sorf["strain"] != srna.seq_id) or (
                    sorf["strand"] != srna.strand):
                continue
            covers = ((srna.start <= sorf["start"]) and
                      (srna.end >= sorf["end"]))
            inside = ((srna.start >= sorf["start"]) and
                      (srna.end <= sorf["end"]))
            over_start = ((srna.start <= sorf["start"]) and
                          (srna.end >= sorf["start"]) and
                          (srna.end <= sorf["end"]))
            over_end = ((srna.start >= sorf["start"]) and
                        (srna.start <= sorf["end"]) and
                        (srna.end >= sorf["end"]))
            if covers or inside or over_start or over_end:
                strand = Helper().get_strand_name(srna.strand)
                hits.append("".join([
                    srna.attributes["ID"], ":", str(srna.start), "-",
                    str(srna.end), "_", strand]))
        if not hits:
            sorf["srna"] = ["NA"]
def __init__(self, args_circ):
    '''Create the helper objects and derive the working paths.

    All result folders are located below args_circ.output_folder. When an
    alignment is requested (args_circ.align) but no fasta folder is given,
    an error message is printed and the program exits.
    '''
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out_folder = args_circ.output_folder
    self.alignment_path = os.path.join(out_folder, "segemehl_align")
    self.splice_path = os.path.join(out_folder, "segemehl_splice")
    self.candidate_path = os.path.join(out_folder, "circRNA_tables")
    self.gff_folder = os.path.join(out_folder, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    # file names and prefixes of the splice / trans-realigned outputs
    self.splices = {
        "all_file": "splicesites_all.bed",
        "file": "splicesites.bed",
        "all": "splicesites_all",
        "splice": "splicesites",
    }
    self.trans = {
        "all_file": "transrealigned_all.bed",
        "file": "transrealigned.bed",
        "all": "transrealigned_all",
        "trans": "transrealigned",
    }
    self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
    if args_circ.align and (args_circ.fastas is None):
        print("Error: There is no genome fasta file!!!")
        sys.exit()
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def get_feature(cds):
    '''Return a label identifying the given entry.

    Preference order: the "locus_tag" attribute, the "protein_id"
    attribute, then "<ID>:<start>-<end>_<strand>" and finally
    "<feature>:<start>-<end>_<strand>" (strand name from
    Helper().get_strand_name).
    '''
    attrs = cds.attributes
    for key in ("locus_tag", "protein_id"):
        if key in attrs:
            return attrs[key]
    strand = Helper().get_strand_name(cds.strand)
    prefix = attrs["ID"] if "ID" in attrs else cds.feature
    return "".join(
        [prefix, ":", str(cds.start), "-", str(cds.end), "_", strand])
def __init__(self):
    '''Create the helper objects and set the temporary file names.'''
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    # names of the temporary files
    (self.tmp_fa, self.tmp_gff) = ("tmp.fa", "tmp.gff")
    (self.tmp_wig_forward,
     self.tmp_wig_reverse) = ("tmp_forward.wig", "tmp_reverse.wig")
def assign_sorf(sorf, starts, ends, fasta):
    '''Store the candidate positions and the sequence of an sORF.

    sorf["starts"] / sorf["ends"] keep the given collections, sorf["start"]
    is the smallest start and sorf["end"] the largest end (both converted
    with int), and sorf["seq"] is extracted from the sequence of the sORF's
    strain for that span and strand.
    '''
    sorf["starts"] = starts
    sorf["ends"] = ends
    sorf["start"] = min(int(position) for position in starts)
    sorf["end"] = max(int(position) for position in ends)
    strain_seq = fasta[sorf["strain"]]
    sorf["seq"] = Helper().extract_gene(
        strain_seq, sorf["start"], sorf["end"], sorf["strand"])
def _print_tssfile(self, nums, tss_features, tss, tss_pro, strain, method,
                   out, tss_libs):
    '''Write one merged TSS / processing site as a GFF3 line.

    tss_pro (with its first letter upper-cased) is used as the feature
    type and as prefix of the name; its lower-case form plus
    nums["tss_uni"] is the ID. The type, UTR length and locus-tag lists in
    tss_features, as well as tss_libs, are joined with "&".
    '''
    tss_pro = tss_pro[0].upper() + tss_pro[1:]
    tss_merge_type = "&".join(tss_features["tss_types"])
    utr_length = "&".join(tss_features["utr_lengths"])
    merge_locus_tag = "&".join(tss_features["locus_tags"])
    libs = "&".join(tss_libs)
    strand = Helper().get_strand_name(tss.super_strand)
    attribute_pairs = [
        ("Name", "".join([tss_pro, ":", str(tss.super_pos), "_", strand])),
        ("ID", tss_pro.lower() + str(nums["tss_uni"])),
        ("type", tss_merge_type),
        ("UTR_length", str(utr_length)),
        ("associated_gene", merge_locus_tag),
        ("libs", libs),
        ("Method", "TSSpredator"),
    ]
    attribute_string = ";".join(
        ["=".join(pair) for pair in attribute_pairs])
    position = str(tss.super_pos)
    columns = [strain, method, tss_pro, position, position, ".",
               tss.super_strand, ".", attribute_string]
    out.write("\t".join(columns) + "\n")
def read_gff(tss_predict_file, tss_manual_file, gff_file, lengths):
    '''Read the predicted TSSs, the manual TSSs and the annotation.

    Predicted TSSs are all kept. Manual TSSs are kept when their sequence
    id is a key of lengths, or when lengths has the key "all"; they are
    tagged with libs="manual" and method="manual". Every TSS gets the
    attribute print=False. From the annotation, entries accepted by
    Helper().feature_without_notgene are collected as CDSs and entries with
    feature "gene" as genes. All lists are sorted by sequence id, start,
    end and strand.

    Returns:
        (tsss, cdss, genes) where tsss is a dict with the keys "tsss_p"
        (predicted), "tsss_m" (manual) and "merge" (left empty here).
    '''
    def sort_key(entry):
        return (entry.seq_id, entry.start, entry.end, entry.strand)

    tsss = {"tsss_p": [], "tsss_m": [], "merge": []}
    cdss = []
    genes = []
    gff_parser = Gff3Parser()
    # One with-statement opens the files in the original order and closes
    # all of them even if parsing raises.
    with open(tss_predict_file, "r") as tssp_fh, \
            open(tss_manual_file, "r") as tssm_fh, \
            open(gff_file, "r") as g_f:
        for entry in gff_parser.entries(tssp_fh):
            entry.attributes["print"] = False
            tsss["tsss_p"].append(entry)
        for entry in gff_parser.entries(tssm_fh):
            if (entry.seq_id in lengths) or ("all" in lengths):
                entry.attributes["print"] = False
                entry.attributes["libs"] = "manual"
                entry.attributes["method"] = "manual"
                tsss["tsss_m"].append(entry)
        for entry in gff_parser.entries(g_f):
            if Helper().feature_without_notgene(entry):
                cdss.append(entry)
            # NOTE(review): kept independent of the check above (nesting
            # reconstructed from a flattened source line) - confirm.
            if entry.feature == "gene":
                genes.append(entry)
    tsss["tsss_p"] = sorted(tsss["tsss_p"], key=sort_key)
    tsss["tsss_m"] = sorted(tsss["tsss_m"], key=sort_key)
    cdss = sorted(cdss, key=sort_key)
    genes = sorted(genes, key=sort_key)
    return tsss, cdss, genes
def __init__(self, args_ribo):
    '''Create the helper objects and derive every working path.

    Inputs are read from the "tmp" sub-folders of the gff, TSS, transcript
    and fasta folders; all results and temporary folders are located below
    args_ribo.out_folder.
    '''
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    self.tss_path = os.path.join(args_ribo.tsss, "tmp")
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
    out_folder = args_ribo.out_folder
    self.stat_folder = os.path.join(out_folder, "statistics")
    self.gff_outfolder = os.path.join(out_folder, "gffs")
    self.table_folder = os.path.join(out_folder, "tables")
    self.scan_folder = os.path.join(out_folder, "scan_Rfam")
    self.ribos_rfam = os.path.join(args_ribo.database,
                                   "Rfam_riboswitch.cm")
    self.tmp_files = {
        kind: os.path.join(out_folder, "tmp_" + kind)
        for kind in ("fasta", "scan", "table")
    }
    # file-name suffixes of the generated tables
    self.suffixs = {
        "csv": "riboswitch.csv",
        "txt": "riboswitch_prescan.txt",
        "re_txt": "riboswitch_scan.txt",
        "re_csv": "riboswitch_scan.csv",
    }
def remove_primary(tss, tss_entry):
    '''Drop every "Primary" classification from a TSS entry.

    The "type", "UTR_length" and "associated_gene" values of tss_entry[1]
    are "&"-separated parallel lists; all positions whose type is exactly
    "Primary" are removed from the three of them.

    Returns:
        (tss_string, tss_dict): the new attribute dict (including "Name",
        "TSS:<start>_<strand>") and its "key=value" form joined with ";".
    '''
    previous = tss_entry[1]
    types = previous["type"].split("&")
    utrs = previous["UTR_length"].split("&")
    genes = previous["associated_gene"].split("&")
    kept = [(type_, utrs[position], genes[position])
            for position, type_ in enumerate(types)
            if type_ != "Primary"]
    strand = Helper().get_strand_name(tss.strand)
    tss_dict = {
        "Name": "_".join(["TSS:" + str(tss.start), strand]),
        "type": "&".join([item[0] for item in kept]),
        "UTR_length": "&".join([item[1] for item in kept]),
        "associated_gene": "&".join([item[2] for item in kept]),
    }
    tss_string = ";".join(
        ["=".join([key, tss_dict[key]])
         for key in ("UTR_length", "associated_gene", "type", "Name")])
    return (tss_string, tss_dict)
def get_gene_info(cds):
    '''Return the locus_tag of an entry, or a positional label.

    Without a "locus_tag" attribute the label is
    "<feature>:<start>-<end>_<strand>", the strand name coming from
    Helper().get_strand_name.
    '''
    if "locus_tag" in cds.attributes:
        return cds.attributes["locus_tag"]
    strand = Helper().get_strand_name(cds.strand)
    return "".join(
        [cds.feature, ":", str(cds.start), "-", str(cds.end), "_", strand])
def merge_libs(input_libs, wig_folder, program):
    '''Merge the wiggle files of each strand in the working directory.

    Existing merge_forward.wig / merge_reverse.wig in the current working
    directory are removed first. Each library string is split on ":";
    field 0 is the file name inside wig_folder, field 1 the library type
    and field 4 the strand. For program "TSS" the "tex" libraries are
    merged, for "processing" the "notex" ones.

    Raises:
        ValueError: if program is neither "TSS" nor "processing". This used
            to surface as an UnboundLocalError on the first library.
    '''
    if program == "TSS":
        type_ = "tex"
    elif program == "processing":
        type_ = "notex"
    else:
        # fail before anything is deleted
        raise ValueError(
            "program must be 'TSS' or 'processing', got {0!r}".format(
                program))
    for merged_name in ("merge_forward.wig", "merge_reverse.wig"):
        if merged_name in os.listdir(os.getcwd()):
            os.remove(merged_name)
    merged_by_strand = {"+": "merge_forward.wig", "-": "merge_reverse.wig"}
    for lib in input_libs:
        datas = lib.split(":")
        if datas[1] != type_:
            continue
        merged_name = merged_by_strand.get(datas[4])
        if merged_name is not None:
            Helper().merge_file(os.path.join(wig_folder, datas[0]),
                                os.path.join(os.getcwd(), merged_name))
def __init__(self, tar_folder, ref_folder):
    '''Create the helper objects and store the two "tmp" folders.'''
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    tmp_tar = os.path.join(tar_folder, "tmp")
    tmp_ref = os.path.join(ref_folder, "tmp")
    self.folders = {"tmp_tar": tmp_tar, "tmp_ref": tmp_ref}
def import_to_tss(tss_type, cds_pos, tss, locus_tag, tss_entry):
    '''Add one classification to the attributes of a TSS.

    The UTR label is "<tss_type>_<distance>", the distance being
    |cds_pos - tss.start| or "NA" when cds_pos is "NA". With an empty
    tss_entry a fresh attribute dict is created. Otherwise the
    "&"-separated lists in tss_entry[1] are extended, except when a Primary
    classification already exists and another Primary one arrives: then
    the existing Primary entries are overwritten if the new distance is
    smaller than the recorded one.

    Returns:
        (tss_string, tss_dict): the attribute dict and its "key=value"
        form joined with ";".
    '''
    if cds_pos == "NA":
        distance = "NA"
    else:
        distance = str(int(math.fabs(cds_pos - tss.start)))
    utr = "_".join([tss_type, distance])

    merged = None
    if len(tss_entry) != 0:
        known = tss_entry[1]
        types = known["type"].split("&")
        utrs = known["UTR_length"].split("&")
        tags = known["associated_gene"].split("&")
        if tss_type == "Primary" and ("Primary" in known["type"]):
            for position, known_type in enumerate(types):
                if "Primary" not in known_type:
                    continue
                recorded = utrs[position].split("_")
                # keep the closer of the two primary assignments
                if math.fabs(cds_pos - tss.start) < int(recorded[1]):
                    utrs[position] = utr
                    tags[position] = locus_tag
        else:
            types.append(tss_type)
            utrs.append(utr)
            tags.append(locus_tag)
        merged = (types, utrs, tags)

    strand = Helper().get_strand_name(tss.strand)
    if merged is None:
        type_value, utr_value, gene_value = tss_type, utr, locus_tag
    else:
        type_value, utr_value, gene_value = [
            "&".join(values) for values in merged]
    tss_dict = {
        "Name": "_".join(["TSS:" + str(tss.start), strand]),
        "type": type_value,
        "UTR_length": utr_value,
        "associated_gene": gene_value,
    }
    tss_string = ";".join(
        ["=".join([key, tss_dict[key]])
         for key in ("UTR_length", "associated_gene", "type", "Name")])
    return (tss_string, tss_dict)