def merge_srna_table(srna_file, csvs, wigs_f, wigs_r, tss_file, args_srna):
    """Merge the UTR-derived and intergenic sRNA tables into one CSV.

    Every sRNA in srna_file is routed, based on its sRNA_type attribute,
    to either the UTR table or the intergenic table comparison and the
    result is written to the merged output file.
    """
    libs, texs = read_libs(args_srna.libs, args_srna.merge_wigs)
    srnas = read_gff(srna_file, "sRNA", args_srna.ex_srna)
    tsss = (read_gff(tss_file, "tss", args_srna.ex_srna)
            if tss_file is not None else None)
    inters = read_table(csvs["normal"], "inter")
    utrs = read_table(csvs["utr"], "utr")
    utr_types = ("5utr", "3utr", "interCDS")
    inter_types = ("intergenic", "in_CDS", "antisense")
    out = open(csvs["merge"], "w")
    for srna in srnas:
        srna_type = srna.attributes["sRNA_type"]
        if any(type_ in srna_type for type_ in utr_types):
            compare_table(srna, utrs, "utr", wigs_f, wigs_r,
                          texs, out, tsss, args_srna)
        elif any(type_ in srna_type for type_ in inter_types):
            compare_table(srna, inters, "inter", wigs_f, wigs_r,
                          texs, out, tsss, args_srna)
    out.close()
    free_memory([wigs_r, wigs_f, srnas, tsss, inters, utrs])
def gen_table_transcript(gff_folder, args_tran):
    '''generate the detail table of transcript'''
    libs, texs = read_libs(args_tran.libs, args_tran.merge_wigs)
    header_cols = ["Genome", "Name", "Start", "End", "Strand",
                   "Detect_lib_type", "Associated_gene", "Associated_tss",
                   "Associated_term", "Coverage_details"]
    for gff in os.listdir(gff_folder):
        gff_path = os.path.join(gff_folder, gff)
        if not os.path.isfile(gff_path):
            continue
        # Wiggle files are named <genome>_forward.wig / <genome>_reverse.wig.
        prefix = gff.replace("_transcript.gff", "")
        wigs_f = read_wig(os.path.join(
            args_tran.wig_path, "_".join([prefix, "forward.wig"])),
            "+", libs)
        wigs_r = read_wig(os.path.join(
            args_tran.wig_path, "_".join([prefix, "reverse.wig"])),
            "-", libs)
        th = open(gff_path, "r")
        out = open(os.path.join(args_tran.out_folder, "tables",
                                gff.replace(".gff", ".csv")), "w")
        out_gff = open(os.path.join(args_tran.out_folder, "tmp_gff"), "w")
        out_gff.write("##gff-version 3\n")
        out.write("\t".join(header_cols) + "\n")
        trans = list(Gff3Parser().entries(th))
        print_coverage(trans, out, out_gff, wigs_f, wigs_r,
                       args_tran.table_best)
        out.close()
        out_gff.close()
        # The updated gff replaces the input transcript gff in place.
        shutil.move(os.path.join(args_tran.out_folder, "tmp_gff"), gff_path)
    if os.path.exists(os.path.join(args_tran.out_folder, "merge_wigs")):
        shutil.rmtree(os.path.join(args_tran.out_folder, "merge_wigs"))
def gen_table_transcript(gff_folder, args_tran):
    '''generate the detail table of transcript'''
    libs, texs = read_libs(args_tran.libs, args_tran.merge_wigs)
    for gff in os.listdir(gff_folder):
        if os.path.isfile(os.path.join(gff_folder, gff)):
            # Wiggle files follow <genome>_forward.wig / <genome>_reverse.wig.
            wigs_f = read_wig(os.path.join(args_tran.wig_path, "_".join([
                gff.replace("_transcript.gff", ""), "forward.wig"])),
                "+", libs)
            wigs_r = read_wig(os.path.join(args_tran.wig_path, "_".join([
                gff.replace("_transcript.gff", ""), "reverse.wig"])),
                "-", libs)
            th = open(os.path.join(gff_folder, gff), "r")
            trans = []
            out = open(os.path.join(args_tran.out_folder, "tables",
                                    gff.replace(".gff", ".csv")), "w")
            out_gff = open(os.path.join(args_tran.out_folder, "tmp_gff"),
                           "w")
            out_gff.write("##gff-version 3\n")
            # Consistency fix: header row now matches the sibling
            # implementations of this function ("Genome", "Start", ...
            # instead of the old lowercase "strain", "start", ...).
            out.write("\t".join(["Genome", "Name", "Start", "End", "Strand",
                                 "Detect_lib_type", "Associated_gene",
                                 "Associated_tss", "Associated_term",
                                 "Coverage_details"]) + "\n")
            gff_parser = Gff3Parser()
            for entry in gff_parser.entries(th):
                trans.append(entry)
            print_coverage(trans, out, out_gff, wigs_f, wigs_r,
                           args_tran.table_best)
            out.close()
            out_gff.close()
            # Replace the input transcript gff with the annotated version.
            shutil.move(os.path.join(args_tran.out_folder, "tmp_gff"),
                        os.path.join(gff_folder, gff))
    if os.path.exists(os.path.join(args_tran.out_folder, "merge_wigs")):
        shutil.rmtree(os.path.join(args_tran.out_folder, "merge_wigs"))
def filter_low_expression(gff_file, args_tss, wig_f_file, wig_r_file,
                          out_file):
    '''filter the low expressed TSS'''
    # Predicted TSSs (tars) vs the manually curated reference set (refs).
    tars = read_gff(gff_file)
    refs = read_gff(args_tss.manual_file)
    libs, texs = read_libs(args_tss.input_lib, args_tss.wig_folder)
    # NOTE(review): sibling functions pass the `libs` returned by read_libs
    # to read_wig; this one passes args_tss.libs — confirm that is intended.
    wig_fs = read_wig(wig_f_file, "+", args_tss.libs)
    wig_rs = read_wig(wig_r_file, "-", args_tss.libs)
    compare_wig(tars, wig_fs, wig_rs)
    # Raise the coverage cutoff in 0.1 steps until the comparison against
    # the reference set stops improving; return the final cutoff.
    cutoff = 1
    first = True
    while True:
        stat_value, num_ref = stat(tars, refs, cutoff, args_tss.gene_length,
                                   args_tss.cluster)
        if first:
            first = False
            best = stat_value.copy()
            # `continue` skips the cutoff increment, so the second pass
            # re-evaluates stat() at the same cutoff before comparing.
            continue
        else:
            best, change = change_best(num_ref, best, stat_value)
            if not change:
                break
        cutoff = cutoff + 0.1
    print_file(tars, cutoff, out_file)
    return cutoff
def merge_srna_table(srna_file, csvs, wig_f_file, wig_r_file, tss_file,
                     args_srna):
    """Merge the UTR-derived and intergenic sRNA tables into one CSV.

    Unlike the companion variant, this one reads the wiggle files itself;
    each sRNA is dispatched by its exact sRNA_type attribute value.
    """
    libs, texs = read_libs(args_srna.libs, args_srna.merge_wigs)
    wigs_f = read_wig(wig_f_file, "+", libs)
    wigs_r = read_wig(wig_r_file, "-", libs)
    srnas = read_gff(srna_file, "sRNA")
    tsss = read_gff(tss_file, "tss") if tss_file is not None else None
    inters = read_table(csvs["normal"], "inter")
    utrs = read_table(csvs["utr"], "utr")
    out = open(csvs["merge"], "w")
    for srna in srnas:
        srna_type = srna.attributes["sRNA_type"]
        if srna_type in ("5utr", "3utr", "interCDS"):
            compare_table(srna, utrs, "utr", wigs_f, wigs_r,
                          texs, out, tsss, args_srna)
        elif srna_type in ("intergenic", "in_CDS", "antisense"):
            compare_table(srna, inters, "inter", wigs_f, wigs_r,
                          texs, out, tsss, args_srna)
    out.close()
    free_memory([wigs_r, wigs_f, srnas, tsss, inters, utrs])
def utr_derived_srna(args_srna):
    """Detect sRNAs derived from UTR regions.

    Builds the list of regions flanking/between CDSs, classifies each
    against the transcripts, derives per-strain median coverage cutoffs,
    then detects and writes the sRNA candidates (gff + table).
    """
    inters = []
    cdss, tas, tsss, pros, seq = read_data(args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wig_fs = read_wig(args_srna.wig_f_file, "+", libs)
    wig_rs = read_wig(args_srna.wig_r_file, "-", libs)
    out = open(args_srna.output_file, "w")
    out.write("##gff-version 3\n")
    out_t = open(args_srna.output_table, "w")
    # Collect candidate regions: sequence terminals plus inter-CDS gaps.
    get_terminal(cdss, inters, seq, "start")
    get_inter(cdss, inters)
    get_terminal(cdss, inters, seq, "end")
    inters = sorted(inters, key=lambda k: (k["strain"], k["start"],
                                           k["end"], k["strand"]))
    args_srna = ArgsContainer().extend_utr_container(
        args_srna, cdss, tsss, pros, wig_fs, wig_rs, out, out_t, texs)
    for inter in inters:
        for ta in tas:
            if (inter["strain"] == ta.seq_id) and (
                    inter["strand"] == ta.strand):
                class_utr(inter, ta, args_srna)
    covers = get_utr_coverage(args_srna.utrs)
    mediandict = set_cutoff(covers, args_srna)
    print_median(args_srna.out_folder, mediandict)
    detect_srna(mediandict, args_srna)
    args_srna.out.close()
    args_srna.out_t.close()
    # Fix: `tas` was listed twice in the original free-memory list.
    paras = [wig_fs, wig_rs, args_srna.srnas, args_srna.utrs,
             args_srna.wig_fs, args_srna.wig_rs, seq, inters, tas,
             cdss, tsss, pros, covers]
    free_memory(paras)
def filter_low_expression(gff_file, args_tss, wig_f_file, wig_r_file,
                          out_file):
    '''filter the low expressed TSS'''
    # Predicted TSSs versus the manually curated reference set.
    tars = read_gff(gff_file)
    refs = read_gff(args_tss.manual_file)
    libs, texs = read_libs(args_tss.input_lib, args_tss.wig_folder)
    wig_fs = read_wig(wig_f_file, "+", args_tss.libs)
    wig_rs = read_wig(wig_r_file, "-", args_tss.libs)
    compare_wig(tars, wig_fs, wig_rs)
    # Raise the coverage cutoff in 0.1 steps until the statistics stop
    # improving, then write the filtered TSSs and return the cutoff used.
    cutoff = 1
    first = True
    while True:
        stat_value, num_ref = stat(tars, refs, cutoff, args_tss.gene_length,
                                   args_tss.cluster)
        if first:
            first = False
            best = stat_value.copy()
            # Re-evaluate once more at the same cutoff before comparing.
            continue
        best, change = change_best(num_ref, best, stat_value)
        if not change:
            break
        cutoff += 0.1
    print_file(tars, cutoff, out_file)
    return cutoff
def utr_derived_srna(args_srna):
    """Detect sRNAs derived from UTR regions and write gff + table output."""
    inters = []
    cdss, tas, tsss, pros, seq = read_data(args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wig_fs = read_wig(args_srna.wig_f_file, "+", libs)
    wig_rs = read_wig(args_srna.wig_r_file, "-", libs)
    out = open(args_srna.output_file, "w")
    out.write("##gff-version 3\n")
    out_t = open(args_srna.output_table, "w")
    # Candidate regions: sequence terminals plus the gaps between CDSs.
    get_terminal(cdss, inters, seq, "start")
    get_inter(cdss, inters)
    get_terminal(cdss, inters, seq, "end")
    inters.sort(key=lambda k: (k["strain"], k["start"],
                               k["end"], k["strand"]))
    args_srna = ArgsContainer().extend_utr_container(
        args_srna, cdss, tsss, pros, wig_fs, wig_rs, out, out_t, texs)
    for inter in inters:
        for ta in tas:
            same_strain = inter["strain"] == ta.seq_id
            same_strand = inter["strand"] == ta.strand
            if same_strain and same_strand:
                class_utr(inter, ta, args_srna)
    covers = get_utr_coverage(args_srna.utrs)
    mediandict = set_cutoff(covers, args_srna)
    print_median(args_srna.out_folder, mediandict)
    detect_srna(mediandict, args_srna)
    args_srna.out.close()
    args_srna.out_t.close()
    free_memory([wig_fs, wig_rs, args_srna.srnas, args_srna.utrs,
                 args_srna.wig_fs, args_srna.wig_rs, seq, inters, tas,
                 cdss, tas, tsss, pros, covers])
def upstream(tss_file, fasta_file, gff_file, out_class, args_pro, prefix):
    '''get the upstream sequence of TSS'''
    # One fasta per TSS class; only needed when a genome fasta is given.
    if fasta_file is not None:
        files = {"pri": open("tmp/primary.fa", "w"),
                 "sec": open("tmp/secondary.fa", "w"),
                 "inter": open("tmp/internal.fa", "w"),
                 "anti": open("tmp/antisense.fa", "w"),
                 "orph": open("tmp/orphan.fa", "w")}
    tsss, seq = read_data(tss_file, fasta_file)
    num_tss = 0
    if not args_pro.source:
        out = open(out_class, "w")
        out.write("##gff-version 3\n")
        cdss, genes = read_gff(gff_file)
    for tss in tsss:
        # TSSs from ANNOgesic carry a "type" attribute; without it the
        # --tss_source mode cannot classify the sequences.
        if ("type" not in tss.attributes.keys()) and (args_pro.source):
            print("Error: The TSS gff file may not generated from ANNOgesic."
                  "Please run with --tss_source!")
            sys.exit()
        if args_pro.source:
            name = ">" + "_".join([str(tss.start), tss.strand, tss.seq_id])
            print_fasta(seq, tss, files, name, args_pro.nt_before)
        else:
            # Classify the TSS against CDSs/genes and rebuild its attributes.
            tss_type = compare_tss_cds(tss, cdss, genes)
            tss.attributes = tss_type[1]
            tss.attributes["ID"] = tss.seq_id + "_tss" + str(num_tss)
            tss.attribute_string = "".join([
                tss_type[0], ";ID=", tss.seq_id, "_tss", str(num_tss)])
            num_tss += 1
    if not args_pro.source:
        if args_pro.tex_wigs is not None:
            libs, texs = read_libs(args_pro.input_libs, args_pro.tex_wigs)
            wigs_f = read_wig(os.path.join(
                args_pro.wig_path, prefix + "_forward.wig"), "+", libs)
            # Bug fix: the reverse wiggle was previously read with
            # strand "+"; it must be read as "-".
            wigs_r = read_wig(os.path.join(
                args_pro.wig_path, prefix + "_reverse.wig"), "-", libs)
        else:
            wigs_f = None
            wigs_r = None
        sort_tsss = sorted(tsss, key=lambda k: (k.seq_id, k.start,
                                                k.end, k.strand))
        final_tsss = fix_primary_type(sort_tsss, wigs_f, wigs_r)
        for tss in final_tsss:
            name = ">" + "_".join([str(tss.start), tss.strand, tss.seq_id])
            tss.attribute_string = ";".join(
                ["=".join(items) for items in tss.attributes.items()])
            out.write("\t".join([str(field) for field in [
                tss.seq_id, tss.source, tss.feature, tss.start,
                tss.end, tss.score, tss.strand, tss.phase,
                tss.attribute_string]]) + "\n")
            if fasta_file is not None:
                print_fasta(seq, tss, files, name, args_pro.nt_before)
def gene_expression(input_libs, gff_folder, percent_tex, percent_frag,
                    wig_f_file, wig_r_file, features, wigs, cutoff_coverage,
                    tex_notex, replicates, stat_folder, out_gff_folder,
                    cover_type, max_color, min_color):
    """Compute expression statistics for the requested feature types.

    Each .gff file in gff_folder is read and every entry of every feature
    type is compared against the strand-matched wiggle coverage; per-genome
    and total counts are accumulated and written as statistics, gff output
    and plots.
    """
    print("Loading wiggle file...")
    libs, texs = read_libs(input_libs, wigs)
    wig_fs = read_wig(wig_f_file, "+", libs)
    wig_rs = read_wig(wig_r_file, "-", libs)
    plots = {}
    repeat = {}
    for gff in os.listdir(gff_folder):
        if gff.endswith(".gff"):
            prefix = gff.replace(".gff", "")
            print("Computing " + prefix)
            gff_list, stats, outs = read_data(os.path.join(gff_folder, gff),
                                              features)
            for feature, gffs in gff_list.items():
                plots[feature] = []
                repeat[feature] = {}
                tags = []
                stats[feature]["total"] = {"total": 0, "least_one": 0,
                                           "all": 0, "none": 0}
                num = 0
                # Fix: the entry variable previously shadowed the outer
                # `gff` filename loop variable.
                for entry in gffs:
                    if entry.seq_id not in stats[feature].keys():
                        stats[feature][entry.seq_id] = {
                            "total": 0, "least_one": 0, "all": 0, "none": 0}
                    stats[feature]["total"]["total"] += 1
                    stats[feature][entry.seq_id]["total"] += 1
                    name = get_name(plots, entry, feature,
                                    repeat[feature], tags)
                    # Compare against the wiggle of the matching strand.
                    if entry.strand == "+":
                        compare_wigs(
                            wig_fs, entry, tex_notex, texs, replicates,
                            stats[feature], outs[feature],
                            plots[feature][num][name], cover_type,
                            cutoff_coverage, percent_tex, percent_frag)
                    elif entry.strand == "-":
                        compare_wigs(
                            wig_rs, entry, tex_notex, texs, replicates,
                            stats[feature], outs[feature],
                            plots[feature][num][name], cover_type,
                            cutoff_coverage, percent_tex, percent_frag)
                    num += 1
            output_stat(stats, stat_folder, prefix)
            output_gff(outs, out_gff_folder, prefix)
    plot(plots, stat_folder, max_color, min_color, cover_type)
def upstream(tss_file, fasta_file, gff_file, out_class, args_pro, prefix):
    '''get the upstream sequence of TSS'''
    # One fasta output per TSS class; only needed when a fasta is given.
    if fasta_file is not None:
        files = {"pri": open("tmp/primary.fa", "w"),
                 "sec": open("tmp/secondary.fa", "w"),
                 "inter": open("tmp/internal.fa", "w"),
                 "anti": open("tmp/antisense.fa", "w"),
                 "orph": open("tmp/orphan.fa", "w")}
    tsss, seq = read_data(tss_file, fasta_file)
    num_tss = 0
    if not args_pro.source:
        out = open(out_class, "w")
        out.write("##gff-version 3\n")
        cdss, genes = read_gff(gff_file)
    for tss in tsss:
        # ANNOgesic-generated TSSs carry a "type" attribute; without it
        # the --tss_source mode cannot classify the sequences.
        if ("type" not in tss.attributes.keys()) and (args_pro.source):
            print("Error: The TSS gff file may not generated from ANNOgesic."
                  "Please run with --tss_source!")
            sys.exit()
        if args_pro.source:
            name = ">" + "_".join([str(tss.start), tss.strand, tss.seq_id])
            print_fasta(seq, tss, files, name, args_pro.nt_before)
        else:
            # Classify the TSS against CDSs/genes and rebuild attributes.
            tss_type = compare_tss_cds(tss, cdss, genes)
            tss.attributes = tss_type[1]
            tss.attributes["ID"] = tss.seq_id + "_tss" + str(num_tss)
            tss.attribute_string = "".join([
                tss_type[0], ";ID=", tss.seq_id, "_tss", str(num_tss)])
            num_tss += 1
    if not args_pro.source:
        if args_pro.tex_wigs is not None:
            libs, texs = read_libs(args_pro.input_libs, args_pro.tex_wigs)
            wigs_f = read_wig(os.path.join(
                args_pro.wig_path, prefix + "_forward.wig"), "+", libs)
            # Bug fix: the reverse wiggle was previously read with
            # strand "+"; it must be read as "-".
            wigs_r = read_wig(os.path.join(
                args_pro.wig_path, prefix + "_reverse.wig"), "-", libs)
        else:
            wigs_f = None
            wigs_r = None
        sort_tsss = sorted(tsss, key=lambda k: (k.seq_id, k.start,
                                                k.end, k.strand))
        final_tsss = fix_primary_type(sort_tsss, wigs_f, wigs_r)
        for tss in final_tsss:
            name = ">" + "_".join([str(tss.start), tss.strand, tss.seq_id])
            tss.attribute_string = ";".join(
                ["=".join(items) for items in tss.attributes.items()])
            out.write("\t".join([str(field) for field in [
                tss.seq_id, tss.source, tss.feature, tss.start,
                tss.end, tss.score, tss.strand, tss.phase,
                tss.attribute_string]]) + "\n")
            if fasta_file is not None:
                print_fasta(seq, tss, files, name, args_pro.nt_before)
def detect_coverage(term_table, gff_file, tran_file, seq_file, wig_f_file,
                    wig_r_file, tranterm_file, wig_folder, output_file,
                    output_table, args_term):
    """Score terminator candidates against coverage and print the results.

    TransTermHP terminators and the table-based predictions are merged,
    associated with transcripts, scored against both strands' wiggle
    coverage, then written as gff (output_file) and table (output_table).
    """
    gffs, tas, hps, fr_terms, seq = read_data(gff_file, tran_file,
                                              tranterm_file, seq_file,
                                              term_table)
    terms = compare_transtermhp(hps, fr_terms)
    compare_ta(terms, tas, args_term.fuzzy)
    libs, texs = read_libs(args_term.libs, wig_folder)
    compute_wig(wig_f_file, libs, terms, "+", texs, args_term)
    compute_wig(wig_r_file, libs, terms, "-", texs, args_term)
    out = open(output_file, "w")
    out_t = open(output_table, "w")
    print_term(terms, out, out_t, args_term)
    # Fix: close the output handles (they were previously left open).
    out.close()
    out_t.close()
def detect_coverage(term_table, gff_file, tran_file, seq_file, wig_f_file,
                    wig_r_file, tranterm_file, wig_folder, output_file,
                    output_table, args_term):
    """Score terminator candidates against coverage and print the results.

    TransTermHP terminators and the table-based predictions are merged,
    associated with transcripts, scored against both strands' wiggle
    coverage, then written as gff (output_file) and table (output_table).
    """
    gffs, tas, hps, fr_terms, seq = read_data(gff_file, tran_file,
                                              tranterm_file, seq_file,
                                              term_table)
    terms = compare_transtermhp(hps, fr_terms)
    compare_ta(terms, tas, args_term.fuzzy)
    libs, texs = read_libs(args_term.libs, wig_folder)
    compute_wig(wig_f_file, libs, terms, "+", texs, args_term)
    compute_wig(wig_r_file, libs, terms, "-", texs, args_term)
    out = open(output_file, "w")
    out_t = open(output_table, "w")
    print_term(terms, out, out_t, args_term)
    # Fix: close the output handles (they were previously left open).
    out.close()
    out_t.close()
def assembly(wig_f_file, wig_r_file, wig_folder, input_lib, out_file,
             wig_type, args_tran):
    """Assemble transcripts from wiggle coverage and write them as GFF3."""
    out = open(out_file, "w")
    out.write("##gff-version 3\n")
    libs, texs = read_libs(input_lib, wig_folder)
    fwd_covers = read_wig(wig_f_file, "+", libs)
    rev_covers = read_wig(wig_r_file, "-", libs)
    fwd_tolers, fwd_trans = transfer_to_tran(fwd_covers, libs, texs, "+",
                                             args_tran)
    rev_tolers, rev_trans = transfer_to_tran(rev_covers, libs, texs, "-",
                                             args_tran)
    fill_gap_and_print(fwd_trans, "+", out, fwd_tolers, wig_type, args_tran)
    fill_gap_and_print(rev_trans, "-", out, rev_tolers, wig_type, args_tran)
    out.close()
    # Drop the large coverage dictionaries as soon as possible.
    del fwd_covers
    del rev_covers
def sorf_detection(fasta, srna_gff, inter_gff, tss_file, wig_f_file,
                   wig_r_file, out_prefix, args_sorf):
    # Detect small ORFs (sORFs) and write four outputs: all candidates
    # (<prefix>_all.gff/.csv) and the best candidates (<prefix>_best.gff/.csv).
    coverages = set_coverage(args_sorf)
    libs, texs = read_libs(args_sorf.libs, args_sorf.merge_wigs)
    inters, tsss, srnas, seq = read_data(inter_gff, tss_file, srna_gff,
                                         fasta, args_sorf.utr_detect)
    wigs = {"forward": read_wig(wig_f_file, "+", libs),
            "reverse": read_wig(wig_r_file, "-", libs)}
    # Per-strain median coverage of each region type serves as the
    # expression background for the cutoffs.
    med_inters = detect_inter_type(inters, wigs, args_sorf.background)
    inter_covers = {}
    mediandict = {}
    for strain, meds in med_inters.items():
        inter_covers[strain] = {"5utr": {}, "3utr": {}, "interCDS": {}}
        for type_, covers in meds.items():
            get_inter_coverage(covers, inter_covers[strain][type_])
    set_median(inter_covers, mediandict, coverages)
    out_ag = open("_".join([out_prefix, "all.gff"]), "w")
    out_at = open("_".join([out_prefix, "all.csv"]), "w")
    out_bg = open("_".join([out_prefix, "best.gff"]), "w")
    out_bt = open("_".join([out_prefix, "best.csv"]), "w")
    # Scan the candidate regions for start/stop codon pairs, then compare
    # the resulting sORFs against TSSs and sRNAs.
    sorfs = detect_start_stop(inters, seq, args_sorf)
    sorfs_all, sorfs_best = compare_sorf_tss(sorfs, tsss, tss_file,
                                             args_sorf)
    compare_sorf_srna(sorfs_all, srnas, srna_gff)
    compare_sorf_srna(sorfs_best, srnas, srna_gff)
    sorfs_all = sorted(sorfs_all, key=lambda k: (k["strain"], k["start"],
                                                 k["end"], k["strand"]))
    sorfs_best = sorted(sorfs_best, key=lambda k: (k["strain"], k["start"],
                                                   k["end"], k["strand"]))
    # Two passes: the "first" pass screens candidates against the coverage
    # cutoffs; after merging overlaps, the "final" pass writes the output.
    final_all = coverage_and_output(sorfs_all, mediandict, wigs, out_ag,
                                    out_at, "all", seq, coverages,
                                    args_sorf, texs, "first")
    final_best = coverage_and_output(sorfs_best, mediandict, wigs, out_bg,
                                     out_bt, "best", seq, coverages,
                                     args_sorf, texs, "first")
    final_all = merge(final_all, seq)
    final_best = merge(final_best, seq)
    final_best = get_best(final_best, tss_file, srna_gff, args_sorf)
    coverage_and_output(final_all, mediandict, wigs, out_ag, out_at, "all",
                        seq, coverages, args_sorf, texs, "final")
    coverage_and_output(final_best, mediandict, wigs, out_bg, out_bt,
                        "best", seq, coverages, args_sorf, texs, "final")
    out_ag.close()
    out_at.close()
    out_bg.close()
    out_bt.close()
def detect_transcript(wig_f_file, wig_r_file, wig_folder, input_lib,
                      out_file, wig_type, args_tran):
    """Detect transcripts from wiggle coverage and write them as GFF3."""
    out = open(out_file, "w")
    out.write("##gff-version 3\n")
    finals = {}
    libs, texs = read_libs(input_lib, wig_folder)
    fwd_covers = read_wig(wig_f_file, "+", libs)
    rev_covers = read_wig(wig_r_file, "-", libs)
    fwd_tolers, fwd_trans = transfer_to_tran(fwd_covers, libs, texs, "+",
                                             args_tran)
    rev_tolers, rev_trans = transfer_to_tran(rev_covers, libs, texs, "-",
                                             args_tran)
    fill_gap_and_print(fwd_trans, "+", finals, fwd_tolers, wig_type,
                       args_tran)
    fill_gap_and_print(rev_trans, "-", finals, rev_tolers, wig_type,
                       args_tran)
    print_transcript(finals, out)
    out.close()
    # Drop the large coverage dictionaries as soon as possible.
    del fwd_covers
    del rev_covers
def gene_expression(input_libs, gff_folder, percent_tex, percent_frag,
                    wig_f_file, wig_r_file, features, wigs, cutoff_coverage,
                    tex_notex, replicates, stat_folder, out_gff_folder,
                    cover_type, max_color, min_color):
    """Compute expression statistics for the requested feature types.

    Each .gff file in gff_folder is read and every entry of every feature
    type is compared against the strand-matched wiggle coverage; per-genome
    and total counts are accumulated and written as statistics, gff output
    and plots.
    """
    print("Loading wiggle file...")
    libs, texs = read_libs(input_libs, wigs)
    wig_fs = read_wig(wig_f_file, "+", libs)
    wig_rs = read_wig(wig_r_file, "-", libs)
    plots = {}
    repeat = {}
    for gff in os.listdir(gff_folder):
        if gff.endswith(".gff"):
            prefix = gff.replace(".gff", "")
            print("Computing " + prefix)
            gff_list, stats, outs = read_data(os.path.join(gff_folder, gff),
                                              features)
            for feature, gffs in gff_list.items():
                plots[feature] = []
                repeat[feature] = {}
                tags = []
                stats[feature]["total"] = {"total": 0, "least_one": 0,
                                           "all": 0, "none": 0}
                num = 0
                # Fix: the entry variable previously shadowed the outer
                # `gff` filename loop variable.
                for entry in gffs:
                    if entry.seq_id not in stats[feature].keys():
                        stats[feature][entry.seq_id] = {
                            "total": 0, "least_one": 0, "all": 0, "none": 0}
                    stats[feature]["total"]["total"] += 1
                    stats[feature][entry.seq_id]["total"] += 1
                    name = get_name(plots, entry, feature,
                                    repeat[feature], tags)
                    # Compare against the wiggle of the matching strand.
                    if entry.strand == "+":
                        compare_wigs(
                            wig_fs, entry, tex_notex, texs, replicates,
                            stats[feature], outs[feature],
                            plots[feature][num][name], cover_type,
                            cutoff_coverage, percent_tex, percent_frag)
                    elif entry.strand == "-":
                        compare_wigs(
                            wig_rs, entry, tex_notex, texs, replicates,
                            stats[feature], outs[feature],
                            plots[feature][num][name], cover_type,
                            cutoff_coverage, percent_tex, percent_frag)
                    num += 1
            output_stat(stats, stat_folder, prefix)
            output_gff(outs, out_gff_folder, prefix)
    plot(plots, stat_folder, max_color, min_color, cover_type)
def sorf_detection(fasta, srna_gff, inter_gff, tss_file, wig_f_file,
                   wig_r_file, out_prefix, args_sorf):
    # Detect small ORFs (sORFs) and write four outputs: all candidates
    # (<prefix>_all.gff/.csv) and the best candidates (<prefix>_best.gff/.csv).
    coverages = set_coverage(args_sorf)
    libs, texs = read_libs(args_sorf.libs, args_sorf.merge_wigs)
    inters, tsss, srnas, seq = read_data(inter_gff, tss_file, srna_gff,
                                         fasta, args_sorf.utr_detect)
    wigs = {"forward": read_wig(wig_f_file, "+", libs),
            "reverse": read_wig(wig_r_file, "-", libs)}
    # Per-strain median coverage of each region type serves as the
    # expression background for the cutoffs.
    med_inters = detect_inter_type(inters, wigs, args_sorf.background)
    inter_covers = {}
    mediandict = {}
    for strain, meds in med_inters.items():
        inter_covers[strain] = {"5utr": {}, "3utr": {}, "interCDS": {}}
        for type_, covers in meds.items():
            get_inter_coverage(covers, inter_covers[strain][type_])
    set_median(inter_covers, mediandict, coverages)
    out_ag = open("_".join([out_prefix, "all.gff"]), "w")
    out_at = open("_".join([out_prefix, "all.csv"]), "w")
    out_bg = open("_".join([out_prefix, "best.gff"]), "w")
    out_bt = open("_".join([out_prefix, "best.csv"]), "w")
    # Scan the candidate regions for start/stop codon pairs, then compare
    # the resulting sORFs against TSSs and sRNAs.
    sorfs = detect_start_stop(inters, seq, args_sorf)
    sorfs_all, sorfs_best = compare_sorf_tss(sorfs, tsss, tss_file,
                                             args_sorf)
    compare_sorf_srna(sorfs_all, srnas, srna_gff)
    compare_sorf_srna(sorfs_best, srnas, srna_gff)
    sorfs_all = sorted(sorfs_all, key=lambda k: (k["strain"], k["start"],
                                                 k["end"], k["strand"]))
    sorfs_best = sorted(sorfs_best, key=lambda k: (k["strain"], k["start"],
                                                   k["end"], k["strand"]))
    # Two passes: the "first" pass screens candidates against the coverage
    # cutoffs; after merging overlaps, the "final" pass writes the output.
    final_all = coverage_and_output(sorfs_all, mediandict, wigs, out_ag,
                                    out_at, "all", seq, coverages,
                                    args_sorf, texs, "first")
    final_best = coverage_and_output(sorfs_best, mediandict, wigs, out_bg,
                                     out_bt, "best", seq, coverages,
                                     args_sorf, texs, "first")
    final_all = merge(final_all, seq)
    final_best = merge(final_best, seq)
    final_best = get_best(final_best, tss_file, srna_gff, args_sorf)
    coverage_and_output(final_all, mediandict, wigs, out_ag, out_at, "all",
                        seq, coverages, args_sorf, texs, "final")
    coverage_and_output(final_best, mediandict, wigs, out_bg, out_bt,
                        "best", seq, coverages, args_sorf, texs, "final")
    out_ag.close()
    out_at.close()
    out_bg.close()
    out_bt.close()
def reorganize_table(input_libs, wigs, cover_header, table_file):
    """Split the coverage-detail column into one column per library track.

    Rewrites table_file in place: the cover_header column is removed and
    replaced by one "Avg_coverage:<track>" column per track, filled with
    "Not_detect" where a track has no coverage entry.

    Raises:
        ValueError: if cover_header is not present in the header row
            (previously this surfaced later as an obscure NameError).
    """
    libs, texs = read_libs(input_libs, wigs)
    fh = open(table_file, "r")
    first = True
    headers = []
    tracks, track_list = get_lib_name(libs)
    out = open(table_file + "tmp", "w")
    index = None
    for row in csv.reader(fh, delimiter='\t'):
        if first:
            # Keep every header except the coverage column, remembering
            # the coverage column's position.
            for header_num, header in enumerate(row):
                if header == cover_header:
                    index = header_num
                else:
                    headers.append(header)
            if index is None:
                raise ValueError(
                    "column {0} not found in {1}".format(cover_header,
                                                         table_file))
            first = False
            for track in tracks:
                headers.append("Avg_coverage:" + track)
            out.write("\t".join(headers) + "\n")
        else:
            if len(row) < (index + 1):
                cover_names = []
                covers = []
            else:
                cover_names, covers = import_covers(row[index])
            # Drop the coverage column (slicing past the end yields []).
            row = row[:index] + row[index + 1:]
            detects = ["Not_detect"] * len(tracks)
            for name, cover in zip(cover_names, covers):
                for num_track, track in enumerate(track_list):
                    if name in track:
                        detects[num_track] = cover
            out.write("\t".join(row + detects) + "\n")
    out.close()
    # Fix: close the input handle before replacing the file.
    fh.close()
    shutil.move(table_file + "tmp", table_file)
def reorganize_table(input_libs, wigs, cover_header, table_file):
    """Split the coverage-detail column into one column per library track.

    Rewrites table_file in place: the cover_header column is removed and
    replaced by one "Avg_coverage:<track>" column per track, filled with
    "Not_detect" where a track has no coverage entry.
    """
    libs, texs = read_libs(input_libs, wigs)
    fh = open(table_file, "r")
    tracks, track_list = get_lib_name(libs)
    out = open(table_file + "tmp", "w")
    first = True
    headers = []
    for row in csv.reader(fh, delimiter='\t'):
        if first:
            # Keep every header except the coverage column, remembering
            # the coverage column's position.
            for header_num, header in enumerate(row):
                if header == cover_header:
                    index = header_num
                else:
                    headers.append(header)
            first = False
            for track in tracks:
                headers.append("Avg_coverage:" + track)
            out.write("\t".join(headers) + "\n")
        else:
            if len(row) < (index + 1):
                cover_names, covers = [], []
            else:
                cover_names, covers = import_covers(row[index])
            # Drop the coverage column (slicing past the end yields [],
            # so this also handles rows that end at that column).
            row = row[:index] + row[index + 1:]
            detects = ["Not_detect"] * len(tracks)
            for name, cover in zip(cover_names, covers):
                for num_track, track in enumerate(track_list):
                    if name in track:
                        detects[num_track] = cover
            out.write("\t".join(row + detects) + "\n")
    out.close()
    shutil.move(table_file + "tmp", table_file)
def intergenic_srna(args_srna):
    """Detect intergenic and antisense sRNAs from transcripts.

    Each transcript is compared against CDSs; candidates are then checked
    against the coverage cutoffs (different ones for intergenic vs
    antisense) and written to the output gff and table.
    """
    # NOTE(review): both cutoff pairs come from identical calls with the
    # same arguments — presumably the helper distinguishes them internally;
    # confirm against its definition.
    inter_cutoff_coverage, inter_notex = get_intergenic_antisense_cutoff(
        args_srna)
    anti_cutoff_coverage, anti_notex = get_intergenic_antisense_cutoff(
        args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wigs_f = read_wig(args_srna.wig_f_file, "+", libs)
    wigs_r = read_wig(args_srna.wig_r_file, "-", libs)
    nums, cdss, tas, pros, genes = read_data(args_srna)
    # If the TSS file is not ANNOgesic-sourced, classify TSS types first.
    if not args_srna.tss_source:
        compute_tss_type(args_srna, cdss, genes, wigs_f, wigs_r)
    tsss, num_tss = read_tss(args_srna.tss_file)
    detects = {"overlap": False, "uni_with_tss": False, "anti": False}
    output = open(args_srna.output_file, "w")
    out_table = open(args_srna.output_table, "w")
    output.write("##gff-version 3\n")
    for ta in tas:
        detects["overlap"] = False
        detects["anti"] = False
        compare_ta_cds(cdss, ta, detects)
        if (detects["overlap"]) and (not args_srna.in_cds):
            continue
        else:
            # Antisense candidates use their own cutoffs.
            if not detects["anti"]:
                cutoff_coverage = inter_cutoff_coverage
                notex = inter_notex
            else:
                cutoff_coverage = anti_cutoff_coverage
                notex = anti_notex
            args_srna = ArgsContainer().extend_inter_container(
                args_srna, tsss, pros, wigs_f, wigs_r, nums, output,
                out_table, texs, detects, cutoff_coverage, notex)
            check_srna_condition(ta, args_srna)
    # Removed dead code: a ".stat" filename was computed here but never used.
    output.close()
    out_table.close()
    paras = [wigs_f, wigs_r, tsss, tas, pros, genes, cdss,
             args_srna.wigs_f, args_srna.wigs_r]
    free_memory(paras)
def intergenic_srna(args_srna):
    """Detect intergenic and antisense sRNAs from transcripts.

    Each transcript is compared against CDSs; candidates are then checked
    against the coverage cutoffs (different ones for intergenic vs
    antisense) and written to the output gff and table.
    """
    # NOTE(review): both cutoff pairs come from identical calls with the
    # same arguments — presumably the helper distinguishes them internally;
    # confirm against its definition.
    inter_cutoff_coverage, inter_notex = get_intergenic_antisense_cutoff(
        args_srna)
    anti_cutoff_coverage, anti_notex = get_intergenic_antisense_cutoff(
        args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wigs_f = read_wig(args_srna.wig_f_file, "+", libs)
    wigs_r = read_wig(args_srna.wig_r_file, "-", libs)
    nums, cdss, tas, pros, genes = read_data(args_srna)
    # If the TSS file is not ANNOgesic-sourced, classify TSS types first.
    if not args_srna.tss_source:
        compute_tss_type(args_srna, cdss, genes, wigs_f, wigs_r)
    tsss, num_tss = read_tss(args_srna.tss_file)
    detects = {"overlap": False, "uni_with_tss": False, "anti": False}
    output = open(args_srna.output_file, "w")
    out_table = open(args_srna.output_table, "w")
    output.write("##gff-version 3\n")
    for ta in tas:
        detects["overlap"] = False
        detects["anti"] = False
        compare_ta_cds(cdss, ta, detects)
        if (detects["overlap"]) and (not args_srna.in_cds):
            continue
        else:
            # Antisense candidates use their own cutoffs.
            if not detects["anti"]:
                cutoff_coverage = inter_cutoff_coverage
                notex = inter_notex
            else:
                cutoff_coverage = anti_cutoff_coverage
                notex = anti_notex
            args_srna = ArgsContainer().extend_inter_container(
                args_srna, tsss, pros, wigs_f, wigs_r, nums, output,
                out_table, texs, detects, cutoff_coverage, notex)
            check_srna_condition(ta, args_srna)
    # Removed dead code: a ".stat" filename was computed here but never used.
    output.close()
    out_table.close()
    paras = [wigs_f, wigs_r, tsss, tas, pros, genes, cdss,
             args_srna.wigs_f, args_srna.wigs_r]
    free_memory(paras)
def gen_table_transcript(gff_folder, args_tran):
    '''generate the detail table of transcript'''
    libs, texs = read_libs(args_tran.libs, args_tran.merge_wigs)
    header_cols = ["Genome", "Name", "Start", "End", "Strand",
                   "Detect_lib_type", "Associated_gene", "Associated_tss",
                   "Associated_term", "Coverage_details"]
    for gff in os.listdir(gff_folder):
        gff_path = os.path.join(gff_folder, gff)
        if not os.path.isfile(gff_path):
            continue
        # Wiggle files are named <genome>_forward.wig / <genome>_reverse.wig.
        prefix = gff.replace("_transcript.gff", "")
        wigs_f = read_wig(os.path.join(
            args_tran.wig_path, "_".join([prefix, "forward.wig"])),
            "+", libs)
        wigs_r = read_wig(os.path.join(
            args_tran.wig_path, "_".join([prefix, "reverse.wig"])),
            "-", libs)
        th = open(gff_path, "r")
        out = open(os.path.join(args_tran.out_folder, "tables",
                                gff.replace(".gff", ".csv")), "w")
        out_gff = open(os.path.join(args_tran.out_folder, "tmp_gff"), "w")
        out_gff.write("##gff-version 3\n")
        out.write("\t".join(header_cols) + "\n")
        trans = list(Gff3Parser().entries(th))
        # Locate the matching annotation gff, if one was provided.
        gff_file = None
        if args_tran.gffs is not None:
            candidate = os.path.join(args_tran.gffs,
                                     gff.replace("_transcript", ""))
            if os.path.isfile(candidate):
                gff_file = candidate
        print_coverage(trans, out, out_gff, wigs_f, wigs_r, gff_file)
        out.close()
        out_gff.close()
        # The updated gff replaces the input transcript gff in place.
        shutil.move(os.path.join(args_tran.out_folder, "tmp_gff"), gff_path)
def _read_lib_wig(self, args_srna):
    """Load the libraries and both strands' wiggle coverage.

    Returns [libs, texs, forward wigs, reverse wigs].
    """
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    forward_wigs = read_wig(args_srna.wig_f_file, "+", libs)
    reverse_wigs = read_wig(args_srna.wig_r_file, "-", libs)
    return [libs, texs, forward_wigs, reverse_wigs]