def train_vblur_from_meta(hist_fn, cds_fa, vblur_fname):
    """Run the meta-profile pipeline and save the resulting blur vector.

    Args:
        hist_fn: path of the read-length histogram file handed to
            ``parse_rlen_hist``.
        cds_fa: path handed to ``get_cds_range`` to obtain the CDS ranges.
        vblur_fname: destination path passed to ``write_vblur``.

    The tuning values (``utr5_offset``, ``imax``, ``rlen_min``, ``rlen_max``,
    ``converge_cutoff``) are read from module scope.
    """
    cds_ranges = get_cds_range(cds_fa)
    rlen_profiles = parse_rlen_hist(hist_fn)
    # meta_pipeline returns three values; only the first one is persisted.
    blur_vector, _, _ = meta_pipeline(
        rlen_profiles,
        cds_ranges,
        utr5_offset,
        imax,
        rlen_min,
        rlen_max,
        converge_cutoff,
    )
    write_vblur(blur_vector, vblur_fname)
def _has_output(path):
    """Return True when ``path`` exists and holds at least one byte."""
    return os.path.exists(path) and os.path.getsize(path) > 0


def _abort_unless_written(path, step_name):
    """Exit with status 1 if the step called ``step_name`` left no usable ``path``."""
    if _has_output(path):
        return
    print("FATAL: deblur_pipeline failed at {}!".format(step_name),
          file=sys.stderr)
    print("abort program!", file=sys.stderr)
    # sys.exit instead of the bare exit() builtin: the latter is injected by
    # the site module and is not guaranteed to exist (e.g. ``python -S``).
    sys.exit(1)


def deblur_pipeline(bam_fname, cds_fa, oprefix, force):
    """Full pipeline for deblurring ribo profiles of a given sample.

    Each step writes one file derived from ``oprefix``. A step is skipped
    when its output file already exists and is non-empty, unless ``force``
    is truthy. An existing but empty output (for instance left behind by an
    interrupted run) is regenerated rather than reused.

    Args:
        bam_fname: alignment file passed to ``group_reads_by_length``.
        cds_fa: path passed on to the helpers that need the CDS ranges.
        oprefix: prefix for all output files; when it ends with ``/`` the
            file stem ``ribo`` is appended. Its directory part is created
            if it does not exist.
        force: when truthy, every step is re-run even if its output exists.

    Raises:
        SystemExit: with status 1 when a step leaves its output file missing
            or empty.
    """
    # step 0: prepare input parameters
    odir = os.path.dirname(oprefix)
    if odir and not os.path.exists(odir):
        os.makedirs(odir)
    if oprefix.endswith("/"):
        oprefix += "ribo"
    raw_hist = "{}_raw.hist".format(oprefix)
    high_cov_hist = "{}_hc.hist".format(oprefix)
    vblur_fname = "{}.vblur".format(oprefix)
    eps_fname = "{}.eps".format(oprefix)
    profile_fname = "{}.profile".format(oprefix)

    # step 1: generate length-specific profiles
    if force or not _has_output(raw_hist):
        group_reads_by_length(bam_fname, raw_hist)
        _abort_unless_written(raw_hist, "group_reads_by_len")
    else:
        print("length-specific profile exists, use cached", file=sys.stderr)

    # step 2: filter high-coverage profiles
    if force or not _has_output(high_cov_hist):
        filter_high_cover_profile(raw_hist, cds_fa, cover_ratio,
                                  cnt_threshold, high_cov_hist)
        _abort_unless_written(high_cov_hist, "filter_high_cover_profile")
    else:
        print("high-coverage profile exists, use cached", file=sys.stderr)

    # step 3: train blur vector from meta profiles
    if force or not _has_output(vblur_fname):
        train_vblur_from_meta(high_cov_hist, cds_fa, vblur_fname)
        _abort_unless_written(vblur_fname, "train_vblur_from_meta")
    else:
        print("vblur file exists, use cached", file=sys.stderr)

    # step 4: deblur high-coverage profiles
    if force or not _has_output(eps_fname):
        deblur_transcripts(high_cov_hist, cds_fa, vblur_fname, eps_fname)
        _abort_unless_written(eps_fname, "deblur_transcripts")
    else:
        print("deblur file exists, use cached", file=sys.stderr)

    # step 5: combine length-specific profiles
    if force or not _has_output(profile_fname):
        cds_range = get_cds_range(cds_fa)
        ctrue_merge = construct_deblur_profiles(eps_fname, vblur_fname,
                                                raw_hist, cds_range)
        aprof = batch_build_Aprof(ctrue_merge, cds_range,
                                  -utr5_offset, asite_offset)
        write_profiles(aprof, profile_fname)
        _abort_unless_written(profile_fname, "combine_profile")
    else:
        print("final results exists, nothing needs to be done",
              file=sys.stderr)
def filter_high_cover_profile(hist_fn, cds_fa, cover_ratio, cnt_threshold, ofname):
    """Keep only the high-coverage transcript profiles and write them out.

    Args:
        hist_fn: read-length histogram file parsed with ``parse_rlen_hist``.
        cds_fa: path handed to ``get_cds_range``.
        cover_ratio: forwarded to ``filter_transcript_profiles``.
        cnt_threshold: forwarded to ``filter_transcript_profiles``.
        ofname: output path passed to ``write_rlen_hist``.

    ``utr5_offset`` and ``utr3_offset`` are read from module scope.
    """
    cds_ranges = get_cds_range(cds_fa)
    rlen_records = parse_rlen_hist(hist_fn)
    profiles = get_transcript_profiles(
        rlen_records, cds_ranges, utr5_offset, utr3_offset
    )
    selected = filter_transcript_profiles(
        profiles, cds_ranges, cnt_threshold, cover_ratio
    )
    # Invert the parsed records: map each record's 'tid' back to its key.
    tid_to_rid = {}
    for rid, record in rlen_records.items():
        tid_to_rid[record['tid']] = rid
    write_rlen_hist(selected, cds_ranges, tid_to_rid, ofname)
# NOTE(review): the plt.show() call below is the tail of a definition whose
# header lies outside the visible chunk; it is kept untouched and may need to
# be re-indented to match that enclosing definition.
plt.show()

# Script entry point. Uses Python 2 print-statement syntax.
# Command line: rlen.hist cds_range.txt rlen.vblur rlen.eps output_dir
if __name__ == "__main__":
    # argv[0] is the script name, so exactly five user arguments are required.
    if len(sys.argv) != 6:
        print "Usage: python frameshift_celebrity.py rlen.hist cds_range.txt rlen.vblur rlen.eps output_dir"
        exit(1)
    hist_fn = sys.argv[1]
    cds_txt = sys.argv[2]
    vblur_fname = sys.argv[3]
    deblur_fname = sys.argv[4]
    odir = sys.argv[5]
    # presumably creates the output directory if needed -- confirm in ensure_dir
    ensure_dir(odir)
    fname = get_file_core(hist_fn)
    print "get cds range"
    cds_range = get_cds_range(cds_txt)
    print "parse read len hist file"
    tlist = parse_rlen_hist(hist_fn)
    print "get pre-computed blur vector"
    b = read_vblur(vblur_fname)
    print "get pre-computed deblur results"
    ptrue, eps = read_essentials(deblur_fname)
    print "construct cobs all at once"
    # utr5_offset, utr3_offset, rlen_min and rlen_max come from module scope.
    tprofile = get_transcript_profiles(tlist, cds_range, utr5_offset, utr3_offset)
    # Disabled alternative that additionally took a klist argument:
    #cobs = build_cobs_with_shifts(tprofile, cds_range, utr5_offset, utr3_offset, rlen_min, rlen_max, klist)
    cobs = construct_all_cobs(tprofile, cds_range, utr5_offset, utr3_offset, rlen_min, rlen_max)
    print "construct ctrue all at once"
    ctrue = batch_build_ctrue(ptrue, eps, cobs)
    # Keys of cobs as an array; under Python 2 dict.keys() returns a list.
    tid_list = np.array(cobs.keys())
# NOTE(review): the return statement below is the tail of a definition whose
# header lies outside the visible chunk; it is kept untouched and must be
# re-indented to sit inside that enclosing function.
return means, varrs, skews

# Script entry point. Uses Python 2 syntax (print statements, dict.iteritems).
# Command line: transcript.fasta cds_range.txt rlen.hist rlen.vblur rlen.eps output_dir
if __name__ == "__main__":
    # argv[0] is the script name, so exactly six user arguments are required.
    if len(sys.argv) != 7:
        print "Usage: python elongation_rate.py transcript.fasta cds_range.txt rlen.hist rlen.vblur rlen.eps output_dir"
        exit(1)
    tfasta = sys.argv[1]
    cds_txt = sys.argv[2]
    hist_fn = sys.argv[3]
    vblur_fname = sys.argv[4]
    deblur_fname = sys.argv[5]
    odir = sys.argv[6]
    # presumably creates the output directory if needed -- confirm in ensure_dir
    ensure_dir(odir)
    fname = get_file_core(hist_fn)
    print "get cds range"
    cds_range = get_cds_range(cds_txt)
    tseq = get_tseq(tfasta, cds_range)
    print "parse read len hist file"
    tlist = parse_rlen_hist(hist_fn)
    print "get pre-computed blur vector"
    b = read_vblur(vblur_fname)
    print "get pre-computed deblur results"
    ptrue, eps = read_essentials(deblur_fname)
    print "construct cobs all at once"
    # utr5_offset, utr3_offset, rlen_min and rlen_max come from module scope.
    tprofile = get_transcript_profiles(tlist, cds_range, utr5_offset, utr3_offset)
    # Disabled alternative that additionally took a klist argument:
    # cobs = build_cobs_with_shifts(tprofile, cds_range, utr5_offset, utr3_offset, rlen_min, rlen_max, klist)
    cobs = construct_all_cobs(tprofile, cds_range, utr5_offset, utr3_offset, rlen_min, rlen_max)
    print "construct ctrue all at once"
    ctrue_rlen = batch_build_ctrue(ptrue, eps, cobs)
    # Collapse each transcript's list of profiles into one via merge_profiles.
    mprof = { tid: merge_profiles(plist) for tid, plist in ctrue_rlen.iteritems() }
    # Note the sign flip on utr5_offset; asite_offset comes from module scope.
    base_prof = batch_build_Aprof(mprof, cds_range, -utr5_offset, asite_offset)