def _drop_uncomparable_qc_sections(qc):
    """Remove, in place, the QC sections that must not take part in comparison.

    - 'general' holds metadata (date, pipeline version, ...) rather than
      quality metrics.
    - 'align_enrich' -> 'jsd' is excluded because its last 3 columns are
      random; JSD is covered by the task-level test instead.

    Missing sections are ignored so the same routine can be applied to both
    the freshly generated QC dict and the reference one.
    """
    qc.pop('general', None)
    if 'align_enrich' in qc and 'jsd' in qc['align_enrich']:
        qc['align_enrich'].pop('jsd')


def main():
    """Build the QC report (HTML + JSON) and optionally compare with a reference.

    Steps:
      1. Parse command-line arguments.
      2. Create the root QC category and attach every sub-category to it.
      3. Write the HTML report to ``args.out_qc_html``.
      4. Write the QC JSON to ``args.out_qc_json``.
      5. If ``args.qc_json_ref`` is given, compare the generated QC dict with
         the reference JSON (ignoring non-comparable sections); otherwise the
         match result is False.
      6. Record the match result in ``qc_json_ref_match.txt``.
    """
    # read params
    log.info('Parsing QC logs and reading QC plots...')
    args = parse_arguments()

    # make a root QCCategory
    cat_root = make_cat_root(args)

    # make QCCategory for each category
    make_cat_align(args, cat_root)
    make_cat_lib_complexity(args, cat_root)
    make_cat_replication(args, cat_root)
    make_cat_peak_stat(args, cat_root)
    make_cat_align_enrich(args, cat_root)
    make_cat_peak_enrich(args, cat_root)
    make_cat_etc(args, cat_root)

    log.info('Creating HTML report...')
    write_txt(args.out_qc_html, cat_root.to_html())

    log.info('Creating QC JSON file...')
    j = cat_root.to_dict()
    # The JSON file is written BEFORE any section is stripped for comparison,
    # so the output file always contains the complete QC dict.
    write_txt(args.out_qc_json, json.dumps(j, indent=4))

    if args.qc_json_ref:
        log.info('Comparing QC JSON file with reference...')
        # we want to compare actual quality metrics only
        _drop_uncomparable_qc_sections(j)

        with open(args.qc_json_ref, 'r') as fp:
            j_ref = json.load(fp, object_pairs_hook=OrderedDict)
        _drop_uncomparable_qc_sections(j_ref)

        match_qc_json_ref = j == j_ref
    else:
        match_qc_json_ref = False

    run_shell_cmd('echo {} > qc_json_ref_match.txt'.format(match_qc_json_ref))

    log.info('All done.')
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Write a FRiP QC file computed from strand-aware adjusted TA records.

    Each TA record is passed through ``bedtools slop -s`` with
    ``-l -half_fraglen -r half_fraglen`` (``half_fraglen = (fraglen + 1) / 2``).
    Records whose coordinates become negative or inverted are dropped by the
    ``awk`` filter. The remaining records that overlap at least one peak are
    counted (``bedtools intersect -wa -u | wc -l``).

    The value written is that count divided by the total number of lines in
    ``ta``. If ``peak`` has no lines, the numerator is 0.0 and no shell
    pipeline is run.

    Args:
        ta: Path to the TAG-ALIGN file (given to ``bedtools slop -i``).
        peak: Path to the peak file; it is gunzipped to a temporary file
            because of a bedtools bug when .gz is given for -a and -b.
        chrsz: Chromosome sizes file (given to ``bedtools slop -g``).
        fraglen: Fragment length used to derive the slop amount.
        out_dir: Directory for the output QC file and the temporary file.

    Returns:
        Path of the written ``<peak basename>.frip.qc`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    half_fraglen = (fraglen + 1) / 2

    if get_num_lines(peak) == 0:
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)
        try:
            cmd = 'bedtools slop -i {} -g {} '
            cmd += '-s -l {} -r {} | '
            cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
            cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
            cmd += '-wa -u | wc -l'
            cmd = cmd.format(
                ta,
                chrsz,
                -half_fraglen,
                half_fraglen,
                tmp2)  # peak
            val1 = run_shell_cmd(cmd)
        finally:
            # Always remove the temporary peak file, even if the pipeline
            # above fails, so failed runs do not leave stray files behind.
            rm_f(tmp2)

    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1) / float(val2)))
    return frip_qc
def frip(ta, peak, out_dir):
    """Write a FRiP QC file: TA records overlapping peaks / total TA lines.

    The numerator is the number of records of ``ta`` that overlap at least
    one record of ``peak`` (``bedtools intersect -wa -u | wc -l``). The
    denominator is the number of lines in ``ta``. If ``peak`` has no lines,
    the numerator is 0.0 and bedtools is not run.

    Args:
        ta: Path to the TAG-ALIGN file.
        peak: Path to the peak file.
        out_dir: Directory for the output QC file and temporary files.

    Returns:
        Path of the written ``<peak basename>.frip.qc`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    # Temporary files are registered as soon as they are created so that the
    # `finally` clause removes them even when a later step fails.
    tmp_files = []
    try:
        if get_num_lines(peak) == 0:
            val1 = 0.0
        else:
            # due to bedtools bug when .gz is given for -a and -b
            tmp1 = gunzip(ta, 'tmp1', out_dir)
            tmp_files.append(tmp1)
            tmp2 = gunzip(peak, 'tmp2', out_dir)
            tmp_files.append(tmp2)

            cmd = 'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
            cmd = cmd.format(
                tmp1,  # ta
                tmp2)  # peak
            val1 = run_shell_cmd(cmd)

        val2 = get_num_lines(ta)
        write_txt(frip_qc, str(float(val1) / float(val2)))
    finally:
        rm_f(tmp_files)
    return frip_qc
def _ctl_depths_differ(depth_a, depth_b, ratio):
    """Return True if either depth is more than ``ratio`` times the other.

    An empty control (depth 0) cannot be compared by division. It is treated
    as differing from any non-empty control, and as not differing from
    another empty one.
    """
    if depth_a == 0 or depth_b == 0:
        return depth_a != depth_b
    return (depth_a / float(depth_b) > ratio
            or depth_b / float(depth_a) > ratio)


def main():
    """Choose a control for each IP replicate and compute subsampling depths.

    Outputs written under ``args.out_dir``:
      * ``args.out_tsv_basename``: chosen control index per replicate
        (-1 means the pooled control).
      * ``args.out_tsv_subsample_basename``: per-replicate depth to subsample
        the chosen control to (0 means no subsampling).
      * ``args.out_txt_subsample_pooled_basename``: depth to subsample the
        pooled control to for the pooled replicate (0 means no subsampling).

    Control selection rules:
      * one control only: use it (index 0) for every replicate;
      * ``args.always_use_pooled_ctl``: pooled control for every replicate;
      * otherwise, if any pair of controls differs in depth by more than a
        factor of ``args.ctl_depth_ratio``: pooled control for every
        replicate;
      * otherwise, per replicate: pooled control if there is no control with
        the same index or if that control has fewer reads than the
        replicate; else the control with the same index.
    """
    # read params
    args = parse_arguments()

    log.info('Initializing and making output directory...')
    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    # reproducibility QC
    log.info('Choosing appropriate control for each IP replicate...')
    num_rep = len(args.tas)
    num_ctl = len(args.ctl_tas)

    # num lines in tagaligns
    depths = [get_num_lines(ta) for ta in args.tas]
    # num lines in control tagaligns
    depths_ctl = [get_num_lines(ctl_ta) for ctl_ta in args.ctl_tas]
    depth_rep_pooled = sum(depths)
    depth_ctl_pooled = sum(depths_ctl)

    # make them dicts including -1 key (meaning pooled one)
    depths = dict(enumerate(depths))
    depths_ctl = dict(enumerate(depths_ctl))
    depths[-1] = depth_rep_pooled
    depths_ctl[-1] = depth_ctl_pooled

    ctl_ta_idx = [0]*num_rep
    if num_ctl == 1:
        # if only one control, use it for all replicates
        pass
    elif args.always_use_pooled_ctl:
        # if --always-use-pooled-ctl, then always use pooled control
        ctl_ta_idx = [-1]*num_rep
    else:
        # if multiple controls,
        # check every num lines in every pair of control tagaligns
        # if ratio of two entries in any pair > ctl_depth_ratio then
        # use pooled control for all
        use_pooled_ctl = any(
            _ctl_depths_differ(
                depths_ctl[i], depths_ctl[j], args.ctl_depth_ratio)
            for i in range(num_ctl)
            for j in range(i+1, num_ctl))

        if use_pooled_ctl:
            # logged once, regardless of how many pairs differ
            log.info(
                'Number of reads in controls differ by a factor of {}.'
                'Using pooled controls.'.format(
                    args.ctl_depth_ratio))
            # use pooled control for all exp replicates
            ctl_ta_idx = [-1]*num_rep
        else:
            for i in range(num_rep):
                if i > num_ctl-1:
                    ctl_ta_idx[i] = -1  # use pooled control
                elif depths_ctl[i] < depths[i]:
                    log.info(
                        'Fewer reads in control {} than experiment replicate '
                        '{}. Using pooled control for replicate {}.'.format(
                            i+1, i+1, i+1))
                    ctl_ta_idx[i] = -1  # use pooled control
                else:
                    ctl_ta_idx[i] = i

    ctl_ta_subsample = [0] * num_rep
    ctl_ta_subsampled_pooled = 0
    if args.exp_ctl_depth_ratio_limit or args.ctl_depth_limit:
        # subsampling chosen control for each replicate
        for rep in range(num_rep):
            chosen_ctl = ctl_ta_idx[rep]
            depth = depths[rep]
            depth_ctl = depths_ctl[chosen_ctl]
            limit = int(max(depth * args.exp_ctl_depth_ratio_limit,
                            args.ctl_depth_limit))
            if depth_ctl > limit:
                ctl_ta_subsample[rep] = limit

        # subsampling pooled control for pooled replicate
        limit = int(max(depth_rep_pooled * args.exp_ctl_depth_ratio_limit,
                        args.ctl_depth_limit))
        if depth_ctl_pooled > limit:
            ctl_ta_subsampled_pooled = limit

    log.info('Writing idx.txt...')
    out_txt = os.path.join(args.out_dir, args.out_tsv_basename)
    write_txt(out_txt, ctl_ta_idx)

    log.info('Writing subsample txt...')
    out_subsample_txt = os.path.join(args.out_dir,
                                     args.out_tsv_subsample_basename)
    write_txt(out_subsample_txt, ctl_ta_subsample)

    log.info('Writing subsample_pooled txt...')
    out_subsample_pooled_txt = os.path.join(
        args.out_dir, args.out_txt_subsample_pooled_basename)
    write_txt(out_subsample_pooled_txt, ctl_ta_subsampled_pooled)

    log.info('List all files in output directory...')
    ls_l(args.out_dir)

    log.info('All done.')