# Standard-library imports used by this entry point.
import time
from os import path
from warnings import warn

# TADbit helpers. Import paths follow TADbit's source layout (verify against
# the installed version); check_options, load_parameters_fromdb and save_to_db
# are defined elsewhere in this same tool module.
from pytadbit.mapping import get_intersection
from pytadbit.mapping.analyze import fragment_size
from pytadbit.mapping.filter import filter_reads, apply_filter
from pytadbit.parsers.hic_bam_parser import bed2D_to_BAMhic
from pytadbit.utils.file_handling import mkdir
from pytadbit.utils.sqlite_utils import digest_parameters


def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        if opts.fast_fragment:
            # reads were already intersected in fast-fragment mode: only
            # count them, flagging multi-contact entries ('#' in the read ID)
            reads = fname1
            counts_multis = ['#' in line.split('\t')[0]
                             for line in open(reads)]
            count = len(counts_multis)
            multiples = {}
            multiples[1] = sum(count_mult for count_mult in counts_multis
                               if count_mult)
            del counts_multis
        else:
            # compute the intersection of the two read ends
            print('Getting intersection between read 1 and read 2')
            count, multiples = get_intersection(fname1, fname2, reads,
                                                compress=opts.compress_input)

    # compute insert size
    print('Get insert size...')
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    try:
        median, max_f, mad = fragment_size(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
    except ZeroDivisionError:
        warn('WARNING: cannot compute fragment length, too few '
             'dangling-ends. Setting median length to 400 nt.')
        median = max_f = mad = 0
    if median < 50:
        warn('WARNING: fragment length too short ({}). '
             'Setting median length to 400 nt.'.format(median))
        median, max_f, mad = 400, 100, 40
    # command-line values, when given, override the estimated ones
    if opts.median:
        median = opts.median
    if opts.max_f:
        max_f = opts.max_f
    if opts.mad:
        mad = opts.mad

    print(' - median insert size =', median)
    print(' - median absolute deviation (MAD) of insert size =', mad)
    print(' - max insert size (when a gap in continuity of > 10 bp is '
          'found in fragment lengths) =', max_f)

    max_mole = max_f        # pseudo dangling-ends threshold
    min_dist = max_f + mad  # random-breaks threshold
    print('   Using the maximum continuous fragment size '
          '(%d bp) to check for pseudo-dangling ends' % max_mole)
    print('   Using maximum continuous fragment size plus the MAD '
          '(%d bp) to check for random breaks' % min_dist)

    print('identify pairs to filter...')
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          strict_duplicates=opts.strict_duplicates,
                          min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    # write either only valid pairs or all pairs (with filter flags) to BAM
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile, opts.valid, opts.cpus, outbam, opts.format,
                    masked, samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad,
               launch_time, finish_time)
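# Minimal invocation sketch. In TADbit this `run` is normally driven by the
# `tadbit filter` command line, which fills `opts` via argparse; the Namespace
# below is a hand-built stand-in listing only the attributes that `run`
# actually reads, with illustrative values (assumptions, not the tool's real
# defaults). A real run also requires the working-directory sqlite database
# produced by the earlier mapping/parsing steps, which load_parameters_fromdb
# queries, and check_options may expect further attributes.
from argparse import Namespace

opts = Namespace(
    workdir='results',                  # same workdir as the previous steps
    resume=False,                       # recompute the read-end intersection
    fast_fragment=False,                # reads not intersected at mapping time
    compress_input=False,
    median=None, max_f=None, mad=None,  # let fragment_size() estimate them
    over_represented=0.001,             # illustrative threshold
    max_frag_size=100000,               # illustrative, in bp
    min_frag_size=50,                   # illustrative, in bp
    re_proximity=5,                     # illustrative, in bp
    strict_duplicates=False,
    apply=[1, 2, 3, 4, 6, 7, 9, 10],    # filter numbers to mask (illustrative)
    valid=False,                        # keep all pairs, flagged, in the BAM
    cpus=4,
    format='mid',                       # illustrative BAM format flag
    samtools='samtools',                # path to the samtools binary
)

run(opts)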