def run(opts):
    """
    Intersect the two read ends, estimate insert sizes, flag and filter the
    read pairs, then record the job in the database.

    The attributes read from ``opts`` here are ``workdir``, ``resume``,
    ``over_represented``, ``max_frag_size``, ``min_frag_size``,
    ``re_proximity`` and ``apply``.
    """
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    out_dir = path.join(opts.workdir, '03_filtered_reads')
    reads = path.join(out_dir, 'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(out_dir,
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    # NOTE(review): the flattened source does not show how far this
    # conditional extends; only the directory creation is guarded because
    # every name assigned below is used unconditionally later -- confirm.
    if not opts.resume:
        mkdir(out_dir)

    # intersection of the two read ends
    print('Getting intersection between read 1 and read 2')
    count, multiples = get_intersection(fname1, fname2, reads)

    # insert size statistics (a histogram is saved next to the results)
    print('Get insert size...')
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    wanted_stats = ('median', 'first_decay', 'MAD')
    median, max_f, mad = insert_sizes(reads, nreads=1000000,
                                      stats=wanted_stats, savefig=hist_path)

    print(' - median insert size = %s' % (median,))
    print(' - double median absolution of insert size = %s' % (mad,))
    print(' - max insert size (when a gap in continuity of > 10 bp is '
          'found in fragment lengths) = %s' % (max_f,))

    max_mole = max_f        # threshold used for pseudo dangling-ends
    min_dist = max_f + mad  # threshold used for random breaks

    print(' Using the maximum continuous fragment size'
          '(%d bp) to check '
          'for pseudo-dangling ends' % max_mole)
    print(' Using maximum continuous fragment size plus the MAD '
          '(%d bp) to check for random breaks' % min_dist)

    print("identify pairs to filter...")
    filter_kwargs = dict(
        max_molecule_length=max_mole,
        over_represented=opts.over_represented,
        max_frag_size=opts.max_frag_size,
        min_frag_size=opts.min_frag_size,
        re_proximity=opts.re_proximity,
        min_dist_to_re=min_dist,
        fast=True,
    )
    masked = filter_reads(reads, **filter_kwargs)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print('%s %s %s' % (median, max_f, mad))

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
def test_19_matrix_manip(self):
    """
    Write Hi-C maps to disk, read the matrix back and compare it with the
    in-memory data, then check insert sizes and the matrix / eigenvector
    correlations of two stored experiments against reference values.
    """
    if ONLY and "19" not in ONLY:
        return
    if CHKTIME:
        t0 = time()

    from_reads = load_hic_data_from_reads("lala-map~", resolution=10000)
    hic_map(from_reads, savedata="lala-map.tsv~", savefig="lala.pdf")
    for scope, figure in (("intra", "lalalo~"), ("inter", "lalala~")):
        hic_map(from_reads, by_chrom=scope, savedata="lala-maps~",
                savefig=figure)

    # slowest part of the all test:
    from_disk = read_matrix("lala-map.tsv~", resolution=10000)
    self.assertEqual(from_reads, from_disk)

    # The distance-vs-interactions check is commented out in this version:
    # vals = plot_distance_vs_interactions(hic_data1)
    # self.assertEqual([round(i, 2) if str(i)!="nan" else 0.0 for i in
    #                   reduce(lambda x, y: x + y, vals)],
    #                  [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

    a, b = insert_sizes("lala-map~")
    self.assertEqual([int(a), int(b)], [43, 1033])

    matrix_a = read_matrix(PATH + "/20Kb/chrT/chrT_A.tsv", resolution=20000)
    matrix_b = read_matrix(PATH + "/20Kb/chrT/chrT_B.tsv", resolution=20000)

    expected_corr = [0.755, 0.729, 0.804, 0.761, 0.789,
                     0.776, 0.828, 0.757, 0.797, 0.832]
    corr = correlate_matrices(matrix_a, matrix_b)
    corr = [round(value, 3) for value in corr[0]]
    self.assertEqual(corr, expected_corr)

    expected_ecorr = [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89,
    ]
    ecorr = eig_correlate_matrices(matrix_a, matrix_b, savefig='lala3.pdf')
    flattened = reduce(lambda x, y: x + y, ecorr)
    ecorr = [round(value, 3) for value in flattened]
    self.assertEqual(ecorr, expected_ecorr)

    # remove every temporary file produced above
    system("rm -rf lala*")
    if CHKTIME:
        self.assertEqual(True, True)
        print("19 %s" % (time() - t0,))
def test_19_matrix_manip(self):
    """
    Write Hi-C maps to disk, read the matrix back and compare it with the
    in-memory data, then check the distance-vs-interactions values, the
    insert sizes and the matrix / eigenvector correlations of two stored
    experiments against reference values.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        t0 = time()

    def _rounded_or_zero(value):
        # values whose string form is 'nan' are compared as 0.0
        if str(value) != 'nan':
            return round(value, 2)
        return 0.0

    from_reads = load_hic_data_from_reads('lala-map~', resolution=10000)
    hic_map(from_reads, savedata='lala-map.tsv~', savefig='lala.pdf~')
    for scope, figure in (('intra', 'lalalo~'), ('inter', 'lalala~')):
        hic_map(from_reads, by_chrom=scope, savedata='lala-maps~',
                savefig=figure)

    # slowest part of the all test:
    from_disk = read_matrix('lala-map.tsv~', resolution=10000)
    self.assertEqual(from_reads, from_disk)

    vals = plot_distance_vs_interactions(from_reads)
    flat_vals = reduce(lambda x, y: x + y, vals)
    self.assertEqual(
        [_rounded_or_zero(value) for value in flat_vals],
        [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

    a, b = insert_sizes('lala-map~')
    self.assertEqual([int(a), int(b)], [43, 1033])

    matrix_a = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
    matrix_b = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

    expected_corr = [0.755, 0.729, 0.804, 0.761, 0.789,
                     0.776, 0.828, 0.757, 0.797, 0.832]
    corr = correlate_matrices(matrix_a, matrix_b)
    corr = [round(value, 3) for value in corr[0]]
    self.assertEqual(corr, expected_corr)

    expected_ecorr = [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89,
    ]
    ecorr = eig_correlate_matrices(matrix_a, matrix_b)
    flat_ecorr = reduce(lambda x, y: x + y, ecorr)
    ecorr = [round(value, 3) for value in flat_ecorr]
    self.assertEqual(ecorr, expected_ecorr)

    # remove every temporary file produced above
    system('rm -rf lala*')
    if CHKTIME:
        self.assertEqual(True, True)
        print('19 %s' % (time() - t0,))
def test_19_matrix_manip(self):
    """
    Exercise matrix export/import, distance-vs-interactions, insert sizes
    and matrix / eigenvector correlation, comparing each result with the
    reference values hard-coded below.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        t0 = time()

    concat = lambda x, y: x + y  # used to flatten nested results

    data_mem = load_hic_data_from_reads('lala-map~', resolution=10000)
    hic_map(data_mem, savedata='lala-map.tsv~', savefig='lala.pdf~')
    hic_map(data_mem, by_chrom='intra', savedata='lala-maps~',
            savefig='lalalo~')
    hic_map(data_mem, by_chrom='inter', savedata='lala-maps~',
            savefig='lalala~')

    # slowest part of the all test:
    data_file = read_matrix('lala-map.tsv~', resolution=10000)
    self.assertEqual(data_mem, data_file)

    vals = plot_distance_vs_interactions(data_mem)
    observed = []
    for item in reduce(concat, vals):
        # entries whose string form is 'nan' are compared as 0.0
        observed.append(round(item, 2) if str(item) != 'nan' else 0.0)
    self.assertEqual(observed,
                     [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0])

    a, b = insert_sizes('lala-map~')
    self.assertEqual([int(a), int(b)], [43, 1033])

    exp_a = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
    exp_b = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

    corr = correlate_matrices(exp_a, exp_b)
    corr = [round(item, 3) for item in corr[0]]
    self.assertEqual(corr, [0.755, 0.729, 0.804, 0.761, 0.789,
                            0.776, 0.828, 0.757, 0.797, 0.832])

    ecorr = eig_correlate_matrices(exp_a, exp_b)
    ecorr = [round(item, 3) for item in reduce(concat, ecorr)]
    self.assertEqual(ecorr, [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89,
    ])

    # clean up the temporary files written by this test
    system('rm -rf lala*')
    if CHKTIME:
        self.assertEqual(True, True)
        print('19 %s' % (time() - t0,))
def run(opts):
    """
    Intersect the two read ends, estimate insert sizes, flag and filter the
    read pairs with fixed thresholds, then record the job in the database.

    The attributes read from ``opts`` here are ``workdir``, ``resume`` and
    ``apply``; the filtering thresholds are hard-coded in the call to
    ``filter_reads``.
    """
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    reads_dir = path.join(opts.workdir, '03_filtered_reads')
    reads = path.join(reads_dir,
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(reads_dir,
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    # NOTE(review): the flattened source does not show how far this
    # conditional extends; only the directory creation is guarded because
    # every name assigned below is used unconditionally later -- confirm.
    if not opts.resume:
        mkdir(reads_dir)

    # intersection of the two read ends
    print('Getting intersection between read 1 and read 2')
    count, multiples = get_intersection(fname1, fname2, reads)

    # insert size statistics (a histogram is saved next to the results)
    print('Get insert size...')
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = insert_sizes(
        reads,
        nreads=1000000,
        stats=('median', 'first_decay', 'MAD'),
        savefig=hist_path,
    )

    report = (
        (' - median insert size =', median),
        (' - double median absolution of insert size =', mad),
        (' - max insert size (when a gap in continuity of > 10 bp is '
         'found in fragment lengths) =', max_f),
    )
    for label, value in report:
        print('%s %s' % (label, value))

    max_mole = max_f        # threshold used for pseudo dangling-ends
    min_dist = max_f + mad  # threshold used for random breaks

    dangling_msg = (' Using the maximum continuous fragment size'
                    '(%d bp) to check '
                    'for pseudo-dangling ends')
    breaks_msg = (' Using maximum continuous fragment size plus the MAD '
                  '(%d bp) to check for random breaks')
    print(dangling_msg % max_mole)
    print(breaks_msg % min_dist)

    print("identify pairs to filter...")
    masked = filter_reads(
        reads,
        max_molecule_length=max_mole,
        over_represented=0.001,
        max_frag_size=100000,
        min_frag_size=50,
        re_proximity=5,
        min_dist_to_re=min_dist,
        fast=True,
    )

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print('%s %s %s' % (median, max_f, mad))

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
savefig=outfile) reads_mapped_per_iteration = pd.DataFrame.from_dict(reads_mapped_per_iteration) reads_mapped_per_iteration.columns = ['read1', 'read2'] fraction_mapped_read1 = list( reads_mapped_per_iteration['read1'])[-1] / float(n_reads_trimmed) fraction_mapped_read2 = list( reads_mapped_per_iteration['read2'])[-1] / float(n_reads_trimmed) fraction_mapped_str = ",".join( [str(i) for i in [fraction_mapped_read1, fraction_mapped_read2]]) # Plot: distribution of dangling-end lengths plt.rcParams['font.size'] = 12 infile = '%s/%s_both_map.tsv' % (PROCESSED, pair_id) outfile = '%s/%s_plot_distribution_dangling_ends_lengths.png' % ( POSTMAPPING_PLOTS, pair_id) insert_sizes(infile, xlog=False, max_size=99.9, savefig=outfile) # Plot: Decay of interaction counts with genomic distamce plt.rcParams['font.size'] = 12 outfile = '%s/%s_plot_decay_interaction_counts_genomic_distance.png' % ( POSTMAPPING_PLOTS, pair_id) myvalues = plot_distance_vs_interactions(infile, max_diff=50000000, resolution=10000, savefig=outfile) slope = str(myvalues[1][0]) # Plot: sequencing coverage along chromosomes outfile = '%s/%s_plot_genomic_coverage_mapped_%s.png' % ( POSTMAPPING_PLOTS, pair_id, genomic_coverage_resolution) plt.rcParams['font.size'] = 20