def test_19_matrix_manip(self):
    """
    Exercise matrix writing/reading, decay plotting, insert sizes and
    matrix correlation helpers.

    The test is skipped when ONLY is set to something other than '19'.
    It reads the 'lala-map~' file and the 20Kb chrT matrices, and removes
    every 'lala*' file once done, whether or not an assertion failed.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        t0 = time()

    try:
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1, by_chrom='intra', savedata='lala-maps~',
                savefig='lalalo~')
        hic_map(hic_data1, by_chrom='inter', savedata='lala-maps~',
                savefig='lalala~')

        # slowest part of the whole test: the matrix written above must be
        # read back identical to the in-memory data
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)

        # flatten the returned groups of values; 'nan' entries are compared
        # as 0.0
        vals = plot_distance_vs_interactions(hic_data1)
        flat_vals = reduce(lambda x, y: x + y, vals)
        self.assertEqual(
            [round(i, 2) if str(i) != 'nan' else 0.0 for i in flat_vals],
            [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832])

        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
            0.002, 0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031,
            0.08, 0.974, 0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89])
    finally:
        # clean temporary files even when an assertion above failed
        system('rm -rf lala*')

    if CHKTIME:
        self.assertEqual(True, True)
        print('19 %s' % (time() - t0))
def test_19_matrix_manip(self):
    """
    Exercise matrix writing/reading, decay plotting, insert sizes and
    matrix correlation helpers.

    The test is skipped when ONLY is set to something other than '19'.
    It reads the 'lala-map~' file and the 20Kb chrT matrices, and removes
    every 'lala*' file once done, whether or not an assertion failed.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        t0 = time()

    try:
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1, by_chrom='intra', savedata='lala-maps~',
                savefig='lalalo~')
        hic_map(hic_data1, by_chrom='inter', savedata='lala-maps~',
                savefig='lalala~')

        # slowest part of the whole test: the matrix written above must be
        # read back identical to the in-memory data
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)

        # flatten the returned groups of values; 'nan' entries are compared
        # as 0.0
        vals = plot_distance_vs_interactions(hic_data1)
        flat_vals = reduce(lambda x, y: x + y, vals)
        self.assertEqual(
            [round(i, 2) if str(i) != 'nan' else 0.0 for i in flat_vals],
            [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0])

        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832])

        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
            0.002, 0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031,
            0.08, 0.974, 0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89])
    finally:
        # clean temporary files even when an assertion above failed
        system('rm -rf lala*')

    if CHKTIME:
        self.assertEqual(True, True)
        print('19 %s' % (time() - t0))
def run(opts):
    """
    Filter bins, compute ICE biases and write matrices/plots into the
    '04_normalization' directory of the working directory.

    Steps, in order: load reads, filter columns, write the list of bad
    columns, normalize, compute cis/trans ratios, plot the interaction
    decay, write biases, optionally write intra/inter/genomic maps
    (according to ``opts.keep``) and pass everything to ``save_to_db``.

    :param opts: options object; attributes read here are bed, workdir,
       reso, perc_zeros, fast_filter, factor, keep and only_txt.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    outdir = path.join(opts.workdir, '04_normalization')
    map_name = path.split(opts.workdir)[-1]

    def _out(template):
        # output path tagged with the resolution and the parameter hash
        return path.join(outdir, template % (opts.reso, param_hash))

    print('loading %s' % (mreads,))
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(outdir)

    print('Get poor bins...')
    bad_tag = (opts.reso, opts.perc_zeros, param_hash)
    # the histogram is only saved when the (slower) by-mean filter is used
    if opts.fast_filter:
        bad_columns_fig = None
    else:
        bad_columns_fig = path.join(outdir,
                                    'bad_columns_%s_%d_%s.pdf' % bad_tag)
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros, draw_hist=True,
                                by_mean=not opts.fast_filter,
                                savefig=bad_columns_fig)
    except ValueError:
        # the requested threshold failed: retry with perc_zero=100
        hic_data.filter_columns(perc_zero=100, draw_hist=True,
                                by_mean=not opts.fast_filter,
                                savefig=bad_columns_fig)

    # bad columns
    bad_columns_file = path.join(outdir, 'bad_columns_%s_%d_%s.tsv' % bad_tag)
    with open(bad_columns_file, 'w') as out_bad:
        out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))

    # Identify biases
    print('Get biases using ICE...')
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=opts.factor)

    print('Getting cis/trans...')
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    print('Cis/Trans ratio of normalized matrix including the diagonal %s'
          % (cis_trans_N_D,))
    print('Cis/Trans ratio of normalized matrix excluding the diagonal %s'
          % (cis_trans_N_d,))
    print('Cis/Trans ratio of raw matrix including the diagonal %s'
          % (cis_trans_n_D,))
    print('Cis/Trans ratio of raw matrix excluding the diagonal %s'
          % (cis_trans_n_d,))

    # Plot genomic distance vs interactions
    print('Plot genomic distance vs interactions...')
    inter_vs_gcoord = _out('interactions_vs_genomic-coords.pdf_%s_%s.pdf')
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=True,
        savefig=inter_vs_gcoord)
    print('Decay slope 0.7-10 Mb\t%s' % a2)

    # write biases
    bias_file = _out('bias_%s_%s.tsv')
    with open(bias_file, 'w') as out_bias:
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias]) + '\n')

    # to feed the save_to_db function: anything not generated stays None
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print(" Saving intra chromosomal raw and normalized matrices...")
        if not opts.only_txt:
            intra_dir_nrm_fig = _out('intra_chromosome_nrm_images_%s_%s')
            intra_dir_raw_fig = _out('intra_chromosome_raw_images_%s_%s')
        intra_dir_nrm_txt = _out('intra_chromosome_nrm_matrices_%s_%s')
        intra_dir_raw_txt = _out('intra_chromosome_raw_matrices_%s_%s')
        hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                name=map_name, savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=map_name, savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print(" Saving inter chromosomal raw and normalized matrices...")
        if not opts.only_txt:
            inter_dir_nrm_fig = _out('inter_chromosome_nrm_images_%s_%s')
            inter_dir_raw_fig = _out('inter_chromosome_raw_images_%s_%s')
        inter_dir_nrm_txt = _out('inter_chromosome_nrm_matrices_%s_%s')
        inter_dir_raw_txt = _out('inter_chromosome_raw_matrices_%s_%s')
        hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                name=map_name, savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=map_name, savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print(" Saving normalized genomic matrix...")
        # figures are skipped when only text output is requested, exactly
        # as in the intra/inter branches (the test used to be inverted here)
        if not opts.only_txt:
            genom_map_nrm_fig = _out('genomic_maps_nrm_%s_%s.pdf')
            genom_map_raw_fig = _out('genomic_maps_raw_%s_%s.pdf')
        genom_map_nrm_txt = _out('genomic_nrm_%s_%s.tsv')
        genom_map_raw_txt = _out('genomic_raw_%s_%s.tsv')
        hic_map(hic_data, normalized=True, cmap='jet', name=map_name,
                savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet', name=map_name,
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file,
               inter_vs_gcoord, mreads,
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
def run(opts):
    """
    Compute normalization biases from a BAM file and store them as a pickle
    in the '04_normalization' directory of the working directory.

    For 'oneD' normalization the per-bin mappability, GC content and number
    of restriction sites are computed first (FASTA, restriction enzyme name
    and mappability file are then mandatory). Results are passed to
    ``read_bam`` and finally to ``save_to_db``.

    :param opts: options object; attributes read here are bam, workdir,
       filter, normalization, fasta, renz, mappability, reso, cpus,
       min_count, min_perc, max_perc, normalize_only, max_njobs, badcols
       and filter_only.

    :raises Exception: when an input needed for oneD normalization is
       missing, or when chromosomes of the BAM file are absent from the
       FASTA file.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')

        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        only_fasta = fas - bam
        only_bam = bam - fas
        if only_fasta:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(only_fasta)))
            if len(only_fasta) <= 50:
                print('\n'.join([(' - ' + c) for c in only_fasta]))
        if only_bam:
            txt = ('\n'.join([(' - ' + c) for c in only_bam])
                   if len(only_bam) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                    len(only_bam), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception(
                "ERROR: chromosomes in FASTA different the ones in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = dict((c, []) for c in refs)
        # the file is consumed line by line while walking the bins of each
        # chromosome; the context manager guarantees it is closed even if
        # parsing fails half-way
        with open(opts.mappability) as fh:
            line = next(fh)
            crmM, begM, endM, val = line.split()
            crm = crmM
            if crmM not in mappability:
                print(' skipping %s' % crmM)
                while crmM not in mappability:
                    line = next(fh)
                    crmM, begM, endM, val = line.split()
                    crm = crmM
            # loop until every wanted chromosome got at least one bin
            while any(not mappability[c] for c in mappability):
                for begB in xrange(0, len(genome[crmM]), opts.reso):
                    endB = begB + opts.reso
                    tmp = 0
                    try:
                        while True:
                            crmM, begM, endM, val = line.split()
                            if crm != crmM:
                                # chromosome changed: skip the records of
                                # chromosomes that are not in refs
                                try:
                                    while crmM not in refs:
                                        line = next(fh)
                                        crmM, _ = line.split('\t', 1)
                                except StopIteration:
                                    pass
                                break
                            begM = int(begM)
                            endM = int(endM)
                            if endM > endB:
                                # record runs past the bin end: only count
                                # the part before endB (if any) and keep the
                                # same line for the next bin
                                weight = endB - begM
                                if weight >= 0:
                                    tmp += weight * float(val)
                                break
                            weight = endM - (begM if begM > begB else begB)
                            if weight < 0:
                                break
                            tmp += weight * float(val)
                            line = next(fh)
                    except StopIteration:
                        pass
                    mappability[crm].append(tmp / opts.reso)
                    crm = crmM
        # concatenate per-chromosome values in the order of refs
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))

        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        cg_content=gc_content, n_rsites=n_rsites, min_perc=opts.min_perc,
        max_perc=opts.max_perc, normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs, extra_bads=opts.badcols)

    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' % (
            nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (
            opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' % (
            nicer(opts.reso).replace(' ', ''), param_hash))
    # pickles are written through a binary handle
    with open(bias_file, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': opts.reso}, out)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases),
                   raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately broad: release the DB lock whatever went wrong
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def run(opts):
    """
    Compute normalization biases from a BAM file and store them as a pickle
    in the '04_normalization' directory of the working directory.

    For 'oneD' normalization the per-bin mappability, GC content and number
    of restriction sites are computed first (FASTA, restriction enzyme name
    and mappability file are then mandatory). Results are passed to
    ``read_bam`` and finally to ``save_to_db``.

    :param opts: options object; attributes read here are bam, workdir,
       filter, normalization, fasta, renz, mappability, reso, cpus,
       min_count, p_fit, seed, normalize_only, max_njobs, badcols,
       biases_path, cis_limit, trans_limit, ratio_limit, fast_filter and
       filter_only.

    :raises Exception: when an input needed for oneD normalization is
       missing, or when chromosomes of the BAM file are absent from the
       FASTA file.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')

        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        only_fasta = fas - bam
        only_bam = bam - fas
        if only_fasta:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(only_fasta)))
            if len(only_fasta) <= 50:
                print('\n'.join([(' - ' + c) for c in only_fasta]))
        if only_bam:
            txt = ('\n'.join([(' - ' + c) for c in only_bam])
                   if len(only_bam) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                    len(only_bam), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes
        # NOTE(review): the minimum size is derived from len(refs), i.e. the
        # number of chromosomes, not from the length of chromosome c --
        # presumably len(genome[c]) was intended; confirm before changing.
        min_bins = len(refs) // opts.reso + 1
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * min_bins
            if len(mappability[c]) < min_bins:
                mappability[c] += [float('nan')] * (
                    min_bins - len(mappability[c]))
        # concatenate per-chromosome values in the order of refs
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # pad mappability at the end if the size is close to gc_content;
        # the ratio is computed as a float explicitly so that the test does
        # not silently become "0 > 0.95" under integer division
        if (len(mappability) < len(gc_content) and
                float(len(mappability)) / len(gc_content) > 0.95):
            mappability += [float('nan')] * (
                len(gc_content) - len(mappability))

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path,
        cis_limit=opts.cis_limit, trans_limit=opts.trans_limit,
        min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter)

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (
            opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    with open(bias_file, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately broad: release the DB lock whatever went wrong
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def run(opts):
    """
    Compute normalization biases from a BAM file and store them as a pickle
    in the '04_normalization' directory of the working directory.

    For 'oneD' normalization the per-bin mappability, GC content and number
    of restriction sites are computed first (FASTA, restriction enzyme name
    and mappability file are then mandatory). Results are passed to
    ``read_bam`` and finally to ``save_to_db``.

    :param opts: options object; attributes read here are bam, workdir,
       filter, normalization, fasta, renz, mappability, reso, cpus,
       min_count, p_fit, min_perc, max_perc, seed, normalize_only,
       max_njobs, badcols, biases_path and filter_only.

    :raises Exception: when an input needed for oneD normalization is
       missing, or when chromosomes of the BAM file are absent from the
       FASTA file.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')

        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime(' - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        only_fasta = fas - bam
        only_bam = bam - fas
        if only_fasta:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(only_fasta)))
            if len(only_fasta) <= 50:
                print('\n'.join([(' - ' + c) for c in only_fasta]))
        if only_bam:
            txt = ('\n'.join([(' - ' + c) for c in only_bam])
                   if len(only_bam) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                    len(only_bam), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime(' - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes
        # NOTE(review): the minimum size is derived from len(refs), i.e. the
        # number of chromosomes, not from the length of chromosome c --
        # presumably len(genome[c]) was intended; confirm before changing.
        # Floor division keeps the list multiplier an integer.
        min_bins = len(refs) // opts.reso + 1
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * min_bins
            if len(mappability[c]) < min_bins:
                mappability[c] += [float('nan')] * (
                    min_bins - len(mappability[c]))
        # concatenate per-chromosome values in the order of refs
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime(' - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime(' - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' % (
            opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime(' - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso,
            normalized=not opts.filter_only, savefig=inter_vs_gcoord)
        print(' -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime(' - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # HIGHEST_PROTOCOL is a binary pickle protocol: the file has to be
    # opened in binary mode
    with open(bias_file, 'wb') as out:
        dump({'biases': biases,
              'decay': decay,
              'badcol': badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases),
                   raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately broad: release the DB lock whatever went wrong
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
def run(opts):
    """
    Filter bins, optionally compute ICE biases, and write matrices/plots
    into the '04_normalization' directory of the working directory.

    When ``opts.filter_only`` is set, normalization is skipped: no bias
    file is written, normalized cis/trans ratios stay NaN, and only raw
    matrices/figures are generated (normalized output paths stay None).

    :param opts: options object; attributes read here are bed, workdir,
       reso, perc_zeros, min_count, fast_filter, filter_only, factor, keep
       and only_txt.

    :raises ValueError: when column filtering fails (probably because all
       columns were filtered out).
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    outdir = path.join(opts.workdir, '04_normalization')
    map_name = path.split(opts.workdir)[-1]
    normalize = not opts.filter_only

    def _out(template):
        # output path tagged with the resolution and the parameter hash
        return path.join(outdir, template % (opts.reso, param_hash))

    print('loading %s' % (mreads,))
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(outdir)

    print('Get poor bins...')
    bad_tag = (opts.reso, opts.perc_zeros, opts.min_count, param_hash)
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros, min_count=opts.min_count,
            draw_hist=True, by_mean=not opts.fast_filter,
            savefig=path.join(outdir, 'bad_columns_%s_%d_%d_%s.pdf' % bad_tag)
            if not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')

    # bad columns
    bad_columns_file = path.join(outdir,
                                 'bad_columns_%s_%d_%d_%s.tsv' % bad_tag)
    with open(bad_columns_file, 'w') as out_bad:
        out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))

    # Identify biases
    if normalize:
        print('Get biases using ICE...')
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print('Getting cis/trans...')
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if normalize:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=True)
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    if normalize:
        print('Cis/Trans ratio of normalized matrix including the diagonal %s'
              % (cis_trans_N_D,))
        print('Cis/Trans ratio of normalized matrix excluding the diagonal %s'
              % (cis_trans_N_d,))
    print('Cis/Trans ratio of raw matrix including the diagonal %s'
          % (cis_trans_n_D,))
    print('Cis/Trans ratio of raw matrix excluding the diagonal %s'
          % (cis_trans_n_d,))

    # Plot genomic distance vs interactions
    print('Plot genomic distance vs interactions...')
    inter_vs_gcoord = _out('interactions_vs_genomic-coords.pdf_%s_%s.pdf')
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso,
        normalized=normalize, savefig=inter_vs_gcoord)
    print('Decay slope 0.7-10 Mb\t%s' % a2)

    # write biases (the path is passed to save_to_db even when nothing is
    # written because of filter_only)
    bias_file = _out('bias_%s_%s.tsv')
    if normalize:
        with open(bias_file, 'w') as out_bias:
            out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                      for i in hic_data.bias]) + '\n')

    # pickle the HiC-data object
    print('Saving genomic matrix')
    pickle_path = path.join(outdir, 'hic-data_%s_%s.pickle' % (
        nice(opts.reso), param_hash))
    with open(pickle_path, 'wb') as out:
        dump(hic_data, out)

    # to feed the save_to_db function: anything not generated stays None
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print(" Saving intra chromosomal raw and normalized matrices...")
        if not opts.only_txt:
            # normalized outputs are only declared when they are really
            # generated, as in the inter and genome branches
            if normalize:
                intra_dir_nrm_fig = _out('intra_chromosome_nrm_images_%s_%s')
            intra_dir_raw_fig = _out('intra_chromosome_raw_images_%s_%s')
        intra_dir_raw_txt = _out('intra_chromosome_raw_matrices_%s_%s')
        if normalize:
            intra_dir_nrm_txt = _out('intra_chromosome_nrm_matrices_%s_%s')
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=map_name, savefig=intra_dir_nrm_fig,
                    savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=map_name, savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print(" Saving inter chromosomal raw and normalized matrices...")
        if not opts.only_txt:
            if normalize:
                inter_dir_nrm_fig = _out('inter_chromosome_nrm_images_%s_%s')
            inter_dir_raw_fig = _out('inter_chromosome_raw_images_%s_%s')
        inter_dir_raw_txt = _out('inter_chromosome_raw_matrices_%s_%s')
        if normalize:
            inter_dir_nrm_txt = _out('inter_chromosome_nrm_matrices_%s_%s')
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=map_name, savefig=inter_dir_nrm_fig,
                    savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=map_name, savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print(" Saving normalized genomic matrix...")
        if not opts.only_txt:
            if normalize:
                genom_map_nrm_fig = _out('genomic_maps_nrm_%s_%s.pdf')
            genom_map_raw_fig = _out('genomic_maps_raw_%s_%s.pdf')
        genom_map_raw_txt = _out('genomic_raw_%s_%s.tsv')
        if normalize:
            genom_map_nrm_txt = _out('genomic_nrm_%s_%s.tsv')
            hic_map(hic_data, normalized=True, cmap='jet', name=map_name,
                    savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet', name=map_name,
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file,
               inter_vs_gcoord, mreads,
               len(hic_data.bads.keys()), len(hic_data),
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               pickle_path, launch_time, finish_time)
# Fractions of mapped reads (read1, read2) as one comma-separated string
fraction_mapped_str = ",".join(
    map(str, (fraction_mapped_read1, fraction_mapped_read2)))

# Input shared by all the plots below
infile = '%s/%s_both_map.tsv' % (PROCESSED, pair_id)

# Plot: distribution of dangling-end lengths
plt.rcParams['font.size'] = 12
outfile = '%s/%s_plot_distribution_dangling_ends_lengths.png' % (
    POSTMAPPING_PLOTS, pair_id)
insert_sizes(infile, xlog=False, max_size=99.9, savefig=outfile)

# Plot: decay of interaction counts with genomic distance
plt.rcParams['font.size'] = 12
outfile = '%s/%s_plot_decay_interaction_counts_genomic_distance.png' % (
    POSTMAPPING_PLOTS, pair_id)
myvalues = plot_distance_vs_interactions(
    infile,
    max_diff=50000000,
    resolution=10000,
    savefig=outfile)
# first element of the second group of returned values, kept as a string
slope = str(myvalues[1][0])

# Plot: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_mapped_%s.png' % (
    POSTMAPPING_PLOTS, pair_id, genomic_coverage_resolution)
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    infile,
    name='mapped',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)

# Coverage values are also exported as a tab-separated file
outfile = '%s/%s_plot_genomic_coverage_mapped_%s.bed' % (
    COVERAGES, pair_id, genomic_coverage_resolution)
coverages.to_csv(outfile, sep='\t', index=False)