Example #1
0
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1,
                by_chrom='intra',
                savedata='lala-maps~',
                savefig='lalalo~')
        hic_map(hic_data1,
                by_chrom='inter',
                savedata='lala-maps~',
                savefig='lalala~')
        # slowest part of the all test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)

        self.assertEqual([
            round(i, 2) if str(i) != 'nan' else 0.0
            for i in reduce(lambda x, y: x + y, vals)
        ], [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832
        ])

        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
            0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031, 0.08, 0.974,
            0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89
        ])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
Example #2
0
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1, by_chrom='intra', savedata='lala-maps~', savefig='lalalo~')
        hic_map(hic_data1, by_chrom='inter', savedata='lala-maps~', savefig='lalala~')
        # slowest part of the all test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)
        
        self.assertEqual([round(i, 2) if str(i)!='nan' else 0.0 for i in
                          reduce(lambda x, y: x + y, vals)],
                         [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0])
        
        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a),int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)
        
        corr = correlate_matrices(hic_data1, hic_data2)
        corr =  [round(i,3) for i in corr[0]]
        self.assertEqual(corr, [0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828,
                                0.757, 0.797, 0.832])
        
        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i,3) for i in reduce(lambda x, y:x+y, ecorr)]
        self.assertEqual(ecorr, [0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
                                 0.321, 0.999, 0.01, 0.006, 0.0, 0.007, 0.451,
                                 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
                                 0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013,
                                 0.031, 0.08, 0.974, 0.018, 0.028, 0.004, 0.0,
                                 0.028, 0.034, 0.89])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
Example #3
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)
    except ValueError:
        hic_data.filter_columns(
            perc_zero=100,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    print 'Get biases using ICE...'
    hic_data.normalize_hic(silent=False,
                           max_dev=0.1,
                           iterations=0,
                           factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
    print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.pdf_%s_%s.pdf' %
        (opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data,
        max_diff=10000,
        resolution=opts.reso,
        normalized=True,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = open(bias_file, 'w')
    out_bias.write(
        '\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                   for i in hic_data.bias]) + '\n')
    out_bias.close()

    # to feed the save_to_db funciton
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            inter_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        inter_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        else:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        genom_map_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig,
                savedata=genom_map_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord,
               mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig,
               inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig,
               inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
Example #4
0
def run(opts):
    """
    Compute normalization biases from a BAM file and record the run.

    When ``opts.normalization`` is ``'oneD'`` three per-bin feature vectors
    are built first (mappability parsed from ``opts.mappability``, GC
    content from ``opts.fasta`` and restriction-site counts for
    ``opts.renz``) and passed to ``read_bam``.  Afterwards the decay plot is
    produced (unless ``opts.normalize_only``), the biases are pickled into
    ``<opts.workdir>/04_normalization`` and ``save_to_db`` is called.

    :param opts: parsed command-line options.

    :raises Exception: in oneD mode, when the FASTA path, the enzyme name or
       the mappability path is missing, when the BAM file has chromosomes
       absent from the FASTA file, or when no chromosome is shared.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            # names are only listed when there are few enough of them
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if not refs:
            raise Exception(
                "ERROR: chromosomes in FASTA different the ones in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        # the input is closed on every exit path (it used to be left open);
        # the parsing statements themselves are unchanged
        with open(opts.mappability) as fh:
            mappability = dict((c, []) for c in refs)
            line = fh.next()
            crmM, begM, endM, val = line.split()
            crm = crmM
            # skip leading records of chromosomes that are not wanted
            if crmM not in mappability:
                print('     skipping %s' % crmM)
                while crmM not in mappability:
                    line = fh.next()
                    crmM, begM, endM, val = line.split()
                    crm = crmM
            # loop until every wanted chromosome has at least one bin
            while any(not mappability[c] for c in mappability):
                for begB in xrange(0, len(genome[crmM]), opts.reso):
                    endB = begB + opts.reso
                    tmp = 0
                    try:
                        while True:
                            crmM, begM, endM, val = line.split()
                            if crm != crmM:
                                # chromosome changed: fast-forward to the
                                # next record of a wanted chromosome
                                try:
                                    while crmM not in refs:
                                        line = fh.next()
                                        crmM, _ = line.split('\t', 1)
                                except StopIteration:
                                    pass
                                break
                            begM = int(begM)
                            endM = int(endM)
                            if endM > endB:
                                # record runs past the bin: count only the
                                # part before the bin end and keep the
                                # record for the next bin
                                weight = endB - begM
                                if weight >= 0:
                                    tmp += weight * float(val)
                                break
                            weight = endM - (begM if begM > begB else begB)
                            if weight < 0:
                                break
                            tmp += weight * float(val)
                            line = fh.next()
                    except StopIteration:
                        # end of file: keep what was accumulated so far
                        pass
                    mappability[crm].append(tmp / opts.reso)
                    crm = crmM
        # concatenate per-chromosome lists following the order of refs
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols)

    # NOTE(review): this path is only handed to save_to_db here; presumably
    # the image itself is written by read_bam -- confirm
    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' %
        (nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    # binary mode is valid for every pickle protocol; the context manager
    # closes the file even if dump() fails
    with open(bias_file, 'wb') as out:
        dump(
            {
                'biases': biases,
                'decay': decay,
                'badcol': badcol,
                'resolution': opts.reso
            }, out)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2,
                   opts.filter, launch_time, finish_time)
    except:
        # deliberately bare: release the lock whatever interrupted the save
        # (including KeyboardInterrupt), then exit with an error status
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example #5
0
def run(opts):
    """
    Compute normalization biases from a BAM file and record the run.

    When ``opts.normalization`` is ``'oneD'`` three per-bin feature vectors
    are built first (mappability via ``parse_mappability_bedGraph``, GC
    content from ``opts.fasta`` and restriction-site counts for
    ``opts.renz``) and passed to ``read_bam``.  Afterwards the decay plot is
    produced (unless ``opts.normalize_only``), the biases are pickled into
    ``<opts.workdir>/04_normalization`` and ``save_to_db`` is called.

    :param opts: parsed command-line options.

    :raises Exception: in oneD mode, when the FASTA path, the enzyme name or
       the mappability path is missing, when the BAM file has chromosomes
       absent from the FASTA file, or when no chromosome is shared.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            # names are only listed when there are few enough of them
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if not refs:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes: pad each one with NaN up to its own number of
        # bins.  The size comes from the chromosome length; the previous
        # code used len(refs), the number of chromosomes, so the padding was
        # almost never applied.
        for c in refs:
            n_bins = len(genome[c]) // opts.reso + 1
            if c not in mappability:
                mappability[c] = []
            missing = n_bins - len(mappability[c])
            if missing > 0:
                mappability[c] += [float('nan')] * missing
        # concatenates, following the order of refs
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # pad mappability at the end if the size is close to gc_content
        # (explicit float so the ratio is not truncated by integer division)
        if (len(mappability) < len(gc_content)
                and float(len(mappability)) / len(gc_content) > 0.95):
            mappability += [float('nan')] * (len(gc_content) - len(mappability))

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path,
        cis_limit=opts.cis_limit, trans_limit=opts.trans_limit,
        min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter)

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # the context manager closes the file even if dump() fails
    with open(bias_file, 'wb') as out:
        dump({'biases'    : biases,
              'decay'     : decay,
              'badcol'    : badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately bare: release the lock whatever interrupted the save
        # (including KeyboardInterrupt), then exit with an error status
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example #6
0
def run(opts):
    """
    Compute normalization biases from a BAM file and record the run.

    When ``opts.normalization`` is ``'oneD'`` three per-bin feature vectors
    are built first (mappability via ``parse_mappability_bedGraph``, GC
    content from ``opts.fasta`` and restriction-site counts for
    ``opts.renz``) and passed to ``read_bam``.  Afterwards the decay plot is
    produced (unless ``opts.normalize_only``), the biases are pickled into
    ``<opts.workdir>/04_normalization`` and ``save_to_db`` is called.

    :param opts: parsed command-line options.

    :raises Exception: in oneD mode, when the FASTA path, the enzyme name or
       the mappability path is missing, when the BAM file has chromosomes
       absent from the FASTA file, or when no chromosome is shared.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            # names are only listed when there are few enough of them
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if not refs:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes: pad each one with NaN up to its own number of
        # bins.  The size comes from the chromosome length; the previous
        # code used len(refs), the number of chromosomes, so the padding was
        # almost never applied.
        for c in refs:
            n_bins = len(genome[c]) // opts.reso + 1
            if c not in mappability:
                mappability[c] = []
            missing = n_bins - len(mappability[c])
            if missing > 0:
                mappability[c] += [float('nan')] * missing
        # concatenates, following the order of refs
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    # NOTE(review): this path is only handed to save_to_db here; presumably
    # the image itself is written by read_bam -- confirm
    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode; the context manager closes it even if dump() fails
    with open(bias_file, 'wb') as out:
        dump({'biases'    : biases,
              'decay'     : decay,
              'badcol'    : badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately bare: release the lock whatever interrupted the save
        # (including KeyboardInterrupt), then exit with an error status
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example #7
0
def run(opts):
    """
    Filter bad columns, optionally normalize, and export Hi-C matrices.

    Every output file is written inside ``<opts.workdir>/04_normalization``
    and, at the end, the paths and summary statistics are handed over to
    ``save_to_db``.

    Steps, in order:
      1. load the Hi-C data from ``opts.bed`` (or from the path returned by
         ``load_parameters_fromdb``) at resolution ``opts.reso``;
      2. filter columns and write the list of bad columns to a TSV file;
      3. unless ``opts.filter_only``, normalize and write the biases;
      4. compute cis/trans ratios and plot the interaction decay;
      5. pickle the resulting Hi-C data object;
      6. write intra/inter/genome matrices (and figures unless
         ``opts.only_txt``) according to ``opts.keep``.

    Paths of normalized outputs are left as ``None`` whenever
    ``opts.filter_only`` is set, as no normalized map is generated then.

    :param opts: options object; the attributes read here are ``bed``,
       ``workdir``, ``reso``, ``perc_zeros``, ``min_count``,
       ``fast_filter``, ``filter_only``, ``factor``, ``keep`` and
       ``only_txt``.

    :raises ValueError: if column filtering raises a ValueError (reported
       as "probably all columns filtered out").
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print('loading %s' % mreads)
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    def in_outdir(fname):
        # all outputs of this step live in the same directory
        return path.join(outdir, fname)

    # suffix shared by most output names
    tag = '%s_%s' % (opts.reso, param_hash)
    normalize = not opts.filter_only

    print('Get poor bins...')
    # the histogram figure is only saved when filtering by mean
    if opts.fast_filter:
        bad_columns_fig = None
    else:
        bad_columns_fig = in_outdir('bad_columns_%s_%d_%d_%s.pdf' % (
            opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros,
                                min_count=opts.min_count,
                                draw_hist=True,
                                by_mean=not opts.fast_filter,
                                savefig=bad_columns_fig)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')

    # bad columns
    bad_columns_file = in_outdir('bad_columns_%s_%d_%d_%s.tsv' % (
        opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    with open(bad_columns_file, 'w') as out_bad:
        out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))

    # Identify biases
    if normalize:
        print('Get biases using ICE...')
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print('Getting cis/trans...')
    # normalized ratios stay NaN when no normalization is done
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if normalize:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=True)
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True,
                                                 diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    if normalize:
        print('Cis/Trans ratio of normalized matrix including the diagonal %s'
              % (cis_trans_N_D,))
        print('Cis/Trans ratio of normalized matrix excluding the diagonal %s'
              % (cis_trans_N_d,))
    print('Cis/Trans ratio of raw matrix including the diagonal %s'
          % (cis_trans_n_D,))
    print('Cis/Trans ratio of raw matrix excluding the diagonal %s'
          % (cis_trans_n_d,))

    # Plot genomic distance vs interactions
    print('Plot genomic distance vs interactions...')
    inter_vs_gcoord = in_outdir(
        'interactions_vs_genomic-coords.pdf_%s.pdf' % tag)
    # only the first value of the second returned triplet is kept
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso,
        normalized=normalize, savefig=inter_vs_gcoord)

    print('Decay slope 0.7-10 Mb\t%s' % a2)

    # write biases (the path is passed to save_to_db in any case)
    bias_file = in_outdir('bias_%s.tsv' % tag)
    if normalize:
        with open(bias_file, 'w') as out_bias:
            out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                      for i in hic_data.bias])
                           + '\n')

    # pickle the HiC-data object
    print('Saving genomic matrix')
    pickle_path = in_outdir(
        'hic-data_%s_%s.pickle' % (nice(opts.reso), param_hash))
    # pickles are byte streams: always write them in binary mode
    with open(pickle_path, 'wb') as out:
        dump(hic_data, out)

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    map_name = path.split(opts.workdir)[-1]
    want_figs = not opts.only_txt

    def export_maps(nrm_fig, nrm_txt, raw_fig, raw_txt, **how):
        # normalized map first (only if normalization was done), then raw
        if normalize:
            hic_map(hic_data, normalized=True, cmap='jet', name=map_name,
                    savefig=nrm_fig, savedata=nrm_txt, **how)
        hic_map(hic_data, normalized=False, cmap='jet', name=map_name,
                savefig=raw_fig, savedata=raw_txt, **how)

    if "intra" in opts.keep:
        print("  Saving intra chromosomal raw and normalized matrices...")
        if want_figs:
            if normalize:
                intra_dir_nrm_fig = in_outdir(
                    'intra_chromosome_nrm_images_' + tag)
            intra_dir_raw_fig = in_outdir('intra_chromosome_raw_images_' + tag)
        if normalize:
            intra_dir_nrm_txt = in_outdir(
                'intra_chromosome_nrm_matrices_' + tag)
        intra_dir_raw_txt = in_outdir('intra_chromosome_raw_matrices_' + tag)
        export_maps(intra_dir_nrm_fig, intra_dir_nrm_txt,
                    intra_dir_raw_fig, intra_dir_raw_txt, by_chrom='intra')

    if "inter" in opts.keep:
        print("  Saving inter chromosomal raw and normalized matrices...")
        if want_figs:
            if normalize:
                inter_dir_nrm_fig = in_outdir(
                    'inter_chromosome_nrm_images_' + tag)
            inter_dir_raw_fig = in_outdir('inter_chromosome_raw_images_' + tag)
        if normalize:
            inter_dir_nrm_txt = in_outdir(
                'inter_chromosome_nrm_matrices_' + tag)
        inter_dir_raw_txt = in_outdir('inter_chromosome_raw_matrices_' + tag)
        export_maps(inter_dir_nrm_fig, inter_dir_nrm_txt,
                    inter_dir_raw_fig, inter_dir_raw_txt, by_chrom='inter')

    if "genome" in opts.keep:
        print("  Saving normalized genomic matrix...")
        if want_figs:
            if normalize:
                genom_map_nrm_fig = in_outdir('genomic_maps_nrm_%s.pdf' % tag)
            genom_map_raw_fig = in_outdir('genomic_maps_raw_%s.pdf' % tag)
        if normalize:
            genom_map_nrm_txt = in_outdir('genomic_nrm_%s.tsv' % tag)
        genom_map_raw_txt = in_outdir('genomic_raw_%s.tsv' % tag)
        # whole-genome map: hic_map is called without `by_chrom`
        export_maps(genom_map_nrm_fig, genom_map_nrm_txt,
                    genom_map_raw_fig, genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d,
               a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads,
               len(hic_data.bads.keys()), len(hic_data),
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               pickle_path, launch_time, finish_time)
Example #8
0
# Comma-separated fractions of mapped reads: read 1 first, then read 2
fraction_mapped_str = ",".join(
    map(str, (fraction_mapped_read1, fraction_mapped_read2)))

# Plot: distribution of dangling-end lengths
# (`infile` is also the input of the two plots that follow)
infile = '%s/%s_both_map.tsv' % (PROCESSED, pair_id)
outfile = '%s/%s_plot_distribution_dangling_ends_lengths.png' % (
    POSTMAPPING_PLOTS, pair_id)
plt.rcParams['font.size'] = 12
insert_sizes(infile, savefig=outfile, xlog=False, max_size=99.9)

# Plot: decay of interaction counts with genomic distance
outfile = '%s/%s_plot_decay_interaction_counts_genomic_distance.png' % (
    POSTMAPPING_PLOTS, pair_id)
plt.rcParams['font.size'] = 12
myvalues = plot_distance_vs_interactions(
    infile, max_diff=50000000, resolution=10000, savefig=outfile)
# keep the first value of the second returned element, as a string
slope = str(myvalues[1][0])

# Plot: sequencing coverage along chromosomes (larger font for this figure)
outfile = '%s/%s_plot_genomic_coverage_mapped_%s.png' % (
    POSTMAPPING_PLOTS, pair_id, genomic_coverage_resolution)
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    infile,
    name='mapped',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)

# Save the coverage table returned by the plot as a tab-separated file
outfile = '%s/%s_plot_genomic_coverage_mapped_%s.bed' % (
    COVERAGES, pair_id, genomic_coverage_resolution)
coverages.to_csv(outfile, sep='\t', index=False)