Ejemplo n.º 1
0
def main():
    """
    Normalize a full-genome BAM file and pickle the result.

    Options come from ``get_options()``; ``read_bam`` is called with a fixed
    ``factor`` of 1 and ``sigma`` of 2. The biases, decay, bad columns and
    resolution are then dumped as a single dictionary into
    ``biases_<resolution>.pickle`` inside the output directory.
    """
    opts           = get_options()

    inbam          = opts.inbam
    resolution     = opts.reso
    filter_exclude = opts.filter
    min_count      = opts.min_count
    ncpus          = opts.cpus
    factor         = 1
    outdir         = opts.outdir
    sigma          = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus, sigma=sigma,
                                     factor=factor, outdir=outdir, check_sum=opts.check_sum)

    printime('  - Saving biases and badcol columns')
    bias_path = os.path.join(outdir, 'biases_%s.pickle' % (
        nicer(resolution).replace(' ', '')))
    # pickle streams are bytes, hence binary mode; ``with`` guarantees the
    # handle is closed even when dump() raises
    with open(bias_path, 'wb') as out:
        dump({'biases'    : biases,
              'decay'     : decay,
              'badcol'    : badcol,
              'resolution': resolution}, out)

    printime('\nDone.')
Ejemplo n.º 2
0
def main():
    """
    Normalize a full-genome BAM file and pickle the result.

    Options come from ``get_options()``; ``read_bam`` is called with a fixed
    ``factor`` of 1 and ``sigma`` of 2. The biases, decay, bad columns and
    resolution are then dumped as a single dictionary into
    ``biases_<resolution>.pickle`` inside the output directory.
    """
    opts = get_options()

    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    min_count = opts.min_count
    ncpus = opts.cpus
    factor = 1
    outdir = opts.outdir
    sigma = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam,
                                     filter_exclude,
                                     resolution,
                                     min_count=min_count,
                                     ncpus=ncpus,
                                     sigma=sigma,
                                     factor=factor,
                                     outdir=outdir,
                                     check_sum=opts.check_sum)

    printime('  - Saving biases and badcol columns')
    bias_path = os.path.join(
        outdir, 'biases_%s.pickle' % (nicer(resolution).replace(' ', '')))
    # pickle streams are bytes, hence binary mode; ``with`` guarantees the
    # handle is closed even when dump() raises
    with open(bias_path, 'wb') as out:
        dump(
            {
                'biases': biases,
                'decay': decay,
                'badcol': badcol,
                'resolution': resolution
            }, out)

    printime('\nDone.')
Ejemplo n.º 3
0
def read_bam(inbam,
             filter_exclude,
             resolution,
             min_count=2500,
             sigma=2,
             ncpus=8,
             factor=1,
             outdir='.',
             check_sum=False):
    """
    Compute per-bin biases, the interaction decay and the bad columns of a
    BAM file.

    The genome is cut into chunks of consecutive bins. Each chunk is handed
    to ``read_bam_frag`` in a worker pool; the temporary pickles expected in
    *outdir* afterwards (presumably written by the workers) are then
    combined here.

    :param inbam: path to the BAM file (opened with ``AlignmentFile``)
    :param filter_exclude: forwarded untouched to ``read_bam_frag``
    :param resolution: bin size in nucleotides
    :param 2500 min_count: bins with a count below this value are flagged as
       bad columns; a false value switches the filtering to
       ``filter_by_cis_percentage``
    :param 2 sigma: forwarded to ``filter_by_cis_percentage``
    :param 8 ncpus: number of processes in each worker pool
    :param 1 factor: divisor used when computing the bias rescaling target
    :param '.' outdir: directory holding the temporary pickles (and where
       the filtering plot is saved)
    :param False check_sum: if True, recompute and print the normalized sum
       after the biases have been rescaled

    :returns: a dictionary of biases (bin index -> bias), a dictionary of
       decay values (keys as returned by ``sum_dec_matrix``, divided by the
       number of valid cells of the matching diagonal) and the bad columns
    """
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))
    # only the header is needed here; workers receive the path, not the handle
    bamfile.close()

    # genome-wide first/last bin index of each chromosome: every chromosome
    # takes sections[crm] + 1 consecutive indices
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    # flat list of (chromosome, bin-in-chromosome), indexed by genome-wide bin
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin = len(bins) + 1

    total = end_bin - start_bin + 1
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        # end_bin lies one past the number of bins: when len(bins) is a
        # multiple of the chunk size the last step starts beyond the final
        # bin and there is nothing left to process (the fallback below would
        # raise IndexError again on bins[i])
        if i >= len(bins):
            break
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
        if crm1 != crm2:
            # chunk crossing a chromosome boundary: split it in two regions
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        procs.append(
            pool.apply_async(read_bam_frag,
                             args=(
                                 inbam,
                                 filter_exclude,
                                 bins,
                                 bins_dict,
                                 resolution,
                                 outdir,
                                 region,
                                 start,
                                 end,
                             )))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    verbose = True
    cisprc = {}
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if verbose:
            if not countbin % 10 and countbin:
                sys.stdout.write(' ')
            if not countbin % 50 and countbin:
                sys.stdout.write(' %9s\n     ' % ('%s/%s' %
                                                  (countbin, len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

        fname = os.path.join(outdir,
                             'tmp_bins_%s:%d-%d.pickle' % (region, start, end))
        # close each temporary pickle as soon as it has been loaded
        with open(fname) as tmp_handle:
            tmp_cisprc = load(tmp_handle)
        cisprc.update(tmp_cisprc)
    if verbose:
        print('%s %9s\n' % (' ' * (54 - (countbin % 50) -
                                   (countbin % 50) / 10), '%s/%s' %
                            (len(regs), len(regs))))

    # bad columns
    print('  - Removing columns with too few or too much interactions')
    if not min_count:
        badcol = filter_by_cis_percentage(
            cisprc,
            sigma=sigma,
            verbose=True,
            savefig=os.path.join(
                outdir, 'filtered_bins_%s.png' %
                (nicer(resolution).replace(' ', ''))))
    else:
        print('      -> too few  interactions defined as less than %9d interactions' % (
            min_count))
        # keep only the second element of each entry as the count of the bin
        for k in cisprc:
            cisprc[k] = cisprc[k][1]
        badcol = {}
        countL = 0
        countZ = 0
        # NOTE(review): total is len(bins) + 2 at this point, so two indices
        # past the last bin are tested (and flagged) as well -- confirm that
        # this is intended
        for c in xrange(total):
            if cisprc.get(c, 0) < min_count:
                badcol[c] = cisprc.get(c, 0)
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print('      -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100))

    printime('  - Rescaling biases')
    size = len(bins)
    biases = [cisprc.get(k, 1.) for k in range(size)]
    mean_col = float(sum(biases)) / len(biases)
    biases = dict([(k, b / mean_col * mean_col**0.5)
                   for k, b in enumerate(biases)])

    # sum of the normalized sub-matrices, one job per chunk
    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(
            fname,
            biases,
        )))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    # check the sum
    if check_sum:
        pool = mu.Pool(ncpus)
        procs = []
        for region, start, end in zip(regs, begs, ends):
            fname = os.path.join(outdir,
                                 'tmp_%s:%d-%d.pickle' % (region, start, end))
            procs.append(
                pool.apply_async(sum_nrm_matrix, args=(
                    fname,
                    biases,
                )))
        pool.close()
        print_progress(procs)
        pool.join()

        # sum obtained with the rescaled biases
        sumnrm = sum(p.get() for p in procs)
        print('SUM: %s' % (sumnrm,))

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)

    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(
            pool.apply_async(sum_dec_matrix,
                             args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results: add up the partial sums returned by each chunk
    sumdec = {}
    for proc in procs:
        for k, v in proc.get().iteritems():
            try:
                sumdec[k] += v
            except KeyError:
                sumdec[k] = v

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos)
    # initialize dictionary
    ndiags = dict((k, 0) for k in xrange(len_big))
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set(
            )  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b <= maxp:
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    for k in sumdec:
        try:
            sumdec[k] /= ndiags[k]
        except ZeroDivisionError:  # all columns at this distance are "bad"
            pass

    return biases, sumdec, badcol
Ejemplo n.º 4
0
def read_bam(inbam, filter_exclude, resolution, min_count=2500,
             sigma=2, ncpus=8, factor=1, outdir='.', check_sum=False):
    """
    Compute per-bin biases, the interaction decay and the bad columns of a
    BAM file.

    The genome is cut into chunks of consecutive bins. Each chunk is handed
    to ``read_bam_frag`` in a worker pool; the temporary pickles expected in
    *outdir* afterwards (presumably written by the workers) are then
    combined here.

    :param inbam: path to the BAM file (opened with ``AlignmentFile``)
    :param filter_exclude: forwarded untouched to ``read_bam_frag``
    :param resolution: bin size in nucleotides
    :param 2500 min_count: bins with a count below this value are flagged as
       bad columns; a false value switches the filtering to
       ``filter_by_cis_percentage``
    :param 2 sigma: forwarded to ``filter_by_cis_percentage``
    :param 8 ncpus: number of processes in each worker pool
    :param 1 factor: divisor used when computing the bias rescaling target
    :param '.' outdir: directory holding the temporary pickles (and where
       the filtering plot is saved)
    :param False check_sum: if True, recompute and print the normalized sum
       after the biases have been rescaled

    :returns: a dictionary of biases (bin index -> bias), a dictionary of
       decay values (keys as returned by ``sum_dec_matrix``, divided by the
       number of valid cells of the matching diagonal) and the bad columns
    """
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    # only the header is needed here; workers receive the path, not the handle
    bamfile.close()

    # genome-wide first/last bin index of each chromosome: every chromosome
    # takes sections[crm] + 1 consecutive indices
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    # flat list of (chromosome, bin-in-chromosome), indexed by genome-wide bin
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin   = len(bins) + 1

    total = end_bin - start_bin + 1
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        # end_bin lies one past the number of bins: when len(bins) is a
        # multiple of the chunk size the last step starts beyond the final
        # bin and there is nothing left to process (the fallback below would
        # raise IndexError again on bins[i])
        if i >= len(bins):
            break
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
        if crm1 != crm2:
            # chunk crossing a chromosome boundary: split it in two regions
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    verbose = True
    cisprc = {}
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if verbose:
            if not countbin % 10 and countbin:
                sys.stdout.write(' ')
            if not countbin % 50 and countbin:
                sys.stdout.write(' %9s\n     ' % ('%s/%s' % (countbin , len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

        fname = os.path.join(outdir,
                             'tmp_bins_%s:%d-%d.pickle' % (region, start, end))
        # close each temporary pickle as soon as it has been loaded
        with open(fname) as tmp_handle:
            tmp_cisprc = load(tmp_handle)
        cisprc.update(tmp_cisprc)
    if verbose:
        print('%s %9s\n' % (' ' * (54 - (countbin % 50) - (countbin % 50) / 10),
                            '%s/%s' % (len(regs),len(regs))))

    # bad columns
    print('  - Removing columns with too few or too much interactions')
    if not min_count:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True,
            savefig=os.path.join(outdir, 'filtered_bins_%s.png' % (
                nicer(resolution).replace(' ', ''))))
    else:
        print('      -> too few  interactions defined as less than %9d interactions' % (
            min_count))
        # keep only the second element of each entry as the count of the bin
        for k in cisprc:
            cisprc[k] = cisprc[k][1]
        badcol = {}
        countL = 0
        countZ = 0
        # NOTE(review): total is len(bins) + 2 at this point, so two indices
        # past the last bin are tested (and flagged) as well -- confirm that
        # this is intended
        for c in xrange(total):
            if cisprc.get(c, 0) < min_count:
                badcol[c] = cisprc.get(c, 0)
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print('      -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total, float(len(badcol)) / total * 100))

    printime('  - Rescaling biases')
    size = len(bins)
    biases = [cisprc.get(k, 1.) for k in range(size)]
    mean_col = float(sum(biases)) / len(biases)
    biases = dict([(k, b / mean_col * mean_col**0.5)
                   for k, b in enumerate(biases)])

    # sum of the normalized sub-matrices, one job per chunk
    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases, )))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    # check the sum
    if check_sum:
        pool = mu.Pool(ncpus)
        procs = []
        for region, start, end in zip(regs, begs, ends):
            fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end))
            procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases, )))
        pool.close()
        print_progress(procs)
        pool.join()

        # sum obtained with the rescaled biases
        sumnrm = sum(p.get() for p in procs)
        print('SUM: %s' % (sumnrm,))

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)

    pool = mu.Pool(ncpus)
    procs = []
    for region, start, end in zip(regs, begs, ends):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results: add up the partial sums returned by each chunk
    sumdec = {}
    for proc in procs:
        for k, v in proc.get().iteritems():
            try:
                sumdec[k] += v
            except KeyError:
                sumdec[k]  = v

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos)
    # initialize dictionary
    ndiags = dict((k, 0) for k in xrange(len_big))
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b <= maxp:
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    for k in sumdec:
        try:
            sumdec[k] /= ndiags[k]
        except ZeroDivisionError:  # all columns at this distance are "bad"
            pass

    return biases, sumdec, badcol
Ejemplo n.º 5
0
def run(opts):
    """
    Normalize the reads of a BAM file and store the resulting biases.

    The output goes to ``<workdir>/04_normalization``. For the ``oneD``
    normalization the genome FASTA, the mappability track and the
    restriction enzyme are used to build per-bin mappability, GC content and
    restriction-site counts, which are handed to ``read_bam``. Unless
    ``opts.normalize_only`` is set, the interaction decay is plotted. The
    biases, decay, bad columns and resolution are then pickled, and the run
    is registered through ``save_to_db``.

    :param opts: parsed options of the tool (validated by ``check_options``)

    :raises Exception: for ``oneD`` normalization, when the FASTA path, the
       enzyme name or the mappability path is missing, when the BAM holds
       chromosomes absent from the FASTA, or when no chromosome is shared
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c)
                              for c in (bam -
                                        fas)]) if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception(
                "ERROR: chromosomes in FASTA different the ones in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = dict((c, []) for c in refs)
        # ``with`` closes the track even if parsing fails half-way
        with open(opts.mappability) as fh:
            line = fh.next()
            crmM, begM, endM, val = line.split()
            crm = crmM
            # skip leading records of chromosomes that are not in the BAM
            if crmM not in mappability:
                print('     skipping %s' % crmM)
                while crmM not in mappability:
                    line = fh.next()
                    crmM, begM, endM, val = line.split()
                    crm = crmM
            # loop until every chromosome received at least one bin
            while any(not mappability[c] for c in mappability):
                for begB in xrange(0, len(genome[crmM]), opts.reso):
                    endB = begB + opts.reso
                    tmp = 0
                    try:
                        while True:
                            crmM, begM, endM, val = line.split()
                            if crm != crmM:
                                # reached another chromosome: fast-forward
                                # to the next one present in refs
                                try:
                                    while crmM not in refs:
                                        line = fh.next()
                                        crmM, _ = line.split('\t', 1)
                                except StopIteration:
                                    pass
                                break
                            begM = int(begM)
                            endM = int(endM)
                            if endM > endB:
                                # record runs past the bin: count only the
                                # part lying inside it
                                weight = endB - begM
                                if weight >= 0:
                                    tmp += weight * float(val)
                                break
                            weight = endM - (begM if begM > begB else begB)
                            if weight < 0:
                                break
                            tmp += weight * float(val)
                            line = fh.next()
                    except StopIteration:
                        pass
                    # length-weighted average over the bin
                    mappability[crm].append(tmp / opts.reso)
                    crm = crmM
        # concatenate the per-chromosome lists following the order of refs
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols)

    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' %
        (nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    # pickle streams are bytes, hence binary mode; ``with`` guarantees the
    # handle is closed even when dump() raises
    with open(bias_file, 'wb') as out:
        dump(
            {
                'biases': biases,
                'decay': decay,
                'badcol': badcol,
                'resolution': opts.reso
            }, out)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2,
                   opts.filter, launch_time, finish_time)
    except:
        # deliberately catches everything (interrupts included): the lock
        # file has to be released whatever went wrong
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Ejemplo n.º 6
0
 def format_yticks(tickstring, _=None):
     """
     Build the label of a y-axis tick.

     The tick value is scaled by ``reso``, shifted by ``pltbeg2`` and
     truncated to an integer; a resulting 0 is shown as 1. The second
     argument is ignored.
     """
     position = int(tickstring * reso + pltbeg2)
     if not position:
         position = 1
     return nicer(position, comma=',', allowed_decimals=1)
Ejemplo n.º 7
0
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the number of interactions as a function of the genomic distance
    (log/log scale) and fit three straight lines to it.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: display the plot with matplotlib GUI (only used when
       *savefig* is not given)
    :param None genome_seq: dictionary with, for each chromosome, its size
       in bins; only used when *data* is a list of lists, to sum each
       diagonal chromosome by chromosome
    :param None resolution: group reads that are closer than this resolution
       parameter (1 is used when not given). With a resolution of 1 the
       limits of the three fitted segments are searched for; otherwise they
       are fixed at 0.7 Mb and 10 Mb
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated
       (the extension of the file name will determine the desired format)
    :param False normalized: for HiC_data input, divide each cell by the
       biases of its row and of its column

    :returns: one (slope, intercept, r) tuple for each of the 3 fits.
       NOTE(review): with a resolution of 1 the third value of each tuple
       comes from the last fit tried, not from the best one -- confirm
       before relying on it
    """
    resolution = resolution or 1
    dist_intr = dict([(i, 0) for i in xrange(min_diff, max_diff)])
    if isinstance(data, str):
        # ``with`` closes the file even when a malformed line raises
        with open(data) as fhandler:
            line = fhandler.next()
            while line.startswith('#'):
                line = fhandler.next()
            try:
                while True:
                    _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                    if cr1 != cr2:
                        line = fhandler.next()
                        continue
                    diff = abs(int(ps1)  / resolution - int(ps2) / resolution)
                    if max_diff > diff >= min_diff:
                        dist_intr[diff] += 1
                    line = fhandler.next()
            except StopIteration:
                pass
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                    (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # the cell of the diagonal is (i, i + diff), the
                        # same one that was just tested for NaN
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                    (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count... reduce max_dist
    for diff in xrange(max_diff - 1, min_diff, -1):
        try:
            if not dist_intr[diff]:
                del(dist_intr[diff])
                max_diff -= 1
                continue
        except KeyError:
            max_diff -= 1
            continue
        break
    xp, yp = zip(*sorted(dist_intr.items(), key=lambda x: x[0]))
    # keep only the distances with a non-null count (log scale)
    x = [dist for dist, count in zip(xp, yp) if count]
    y = [count for count in yp if count]
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)
    if resolution == 1:
        # search the pair of break points (v1, v2) giving the best fits
        best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
        ntries = 100
        k = 1  # number of points left out between two segments
        for i in xrange(3, ntries - 2 - k):
            v1 = i * len(x) / ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in xrange(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) / ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1+k:v2], logy[v1+k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2+k:], logy[v2+k:])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3,
                            b1, b2, b3, k)
        # parameters of the best fit
        (v1, v2,
         a1, a2, a3,
         b1, b2, b3, gap) = best[1:]
    else:
        gap = 0
        # from 0.7 Mb
        v1 = 700000   / resolution
        # to 10 Mb
        v2 = 10000000 / resolution
        try:
            a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:], logy[v2:])
        except ValueError:
            a3, b3, r23 = 0, 0, 0
    # plot the three fitted lines; segments are labelled by distance range
    # when a resolution is given, by rank otherwise
    by_range = resolution != 1
    segments = (
        (slice(None, v1), a1, b1, 'yellow',
         r'0-0.7 \mathrm{ Mb}' if by_range else '1'),
        (slice(v1 + gap, v2), a2, b2, 'orange',
         r'0.7-10 \mathrm{ Mb}' if by_range else '2'),
        (slice(v2 + gap, None), a3, b3, 'red',
         r'10 \mathrm{ Mb}-\infty' if by_range else '3'),
    )
    for part, slope, intercept, color, tag in segments:
        xfit = x[part]
        yfit = np.exp(intercept + slope * np.array(np.log(xfit)))
        axe.plot(xfit, yfit, color=color, lw=2,
                 label=r'$\alpha_{%s}=%.2f$' % (tag, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show == True:
        plt.show()
        plt.close('all')
    return (a1, b1, r21), (a2, b2, r22), (a3, b3, r23)
Ejemplo n.º 8
0
def read_bam(inbam,
             filter_exclude,
             resolution,
             biases,
             ncpus=8,
             region1=None,
             start1=None,
             end1=None,
             region2=None,
             start2=None,
             end2=None,
             outdir='.'):
    """
    Extract an interaction (sub)matrix from a BAM file and write it to
    *outdir* in three-column text format.

    The selected bins are split into chunks; each chunk is handed to
    ``read_bam_frag`` (through a pool of *ncpus* workers, or directly when
    *ncpus* is 1). The per-chunk ``tmp_<region>:<beg>-<end>.pickle`` files are
    then read back from *outdir*, merged into the output matrices and removed.

    Files written: ``matrix_raw_<name>_<resolution>.abc`` always, plus
    ``matrix_nrm_*.abc`` and ``matrix_dec_*.abc`` when *biases* is not empty.

    :param inbam: path to the BAM file
    :param filter_exclude: passed untouched to ``read_bam_frag``
    :param resolution: bin size used to split chromosomes
    :param biases: dictionary with the optional keys 'biases', 'decay' and
       'badcol'. 'biases' and 'badcol' are keyed by genome-wide bin index,
       'decay' by bin distance. An empty dictionary disables the normalized
       outputs.
    :param 8 ncpus: number of worker processes
    :param None region1: chromosome name of region 1 (whole genome if None)
    :param None start1: start genomic coordinate of region 1
    :param None end1: end genomic coordinate of region 1
    :param None region2: chromosome name of region 2 (if not given, region 1
       is used for both axes)
    :param None start2: start genomic coordinate of region 2
    :param None end2: end genomic coordinate of region 2
    :param '.' outdir: directory for temporary and output files

    :raises Exception: if *start1* or *end1* is given without *region1*
    """
    bamfile = pysam.AlignmentFile(inbam, 'rb')
    # number of bins of each chromosome at the wanted resolution
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))
    # genome-wide (first, last) bin index of each chromosome
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    # flat list of (chromosome, bin-in-chromosome) covering the whole genome
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])
    start_bin = 0
    end_bin = len(bins) + 1
    if region1:
        regions = [region1]
        start_bin = [i for i, b in enumerate(bins) if b[0] == region1][0]
        end_bin = [
            i for i, b in enumerate(bins[start_bin:], start_bin)
            if b[0] == region1
        ][-1]
    else:
        regions = bamfile.references
        if start1 or end1:
            raise Exception('ERROR: Cannot use start/end1 without region')

    if start1:
        start_bin = section_pos[region1][0] + start1 / resolution
    else:
        start1 = 0
    if end1:
        end_bin = section_pos[region1][0] + end1 / resolution
    elif region1:
        # only look the chromosome up when one was requested: indexing
        # section_pos with None made whole-genome extraction raise KeyError
        end1 = (section_pos[region1][1] - section_pos[region1][0]) * resolution

    # split the wanted bins in chunks (one job per chunk)
    total = end_bin - start_bin + 1
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in xrange(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            # chunk spanning two chromosomes: split it in two jobs
            fin1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            ends.append(fin2 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # reduce dictionaries
    bins = []
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if len(regions) == 1:
            start = start_bin - beg_crm
            end = end_bin - beg_crm
        else:
            start = 0
            end = section_pos[crm][1] - section_pos[crm][0] + 1
        bins.extend([(crm, i) for i in xrange(start, end)])
    bins_dict1 = dict((j, i) for i, j in enumerate(bins))
    if region2:
        beg_crm = section_pos[region2][0]
        # NOTE(review): a start2 of 0 is falsy and therefore handled as
        # "no coordinates given" (end2 is then overwritten) -- confirm that
        # this is intended
        if start2:
            start_bin2 = section_pos[region2][0] + start2 / resolution
            end_bin2 = section_pos[region2][0] + end2 / resolution
        else:
            start2 = 0
            start_bin2 = 0
            end_bin2 = section_pos[region2][1]
            end2 = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end = end_bin2 - beg_crm
        bins = [(region2, i) for i in xrange(start, end)]
        bins_dict2 = dict((j, i) for i, j in enumerate(bins))
    else:
        bins_dict2 = bins_dict1
    pool = mu.Pool(ncpus)
    ## RUN!
    printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    procs = []
    for region, b, e in zip(regs, begs, ends):
        frag_args = (inbam, filter_exclude, bins_dict1, bins_dict2,
                     resolution, outdir, region, b, e)
        if ncpus == 1:
            read_bam_frag(*frag_args)
        else:
            procs.append(pool.apply_async(read_bam_frag, args=frag_args))
    pool.close()
    print_progress(procs)
    pool.join()

    printime('  - Writing matrices')
    # keep only biases/bad columns of the wanted bins, re-indexed from zero
    bias1 = dict((k - start_bin, v)
                 for k, v in biases.get('biases', {}).iteritems()
                 if start_bin <= k <= end_bin)
    if region2:
        bias2 = dict((k - start_bin2, v)
                     for k, v in biases.get('biases', {}).iteritems()
                     if start_bin2 <= k <= end_bin2)
    else:
        bias2 = bias1
    decay = biases.get('decay', {})
    bads1 = dict((k - start_bin, v)
                 for k, v in biases.get('badcol', {}).iteritems()
                 if start_bin <= k <= end_bin)
    if region2:
        bads2 = dict((k - start_bin2, v)
                     for k, v in biases.get('badcol', {}).iteritems()
                     if start_bin2 <= k <= end_bin2)
    else:
        bads2 = bads1
    # hic_data = HiC_data((), len(bins_dict), sections,
    #                     bins_dict, resolution=resolution)
    # region1 given <=> exactly one region was selected above; testing
    # region1 keeps the whole-genome name 'full' even for a BAM with a
    # single reference sequence
    if region1:
        if region2:
            name = '%s:%d-%d_%s:%d-%d' % (region1, start1 / resolution,
                                          end1 / resolution, region2, start2 /
                                          resolution, end2 / resolution)
        else:
            name = '%s:%d-%d' % (region1, start1 / resolution,
                                 end1 / resolution)
    else:
        name = 'full'
    suffix = '%s_%s.abc' % (name, nicer(resolution).replace(' ', ''))

    def write_header(handle):
        # same header for the raw, normalized and decay-corrected matrices
        handle.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            handle.write('# BADROWS %s\n' %
                         (','.join([str(b) for b in bads1])))
            handle.write('# BADCOLS %s\n' %
                         (','.join([str(b) for b in bads2])))
        else:
            handle.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))

    out_raw = open(os.path.join(outdir, 'matrix_raw_' + suffix), 'w')
    out_nrm = out_dec = None
    try:
        write_header(out_raw)
        if biases:
            out_nrm = open(os.path.join(outdir, 'matrix_nrm_' + suffix), 'w')
            write_header(out_nrm)
            out_dec = open(os.path.join(outdir, 'matrix_dec_' + suffix), 'w')
            write_header(out_dec)

        def write2matrix(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))

        def write2matrices(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / bias1[a] / bias2[b]))
            out_dec.write('%d\t%d\t%f\n' %
                          (a, b, c / bias1[a] / bias2[b] / decay[abs(a - b)]))

        def write2matrices_2reg(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / bias1[a] / bias2[b]))
            out_dec.write(
                '%d\t%d\t%f\n' %
                (a, b, c / bias1[a] / bias2[b] / decay[abs((a + start_bin) -
                                                           (b + start_bin2))]))

        def write2matrices_err(a, b, c):
            out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            out_nrm.write('%d\t%d\t%f\n' % (a, b, c / bias1[a] / bias2[b]))
            try:
                out_dec.write(
                    '%d\t%d\t%f\n' %
                    (a, b, c / bias1[a] / bias2[b] / decay[abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('%d\t%d\t%s\n' % (a, b, 'nan'))

        if biases:
            if len(regions) == 1:
                if region2:
                    write = write2matrices_2reg
                else:
                    write = write2matrices
            else:
                write = write2matrices_err
        else:
            write = write2matrix

        sys.stdout.write('     ')
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            if not i % 10 and i:
                sys.stdout.write(' ')
            if not i % 50 and i:
                sys.stdout.write(' %9s\n     ' % ('%s/%s' % (i, len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

            fname = os.path.join(outdir,
                                 'tmp_%s:%d-%d.pickle' % (region, start, end))
            # temporary file written by this very run: the handle is closed
            # as soon as the chunk is loaded
            with open(fname) as tmp_handle:
                dico = load(tmp_handle)
            for (j, k), v in dico.iteritems():
                if j in bads1 or k in bads2:
                    continue
                write(j, k, v)
            # remove without going through a shell (chromosome names end up
            # in the file name); a missing file is ignored as with ``rm -f``
            try:
                os.remove(fname)
            except OSError:
                pass
    finally:
        # close whatever was opened, also when a chunk fails to load
        out_raw.close()
        if out_nrm is not None:
            out_nrm.close()
        if out_dec is not None:
            out_dec.close()
    print('%s %9s\n' % (' ' * (54 - (i % 50) - (i % 50) / 10),
                        '%s/%s' % (len(regs), len(regs))))
Ejemplo n.º 9
0
 def __repr__(self):
     """
     Build the one-line summary of the experiment: name, resolution, number
     of TADs (None when there are none), number of Hi-C rows and the
     normalization applied ('None' when not normalized).
     """
     template = ('Experiment %s (resolution: %s, TADs: %s, Hi-C rows: %s, '
                 'normalized: %s)')
     fields = (
         self.name,
         nicer(self.resolution),
         len(self.tads) or None,
         self.size,
         self._normalization or 'None',
     )
     return template % fields
Ejemplo n.º 10
0
def read_bam(inbam, filter_exclude, resolution, biases, ncpus=8,
             region1=None, start1=None, end1=None, verbose=False,
             region2=None, start2=None, end2=None, outdir=None,
             tmpdir='/tmp/', normalized=False, by_decay=False,
             get_all_data=False, use_bads=False):
    """
    Extracts a (normalized) submatrix at wanted resolution from pseudo-BAM file

    The selected bins are split into chunks; each chunk is handed to
    ``read_bam_frag`` (through a pool of *ncpus* workers, or directly when
    *ncpus* is 1). The per-chunk ``tmp_<region>:<beg>-<end>.pickle`` files are
    then read back from *tmpdir*, merged and removed.

    :param inbam: path to pseudoBAM file
    :param filter_exclude: passed untouched to ``read_bam_frag``
    :param resolution: bin size used to split chromosomes
    :param biases: dictionary with the optional keys 'biases', 'decay' and
       'badcol'. 'biases' and 'badcol' are keyed by genome-wide bin index,
       'decay' by bin distance.
    :param 8 ncpus: number of worker processes
    :param None region1: chromosome name of region 1 (whole genome if None)
    :param None start1: start genomic coordinate of region 1
    :param None end1: end genomic coordinate of region 1
    :param False verbose: print progress information
    :param None region2: chromosome name of region 2 (if not given use region1)
    :param None start2: start genomic coordinate of region 2 (if not given use
       the whole of region2)
    :param None end2: end genomic coordinate of region 2
    :param None outdir: if given, matrices are written there
       (``matrix_raw_*.abc`` and, when *biases* is not empty,
       ``matrix_nrm_*.abc`` and ``matrix_dec_*.abc``) and nothing is returned
    :param '/tmp/' tmpdir: directory for the temporary per-chunk files
    :param False normalized: returns the dictionary of bias-normalized values
       (only used when *outdir* is not given)
    :param False by_decay: returns the dictionary of decay-normalized values
       (only used when *outdir* is not given). If used together with
       *normalized*, a warning is issued and decay normalization is applied.
    :param False get_all_data: return biases and bad columns along with the
       matrix
    :param False use_bads: if True, the dictionaries of bad columns are
       emptied, so that no row/column is skipped

    :raises Exception: if *start1* or *end1* is given without *region1*

    :returns: when *outdir* is not given, the dictionary of interactions. If
       get_all_data is set to True, returns a dictionary with all biases used
       and bad columns (keys of the dictionary are: matrix, bias1, bias2,
       bads1, bads2, decay). When *outdir* is given nothing is returned.
    """
    if outdir:
        mkdir(outdir)
    mkdir(tmpdir)
    bamfile = pysam.AlignmentFile(inbam, 'rb')
    # number of bins of each chromosome at the wanted resolution
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    # genome-wide (first, last) bin index of each chromosome
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    # flat list of (chromosome, bin-in-chromosome) covering the whole genome
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin   = len(bins) + 1
    if region1:
        regions = [region1]
        start_bin = [i for i, b in enumerate(bins) if b[0] == region1][0]
        end_bin   = [i for i, b in enumerate(bins[start_bin:], start_bin)
                     if b[0] == region1][-1]
    else:
        regions = bamfile.references
        if start1 or end1:
            raise Exception('ERROR: Cannot use start/end1 without region')

    if start1:
        start_bin = section_pos[region1][0] + start1 / resolution
    else:
        start1 = 0
    if end1:
        end_bin = section_pos[region1][0] + end1 / resolution
    elif region1:
        # only look the chromosome up when one was requested: indexing
        # section_pos with None made whole-genome extraction raise KeyError
        end1 = (section_pos[region1][1] - section_pos[region1][0]) * resolution

    # split the wanted bins in chunks (one job per chunk)
    total = end_bin - start_bin + 1
    regs  = []
    begs  = []
    ends  = []
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in xrange(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            # chunk spanning two chromosomes: split it in two jobs
            fin1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            ends.append(fin2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # reduce dictionaries
    bins = []
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if len(regions) == 1:
            start = start_bin - beg_crm
            end   = end_bin   - beg_crm
        else:
            start = 0
            end   = section_pos[crm][1] - section_pos[crm][0] + 1
        bins.extend([(crm, i) for i in xrange(start, end)])
    bins_dict1 = dict((j, i) for i, j in enumerate(bins))
    if region2:
        beg_crm = section_pos[region2][0]
        if start2 is not None:
            start_bin2 = section_pos[region2][0] + start2 / resolution
            end_bin2   = section_pos[region2][0] + end2   / resolution
        else:
            start2     = 0
            start_bin2 = 0
            end_bin2   = section_pos[region2][1]
            end2       = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end   = end_bin2   - beg_crm
        bins = [(region2, i) for i in xrange(start, end)]
        bins_dict2 = dict((j, i) for i, j in enumerate(bins))
    else:
        bins_dict2 = bins_dict1
    pool = mu.Pool(ncpus)
    ## RUN!
    if verbose:
        printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    procs = []
    for region, b, e in zip(regs, begs, ends):
        frag_args = (inbam, filter_exclude, bins_dict1, bins_dict2,
                     resolution, tmpdir, region, b, e)
        if ncpus == 1:
            read_bam_frag(*frag_args)
        else:
            procs.append(pool.apply_async(read_bam_frag, args=frag_args))
    pool.close()
    if verbose:
        print_progress(procs)
    pool.join()

    if verbose:
        printime('  - Writing matrices')
    # keep only biases/bad columns of the wanted bins, re-indexed from zero
    bias1  = dict((k - start_bin, v)
                  for k, v in biases.get('biases', {}).iteritems()
                  if start_bin <= k < end_bin)
    if region2:
        bias2  = dict((k - start_bin2, v)
                      for k, v in biases.get('biases', {}).iteritems()
                      if start_bin2 <= k < end_bin2)
    else:
        bias2 = bias1
    decay = biases.get('decay' , {})
    bads1  = dict((k - start_bin, v)
                  for k, v in biases.get('badcol', {}).iteritems()
                  if start_bin <= k < end_bin)
    if region2:
        bads2  = dict((k - start_bin2, v)
                      for k, v in biases.get('badcol', {}).iteritems()
                      if start_bin2 <= k < end_bin2)
    else:
        bads2 = bads1
    if use_bads:
        bads2 = bads1 = {}
    # hic_data = HiC_data((), len(bins_dict), sections,
    #                     bins_dict, resolution=resolution)
    # region1 given <=> exactly one region was selected above; testing
    # region1 keeps the whole-genome name 'full' even for a BAM with a
    # single reference sequence
    if region1:
        if region2:
            name = '%s:%d-%d_%s:%d-%d' % (region1, start1 / resolution, end1 / resolution,
                                          region2, start2 / resolution, end2 / resolution)
        else:
            name = '%s:%d-%d' % (region1, start1 / resolution, end1 / resolution)
    else:
        name = 'full'

    def write_header(handle):
        # same header for the raw, normalized and decay-corrected matrices
        handle.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            handle.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
            handle.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
        else:
            handle.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))

    out_raw = out_nrm = out_dec = None
    try:
        if outdir:
            suffix = '%s_%s.abc' % (name, nicer(resolution).replace(' ', ''))
            out_raw = open(os.path.join(outdir, 'matrix_raw_' + suffix), 'w')
            write_header(out_raw)
            if biases:
                out_nrm = open(os.path.join(outdir, 'matrix_nrm_' + suffix), 'w')
                write_header(out_nrm)
                out_dec = open(os.path.join(outdir, 'matrix_dec_' + suffix), 'w')
                write_header(out_dec)

            def write2matrix(a, b, c):
                out_raw.write('%d\t%d\t%d\n' % (a, b, c))
            def write2matrices(a, b, c):
                out_raw.write('%d\t%d\t%d\n' % (a, b, c))
                out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
                out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
                                                           decay[abs(a-b)])))
            def write2matrices_2reg(a, b, c):
                out_raw.write('%d\t%d\t%d\n' % (a, b, c))
                out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
                out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
                                                           decay[abs((a + start_bin) -
                                                                     (b + start_bin2))])))
            def write2matrices_err(a, b, c):
                out_raw.write('%d\t%d\t%d\n' % (a, b, c))
                out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
                try:
                    out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
                                                               decay[abs(a-b)])))
                except KeyError:  # different chromosomes
                    out_dec.write('%d\t%d\t%s\n' % (a, b, 'nan'))

            if biases:
                if len(regions) == 1:
                    if region2:
                        write = write2matrices_2reg
                    else:
                        write = write2matrices
                else:
                    write = write2matrices_err
            else:
                write = write2matrix

        if verbose:
            sys.stdout.write('     ')
        dico = {}
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            if verbose:
                if not i % 10 and i:
                    sys.stdout.write(' ')
                if not i % 50 and i:
                    sys.stdout.write(' %9s\n     ' % ('%s/%s' % (i , len(regs))))
                sys.stdout.write('.')
                sys.stdout.flush()
            fname = os.path.join(tmpdir, 'tmp_%s:%d-%d.pickle' % (region, start, end))
            # temporary file written by this very run: the handle is closed
            # as soon as the chunk is loaded
            with open(fname) as tmp_handle:
                chunk = load(tmp_handle)
            if outdir:
                # stream the chunk to the output files
                for (j, k), v in chunk.iteritems():
                    if j in bads1 or k in bads2:
                        continue
                    write(j, k, v)
            else:
                # accumulate everything in memory to return it
                dico.update(chunk)
            # remove without going through a shell (chromosome names end up
            # in the file name); a missing file is ignored as with ``rm -f``
            try:
                os.remove(fname)
            except OSError:
                pass
    finally:
        # close whatever was opened, also when a chunk fails to load
        for handle in (out_raw, out_nrm, out_dec):
            if handle is not None:
                handle.close()
    if verbose:
        print('%s %9s\n' % (' ' * (54 - (i % 50) - (i % 50) / 10),
                            '%s/%s' % (len(regs),len(regs))))
    if normalized and by_decay:
        warn('WARNING: choose either normalized or by_decay. Using decay normalization')
    if not outdir:
        if by_decay:
            if region2:
                for i, j in dico:
                    if i in bads1 or j in bads2:
                        continue
                    try:
                        dico[(i, j)] /= bias1[i] * bias2[j] * decay[abs((i + start_bin) -
                                                                        (j + start_bin2))]
                    except KeyError:
                        dico[(i, j)] = float('nan')  # no value in decay
            else:
                for i, j in dico:
                    dico[(i, j)] /= bias1[i] * bias2[j] * decay[abs(i - j)]
        elif normalized:
            for i, j in dico:
                dico[(i, j)] /= bias1[i] * bias2[j]
        if get_all_data:
            return {'matrix': dico,
                    'bias1' : bias1,
                    'bias2' : bias2,
                    'bads1' : bads1,
                    'bads2' : bads2,
                    'decay' : decay}
        return dico
Ejemplo n.º 11
0
def plot_filtering(nears,
                   ratio,
                   size,
                   cut_count,
                   cut_ratio,
                   outfile,
                   base_position=None,
                   next_position=None,
                   last_position=None,
                   resolution=1,
                   legend=''):
    """
    Scatter-plot, for each bin, the interaction ratio (x axis) against the
    number of interactions (y axis), shade the area below the cut-offs and
    save the figure to *outfile*.

    :param nears: dictionary of values plotted on the y axis, keyed by bin
       index (missing bins count as 0)
    :param ratio: dictionary of values plotted on the x axis, keyed by bin
       index (missing bins count as 0)
    :param size: number of bins; bins 0 to size - 1 are plotted
    :param cut_count: count cut-off (upper bound of the shaded area along the
       y axis)
    :param cut_ratio: ratio cut-off (upper bound of the shaded area along the
       x axis)
    :param outfile: path passed to ``plt.savefig``
    :param None base_position: multiplied by *resolution* to build the x axis
       label; despite the default it has to be a number
    :param None next_position: same as *base_position*
    :param None last_position: same as *base_position*
    :param 1 resolution: bin size, used in the labels
    :param '' legend: title of the legend
    """
    # the more points, the smaller and the more transparent the markers
    if size > 50_000:
        marker_size = 1
    elif size > 20_000:
        marker_size = 2
    else:
        marker_size = 3
    if size > 500_000:
        marker_alpha = 0.01
    elif size > 200_000:
        marker_alpha = 0.05
    elif size > 50_000:
        marker_alpha = 0.1
    elif size > 20_000:
        marker_alpha = 0.2
    else:
        marker_alpha = 0.3

    fig = plt.figure(figsize=(8.5, 5.5))
    try:
        axe = plt.subplot()
        # leave room on the right for the legend
        axe.set_position((0.12, 0.1, 0.55, 0.8))
        pl = plt.plot([ratio.get(k, 0) for k in range(size)],
                      [nears.get(k, 0) for k in range(size)],
                      'k.',
                      ms=marker_size,
                      alpha=marker_alpha)
        # axes are cut at the 95th percentile of each variable
        ylim = np.percentile(list(nears.values()), 95)
        plt.ylim(0, ylim)
        xlim = np.percentile(list(ratio.values()), 95)
        plt.xlim(0, xlim)
        # shaded areas: ratio below cut_ratio, or count below cut_count
        fb = plt.fill_between([0, cut_ratio],
                              ylim,
                              color='tab:red',
                              alpha=0.4,
                              lw=0)
        plt.fill_betweenx([0, cut_count],
                          cut_ratio,
                          xlim,
                          color='tab:red',
                          alpha=0.4,
                          lw=0)
        plt.ylabel('interactions per {} bin'.format(nicer(resolution)),
                   size=12)
        plt.xlabel('interaction ratio between {0}-{1} and {1}-{2}'.format(
            nicer(resolution * base_position),
            nicer(resolution * next_position),
            nicer(resolution * last_position)),
                   size=12)
        plt.text(xlim,
                 cut_count,
                 'Minimum sum: {}'.format(cut_count),
                 ha='right',
                 va='bottom',
                 size=11)
        plt.text(cut_ratio,
                 ylim,
                 'Minimum cis/trans ratio: {}'.format(cut_ratio),
                 ha='left',
                 va='top',
                 size=11,
                 rotation=90)
        plt.title(
            'Distribution of interaction\nsums and cis/trans ratio by {} bin'.
            format(nicer(resolution)),
            size=13)
        plt.legend(pl + [fb], [
            '{} bin'.format(nicer(resolution)),
            'Filtered space:\n low ratio or count'
        ],
                   bbox_to_anchor=(1, 0.9),
                   frameon=False,
                   fontsize=10,
                   markerscale=4,
                   title=legend,
                   title_fontsize=11)
        plt.savefig(outfile)
    finally:
        # release the figure: without this every call leaves one more open
        # figure behind in pyplot's registry
        plt.close(fig)
Ejemplo n.º 12
0
 def __repr__(self):
     """
     Build the one-line summary of the experiment: name, resolution, number
     of TADs (None when there are none), number of Hi-C rows and the
     normalization applied ('None' when not normalized).
     """
     template = ('Experiment %s (resolution: %s, TADs: %s, Hi-C rows: %s, '
                 'normalized: %s)')
     fields = (
         self.name,
         nicer(self.resolution),
         len(self.tads) or None,
         self.size,
         self._normalization or 'None',
     )
     return template % fields
Ejemplo n.º 13
0
def run(opts):
    """
    Entry point of the modeling step: optionally optimize the modeling
    parameters, then (unless only optimizing) build the models, and record
    the job in the database.

    :param opts: parsed command-line options. Attributes read here:
       job_list, optimize, crm, ori_beg, ori_end, reso, beg, end, matrix and
       workdir; the object is also passed on to the helpers.
    """
    check_options(opts)

    launch_time = time.localtime()

    prefix = 'Preparing ' if opts.job_list else ''
    if opts.optimize:
        title = 'Optimization\n' + '*' * (21 if opts.job_list else 11)
    else:
        title = 'Modeling\n' + '*' * (18 if opts.job_list else 8)
    print(
        '''
%s%s

  - Region: Chromosome %s from %d to %d at resolution %s (%d particles)
    ''' % (prefix, title, opts.crm, opts.ori_beg, opts.ori_end,
           nicer(opts.reso), opts.end - opts.beg))

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        # FIXME: copied from somewhere else
        # NOTE(review): this branch never assigns `crm`, which is used right
        # below -- as written it cannot work without opts.matrix
        (bad_co, bad_co_id, biases, biases_id, mreads, mreads_id,
         reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict(
            (int(l.split()[0]), float(l.split()[1])) for l in open(biases))

    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # prepare output folders
    batch_job_hash = digest_parameters(
        opts,
        get_md5=True,
        extra=[
            'maxdist', 'upfreq', 'lowfreq', 'scale', 'dcutoff', 'nmodels_run',
            'job_list', 'rand', 'nmodels', 'nkeep', 'optimize',
            'optimization_id', 'cpus', 'workdir', 'matrix', 'ori_beg',
            'ori_end'
        ])

    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(
        opts.workdir, '06_model',
        '%s_chr%s_%s-%s' % (batch_job_hash, opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(
            path.join(
                outdir, 'job_list_%s.q' %
                ('optimization' if opts.optimize else 'modeling')), 'w')
    else:
        job_file_handler = None

    try:
        ###############
        # Optimization
        print('     o Optimizing parameters')
        if opts.optimize:
            optimization(exp, opts, job_file_handler, outdir)
            print('\n optimization done')
            # correlate all optimization and get best set of parameters

        if not (opts.optimize and opts.job_list):
            optpar, results = correlate_models(opts, outdir, exp)
        else:
            results = []

        ###########
        # Modeling
        if not opts.optimize:
            big_run(exp, opts, job_file_handler, outdir, optpar)
    finally:
        # the job list was never closed: make sure it is flushed to disk,
        # whatever happened above (closing twice is harmless)
        if job_file_handler is not None:
            job_file_handler.close()

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outdir, results, batch_job_hash, launch_time, finish_time)
Ejemplo n.º 14
0
def run(opts):
    """
    Entry point of the import step: read a Hi-C matrix in 'matrix', 'text'
    or 'cooler' format, convert it to a BAM file and record the job in the
    database.

    :param opts: parsed command-line options. Attributes read here: coord1
       (either 'chromosome' or 'chromosome:start-end'), format, input, reso,
       workdir, cpus and samtools; the object is also passed on to the
       helpers.

    :raises Exception: if the input format is not supported, if chromosome
       sizes are missing from the input, if the input is not a valid cooler
       file, or if the matrix size does not match the requested region
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    # parse the wanted coordinates: 'crm' or 'crm:start-end'
    coord1 = opts.coord1
    region1 = start1 = end1 = None
    if coord1:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            # no (valid) positions: use the whole string as chromosome name
            region1 = coord1
            start1 = None
            end1 = None

    printime('Importing hic in %s format' % opts.format)
    if opts.format == 'matrix' or opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            masked, chroms_gen, _, _, _, _ = read_file_header(f_thing)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception(
                '''ERROR: Chromosome size not included in import file.
                             Please include the chromosome sizes of the data that
                             you want to import in the header of the file. Example:
                             # CRM chr1    249250621''')
    elif opts.format == 'cooler':
        if is_cooler(opts.input, opts.reso if opts.reso > 1 else None):
            chroms_gen = parse_header(opts.input,
                                      opts.reso if opts.reso > 1 else None)
            if not chroms_gen or (region1 and region1 not in chroms_gen):
                raise Exception(
                    '''ERROR: Chromosome size not included in import file.
                                ''')
        else:
            raise Exception('''ERROR: The input file is not a cooler''')
    else:
        # fail here with a clear message instead of a NameError further down
        raise Exception('ERROR: unsupported input format: %s' % opts.format)

    # number of bins per chromosome
    chroms = OrderedDict(
        (crm, int(chroms_gen[crm] // opts.reso) + 1) for crm in chroms_gen)
    # list of (chromosome, bin) covered by the imported data
    sections = []
    if not region1:
        size = 0
        for crm in chroms:
            size += chroms[crm]
            sections.extend([(crm, i) for i in range(chroms[crm])])
    elif not start1:
        size = chroms[region1]
        sections.extend([(region1, i) for i in range(size)])
    else:
        #size = (end1 - start1)//opts.reso
        size = chroms[region1]
        sections.extend([
            (region1, i)
            for i in range(start1 // opts.reso, (end1 // opts.reso))
        ])
    dict_sec = {sec: pos for pos, sec in enumerate(sections)}
    bias_file = None
    badcol = {}
    if opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            matrix = abc_reader(f_thing, size,
                                start1 // opts.reso if start1 else None)
        size_mat = size
    elif opts.format == 'matrix':
        with gzopen(opts.input) as in_f:
            matrix, size_mat, _, masked, _ = autoreader(in_f)
        if size != size_mat:
            raise Exception('''ERROR: The size of the specified region is
                            different from the data in the matrix''')
    elif opts.format == 'cooler':
        matrix, weights, size, _ = parse_cooler(
            opts.input,
            opts.reso if opts.reso > 1 else None,
            normalized=True,
            raw_values=True)
        masked = {}
        size_mat = size
        # more than one distinct weight: store them as biases
        if len(set(weights)) > 1:
            printime('Transforming cooler weights to biases')
            outdir_norm = path.join(opts.workdir, '04_normalization')
            mkdir(outdir_norm)

            bias_file = path.join(
                outdir_norm, 'biases_%s_%s.pickle' %
                (nicer(opts.reso).replace(' ', ''), param_hash))
            # columns with a weight of zero are flagged as bad, and get a
            # NaN bias
            badcol.update((i, True) for i, m in enumerate(weights) if m == 0)
            with open(bias_file, 'wb') as out:
                dump(
                    {
                        'biases':
                        dict((k, b if b > 0 else float('nan'))
                             for k, b in enumerate(weights)),
                        'decay': {},
                        'badcol':
                        badcol,
                        'resolution':
                        opts.reso
                    }, out, HIGHEST_PROTOCOL)

    hic = HiC_data(matrix,
                   size_mat,
                   dict_sec=dict_sec,
                   chromosomes=chroms,
                   masked=masked,
                   resolution=opts.reso)

    #from pytadbit.mapping.analyze import hic_map
    #hic_map(hic, normalized=False, focus='chr1', show=True, cmap='viridis')

    printime('Creating BAM file')
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    total_counts = create_BAMhic(hic,
                                 opts.cpus,
                                 outbam,
                                 chroms_gen,
                                 opts.reso,
                                 samtools=opts.samtools)

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, total_counts, size_mat, bias_file, len(badcol),
               outbam + '.bam', launch_time, finish_time)
Ejemplo n.º 15
0
def load_parameters_fromdb(opts):
    """
    Retrieve, from the job-tracking SQLite database, the paths registered by a
    previous normalization job.

    The job is ``opts.jobid`` when it is set. Otherwise the first job of type
    'Normalize' is used (falling back on the first 'Filter' job, then on job
    number 1). When several candidate jobs are found, the job is searched in
    NORMALIZE_OUTPUTs by resolution (``opts.reso``) instead.

    :param opts: parsed options; the attributes read here are ``tmpdb``
       (optional, path to the database), ``workdir`` (used to locate
       'trace.db' when ``tmpdb`` is not set), ``jobid`` and ``reso``

    :returns: a tuple with the path of type 'BIASES' registered for the job,
       the path registered as input of the normalization, and the Ids of these
       two paths in the PATHs table

    :raises Exception: if no normalization is found, or if several
       normalizations exist at the wanted resolution and no job Id was given
    """
    if 'tmpdb' in opts and opts.tmpdb:
        dbfile = opts.tmpdb
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    # using the connection as a context manager only scopes a transaction, it
    # does not close the connection: close it explicitly in all cases.
    try:
        cur = con.cursor()
        if opts.jobid:
            parse_jobid = opts.jobid
        else:
            # get the JOBid of the normalization job (or of the filtering job
            # if nothing was normalized yet)
            query = """
                select distinct Id from JOBs
                where Type = ?
                """
            jobids = cur.execute(query, ('Normalize',)).fetchall()
            if not jobids:
                jobids = cur.execute(query, ('Filter',)).fetchall()
            parse_jobid = jobids[0][0] if jobids else 1
            if len(jobids) > 1:
                # ambiguous: pick the job computed at the wanted resolution
                jobs = cur.execute("""
                    select distinct JOBid from NORMALIZE_OUTPUTs
                    where Resolution = ?
                    """, (int(opts.reso),)).fetchall()
                if not jobs:
                    raise Exception('ERROR: no normalization found at %s' % (
                        nicer(opts.reso)))
                if len(jobs) > 1:
                    raise Exception('ERROR: more than one possible input found, use '
                                    '"tadbit describe" and select corresponding '
                                    'jobid with --jobid')
                parse_jobid = jobs[0][0]
        parse_jobid = int(parse_jobid)
        # fetch the paths registered for this job
        try:
            cur.execute("""
                select distinct Path, PATHs.id from PATHs
                where paths.jobid = ? and paths.Type = 'BIASES'
                """, (parse_jobid,))
            biases, biases_id = cur.fetchall()[0]

            cur.execute("""
                select distinct Path, PATHs.id from PATHs
                inner join NORMALIZE_OUTPUTs on PATHs.Id = NORMALIZE_OUTPUTs.Input
                where NORMALIZE_OUTPUTs.JOBid = ?;
                """, (parse_jobid,))
            mreads, mreads_id = cur.fetchall()[0]

            cur.execute("""
                select distinct Resolution from NORMALIZE_OUTPUTs
                where NORMALIZE_OUTPUTs.JOBid = ?;
                """, (parse_jobid,))
            reso = int(cur.fetchall()[0][0])
        except IndexError:
            # one of the queries returned no row
            raise Exception('ERROR: normalization not found')
        if reso != opts.reso:
            warn('WARNING: input resolution does not match '
                 'the one of the precomputed normalization')
        return biases, mreads, biases_id, mreads_id
    finally:
        con.close()
Ejemplo n.º 16
0
def run(opts):
    """
    Compute the normalization biases of a BAM file of interacting reads and
    register the results.

    Steps: validate the options, locate the input BAM (``opts.bam`` or the
    path stored in the database), prepare the per-bin covariates needed by the
    'oneD' normalization (mappability, GC content, number of restriction
    sites), call ``read_bam`` to get biases/decay/bad columns, plot the
    interaction decay (unless ``opts.normalize_only``), pickle the biases and
    save the job in the database.

    :param opts: parsed command-line options of the normalization tool

    :raises Exception: for the 'oneD' normalization, if the FASTA, the
       restriction enzyme or the mappability file is missing, or if the BAM
       file contains chromosomes absent from the FASTA
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes: each one must span all the bins of its sequence,
        # missing bins (or whole missing chromosomes) are filled with NaN.
        # The number of bins depends on the chromosome length, not on the
        # number of chromosomes.
        for c in refs:
            nbins = len(genome[c]) // opts.reso + 1
            if not c in mappability:
                mappability[c] = [float('nan')] * nbins
            if len(mappability[c]) < nbins:
                mappability[c] += [float('nan')] * (
                    nbins - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # pad mappability at the end if the size is close to gc_content
        if len(mappability)<len(gc_content) and len(mappability)/len(gc_content) > 0.95:
            mappability += [float('nan')] * (len(gc_content)-len(mappability))

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path,
        cis_limit=opts.cis_limit, trans_limit=opts.trans_limit,
        min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter)

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    # the context manager closes the file even if pickling fails
    with open(bias_file, 'wb') as out:
        dump({'biases'    : biases,
              'decay'     : decay,
              'badcol'    : badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except BaseException:
        # deliberately catches everything (interruptions included): the
        # database lock has to be released anyway before exiting
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Ejemplo n.º 17
0
 def format_yticks(tickstring, _=None):
     """
     Build the label of a tick of the y axis.

     The tick value is scaled by ``opts.reso`` and shifted by ``pltbeg2``
     (both taken from the enclosing scope), then formatted with ``nicer``.

     :param tickstring: value of the tick
     :param None _: unused second argument

     :returns: the value returned by ``nicer`` for the computed position
     """
     position = int(tickstring * opts.reso + pltbeg2)
     # a position of 0 is displayed as 1
     return nicer(position or 1, coma=True)
Ejemplo n.º 18
0
def parse_mappability_bedGraph(fname, resolution, wanted_chrom=None,
                               save_cache=True, reload_cache=False):
    """
    parse BEDgraph containing mappability.
    GEM mappability file obtained with:

        gem-indexer -i hg38.fa -o hg38
        gem-mappability -I hg38.gem -l 50 -o hg38.50mer -T 8
        gem-2-wig -I hg38.gem -i hg38.50mer.mappability -o hg38.50mer
        wigToBigWig hg38.50mer.wig hg38.50mer.sizes hg38.50mer.bw
        bigWigToBedGraph hg38.50mer.bw  hg38.50mer.bedGraph

    Each line of the input is expected to hold four whitespace-separated
    fields: chromosome, start, end and value. The value of each interval is
    weighted by the number of positions it shares with a bin, and the sum is
    divided by the resolution (positions not covered by any interval count
    as 0).

    :param fname: path to BED file with mappability
    :param resolution: to bin the resulting dictionary
    :param wanted_chrom: in case only one chromosome is needed (only the first
       block of contiguous lines of this chromosome is read)
    :param True save_cache: save a cached version of this file for faster
       loadings (depends on the resolution)
    :param False reload_cache: reload cached genome

    :returns: a dictionary with chromosomes as keys, with average mappability
       per bin (a list of floats). The dictionary is empty if the file holds
       no data, or if *wanted_chrom* is not found.
    """
    # NOTE(review): the cache name only depends on the resolution, a cache
    # saved for a single `wanted_chrom` is reloaded as is by any later call.
    tadbit_fname = fname + '_mappability_%s.TADbit' % (nicer(resolution, sep=''))
    if path.exists(tadbit_fname) and not reload_cache:
        cached = {}
        with open(tadbit_fname) as cache:
            for line in cache:
                fields = line.split()
                if not fields:
                    continue
                # a chromosome saved without bins has no second column
                cached[fields[0]] = ([float(v) for v in fields[1].split(',')]
                                     if len(fields) > 1 else [])
        return cached

    sums = {}        # chromosome -> list of weighted sums of values, per bin
    bins = None      # list of the chromosome being read
    crm = None       # name of the chromosome being read
    reported = False
    with open(fname) as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                continue
            crmM, begM, endM, val = fields
            if wanted_chrom and crmM != wanted_chrom:
                if crm is not None:
                    # wanted chromosome already read: nothing else is needed
                    break
                if not reported:
                    print('     skipping %s' % crmM)
                    reported = True
                continue
            if crmM != crm:
                crm = crmM
                bins = sums[crm] = []
            begM = int(begM)
            endM = int(endM)
            val = float(val)
            # make room up to the bin holding the last position of the
            # interval (a chromosome always gets at least one bin)
            last = (endM - 1) // resolution
            missing = max(last + 1, 1) - len(bins)
            if missing > 0:
                bins.extend([0.0] * missing)
            # share the interval among the bins it overlaps; the overlap is
            # bounded by both ends of the bin, so that an interval longer
            # than one bin never weighs more than the bin size
            for pos in range(max(begM, 0) // resolution, last + 1):
                begB = pos * resolution
                overlap = min(endM, begB + resolution) - max(begM, begB)
                bins[pos] += overlap * val

    mappability = dict((c, [total / resolution for total in sums[c]])
                       for c in sums)
    if save_cache and mappability:
        print("     saving mappabilty to cache...")
        with open(tadbit_fname, 'w') as out:
            for crm in mappability:
                out.write(crm + '\t' + ','.join(map(str, mappability[crm])) + '\n')
    return mappability
Ejemplo n.º 19
0
def run(opts):
    """
    Extract Hi-C (sub-)matrices from a BAM file of interacting reads, and
    write and/or plot them.

    The region(s) to extract come from ``opts.coord1`` / ``opts.coord2``,
    given either as 'chromosome' or as 'chromosome:start-end'; with no
    coordinate the full genome is extracted. For each normalization in
    ``opts.normalizations`` a matrix file is written (``opts.matrix``) and/or
    a heat-map is drawn (``opts.plot``). When ``opts.matrix`` and
    ``opts.only_plot`` are both false, ``write_matrix`` is called instead.
    Results are saved in the database unless ``opts.interactive`` is set.

    :param opts: parsed command-line options of the tool

    :raises Exception: if an external BAM file is given together with a
       normalization other than 'raw', and no biases file is provided
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    # a single coordinate is always handled as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            # not in the form 'chromosome:start-end': whole chromosome
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    # tell what is going to be extracted
    if not opts.quiet:
        if region1:
            message = '\nExtraction of %s' % (region1)
            if start1:
                message += ':%s-%s' % (start1, end1)
            else:
                message += ' (full chromosome)'
            if region2:
                message += ' intersection with %s' % (region2)
                if start2:
                    message += ':%s-%s\n' % (start2, end2)
                else:
                    message += ' (full chromosome)\n'
            else:
                message += '\n'
        else:
            message = '\nExtraction of full genome\n'
        stdout.write(message)

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        # only the names and lengths of the chromosomes are needed here
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(
            zip(bamfile.references, [x for x in bamfile.lengths]))
        bamfile.close()
        # cumulative start and end position of each chromosome
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            if biases and norm != 'raw':
                # pickled data is binary: read as such, and close the file
                with open(biases, 'rb') as bias_handle:
                    bias_data = load(bias_handle)
            else:
                bias_data = None
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads,
                    opts.reso,
                    bias_data,
                    normalization=norm,
                    region1=region1,
                    start1=start1,
                    end1=end1,
                    region2=region2,
                    start2=start2,
                    end2=end2,
                    tmpdir=tmpdir,
                    ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks,
                    verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            # shift the bin coordinates so that they start at 0
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions) for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0, ends[r] if r < len(ends) and ends[r]
                                 else sections[reg], opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name, nicer(
                    opts.reso).replace(' ', ''), ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' %
                                  (','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' %
                                  (','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' %
                                  (','.join([str(b) for b in bads1])))
                    if opts.row_names:
                        out.write('\n'.join('%s\t%d\t%d\t' %
                                            (next(row_names)) + '\t'.join(
                                                str(matrix.get((i, j), 0))
                                                for i in xrange(b1, e1))
                                            for j in xrange(b2, e2)) + '\n')
                    else:
                        out.write('\n'.join('\t'.join(
                            str(matrix.get((i, j), 0)) for i in xrange(b1, e1))
                                            for j in xrange(b2, e2)) + '\n')
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name, nicer(opts.reso).replace(
                    ' ', ''), ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1: heat-map, ax2: color scale with histogram of values
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                matrix = array([
                    array([matrix.get((i, j), 0) for i in xrange(b1, e1)])
                    for j in xrange(b2, e2)
                ])
                # empty cells get half of the smallest value, as the matrix
                # is log-transformed below
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                # mask bad columns and bad rows independently of each other
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix,
                           interpolation='None',
                           origin='lower',
                           cmap=cmap,
                           vmin=vmin,
                           vmax=vmax)

                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = pltbeg1 if len(
                        regions) == 1 else 0 if start2 is None else start2
                    pltend2 = pltend1 if len(regions) == 1 else sections[
                        regions[-1]] if end2 is None else end2

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    # more than two regions: major ticks at the limits of the
                    # chromosomes, names as minor ticks in between
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                # NOTE(review): `normed` is the historical name of the
                # `density` argument of Matplotlib's hist -- confirm it is
                # still accepted by the supported Matplotlib versions
                h = ax2.hist(data,
                             color='darkgrey',
                             linewidth=2,
                             orientation='horizontal',
                             bins=50,
                             histtype='step',
                             normed=True)
                _ = ax2.imshow(gradient,
                               aspect='auto',
                               cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' %
                              (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        if biases:
            with open(biases, 'rb') as bias_handle:
                bias_data = load(bias_handle)
        else:
            bias_data = None
        out_files.update(
            write_matrix(mreads,
                         opts.reso,
                         bias_data,
                         outdir,
                         filter_exclude=opts.filter,
                         normalizations=opts.normalizations,
                         region1=region1,
                         start1=start1,
                         end1=end1,
                         region2=region2,
                         start2=start2,
                         end2=end2,
                         tmpdir=tmpdir,
                         append_to_tar=None,
                         ncpus=opts.cpus,
                         nchunks=opts.nchunks,
                         verbose=not opts.quiet,
                         extra=param_hash,
                         clean=clean))

    if clean:
        printime('Cleaning')
        # no shell involved: safe whatever the characters in the path
        from shutil import rmtree
        rmtree(tmpdir, ignore_errors=True)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
Ejemplo n.º 20
0
def write_matrix(inbam,
                 resolution,
                 biases,
                 outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 normalizations=('decay', ),
                 region1=None,
                 start1=None,
                 end1=None,
                 clean=True,
                 region2=None,
                 start2=None,
                 end2=None,
                 extra='',
                 half_matrix=True,
                 nchunks=100,
                 tmpdir='.',
                 append_to_tar=None,
                 ncpus=8,
                 cooler=False,
                 verbose=True):
    """
    Writes matrix files from a BAM file containing interacting reads. The
    matrix is extracted from the genomic BAM; its genomic coordinates are at
    the intersection of the two regions defined by the parameters region1,
    start1, end1 and region2, start2, end2. When no second region is wanted,
    region2, start2 and end2 can be left to None.

    :param inbam: path to BAM file (generated by TADbit)
    :param resolution: resolution at which we want to write the matrix
    :param biases: biases as accepted by `get_biases_region`. May be empty
       only if normalizations is exactly ('raw',).
       NOTE(review): the error message talks about a path to a pickle file;
       confirm whether a path or an already loaded object is expected.
    :param outdir: path to a folder where to write output files
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pair of reads. Anything that is not an integer is
       converted with `filters_to_bin`.
    :param ('decay',) normalizations: tuple (a list or a single string are
       also accepted) with normalizations to use, can be 'decay', 'norm',
       'raw' or/and 'raw&decay'. One file per normalization is created,
       except that 'decay' and 'raw&decay' both write into the same 'dec_'
       file ('raw&decay' lines carry both the raw and the normalized value).
    :param None region1: chromosome name of the first region from which to
       extract the matrix
    :param None start1: start coordinate of the first region from which to
       extract the matrix
    :param None end1: end coordinate of the first region from which to
       extract the matrix
    :param True clean: passed to the iterator over temporary sub-matrices,
       and, if True, the temporary folder is removed at the end
    :param None region2: chromosome name of the second region from which to
       extract the matrix
    :param None start2: start coordinate of the second region from which to
       extract the matrix
    :param None end2: end coordinate of the second region from which to
       extract the matrix
    :param '' extra: if not empty, appended (after an underscore) to the name
       of the output files
    :param True half_matrix: writes only half of the matrix (and the
       diagonal). Ignored (forced to False) when region2 is given.
    :param 100 nchunks: maximum number of chunks into which to cut the BAM
    :param '.' tmpdir: where to write temporary files
    :param None append_to_tar: path to a TAR file were generated matrices will
       be written directly
    :param 8 ncpus: number of cpus to use to read the BAM file
    :param False cooler: write the matrix through `cooler_file` ('.mcool')
       instead of text '.abc' files. Not compatible with 'decay' and
       'raw&decay' normalizations.
    :param True verbose: speak

    :raises Exception: if a region is smaller than the resolution, if biases
       are needed but not given, or if cooler output is requested while h5py
       is not loaded or together with a decay normalization.

    :returns: dictionary with the paths to output files, with keys among
       'RAW', 'NRM', 'DEC' and 'RAW&DEC' (empty if append_to_tar is used)
    """
    if start1 is not None and end1:
        if end1 - start1 < resolution:
            raise Exception(
                'ERROR: region1 should be at least as big as resolution')
    if start2 is not None and end2:
        if end2 - start2 < resolution:
            raise Exception(
                'ERROR: region2 should be at least as big as resolution')

    if isinstance(normalizations, list):
        normalizations = tuple(normalizations)
    elif isinstance(normalizations, str):
        normalizations = tuple([normalizations])

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    regions, rand_hash, bin_coords, chunks = read_bam(inbam,
                                                      filter_exclude,
                                                      resolution,
                                                      ncpus=ncpus,
                                                      region1=region1,
                                                      start1=start1,
                                                      end1=end1,
                                                      region2=region2,
                                                      start2=start2,
                                                      end2=end2,
                                                      tmpdir=tmpdir,
                                                      nchunks=nchunks,
                                                      verbose=verbose)

    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)

    # only the header (chromosome names and lengths) is needed from the BAM:
    # release the file handle as soon as it has been read
    bamfile = AlignmentFile(inbam, 'rb')
    try:
        sections = OrderedDict(zip(bamfile.references, bamfile.lengths))
    finally:
        bamfile.close()

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
    elif normalizations != ('raw', ):
        raise Exception(
            'ERROR: should provide path to file with biases (pickle).')
    else:
        bads1 = bads2 = {}

    start_bin1, start_bin2 = bin_coords[::2]
    if verbose:
        printime('  - Writing matrices')
    # define output file name
    name = _generate_name(regions, (start1, start2), (end1, end2), resolution)

    outfiles = []

    def _matrix_fname(prefix, ext):
        # name of an output file: <prefix>_<name>_<resolution>[_<extra>].<ext>
        return '%s_%s_%s%s.%s' % (prefix, name,
                                  nicer(resolution).replace(' ', ''),
                                  ('_' + extra) if extra else '', ext)

    def _open_abc(prefix):
        # open one '.abc' output (in-memory buffer if it goes to a TAR file),
        # register it in outfiles and write its header
        fnam = _matrix_fname(prefix, 'abc')
        if append_to_tar:
            handler = StringIO()
            outfiles.append((handler, fnam))
        else:
            handler = open(os.path.join(outdir, fnam), 'w')
            outfiles.append((os.path.join(outdir, fnam), fnam))
        for reg in regions:
            handler.write('# CRM %s\t%d\n' % (reg, sections[reg]))

        handler.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            handler.write('# BADROWS %s\n' %
                          (','.join([str(b) for b in bads1])))
            handler.write('# BADCOLS %s\n' %
                          (','.join([str(b) for b in bads2])))
        else:
            handler.write('# MASKED %s\n' %
                          (','.join([str(b) for b in bads1])))
        return handler

    # prepare output files and write their headers
    if cooler:
        if 'h5py' not in modules:
            raise Exception(
                'ERROR: cooler output is not available. Probably ' +
                'you need to install h5py\n')
        if 'decay' in normalizations or 'raw&decay' in normalizations:
            raise Exception(
                'ERROR: decay and raw&decay matrices cannot be exported '
                'to cooler format. Cooler only accepts weights per column/row')
        fnam = _matrix_fname('raw', 'mcool')
        if os.path.exists(os.path.join(outdir, fnam)):
            os.remove(os.path.join(outdir, fnam))
        out_raw = cooler_file(os.path.join(outdir, fnam), resolution, sections,
                              regions)
        out_raw.create_bins()
        out_raw.prepare_matrix(start_bin1, start_bin2)
        outfiles.append((os.path.join(outdir, fnam), fnam))
    else:
        if 'raw' in normalizations:
            out_raw = _open_abc('raw')
        if 'norm' in normalizations:
            out_nrm = _open_abc('nrm')
        if 'decay' in normalizations or 'raw&decay' in normalizations:
            out_dec = _open_abc('dec')

    # functions to write one line per pairwise interaction
    # (c: chromosome key used to index decay, a and b: bins, v: raw count)
    def emit_raw(_, a, b, v):
        out_raw.write('{}\t{}\t{}\n'.format(a, b, v))

    def emit_norm(_, a, b, v):
        out_nrm.write('{}\t{}\t{}\n'.format(a, b, v / bias1[a] / bias2[b]))

    def emit_decay(c, a, b, v):
        out_dec.write('{}\t{}\t{}\n'.format(
            a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))

    def emit_decay_2reg(c, a, b, v):
        # bins are relative to each region: shift them before computing the
        # distance used to index decay
        out_dec.write('{}\t{}\t{}\n'.format(
            a, b, v / bias1[a] / bias2[b] /
            decay[c][abs((a + start_bin1) - (b + start_bin2))]))

    def emit_decay_or_nan(c, a, b, v):
        try:
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
        except KeyError:  # different chromosomes
            out_dec.write('{}\t{}\t{}\n'.format(a, b, 'nan'))

    def emit_raw_and_decay(c, a, b, v):
        try:
            out_dec.write('{}\t{}\t{}\t{}\n'.format(
                a, b, v, v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
        except KeyError:  # different chromosomes
            out_dec.write('{}\t{}\t{}\t{}\n'.format(
                a, b, v, v / bias1[a] / bias2[b]))

    def _chain(previous, emit):
        # returns a writer calling the writers already selected, then emit
        if not previous:
            return emit

        def chained(c, a, b, v):
            previous(c, a, b, v)
            emit(c, a, b, v)

        return chained

    write = None
    if 'raw' in normalizations:
        write = _chain(write, emit_raw)
    if 'norm' in normalizations and not cooler:
        write = _chain(write, emit_norm)
    if 'decay' in normalizations and not cooler:
        if len(regions) == 1:
            if region2:
                write = _chain(write, emit_decay_2reg)
            else:
                write = _chain(write, emit_decay)
        else:
            write = _chain(write, emit_decay_or_nan)
    if 'raw&decay' in normalizations and not cooler:
        write = _chain(write, emit_raw_and_decay)

    # pull all sub-matrices and write full matrix
    if region2 is not None:  # already half-matrix in this case
        half_matrix = False

    if cooler:
        for ichunk, c, j, k, v in _iter_matrix_frags(chunks,
                                                     tmpdir,
                                                     rand_hash,
                                                     verbose=verbose,
                                                     clean=clean,
                                                     include_chunk_count=True):
            if j > k:
                continue
            if j not in bads1 and k not in bads2:
                out_raw.write_iter(ichunk, j, k, v)
        out_raw.close()
    else:
        for c, j, k, v in _iter_matrix_frags(chunks,
                                             tmpdir,
                                             rand_hash,
                                             verbose=verbose,
                                             clean=clean):
            if half_matrix and k > j:
                continue
            if j not in bads1 and k not in bads2:
                write(c, j, k, v)

    fnames = {}
    if append_to_tar:
        lock = LockFile(append_to_tar)
        with lock:
            archive = taropen(append_to_tar, "a:")
            for fobj, fnam in outfiles:
                fobj.seek(0)
                info = archive.tarinfo(name=fnam)
                # NOTE(review): `.buf` looks specific to Python 2
                # StringIO.StringIO -- confirm before porting
                info.size = len(fobj.buf)
                archive.addfile(tarinfo=info, fileobj=fobj)
            archive.close()
    else:
        if cooler:
            fnames['RAW'] = out_raw.name
            if 'norm' in normalizations:
                # normalized cooler: copy of the raw file plus bin weights
                fnam = _matrix_fname('nrm', 'mcool')
                copyfile(outfiles[0][0], os.path.join(outdir, fnam))
                out_nrm = cooler_file(os.path.join(outdir, fnam), resolution,
                                      sections, regions)
                bias_data_row = [1. / b if b > 0 else 0 for b in bias1]
                bias_data_col = [1. / b if b > 0 else 0 for b in bias2]
                out_nrm.write_weights(bias_data_row, bias_data_col,
                                      *bin_coords)
                outfiles.append((os.path.join(outdir, fnam), fnam))
                fnames['NRM'] = os.path.join(outdir, fnam)
        else:
            if 'raw' in normalizations:
                out_raw.close()
                fnames['RAW'] = out_raw.name
            if 'norm' in normalizations:
                out_nrm.close()
                fnames['NRM'] = out_nrm.name
            # 'decay' and 'raw&decay' share the same file: close it once
            if 'decay' in normalizations or 'raw&decay' in normalizations:
                out_dec.close()
            if 'decay' in normalizations:
                fnames['DEC'] = out_dec.name
            if 'raw&decay' in normalizations:
                fnames['RAW&DEC'] = out_dec.name

    # this is the last thing we do in case something goes wrong
    if clean:
        # argument list instead of a shell string: a tmpdir containing spaces
        # or shell metacharacters can not make rm target other paths
        import subprocess
        subprocess.call(
            ['rm', '-rf', os.path.join(tmpdir, '_tmp_%s' % (rand_hash))])

    return fnames
Ejemplo n.º 21
0
def write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 normalizations=('decay',),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None, extra='',
                 half_matrix=True, nchunks=None, tmpdir='.', append_to_tar=None,
                 ncpus=8, verbose=True):
    """
    Writes matrix files from a BAM file containing interacting reads. The
    matrix is extracted from the genomic BAM; its genomic coordinates are at
    the intersection of the two regions defined by the parameters region1,
    start1, end1 and region2, start2, end2. When no second region is wanted,
    region2, start2 and end2 can be left to None.

    :param inbam: path to BAM file (generated by TADbit)
    :param resolution: resolution at which we want to write the matrix
    :param biases: biases as accepted by `get_biases_region`. May be empty
       only if normalizations is exactly ('raw',).
       NOTE(review): the error message talks about a path to a pickle file;
       confirm whether a path or an already loaded object is expected.
    :param outdir: path to a folder where to write output files
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pair of reads. Anything that is not an integer is
       converted with `filters_to_bin`.
    :param ('decay',) normalizations: tuple (a list or a single string are
       also accepted) with normalizations to use, can be 'decay', 'norm',
       'raw' or/and 'raw&decay'. One file per normalization is created,
       except that 'decay' and 'raw&decay' both write into the same 'dec_'
       file ('raw&decay' lines carry both the raw and the normalized value).
    :param None region1: chromosome name of the first region from which to
       extract the matrix
    :param None start1: start coordinate of the first region from which to
       extract the matrix
    :param None end1: end coordinate of the first region from which to
       extract the matrix
    :param True clean: passed to the iterator over temporary sub-matrices,
       and, if True, the temporary folder is removed at the end
    :param None region2: chromosome name of the second region from which to
       extract the matrix
    :param None start2: start coordinate of the second region from which to
       extract the matrix
    :param None end2: end coordinate of the second region from which to
       extract the matrix
    :param '' extra: if not empty, appended (after an underscore) to the name
       of the output files
    :param True half_matrix: writes only half of the matrix (and the
       diagonal). Ignored (forced to False) when region2 is given.
    :param None nchunks: maximum number of chunks into which to cut the BAM
    :param '.' tmpdir: where to write temporary files
    :param None append_to_tar: path to a TAR file were generated matrices will
       be written directly
    :param 8 ncpus: number of cpus to use to read the BAM file
    :param True verbose: speak

    :raises Exception: if a region is smaller than the resolution, or if
       biases are needed but not given.

    :returns: dictionary with the paths to output files, with keys among
       'RAW', 'NRM', 'DEC' and 'RAW&DEC' (empty if append_to_tar is used)
    """
    if start1 is not None and end1:
        if end1 - start1 < resolution:
            raise Exception('ERROR: region1 should be at least as big as resolution')
    if start2 is not None and end2:
        if end2 - start2 < resolution:
            raise Exception('ERROR: region2 should be at least as big as resolution')

    if isinstance(normalizations, list):
        normalizations = tuple(normalizations)
    elif isinstance(normalizations, str):
        normalizations = tuple([normalizations])

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    regions, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, nchunks=nchunks, verbose=verbose)

    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)

    # only the header (chromosome names and lengths) is needed from the BAM:
    # release the file handle as soon as it has been read
    bamfile = AlignmentFile(inbam, 'rb')
    try:
        sections = OrderedDict(zip(bamfile.references, bamfile.lengths))
    finally:
        bamfile.close()

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(biases, bin_coords)
    elif normalizations != ('raw', ):
        raise Exception('ERROR: should provide path to file with biases (pickle).')
    else:
        bads1 = bads2 = {}

    start_bin1, start_bin2 = bin_coords[::2]
    if verbose:
        printime('  - Writing matrices')
    # define output file name
    name = _generate_name(regions, (start1, start2), (end1, end2), resolution)

    outfiles = []

    def _open_abc(prefix):
        # open one '.abc' output (in-memory buffer if it goes to a TAR file),
        # register it in outfiles and write its header
        fnam = '%s_%s_%s%s.abc' % (prefix, name,
                                   nicer(resolution).replace(' ', ''),
                                   ('_' + extra) if extra else '')
        if append_to_tar:
            handler = StringIO()
            outfiles.append((handler, fnam))
        else:
            handler = open(os.path.join(outdir, fnam), 'w')
            outfiles.append((os.path.join(outdir, fnam), fnam))
        for reg in regions:
            handler.write('# CRM %s\t%d\n' % (reg, sections[reg]))

        handler.write('# %s resolution:%d\n' % (name, resolution))
        if region2:
            handler.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
            handler.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
        else:
            handler.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
        return handler

    # prepare output files and write their headers
    if 'raw' in normalizations:
        out_raw = _open_abc('raw')
    if 'norm' in normalizations:
        out_nrm = _open_abc('nrm')
    if 'decay' in normalizations or 'raw&decay' in normalizations:
        out_dec = _open_abc('dec')

    # functions to write one line per pairwise interaction
    # (c: chromosome key used to index decay, a and b: bins, v: raw count)
    def emit_raw(_, a, b, v):
        out_raw.write('{}\t{}\t{}\n'.format(a, b, v))

    def emit_norm(_, a, b, v):
        out_nrm.write('{}\t{}\t{}\n'.format(a, b, v / bias1[a] / bias2[b]))

    def emit_decay(c, a, b, v):
        out_dec.write('{}\t{}\t{}\n'.format(
            a, b, v / bias1[a] / bias2[b] / decay[c][abs(a-b)]))

    def emit_decay_2reg(c, a, b, v):
        # bins are relative to each region: shift them before computing the
        # distance used to index decay
        out_dec.write('{}\t{}\t{}\n'.format(
            a, b, v / bias1[a] / bias2[b] /
            decay[c][abs((a + start_bin1) - (b + start_bin2))]))

    def emit_decay_or_nan(c, a, b, v):
        try:
            out_dec.write('{}\t{}\t{}\n'.format(
                a, b, v / bias1[a] / bias2[b] / decay[c][abs(a-b)]))
        except KeyError:  # different chromosomes
            out_dec.write('{}\t{}\t{}\n'.format(a, b, 'nan'))

    def emit_raw_and_decay(c, a, b, v):
        try:
            out_dec.write('{}\t{}\t{}\t{}\n'.format(
                a, b, v, v / bias1[a] / bias2[b] / decay[c][abs(a-b)]))
        except KeyError:  # different chromosomes
            out_dec.write('{}\t{}\t{}\t{}\n'.format(
                a, b, v, v / bias1[a] / bias2[b]))

    def _chain(previous, emit):
        # returns a writer calling the writers already selected, then emit
        if not previous:
            return emit
        def chained(c, a, b, v):
            previous(c, a, b, v)
            emit(c, a, b, v)
        return chained

    write = None
    if 'raw' in normalizations:
        write = _chain(write, emit_raw)
    if 'norm' in normalizations:
        write = _chain(write, emit_norm)
    if 'decay' in normalizations:
        if len(regions) == 1:
            if region2:
                write = _chain(write, emit_decay_2reg)
            else:
                write = _chain(write, emit_decay)
        else:
            write = _chain(write, emit_decay_or_nan)
    if 'raw&decay' in normalizations:
        write = _chain(write, emit_raw_and_decay)

    # pull all sub-matrices and write full matrix
    if region2 is not None:  # already half-matrix in this case
        half_matrix = False

    for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                         verbose=verbose, clean=clean):
        if half_matrix and k > j:
            continue
        if j not in bads1 and k not in bads2:
            write(c, j, k, v)

    fnames = {}
    if append_to_tar:
        lock = LockFile(append_to_tar)
        with lock:
            archive = taropen(append_to_tar, "a:")
            for fobj, fnam in outfiles:
                fobj.seek(0)
                info = archive.tarinfo(name=fnam)
                # NOTE(review): `.buf` looks specific to Python 2
                # StringIO.StringIO -- confirm before porting
                info.size = len(fobj.buf)
                archive.addfile(tarinfo=info, fileobj=fobj)
            archive.close()
    else:
        if 'raw' in normalizations:
            out_raw.close()
            fnames['RAW'] = out_raw.name
        if 'norm' in normalizations:
            out_nrm.close()
            fnames['NRM'] = out_nrm.name
        # 'decay' and 'raw&decay' share the same file: close it once
        if 'decay' in normalizations or 'raw&decay' in normalizations:
            out_dec.close()
        if 'decay' in normalizations:
            fnames['DEC'] = out_dec.name
        if 'raw&decay' in normalizations:
            fnames['RAW&DEC'] = out_dec.name

    # this is the last thing we do in case something goes wrong
    if clean:
        # argument list instead of a shell string: a tmpdir containing spaces
        # or shell metacharacters can not make rm target other paths
        import subprocess
        subprocess.call(
            ['rm', '-rf', os.path.join(tmpdir, '_tmp_%s' % (rand_hash))])

    return fnames
Ejemplo n.º 22
0
def run(opts):
    """
    Extract a matrix (full genome, one region, or the intersection of two
    regions) from a BAM file, optionally write it as a text matrix and/or
    plot it, and register the generated files in the database.

    :param opts: parsed command-line options. Attributes read here: zrange,
       figsize, bam, biases, normalizations, workdir, coord1, coord2, plot,
       force_plot, interactive, quiet, matrix, only_plot, row_names, reso,
       cpus, nchunks, triangular, format, cmap, bad_color, xtick_rotation
       and filter. ``opts.figsize`` is converted in place from a
       comma-separated string to a list of floats.

    :raises Exception: if an external BAM is given without a biases file
       while a normalization other than 'raw' is requested.
    """
    def _parse_coord(coord):
        """
        Split 'chromosome:start-end' into (chromosome, start, end). Anything
        that does not follow that pattern is taken as a bare region name.
        """
        try:
            crm, pos = coord.split(':')
            beg, end = pos.split('-')
            return crm, int(beg), int(end)
        except ValueError:
            return coord, None, None

    def _load_pickle(fpath):
        """Load a pickle and close the file handle right away."""
        with open(fpath, 'rb') as handle:
            return load(handle)

    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        zrange = opts.zrange.split(',')
        vmin = float(zrange[0])
        vmax = float(zrange[1])
    else:
        vmin = vmax = None

    # NOTE: the color range must NOT be touched here; it used to be reset to
    # None whenever no figure size was given, discarding opts.zrange.
    if opts.figsize:
        opts.figsize = [float(v) for v in opts.figsize.split(',')]

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v !='raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    # a biases file given explicitly always wins over the one in the DB
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    # a single coordinate is always handled as the first one
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    region1 = start1 = end1 = None
    region2 = start2 = end2 = None
    if coord1:
        region1, start1, end1 = _parse_coord(coord1)
        if coord2:
            region2, start2, end2 = _parse_coord(coord2)

    # limit on the matrix size, only enforced when plotting without force
    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            msg = '\nExtraction of %s' % (region1)
            if start1:
                msg += ':%s-%s' % (start1, end1)
            else:
                msg += ' (full chromosome)'
            if region2:
                msg += ' intersection with %s' % (region2)
                if start2:
                    msg += ':%s-%s\n' % (start2, end2)
                else:
                    msg += ' (full chromosome)\n'
            else:
                msg += '\n'
        else:
            msg = '\nExtraction of full genome\n'
        stdout.write(msg)

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        # chromosome names and lengths (in nucleotides) from the BAM header
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        bamfile.close()
        # cumulative (begin, end) position of each chromosome in the genome
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    _load_pickle(biases) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            # shift bin coordinates so that both axes start at zero
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                # lazily yields (region, first nucleotide, last nucleotide)
                # for each bin, consumed one per matrix row when writing
                row_names = ((reg, p + 1 , p + opts.reso) for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts) and starts[r] else 0,
                                            ends[r] if r < len(ends) and ends[r] else sections[reg],
                                            opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                with open(path.join(outdir, fnam), 'w') as out:
                    for reg in regions:
                        out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                    if region2:
                        out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                        out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
                    else:
                        out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
                    if opts.row_names:
                        out.write('\n'.join(('%s\t%d\t%d\t' % next(row_names)) +
                                            '\t'.join(str(matrix.get((i, j), 0))
                                                      for i in xrange(b1, e1))
                                            for j in xrange(b2, e2)) + '\n')
                    else:
                        out.write('\n'.join('\t'.join(str(matrix.get((i, j), 0))
                                                      for i in xrange(b1, e1))
                                            for j in xrange(b2, e2)) + '\n')
            if opts.plot:
                # transform the sparse dictionary into a dense 2D array
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                # mask bad columns and bad rows independently: rows have to
                # be masked even when there is no bad column at all
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                # keep only the chromosomes present in the extracted matrix
                section_pos = OrderedDict((k, section_pos[k]) for k in section_pos
                                   if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            _load_pickle(biases) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s '% tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
Ejemplo n.º 23
0
def run(opts):
    """
    Compute normalization biases, bad columns and interaction decay from a
    BAM file, store them in a pickle under '04_normalization' and register
    the results in the database.

    :param opts: parsed command-line options. Attributes read here: bam,
       workdir, filter, normalization, fasta, renz, mappability, reso, cpus,
       min_count, p_fit, min_perc, max_perc, seed, normalize_only, max_njobs,
       badcols, biases_path and filter_only.

    :raises Exception: if 'oneD' normalization is requested without FASTA,
       restriction enzyme or mappability, or if chromosomes of the BAM file
       are missing from the FASTA file.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    # only filled in for oneD normalization
    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes: each one needs one value per bin, the number of
        # bins being given by the chromosome length (not by the number of
        # chromosomes, as was wrongly used before)
        for c in refs:
            nbins = len(genome[c]) // opts.reso + 1
            if not c in mappability:
                mappability[c] = [float('nan')] * nbins
            if len(mappability[c]) < nbins:
                mappability[c] += [float('nan')] * (nbins - len(mappability[c]))
        # concatenates chromosomes in the order of the BAM references
        genome_mappability = []
        for c in refs:
            genome_mappability.extend(mappability.get(c, []))
        mappability = genome_mappability

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    # path of the figure registered in the DB (not written by this function)
    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases: binary mode is required by the binary pickle protocol
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    with open(bias_file, 'wb') as out:
        dump({'biases'    : biases,
              'decay'     : decay,
              'badcol'    : badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # deliberately catches everything (interruptions included): the DB
        # lock has to be released whatever the reason of the failure
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Ejemplo n.º 24
0
def read_bam(inbam,
             filter_exclude,
             resolution,
             min_count=2500,
             normalization='Vanilla',
             mappability=None,
             n_rsites=None,
             cg_content=None,
             sigma=2,
             ncpus=8,
             factor=1,
             outdir='.',
             extra_out='',
             only_valid=False,
             normalize_only=False,
             max_njobs=100,
             min_perc=None,
             max_perc=None,
             extra_bads=None):
    """
    Parse a BAM file in parallel chunks, filter out bad columns, and compute
    normalization biases and the interaction decay per chromosome.

    :param inbam: path to the BAM file
    :param filter_exclude: filter definition handed over, untouched, to the
       function parsing each chunk of the BAM file
    :param resolution: size of the bins, in nucleotides
    :param 2500 min_count: bins with less total interactions than this are
       considered bad columns. If None, columns are filtered by cis
       percentage instead (needs more than one chromosome)
    :param 'Vanilla' normalization: either 'Vanilla' or 'oneD'
    :param None mappability: per-bin mappability (used by 'oneD'); bins with
       a false value are added to the bad columns
    :param None n_rsites: per-bin number of restriction sites (used by 'oneD')
    :param None cg_content: per-bin GC content (used by 'oneD')
    :param 2 sigma: passed to the filtering by cis percentage
    :param 8 ncpus: number of worker processes
    :param 1 factor: divides the target sum of the normalized matrix
    :param '.' outdir: directory holding the temporary pickles and figures
    :param '' extra_out: suffix used in the name of the files generated
    :param False only_valid: switch to the chunk parser that reads only
       valid pairs instead of applying filters
    :param False normalize_only: skip the computation of the cis percentage
       of the normalized matrix
    :param 100 max_njobs: upper bound to the number of chunks
    :param None min_perc: passed to the filtering by cis percentage
    :param None max_perc: passed to the filtering by cis percentage
    :param None extra_bads: list of 'chromosome:start-end' strings (in
       nucleotides) to be added manually to the bad columns

    :returns: a tuple with the dictionary of biases (by bin), the dictionary
       of decay (by chromosome, then by distance in bins), the dictionary of
       bad columns, the raw cis ratio and the normalized cis ratio (0. when
       normalize_only is set)

    :raises Exception: when filtering by cis percentage with a single
       chromosome, or when the 'oneD' input arrays have different sizes
    :raises NotImplementedError: for an unknown normalization
    """
    bamfile = AlignmentFile(inbam, 'rb')
    n_references = len(bamfile.references)
    # number of bins per chromosome
    sections = OrderedDict(
        zip(bamfile.references, [x // resolution + 1 for x in bamfile.lengths]))
    # only the header was needed here, workers open the file on their own
    bamfile.close()
    # (begin, end) genomic bin of each chromosome
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin = len(bins)
    total = len(bins)

    # split the genome in chunks of consecutive bins; a chunk spanning two
    # chromosomes is stored as two regions
    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total // njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    chunks = list(zip(regs, begs, ends))

    def _run_on_chunks(func, build_args):
        """
        Apply func to every chunk in a pool of processes, build_args giving
        the arguments for a (region, start, end); returns async results.
        """
        pool = mu.Pool(ncpus)
        procs = [pool.apply_async(func, args=build_args(region, start, end))
                 for region, start, end in chunks]
        pool.close()
        print_progress(procs)
        pool.join()
        return procs

    def _tmp_pickle(region, start, end):
        """Path to the temporary pickle holding the data of a chunk."""
        return path.join(
            outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))

    printime('  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    _run_on_chunks(
        read_bam_frag,
        lambda region, start, end: (inbam, filter_exclude, bins, bins_dict,
                                    resolution, outdir, extra_out,
                                    region, start, end))
    ## COLLECT RESULTS
    cisprc = {}
    printime('  - Collecting cis and total interactions per bin (%d chunks)' %
             (len(regs)))
    stdout.write('     ')
    for countbin, (region, start, end) in enumerate(chunks):
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n     ' % ('%s/%s' % (countbin, len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(
            outdir,
            'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        with open(fname) as handle:
            tmp_cisprc = load(handle)
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime('  - Removing columns with too few or too much interactions')
    if n_references == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and n_references > 1:
        badcol = filter_by_cis_percentage(
            cisprc,
            sigma=sigma,
            verbose=True,
            min_perc=min_perc,
            max_perc=max_perc,
            size=total,
            savefig=path.join(
                outdir, 'filtered_bins_%s_%s.png' %
                (nicer(resolution).replace(' ', ''), extra_out)))
    else:
        print((
            '      -> too few interactions defined as less than %9d '
            'interactions') % (min_count))
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print('      -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100))

    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b // resolution + section_pos[c][0]
            e = e // resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime('  - Removed %d columns manually.' % removed_manually)
    # mean cis ratio over good columns. The mean is taken over the columns
    # actually summed: bad columns may be absent from cisprc (empty bins,
    # mappability, manual), so len(cisprc) - len(badcol) is not their count
    cis_ratios = [float(cisprc[k][0]) / cisprc[k][1]
                  for k in cisprc if not k in badcol]
    if cis_ratios:
        raw_cisprc = sum(cis_ratios) / len(cis_ratios)
    else:
        raw_cisprc = float('nan')

    printime('  - Rescaling sum of interactions per bins')
    size = len(bins)
    biases = [
        float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
        for k in xrange(size)
    ]

    if normalization == 'Vanilla':
        printime('  - Vanilla normalization')
        mean_col = nanmean(biases)
        biases = dict(
            (k, b / mean_col * mean_col**0.5) for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime('  - oneD normalization')
        if len(
                set([
                    len(biases),
                    len(mappability),
                    len(n_rsites),
                    len(cg_content)
                ])) > 1:
            print('biases mappability n_rsites cg_content')
            print('%d %d %d %d' % (len(biases), len(mappability),
                                   len(n_rsites), len(cg_content)))
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD,
                      tot=biases,
                      map=mappability,
                      res=n_rsites,
                      cg=cg_content)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    printime('  - Getting sum of normalized bins')
    procs = _run_on_chunks(
        sum_nrm_matrix,
        lambda region, start, end: (_tmp_pickle(region, start, end), biases))

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    if not normalize_only:
        printime('  - Computing Cis percentage')
        # Calculate Cis percentage
        procs = _run_on_chunks(
            get_cis_perc,
            lambda region, start, end: (_tmp_pickle(region, start, end),
                                        biases, badcol, bins))

        # collect results
        cis_sum = all_sum = 0
        for proc in procs:
            c, t = proc.get()
            cis_sum += c
            all_sum += t
        norm_cisprc = float(cis_sum) / all_sum
        print('    * Cis-percentage: %.1f%%' % (norm_cisprc * 100))
    else:
        norm_cisprc = 0.

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)
    procs = _run_on_chunks(
        sum_dec_matrix,
        lambda region, start, end: (_tmp_pickle(region, start, end),
                                    biases, badcol, bins))

    # collect results: sum, by chromosome and distance, the normalized and
    # the raw counts returned for each chunk
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            nrm_crm = nrmdec.setdefault(c, {})
            raw_crm = rawdec.setdefault(c, {})
            for k, v in d.iteritems():
                if k in nrm_crm:
                    nrm_crm[k] += v
                    raw_crm[k] += tmpraw[c][k]
                else:
                    nrm_crm[k] = v
                    raw_crm[k] = tmpraw[c][k]
    # count the number of cells per diagonal
    # TODO: parallelize
    len_crms = dict(
        (c, section_pos[c][1] - section_pos[c][0]) for c in section_pos)
    # initialize dictionary
    ndiags = dict(
        (c, dict((k, 0) for k in xrange(len_crms[c]))) for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        # end_chr is the first bin of the next chromosome: it has to be
        # excluded, otherwise a bad bin there removes a non-existing cell
        # from each diagonal of this chromosome
        thesebads = [b for b in badcol if beg_chr <= b < end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            # 2 bad rows can point to the same bad cell in diagonal
            bad_diag = set()
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise**-2.  # equals 400 when default
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # store normalized count by diagonal
        tmpsum = 0  # store raw count by diagonal
        ndiag = 0
        val = 0
        # store diagonals to be summed in case not reaching the minimum
        previous = []
        # diagonals are pooled by increasing distance, so the order is made
        # explicit instead of relying on the ordering of the dictionary
        for k in sorted(ndiags[crm]):
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][p] for p in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for p in previous:
                        nrmdec[crm][p] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            # the minimum was never reached: no decay for this chromosome
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][p] for p in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for p in previous:
                    nrmdec[crm][p] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
Ejemplo n.º 25
0
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    Plot the interaction count as a function of the genomic distance (in
    bins) on log-log axes, and overlay three straight-line fits (computed in
    log-log space) whose slopes are reported in the legend.

    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param False show: if True (and savefig is not set) the image is shown
       using matplotlib GUI
    :param None genome_seq: dictionary with, for each chromosome, its size in
       number of bins. Only used when data is a list of lists, to avoid
       counting interactions across chromosome boundaries. Chromosomes are
       assumed to be laid out in the matrix in the iteration order of this
       dictionary (NOTE: use an ordered mapping).
    :param None resolution: group reads that are closer than this resolution
       parameter (1 is used when not set). With a resolution of 1 the
       boundaries of the three fitted segments are searched for; otherwise
       they are fixed at 0.7 Mb and 10 Mb.
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated
       (the extension of the file name will determine the desired format).
    :param False normalized: only used when data is a HiC_data object; each
       count is divided by the biases of both bins involved

    """
    resolution = resolution or 1
    dist_intr = dict.fromkeys(xrange(min_diff, max_diff), 0)
    if isinstance(data, str):
        # tab-separated reads file: fields 2/3 and 8/9 hold chromosome and
        # position of each end of the pair
        with open(data) as fhandler:
            for line in fhandler:
                if line.startswith('#'):
                    continue
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                if cr1 != cr2:
                    continue
                # explicit floor division: positions are converted to bins
                diff = abs(int(ps1) // resolution - int(ps2) // resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
    elif isinstance(data, HiC_data):
        if normalized:
            def get_data(x, y):
                return data[x, y] / data.bias[x] / data.bias[y]
        else:
            def get_data(x, y):
                return data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            # stay inside each chromosome
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                        (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        # cell at distance `diff` from the diagonal, the same
                        # one that was just checked for NaN
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0  # index of the first bin of the current chromosome
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                        (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig = plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count... reduce
    # max_diff accordingly
    for diff in xrange(max_diff - 1, min_diff, -1):
        if dist_intr.get(diff):
            break
        dist_intr.pop(diff, None)
        max_diff -= 1
    # keep only distances with at least one interaction (log of 0 undefined)
    x = []
    y = []
    for dist, count in sorted(dist_intr.items()):
        if count:
            x.append(dist)
            y.append(count)
    axe.plot(x, y, 'k.')
    logx = np.log(x)
    logy = np.log(y)
    if resolution == 1:
        # search the two breakpoints (v1, v2) maximizing the sum of squared
        # correlation coefficients of the three linear fits
        best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
        ntries = 100
        gap = 1  # number of points skipped between consecutive segments
        for i in xrange(3, ntries - 2 - gap):
            v1 = i * len(x) // ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[:v1], logy[:v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in xrange(i + 1 + gap, ntries - 2 - gap):
                v2 = j * len(x) // ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1 + gap:v2],
                                                   logy[v1 + gap:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2 + gap:],
                                                   logy[v2 + gap:])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3,
                            b1, b2, b3, gap)
        (v1, v2,
         a1, a2, a3,
         b1, b2, b3, gap) = best[1:]
    else:
        gap = 0
        # from 0.7 Mb
        v1 = int(700000 // resolution)
        # to 10 Mb
        v2 = int(10000000 // resolution)
        try:
            a1, b1, _, _, _ = linregress(logx[:v1], logy[:v1])
        except ValueError:
            a1, b1 = 0, 0
        try:
            a2, b2, _, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2 = 0, 0
        try:
            a3, b3, _, _, _ = linregress(logx[v2:], logy[v2:])
        except ValueError:
            a3, b3 = 0, 0
    # plot lines of best fit, one per segment
    segments = (
        (x[:v1], a1, b1, 'yellow', r'0-0.7 \mathrm{ Mb}', '1'),
        (x[v1 + gap:v2], a2, b2, 'orange', r'0.7-10 \mathrm{ Mb}', '2'),
        (x[v2 + gap:], a3, b3, 'red', r'10 \mathrm{ Mb}-\infty', '3'),
    )
    for xvals, slope, intercept, color, range_name, index_name in segments:
        yvals = np.exp(intercept + slope * np.array(np.log(xvals)))
        axe.plot(xvals, yvals, color=color, lw=2,
                 label=r'$\alpha_{%s}=%.2f$' % (
                     range_name if resolution != 1 else index_name, slope))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:  # nothing to plot: y is empty
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show:
        plt.show()
        plt.close('all')