Example 1
def correlate_matrices(hic_data1, hic_data2, max_dist=10, intra=False,
                       savefig=None, show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    using Spearman rank correlation.

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 10 max_dist: maximum distance from the diagonal (e.g. 10 means we
       will not look further than 10 times the resolution)
    :param None savefig: path to save the plot
    :param False intra: only take into account intra-chromosomal contacts
    :param False show: displays the plot
    :param None savedata: path to a file where to save the correlation values

    :returns: list of correlations and list of genomic distances
    """
    corr = []
    dist = []
    if (intra and hic_data1.sections and hic_data2.sections and 
        hic_data1.sections == hic_data2.sections):
        for i in xrange(1, max_dist + 1):
            diag1 = []
            diag2 = []
            for crm in hic_data1.section_pos:
                for j in xrange(hic_data1.section_pos[crm][0],
                                hic_data1.section_pos[crm][1] - i):
                    diag1.append(hic_data1[j, i + j])
                    diag2.append(hic_data2[j, i + j])
            corr.append(spearmanr(diag1, diag2)[0])
            dist.append(i)
    else:
        if intra:
            warn('WARNING: hic_data does not contain chromosome coordinates, ' +
                 'intra set to False')
        for i in xrange(1, max_dist + 1):
            diag1 = []
            diag2 = []
            for j in xrange(len(hic_data1) - i):
                diag1.append(hic_data1[j, i + j])
                diag2.append(hic_data2[j, i + j])
            corr.append(spearmanr(diag1, diag2)[0])
            dist.append(i)
    if show or savefig:
        plt.plot(dist, corr, color='orange', linewidth=3, alpha=.8)
        plt.xlabel('Genomic distance in bins')
        plt.ylabel('Spearman rank correlation')
        plt.xlim((0, dist[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')
    if savedata:
        out = open(savedata, 'w')
        out.write('# genomic distance\tSpearman rank correlation\n')
        for i in xrange(len(corr)):
            out.write('%s\t%s\n' % (dist[i], corr[i]))
        out.close()
    return corr, dist
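A minimal usage sketch (variable and file names are hypothetical; hic_data_rep1 and hic_data_rep2 stand for two Hi-C data objects loaded at the same resolution):

# Hypothetical call: compare two replicates up to 10 bins from the diagonal,
# saving both the plot and the raw correlation values.
corr, dist = correlate_matrices(hic_data_rep1, hic_data_rep2, max_dist=10,
                                savefig='replicate_correlation.png',
                                savedata='replicate_correlation.tsv')
for d, r in zip(dist, corr):
    print('distance %2d bins -> Spearman rho = %.3f' % (d, r))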
Example 2
def do_3d_plot(nam, outfile, size, count, minmax, sigma=0, log=False):
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    beg = -size / 2
    end =  size / 2
    X = np.arange(beg, end, 1)
    Y = np.arange(beg, end, 1)
    X, Y = np.meshgrid(X, Y)
    Z = np.array([np.array([float(i) for i in l.split()]) for l in open(nam) if not l.startswith('#')])
    plt.title(nam + '\nMean: %.3f, median: %.3f, standard-deviation: %.3f (N=%d)' % (np.mean(Z), np.median(Z), np.std(Z), count))
    if sigma:
        Z = ndimage.gaussian_filter(Z, sigma=sigma, order=0)
    if log:
        Z = np.log(Z)
        zspan = minmax if minmax else np.max(np.abs(Z))
        zmax =  zspan
        zmin = -zspan
    else:
        zspan = minmax if minmax else np.max(np.abs(Z - 1))
        zmin = -zspan + 1
        zmax =  zspan + 1
    cmap = 'coolwarm'
    _ = ax.contourf(X, Y, Z, zdir='z', offset=zmin,
                    cmap=cmap, vmin=zmin, vmax=zmax)
    surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cmap,
                           linewidth=0, antialiased=True, alpha=1,
                           vmin=zmin, vmax=zmax, shade=True)
    ax.set_zlim3d(zmin, zmax)
    ax.view_init(elev=15, azim=25)
    cb = fig.colorbar(surf, shrink=0.5, aspect=20)
    cb.set_label('%sverage normalized interactions%s' %
                 ('Log a' if log else 'A',
                  '\nSmoothed with $\\sigma=%s$' % sigma if sigma else ''))
    tadbit_savefig(outfile)
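A hedged usage sketch (file names and values are placeholders; the input file is expected to contain a whitespace-separated size x size matrix, one row per line, with optional '#' comment lines):

# Hypothetical call: plot a 100x100 averaged sub-matrix stored in 'average_peak.tsv',
# smoothed with a Gaussian of sigma=2 and using a symmetric z-range of 0.5 around 1.
do_3d_plot('average_peak.tsv', 'average_peak_3D.png', size=100,
           count=2500, minmax=0.5, sigma=2, log=False)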
Example 3
    def objective_function(self, log=False, smooth=True,
                           axe=None, savefig=None):
        """
        Plot the value of the IMP objective function at each Monte-Carlo
        step.

        :param False log: log plot
        :param True smooth: curve smoothing
        :param None axe: a matplotlib.axes.Axes object to define the plot
           appearance
        :param None savefig: path to a file where to save the image generated;
           if None, the image will be shown using matplotlib GUI (the extension
           of the file name will determine the desired format).

        """
        show = False
        if not axe:
            fig = plt.figure(figsize=(7, 7))
            axe = fig.add_subplot(111)
            show = True
            axe.patch.set_facecolor('lightgrey')
            axe.patch.set_alpha(0.4)
            axe.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
            axe.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
            axe.set_axisbelow(True)
            axe.minorticks_on() # always on, not only for log
            # remove tick marks
            axe.tick_params(axis='both', direction='out', top=False,
                            right=False, left=False, bottom=False)
            axe.tick_params(axis='both', direction='out', top=False,
                            right=False, left=False, bottom=False,
                            which='minor')
        else:
            fig = axe.get_figure()
        # text
        plt.xlabel('Iteration number')
        plt.ylabel('IMP Objective Function Value')
        plt.title('Model ' + str(self['rand_init']))
        # smooth
        nrjz = self['log_objfun'][1:]
        if smooth:
            xnew = linspace(0, len(nrjz), 10000)
            nrjz_smooth = spline(range(len(nrjz)), nrjz, xnew,
                                 order=3)
            axe.plot(xnew, nrjz_smooth, color='darkred')
        else:
            axe.plot(nrjz, color='darkred')
        # plot
        axe.plot(nrjz, color='darkred', marker='o', alpha=.5, ms=4, ls='None')
        # log
        if log:
            axe.set_yscale('log')
        if savefig:
            tadbit_savefig(savefig)
        elif show:
            plt.show()
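For context, a hedged usage sketch (the ensemble name and indexing are illustrative; it only assumes objects exposing the method defined above):

# Hypothetical call on the first model of an ensemble: log-scale, unsmoothed
# trace of the IMP objective function, saved to a file.
models[0].objective_function(log=True, smooth=False,
                             savefig='model_0_objective_function.png')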
Example 4
def plot_iterative_mapping(fnam1, fnam2, total_reads=None, axe=None, savefig=None):
    """
    :param fnam1: input mapping file for the first read end
    :param fnam2: input mapping file for the second read end
    :param None total_reads: total number of reads in the initial FASTQ file,
       used to plot proportions instead of raw counts
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :returns: a dictionary with the number of reads per mapped length
    """
    count_by_len = {}
    total_reads = total_reads or 1
    if not axe:
        fig=plt.figure()
        _ = fig.add_subplot(111)
    colors = ['olive', 'darkcyan']
    for i, fnam in enumerate([fnam1, fnam2]):
        fhandler = open(fnam)
        line = fhandler.next()
        while line.startswith('#'):
            line = fhandler.next()
        try:
            count_by_len[i] = {}
            while True:
                _, length, _, _ = line.rsplit('\t', 3)
                try:
                    count_by_len[i][int(length)] += 1
                except KeyError:
                    count_by_len[i][int(length)] = 1
                line = fhandler.next()
        except StopIteration:
            pass
        fhandler.close()
        lengths = sorted(count_by_len[i].keys())
        for k in lengths[::-1]:
            count_by_len[i][k] += sum([count_by_len[i][j]
                                       for j in lengths if j < k])
        plt.plot(lengths, [float(count_by_len[i][l]) / total_reads
                           for l in lengths],
                 label='read' + str(i + 1), linewidth=2, color=colors[i])
    plt.xlabel('read length (bp)')
    if total_reads != 1:
        plt.ylabel('Proportion of mapped reads')
    else:
        plt.ylabel('Number of mapped reads')
    plt.legend(loc=4)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    return count_by_len
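A hedged usage sketch (file names and the read count are placeholders; the two inputs are the per-read-end mapping files in the tab-separated format parsed above):

# Hypothetical call: cumulative distribution of mapped read lengths for both
# read ends, shown as a proportion of 10 million initial reads.
counts = plot_iterative_mapping('mapped_read1.tsv', 'mapped_read2.tsv',
                                total_reads=10000000,
                                savefig='iterative_mapping.png')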
Example 5
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              genome_seq=None, axe=None, ylim=None, savefig=None):
    """
    :param fnam: input file name
    :param True first_read: use the first read end (otherwise the second one)
    :param 10000 resolution: group reads that are closer than this number of
       base pairs
    :param None genome_seq: a dictionary of chromosome sequences, used to mark
       chromosome ends
    :param None ylim: a (min, max) tuple to fix the y-axis range
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    
    """

    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    for line in open(fnam):
        crm, pos = line.split()[idx1:idx2]
        pos = int(pos) / resolution
        try:
            distr[crm][pos] += 1
        except KeyError:
            try:
                distr[crm][pos] = 1
            except KeyError:
                distr[crm] = {pos: 1}

    if not axe:
        fig=plt.figure(figsize=(15, 3 * len(distr.keys())))

    max_y = max([max(distr[c].values()) for c in distr])
    max_x = max([len(distr[c].values()) for c in distr])
    for i, crm in enumerate(genome_seq if genome_seq else distr):
        plt.subplot(len(distr.keys()), 1, i + 1)
        plt.plot(range(max(distr[crm])),
                 [distr[crm].get(j, 0) for j in xrange(max(distr[crm]))],
                 color='red', lw=1.5, alpha=0.7)
        if genome_seq:
            if ylim:
                plt.vlines(len(genome_seq[crm]) / resolution, ylim[0], ylim[1])
            else:
                plt.vlines(len(genome_seq[crm]) / resolution, 0, max_y)
        plt.xlim((0, max_x))
        plt.ylim(ylim or (0, max_y))
        plt.title(crm)

    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
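A hedged usage sketch (the file name is a placeholder; genome_seq stands for a dictionary mapping chromosome names to their sequences, used here to mark chromosome ends):

# Hypothetical call: per-chromosome coverage of the first read end, binned at
# 100 kb and saved to a file.
plot_genomic_distribution('reads_intersection.tsv', first_read=True,
                          resolution=100000, genome_seq=genome_seq,
                          savefig='genomic_distribution.png')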
Example 6
def plot_distance_vs_interactions(fnam, min_diff=100, max_diff=1000000,
                                  resolution=100, axe=None, savefig=None):
    """
    :param fnam: input file name
    :param 100 min_diff: lower limit in genomic distance (usually equal to the
       read length)
    :param 1000000 max_diff: upper limit in genomic distance to look for
    :param 100 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    
    """
    dist_intr = {}
    for line in open(fnam):
        _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.rsplit('\t', 9)
        if cr1 != cr2:
            continue
        diff = resolution * (abs(int(ps1) - int(ps2)) / resolution)
        if max_diff > diff > min_diff:
            dist_intr.setdefault(diff, 0)
            dist_intr[diff] += 1
            
    for k in dist_intr.keys()[:]:
        if dist_intr[k] <= 2:
            del(dist_intr[k])
                    
    if not axe:
        fig=plt.figure()
        ax = fig.add_subplot(111)

    x, y = zip(*sorted(dist_intr.items(), key=lambda x:x[0]))
    plt.plot(x, y, 'k.')
    # sigma = 10
    # p_x = gaussian_filter1d(x, sigma)
    # p_y = gaussian_filter1d(y, sigma)
    # plot line of best fit
    # plt.plot(p_x, p_y,color= 'darkgreen', lw=2, label='Gaussian fit')
    plt.yscale('log')
    plt.xscale('log')
    plt.xlabel('Log genomic distance (binned by %d bp)' % resolution)
    plt.ylabel('Log interaction count')
    # plt.legend()
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
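A hedged usage sketch (the file name is a placeholder; the input is the tab-separated read-pair file parsed above):

# Hypothetical call: intra-chromosomal interaction counts between 100 bp and
# 1 Mb, grouped in 1 kb bins and plotted on log-log axes.
plot_distance_vs_interactions('reads_intersection.tsv', min_diff=100,
                              max_diff=1000000, resolution=1000,
                              savefig='distance_vs_interactions.png')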
Example 7
def hic_map(data, genome_seq, biases=None, masked=None, resolution=100000,
            savefig=None, show=False, savedata=None, focus=None):
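    """
    Plot a genome-wide Hi-C contact map. The data argument can either be the
    path to a tab-separated read file (binned here using the chromosome
    lengths from genome_seq, skipping read names listed in masked) or an
    already loaded Hi-C data object, optionally normalized with biases and
    restricted to a focus interval.
    """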
    if isinstance(data, str):
        fnam = data
        cumcs = {} 
        total = 0
        for crm in genome_seq:
            cumcs[crm] = total
            total += len(genome_seq[crm]) / resolution + 1
        # bin the data
        data = [[0 for _ in xrange(total + 1)] for _ in xrange(total + 1)]
        masked = masked or set()
        for line in open(fnam):
            read, cr1, ps1, _, _, _, _, cr2, ps2, _, _, _, _ = line.split()
            if read in masked:
                continue
            ps1 = int(ps1) / resolution
            ps2 = int(ps2) / resolution
            try:
                data[cumcs[cr1] + ps1][cumcs[cr2] + ps2] += 1
            except:
                break
    else:
        hic_data = data
        beg, end = focus if focus else (0, len(hic_data))
        beg -= 1 if focus else 0
        if biases:
            data = [[hic_data[len(hic_data) * i + j] / (biases[i] * biases[j])
                     for j in xrange(beg, end)]
                    for i in xrange(beg, end)]
        else: 
            data = [[hic_data[len(hic_data) * i + j]
                     for j in xrange(beg, end)]
                    for i in xrange(beg, end)]
    # do the plot
    if show or savefig:
        import numpy as np
        plt.figure(figsize=(16, 12))
        plt.imshow(np.log2(data), origin='lower', cmap='gist_earth',
                   interpolation='nearest')
        plt.colorbar()
        if savefig:
            tadbit_savefig(savefig)
        elif show:
            plt.show()
    if savedata:
        out = open(savedata, 'w')
        for line in data:
            out.write('\t'.join([str(cell) for cell in line]) + '\n')
        out.close()
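A hedged usage sketch (file names are placeholders; when data is a path, genome_seq provides the chromosome lengths used to bin the reads):

# Hypothetical call: bin a read-pair file at 1 Mb, display the log2 heat map
# and write the binned matrix to a tab-separated file.
hic_map('reads_intersection.tsv', genome_seq, resolution=1000000,
        show=True, savedata='hic_map_1Mb.tsv')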
Example 8
def correlate_matrices(hic_data1, hic_data2, max_dist=10,
                       savefig=None, show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    using Spearman rank correlation.

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 10 max_dist: maximum distance from the diagonal (e.g. 10 means we
       will not look further than 10 times the resolution)
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to save the correlation values

    :returns: list of correlations and list of genomic distances
    """
    corr = []
    dist = []
    for i in xrange(1, max_dist + 1):
        diag1 = []
        diag2 = []
        for j in xrange(len(hic_data1) - i):
            diag1.append(hic_data1[j, i + j])
            diag2.append(hic_data2[j, i + j])
        corr.append(spearmanr(diag1, diag2)[0])
        dist.append(i)
    if show or savefig:
        plt.plot(dist, corr, color='orange', linewidth=3, alpha=.8)
        plt.xlabel('Genomic distance in bins')
        plt.ylabel('Spearman rank correlation')
        plt.xlim((0, dist[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')
    if savedata:
        out = open(savedata, 'w')
        out.write('# genomic distance\tSpearman rank correlation\n')
        for i in xrange(len(corr)):
            out.write('%s\t%s\n' % (dist[i], corr[i]))
        out.close()

    return corr, dist
Example 9
def quality_plot(fnam,
                 r_enz=None,
                 nreads=float('inf'),
                 axe=None,
                 savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, it can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    The proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None r_enz: name of a restriction enzyme, or a list of names
    :param inf nreads: maximum number of reads to process (there is no need to
       read them all)
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both read ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'
    )])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, str):
        r_enzs = [r_enz]
    else:  # no restriction enzyme given
        r_enzs = [None]
    # map enzyme names to their canonical spelling in RESTRICTION_ENZYMES
    for k in RESTRICTION_ENZYMES.keys():
        for i in range(len(r_enzs)):
            if k.lower() == str(r_enzs[i]).lower():
                r_enzs[i] = k

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if len(r_enzs) == 1 and r_enzs[0] is None:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(r_sites[r_enz])
            fixe[r_enz] = re.compile(d_sites[r_enz])
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store sites
            ligep[k] = 0  # initialize dico to store sites
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in l_sites.values():
                seq = seq.replace(lig.upper(), lig)
            for r_enz in r_enzs:
                sites[r_enz].extend(
                    [m.start() for m in site[r_enz].finditer(seq)])
                # TODO: you cannot have a repaired/fixed site in the middle of
                # the sequence, this could be only checked at the beginning
                fixes[r_enz].extend(
                    [m.start() for m in fixe[r_enz].finditer(seq)])
            for k in lige:  # for each paired of cut-site
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if len(r_enzs) == 1 and r_enzs[0] is None:  # do both plots
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
        else:  # only do the quality_plot plot
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both',
                       direction='out',
                       top=False,
                       right=False,
                       left=False,
                       bottom=False)
        ax.tick_params(axis='both',
                       direction='out',
                       top=False,
                       right=False,
                       left=False,
                       bottom=False,
                       which='minor')

    ax.errorbar(range(max_seq_len),
                meanquals,
                linewidth=1,
                elinewidth=1,
                color='darkblue',
                yerr=errorquals,
                ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in range(max_seq_len)],
             linewidth=1,
             color='black',
             linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if not (len(r_enzs) == 1 and r_enzs[0] is None):
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' %
                     (', '.join(map(str, r_enzs)), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k)
                            for k in range(seq_len)]  # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k)
                            for k in range(seq_len)]  # DE
        for r1, r2 in liges:
            liges[(r1,
                   r2)] = [liges[(r1, r2)].count(k)
                           for k in range(seq_len)]  # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])

                fixes[r_enz] = (fixes[r_enz][:pos] + [
                    fixes[r_enz][k] - sites[r_enz][k - pos]
                    for k in range(pos, seq_len)
                ])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] + [
                    fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                    for k in range(pos, seq_len)
                ])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len
                fixes[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len // 2 - site_len:max_seq_len //
                         2] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            # print 'undigested', r_enz
            # print sites[r_enz][:20]
            ax2.plot(
                sites[r_enz],
                linewidth=2,
                color=color.next(),
                alpha=0.9,
                label='Undigested RE site (%s: %s)' %
                (r_enz, r_sites[r_enz]) if any([f > 0 for f in fixes[r_enz]])
                else 'Undigested & Dangling-Ends (%s: %s)' %
                (r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            # print 'ligated', r1, r2
            # print liges[(r1, r2)][:20]
            ax3.plot(liges[(r1, r2)],
                     linewidth=2,
                     color=color.next(),
                     alpha=0.9,
                     label='Ligated (%s-%s: %s)' %
                     (r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                # print 'repaired', r_enz
                # print fixes[r_enz][:20]
                ax4.plot(fixes[r_enz],
                         linewidth=2,
                         color=color.next(),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' %
                         (r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                          liges[k][max_seq_len // 2])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len // 2])

        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = (
                    (100. *
                     (fixes[r_enz][0] +
                      (fixes[r_enz][(max_seq_len // 2)] if paired else 0))) //
                    nreads)
            else:
                des[r_enz] = (100. * (sites[r_enz][0] + (sites[r_enz][
                    (max_seq_len // 2)] if paired else 0))) // nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(
                sum([
                    lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                    for r_enz1 in r_enzs for r_enz2 in r_enzs
                    if r_enz1 == r_enz or r_enz2 == r_enz
                ]))
            title += (
                'Percentage of digested sites (not considering Dangling-Ends) '
                '%s: %.1f%%\n' % (r_enz, 100. * float(lcnt) /
                                  (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (r_enz,
                                                                   des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += (
                    'Percentage of reads with ligation site (%s-%s): %.1f%% \n'
                    % (r_enz1, r_enz2,
                       (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines,
                   labels,
                   bbox_to_anchor=(0.75, 1.0),
                   loc=3,
                   borderaxespad=0.,
                   frameon=False,
                   fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    if len(r_enzs) == 1 and r_enzs[0] is None:
        return {}, {}
    return des, ligep
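A hedged usage sketch (the FASTQ name is a placeholder and the enzyme names are only examples of keys expected in RESTRICTION_ENZYMES):

# Hypothetical call on a paired-end FASTQ digested with two enzymes, scanning
# at most one million reads; returns per-enzyme dangling-end and ligation stats.
dangling_ends, ligation_ratio = quality_plot('sample_read12.fastq.gz',
                                             r_enz=['MboI', 'HindIII'],
                                             nreads=1000000, paired=True,
                                             savefig='sample_quality.png')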
Example 10
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))
    else:
        opts.figsize = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v !='raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1         = opts.coord1
    coord2         = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1  = None
        end1    = None
        region2 = None
        start2  = None
        end2    = None
    else:
        try:
            crm1, pos1   = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1  = int(start1)
            end1    = int(end1)
        except ValueError:
            region1 = coord1
            start1  = None
            end1    = None
        if coord2:
            try:
                crm2, pos2   = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2  = int(start2)
                end2    = int(end2)
            except ValueError:
                region2 = coord2
                start2  = None
                end2    = None
        else:
            region2 = None
            start2  = None
            end2    = None

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1 , p + opts.reso) for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts) and starts[r] else 0,
                                            ends[r] if r < len(ends) and ends[r] else sections[reg],
                                            opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' % (row_names.next()) +
                                        '\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                section_pos = OrderedDict((k, section_pos[k]) for k in section_pos
                                   if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s '% tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
Example 11
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, it can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    The proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided by
    the number of reads).

    :param fnam: path to FASTQ file
    :param None r_enz: name of a restriction enzyme
    :param None nreads: maximum number of reads to process (there is no need to
       read them all)
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both read ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    des = None  # proportion of dangling-ends, computed only when r_enz is given
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site*2 == l_site:
            # in case the religated site equals 2 restriction sites (like DnpII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)] # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)] # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)] # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k-pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k-pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)        
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)]
                                            if paired else 0)))
                       / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') %(
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
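A hedged usage sketch for this single-enzyme variant (file and enzyme names are placeholders):

# Hypothetical call: quality and digestion statistics for one read end,
# scanning the first 100,000 reads.
des, ligp = quality_plot('sample_read1.fastq', r_enz='HindIII',
                         nreads=100000, savefig='read1_quality.png')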
Example 12
def quality_plot(fnam, nreads=None, axe=None, savefig=None):
    """
    Plot the qualities

    :param fnam: path to FASTQ file
    :param None nreads: maximum number of reads to process (no need to read them all)
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    """
    phred = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'
    quals = []
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    if nreads:
        while True:
            try:
                next(fhandler)
            except StopIteration:
                break
            next(fhandler)
            next(fhandler)
            line = next(fhandler)
            quals.append([phred.index(i) for i in line.strip()])
            if len(quals) > nreads:
                break
    else: # do this because it's faster
        while True:
            try:
                next(fhandler)
            except StopIteration:
                break
            next(fhandler)
            next(fhandler)
            line = next(fhandler)
            quals.append([phred.index(i) for i in line.strip()])
    fhandler.close()

    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
    else:
        fig = plt.figure(figsize=(15, 7))
        plt.clf()
        ax = fig.add_subplot(111)
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(len(meanquals)), meanquals,
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(meanquals)))
    ax.set_xlabel('Sequence')
    ax.set_ylabel('PHRED score')
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
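
# Usage sketch (added for illustration, not part of the original example):
# plot the per-position PHRED scores of the first 100,000 reads of a
# hypothetical FASTQ file and save the figure; the file name, read count and
# output path are assumptions.
quality_plot('sample_reads.fastq', nreads=100000,
             savefig='sample_reads_quality.png')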
Ejemplo n.º 15
0
def mmp_score(matrix, nrand=10, verbose=False, savefig=None):
    """
    :param matrix: list of lists
    :param 10 nrand: number of randomizations
    :param None savefig: path where to save figure

    :returns: 1- MMP score which ranges from 0 (bad) to 1 (good), and 2- the
       expected correlation of the contact matrices of the modeled chromatin
       with the original Hi-C data (plus the 3- lower and 4- upper values
       expected in 95% of the cases)
    """
    data = np.array([np.array([v for v in l]) for l in matrix])

    if verbose:
        sys.stdout.write('  - getting EigenVectors\n')
    egval, _ = np.linalg.eigh(data)
    # sort eigenvalues/vectors
    idx = (-egval).argsort()
    egval = egval[idx]

    regvals = []

    if verbose:
        sys.stdout.write('  - randomization\n')
    for i in xrange(int(nrand)):
        if verbose:
            sys.stdout.write('\r    ' + str(i + 1) + ' / ' + str(nrand))
            sys.stdout.flush()
        regval, _ = np.linalg.eigh(randomize_matrix(data))
        regval = [abs(j) for j in regval]
        regval.sort(reverse=True)
        regvals.append( regval)
    if verbose:
        sys.stdout.write('\n')
    regvals = zip(*regvals)
    rvmean = []
    for rv in regvals:
        rvmean.append(np.mean(rv))
    total = sum(rvmean)/100
    rvmean = [i/total for i in rvmean]

    err = []
    for rv in regvals:
        rvstd = np.std(rv/total)
        err.append(2 * rvstd)

    zdata = sorted(np.log2([data[i][j] for i in xrange(len(data))
                            for j in xrange(i, len(data)) if data[i][j]]))
    skewness = skew(zdata)
    kurtness = kurtosis(zdata)

    if savefig:
        _ = plt.figure(figsize=(14, 8))
        gs = gridspec.GridSpec(7, 5, wspace=0.5, hspace=1.5)
        ax1 = plt.subplot(gs[:   , 0:3])
        ax2 = plt.subplot(gs[1:5 , 3: ])
        ax3 = plt.subplot(gs[5:7 , 3: ])
        img = ax2.imshow(np.log2(data), interpolation='none')
        plt.colorbar(img, ax=ax2)

        ax2.set_title('Original matrix', size=12)
        ax2.tick_params(axis='both', which='major', labelsize=10)
        ax2.set_xlabel('Bin')
        ax2.set_ylabel('Bin')

        normfit = sc_norm.pdf(zdata, np.mean(zdata), np.std(zdata))
        _ = ax3.plot(zdata, normfit, ':o', color='grey', ms=3, alpha=.4)
        ax3.tick_params(axis='both', which='major', labelsize=10)

        ax3.hist(zdata, bins=20, density=True, alpha=0.7, color='r')
        ax3.set_xlabel('Z-score')
        ax3.set_ylabel('Frequency')
        rcParams['xtick.direction'] = 'out'
        rcParams['ytick.direction'] = 'out'
        rcParams['axes.axisbelow']  = True
        rcParams['axes.grid']       = True
        rcParams['grid.color']      = 'w'
        rcParams['grid.linestyle']  = '-'
        rcParams['grid.linewidth']  = 2
        # rcParams['grid.alpha']      = .3
        ax1.minorticks_on()
        ax1.grid(ls='-', color='w', alpha=.3, lw=2, which='major')
        ax1.grid(ls='-', b=True, color='w', alpha=.3, lw=1, which='minor')
        ax1.spines['top'].set_color('none')
        ax1.spines['right'].set_color('none')
        ax1.spines['bottom'].set_color('none')
        ax1.spines['left'].set_color('none')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.set_xscale('log')
        try:
            ax1.set_axis_bgcolor((.9,.9,.9))
        except AttributeError:
            ax1.set_facecolor((.9,.9,.9))

        ax1.errorbar(range(1, 1 + len(rvmean)), rvmean, yerr=err, ecolor='red',
                     color='orange', lw=2,
                     label='%s randomizations' % (nrand))

    total = sum(abs(egval)) / 100
    egval = np.array(sorted([e/total for e in abs(egval)], reverse=True))

    for i in xrange(len(rvmean)):
        if rvmean[i] + err[i] > egval[i]:
            break
    signifidx = i

    size = len(data)

    sev = sum(egval[:signifidx]-rvmean[:signifidx])

    if savefig:
        ax1.plot(range(1, 1 + len(rvmean)), egval,
                 color='green', lw=2, label='Observed data')

        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err))<egval,
                         facecolor='green', interpolate=True, alpha=0.2)
        ax1.fill_between(range(1, 1 + len(rvmean)), rvmean, egval,
                         where=(np.array(rvmean) + np.array(err))>egval,
                         facecolor='red'  , interpolate=True, alpha=0.2)
        ax1.set_xlim((0,len(rvmean)))
        ax1.set_ylim((0, max(max(rvmean), max(egval))))
        ax1.legend(frameon=False, loc='upper right', prop={'size': 10})
        ax1.set_xlabel('Eigenvalue index (log scale)')
        ax1.set_ylabel('Eigenvalues (percentage of total)')
        #plt.subplots_adjust(right=0.6)

        #img = Image.open(opts.outdir + '/matrix_small.png')
        #fig.figimage(img, 640, -160)

    minv = float(min([i for d in data for i in d if i])) / 2
    if minv == 0.5:
        minv = 1./(len(data)**2)

    mmp = -0.0002 * size + 0.0335 * skewness - 0.0229 * kurtness + 0.0069 * sev + 0.8126

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('\n                       Results\n')
        sys.stdout.write('                       -------\n\n')


    if verbose:
        sys.stdout.write('                  MMP score: %.4f\n\n' % mmp)

    ex_a1, ex_b1 = [0.6975926,  0.2548171]
    supa1, supb1 = [0.69300732000423904, 0.29858572176099613]
    lowa1, lowb1 = [0.70217788900976075, 0.211048473299004]

    scc     = (mmp - ex_b1 ) / ex_a1
    scc_up1 = (mmp - supb1 ) / supa1
    scc_lw1 = (mmp - lowb1 ) / lowa1

    if verbose:
        sys.stdout.write(('  predicted dSCC is %.3f (%.3f-%.3f '
                          '68%% confidence)\n') % (scc , scc_up1 , scc_lw1 ))

    supa75, supb75 = [0.69230778430383244, 0.30526310790548261]
    lowa75, lowb75 = [0.70287742471016734, 0.20437108715451746]

    scc_up75 = (mmp - supb75 ) / supa75
    scc_lw75 = (mmp - lowb75 ) / lowa75

    if verbose:
        sys.stdout.write(('                        (%.3f-%.3f '
                          '75%% confidence)\n') % (scc_up75 , scc_lw75 ))

    supa2, supb2 = [0.68855373600821357, 0.34109720480765293]
    lowa2, lowb2 = [0.70663147300578644, 0.16853699025234709]

    scc_up2 = (mmp - supb2 ) / supa2
    scc_lw2 = (mmp - lowb2 ) / lowa2
    if verbose:
        sys.stdout.write(('                        (%.3f-%.3f '
                          '95%% confidence)\n') % (scc_up2 , scc_lw2 ))

    if savefig:
        # write the log
        log = ''
        log +=  '    1- Matrix size (number of eigenvalues): %s\n' % (len(egval))
        log +=  "    2- Skewness of the distribution: %0.3f\n" % (skewness)
        log +=  "    3- Kurtosis of the distribution: %0.3f\n" % (kurtness)
        log +=  "    4- Sum of differences signif EV real-rand: %0.3f\n\n" % (sev)
        plt.figtext(0.62, 0.77, log, size='small')
        log =  "MMP score: %.3f\n" % (mmp)
        log +=  "Predicted dSCC: %.3f (%.3f-%.3f at 95%% conf)\n" % (scc, scc_up2, scc_lw2)
        plt.figtext(0.61, 0.87, log, size=12)
        tadbit_savefig(savefig)
        plt.close('all')

    return mmp, scc, scc_up2 , scc_lw2
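
# Usage sketch (added for illustration, not part of the original example):
# score a small synthetic symmetric matrix just to show the call signature;
# real input would be a normalized Hi-C contact matrix given as a list of
# lists. The random matrix below is an assumption, and the helpers used by
# mmp_score (randomize_matrix, skew, kurtosis, ...) are assumed to be
# imported by this module.
import numpy as np
noise = np.abs(np.random.randn(50, 50)) + 1.
synthetic = (noise + noise.T).tolist()  # symmetric, strictly positive matrix
mmp, dscc, dscc_bound1, dscc_bound2 = mmp_score(synthetic, nrand=10,
                                                verbose=True)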
Ejemplo n.º 16
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input requires a path to a '
                            'biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(
            zip(bamfile.references, [x for x in bamfile.lengths]))
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads,
                    opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1,
                    start1=start1,
                    end1=end1,
                    region2=region2,
                    start2=start2,
                    end2=end2,
                    tmpdir=tmpdir,
                    ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks,
                    verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented for '
                         'matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions) for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0, ends[r] if r < len(ends) and ends[r]
                                 else sections[reg], opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name, nicer(
                    opts.reso).replace(' ', ''), ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' %
                                        (row_names.next()) + '\t'.join(
                                            str(matrix.get((i, j), 0))
                                            for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(
                        str(matrix.get((i, j), 0)) for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name, nicer(opts.reso).replace(
                    ' ', ''), ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                matrix = array([
                    array([matrix.get((i, j), 0) for i in xrange(b1, e1)])
                    for j in xrange(b2, e2)
                ])
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                    for bad2 in bads2:
                        m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix,
                           interpolation='None',
                           origin='lower',
                           cmap=cmap,
                           vmin=vmin,
                           vmax=vmax)

                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = pltbeg1 if len(
                        regions) == 1 else 0 if start2 is None else start2
                    pltend2 = pltend1 if len(regions) == 1 else sections[
                        regions[-1]] if end2 is None else end2

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                h = ax2.hist(data,
                             color='darkgrey',
                             linewidth=2,
                             orientation='horizontal',
                             bins=50,
                             histtype='step',
                             normed=True)
                _ = ax2.imshow(gradient,
                               aspect='auto',
                               cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' %
                              (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(
            write_matrix(mreads,
                         opts.reso,
                         load(open(biases)) if biases else None,
                         outdir,
                         filter_exclude=opts.filter,
                         normalizations=opts.normalizations,
                         region1=region1,
                         start1=start1,
                         end1=end1,
                         region2=region2,
                         start2=start2,
                         end2=end2,
                         tmpdir=tmpdir,
                         append_to_tar=None,
                         ncpus=opts.cpus,
                         nchunks=opts.nchunks,
                         verbose=not opts.quiet,
                         extra=param_hash,
                         clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
Ejemplo n.º 17
0
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors (6 by default), with Pearson correlation (absolute value)

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param kwargs: any argument to pass to matplotlib imshow function

    :returns: matrix of correlations
    """
    data1 = hic_data1.get_matrix()
    data2 = hic_data2.get_matrix()
    # get the log
    size = len(data1)
    data1 = nozero_log(data1, np.log2)
    data2 = nozero_log(data2, np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    corr = [[0 for _ in xrange(nvect)] for _ in xrange(nvect)]
    # sort eigenvectors according to their eigenvalues => first is last!!
    sort_perm = ev1.argsort()
    ev1.sort()
    evect1 = evect1[sort_perm]
    sort_perm = ev2.argsort()
    ev2.sort()
    evect2 = evect2[sort_perm]
    # calculate Pearson correlation
    for i in xrange(nvect):
        for j in xrange(nvect):
            corr[i][j] = abs(pearsonr(evect1[:,-i-1],
                                      evect2[:,-j-1])[0])
    # plot
    axe    = plt.axes([0.1, 0.1, 0.6, 0.8])
    cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
    if show or savefig:
        im = axe.imshow(corr, interpolation="nearest",origin='lower', **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels(range(1, nvect + 2))
        axe.set_yticklabels(range(1, nvect + 2))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)
        
        cbar = plt.colorbar(im, cax = cbaxes )
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)
        
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)
        
        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        out = open(savedata, 'w')
        out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                    for i in xrange(nvect)]) + '\n')
        for i in xrange(nvect):
            out.write('\t'.join([str(corr[i][j])
                                 for j in xrange(nvect)]) + '\n')
        out.close()

    return corr
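
# Conceptual sketch (added, not part of the original example): the comparison
# above amounts to eigendecomposing both matrices and taking the absolute
# Pearson correlation between their leading eigenvectors. Minimal NumPy
# illustration on two synthetic symmetric matrices (pure assumption, only to
# show the principle):
import numpy as np
from scipy.stats import pearsonr

mat_a = np.random.rand(40, 40)
mat_a = (mat_a + mat_a.T) / 2                 # first symmetric matrix
mat_b = mat_a + 0.05 * np.random.rand(40, 40)
mat_b = (mat_b + mat_b.T) / 2                 # slightly perturbed copy
_, vect_a = np.linalg.eigh(mat_a)             # eigenvectors in columns,
_, vect_b = np.linalg.eigh(mat_b)             # eigenvalues in ascending order
# correlation between the two leading eigenvectors (last columns)
print('|r| of first eigenvectors: %.3f' %
      abs(pearsonr(vect_a[:, -1], vect_b[:, -1])[0]))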
Ejemplo n.º 18
0
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors (6 by default), with Pearson correlation (absolute value)

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot

    :returns: matrix of correlations
    """
    corr = []
    ev1, evect1 = eigh(np.array([[hic_data1[i, j]
                         for j in xrange(len(hic_data1))]
                        for i in xrange(len(hic_data1))]))
    ev2, evect2 = eigh(np.array([[hic_data2[i, j]
                         for j in xrange(len(hic_data2))]
                        for i in xrange(len(hic_data2))]))
    corr = [[0 for _ in xrange(nvect)] for _ in xrange(nvect)]
    
    sort_perm = ev1.argsort()
    ev1.sort()
    evect1 = evect1[sort_perm][::-1]
    sort_perm = ev2.argsort()
    ev2.sort()
    evect2 = evect2[sort_perm][::-1]
    
    for i in xrange(nvect):
        for j in xrange(nvect):
            corr[i][j] = abs(pearsonr(evect1[:,i],
                                      evect2[:,j])[0])

    axe    = plt.axes([0.1, 0.1, 0.6, 0.8])
    cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
    if show or savefig:
        im = axe.imshow(corr, interpolation="nearest",origin='lower')
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)
        
        cbar = plt.colorbar(im, cax = cbaxes )
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)
        
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)
        
        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        out = open(savedata, 'w')
        out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                    for i in xrange(nvect)]) + '\n')
        for i in xrange(nvect):
            out.write('\t'.join([str(corr[i][j])
                                 for j in xrange(nvect)]) + '\n')
        out.close()

    return corr
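
# Usage sketch (added for illustration, not part of the original example):
# hic_data1 and hic_data2 are assumed to be two HiC_data objects already
# loaded at the same resolution; the output file names are assumptions too.
correlations = eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                                      savefig='eigen_correlation.png',
                                      savedata='eigen_correlation.tsv')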
Ejemplo n.º 19
0
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-end lengths

    :param fnam: input file name
    :param None savefig: path where to store the output image
    :param None nreads: maximum number of read pairs to consider (None for all)
    :param 99.9 max_size: top percentage of distances to consider (the
       remaining top 0.1% usually contains very long outliers)
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param False xlog: represent x axis in logarithmic scale
    """
    distr = {}
    genome_seq = OrderedDict()
    fhandler = open(fnam)
    line = fhandler.next()
    while line.startswith('#'):
        if line.startswith('# CRM '):
            crm, clen = line[6:].split()
            genome_seq[crm] = int(clen)
        line = fhandler.next()
    des = []
    if nreads:
        nreads /= 2
    try:
        while True:
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1==re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break
            line = fhandler.next()
    except StopIteration:
        pass
    fhandler.close()
    ax = setup_plot(axe, figsize=(10, 5.5))
    max_perc = np.percentile(des, max_size)
    perc99  = np.percentile(des, 99)
    perc01  = np.percentile(des, 1)
    perc95  = np.percentile(des, 95)
    perc05  = np.percentile(des, 5)
    desapan = ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
                         label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    desapan = ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
                         label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    deshist = ax.hist(des, bins=100, range=(0, max_perc),
                      alpha=.7, color='darkred', label='Dangling-ends')
    ylims   = ax.get_ylim()
    plots   = []
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lengths\n(top %.1f%%, up to %.0f nts)' % (max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
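
# Usage sketch (added for illustration, not part of the original example):
# plot the dangling-end length distribution from a hypothetical TADbit TSV
# of mapped read pairs, with a log-scaled x axis; the file name, read count
# and output path are assumptions.
insert_sizes('mapped_read_pairs.tsv', nreads=1000000, xlog=True,
             savefig='dangling_end_lengths.png')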
Ejemplo n.º 20
0
def draw_map(
    data,
    genome_seq,
    cumcs,
    savefig,
    show,
    one=False,
    clim=None,
    cmap="jet",
    decay=False,
    perc=10,
    name=None,
    cistrans=None,
    decay_resolution=10000,
    normalized=False,
    max_diff=None,
):
    _ = plt.figure(figsize=(15.0, 12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(
            data, genome_seq=genome_seq, axe=ax3, resolution=decay_resolution, max_diff=max_diff, normalized=normalized
        )
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    try:
        minoridata = np.nanmin(data)
        maxoridata = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata = np.min(vals)
        maxoridata = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data)) for j in xrange(i, len(data))])
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    posI = 0.01 if not clim else (float(clim[0]) / diff) if clim[0] != None else 0.01
    posF = 1.0 if not clim else (float(clim[1]) / diff) if clim[1] != None else 1.0
    if cmap == "tadbit":
        cuts = perc
        cdict = {"red": [(0.0, 0.0, 0.0)], "green": [(0.0, 0.0, 0.0)], "blue": [(0.0, 0.5, 0.5)]}
        prev_pos = 0
        median = (np.median(vals) - mindata) / diff
        for prc in np.linspace(posI, median, cuts / 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1.0 / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict["red"].append([pos, prc, prc])
            cdict["green"].append([pos, prc, prc])
            cdict["blue"].append([pos, 1, 1])
            prev_pos = pos
        for prc in np.linspace(median + 1.0 / cuts, posF, cuts / 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.0) - mindata) / diff
                prc = (prc - median) / (posF - median)
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict["red"].append([pos, 1.0, 1.0])
            cdict["green"].append([pos, 1 - prc, 1 - prc])
            cdict["blue"].append([pos, 1 - prc, 1 - prc])
            prev_pos = pos
        pos = (np.percentile(vals, 97.0) - mindata) / diff
        cdict["red"].append([pos, 0.1, 0.1])
        cdict["green"].append([pos, 0, 0])
        cdict["blue"].append([pos, 0, 0])

        cdict["red"].append([1.0, 1, 1])
        cdict["green"].append([1.0, 1, 1])
        cdict["blue"].append([1.0, 0, 0])
        cmap = LinearSegmentedColormap(cmap, cdict)
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad("darkgrey", 1)

    ax1.imshow(data, interpolation="none", cmap=cmap, vmin=clim[0] if clim else None, vmax=clim[1] if clim else None)
    size = len(data)
    for i in xrange(size):
        for j in xrange(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
            # data[j][i] = data[i][j]
    evals, evect = eigh(data)
    sort_perm = evals.argsort()
    evect = evect[sort_perm]
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data), np.nanmax(data), size)
    gradient = np.vstack((gradient, gradient))
    h = ax2.hist(data, color="darkgrey", linewidth=2, bins=20, histtype="step", normed=True)
    _ = ax2.imshow(gradient, aspect="auto", cmap=cmap, extent=(np.nanmin(data), np.nanmax(data), 0, max(h[0])))
    if genome_seq:
        for crm in genome_seq:
            ax1.vlines(
                [cumcs[crm][0] - 0.5, cumcs[crm][1] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="w",
                linestyle="-",
                linewidth=1,
                alpha=1,
            )
            ax1.hlines(
                [cumcs[crm][1] - 0.5, cumcs[crm][0] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="w",
                linestyle="-",
                linewidth=1,
                alpha=1,
            )
            ax1.vlines(
                [cumcs[crm][0] - 0.5, cumcs[crm][1] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="k",
                linestyle="--",
            )
            ax1.hlines(
                [cumcs[crm][1] - 0.5, cumcs[crm][0] - 0.5],
                cumcs[crm][0] - 0.5,
                cumcs[crm][1] - 0.5,
                color="k",
                linestyle="--",
            )
        if not one:
            vals = [0]
            keys = [""]
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels("")
            ax1.set_yticks([float(vals[i] + vals[i + 1]) / 2 for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    # totaloridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j in enumerate(str(totaloridata)[::-1])])[::-1].strip(',')
    # minoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j   in enumerate(str(minoridata)[::-1])])[::-1].strip(',')
    # maxoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j   in enumerate(str(maxoridata)[::-1])])[::-1].strip(',')
    plt.figtext(
        0.05,
        0.25,
        "".join(
            [
                (name + "\n") if name else "",
                "Number of interactions: %s\n" % str(totaloridata),
                ("" if np.isnan(cistrans) else ("Percentage of cis interactions: %.0f%%\n" % (cistrans * 100))),
                "Min interactions: %s\n" % (minoridata),
                "Max interactions: %s\n" % (maxoridata),
            ]
        ),
    )
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim((-0.5, size - 0.5))
    ax1.set_ylim((-0.5, size - 0.5))
    ax2.set_xlabel("log interaction count")
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d * 100)) / 100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, "w.", markersize=2.5, alpha=0.4)
    ax2.plot(subdata, normfit, "k.", markersize=1.5, alpha=1)
    ax2.set_title("skew: %.3f, kurtosis: %.3f" % (skew(data), kurtosis(data)))
    ax4.vlines(range(size), 0, evect[:, -1], color="k")
    ax4.hlines(0, 0, size, color="red")
    ax4.set_ylabel("E1")
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size), 0, evect[:, -2], color="k")
    except IndexError:
        pass
    ax5.hlines(0, 0, size, color="red")
    ax5.set_ylabel("E2")
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size), 0, evect[:, -3], color="k")
    except IndexError:
        pass
    ax6.hlines(0, 0, size, color="red")
    ax6.set_ylabel("E3")
    ax6.set_yticklabels([])
    xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close("all")
Ejemplo n.º 21
0
def quality_plot(fnam, r_enz=None, nreads=float('inf'), axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    The proportion of dangling-ends is inferred by counting the number of times
    a dangling-end site is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file
    :param None r_enz: name of the restriction enzyme used (a list of names can
       be passed if several enzymes were used)
    :param inf nreads: max number of reads to read; it is not necessary to read
       them all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: True if the input FASTQ contains both read ends

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage
       of reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    if isinstance(r_enz, list):
        r_enzs = r_enz
    elif isinstance(r_enz, str):
        r_enzs = [r_enz]
    else:  # no restriction enzyme given: only the quality plot is drawn
        r_enzs = []
    # use the enzyme names as spelled in RESTRICTION_ENZYMES
    for k in RESTRICTION_ENZYMES.keys():
        for i in range(len(r_enzs)):
            if k.lower() == r_enzs[i].lower():
                r_enzs[i] = k

    quals = []
    henes = []
    sites = {}
    fixes = {}
    liges = {}
    ligep = {}
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enzs:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:  # next() signals end-of-file with StopIteration
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:  # next() signals end-of-file with StopIteration
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_sites = {}
        d_sites = {}
        for r_enz in r_enzs:
            r_sites[r_enz] = RESTRICTION_ENZYMES[r_enz].replace('|', '')
            d_sites[r_enz] = repaired(r_enz)
            sites[r_enz] = []  # initialize dico to store undigested sites
            fixes[r_enz] = []  # initialize dico to store digested sites
        l_sites = religateds(r_enzs)
        site = {}
        fixe = {}
        for r_enz in r_enzs:
            site[r_enz] = re.compile(r_sites[r_enz])
            fixe[r_enz] = re.compile(d_sites[r_enz])
        # ligation sites should appear in lower case in the sequence
        lige = {}
        for k in l_sites:
            liges[k] = []  # initialize dico to store sites
            ligep[k] = 0   # initialize dico to store sites
            l_sites[k] = l_sites[k].lower()
            lige[k] = re.compile(l_sites[k])
        while len(quals) <= nreads:
            try:
                next(fhandler)
            except StopIteration:
                break
            seq = next(fhandler)
            # ligation sites replaced by lower case to ease the search
            for lig in l_sites.values():
                seq = seq.replace(lig.upper(), lig)
            for r_enz in r_enzs:
                sites[r_enz].extend([m.start() for m in site[r_enz].finditer(seq)])
                # TODO: you cannot have a repaired/fixed site in the middle of
                # the sequence, this could be only checked at the beginning
                fixes[r_enz].extend([m.start() for m in fixe[r_enz].finditer(seq)])
            for k in lige:  # for each pair of cut-sites
                liges[k].extend([m.start() for m in lige[k].finditer(seq)])
                ligep[k] += l_sites[k] in seq
            # store the number of Ns found in the sequences
            if 'N' in seq:
                henes.extend([i for i, s in enumerate(seq) if s == 'N'])
            next(fhandler)
            line = next(fhandler)
            quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = izip_longest(*quals, fillvalue=float('nan'))
    meanquals, errorquals = zip(*[(nanmean(q), nanstd(q)) for q in quals])
    max_seq_len = len(meanquals)

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:  # configure plot
        if r_enz:  # do both plots
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:  # only do the quality_plot plot
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')

    ax.errorbar(range(max_seq_len), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, max_seq_len))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    # quality_plot plot
    axb.plot([henes.count(i) for i in xrange(max_seq_len)], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, max_seq_len))

    # Hi-C plot
    if r_enzs:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            ', '.join(r_enzs), nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        # seq_len is the length of the line to plot. we don't want to plot
        # if there is no room for the cut-site, or ligation site.
        site_len = max((max([len(r_sites[k]) for k in r_sites]),
                        max([len(l_sites[k]) for k in l_sites]),
                        max([len(d_sites[k]) for k in d_sites])))
        seq_len = max_seq_len - site_len

        # transform dictionaries of positions into dictionaries of counts
        for r_enz in sites:
            sites[r_enz] = [sites[r_enz].count(k) for k in xrange(seq_len)] # Undigested
            fixes[r_enz] = [fixes[r_enz].count(k) for k in xrange(seq_len)] # DE
        for r1, r2 in liges:
            liges[(r1, r2)] = [liges[(r1, r2)].count(k) for k in xrange(seq_len)] # OK

        # in case the pattern of the repaired cut-site contains the target
        # cut-site pattern. These sites were counted twice, once in the
        # undigested, and once in the repaired. We remove them from the
        # repaired:
        for r_enz in r_enzs:
            if d_sites[r_enz] in r_sites[r_enz]:
                pos = r_sites[r_enz].find(d_sites[r_enz])

                fixes[r_enz] = (fixes[r_enz][:pos] +
                                [fixes[r_enz][k] - sites[r_enz][k-pos]
                                 for k in xrange(pos, seq_len)])
        # same for ligated sites
        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                if d_sites[r_enz1] not in l_sites[(r_enz1, r_enz2)]:
                    continue
                pos = l_sites[(r_enz1, r_enz2)].find(d_sites[r_enz1])
                fixes[r_enz1] = (fixes[r_enz1][:pos] +
                                 [fixes[r_enz1][k] - liges[(r_enz1, r_enz2)][k - pos]
                                  for k in xrange(pos, seq_len)])

        # remove anything that could be in between the two read ends
        if paired:
            for k in sites:
                sites[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len
                fixes[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len
            for k in liges:
                liges[k][max_seq_len / 2 - site_len:
                         max_seq_len / 2] = [float('nan')] * site_len

        # plot undigested cut-sites
        color = iter(plt.cm.Reds(linspace(0.3, 0.95, len(r_enzs))))
        for r_enz in sites:
            # print 'undigested', r_enz
            # print sites[r_enz][:20]
            ax2.plot(sites[r_enz], linewidth=2, color = color.next(),
                     alpha=0.9,
                     label='Undigested RE site (%s: %s)' % (r_enz, r_sites[r_enz])
                     if any([f > 0 for f in fixes[r_enz]])
                     else 'Undigested & Dangling-Ends (%s: %s)' % (r_enz, r_sites[r_enz]))
        ax2.set_ylabel('Undigested')
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)

        lines, labels = ax2.get_legend_handles_labels()

        ax3 = ax2.twinx()
        color = iter(plt.cm.Blues(linspace(0.3, 0.95, len(liges))))
        for r1, r2 in liges:
            # print 'ligated', r1, r2
            # print liges[(r1, r2)][:20]
            ax3.plot(liges[(r1, r2)], linewidth=2, color=color.next(),
                     alpha=0.9,
                     label = 'Ligated (%s-%s: %s)' % (r1, r2, l_sites[(r1, r2)].upper()))
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Ligated')

        tmp_lines, tmp_labels = ax3.get_legend_handles_labels()
        lines.extend(tmp_lines)
        labels.extend(tmp_labels)

        color = iter(plt.cm.Greens(linspace(0.3, 0.95, len(r_enzs))))
        for i, r_enz in enumerate(r_enzs):
            if any([f > 0 for f in fixes[r_enz]]):
                ax4 = ax2.twinx()
                ax4.spines["right"].set_position(("axes", 1.07))
                make_patch_spines_invisible(ax4)
                ax4.spines["right"].set_visible(True)
                # print 'repaired', r_enz
                # print fixes[r_enz][:20]
                ax4.plot(fixes[r_enz], linewidth=2, color=color.next(),
                         alpha=0.9,
                         label='Dangling-ends (%s: %s)' % (r_enz, d_sites[r_enz]))
                ax4.yaxis.label.set_color('darkgreen')
                ax4.tick_params(axis='y', colors='darkgreen', **tkw)
                ax4.set_ylabel('Dangling-ends')
                tmp_lines, tmp_labels = ax4.get_legend_handles_labels()
                lines.extend(tmp_lines)
                labels.extend(tmp_labels)
            else:
                ax2.set_ylabel('Undigested & Dangling-ends')
        ax2.set_xlim((0, max_seq_len))

        # Count ligation sites
        lig_cnt = {}
        for k in liges:
            lig_cnt[k] = (nansum(liges[k]) - liges[k][0] -
                              liges[k][max_seq_len / 2])

        # Count undigested sites
        sit_cnt = {}
        for r_enz in r_enzs:
            sit_cnt[r_enz] = (nansum(sites[r_enz]) - sites[r_enz][0] -
                              sites[r_enz][max_seq_len / 2])

        # Count Dangling-Ends
        des = {}
        for r_enz in r_enzs:
            if any([f > 0 for f in fixes[r_enz]]):
                des[r_enz] = ((100. * (fixes[r_enz][0] + (fixes[r_enz][(max_seq_len / 2)]
                                                          if paired else 0))) / nreads)
            else:
                des[r_enz] = (100. * (sites[r_enz][0] + (sites[r_enz][(max_seq_len / 2)]
                                                         if paired else 0))) / nreads

        # Decorate plot
        title = ''
        for r_enz in r_enzs:
            lcnt = float(sum([lig_cnt[(r_enz1, r_enz2)] * (2 if r_enz1 == r_enz2 else 1)
                              for r_enz1 in r_enzs for r_enz2 in r_enzs
                              if r_enz1 == r_enz or r_enz2 == r_enz]))
            title += ('Percentage of digested sites (not considering Dangling-Ends) '
                      '%s: %.1f%%\n' % (r_enz,
                                        100. * float(lcnt) / (lcnt + sit_cnt[r_enz])))
        for r_enz in r_enzs:
            title += 'Percentage of dangling-ends %s: %.1f%%\n' % (r_enz, des[r_enz])

        for r_enz1 in r_enzs:
            for r_enz2 in r_enzs:
                title += ('Percentage of reads with ligation site (%s-%s): %.1f%% \n' %
                          (r_enz1, r_enz2, (ligep[(r_enz1, r_enz2)] * 100.) / nreads))
        plt.title(title.strip(), size=10, ha='left', x=0)
        plt.subplots_adjust(right=0.85)
        ax2.legend(lines, labels, bbox_to_anchor=(0.75, 1.0),
                   loc=3, borderaxespad=0., frameon=False, fontsize=9)
    plt.tight_layout()
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    for k in ligep:
        ligep[k] = (ligep[k] * 100.) / nreads
    return des, ligep
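
# Usage sketch (added for illustration, not part of the original example):
# check a hypothetical paired-end Hi-C FASTQ digested with MboI, reading only
# the first 100,000 reads; the file name, read count and output path are
# assumptions, and the enzyme name is assumed to be known to
# RESTRICTION_ENZYMES.
dangling_ends, ligated = quality_plot('hic_sample.fastq.gz', r_enz='MboI',
                                      nreads=100000, paired=True,
                                      savefig='hic_sample_quality.png')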
Ejemplo n.º 22
0
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              axe=None, ylim=None, savefig=None,
                              chr_names=None, nreads=None):
    """
    :param fnam: input file name
    :param True first_read: uses first read.
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param None chr_names: a list of chromosome names can be passed in case
       only some of them need to be plotted (this option may take even longer
       than the default)
    :param None ylim: tuple of lower and upper bounds for the y axis
    :param None nreads: maximum number of reads to consider (None for all)
    """

    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    genome_seq = OrderedDict()
    fhandler = open(fnam)
    line = fhandler.next()
    if chr_names:
        chr_names = set(chr_names)
        cond1 = lambda x: x not in chr_names
    else:
        cond1 = lambda x: False
    if nreads:
        cond2 = lambda x: x >= nreads
    else:
        cond2 = lambda x: False
    cond = lambda x, y: cond1(x) or cond2(y)  # skip unwanted chromosomes, stop after nreads
    count = 0
    while line.startswith('#'):
        if line.startswith('# CRM '):
            crm, clen = line[6:].split('\t')
            genome_seq[crm] = int(clen)
        line = fhandler.next()
    try:
        while True:
            crm, pos = line.strip().split('\t')[idx1:idx2]
            count += 1
            if cond(crm, count):
                line = fhandler.next()
                if cond2(count):
                    break
                continue
            pos = int(pos) / resolution
            try:
                distr[crm][pos] += 1
            except KeyError:
                try:
                    distr[crm][pos] = 1
                except KeyError:
                    distr[crm] = {pos: 1}
            line = fhandler.next()
    except StopIteration:
        pass
    fhandler.close()
    if not axe:
        _ = plt.figure(figsize=(15, 3 + 3 * len(distr.keys())))

    max_y = max([max(distr[c].values()) for c in distr])
    max_x = max([len(distr[c].values()) for c in distr])
    ncrms = len(genome_seq if genome_seq else distr)
    for i, crm in enumerate(chr_names if chr_names else genome_seq
                            if genome_seq else distr):
        plt.subplot(ncrms, 1, i + 1)
        try:
            plt.plot(range(max(distr[crm])),
                     [distr[crm].get(j, 0) for j in xrange(max(distr[crm]))],
                     color='red', lw=1.5, alpha=0.7)
        except KeyError:
            pass
        if ylim:
            plt.vlines(genome_seq[crm] / resolution, ylim[0], ylim[1])
        else:
            plt.vlines(genome_seq[crm] / resolution, 0, max_y)
        plt.xlim((0, max_x))
        plt.ylim(ylim or (0, max_y))
        plt.title(crm)

    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
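A minimal usage sketch for the function above; the file name and chromosome names are hypothetical and only illustrate the call signature:

# hypothetical TADbit-style TSV of mapped reads (with '# CRM' header lines)
plot_genomic_distribution('sample_reads.tsv', first_read=True,
                          resolution=50000,
                          chr_names=['chr1', 'chr2'],  # restrict to two chromosomes
                          savefig='genomic_distribution.png')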
Ejemplo n.º 23
0
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    _ = plt.figure(figsize=(15.,12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff, normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    try:
        minoridata   = np.nanmin(data)
        maxoridata   = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata   = np.min(vals)
        maxoridata   = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data[i]))]) # may not be square
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    posI = 0.01 if not clim else (float(clim[0]) / diff) if clim[0] != None else 0.01
    posF = 1.0  if not clim else (float(clim[1]) / diff) if clim[1] != None else 1.0
    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0,  0.0, 0.0)],
                 'green': [(0.0,  0.0, 0.0)],
                 'blue' : [(0.0,  0.5, 0.5)]}
        prev_pos  = 0
        median = (np.median(vals) - mindata) / diff
        for prc in np.linspace(posI, median, cuts / 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos  = pos
        for prc in np.linspace(median + 1. / cuts, posF, cuts / 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos  = pos
        pos = (np.percentile(vals, 97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap  = LinearSegmentedColormap(cmap, cdict)
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none',
               cmap=cmap, vmin=clim[0] if clim else None, vmax=clim[1] if clim else None)
    size1 = len(data)
    size2 = len(data[0])
    if size1 == size2:
        for i in xrange(size1):
            for j in xrange(i, size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
                    data[j][i] = 0
    else:
        for i in xrange(size1):
            for j in xrange(size2):
                if np.isnan(data[i][j]):
                    data[i][j] = 0
            #data[j][i] = data[i][j]
    try:
        evals, evect = eigh(data)
        sort_perm = evals.argsort()
        evect = evect[:, sort_perm]  # eigenvectors are stored as columns
    except Exception:
        evals, evect = None, None
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data),
                           np.nanmax(data), max(size1, size2))
    gradient = np.vstack((gradient, gradient))
    h  = ax2.hist(data, color='darkgrey', linewidth=2,
                  bins=20, histtype='step', normed=True)
    _  = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                    extent=(np.nanmin(data), np.nanmax(data) , 0, max(h[0])))
    if genome_seq:
        for crm in genome_seq:
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
        if not one:
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            ax1.set_yticks([float(vals[i]+vals[i+1])/2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    # totaloridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j in enumerate(str(totaloridata)[::-1])])[::-1].strip(',')
    # minoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j   in enumerate(str(minoridata)[::-1])])[::-1].strip(',')
    # maxoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j   in enumerate(str(maxoridata)[::-1])])[::-1].strip(',')
    plt.figtext(0.05,0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        ('' if np.isnan(cistrans) else
         ('Percentage of cis interactions: %.0f%%\n' % (cistrans*100))),
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim ((-0.5, size1 - .5))
    ax1.set_ylim ((-0.5, size2 - .5))
    ax2.set_xlabel('log interaction count')
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d*100))/100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))
    try: 
        ax4.vlines(range(size1), 0, evect[:,-1], color='k')
    except (TypeError, IndexError):
        pass
    ax4.hlines(0, 0, size2, color='red')
    ax4.set_ylabel('E1')
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size1), 0, evect[:,-2], color='k')
    except (TypeError, IndexError):
        pass
    ax5.hlines(0, 0, size2, color='red')
    ax5.set_ylabel('E2')
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size1), 0, evect[:,-3], color='k')
    except (TypeError, IndexError):
        pass
    ax6.hlines(0, 0, size2, color='red')
    ax6.set_ylabel('E3')
    ax6.set_yticklabels([])
    xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close('all')
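A sketch of how draw_map could be called on a small symmetric matrix; the toy data and the cumcs bin ranges are made up here, and cistrans is passed as NaN so the cis-percentage line of the legend is simply skipped:

import numpy as np
from collections import OrderedDict

toy = np.abs(np.random.randn(20, 20)) + 1.
toy = ((toy + toy.T) / 2).tolist()           # symmetric matrix of pseudo-counts
genome_seq = OrderedDict([('chrA', 12), ('chrB', 8)])
cumcs = {'chrA': (0, 12), 'chrB': (12, 20)}  # cumulative (start, end) bins per chromosome
draw_map(toy, genome_seq, cumcs, savefig='toy_map.png', show=False,
         name='toy example', cistrans=float('nan'))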
Ejemplo n.º 24
0
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6,
                           savefig=None, show=False, savedata=None, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first
    eigenvectors, with Pearson correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to save the correlation matrix
    :param kwargs: any argument to pass to matplotlib imshow function

    :returns: matrix of correlations
    """
    data1 = hic_data1.get_matrix()
    data2 = hic_data2.get_matrix()
    # get the log
    size = len(data1)
    data1 = nozero_log(data1, np.log2)
    data2 = nozero_log(data2, np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    corr = [[0 for _ in xrange(nvect)] for _ in xrange(nvect)]
    # sort eigenvectors according to their eigenvalues => first is last!!
    sort_perm = ev1.argsort()
    ev1.sort()
    evect1 = evect1[:, sort_perm]  # eigenvectors are stored as columns
    sort_perm = ev2.argsort()
    ev2.sort()
    evect2 = evect2[:, sort_perm]
    # calculate Pearson correlation
    for i in xrange(nvect):
        for j in xrange(nvect):
            corr[i][j] = abs(pearsonr(evect1[:,-i-1],
                                      evect2[:,-j-1])[0])
    # plot
    if show or savefig:
        axe    = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest", origin='lower', **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)
        
        cbar = plt.colorbar(im, cax = cbaxes )
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)
        
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)
        
        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        out = open(savedata, 'w')
        out.write('# ' + '\t'.join(['Eigen Vector %s'% i
                                    for i in xrange(nvect)]) + '\n')
        for i in xrange(nvect):
            out.write('\t'.join([str(corr[i][j])
                                 for j in xrange(nvect)]) + '\n')
        out.close()

    return corr
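A usage sketch, assuming hic_data1 and hic_data2 are HiC_data objects loaded elsewhere (hypothetical names):

corr = eig_correlate_matrices(hic_data1, hic_data2, nvect=4,
                              savefig='eigenvector_correlation.png',
                              savedata='eigenvector_correlation.tsv')
print(corr[0][0])  # correlation between the two first eigenvectors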
Ejemplo n.º 25
0
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 show=False, xlog=False, stats=('median', 'perc_max')):
    """
    Plots the distribution of dangling-ends lengths

    :param fnam: input file name
    :param None savefig: path where to store the output images.
    :param None nreads: number of read pairs to process (None means all)
    :param 99.9 max_size: top percentage of distances to consider; within the
       top 0.1% very long outliers are usually found.
    :param False xlog: represent x axis in logarithmic scale
    :param ('median', 'perc_max') stats: set of values, calculated from the
       distribution of insert/fragment sizes, to be returned. Possible values are:
        - 'median' median of the distribution
        - 'perc_max' percentile defined by the max_size parameter
        - 'first_decay' starting from the median of the distribution, the
            first window where 10 consecutive insert sizes are each counted
            fewer times than a given value (this value is equal to the number
            of insert sizes divided by 100 000)
        - 'MAD' Double Median Adjusted Deviation

    :returns: the values requested in stats (by default the median and the
       percentile given by max_size).
    """
    distr = {}
    genome_seq = OrderedDict()
    fhandler = open(fnam)
    line = fhandler.next()
    while line.startswith('#'):
        if line.startswith('# CRM '):
            crm, clen = line[6:].split()
            genome_seq[crm] = int(clen)
        line = fhandler.next()
    des = []
    if nreads:
        nreads /= 2
    try:
        while True:
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
            if re1==re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break
            line = fhandler.next()
    except StopIteration:
        pass
    fhandler.close()
    max_perc = np.percentile(des, max_size)
    perc99   = np.percentile(des, 99)
    perc01   = np.percentile(des, 1)
    perc50   = np.percentile(des, 50)
    perc95   = np.percentile(des, 95)
    perc05   = np.percentile(des, 5)
    to_return = {'median': perc50}
    cutoff = len(des) / 100000.
    count  = 0
    for v in xrange(int(perc50), int(max(des))):
        if des.count(v) < cutoff:
            count += 1
        else:
            count = 0
        if count >= 10:
            to_return['first_decay'] = v - 10
            break
    else:
        raise Exception('ERROR: first_decay not found')
    to_return['perc_max'] = max_perc
    to_return['MAD'] = mad(des)
    if not savefig and not axe and not show:
        return [to_return[k] for k in stats]
    
    ax = setup_plot(axe, figsize=(10, 5.5))
    desapan = ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
                         label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    desapan = ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
                         label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    deshist = ax.hist(des, bins=100, range=(0, max_perc),
                      alpha=.7, color='darkred', label='Dangling-ends')
    ylims   = ax.get_ylim()
    plots   = []
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-ends ' +
                 'lengths\n(median: %s, top %.1f%%, up to %.0f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show and not axe:
        plt.show()
    plt.close('all')
    return [to_return[k] for k in stats]
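A usage sketch; the TSV path is hypothetical and nreads only limits how many read pairs are parsed:

median_size, size_perc_max = insert_sizes('sample_reads.tsv', nreads=1000000,
                                          savefig='dangling_end_sizes.png',
                                          stats=('median', 'perc_max'))
print('median fragment size: %s' % median_size)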
Ejemplo n.º 26
0
def mmp_score(matrix, nrand=10, verbose=False, savefig=None):
    """
    :param matrix: list of lists
    :param 10 nrand: number of randomizations
    :param None savefig: path where to save figure

    :returns: 1- MMP score which ranges from 0 (bad) to 1 (good), and 2- the
       expected correlation of the contact matrices of the modeled chromatin
       with the original Hi-C data (plus the 3- lower and 4- upper values
       expected in 95% of the cases)
    """
    data = np.array([np.array([v for v in l]) for l in matrix])

    if verbose:
        sys.stdout.write('  - getting EigenVectors\n')
    egval, _ = np.linalg.eigh(data)
    # sort eigenvalues/vectors
    idx = (-egval).argsort()
    egval = egval[idx]

    regvals = []

    if verbose:
        sys.stdout.write('  - randomization\n')
    for i in xrange(int(nrand)):
        if verbose:
            sys.stdout.write('\r    ' + str(i + 1) + ' / ' + str(nrand))
            sys.stdout.flush()
        regval, _ = np.linalg.eigh(randomize_matrix(data))
        regval = [abs(j) for j in regval]
        regval.sort(reverse=True)
        regvals.append(regval)
    if verbose:
        sys.stdout.write('\n')
    regvals = zip(*regvals)
    rvmean = []
    for rv in regvals:
        rvmean.append(np.mean(rv))
    total = sum(rvmean) / 100
    rvmean = [i / total for i in rvmean]

    err = []
    for rv in regvals:
        rvstd = np.std(rv / total)
        err.append(2 * rvstd)

    zdata = sorted(
        np.log2([
            data[i][j] for i in xrange(len(data))
            for j in xrange(i, len(data)) if data[i][j]
        ]))
    skewness = skew(zdata)
    kurtness = kurtosis(zdata)

    if savefig:
        _ = plt.figure(figsize=(14, 8))
        gs = gridspec.GridSpec(7, 5, wspace=0.5, hspace=1.5)
        ax1 = plt.subplot(gs[:, 0:3])
        ax2 = plt.subplot(gs[1:5, 3:])
        ax3 = plt.subplot(gs[5:7, 3:])
        img = ax2.imshow(np.log2(data), interpolation='none')
        plt.colorbar(img, ax=ax2)

        if savefig:
            ax2.set_title('Original matrix', size=12)
            ax2.tick_params(axis='both', which='major', labelsize=10)
        ax2.set_xlabel('Bin')
        ax2.set_ylabel('Bin')

        normfit = sc_norm.pdf(zdata, np.mean(zdata), np.std(zdata))
        _ = ax3.plot(zdata,
                     normfit,
                     ':o',
                     color='grey',
                     ms=3,
                     alpha=.4,
                     markersize=.5)
        ax3.tick_params(axis='both', which='major', labelsize=10)

        ax3.hist(zdata, bins=20, normed=True, alpha=0.7, color='r')
        ax3.set_xlabel('Z-score')
        ax3.set_ylabel('Frequency')
        rcParams['xtick.direction'] = 'out'
        rcParams['ytick.direction'] = 'out'
        rcParams['axes.axisbelow'] = True
        rcParams['xtick.direction'] = 'out'
        rcParams['ytick.direction'] = 'out'
        rcParams['axes.axisbelow'] = True
        rcParams['axes.grid'] = True
        rcParams['grid.color'] = 'w'
        rcParams['grid.linestyle'] = '-'
        rcParams['grid.linewidth'] = 2
        # rcParams['grid.alpha']      = .3
        ax1.minorticks_on()
        ax1.grid(ls='-', color='w', alpha=.3, lw=2, which='major')
        ax1.grid(ls='-', b=True, color='w', alpha=.3, lw=1, which='minor')
        ax1.spines['top'].set_color('none')
        ax1.spines['right'].set_color('none')
        ax1.spines['bottom'].set_color('none')
        ax1.spines['left'].set_color('none')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_ticks_position('left')
        ax1.set_xscale('log')
        ax1.set_axis_bgcolor((.9, .9, .9))

        ax1.errorbar(range(1, 1 + len(rvmean)),
                     rvmean,
                     yerr=err,
                     ecolor='red',
                     color='orange',
                     lw=2,
                     label='%s randomizations' % (nrand))

    total = sum(abs(egval)) / 100
    egval = np.array(sorted([e / total for e in abs(egval)], reverse=True))

    for i in xrange(len(rvmean)):
        if rvmean[i] + err[i] > egval[i]:
            break
    signifidx = i

    size = len(data)

    sev = sum(egval[:signifidx] - rvmean[:signifidx])

    if savefig:
        ax1.plot(range(1, 1 + len(rvmean)),
                 egval,
                 color='green',
                 lw=2,
                 label='Observed data')

        ax1.fill_between(range(1, 1 + len(rvmean)),
                         rvmean,
                         egval,
                         where=(np.array(rvmean) + np.array(err)) < egval,
                         facecolor='green',
                         interpolate=True,
                         alpha=0.2)
        ax1.fill_between(range(1, 1 + len(rvmean)),
                         rvmean,
                         egval,
                         where=(np.array(rvmean) + np.array(err)) > egval,
                         facecolor='red',
                         interpolate=True,
                         alpha=0.2)
        ax1.set_xlim((0, len(rvmean)))
        ax1.set_ylim((0, max(max(rvmean), max(egval))))
        ax1.legend(frameon=False, loc='upper right', prop={'size': 10})
        ax1.set_xlabel('Log indexes of Eigenvalues')
        ax1.set_ylabel('Eigenvalues (percentage of total)')
        #plt.subplots_adjust(right=0.6)

        #img = Image.open(opts.outdir + '/matrix_small.png')
        #fig.figimage(img, 640, -160)

    minv = float(min([i for d in data for i in d if i])) / 2
    if minv == 0.5:
        minv = 1. / (len(data)**2)

    mmp = -0.0002 * size + 0.0335 * skewness - 0.0229 * kurtness + 0.0069 * sev + 0.8126

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('\n                       Results\n')
        sys.stdout.write('                       -------\n\n')

    if verbose:
        sys.stdout.write('                  MMP score: %.4f\n\n' % mmp)

    ex_a1, ex_b1 = [0.6975926, 0.2548171]
    supa1, supb1 = [0.69300732000423904, 0.29858572176099613]
    lowa1, lowb1 = [0.70217788900976075, 0.211048473299004]

    scc = (mmp - ex_b1) / ex_a1
    scc_up1 = (mmp - supb1) / supa1
    scc_lw1 = (mmp - lowb1) / lowa1

    if verbose:
        sys.stdout.write(('  predicted dSCC is %.3f (%.3f-%.3f '
                          '68%% confidence)\n') % (scc, scc_up1, scc_lw1))

    supa75, supb75 = [0.69230778430383244, 0.30526310790548261]
    lowa75, lowb75 = [0.70287742471016734, 0.20437108715451746]

    scc_up75 = (mmp - supb75) / supa75
    scc_lw75 = (mmp - lowb75) / lowa75

    if verbose:
        sys.stdout.write(('                        (%.3f-%.3f '
                          '75%% confidence)\n') % (scc_up75, scc_lw75))

    supa2, supb2 = [0.68855373600821357, 0.34109720480765293]
    lowa2, lowb2 = [0.70663147300578644, 0.16853699025234709]

    scc_up2 = (mmp - supb2) / supa2
    scc_lw2 = (mmp - lowb2) / lowa2
    if verbose:
        sys.stdout.write(('                        (%.3f-%.3f '
                          '95%% confidence)\n') % (scc_up2, scc_lw2))

    if savefig:
        # write the log
        log = ''
        log += '    1- Matrix size (number of eigenvalues): %s\n' % (
            len(egval))
        log += "    2- Skewness of the distribution: %0.3f\n" % (skewness)
        log += "    3- Kurtosis of the distribution: %0.3f\n" % (kurtness)
        log += "    4- Sum of differences signif EV real-rand: %0.3f\n\n" % (
            sev)
        plt.figtext(0.62, 0.77, log, size='small')
        log = "MMP score: %.3f\n" % (mmp)
        log += "Predicted dSCC: %.3f (%.3f-%.3f at 95%% conf)\n" % (
            scc, scc_up2, scc_lw2)
        plt.figtext(0.61, 0.87, log, size=12)
        tadbit_savefig(savefig)
        plt.close('all')

    return mmp, scc, scc_up2, scc_lw2
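A sketch of calling mmp_score on a toy symmetric matrix (random values, only to illustrate the expected input shape); randomize_matrix and the plotting helpers are assumed to be available in the same module:

import numpy as np

toy = np.abs(np.random.randn(50, 50)) + 1.
toy = (toy + toy.T).tolist()                 # square, symmetric list of lists
mmp, scc, scc_up, scc_lw = mmp_score(toy, nrand=10, verbose=True,
                                     savefig='mmp_score.png')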
Ejemplo n.º 27
0
def correlate_matrices(hic_data1, hic_data2, max_dist=10, intra=False, axe=None,
                       savefig=None, show=False, savedata=None,
                       normalized=False, remove_bad_columns=True, **kwargs):
    """
    Compare the interactions of two Hi-C matrices at a given distance,
    with Spearman rank correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 10 max_dist: maximum distance from diagonal (e.g. 10 means we will
       not look further than 10 times the resolution)
    :param False intra: only takes into account intra-chromosomal contacts
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param None savedata: path to a file where to save the correlation values
    :param False normalized: use normalized data
    :param True remove_bad_columns: computes the union of bad columns between
       samples and excludes them from the comparison

    :returns: list of correlations and list of genomic distances
    """
    corrs = []
    dists = []

    if normalized:
        get_the_guy1 = lambda i, j: (hic_data1[j, i] / hic_data1.bias[i] /
                                     hic_data1.bias[j])
        get_the_guy2 = lambda i, j: (hic_data2[j, i] / hic_data2.bias[i] /
                                     hic_data2.bias[j])
    else:
        get_the_guy1 = lambda i, j: hic_data1[j, i]
        get_the_guy2 = lambda i, j: hic_data2[j, i]
    
    bads = {}
    if remove_bad_columns:
        # union of bad columns
        bads = hic_data1.bads.copy()
        bads.update(hic_data2.bads)

    if (intra and hic_data1.sections and hic_data2.sections and 
        hic_data1.sections == hic_data2.sections):
        for dist in xrange(1, max_dist + 1):
            diag1 = []
            diag2 = []
            for crm in hic_data1.section_pos:
                for j in xrange(hic_data1.section_pos[crm][0],
                                hic_data1.section_pos[crm][1] - dist):
                    i = j + dist
                    if j in bads or i in bads:
                        continue
                    diag1.append(get_the_guy1(i, j))
                    diag2.append(get_the_guy2(i, j))
            corrs.append(spearmanr(diag1, diag2)[0])
            dists.append(dist)
    else:
        if intra:
            warn('WARNING: hic_data does not contain chromosome coordinates, ' +
                 'intra set to False')
        for dist in xrange(1, max_dist + 1):
            diag1 = []
            diag2 = []
            for j in xrange(len(hic_data1) - dist):
                i = j + dist
                if j in bads or i in bads:
                    continue
                diag1.append(get_the_guy1(i, j))
                diag2.append(get_the_guy2(i, j))
            corrs.append(spearmanr(diag1, diag2)[0])
            dists.append(dist)
    if show or savefig or axe:
        if not axe:
            fig = plt.figure()
            axe = fig.add_subplot(111)
            given_axe = False
        else:
            given_axe = True
        axe.plot(dists, corrs, color='orange', linewidth=3, alpha=.8)
        axe.set_xlabel('Genomic distance in bins')
        axe.set_ylabel('Spearman rank correlation')
        axe.set_xlim((0, dists[-1]))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        if not given_axe:
            plt.close('all')
    if savedata:
        out = open(savedata, 'w')
        out.write('# genomic distance\tSpearman rank correlation\n')
        for i in xrange(len(corrs)):
            out.write('%s\t%s\n' % (dists[i], corrs[i]))
        out.close()
    if kwargs.get('get_bads', False):
        return corrs, dists, bads
    else:
        return corrs, dists
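A usage sketch comparing two replicates (hypothetical HiC_data objects), writing both the plot and the per-distance correlation values:

corrs, dists = correlate_matrices(hic_data1, hic_data2, max_dist=20,
                                  intra=True, normalized=True,
                                  savefig='decay_correlation.png',
                                  savedata='decay_correlation.tsv')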
Ejemplo n.º 28
0
def filter_by_zero_count(matrx, draw_hist=False, savefig=None):
    """
    Fits the distribution of Hi-C interaction counts by column in the matrix to
    a polynomial, then searches for the first root of its derivative (below the
    median of the distribution) and uses it as a cutoff: columns with fewer
    interactions than this cutoff are labelled as bad.
    """
    nbins = 100
    # get sum of columns
    cols = []
    for c in sorted(matrx, key=sum):
        cols.append(len(c) - c.count(0))
    cols = np.array(cols)
    if draw_hist:
        plt.figure(figsize=(9, 9))
    median = np.median(cols)
    # mad = np.median([abs(median - c ) for c in cols])
    best = (None, None, None, None, None)
    # bin the sum of columns
    xmin = min(cols)
    xmax = max(cols)
    y = np.linspace(xmin, xmax, nbins)
    hist = np.digitize(cols, y)
    x = [sum(hist == i) for i in range(1, nbins + 1)]
    if draw_hist:
        hist = plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, cols[-1])
    # check if the binning is correct:
    # we want at least a third of the bins to contain some data
    while list(x).count(0) > 2*len(x)/3:
        cols = cols[:-1]
        xmin = min(cols)
        xmax = max(cols)
        y = np.linspace(xmin, xmax, nbins)
        hist = np.digitize(cols, y)
        x = [sum(hist == i) for i in range(1, nbins + 1)]
        if draw_hist:
            plt.clf()
            hist = plt.hist(cols, bins=100, alpha=.3, color='grey')
        xp = range(0, cols[-1])
    # find best polynomial fit in a given range
    for order in range(7, 14):
        z = np.polyfit(y, x, order)
        zpp = np.polyder(z, m=1)
        roots = np.roots(np.polyder(z))
        # check that we are concave down, otherwise take next root
        pente = np.polyval(zpp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
        if pente > 0:
            root = roots[-1]
        else:
            root = roots[-2]
        # root must be higher than zero
        if root <= 0:
            continue
        # and lower than the median
        if root >= median:
            continue
        p  = np.poly1d(z)
        R2 = get_r2(p, x, y)
        if best[0] < R2:
            best = (R2, order, p, z, root)
    p, z, root = best[2:]
    if draw_hist:
        a = plt.plot(xp, p(xp), "--", color='k')
        b = plt.vlines(root, 0, plt.ylim()[1], colors='r', linestyles='dashed')
        try:
            plt.legend(a + [b], ['polyfit \n%s' % (
                ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                             '$%s%.1fx^%s$' % ('+' if j>0 else '', j,
                                               '{' + str(i) + '}'))
                         for i, j in enumerate(list(p)[::-1])])),
                                   'first solution of polynomial derivation'],
                       fontsize='x-small')
        except TypeError:
            plt.legend(a + [b], ['polyfit \n%s' % (
                ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                             '$%s%.1fx^%s$' % ('+' if j>0 else '', j,
                                               '{' + str(i) + '}'))
                         for i, j in enumerate(list(p)[::-1])])),
                                   'first solution of polynomial derivation'])
        plt.ylim(0, plt.ylim()[1])
        if savefig:
            tadbit_savefig(savefig)
        else:
            plt.show()
    # label as bad the columns with sums lower than the root
    bads = {}
    for i, col in enumerate(matrx):
        if sum(col) < root:
            bads[i] = None
    # now stored in Experiment._zeros, used for getting more accurate z-scores
    return bads
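A usage sketch; here matrix is assumed to be a plain list of lists of raw interaction counts (one inner list per column):

bad_columns = filter_by_zero_count(matrix, draw_hist=True,
                                   savefig='zero_count_filter.png')
print('%d columns flagged as bad' % len(bad_columns))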
Ejemplo n.º 29
0
    def visualize(self, names=None, tad=None, focus=None, paint_tads=False,
                  axe=None, show=True, logarithm=True, normalized=False,
                  relative=True, decorate=True, savefig=None, clim=None,
                  scale=(8, 6), cmap='jet'):
        """
        Visualize the matrix of Hi-C interactions of a given experiment

        :param None names: name of the experiment to visualize, or list of
           experiment names. If None, all experiments will be shown
        :param None tad: a given TAD in the form:
           ::

             {'start': start,
              'end'  : end,
              'brk'  : end,
              'score': score}

           **Alternatively** a list of TADs can be passed (all the TADs
           between the first and the last one passed will be shown; thus,
           passing more than two TADs is superfluous)
        :param None focus: a tuple with the start and end positions of the
           region to visualize
        :param False paint_tads: draw a box around the TADs defined for this
           experiment
        :param None axe: an axe object from matplotlib can be passed in order
           to customize the picture
        :param True show: either to pop-up matplotlib image or not
        :param True logarithm: show the logarithm values
        :param True normalized: show the normalized data (weights might have
           been calculated previously). *Note: white rows/columns may appear in
           the matrix displayed; these rows correspond to filtered rows (see*
           :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
        :param True relative: color scale is relative to the whole matrix of
           data, not only to the region displayed
        :param True decorate: draws color bar, title and axes labels
        :param None savefig: path to a file where to save the image generated;
           if None, the image will be shown using matplotlib GUI (the extension
           of the file name will determine the desired format).
        :param None clim: tuple with minimum and maximum value range for color
           scale. I.e. clim=(-4, 10)
        :param 'jet' cmap: color map from matplotlib. Can also be a
           preconfigured cmap object.
        """
        if names is None:
            names = [xpr.name for xpr in self.experiments]
        if not isinstance(names, list) and not isinstance(names, tuple):
            names = [names]
            cols = 1
            rows = 1
        else:
            sqrtxpr = sqrt(len(names))
            cols = int(round(sqrtxpr + (0.0 if int(sqrtxpr)==sqrtxpr else .5)))
            rows = int(sqrtxpr+.5)
        notaxe = axe is None
        if not scale:
            scale = (8, 6)
        if notaxe and len(names) != 1:
            fig = plt.figure(figsize=(scale[0] * cols, scale[1] * rows))
        for i in range(rows):
            for j in range(cols):
                if i * cols + j >= len(names):
                    break
                if notaxe and len(names) != 1:
                    axe = fig.add_subplot(
                        rows, cols, i * cols + j + 1)
                if (isinstance(names[i * cols + j], tuple) or
                    isinstance(names[i * cols + j], list)):
                    if not axe:
                        fig = plt.figure(figsize=(scale[0] * cols, scale[1] * rows))
                        axe = fig.add_subplot(
                            rows, cols, i * cols + j + 1)
                    xpr1 = self.get_experiment(names[i * cols + j][0])
                    xpr2 = self.get_experiment(names[i * cols + j][1])
                    img = xpr1.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                    axe=axe, show=False, logarithm=logarithm,
                                    normalized=normalized, relative=relative,
                                    decorate=decorate, savefig=False,
                                    where='up', clim=clim, cmap=cmap)
                    img = xpr2.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                    axe=axe, show=False, logarithm=logarithm,
                                    normalized=normalized, relative=relative,
                                    decorate=False, savefig=False, where='down',
                                    clim=clim or img.get_clim(), cmap=cmap)
                    #axe = axe.twinx()
                    #axe.set_aspect('equal',adjustable='box-forced',anchor='NE')
                    if decorate:
                        plt.text(1.01, .5,
                                 'Chromosome %s experiment %s' % (
                                     self.name, xpr2.name),
                                  rotation=-90, va='center', size='large',
                                  ha='left', transform=axe.transAxes)
                else:
                    xper = self.get_experiment(names[i * cols + j])
                    if not xper.hic_data and not xper.norm:
                        continue
                    xper.view(tad=tad, focus=focus, paint_tads=paint_tads,
                              axe=axe, show=False, logarithm=logarithm,
                              normalized=normalized, relative=relative,
                              decorate=decorate, savefig=False, clim=clim,
                              cmap=cmap)
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
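A usage sketch, assuming crm is a Chromosome-like object holding the experiments referred to by name (the names here are hypothetical):

crm.visualize(names=['rep1', 'rep2'], paint_tads=True, logarithm=True,
              savefig='hic_matrices.png', show=False)
# passing a tuple shows two experiments in one panel (upper/lower triangles)
crm.visualize(names=[('rep1', 'rep2')], savefig='rep1_vs_rep2.png', show=False)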
Ejemplo n.º 30
0
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    :param data: input file name, or HiC_data object, or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param None resolution: group reads that are closer than this resolution
       parameter (defaults to 1)
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).

    :returns: slope, intercept and R square of each of the 3 linear fits
    """
    resolution = resolution or 1
    dist_intr = dict([(i, 0) for i in xrange(min_diff, max_diff)])
    if isinstance(data, str):
        fhandler = open(data)
        line = fhandler.next()
        while line.startswith('#'):
            line = fhandler.next()
        try:
            while True:
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                if cr1 != cr2:
                    line = fhandler.next()
                    continue
                diff = abs(int(ps1)  / resolution - int(ps2) / resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
                line = fhandler.next()
        except StopIteration:
            pass
        fhandler.close()
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                    (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                    (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig=plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count... reduce max_dist
    for diff in xrange(max_diff - 1, min_diff, -1):
        try:
            if not dist_intr[diff]:
                del(dist_intr[diff])
                max_diff -=1
                continue
        except KeyError:
            max_diff -=1
            continue
        break
    xp, yp = zip(*sorted(dist_intr.items(), key=lambda x:x[0]))
    x = []
    y = []
    for k in xrange(len(xp)):
        if yp[k]:
            x.append(xp[k])
            y.append(yp[k])
    axe.plot(x, y, 'k.')
    best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
    logx = np.log(x)
    logy = np.log(y)
    ntries = 100
    # set k for better fit
    # for k in xrange(1, ntries/5, ntries/5/5):
    if resolution == 1:
        k = 1
        for i in xrange(3, ntries-2-k):
            v1 = i * len(x) / ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[ :v1], logy[ :v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in xrange(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) / ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1+k:v2], logy[v1+k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2+k:  ], logy[v2+k: ])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3,
                            b1, b2, b3, k)
        # plot line of best fit
        (v1, v2, 
         a1, a2, a3,
         b1, b2, b3, k) = best[1:]
        yfit1 = lambda xx: np.exp(b1 + a1*np.array (np.log(xx)))
        yfit2 = lambda xx: np.exp(b2 + a2*np.array (np.log(xx)))
        yfit3 = lambda xx: np.exp(b3 + a3*np.array (np.log(xx)))
        axe.plot(x[  :v1], yfit1(x[  :v1] ), color= 'yellow', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0-0.7 \mathrm{ Mb}' if resolution != 1 else '1', a1))
                 #label = r'$\alpha_1=%.2f$ (0-%d)' % (a1, x[v1]))
        axe.plot(x[v1+k:v2], yfit2(x[v1+k:v2]),  color= 'orange', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0.7-10 \mathrm{ Mb}' if resolution != 1 else '2', a2))
                 # label = r'$\alpha_2=%.2f$ (%d-%d)' % (a2, x[v1], x[v2]))
        axe.plot(x[v2+k:  ], yfit3(x[v2+k:  ] ), color= 'red'   , lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '10 \mathrm{ Mb}-\infty' if resolution != 1 else '3', a3))
                 # label = r'$\alpha_3=%.2f$ (%d-$\infty$)' % (a3, x[v2+k]))
    else:
        # from 0.7 Mb
        v1 = 700000   / resolution
        # to 10 Mb
        v2 = 10000000 / resolution
        try:
            a1, b1, r21, _, _ = linregress(logx[  :v1], logy[  :v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:  ], logy[v2:  ])
        except ValueError:
            a3, b3, r23 = 0, 0, 0
        yfit1 = lambda xx: np.exp(b1 + a1*np.array (np.log(xx)))
        yfit2 = lambda xx: np.exp(b2 + a2*np.array (np.log(xx)))
        yfit3 = lambda xx: np.exp(b3 + a3*np.array (np.log(xx)))
        axe.plot(x[  :v1], yfit1(x[  :v1] ), color= 'yellow', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0-0.7 \mathrm{ Mb}' if resolution != 1 else '1', a1))
                 #label = r'$\alpha_1=%.2f$ (0-%d)' % (a1, x[v1]))
        axe.plot(x[v1:v2], yfit2(x[v1:v2]),  color= 'orange', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0.7-10 \mathrm{ Mb}' if resolution != 1 else '2', a2))
                 # label = r'$\alpha_2=%.2f$ (%d-%d)' % (a2, x[v1], x[v2]))
        axe.plot(x[v2:  ], yfit3(x[v2:  ] ), color= 'red'   , lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '10 \mathrm{ Mb}-\infty' if resolution != 1 else '3', a3))
                 # label = r'$\alpha_3=%.2f$ (%d-$\infty$)' % (a3, x[v2+k]))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show==True:
        plt.show()
        plt.close('all')
    return (a1, b1, r21), (a2, b2, r22), (a3, b3, r23)
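A usage sketch on a normalized HiC_data object binned at 100 kb (hypothetical name); the three returned tuples describe the linear fits over the short-, mid- and long-range regions:

(a1, b1, r21), (a2, b2, r22), (a3, b3, r23) = plot_distance_vs_interactions(
    hic_data, min_diff=1, max_diff=500, resolution=100000,
    normalized=True, savefig='distance_decay.png')
print('slope between 0.7 and 10 Mb: %.2f' % a2)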
Ejemplo n.º 31
0
def filter_by_cis_percentage(cisprc,
                             beg=0.3,
                             end=0.8,
                             sigma=2,
                             verbose=False,
                             size=None,
                             min_perc=None,
                             max_perc=None,
                             savefig=None):
    """
    Define artifactual columns with either too low or too high counts of
    interactions by comparing their percentage of cis (intra-chromosomal)
    interactions.

    :param cisprc: dictionary with counts of cis interactions by bin number.
       Values of the dictionary are tuples with, as first element, the number
       of cis interactions and, as second element, the total number of
       interactions.
    :param 0.3 beg: proportion of bins to be considered as possibly having low
       counts
    :param 0.8 end: proportion of bins to be considered as possibly having high
       counts
    :param 2 sigma: number of standard deviations used to define lower and upper
       ranges in the variation of the percentage of cis interactions
    :param None size: size of the genome, in number of bins (otherwise inferred
       from the cisprc dictionary)
    :param None min_perc: if set, percentage of bins used as the lower cutoff
       (also serves as fallback if the automatic cutoff search fails)
    :param None max_perc: if set, percentage of bins used as the upper cutoff
       (also serves as fallback if the automatic cutoff search fails)
    :param None savefig: path to save an image of the distribution of cis
       percentages and total counts by bin.

    :returns: dictionary of bins to be filtered out (with either too low or too
       high counts of interactions).
    """
    sorted_sum, indices = list(zip(*sorted((cisprc[i][1], i) for i in cisprc)))

    sorted_prc = [float(cisprc[i][0]) / cisprc[i][1] for i in indices]

    size = (max(indices) + 1) if not size else size

    win_size = _best_window_size(sorted_prc, size, beg, end, verbose=verbose)

    # define confidence bands, compute median plus/minus one standard deviation
    errors_pos = []
    errors_neg = []
    for k in range(0, size, 1):
        vals = sorted_prc[k:k + win_size]
        std = np.std(vals)
        med = np.median(vals)
        errors_pos.append(med + std * sigma)
        errors_neg.append(med - std * sigma)

    # calculate median and variation of median plus/minus one standard deviation
    # for values between percentile 10 and 90 of the distribution of the
    # percentage of cis interactions
    #  - for median plus one standard deviation
    std_err_pos = np.std(errors_pos[int(size * beg):int(size * end)])
    med_err_pos = np.median(errors_pos[int(size * beg):int(size * end)])
    #  - for median minus one standard deviation
    std_err_neg = np.std(errors_neg[int(size * beg):int(size * end)])
    med_err_neg = np.median(errors_neg[int(size * beg):int(size * end)])

    # define cutoffs, values of cis percentage plus 1 stddev should be between
    # the general median +/- 2 stddev of the distribution of the cis percentage
    # plus 1 stddev. Same on the side of median cis percentage minus 1 stddev
    beg_pos = med_err_pos - std_err_pos * sigma
    end_pos = med_err_pos + std_err_pos * sigma
    beg_neg = med_err_neg - std_err_neg * sigma
    end_neg = med_err_neg + std_err_neg * sigma

    cutoffL = None
    passed = 0
    consecutive = 10
    for cutoffL, (p, n) in enumerate(zip(errors_pos, errors_neg)):
        # print '%6.4f %6.4f %6.4f %6.4f %6.4f %6.4f' % (beg_pos, p, end_pos, beg_neg, n, end_neg)
        if (beg_pos < p < end_pos) and (beg_neg < n < end_neg):
            if passed >= consecutive:
                break
            passed += 1
        else:
            passed = 0
    else:
        if min_perc is None:
            raise Exception('ERROR: left cutoff not found!!!\n'
                            '  define it by hand with min_perc')
        else:
            cutoffL = min_perc / 100. * size + consecutive
    cutoffL -= consecutive  # rescale, we asked for XX consecutive

    # right
    cutoffR = None
    passed = 0
    for cutoffR, (p, n) in enumerate(list(zip(errors_pos, errors_neg))[::-1]):
        cutoffR = size - cutoffR
        # print '%6.4f %6.4f %6.4f %6.4f %6.4f %6.4f' % (beg_pos, p, end_pos, beg_neg, n, end_neg)
        if (beg_pos < p < end_pos) and (beg_neg < n < end_neg):
            if passed >= consecutive:
                break
            passed += 1
        else:
            passed = 0
    else:
        if max_perc is None:
            raise Exception('ERROR: right cutoff not found!!!\n'
                            '  define it by hand with max_perc')
        else:
            cutoffR = max_perc / 100. * size - consecutive
    cutoffR += consecutive  # rescale, we asked for XX consecutive

    if min_perc:
        cutoffL = min_perc / 100. * size
    if max_perc:
        cutoffR = max_perc / 100. * size

    min_count = sorted_sum[int(cutoffL)]
    try:
        max_count = sorted_sum[int(cutoffR)]
    except IndexError:  # all good
        max_count = sorted_sum[-1] + 1

    if verbose:
        print('        * Lower cutoff applied until bin number: %d' %
              (cutoffL))
        print(
            '        * too few interactions defined as less than %9d interactions'
            % (min_count))
        print('        * Upper cutoff applied until bin number: %d' %
              (cutoffR))
        print(
            '        * too many interactions defined as more than %9d interactions'
            % (max_count))

    # plot
    if savefig:
        if verbose:
            print('      -> Making plot...')
        fig = plt.figure(figsize=(20, 11))
        ax1 = fig.add_subplot(111)
        plt.subplots_adjust(left=0.25, bottom=0.2)
        line1 = ax1.plot([
            float(cisprc.get(i, [0, 0])[0]) / cisprc.get(i, [1, 1])[1]
            for i in indices
        ],
                         '.',
                         color='grey',
                         alpha=0.2,
                         label='cis interactions ratio by bin',
                         zorder=1)
        line2 = ax1.plot(list(range(0, len(indices), 20)), [
            sum(
                float(cisprc.get(j, [0, 0])[0]) / cisprc.get(j, [1, 1])[1]
                for j in indices[k:k + win_size]) / win_size
            for k in range(0, len(indices), 20)
        ],
                         '.',
                         color='k',
                         alpha=0.3,
                         label='cis interactions ratio by %d bin' % win_size,
                         zorder=1)

        for k, (p, n) in enumerate(
                zip(errors_pos[::size // 100], errors_neg[::size // 100])):
            ax1.vlines(k * (size // 100), (p + n) / 2,
                       p,
                       color='red',
                       alpha=0.6)
            ax1.vlines(k * (size // 100),
                       n, (p + n) / 2,
                       color='blue',
                       alpha=0.6)
        ax1.plot(list(range(0, size, size // 100)),
                 errors_neg[::size // 100],
                 'b^',
                 mec='blue',
                 alpha=0.5)
        ax1.plot(list(range(0, size, size // 100)),
                 errors_pos[::size // 100],
                 'rv',
                 mec='red',
                 alpha=0.5)

        ax1.fill_between([0, size],
                         beg_pos,
                         end_pos,
                         color='red',
                         alpha=0.3,
                         zorder=2)
        ax1.text(-size / 15., (end_pos + beg_pos) / 2,
                 'Confidence band for\nupper stddev of median',
                 color='red',
                 ha='right',
                 va='center')
        ax1.fill_between([0, size],
                         beg_neg,
                         end_neg,
                         color='blue',
                         alpha=0.3,
                         zorder=2)
        ax1.text(-size / 15., (end_neg + beg_neg) / 2,
                 'Confidence band for\nlower stddev of median',
                 color='blue',
                 ha='right',
                 va='center')

        ax1.set_ylim((0, 1.1))
        ax1.set_ylabel('Ratio of cis interactions')
        ax1.fill_betweenx([0, 1.1], cutoffL, cutoffR, color='green', alpha=0.2)
        ax1.text(
            (cutoffR + cutoffL) / 2,
            -0.1,
            ('Kept bins, top and bottom deviations from median cis-ratio\n' +
             'should be inside their respective confidence bands'),
            ha='center',
            color='green')

        ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)
        line3 = ax2.plot(sorted_sum,
                         'rx',
                         alpha=0.4,
                         label='Log sum of interactions by bin')
        ax2.set_yscale('log')
        ax2.yaxis.tick_right()
        ax2.yaxis.set_label_position("right")
        ax2.set_ylabel('Log interaction counts')
        lns = line1 + line2 + line3
        labs = [l.get_label() for l in lns]
        ax2.legend(lns, labs, loc=0, bbox_to_anchor=(0, 0), frameon=False)

        ax3 = fig.add_subplot(111, frameon=False)
        ax3.xaxis.tick_top()
        ax3.set_xticks(list(range(100)), minor=True)
        ax3.set_xticks(list(range(0, 100, 5)), minor=False)
        ax3.set_yticks([])
        ax3.set_xticklabels([])
        for p in range(5, 100, 5):
            ax3.text(p, 99, '%d%%' % p, va='top', ha='left', size=9)
        ax3.tick_params(direction='in', axis='x', which='both')
        ax3.set_xlim(0, 100)
        ax3.set_ylim(0, 100)
        ax3.grid(which='major')
        ax3.grid(which='minor', alpha=0.5)
        if min_perc:
            plt.title('Setting from %.2f%% to %.2f%%' %
                      (100 * float(cutoffL) / len(indices),
                       100 * float(cutoffR) / len(indices)))
        else:
            plt.title('Keeping from %.2f%% to %.2f%%' %
                      (100 * float(cutoffL) / len(indices),
                       100 * float(cutoffR) / len(indices)))
        ax1.set_xlim((0, len(indices)))

        tadbit_savefig(savefig)
        plt.close('all')

    badcol = {}
    countL = 0
    countZ = 0
    countU = 0
    for c in range(size):
        if cisprc.get(c, [0, 0])[1] < min_count:
            badcol[c] = cisprc.get(c, [0, 0])[1]
            countL += 1
            if not c in cisprc:
                countZ += 1
        elif cisprc[c][
                1] > max_count:  # no need for .get() here, already caught by the previous condition
            badcol[c] = cisprc.get(c, [0, 0])[1]
            countU += 1
    print(
        '     => %d BAD bins (%d/%d/%d null/low/high counts) of %d (%.1f%%)' %
        (len(badcol), countZ, countL, countU, size,
         float(len(badcol)) / size * 100))

    return badcol
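A usage sketch; cisprc is assumed to come from a prior pass over the Hi-C matrix, mapping each bin to (cis interactions, total interactions), and min_perc/max_perc give explicit fallbacks in case the automatic cutoff search fails:

bad_bins = filter_by_cis_percentage(cisprc, beg=0.3, end=0.8, sigma=2,
                                    min_perc=2, max_perc=98,
                                    verbose=True, savefig='cis_filter.png')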
Ejemplo n.º 32
0
def plot_genomic_distribution(fnam, first_read=True, resolution=10000,
                              axe=None, ylim=None, savefig=None, show=False,
                              savedata=None, chr_names=None, nreads=None):
    """
    :param fnam: input file name
    :param True first_read: use the first read of each pair (otherwise the
       second one is used)
    :param 10000 resolution: group reads that are closer than this resolution
       parameter
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None ylim: tuple with the lower and upper limits of the y axis
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False show: displays the plot
    :param None savedata: path where to store the output read counts per bin.
    :param None chr_names: a list of chromosome names in case only some of
       them need to be plotted (this option may take even longer than the
       default)
    :param None nreads: number of reads to process (None means all)
    """
    distr = {}
    idx1, idx2 = (1, 3) if first_read else (7, 9)
    genome_seq = OrderedDict()
    fhandler = open(fnam)
    line = fhandler.next()
    if chr_names:
        chr_names = set(chr_names)
        cond1 = lambda x: x not in chr_names
    else:
        cond1 = lambda x: False
    if nreads:
        cond2 = lambda x: x >= nreads
    else:
        cond2 = lambda x: False
    cond = lambda x, y: cond1(x) or cond2(y)
    count = 0
    while line.startswith('#'):
        if line.startswith('# CRM '):
            crm, clen = line[6:].split('\t')
            genome_seq[crm] = int(clen)
        line = fhandler.next()
    try:
        while True:
            crm, pos = line.strip().split('\t')[idx1:idx2]
            count += 1
            if cond(crm, count):
                line = fhandler.next()
                if cond2(count):
                    break
                continue
            pos = int(pos) / resolution
            try:
                distr[crm][pos] += 1
            except KeyError:
                try:
                    distr[crm][pos] = 1
                except KeyError:
                    distr[crm] = {pos: 1}
            line = fhandler.next()
    except StopIteration:
        pass
    fhandler.close()
    if not axe:
        _ = plt.figure(figsize=(15, 1 + 3 * len(
                              chr_names if chr_names else distr.keys())))

    max_y = max([max(distr[c].values()) for c in distr])
    max_x = max([len(distr[c].values()) for c in distr])
    ncrms = len(chr_names if chr_names else genome_seq if genome_seq else distr)
    data = {}
    for i, crm in enumerate(chr_names if chr_names else genome_seq
                            if genome_seq else distr):
        try:
            data[crm] = [distr[crm].get(j, 0) for j in xrange(max(distr[crm]))]
            if savefig or show:
                plt.subplot(ncrms, 1, i + 1)
                plt.plot(range(max(distr[crm])), data[crm],
                         color='red', lw=1.5, alpha=0.7)
        except KeyError:
            pass
        if savefig or show:
            if ylim:
                plt.vlines(genome_seq[crm] / resolution, ylim[0], ylim[1])
            else:
                plt.vlines(genome_seq[crm] / resolution, 0, max_y)
            plt.xlim((0, max_x))
            plt.ylim(ylim or (0, max_y))
            plt.title(crm)

    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show:
        plt.show()

    if savedata:
        out = open(savedata, 'w')
        out.write('# CRM\tstart-end\tcount\n')
        out.write('\n'.join('%s\t%d-%d\t%d' % (c, (i * resolution) + 1, ((i + 1) * resolution), v) for c in data
                            for i, v in enumerate(data[c])))
        out.write('\n')
        out.close()
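A minimal sketch of the per-chromosome binning that plot_genomic_distribution performs internally; the reads, chromosome names and resolution are made up, and no file parsing or plotting is involved.

# Illustrative only: the per-chromosome binning done above, on made-up reads.
# Each read is (chromosome, genomic position); resolution groups positions into bins.
resolution = 10000
reads = [('chr1', 1200), ('chr1', 9800), ('chr1', 25000), ('chr2', 500)]

distr = {}
for crm, pos in reads:
    nbin = pos // resolution
    distr.setdefault(crm, {})
    distr[crm][nbin] = distr[crm].get(nbin, 0) + 1
# expected: {'chr1': {0: 2, 2: 1}, 'chr2': {0: 1}}
print(distr)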
Ejemplo n.º 33
0
def filter_by_mean(matrx,
                   draw_hist=False,
                   silent=False,
                   bads=None,
                   savefig=None):
    """
    fits the distribution of Hi-C interaction counts by column in the matrix to
    a polynomial, then searches the derivative of the fit for a local minimum
    located below the 5th percentile of the column sums; every column whose
    total count is below that value is flagged as bad.
    """
    nbins = 100
    if not bads:
        bads = {}
    # get sum of columns
    cols = []
    size = len(matrx)
    for c in sorted(
        [[matrx.get(i + j * size, 0) for j in xrange(size) if not j in bads]
         for i in xrange(size) if not i in bads],
            key=sum):
        cols.append(sum(c))
    cols = np.array(cols)
    if draw_hist:
        plt.figure(figsize=(9, 9))
    try:
        percentile = np.percentile(cols, 5)
    except IndexError:
        warn('WARNING: no columns to filter out')
        return bads
    # mad = np.median([abs(median - c ) for c in cols])
    best = (None, None, None, None)
    # bin the sum of columns
    xmin = min(cols)
    xmax = max(cols)
    y = np.linspace(xmin, xmax, nbins)
    hist = np.digitize(cols, y)
    x = [sum(hist == i) for i in range(1, nbins + 1)]
    if draw_hist:
        hist = plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, int(cols[-1]))
    # check if the binning is correct
    # we want at least half of the bins to contain some data
    try:
        cnt = 0
        while list(x).count(0) > len(x) / 2:
            cnt += 1
            cols = cols[:-1]
            xmin = min(cols)
            xmax = max(cols)
            y = np.linspace(xmin, xmax, nbins)
            hist = np.digitize(cols, y)
            x = [sum(hist == i) for i in range(1, nbins + 1)]
            if draw_hist:
                plt.clf()
                hist = plt.hist(cols, bins=100, alpha=.3, color='grey')
            xp = range(0, int(cols[-1]))
            if cnt > 10000:
                raise ValueError
        # find best polynomial fit in a given range
        for order in range(6, 18):
            z = np.polyfit(y, x, order)
            zp = np.polyder(z, m=1)
            roots = np.roots(np.polyder(z))
            # check that we are concave down, otherwise take next root
            pente = np.polyval(zp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
            if pente > 0:
                root = roots[-1]
            else:
                root = roots[-2]
            # root must be higher than zero
            if root <= 0:
                continue
            # and lower than the median
            if root >= percentile:
                continue
            p = np.poly1d(z)
            R2 = get_r2(p, x, y)
            # try to avoid very large orders by weighting their fit negatively
            if order > 13:
                R2 -= float(order) / 30
            if best[0] < R2:
                best = (R2, order, p, z, root)
        try:
            p, z, root = best[2:]
            if draw_hist:
                xlims = plt.xlim()
                ylims = plt.ylim()
                a = plt.plot(xp, p(xp), "--", color='k')
                b = plt.vlines(root,
                               0,
                               plt.ylim()[1],
                               colors='r',
                               linestyles='dashed')
                # c = plt.vlines(median - mad * 1.5, 0, 110, colors='g',
                #                linestyles='dashed')
                try:
                    plt.legend(a + [b], [
                        'polyfit \n%s' % (''.join([
                            sub(
                                'e([-+][0-9]+)', 'e^{\\1}', '$%s%.1fx^%s$' %
                                ('+' if j > 0 else '', j, '{' + str(i) + '}'))
                            for i, j in enumerate(list(p)[::-1])
                        ])), 'first solution of polynomial derivation'
                    ],
                               fontsize='x-small')
                except TypeError:
                    plt.legend(a + [b], [
                        'polyfit \n%s' % (''.join([
                            sub(
                                'e([-+][0-9]+)', 'e^{\\1}', '$%s%.1fx^%s$' %
                                ('+' if j > 0 else '', j, '{' + str(i) + '}'))
                            for i, j in enumerate(list(p)[::-1])
                        ])), 'first solution of polynomial derivation'
                    ])
                # plt.legend(a+[b]+[c], ['polyfit \n{}'.format (
                #     ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                #                  '${}{:.1}x^{}$'.format ('+' if j>0 else '', j,
                #                                         '{' + str(i) + '}'))
                #              for i, j in enumerate(list(p)[::-1])])),
                #                        'first solution of polynomial derivation',
                #                        'median - (1.5 * median absolute deviation)'],
                #            fontsize='x-small')
                plt.ylim([0, ylims[1]])
                plt.xlim(xlims)
                plt.xlabel('Sum of interactions')
                plt.ylabel('Number of columns with a given value')
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
            # label as bad the columns with sums lower than the root
            for i, col in enumerate(
                [[matrx.get(i + j * size, 0) for j in xrange(size)]
                 for i in xrange(size)]):
                if sum(col) < root:
                    bads[i] = sum(col)
            # now stored in Experiment._zeros, used for getting more accurate z-scores
            if bads and not silent:
                stderr.write(
                    ('\nWARNING: removing columns having less than %s ' +
                     'counts:\n %s\n') % (round(root, 3), ' '.join([
                         '%5s' % str(i + 1) + ('' if (j + 1) % 20 else '\n')
                         for j, i in enumerate(sorted(bads.keys()))
                     ])))
        except:
            if not silent:
                stderr.write('WARNING: Too many zeroes to filter columns.' +
                             ' SKIPPING...\n')
            if draw_hist:
                plt.xlabel('Sum of interactions')
                plt.ylabel('Number of columns with a given value')
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
    except ValueError:
        if not silent:
            stderr.write('WARNING: Too few data to filter columns based on ' +
                         'mean value.\n')
    if draw_hist:
        plt.close('all')
    return bads
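A small sketch of the column-sum step at the top of filter_by_mean. TADbit's matrix object is keyed by i + j * size and its len() is the number of bins; here a plain dict plus an explicit size stand in for it, and the counts and the pre-marked bad column are invented.

# Illustrative only: the column-sum step of filter_by_mean() on a toy dict
# keyed by i + j * size, excluding columns already flagged as bad.
size = 4
counts = [[0, 5, 2, 1],
          [5, 9, 3, 0],
          [2, 3, 7, 4],
          [1, 0, 4, 6]]
matrx = {}
for i in range(size):
    for j in range(size):
        matrx[i + j * size] = counts[i][j]

bads = {3: 0}  # pretend column 3 was already filtered out
cols = sorted(sum(matrx.get(i + j * size, 0) for j in range(size) if j not in bads)
              for i in range(size) if i not in bads)
print(cols)  # sorted sums of the remaining columns: [7, 12, 17]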
Ejemplo n.º 34
0
def eig_correlate_matrices(hic_data1, hic_data2, nvect=6, normalized=False, 
                           savefig=None, show=False, savedata=None,
                           remove_bad_columns=True, **kwargs):
    """
    Compare the interactions of two Hi-C matrices using their first 6
    eigenvectors, with Pearson correlation

    :param hic_data1: Hi-C-data object
    :param hic_data2: Hi-C-data object
    :param 6 nvect: number of eigenvectors to compare
    :param None savefig: path to save the plot
    :param False show: displays the plot
    :param False normalized: use normalized data
    :param True remove_bad_columns: computes the union of bad columns between
       samples and excludes them from the comparison
    :param kwargs: any argument to pass to matplotlib imshow function

    :returns: matrix of correlations
    """
    data1 = hic_data1.get_matrix(normalized=normalized)
    data2 = hic_data2.get_matrix(normalized=normalized)
    ## reduce matrices to remove bad columns
    bads = {}
    if remove_bad_columns:
        # union of bad columns
        bads = hic_data1.bads.copy()
        bads.update(hic_data2.bads)
        # remove them from both matrices
        for bad in sorted(bads, reverse=True):
            del(data1[bad])
            del(data2[bad])
            for i in xrange(len(data1)):
                _ = data1[i].pop(bad)
                _ = data2[i].pop(bad)
    # get the log
    size = len(data1)
    data1 = nozero_log(data1, np.log2)
    data2 = nozero_log(data2, np.log2)
    # get the eigenvectors
    ev1, evect1 = eigh(data1)
    ev2, evect2 = eigh(data2)
    corr = [[0 for _ in xrange(nvect)] for _ in xrange(nvect)]
    # sort eigenvectors according to their eigenvalues => first is last!!
    sort_perm = ev1.argsort()
    ev1.sort()
    evect1 = evect1[sort_perm]
    sort_perm = ev2.argsort()
    ev2.sort()
    evect2 = evect2[sort_perm]
    # calculate Pearson correlation
    for i in xrange(nvect):
        for j in xrange(nvect):
            corr[i][j] = abs(pearsonr(evect1[:,-i-1],
                                      evect2[:,-j-1])[0])
    # plot
    if show or savefig:
        axe    = plt.axes([0.1, 0.1, 0.6, 0.8])
        cbaxes = plt.axes([0.85, 0.1, 0.03, 0.8])
        im = axe.imshow(corr, interpolation="nearest",origin='lower', **kwargs)
        axe.set_xlabel('Eigen Vectors exp. 1')
        axe.set_ylabel('Eigen Vectors exp. 2')
        axe.set_xticks(range(nvect))
        axe.set_yticks(range(nvect))
        axe.set_xticklabels(range(1, nvect + 1))
        axe.set_yticklabels(range(1, nvect + 1))
        axe.xaxis.set_tick_params(length=0, width=0)
        axe.yaxis.set_tick_params(length=0, width=0)
        
        cbar = plt.colorbar(im, cax = cbaxes )
        cbar.ax.set_ylabel('Pearson correlation', rotation=90*3,
                           verticalalignment='bottom')
        axe2 = axe.twinx()
        axe2.set_yticks(range(nvect))
        axe2.set_yticklabels(['%.1f' % (e) for e in ev2[-nvect:][::-1]])
        axe2.set_ylabel('corresponding Eigen Values exp. 2', rotation=90*3,
                        verticalalignment='bottom')
        axe2.set_ylim((-0.5, nvect - 0.5))
        axe2.yaxis.set_tick_params(length=0, width=0)
        
        axe3 = axe.twiny()
        axe3.set_xticks(range(nvect))
        axe3.set_xticklabels(['%.1f' % (e) for e in ev1[-nvect:][::-1]])
        axe3.set_xlabel('corresponding Eigen Values exp. 1')
        axe3.set_xlim((-0.5, nvect - 0.5))
        axe3.xaxis.set_tick_params(length=0, width=0)
        
        axe.set_ylim((-0.5, nvect - 0.5))
        axe.set_xlim((-0.5, nvect - 0.5))
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
        plt.close('all')

    if savedata:
        out = open(savedata, 'w')
        out.write('# ' + '\t'.join(['Eigen Vector %s' % (i + 1)
                                    for i in xrange(nvect)]) + '\n')
        for i in xrange(nvect):
            out.write('\t'.join([str(corr[i][j])
                                 for j in xrange(nvect)]) + '\n')
        out.close()
    if kwargs.get('get_bads', False):
        return corr, bads
    else:
        return corr
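A self-contained sketch of the eigenvector comparison done by eig_correlate_matrices, using plain numpy/scipy on two small random symmetric matrices instead of HiC_data objects; the sizes, the seed and the 0.1 perturbation are arbitrary.

# Illustrative only: eigendecomposition of two similar symmetric matrices and
# the grid of absolute Pearson correlations between their top eigenvectors.
import numpy as np
from numpy.linalg import eigh
from scipy.stats import pearsonr

np.random.seed(0)
nvect, n = 3, 20
m = np.random.rand(n, n)
data1 = (m + m.T) / 2                       # symmetric "matrix 1"
data2 = data1 + np.random.rand(n, n) * .1
data2 = (data2 + data2.T) / 2               # slightly perturbed "matrix 2"

ev1, evect1 = eigh(data1)                   # eigenvalues ascending,
ev2, evect2 = eigh(data2)                   # eigenvectors as columns

corr = [[abs(pearsonr(evect1[:, -i - 1], evect2[:, -j - 1])[0])
         for j in range(nvect)] for i in range(nvect)]
print(np.round(corr, 2))  # corr[0][0] close to 1: the leading eigenvectors barely change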
Ejemplo n.º 35
0
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None, paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction enzyme
    (RE) name is provided, can also represent the distribution of digested and
    undigested RE sites and estimate an expected proportion of dangling-ends.

    The proportion of dangling-ends is inferred by counting the number of times
    a dangling-end site is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file
    :param None r_enz: name of the restriction enzyme used in the experiment
    :param None nreads: max number of reads to read, not necessary to read all
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both ends of each
       read pair

    :returns: the percentage of dangling-ends (sensu stricto) and the percentage of
       reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site*2 == l_site:
            # in case the religated site equals 2 restriction sites (like DpnII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else: # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2,1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1,1, figsize=(15, 6))
        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.4)
        ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax.set_axisbelow(True)
        # remove tick marks
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False)
        ax.tick_params(axis='both', direction='out', top=False, right=False,
                       left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')

    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try: # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')
        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)] # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)] # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)] # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k-pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k-pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len:
                  len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)        
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends  (%s)' % r_site)
        ax2.set_xlim((0, len(line)))
        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)]
                                            if paired else 0)))
                       / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') %(
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads
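A standalone sketch of the PHRED-33 decoding and per-position statistics that quality_plot computes before plotting; the two quality strings are made up.

# Illustrative only: decode quality strings with the PHRED-33 table above and
# compute the position-wise mean and standard deviation used for the error bars.
import numpy as np

phred = dict((c, i) for i, c in enumerate(
    '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'))
quals = [[phred[c] for c in qline] for qline in ('IIIIHHGF#', 'IIHHGGFF#')]

by_pos = list(zip(*quals))
meanquals = [np.mean(q) for q in by_pos]
errorquals = [np.std(q) for q in by_pos]
print([round(m, 1) for m in meanquals])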
Ejemplo n.º 36
0
    def draw(self, focus=None, extras=None, ymax=None, ali_colors=('grey',),
             normalized=True, savefig=None, shape='ellipse'):
        """
        Draw alignments as a plot.

        :param None focus: can pass a tuple (bin_start, bin_stop) to display the
           alignment between these genomic bins
        :param None extras: list of coordinates (genomic bin) where to draw a
           red cross
        :param None ymax: limit the y axis up to a given value
        :param ('grey',) ali_colors: successive colors used to shade the
           alignment columns
        :param True normalized: normalized Hi-C count are plotted instead of raw
           data.
        :param 'ellipse' shape: which kind of shape to use as schematic
           representation of TADs. Implemented: 'ellipse', 'rectangle',
           'triangle'
        :param None savefig: path to a file where to save the image generated;
           if None, the image will be shown using matplotlib GUI (the extension
           of the file name will determine the desired format).
        """
        from matplotlib.cm import jet
        from matplotlib import pyplot as plt
        experiments = self.__experiments

        maxres = max([e.resolution for e in experiments])
        facts = [maxres / e.resolution for e in experiments]


        siz = experiments[0].size
        if focus:
            figsiz = 4 + (focus[1] - focus[0]) / 30
        else:
            figsiz = 4 + siz / 30
        fig, axes = plt.subplots(nrows=len(experiments),
                                 sharex=True, sharey=True,
                                 figsize=(figsiz, 1 + len(experiments) * 1.8))
        fig.subplots_adjust(hspace=0)

        maxys = []
        for iex, xpr in enumerate(experiments):
            if xpr.name not in self:
                continue
            _tad_density_plot(xpr, maxys=maxys, normalized=normalized,
                              fact_res=facts[iex], axe=axes[iex],
                              extras=extras, shape=shape, focus=focus)
        # draw alignment columns
        start = focus[0] if focus else 1
        end = focus[1] if focus else xpr.tads[max(xpr.tads)]['end']
        maxy = (ymax or max(maxys)) + 0.4
        maxxs = []
        for iex in range(len(experiments)):
            starting = focus[0] if focus else 1
            ending = (focus[1] if focus
                      else experiments[iex].tads.values()[-1]['end'])
            axes[iex].hlines(1, 1, end, 'k', lw=1.5)
            axes[iex].set_ylim((0, maxy))
            maxxs.append(ending / facts[iex])
            axes[iex].text(starting + 1, float(maxy) / 20,
                           experiments[iex].name, {'ha': 'left',
                                                   'va': 'bottom'})
            axes[iex].set_yticks([float(i) / 2
                                  for i in range(1, int(maxy + .5) * 2)])
            if ymax:
                axes[iex].set_ylim((0, ymax))
            axes[iex].set_xlim(xmin=starting, xmax=max(maxxs))

        pos = {'ha': 'center', 'va': 'bottom'}
        for i, col in enumerate(self.itercolumns()):
            ends = sorted([(t['end'], j) for j, t in enumerate(col) if t['end']])
            beg = (ends[0][0] + 0.9) / facts[ends[0][1]]
            end = (ends[-1][0] + 1.1) / facts[ends[-1][1]]
            if focus:
                if beg < focus[0] or end > focus[1]:
                    continue
            axes[0].text(beg + float(end - beg) / 2, maxy + float(maxy) / 20,
                         str(i + 1), pos,
                         rotation=90, size='small')
            for iex, tad in enumerate(col):
                if not tad['end']:
                    continue
                axes[iex].axvspan(beg-.2, end+.2, alpha=0.2,
                                  color=ali_colors[i%(len(ali_colors))])
        axes[iex].set_xlabel('Genomic bin')
        tit1 = fig.suptitle("TAD borders' alignment", size='x-large')
        tit2 = axes[0].set_title("Alignment column number")
        tit2.set_y(1.3)
        plt.subplots_adjust(top=0.76)
        # This was for color bar instead of legend
        # ax1 = fig.add_axes([0.9 + 0.3/figsiz, 0.05, 0.2/figsiz, 0.9])
        # cb1 = colorbar.ColorbarBase(ax1, cmap=jet,
        #                             norm=colors.Normalize(vmin=0., vmax=1.))
        # cb1.set_label('Border prediction score')
        # cb1.ax.set_yticklabels([str(i)for i in range(1, 11)])
        fig.set_facecolor('white')
        plots = []
        for scr in xrange(1, 11):
            plots += plt.plot((100,),(100,), marker=6, ms=9,
                              color=jet(float(scr) / 10), mec='none')
        try:
            axes[-1].legend(plots,
                            [str(scr) for scr in xrange(1, 11)],
                            numpoints=1, title='Border scores',
                            fontsize='small', loc='lower left',
                            bbox_to_anchor=(1, 0.5))
        except TypeError:
            axes[-1].legend(plots,
                            [str(scr) for scr in xrange(1, 11)],
                            numpoints=1, title='Border scores',
                            loc='lower left',
                            bbox_to_anchor=(1, 0.5))
        if savefig:
            tadbit_savefig(savefig)
        else:
            plt.show()
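A bare-bones sketch of how the alignment columns are shaded in draw(): axvspan with a cycling tuple of colors on an empty matplotlib axis; the column boundaries below are invented.

# Illustrative only: shade made-up alignment columns with alternating colors.
from matplotlib import pyplot as plt

ali_colors = ('grey', 'lightblue')
columns = [(2, 4), (7, 9), (12, 13)]   # (beg, end) of each column, in genomic bins

fig, axe = plt.subplots(figsize=(6, 2))
for i, (beg, end) in enumerate(columns):
    axe.axvspan(beg - .2, end + .2, alpha=0.2,
                color=ali_colors[i % len(ali_colors)])
axe.set_xlim(0, 15)
axe.set_xlabel('Genomic bin')
plt.show()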
Ejemplo n.º 37
0
def draw_map(data, genome_seq, cumcs, savefig, show, one=False, clim=None,
             cmap='jet', decay=False, perc=10, name=None, cistrans=None,
             decay_resolution=10000, normalized=False, max_diff=None):
    _ = plt.figure(figsize=(15.,12.5))
    if not max_diff:
        max_diff = len(data)
    ax1 = plt.axes([0.34, 0.08, 0.6, 0.7205])
    ax2 = plt.axes([0.07, 0.65, 0.21, 0.15])
    if decay:
        ax3 = plt.axes([0.07, 0.42, 0.21, 0.15])
        plot_distance_vs_interactions(data, genome_seq=genome_seq, axe=ax3,
                                      resolution=decay_resolution,
                                      max_diff=max_diff, normalized=normalized)
    ax4 = plt.axes([0.34, 0.805, 0.6, 0.04], sharex=ax1)
    ax5 = plt.axes([0.34, 0.845, 0.6, 0.04], sharex=ax1)
    ax6 = plt.axes([0.34, 0.885, 0.6, 0.04], sharex=ax1)
    try:
        minoridata   = np.nanmin(data)
        maxoridata   = np.nanmax(data)
    except AttributeError:
        vals = [i for d in data for i in d if not np.isnan(i)]
        minoridata   = np.min(vals)
        maxoridata   = np.max(vals)
    totaloridata = np.nansum([data[i][j] for i in xrange(len(data))
                              for j in xrange(i, len(data))])
    data = nozero_log(data, np.log2)
    vals = np.array([i for d in data for i in d])
    vals = vals[np.isfinite(vals)]

    mindata = np.nanmin(vals)
    maxdata = np.nanmax(vals)
    diff = maxdata - mindata
    posI = 0.01 if not clim else (float(clim[0]) / diff) if clim[0] != None else 0.01
    posF = 1.0  if not clim else (float(clim[1]) / diff) if clim[1] != None else 1.0
    if cmap == 'tadbit':
        cuts = perc
        cdict = {'red'  : [(0.0,  0.0, 0.0)],
                 'green': [(0.0,  0.0, 0.0)],
                 'blue' : [(0.0,  0.5, 0.5)]}
        prev_pos  = 0
        median = (np.median(vals) - mindata) / diff
        for prc in np.linspace(posI, median, cuts / 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - posI) / (median - posI)) + 1. / cuts
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, prc, prc])
            cdict['green'].append([pos, prc, prc])
            cdict['blue' ].append([pos, 1, 1])
            prev_pos  = pos
        for prc in np.linspace(median + 1. / cuts, posF, cuts / 2, endpoint=False):
            try:
                pos = (np.percentile(vals, prc * 100.) - mindata) / diff
                prc = ((prc - median) / (posF - median))
            except ValueError:
                pos = prc = 0
            if prev_pos >= pos:
                continue
            cdict['red'  ].append([pos, 1.0, 1.0])
            cdict['green'].append([pos, 1 - prc, 1 - prc])
            cdict['blue' ].append([pos, 1 - prc, 1 - prc])
            prev_pos  = pos
        pos = (np.percentile(vals ,97.) - mindata) / diff
        cdict['red'  ].append([pos, 0.1, 0.1])
        cdict['green'].append([pos, 0, 0])
        cdict['blue' ].append([pos, 0, 0])

        cdict['red'  ].append([1.0, 1, 1])
        cdict['green'].append([1.0, 1, 1])
        cdict['blue' ].append([1.0, 0, 0])
        cmap  = LinearSegmentedColormap(cmap, cdict)
        clim = None
    else:
        cmap = plt.get_cmap(cmap)
    cmap.set_bad('darkgrey', 1)

    ax1.imshow(data, interpolation='none',
               cmap=cmap, vmin=clim[0] if clim else None, vmax=clim[1] if clim else None)
    size = len(data)
    for i in xrange(size):
        for j in xrange(i, size):
            if np.isnan(data[i][j]):
                data[i][j] = 0
                data[j][i] = 0
            #data[j][i] = data[i][j]
    evals, evect = eigh(data)
    sort_perm = evals.argsort()
    evect = evect[sort_perm]
    data = [i for d in data for i in d if not np.isnan(i)]
    gradient = np.linspace(np.nanmin(data),
                           np.nanmax(data), size)
    gradient = np.vstack((gradient, gradient))
    h  = ax2.hist(data, color='darkgrey', linewidth=2,
                  bins=20, histtype='step', normed=True)
    _  = ax2.imshow(gradient, aspect='auto', cmap=cmap,
                    extent=(np.nanmin(data), np.nanmax(data) , 0, max(h[0])))
    if genome_seq:
        for crm in genome_seq:
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='w', linestyle='-', linewidth=1, alpha=1)
            ax1.vlines([cumcs[crm][0]-.5, cumcs[crm][1]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
            ax1.hlines([cumcs[crm][1]-.5, cumcs[crm][0]-.5], cumcs[crm][0]-.5, cumcs[crm][1]-.5,
                       color='k', linestyle='--')
        if not one:
            vals = [0]
            keys = ['']
            for crm in genome_seq:
                vals.append(cumcs[crm][0])
                keys.append(crm)
            vals.append(cumcs[crm][1])
            ax1.set_yticks(vals)
            ax1.set_yticklabels('')
            ax1.set_yticks([float(vals[i]+vals[i+1])/2
                            for i in xrange(len(vals) - 1)], minor=True)
            ax1.set_yticklabels(keys, minor=True)
            for t in ax1.yaxis.get_minor_ticks():
                t.tick1On = False
                t.tick2On = False
    # totaloridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j in enumerate(str(totaloridata)[::-1])])[::-1].strip(',')
    # minoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j   in enumerate(str(minoridata)[::-1])])[::-1].strip(',')
    # maxoridata = ''.join([j + ('' if (i+1)%3 else ',') for i, j   in enumerate(str(maxoridata)[::-1])])[::-1].strip(',')
    plt.figtext(0.05,0.25, ''.join([
        (name + '\n') if name else '',
        'Number of interactions: %s\n' % str(totaloridata),
        ('' if np.isnan(cistrans) else
         ('Percentage of cis interactions: %.0f%%\n' % (cistrans*100))),
        'Min interactions: %s\n' % (minoridata),
        'Max interactions: %s\n' % (maxoridata)]))
    ax2.set_xlim((np.nanmin(data), np.nanmax(data)))
    ax2.set_ylim((0, max(h[0])))
    ax1.set_xlim ((-0.5, size - .5))
    ax1.set_ylim ((-0.5, size - .5))
    ax2.set_xlabel('log interaction count')
    # we reduce the number of dots displayed.... we just want to see the shape
    subdata = np.array(list(set([float(int(d*100))/100 for d in data])))
    try:
        normfit = sc_norm.pdf(subdata, np.nanmean(data), np.nanstd(data))
    except AttributeError:
        normfit = sc_norm.pdf(subdata, np.mean(data), np.std(data))
    ax2.plot(subdata, normfit, 'w.', markersize=2.5, alpha=.4)
    ax2.plot(subdata, normfit, 'k.', markersize=1.5, alpha=1)
    ax2.set_title('skew: %.3f, kurtosis: %.3f' % (skew(data),
                                                   kurtosis(data)))
    ax4.vlines(range(size), 0, evect[:,-1], color='k')
    ax4.hlines(0, 0, size, color='red')
    ax4.set_ylabel('E1')
    ax4.set_yticklabels([])
    try:
        ax5.vlines(range(size), 0, evect[:,-2], color='k')
    except IndexError:
        pass
    ax5.hlines(0, 0, size, color='red')
    ax5.set_ylabel('E2')
    ax5.set_yticklabels([])
    try:
        ax6.vlines(range(size), 0, evect[:,-3], color='k')
    except IndexError:
        pass
    ax6.hlines(0, 0, size, color='red')
    ax6.set_ylabel('E3')
    ax6.set_yticklabels([])
    xticklabels = ax4.get_xticklabels() + ax5.get_xticklabels() + ax6.get_xticklabels()
    plt.setp(xticklabels, visible=False)
    if savefig:
        tadbit_savefig(savefig)
    elif show:
        plt.show()
    plt.close('all')
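A minimal sketch of where the E1/E2/E3 side tracks in draw_map come from: eigh() on a symmetric matrix, with the leading eigenvector drawn as vertical lines around a red baseline; the random matrix below is only a stand-in for the log Hi-C map.

# Illustrative only: plot the first eigenvector of a toy symmetric matrix the
# same way draw_map() draws its E1 track.
import numpy as np
from numpy.linalg import eigh
from matplotlib import pyplot as plt

np.random.seed(0)
size = 40
m = np.random.rand(size, size)
data = (m + m.T) / 2

evals, evect = eigh(data)              # ascending eigenvalues, vectors as columns
fig, ax = plt.subplots(figsize=(8, 1.5))
ax.vlines(range(size), 0, evect[:, -1], color='k')
ax.hlines(0, 0, size, color='red')
ax.set_ylabel('E1')
ax.set_yticklabels([])
plt.show()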
Ejemplo n.º 38
0
    def visualize(self, names=None, tad=None, focus=None, paint_tads=False,
                  axe=None, show=True, logarithm=True, normalized=False,
                  relative=True, decorate=True, savefig=None, clim=None,
                  scale=(8, 6), cmap='jet'):
        """
        Visualize the matrix of Hi-C interactions of a given experiment

        :param None names: name of the experiment to visualize, or list of
           experiment names. If None, all experiments will be shown
        :param None tad: a given TAD in the form:
           ::
           
             {'start': start,
              'end'  : end,
              'brk'  : end,
              'score': score}
              
           **Alternatively** a list of TADs can be passed (all the TADs
           between the first and last one passed will be shown; thus, passing
           more than two TADs might be superfluous)
        :param None focus: a tuple with the start and end positions of the 
           region to visualize
        :param False paint_tads: draw a box around the TADs defined for this
           experiment
        :param None axe: an axe object from matplotlib can be passed in order
           to customize the picture
        :param True show: either to pop-up matplotlib image or not
        :param True logarithm: show the logarithm values
        :param True normalized: show the normalized data (weights might have
           been calculated previously). *Note: white rows/columns may appear in
           the matrix displayed; these rows correspond to filtered rows (see*
           :func:`pytadbit.utils.hic_filtering.hic_filtering_for_modelling` *)*
        :param True relative: color scale is relative to the whole matrix of
           data, not only to the region displayed
        :param True decorate: draws color bar, title and axes labels
        :param None savefig: path to a file where to save the image generated;
           if None, the image will be shown using matplotlib GUI (the extension
           of the file name will determine the desired format).
        :param None clim: tuple with minimum and maximum value range for color
           scale, e.g. clim=(-4, 10)
        :param (8, 6) scale: tuple with the width and height (in inches) of
           each subplot
        :param 'jet' cmap: matplotlib color map used to draw the matrices
        """
        if names is None:
            names = [xpr.name for xpr in self.experiments]
        if not isinstance(names, list) and not isinstance(names, tuple):
            names = [names]
            cols = 1
            rows = 1
        else:
            sqrtxpr = sqrt(len(names))
            cols = int(round(sqrtxpr + (0.0 if int(sqrtxpr)==sqrtxpr else .5)))
            rows = int(sqrtxpr+.5)
        notaxe = axe is None
        if not scale:
            scale = (8, 6)
        if notaxe and len(names) != 1:
            fig = plt.figure(figsize=(scale[0] * cols, scale[1] * rows))
        for i in xrange(rows):
            for j in xrange(cols):
                if i * cols + j >= len(names):
                    break
                if notaxe and len(names) != 1:
                    axe = fig.add_subplot(
                        rows, cols, i * cols + j + 1)
                if (isinstance(names[i * cols + j], tuple) or
                    isinstance(names[i * cols + j], list)):
                    if not axe:
                        fig = plt.figure(figsize=(scale[0] * cols, scale[1] * rows))
                        axe = fig.add_subplot(
                            rows, cols, i * cols + j + 1)                        
                    xpr1 = self.get_experiment(names[i * cols + j][0])
                    xpr2 = self.get_experiment(names[i * cols + j][1])
                    img = xpr1.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                    axe=axe, show=False, logarithm=logarithm,
                                    normalized=normalized, relative=relative,
                                    decorate=decorate, savefig=False,
                                    where='up', clim=clim, cmap=cmap)
                    img = xpr2.view(tad=tad, focus=focus, paint_tads=paint_tads,
                                    axe=axe, show=False, logarithm=logarithm,
                                    normalized=normalized, relative=relative,
                                    decorate=False, savefig=False, where='down',
                                    clim=clim or img.get_clim(), cmap=cmap)
                    #axe = axe.twinx()
                    #axe.set_aspect('equal',adjustable='box-forced',anchor='NE')
                    if decorate:
                        plt.text(1.01, .5, 
                                 'Chromosome %s experiment %s' % (
                                     self.name, xpr2.name),
                                  rotation=-90, va='center', size='large',
                                  ha='left', transform=axe.transAxes)
                else:
                    xper = self.get_experiment(names[i * cols + j])
                    if not xper.hic_data and not xper.norm:
                        continue
                    xper.view(tad=tad, focus=focus, paint_tads=paint_tads,
                              axe=axe, show=False, logarithm=logarithm,
                              normalized=normalized, relative=relative,
                              decorate=decorate, savefig=False, clim=clim,
                              cmap=cmap)
        if savefig:
            tadbit_savefig(savefig)
        if show:
            plt.show()
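A quick sketch of the square-ish subplot grid that visualize() derives from the number of experiment names; the example counts are arbitrary.

# Illustrative only: grid dimensions for a few hypothetical numbers of experiments.
from math import sqrt

for n in (1, 2, 3, 4, 5, 7, 9):
    sqrtxpr = sqrt(n)
    cols = int(round(sqrtxpr + (0.0 if int(sqrtxpr) == sqrtxpr else .5)))
    rows = int(sqrtxpr + .5)
    print('%d experiments: %d rows x %d columns of subplots' % (n, rows, cols))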
Ejemplo n.º 39
0
def plot_distance_vs_interactions(data, min_diff=1, max_diff=1000, show=False,
                                  genome_seq=None, resolution=None, axe=None,
                                  savefig=None, normalized=False):
    """
    :param data: input file name, or HiC_data object or list of lists
    :param 1 min_diff: lower limit (in number of bins)
    :param 1000 max_diff: upper limit (in number of bins) to look for
    :param None resolution: group reads that are closer than this resolution
       parameter (defaults to 1, i.e. no grouping)
    :param None genome_seq: dictionary of chromosome lengths (in bins), used to
       restrict the counting to intra-chromosomal interactions when data is a
       list of lists
    :param False normalized: if True and data is a HiC_data object, use
       normalized counts
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    
    """
    resolution = resolution or 1
    dist_intr = dict([(i, 0) for i in xrange(min_diff, max_diff)])
    if isinstance(data, str):
        fhandler = open(data)
        line = fhandler.next()
        while line.startswith('#'):
            line = fhandler.next()
        try:
            while True:
                _, cr1, ps1, _, _, _, _, cr2, ps2, _ = line.split('\t', 9)
                if cr1 != cr2:
                    line = fhandler.next()
                    continue
                diff = abs(int(ps1)  / resolution - int(ps2) / resolution)
                if max_diff > diff >= min_diff:
                    dist_intr[diff] += 1
                line = fhandler.next()
        except StopIteration:
            pass
        fhandler.close()
    elif isinstance(data, HiC_data):
        if normalized:
            get_data = lambda x, y: data[x, y] / data.bias[x] / data.bias[y]
        else:
            get_data = lambda x, y: data[x, y]
        max_diff = min(len(data), max_diff)
        if data.section_pos:
            for crm in data.section_pos:
                for diff in xrange(min_diff, min(
                    (max_diff, 1 + data.chromosomes[crm]))):
                    for i in xrange(data.section_pos[crm][0],
                                    data.section_pos[crm][1] - diff):
                        dist_intr[diff] += get_data(i, i + diff)
        else:
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i, i + diff]):
                        dist_intr[diff] += get_data(i, i + diff)
    else:
        if genome_seq:
            max_diff = min(max(genome_seq.values()), max_diff)
            cnt = 0
            for crm in genome_seq:
                for diff in xrange(min_diff, min(
                    (max_diff, genome_seq[crm]))):
                    for i in xrange(cnt, cnt + genome_seq[crm] - diff):
                        if not np.isnan(data[i][i + diff]):
                            dist_intr[diff] += data[i][i + diff]
                cnt += genome_seq[crm]
        else:
            max_diff = min(len(data), max_diff)
            for diff in xrange(min_diff, max_diff):
                for i in xrange(len(data) - diff):
                    if not np.isnan(data[i][i + diff]):
                        dist_intr[diff] += data[i][i + diff]
    if not axe:
        fig=plt.figure()
        axe = fig.add_subplot(111)
    # remove last part of the plot in case no interaction is count... reduce max_dist
    for diff in xrange(max_diff - 1, min_diff, -1):
        try:
            if not dist_intr[diff]:
                del(dist_intr[diff])
                max_diff -=1
                continue
        except KeyError:
            max_diff -=1
            continue
        break
    xp, yp = zip(*sorted(dist_intr.items(), key=lambda x:x[0]))
    x = []
    y = []
    for k in xrange(len(xp)):
        if yp[k]:
            x.append(xp[k])
            y.append(yp[k])
    axe.plot(x, y, 'k.')
    best = (float('-inf'), 0, 0, 0, 0, 0, 0, 0, 0, 0)
    logx = np.log(x)
    logy = np.log(y)
    ntries = 100
    # set k for better fit
    # for k in xrange(1, ntries/5, ntries/5/5):
    if resolution == 1:
        k = 1
        for i in xrange(3, ntries-2-k):
            v1 = i * len(x) / ntries
            try:
                a1, b1, r21, _, _ = linregress(logx[ :v1], logy[ :v1])
            except ValueError:
                a1 = b1 = r21 = 0
            r21 *= r21
            for j in xrange(i + 1 + k, ntries - 2 - k):
                v2 = j * len(x) / ntries
                try:
                    a2, b2, r22, _, _ = linregress(logx[v1+k:v2], logy[v1+k:v2])
                    a3, b3, r23, _, _ = linregress(logx[v2+k:  ], logy[v2+k: ])
                except ValueError:
                    a2 = b2 = r22 = 0
                    a3 = b3 = r23 = 0
                r2 = r21 + r22**2 + r23**2
                if r2 > best[0]:
                    best = (r2, v1, v2, a1, a2, a3,
                            b1, b2, b3, k)
        # plot line of best fit
        (v1, v2, 
         a1, a2, a3,
         b1, b2, b3, k) = best[1:]
        yfit1 = lambda xx: np.exp(b1 + a1*np.array (np.log(xx)))
        yfit2 = lambda xx: np.exp(b2 + a2*np.array (np.log(xx)))
        yfit3 = lambda xx: np.exp(b3 + a3*np.array (np.log(xx)))
        axe.plot(x[  :v1], yfit1(x[  :v1] ), color= 'yellow', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0-0.7 \mathrm{ Mb}' if resolution != 1 else '1', a1))
                 #label = r'$\alpha_1=%.2f$ (0-%d)' % (a1, x[v1]))
        axe.plot(x[v1+k:v2], yfit2(x[v1+k:v2]),  color= 'orange', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0.7-10 \mathrm{ Mb}' if resolution != 1 else '2', a2))
                 # label = r'$\alpha_2=%.2f$ (%d-%d)' % (a2, x[v1], x[v2]))
        axe.plot(x[v2+k:  ], yfit3(x[v2+k:  ] ), color= 'red'   , lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '10 \mathrm{ Mb}-\infty' if resolution != 1 else '3', a3))
                 # label = r'$\alpha_3=%.2f$ (%d-$\infty$)' % (a3, x[v2+k]))
    else:
        # from 0.7 Mb
        v1 = 700000   / resolution
        # to 10 Mb
        v2 = 10000000 / resolution
        try:
            a1, b1, r21, _, _ = linregress(logx[  :v1], logy[  :v1])
        except ValueError:
            a1, b1, r21 = 0, 0, 0
        try:
            a2, b2, r22, _, _ = linregress(logx[v1:v2], logy[v1:v2])
        except ValueError:
            a2, b2, r22 = 0, 0, 0
        try:
            a3, b3, r23, _, _ = linregress(logx[v2:  ], logy[v2:  ])
        except ValueError:
            a3, b3, r23 = 0, 0, 0
        yfit1 = lambda xx: np.exp(b1 + a1*np.array (np.log(xx)))
        yfit2 = lambda xx: np.exp(b2 + a2*np.array (np.log(xx)))
        yfit3 = lambda xx: np.exp(b3 + a3*np.array (np.log(xx)))
        axe.plot(x[  :v1], yfit1(x[  :v1] ), color= 'yellow', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0-0.7 \mathrm{ Mb}' if resolution != 1 else '1', a1))
                 #label = r'$\alpha_1=%.2f$ (0-%d)' % (a1, x[v1]))
        axe.plot(x[v1:v2], yfit2(x[v1:v2]),  color= 'orange', lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '0.7-10 \mathrm{ Mb}' if resolution != 1 else '2', a2))
                 # label = r'$\alpha_2=%.2f$ (%d-%d)' % (a2, x[v1], x[v2]))
        axe.plot(x[v2:  ], yfit3(x[v2:  ] ), color= 'red'   , lw=2,
                 label = r'$\alpha_{%s}=%.2f$' % (
                     '10 \mathrm{ Mb}-\infty' if resolution != 1 else '3', a3))
                 # label = r'$\alpha_3=%.2f$ (%d-$\infty$)' % (a3, x[v2+k]))
    axe.set_ylabel('Log interaction count')
    axe.set_xlabel('Log genomic distance (resolution: %s)' % nicer(resolution))
    axe.legend(loc='lower left', frameon=False)
    axe.set_xscale('log')
    axe.set_yscale('log')
    axe.set_xlim((min_diff, max_diff))
    try:
        axe.set_ylim((0, max(y)))
    except ValueError:
        pass
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif show==True:
        plt.show()
        plt.close('all')
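A self-contained sketch of the piecewise power-law fitting behind plot_distance_vs_interactions: linear regressions in log-log space over three ranges of a synthetic decay curve. The split points, exponent and noise level are invented; the real function splits at 0.7 Mb and 10 Mb, or searches for the best split when resolution is 1.

# Illustrative only: fit log(count) vs log(distance) over three arbitrary ranges.
import numpy as np
from scipy.stats import linregress

np.random.seed(0)
x = np.arange(1, 1001)
y = 1e6 * x ** -1.1 * np.exp(np.random.normal(0, .1, len(x)))  # noisy decay

logx, logy = np.log(x), np.log(y)
v1, v2 = 70, 500                                   # made-up split points
for lo, hi in ((0, v1), (v1, v2), (v2, len(x))):
    a, b, r, _, _ = linregress(logx[lo:hi], logy[lo:hi])
    print('slope (alpha) = %.2f over bins %d-%d (r2 = %.2f)' % (a, lo, hi, r ** 2))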
Ejemplo n.º 40
0
def filter_by_mean(matrx, draw_hist=False, silent=False, bads=None, savefig=None):
    """
    fits the distribution of Hi-C interaction counts by column in the matrix to
    a polynomial, then searches the derivative of the fit for a local minimum
    located below the 5th percentile of the column sums; every column whose
    total count is below that value is flagged as bad.
    """
    nbins = 100
    if not bads:
        bads = {}
    # get sum of columns
    cols = []
    size = len(matrx)
    for c in sorted([[matrx.get(i+j*size, 0) for j in xrange(size) if not j in bads]
                     for i in xrange(size) if not i in bads], key=sum):
        cols.append(sum(c))
    cols = np.array(cols)
    if draw_hist:
        plt.figure(figsize=(9, 9))
    try:
        percentile = np.percentile(cols, 5)
    except IndexError:
        warn('WARNING: no columns to filter out')
        return bads
    # mad = np.median([abs(median - c ) for c in cols])
    best =(None, None, None, None)
    # bin the sum of columns
    xmin = min(cols)
    xmax = max(cols)
    y = np.linspace(xmin, xmax, nbins)
    hist = np.digitize(cols, y)
    x = [sum(hist == i) for i in range(1, nbins + 1)]
    if draw_hist:
        hist = plt.hist(cols, bins=100, alpha=.3, color='grey')
    xp = range(0, int(cols[-1]))
    # check if the binning is correct
    # we want at least half of the bins to contain some data
    try:
        cnt = 0
        while list(x).count(0) > len(x)/2:
            cnt += 1
            cols = cols[:-1]
            xmin = min(cols)
            xmax = max(cols)
            y = np.linspace(xmin, xmax, nbins)
            hist = np.digitize(cols, y)
            x = [sum(hist == i) for i in range(1, nbins + 1)]
            if draw_hist:
                plt.clf()
                hist = plt.hist(cols, bins=100, alpha=.3, color='grey')
            xp = range(0, int(cols[-1]))
            if cnt > 10000:
                raise ValueError
        # find best polynomial fit in a given range
        for order in range(6, 18):
            z = np.polyfit(y, x, order)
            zp = np.polyder(z, m=1)
            roots = np.roots(np.polyder(z))
            # check that we are concave down, otherwise take next root
            pente = np.polyval(zp, abs(roots[-2] - roots[-1]) / 2 + roots[-1])
            if pente > 0:
                root = roots[-1]
            else:
                root = roots[-2]
            # root must be higher than zero
            if root <= 0:
                continue
            # and lower than the median
            if root >= percentile:
                continue
            p  = np.poly1d(z)
            R2 = get_r2(p, x, y)
            # try to avoid very large orders by weighting their fit negatively
            if order > 13:
                R2 -= float(order)/30
            if best[0] < R2:
                best = (R2, order, p, z, root)
        try:
            p, z, root = best[2:]
            if draw_hist:
                xlims = plt.xlim()
                ylims = plt.ylim()
                a = plt.plot(xp, p(xp), "--", color='k')
                b = plt.vlines(root, 0, plt.ylim()[1], colors='r', linestyles='dashed')
                # c = plt.vlines(median - mad * 1.5, 0, 110, colors='g',
                #                linestyles='dashed')
                try:
                    plt.legend(a+[b], ['polyfit \n%s' % (
                        ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                                     '$%s%.1fx^%s$' % ('+' if j>0 else '', j,
                                                       '{' + str(i) + '}'))
                                 for i, j in enumerate(list(p)[::-1])])),
                                           'first solution of polynomial derivation'],
                               fontsize='x-small')
                except TypeError:
                    plt.legend(a+[b], ['polyfit \n%s' % (
                        ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                                     '$%s%.1fx^%s$' % ('+' if j>0 else '', j,
                                                       '{' + str(i) + '}'))
                                 for i, j in enumerate(list(p)[::-1])])),
                                           'first solution of polynomial derivation'])
                # plt.legend(a+[b]+[c], ['polyfit \n{}'.format (
                #     ''.join([sub('e([-+][0-9]+)', 'e^{\\1}',
                #                  '${}{:.1}x^{}$'.format ('+' if j>0 else '', j,
                #                                         '{' + str(i) + '}'))
                #              for i, j in enumerate(list(p)[::-1])])),
                #                        'first solution of polynomial derivation',
                #                        'median - (1.5 * median absolute deviation)'],
                #            fontsize='x-small')
                plt.ylim([0, ylims[1]])
                plt.xlim(xlims)
                plt.xlabel('Sum of interactions')
                plt.ylabel('Number of columns with a given value')
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
            # label as bad the columns with sums lower than the root
            for i, col in enumerate([[matrx.get(i+j*size, 0)
                                      for j in xrange(size)]
                                     for i in xrange(size)]):
                if sum(col) < root:
                    bads[i] = sum(col)
            # now stored in Experiment._zeros, used for getting more accurate z-scores
            if bads and not silent:
                stderr.write(('\nWARNING: removing columns having less than %s ' +
                              'counts:\n %s\n') % (
                                 round(root, 3), ' '.join(
                                     ['%5s'%str(i + 1) + (''if (j + 1) % 20 else '\n')
                                      for j, i in enumerate(sorted(bads.keys()))])))
        except:
            if not silent:
                stderr.write('WARNING: Too many zeroes to filter columns.' +
                             ' SKIPPING...\n')
            if draw_hist:
                plt.xlabel('Sum of interactions')
                plt.ylabel('Number of columns with a given value')
                if savefig:
                    tadbit_savefig(savefig)
                else:
                    plt.show()
    except ValueError:
        if not silent:
            stderr.write('WARNING: Too few data to filter columns based on ' +
                         'mean value.\n')
    if draw_hist:
        plt.close('all')
    return bads
Ejemplo n.º 41
0
def plot_iterative_mapping(fnam1, fnam2, total_reads=None, axe=None, savefig=None):
    """
    :param fnam1: input file name for the first read end
    :param fnam2: input file name for the second read end
    :param total_reads: total number of reads in the initial FASTQ file
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :returns: one dictionary per read end with the cumulative number of reads
       mapped at each read length (or at each mapping iteration)
    """
    count_by_len = {}
    total_reads = total_reads or 1
    if not axe:
        fig=plt.figure()
        _ = fig.add_subplot(111)
    colors = ['olive', 'darkcyan']
    iteration = False
    for i, fnam in enumerate([fnam1, fnam2]):
        fhandler = open(fnam)
        line = fhandler.next()
        count_by_len[i] = {}
        while line.startswith('#'):
            if line.startswith('# MAPPED '):
                itr, num = line.split()[2:]
                count_by_len[i][int(itr)] = int(num)
            line = fhandler.next()
        if not count_by_len[i]:
            iteration = True
            try:
                while True:
                    _, length, _, _ = line.rsplit('\t', 3)
                    try:
                        count_by_len[i][int(length)] += 1
                    except KeyError:
                        count_by_len[i][int(length)] = 1
                    line = fhandler.next()
            except StopIteration:
                pass
        fhandler.close()
        lengths = sorted(count_by_len[i].keys())
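        # convert the per-length (or per-iteration) counts into cumulative
        # counts: after this loop count_by_len[i][k] is the number of reads
        # mapped at a length (or iteration) <= k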
        for k in lengths[::-1]:
            count_by_len[i][k] += sum([count_by_len[i][j]
                                       for j in lengths if j < k])
        plt.plot(lengths, [float(count_by_len[i][l]) / total_reads
                           for l in lengths],
                 label='read' + str(i + 1), linewidth=2, color=colors[i])
    if iteration:
        plt.xlabel('read length (bp)')
    else:
        plt.xlabel('Iteration number')
    if total_reads != 1:
        plt.ylabel('Proportion of mapped reads')
    else:
        plt.ylabel('Number of mapped reads')
    plt.legend(loc=4)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
    return count_by_len
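
A minimal usage sketch of plot_iterative_mapping; the file names and numbers below are hypothetical, and the inputs are expected to be the per-read-end mapping files whose headers are parsed above.

counts = plot_iterative_mapping('sample_read1.tsv', 'sample_read2.tsv',
                                total_reads=10000000,
                                savefig='iterative_mapping.png')
# counts[0] and counts[1] hold, for each read end, the cumulative number of
# reads mapped at each read length (or at each mapping iteration)
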
Ejemplo n.º 42
0
    def draw(self, focus=None, extras=None, ymax=None, ali_colors=('grey',),
             normalized=True, savefig=None, shape='ellipse'):
        """
        Draw alignments as a plot.
        
        :param None focus: can pass a tuple (bin_start, bin_stop) to display the
           alignment between these genomic bins
        :param None extras: list of coordinates (genomic bin) where to draw a
           red cross
        :param None ymax: limit the y axis up to a given value
        :param ('grey', ) ali_colors: successive colors used to shade the
           alignment columns
        :param True normalized: normalized Hi-C counts are plotted instead of
           raw data
        :param 'ellipse' shape: which kind of shape to use as schematic
           representation of TADs. Implemented: 'ellipse', 'rectangle',
           'triangle'
        :param None savefig: path to a file where to save the image generated;
           if None, the image will be shown using matplotlib GUI (the extension
           of the file name will determine the desired format).
        """
        from matplotlib.cm import jet
        from matplotlib import pyplot as plt
        experiments = self.__experiments

        maxres = max([e.resolution for e in experiments])
        facts = [maxres / e.resolution for e in experiments]
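        # scaling factors used to express every experiment's TAD coordinates
        # in bins of the coarsest (largest) resolution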
        

        siz = experiments[0].size
        if focus:
            figsiz = 4 + (focus[1] - focus[0]) / 30
        else:
            figsiz = 4 + siz / 30
        fig, axes = plt.subplots(nrows=len(experiments),
                                 sharex=True, sharey=True,
                                 figsize=(figsiz, 1 + len(experiments) * 1.8))
        fig.subplots_adjust(hspace=0)

        maxys = []
        for iex, xpr in enumerate(experiments):
            if xpr.name not in self:
                continue
            _tad_density_plot(xpr, maxys=maxys, normalized=normalized,
                              fact_res=facts[iex], axe=axes[iex],
                              extras=extras, shape=shape, focus=focus)
        # draw alignment columns
        start = focus[0] if focus else 1
        end = focus[1] if focus else xpr.tads[max(xpr.tads)]['end']
        maxy = (ymax or max(maxys)) + 0.4
        maxxs = []
        for iex in range(len(experiments)):
            starting = focus[0] if focus else 1
            ending = (focus[1] if focus
                      else experiments[iex].tads.values()[-1]['end'])
            axes[iex].hlines(1, 1, end, 'k', lw=1.5)
            axes[iex].set_ylim((0, maxy))
            maxxs.append(ending / facts[iex])
            axes[iex].text(starting + 1, float(maxy) / 20,
                           experiments[iex].name, {'ha': 'left',
                                                   'va': 'bottom'})
            axes[iex].set_yticks([float(i) / 2
                                  for i in range(1, int(maxy + .5) * 2)])
            if ymax:
                axes[iex].set_ylim((0, ymax))
            axes[iex].set_xlim(xmin=starting, xmax=max(maxxs))

        pos = {'ha': 'center', 'va': 'bottom'}
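        # shade each alignment column, from the earliest to the latest aligned
        # TAD border across experiments, and write its number above the top panel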
        for i, col in enumerate(self.itercolumns()):
            ends = sorted([(t['end'], j) for j, t in enumerate(col) if t['end']])
            beg = (ends[0][0] + 0.9) / facts[ends[0][1]]
            end = (ends[-1][0] + 1.1) / facts[ends[-1][1]]
            if focus:
                if beg < focus[0] or end > focus[1]:
                    continue
            axes[0].text(beg + float(end - beg) / 2, maxy + float(maxy) / 20,
                         str(i + 1), pos,
                         rotation=90, size='small')
            for iex, tad in enumerate(col):
                if not tad['end']:
                    continue
                axes[iex].axvspan(beg - .2, end + .2, alpha=0.2,
                                  color=ali_colors[i % len(ali_colors)])
        axes[iex].set_xlabel('Genomic bin')
        tit1 = fig.suptitle("TAD borders' alignment", size='x-large')
        tit2 = axes[0].set_title("Alignment column number")
        tit2.set_y(1.3)
        plt.subplots_adjust(top=0.76)
        # This was for color bar instead of legend
        # ax1 = fig.add_axes([0.9 + 0.3/figsiz, 0.05, 0.2/figsiz, 0.9])
        # cb1 = colorbar.ColorbarBase(ax1, cmap=jet,
        #                             norm=colors.Normalize(vmin=0., vmax=1.))
        # cb1.set_label('Border prediction score')
        # cb1.ax.set_yticklabels([str(i)for i in range(1, 11)])
        fig.set_facecolor('white')
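        # dummy points used as proxy artists to build the border-score legend,
        # one marker per score (1 to 10), colored with the jet colormap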
        plots = []
        for scr in xrange(1, 11):
            plots += plt.plot((100,),(100,), marker=6, ms=9,
                              color=jet(float(scr) / 10), mec='none')
        try:
            axes[-1].legend(plots,
                            [str(scr) for scr in xrange(1, 11)],
                            numpoints=1, title='Border scores',
                            fontsize='small', loc='lower left',
                            bbox_to_anchor=(1, 0.5))
        except TypeError:
            axes[-1].legend(plots,
                            [str(scr) for scr in xrange(1, 11)],
                            numpoints=1, title='Border scores',
                            loc='lower left',
                            bbox_to_anchor=(1, 0.5))
        if savefig:
            tadbit_savefig(savefig)
        else:
            plt.show()
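
A minimal usage sketch of draw, assuming ali is an instance of the alignment class this method belongs to; the focus coordinates, extra positions and output path are hypothetical.

ali.draw(focus=(1, 500),              # show only genomic bins 1 to 500
         extras=[120, 340],           # mark these bins with a red cross
         shape='triangle',            # draw TADs as triangles instead of ellipses
         ymax=3,                      # cap the y axis
         savefig='tad_alignment.png')
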
Ejemplo n.º 43
0
def insert_sizes(fnam, savefig=None, nreads=None, max_size=99.9, axe=None,
                 xlog=False):
    """
    Plots the distribution of dangling-end lengths.

    :param fnam: input file name
    :param None savefig: path where to store the output image
    :param None nreads: number of reads to process (by default all reads are
       used)
    :param 99.9 max_size: top percentile of distances to consider; the longest
       0.1% of distances usually contains very long outliers
    :param None axe: a matplotlib.axes.Axes object to define the plot
       appearance
    :param False xlog: represent x axis in logarithmic scale

    :returns: the median value and the percentile given as max_size.
    """
    distr = {}
    genome_seq = OrderedDict()
    fhandler = open(fnam)
    line = fhandler.next()
    while line.startswith('#'):
        if line.startswith('# CRM '):
            crm, clen = line[6:].split()
            genome_seq[crm] = int(clen)
        line = fhandler.next()
    des = []
    if nreads:
        nreads /= 2
    try:
        while True:
            (crm1, pos1, dir1, _, re1, _,
             crm2, pos2, dir2, _, re2) = line.strip().split('\t')[1:12]
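            # dangling ends: both read ends fall within the same restriction
            # fragment, on the same chromosome, on opposite strands and facing
            # each other; their separation estimates the insert size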
            if re1 == re2 and crm1 == crm2 and dir1 != dir2:
                pos1, pos2 = int(pos1), int(pos2)
                if (pos2 > pos1) == int(dir1):
                    des.append(abs(pos2 - pos1))
                if len(des) == nreads:
                    break
            line = fhandler.next()
    except StopIteration:
        pass
    fhandler.close()
    ax = setup_plot(axe, figsize=(10, 5.5))
    max_perc = np.percentile(des, max_size)
    perc99   = np.percentile(des, 99)
    perc01   = np.percentile(des, 1)
    perc50   = np.percentile(des, 50)
    perc95   = np.percentile(des, 95)
    perc05   = np.percentile(des, 5)
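    # shade the bulk of the distribution (5-95 percentiles) and its tails
    # (1-5 and 95-99 percentiles) behind the histogram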
    desapan = ax.axvspan(perc95, perc99, facecolor='darkolivegreen', alpha=.3,
                         label='1-99%% DEs\n(%.0f-%.0f nts)' % (perc01, perc99))
    ax.axvspan(perc01, perc05, facecolor='darkolivegreen', alpha=.3)
    desapan = ax.axvspan(perc05, perc95, facecolor='darkseagreen', alpha=.3,
                         label='5-95%% DEs\n(%.0f-%.0f nts)' % (perc05, perc95))
    deshist = ax.hist(des, bins=100, range=(0, max_perc),
                      alpha=.7, color='darkred', label='Dangling-ends')
    ylims   = ax.get_ylim()
    plots   = []
    ax.set_xlabel('Genomic distance between reads')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of dangling-end ' +
                 'lengths\n(median: %s, top %.1f%%, up to %.0f nts)' % (
                     perc50, max_size, max_perc))
    if xlog:
        ax.set_xscale('log')
    ax.set_xlim((50, max_perc))
    plt.subplots_adjust(left=0.1, right=0.75)
    ax.legend(bbox_to_anchor=(1.4, 1), frameon=False)
    if savefig:
        tadbit_savefig(savefig)
    elif not axe:
        plt.show()
    plt.close('all')
    return perc50, max_perc
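
A minimal usage sketch of insert_sizes; the input file name and read count are hypothetical, and the file is expected to follow the tab-separated intermediate-read format parsed above.

median_size, upper_size = insert_sizes('sample_reads.tsv', nreads=1000000,
                                       xlog=True, savefig='insert_sizes.png')
print 'median dangling-end length (nts):', median_size
print '99.9th percentile of dangling-end lengths (nts):', upper_size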