Ejemplo n.º 1
0
def get_radial_statistic(R,
                         func=np.nanmean,
                         genome='hg38'):
    """
    Reduce the radial positions of each chromosome to a single number.

    Params:
    -------
        R: lists of radial positions, indexed by chromosome
        func: reduction applied to each entry of R (default: np.nanmean)
        genome: genome name handed to const.get_genome_sizes; the length of
                the returned sizes fixes the length of the output

    Returns:
    --------
        R_stat: float array with one slot per entry of the genome sizes.
                Slot k holds func(R[k]) for every k < len(R); slots that R
                does not reach keep their initial value of 0.
    """
    n_slots = len(const.get_genome_sizes(genome))
    R_stat = np.zeros(n_slots)

    # RuntimeWarnings raised by func (np.nanmean emits one on empty or
    # all-NaN input) are silenced for the duration of the reduction.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for k in range(len(R)):
            R_stat[k] = func(R[k])

    return R_stat
def get_bin_sizes(resolution, genome="hg38"):
    """
    Helper function to compute, for every chromosome of a target genome, the
    number of bins the chromosome is discretized into at a given resolution.

    Params:
    -------
        resolution: the resolution at which the chromosomes will be binned,
                    in basepairs
        genome: genome type used to look up chromosome sizes

    Returns:
    --------
        bin_sizes_chr: numpy integer array with one entry per entry of the
                       genome sizes. Entry 0 is always 0; entry i (i >= 1) is
                       ceil(SIZES[i] / resolution).
    """
    SIZES = const.get_genome_sizes(genome)

    # Slot 0 is a zero-width placeholder: sizes are only read from index 1
    # upward (presumably so that indices match chromosome numbers -- confirm
    # against const.get_genome_sizes).
    bin_counts = [0]
    bin_counts.extend(
        int(np.ceil(SIZES[i] / resolution)) for i in range(1, len(SIZES)))

    # Build the array once instead of re-allocating it with np.append on
    # every iteration.
    return np.array(bin_counts)
Ejemplo n.º 3
0
def get_radial_dists(cells,
                     genome='hg38'):
    """
    Gather the radial positions of all reads, grouped by chromosome.

    Params:
    -------
        cells: iterable of single-cell dataframes; each must provide the
               columns "hg38_chr" and "norm_r_2D"
        genome: target genome, used only to look up how many chromosome
                slots to allocate

    Returns:
    --------
        R: list of lists; R[c] holds the "norm_r_2D" value of every read
           whose "hg38_chr" value is c, accumulated over all cells

    NOTE(review): the chromosome column name is fixed to "hg38_chr"
    regardless of the genome argument -- confirm this is intended.
    """
    n_slots = len(const.get_genome_sizes(genome))

    # One independent, initially empty bucket per chromosome slot
    R = [[] for _ in range(n_slots)]

    for cell in cells:
        chromosomes = cell["hg38_chr"].values
        radial_positions = cell["norm_r_2D"].values

        # The chromosome value of a read is used directly as bucket index
        for chrom, radius in zip(chromosomes, radial_positions):
            R[chrom].append(radius)

    return R
def make_genome_wide_matrix(cells, resolution=10 * 10**6, genome="hg38"):
    """
    Make a population ensemble genome wide distance matrix given a list of
    single cells. For each cell, iterate over chromosome pairs, hand the
    corresponding reads to populate_tile, and keep the matrix it returns.
    Reads on the same chromosome are processed one cluster (e.g. homolog) at
    a time; reads on different chromosomes are processed together.

    Params:
    -------
        cells: list of cells, each a dataframe
        resolution: matrix resolution, in basepairs
        genome: target genome, to retrieve constants

    Returns:
    --------
        GM: the genome wide distance matrix as returned by the last
            populate_tile call (or the empty matrix from init_empty_matrix
            if no tile was populated). Per the original documentation each
            pixel is a list of distances observed at that pair of genomic
            positions, binned at the input resolution.
    """
    #Init constants
    SIZES = const.get_genome_sizes(genome)  #chromosome sizes
    KEYS = const.get_genome_keys(genome)  #dataframe keys
    chr_count = len(SIZES)

    #Total number of bins over all chromosomes
    total_bins = np.sum(get_bin_sizes(resolution, genome))

    #Init genome wide distance matrix
    GM = init_empty_matrix(total_bins)

    for cell in cells:
        #Select the reads of every chromosome once per cell instead of once
        #per chromosome pair. The same frame may therefore be handed to
        #populate_tile several times; this assumes populate_tile does not
        #modify its dataframe arguments -- TODO confirm.
        reads_by_chr = {
            c: cell.loc[cell[KEYS["chr"]] == c]
            for c in range(1, chr_count)
        }

        for i in range(1, chr_count):
            ci = reads_by_chr[i]

            #Same chromosome: handle each cluster (e.g. homolog) separately
            cluster_key = KEYS["cluster"]
            for idx in ci[cluster_key].unique():
                cluster = ci.loc[ci[cluster_key] == idx]
                #Skip empty selections. NOTE(review): the original comment
                #said "need more than 1 read" but the test only rejects
                #zero reads; behavior is kept as it was.
                if len(cluster) < 1:
                    continue
                GM = populate_tile(GM, i, i, cluster, cluster, resolution)

            #Different chromosomes: all reads of i against all reads of j
            if len(ci) < 1:
                continue
            for j in range(i + 1, chr_count):
                cj = reads_by_chr[j]
                if len(cj) < 1:
                    continue
                GM = populate_tile(GM, i, j, ci, cj, resolution)

    return GM
def make_ensemble_matrix(data,
                         chr_num,
                         resolution=2.5 * 10**6,
                         statistic=np.nanmean,
                         genome="mm10"):
    """
    Build an ensemble distance matrix for a particular chromosome given a
    dataframe.

    Params:
    -------
         data: the dataframe (must be prefiltered on stage, parent, and chr);
               the columns "chr" and "cell_index" are read here
         chr_num: desired chromosome number for ensemble distance
         resolution: matrix resolution in base pairs
         statistic: function to get distance metric
         genome: species corresponding to the data; used for the chromosome
                 size here and forwarded to make_distance_matrix so both use
                 the same number of bins

    Returns:
    --------
        A_ensemble: the ensemble distance matrix, i.e. the result of
                    flatten_matrix applied with the given statistic to the
                    per-bin pooled distances of all cells.

    """
    #Reads of the requested chromosome, split into one frame per cell
    chr_data = data.loc[data["chr"] == chr_num]

    clusters = []
    for ci in data["cell_index"].unique():
        cluster = chr_data.loc[chr_data["cell_index"] == ci]
        #Cells without reads on this chromosome contribute nothing
        if len(cluster) > 0:
            clusters.append(cluster)

    #Get matrix parameters
    SIZES = const.get_genome_sizes(genome)
    num_bins = int(np.ceil(SIZES[chr_num] / resolution))

    #Make the ensemble matrix by concatenating all the single cell matrices
    A_ensemble = init_empty_matrix(num_bins)
    for cluster in clusters:
        #genome must be passed on: otherwise the single cell matrix is sized
        #for make_distance_matrix's default genome, not the requested one
        A = make_distance_matrix(cluster,
                                 resolution=resolution,
                                 flatten=False,
                                 genome=genome)
        for i in range(len(A)):
            for j in range(len(A)):
                A_ensemble[i][j] = np.concatenate((A_ensemble[i][j], A[i][j]))

    #Flatten the matrix using distance metric function
    A_ensemble = flatten_matrix(A_ensemble, func=statistic)

    return A_ensemble
def make_distance_matrix(cluster,
                         resolution=2.5 * 10**6,
                         statistic=np.nanmean,
                         flatten=True,
                         genome="mm10"):
    """
    Function to generate a distance matrix from a chromosome cluster.

    Params:
    -------
        cluster: pandas dataframe with chromosome cluster information; all
                 rows must share one integer value in the "chr" column
        resolution: matrix resolution in base pairs
        statistic: function implementing desired distance matrix
        flatten: If true, apply the distance function,
                 if false, return matrix with variable length
        genome: human or mouse genome, string

    Returns:
    --------
        A: distance matrix as defined by the statistic (flatten=True), or
           the unflattened matrix whose cells are lists of the pairwise
           distances that fell into that pair of bins (flatten=False)

    Raises:
    -------
        ValueError: if the cluster spans more than one chromosome, or its
                    chromosome identifier is not an integer
        IndexError: if the cluster has no rows
    """

    #The original guard compared type(unique()[0]) to np.int64, which never
    #detects several chromosomes and rejects other integer dtypes
    chr_values = cluster["chr"].unique()
    if len(chr_values) > 1:
        raise ValueError("Cluster includes multiple chromosomes.")

    chr_num = chr_values[0]
    if not isinstance(chr_num, (int, np.integer)):
        raise ValueError("Chromosome identifier must be an integer.")

    SIZES = const.get_genome_sizes(genome)
    KEYS = const.get_genome_keys(genome)

    num_bins = int(np.ceil(SIZES[chr_num] / resolution))

    A = init_empty_matrix(num_bins)
    bin_edges = np.arange(0, num_bins + 1) * resolution

    #Get spatial position vector and genomic position vector
    R = np.array([
        cluster[KEYS["x"]].values, cluster[KEYS["y"]].values,
        cluster[KEYS["z"]].values
    ]).T
    P = np.array(cluster[KEYS["pos"]].values)

    #Bin index of every read, and the full symmetric matrix of pairwise
    #spatial distances between reads
    P_inds = np.digitize(P, bin_edges) - 1
    R_sf = distance.squareform(distance.pdist(R))

    #Do the matrix binning: every unordered pair of reads is recorded in
    #both mirrored cells. Two reads in the same bin therefore add their
    #distance twice to that diagonal cell.
    n_reads = len(P_inds)
    for i in range(n_reads):
        for j in range(i + 1, n_reads):
            ii, jj = P_inds[i], P_inds[j]
            dist_ij = R_sf[i][j]
            A[ii][jj].append(dist_ij)
            A[jj][ii].append(dist_ij)

    if flatten:
        #Now flatten lists into 2D matrix according to some statistic
        A = flatten_matrix(A, func=statistic)

    return A
def draw_genome_wide_matrix(A,
                            xlabel="\nGenomic Coordinate [Mb]",
                            clabel='\nSpatial Distance [um]',
                            resolution=10 * 10**6,
                            q=0.01,
                            genome='hg38'):
    """
    Draw a genome wide distance matrix and show it.

    Params:
    -------
        A: distance matrix
        xlabel, clabel: x-axis and colorbar labels
        resolution: matrix resolution in basepairs
        q: percentile cutoff for clim
        genome: target genome, to retrieve constants; used for both the
                chromosome count and the per-chromosome bin sizes

    Returns:
    --------
        fig, ax: the figure and axes where the matrix is drawn
    """
    import copy  #local import: only needed for the colormap copy below

    fig, ax = plt.subplots()

    chr_count = len(const.get_genome_sizes(genome))

    #Bin sizes for matrix per chromosome. genome is passed on so that the
    #bins belong to the same genome as chr_count.
    bin_sizes_chr = get_bin_sizes(resolution, genome)
    total_bins = np.sum(bin_sizes_chr)
    sum_sizes = np.cumsum(bin_sizes_chr)  #chromosome boundaries, in bins

    clim = get_clims([A], q)  #clim = qth and 1-qth percentile values
    clim = (0, clim[1])  #start linear scale at 0

    #Draw outline around chromosome territories: consecutive cumulative
    #sums are the first and last bin boundary of one chromosome
    for offset0, offset1 in zip(sum_sizes[:-1], sum_sizes[1:]):
        ax.hlines(offset0 - 1, offset0, offset1, lw=1, color='black')
        ax.hlines(offset1 - 1, offset0, offset1, lw=1, color='black')
        ax.vlines(offset0, offset0 - 1, offset1 - 1, lw=1, color='black')
        ax.vlines(offset1, offset0 - 1, offset1 - 1, lw=1, color='black')

    #Work on a copy so that set_bad does not alter a colormap instance that
    #other plots may share
    cmap = copy.copy(plt.get_cmap('seismic_r'))
    cmap.set_bad(color='lightgrey')  #handle unmapped regions

    #Draw the matrix and interpolate adjacent unmapped regions
    cax = ax.imshow(A, cmap=cmap, interpolation='nearest')
    cax.set_clim(clim)
    fig.colorbar(cax, label=clabel)

    #Handle tick labels: chromosomes 1-8 are numbered, 10 carries an
    #ellipsis on the x axis, 23 and 24 are labelled X and Y, all others are
    #left blank
    x_tick_labels = ["Chr 1       "]
    y_tick_labels = ["1"]
    for i in range(2, chr_count):
        if i == 23:
            x_label = y_label = "X"
        elif i == 24:
            x_label = y_label = "Y"
        elif i < 9:
            x_label = y_label = str(i)
        elif i == 10:
            x_label, y_label = " ⋯ ", ""
        else:
            x_label = y_label = ""
        x_tick_labels.append(x_label)
        y_tick_labels.append(y_label)

    #There is one more boundary (the end of the last chromosome) than
    #labels; give it a blank label so that tick and label counts agree,
    #which matplotlib requires for fixed tick positions
    n_missing = len(sum_sizes) - len(x_tick_labels)
    x_tick_labels.extend([""] * n_missing)
    y_tick_labels.extend([""] * n_missing)

    #Set ticks on ax explicitly rather than on whatever the current axes is
    ax.set_xticks(sum_sizes)
    ax.set_yticks(sum_sizes)
    ax.set_xticklabels(x_tick_labels)
    ax.set_yticklabels(y_tick_labels)

    ax.set_xlabel(xlabel)
    ax.set_xlim(0, total_bins)
    ax.set_ylim(total_bins, 0)  #origin at the top left
    plt.tight_layout()
    plt.show()

    return fig, ax