def get_cdist(c1, c2, genome="hg38"):
    """
    Compute the spatial distances between every read of one cluster and
    every read of another (a cluster being a single chromosome copy).

    Params:
    -------
        c1: first cluster, dataframe
        c2: second cluster, dataframe
        genome: genome type used to look up dataframe keys, string

    Returns:
    --------
        R_cdist: pairwise distance matrix between the two clusters, as
                 returned by distance.cdist with its default metric
        P1: array of genomic positions for the first cluster
        P2: array of genomic positions for the second cluster
    """
    keys = const.get_genome_keys(genome)

    def spatial_coords(cluster):
        #One row per read, columns ordered x, y, z
        return np.array([
            cluster[keys["x"]].values,
            cluster[keys["y"]].values,
            cluster[keys["z"]].values,
        ]).T

    def genomic_coords(cluster):
        return np.array(cluster[keys["pos"]].values)

    xyz_first = spatial_coords(c1)
    xyz_second = spatial_coords(c2)

    P1 = genomic_coords(c1)
    P2 = genomic_coords(c2)

    #Rows follow the reads of c1, columns the reads of c2
    R_cdist = distance.cdist(xyz_first, xyz_second)

    return (R_cdist, P1, P2)
Ejemplo n.º 2
0
def get_hull(cell, dim=3, genome='hg38'):
    """
    Build the convex hull of a cell's reads in n-d space.

    Params:
    -------
        cell: target cell, dataframe
        dim: number of spatial dimensions to use; the first `dim` column
             names listed under the genome's 'dim' key are read
        genome: target genome, for constants

    Returns:
    --------
        hull: convex hull object built by ConvexHull from the read
              coordinates (one point per read)
    """
    keys = const.get_genome_keys(genome)

    #One row per read, one column per requested spatial axis
    points = np.array(
        [cell[keys['dim'][axis]].values for axis in range(dim)]).T

    return ConvexHull(points)
def get_pdists(cluster, genome="hg38"):
    """
    Compute all pairwise spatial distances between the reads of a cluster
    (e.g. single chromosome copy, chromosome arm), together with the
    matching pairwise genomic distances.

    Params:
    -------
        cluster: reads of interest, dataframe
        genome: target genome, to retrieve constants, string

    Returns:
    --------
        R_pdist: condensed array of pairwise spatial distances, as
                 returned by distance.pdist
        P_pdist: condensed array of pairwise genomic distances, in the
                 same pair order as R_pdist
    """
    keys = const.get_genome_keys(genome)

    #Spatial coordinates: one row per read, columns x, y, z
    xyz = np.array(
        [cluster[keys[axis]].values for axis in ("x", "y", "z")]).T

    #Genomic positions as a single-column matrix so pdist treats each
    #read as a one-dimensional point
    genomic = np.array(cluster[keys["pos"]].values)
    genomic_column = genomic[:, np.newaxis]

    R_pdist = distance.pdist(xyz)
    P_pdist = distance.pdist(genomic_column)

    return R_pdist, P_pdist
def make_genome_wide_matrix(cells, resolution=10 * 10**6, genome="hg38"):
    """
    Build a population-ensemble, genome-wide distance matrix from a list
    of single cells. For every cell and every ordered chromosome pair
    (i <= j), the reads are handed to populate_tile, which is expected to
    add their pairwise distances to the matching tile of the matrix.

    Same-chromosome tiles (i == j) are filled one cluster at a time, so
    distances are only taken within a cluster (e.g. within one homolog).
    Cross-chromosome tiles use all reads of both chromosomes.

    Params:
    -------
        cells: list of cells, each a dataframe
        resolution: matrix resolution, in basepairs
        genome: target genome, to retrieve constants

    Returns:
    --------
        GM: the genome wide distance matrix; each pixel is a list of
            distances observed at that corresponding pair of genomic
            positions, binned at the input resolution.
    """
    sizes = const.get_genome_sizes(genome)  #chromosome sizes
    keys = const.get_genome_keys(genome)  #dataframe keys
    n_chr = len(sizes)

    #Number of bins per chromosome; their sum is the matrix side length
    bins_per_chr = get_bin_sizes(resolution, genome)
    GM = init_empty_matrix(np.sum(bins_per_chr))

    def reads_on(cell, chr_num):
        #All reads of the cell assigned to the given chromosome number
        return cell.loc[cell[keys["chr"]] == chr_num]

    for cell in cells:
        #Chromosome numbers visited are 1 .. n_chr - 1.
        #NOTE(review): whether the last chromosome is meant to be
        #excluded depends on the layout of the sizes table - confirm.
        for i in range(1, n_chr):
            for j in range(i, n_chr):
                if i != j:
                    #Different chromosomes: use every read of each
                    ci = reads_on(cell, i)
                    cj = reads_on(cell, j)
                    if len(ci) > 0 and len(cj) > 0:
                        GM = populate_tile(GM, i, j, ci, cj, resolution)
                    continue

                #Same chromosome: treat each cluster (e.g. homolog)
                #separately and keep only intra-cluster distances
                chro = reads_on(cell, i)
                for label in chro[keys["cluster"]].unique():
                    members = chro.loc[chro[keys["cluster"]] == label]
                    #NOTE(review): this guard only skips empty clusters;
                    #an earlier comment claimed "need more than 1 read".
                    if len(members) > 0:
                        GM = populate_tile(GM, i, j, members, members,
                                           resolution)

    return GM
def get_cell_clusters(cell, chr_nums, genome="hg38"):
    """
    Collect the clusters (i.e. single chromosome copies) of a single cell
    for a given list of chromosome numbers.

    For "mm10" every cluster of each chromosome is returned. For "hg38"
    only the two largest clusters (by read count) of each chromosome are
    returned, and a chromosome with a single cluster contributes nothing.

    Params:
    -------
        cell: cell of interest, dataframe
        chr_nums: chromosomes of interest, list of ints
        genome: target genome, to retrieve constants, string

    Returns:
    --------
        cell_clusters: list of dataframes corresponding to single chr copies

    Raises:
    -------
        ValueError: if a requested chromosome has reads and genome is
                    neither "mm10" nor "hg38"
    """
    keys = const.get_genome_keys(genome)

    cell_clusters = []

    for chr_num in chr_nums:
        chro = cell.loc[cell[keys["chr"]] == chr_num]
        if len(chro) == 0:
            continue  #e.g. x chromosome not present

        #One dataframe per cluster label, in order of first appearance
        copies = [
            chro.loc[chro[keys["cluster"]] == label]
            for label in chro[keys["cluster"]].unique()
        ]

        if genome == "mm10":
            cell_clusters.extend(copies)

        #Special handling required by cluster labeling in fibroblast data
        elif genome == "hg38":
            largest_first = sorted(copies, key=len, reverse=True)

            #Keep the two largest clusters, the putative chromosome
            #territories; any smaller clusters are treated as outliers.
            #NOTE(review): a chromosome with exactly one cluster is
            #dropped entirely by this condition - confirm it is intended.
            if len(largest_first) > 1:
                cell_clusters.extend(largest_first[:2])

        else:
            raise ValueError("Genome not found.")

    return cell_clusters
Ejemplo n.º 6
0
def center_cell(cell, origin, dim=3, genome='hg38'):
    """
    Translate cell to a new origin.

    The first `dim` spatial columns (named by the genome's 'dim' key) have
    the matching component of `origin` subtracted from them. The dataframe
    passed in is modified in place and also returned.

    Params:
    -------
        cell: target cell, dataframe
        origin: spatial coordinates of the new origin; must be indexable
                with at least `dim` entries
        dim: dimensions for translation
        genome: target genome to retrieve constants

    Returns:
    --------
        cell: translated cell (the same object that was passed in)
    """

    KEYS = const.get_genome_keys(genome)

    #Nothing to translate. The row-by-row version this replaces never
    #touched the columns of an empty frame, so keep that behavior.
    if len(cell) == 0:
        return cell

    #Shift each axis as a whole column rather than writing cell-by-cell
    #while iterating rows: this is faster, keeps column dtypes coherent,
    #and stays correct when index labels are duplicated (label-based
    #writes would overwrite every row sharing a label).
    for i in range(dim):
        column = KEYS['dim'][i]
        cell[column] = cell[column] - origin[i]

    return cell
def make_distance_matrix(cluster,
                         resolution=2.5 * 10**6,
                         statistic=np.nanmean,
                         flatten=True,
                         genome="mm10"):
    """ 
    Function to generate a distance matrix from a chromosome cluster.

    Reads are binned along the chromosome by genomic position, and every
    pairwise spatial distance is stored in the pixel addressed by the two
    reads' bins (in both orientations, so the matrix is symmetric).
    
    Params:
    -------
        cluster: pandas dataframe with chromosome cluster information;
                 all rows must belong to one chromosome
        resolution: matrix resolution in base pairs
        statistic: function used to reduce each pixel's list of distances
        flatten: If true, reduce each pixel with the statistic, 
                 if false, return the matrix of variable length lists
        genome: human or mouse genome, string

    Returns:
    --------
        A: distance matrix as defined by the statistic, or the raw matrix
           of per-pixel distance lists when flatten is false

    Raises:
    -------
        ValueError: if the cluster spans more than one chromosome, or its
                    chromosome label is not an integer
        IndexError: if the cluster has no rows
    """

    #NOTE(review): the chromosome column is read through the literal
    #"chr" key rather than the genome-specific key table - confirm this
    #holds for every supported genome.
    chr_nums = cluster["chr"].unique()

    #Count the distinct labels; inspecting the type of the first label
    #cannot reveal whether other chromosomes are present.
    if len(chr_nums) > 1:
        raise ValueError("Cluster includes multiple chromosomes.")

    chr_num = chr_nums[0]

    #Accept any integer width (int32, int64, ...), not only np.int64
    if not isinstance(chr_num, (int, np.integer)):
        raise ValueError("Chromosome label must be an integer.")

    SIZES = const.get_genome_sizes(genome)
    KEYS = const.get_genome_keys(genome)

    chr_size = SIZES[chr_num]
    num_bins = int(np.ceil(chr_size / resolution))

    A = init_empty_matrix(num_bins)
    B = np.arange(0, num_bins + 1)  #bin vector

    #Get spatial position vector and genomic position vector
    R = np.array([
        cluster[KEYS["x"]].values, cluster[KEYS["y"]].values,
        cluster[KEYS["z"]].values
    ]).T
    P = np.array(cluster[KEYS["pos"]].values)

    #Bin position vector, then populate binned matrix from unbinned matrix of
    #pdists using binned indices. digitize is 1-based against the bin
    #edges, hence the -1 to obtain 0-based bin indices.
    P_inds = np.digitize(P, B * resolution) - 1
    R_pdist = distance.pdist(R)
    R_sf = distance.squareform(R_pdist)

    #Do the matrix binning: visit each unordered read pair once and record
    #its distance under both (row, col) and (col, row)
    for i in range(len(P_inds)):
        for j in range(i + 1, len(P_inds)):
            ii, jj = P_inds[i], P_inds[j]
            A[ii][jj].append(R_sf[i][j])
            A[jj][ii].append(R_sf[i][j])

    if flatten:
        #Now flatten lists into 2D matrix according to some statistic
        A = flatten_matrix(A, func=statistic)

    return A