def filter_intra_doublets(molecule_table: pd.DataFrame, prop: float = 0.1) -> pd.DataFrame:
    """Filters cells that present too much conflicting allele information.

    For each cellBC, finds the most common allele of every intBC by UMI
    count (number of rows). The UMIs supporting any other allele of that
    intBC are counted as conflicting. If the proportion of conflicting UMIs
    across all UMIs of the cellBC is > prop, every row with that cellBC is
    removed from the DataFrame.

    Args:
        molecule_table: A molecule table of cellBC-UMI pairs to be filtered
        prop: The maximum tolerated proportion of conflicting UMIs; a cellBC
            whose proportion is strictly greater than this is filtered out

    Returns:
        A filtered molecule table
    """
    # Number of rows (UMIs) observed for every cellBC/intBC/allele triple.
    # NOTE: groupby drops rows with a missing key by default, so a cellBC
    # that only has such rows never reaches the passing set below.
    umis_per_allele = molecule_table.groupby(["cellBC", "intBC", "allele"])["UMI"].size()

    umis_per_cellBC = umis_per_allele.groupby(level="cellBC").sum()

    # UMIs backing the best-supported allele of each intBC, summed per cell.
    top_allele_umis = umis_per_allele.groupby(level=["cellBC", "intBC"]).max()
    consensus_umis_per_cellBC = top_allele_umis.groupby(level="cellBC").sum()

    # Everything that is not consensus is conflicting.
    conflicting_umis_per_cellBC = umis_per_cellBC - consensus_umis_per_cellBC
    prop_multi_alleles_per_cellBC = conflicting_umis_per_cellBC / umis_per_cellBC

    passing_mask = prop_multi_alleles_per_cellBC <= prop
    passing_cellBCs = set(prop_multi_alleles_per_cellBC.index[passing_mask])
    logger.debug(
        f"Filtered {(~passing_mask).sum()} cellBCs with too much conflicting "
        "allele information."
    )
    return molecule_table[molecule_table["cellBC"].isin(passing_cellBCs)]
def wrapper(*args, **kwargs):
    # Run the wrapped callable, log summary statistics about the table it
    # returns, and pass that table back to the caller unchanged.
    df = wrapped(*args, **kwargs)

    # A non-object "UMI" column is treated as holding per-row UMI counts
    # (presumably an allele table); an object column is treated as one UMI
    # per row (presumably a molecule table) -- confirm against callers.
    has_umi_counts = df["UMI"].dtype != object
    table_kind = "alleletable" if has_umi_counts else "molecule_table"
    logger.debug(f"Resulting {table_kind} statistics:")

    total_reads = df["readCount"].sum()
    logger.debug(f"# Reads: {total_reads}")

    # Sum the counts when available, otherwise count rows.
    total_umis = df["UMI"].sum() if has_umi_counts else df.shape[0]
    logger.debug(f"# UMIs: {total_umis}")

    n_cell_barcodes = df["cellBC"].nunique()
    logger.debug(f"# Cell BCs: {n_cell_barcodes}")
    return df
def map_intbcs(molecule_table: pd.DataFrame) -> pd.DataFrame:
    """Assign one allele to each intBC/cellBC pair.

    For each intBC/cellBC pairing, selects the most frequent allele (by UMI
    count, with ties broken by total read count) and removes alignments that
    don't have that allele.

    Args:
        molecule_table: A molecule table of cellBC-UMI pairs to be filtered

    Returns:
        The rows of the molecule table that carry the allele selected for
        their cellBC-intBC pair; rows with a missing intBC are dropped
    """
    allele_keys = ["cellBC", "intBC", "allele"]

    # Have to drop out all intBCs that are NaN
    molecule_table = molecule_table.dropna(subset=["intBC"])

    # Rank every allele of a cellBC-intBC pair: highest UMI count first and,
    # on ties, highest summed readCount first.
    allele_table = (
        molecule_table.groupby(allele_keys)
        .agg({"readCount": "sum", "UMI": "count"})
        .reset_index()
        .sort_values(["UMI", "readCount"], ascending=False)
    )

    # After sorting, the first row of each cellBC-intBC pair is the winner;
    # every later row of the same pair is an allele to discard.
    duplicated_mask = allele_table.duplicated(["cellBC", "intBC"])
    mapped_alleles = allele_table.loc[~duplicated_mask, allele_keys]

    # True for rows that contain the mapped allele; False for ones to filter
    # out. A left merge keeps the row order and row count of the molecule
    # table (the winners are unique per key), so the indicator column lines
    # up positionally with molecule_table without a row-wise Python loop.
    matched = molecule_table[allele_keys].merge(
        mapped_alleles, on=allele_keys, how="left", indicator=True
    )
    selection_mask = (matched["_merge"] == "both").to_numpy()
    mapped_table = molecule_table[selection_mask]

    logger.debug(f"Alleles removed: {duplicated_mask.sum()}")
    logger.debug(f"UMIs removed: {(~selection_mask).sum()}")
    return mapped_table
def filter_inter_doublets(at: pd.DataFrame, rule: float = 0.35) -> pd.DataFrame:
    """Filters out cells whose kinship with their assigned lineage is low.

    Essentially, filters out cells that have ambiguous kinship across
    multiple lineage groups. For every cell, calculates the kinship it has
    with its assigned lineage, with kinship defined as the weighted
    proportion of intBCs it shares with the intBC set for a lineage (see
    compute_lg_membership for more details on the weighting). If that
    kinship is < rule, then the cell is filtered out.

    Args:
        at: An allele table of cellBC-intBC-allele groups to be filtered
        rule: The minimum kinship threshold which a cell needs to reach
            (kinship >= rule) in order to be included in the final DataFrame

    Returns:
        A filtered allele table
    """
    ibc_sets = {}
    dropouts = {}
    # Group by the bare column name rather than a one-element list: with a
    # list, pandas >= 2.0 yields 1-tuples such as (3,) as group keys, which
    # would no longer match the plain integer used for the lookup below.
    for lg_name, at_lg in at.groupby("lineageGrp"):
        ibc_sets[lg_name], dropouts[lg_name] = get_intbc_set(at_lg)

    # Calculate kinship for each lineage group for each cell
    n_filtered = 0
    passing_cellBCs = []
    for cellBC, at_cellBC in at.groupby("cellBC"):
        # The lineage of a cell is read from its first row
        lg = int(at_cellBC["lineageGrp"].iloc[0])
        # NOTE(review): mem is indexed by the integer lineage group, so it is
        # presumably keyed like ibc_sets -- confirm in compute_lg_membership
        mem = compute_lg_membership(at_cellBC, ibc_sets, dropouts)
        if mem[lg] < rule:
            n_filtered += 1
        else:
            passing_cellBCs.append(cellBC)

    n_cells = at["cellBC"].nunique()
    logger.debug(f"Filtered {n_filtered} inter-doublets of {n_cells} cells")
    return at[at["cellBC"].isin(passing_cellBCs)]
def wrapper(*args, **kwargs):
    # Record the keyword arguments of this call at debug level, then
    # delegate to the wrapped callable and hand its result back untouched.
    message = f"Keyword arguments: {kwargs}"
    logger.debug(message)
    result = wrapped(*args, **kwargs)
    return result
def call_lineage_groups(
    input_df: pd.DataFrame,
    output_directory: str,
    min_umi_per_cell: int = 10,
    min_avg_reads_per_umi: float = 2.0,
    min_cluster_prop: float = 0.005,
    min_intbc_thresh: float = 0.05,
    inter_doublet_threshold: float = 0.35,
    kinship_thresh: float = 0.25,
    plot: bool = False,
) -> pd.DataFrame:
    """Assigns cells to their clonal populations.

    Performs multiple rounds of filtering and assigning to lineage groups:

    1. Iteratively generates putative lineage groups by forming intBC groups
    for each lineage group and then assigning cells based on how many intBCs
    they share with each intBC group (kinship).

    2. Refines these putative groups by removing non-informative intBCs and
    reassigning cells through kinship.

    3. Removes all inter-lineage doublets, defined as cells that have
    relatively equal kinship scores across multiple lineages and whose
    assignments are therefore ambiguous.

    4. Finally, performs one more round of filtering non-informative intBCs
    and cellBCs with low UMI counts before returning a final table of
    lineage assignments, allele information, and read and umi counts for
    each sample.

    Args:
        input_df: The allele table of cellBC-UMI-allele groups to be
            annotated with lineage assignments
        output_directory: The folder handed to the plotting helpers; only
            used when `plot` is True
        min_umi_per_cell: The threshold specifying the minimum number of
            UMIs a cell needs in order to not be filtered during filtering
        min_avg_reads_per_umi: The threshold specifying the minimum coverage
            (i.e. average) reads per UMI in a cell needed in order for that
            cell not to be filtered during filtering
        min_cluster_prop: The minimum cluster size in the putative lineage
            assignment step, as a proportion of the number of cells
        min_intbc_thresh: The threshold specifying the minimum proportion of
            cells in a lineage group that need to have an intBC in order for
            it be retained during filtering. Also specifies the minimum
            proportion of cells that share an intBC with the most frequent
            intBC in forming putative lineage groups
        inter_doublet_threshold: The threshold specifying the minimum
            proportion of kinship a cell shares with its assigned lineage
            group out of all lineage groups for it to be retained during
            doublet filtering. A falsy value (e.g. 0) skips doublet
            filtering entirely
        kinship_thresh: The threshold specifying the minimum proportion of
            intBCs shared between a cell and the intBC set of a lineage
            group needed to assign that cell to that lineage group in
            putative assignment
        plot: Indicates whether to generate plots

    Returns:
        The allele table annotated with an integer `lineageGrp` column,
        after doublet, intBC and low-UMI filtering
    """
    logger.info(
        f"{input_df.shape[0]} UMIs (rows), with {input_df.shape[1]} attributes (columns)"
    )
    logger.info(str(len(input_df["cellBC"].unique())) + " Cells")

    # Create a pivot_table of UMI counts per cellBC (rows) and intBC
    # (columns), then normalise every row so it sums to 1
    piv = pd.pivot_table(
        input_df, index="cellBC", columns="intBC", values="UMI", aggfunc="count"
    )
    piv = piv.div(piv.sum(axis=1), axis=0)

    # Reorder piv columns by binarized intBC frequency, i.e. by the number
    # of cells in which each intBC is observed
    pivbin = piv.copy()
    pivbin[pivbin > 0] = 1
    intBC_sums = pivbin.sum(0)
    ordered_intBCs = intBC_sums.sort_values(ascending=False).index.tolist()
    piv = piv[ordered_intBCs]
    min_clust_size = int(min_cluster_prop * piv.shape[0])

    logger.info("Assigning initial lineage groups...")
    logger.info(f"Clustering with minimum cluster size {min_clust_size}...")
    piv_assigned = lineage_utils.assign_lineage_groups(
        piv,
        min_clust_size,
        min_intbc_thresh=min_intbc_thresh,
        kinship_thresh=kinship_thresh,
    )

    logger.info("Refining lineage groups...")
    logger.info(
        "Redefining lineage groups by removing low proportion intBCs..."
    )
    master_LGs, master_intBCs = lineage_utils.filter_intbcs_lg_sets(
        piv_assigned, min_intbc_thresh=min_intbc_thresh
    )

    logger.info("Reassigning cells to refined lineage groups by kinship...")
    kinship_scores = lineage_utils.score_lineage_kinships(
        piv_assigned, master_LGs, master_intBCs
    )

    logger.info("Annotating alignment table with refined lineage groups...")
    allele_table = lineage_utils.annotate_lineage_groups(
        input_df, kinship_scores, master_intBCs
    )

    if inter_doublet_threshold:
        logger.info(
            f"Filtering out inter-lineage group doublets with proportion {inter_doublet_threshold}..."
        )
        allele_table = doublet_utils.filter_inter_doublets(
            allele_table, rule=inter_doublet_threshold
        )

    logger.info(
        "Filtering out low proportion intBCs in finalized lineage groups..."
    )
    filtered_lgs = lineage_utils.filter_intbcs_final_lineages(
        allele_table, min_intbc_thresh=min_intbc_thresh
    )
    allele_table = lineage_utils.filtered_lineage_group_to_allele_table(
        filtered_lgs
    )

    logger.debug("Final lineage group assignments:")
    # Group by the bare column name: grouping by a one-element list makes
    # pandas >= 2.0 yield 1-tuple keys, which would be logged as "LG (1,)"
    for n, g in allele_table.groupby("lineageGrp"):
        logger.debug(f"LG {n}: " + str(len(g["cellBC"].unique())) + " cells")

    logger.info("Filtering out low UMI cell barcodes...")
    allele_table = utilities.filter_cells(
        allele_table,
        min_umi_per_cell=int(min_umi_per_cell),
        min_avg_reads_per_umi=min_avg_reads_per_umi,
    )
    allele_table["lineageGrp"] = allele_table["lineageGrp"].astype(int)

    if plot:
        logger.info("Producing Plots...")
        # Binary presence/absence matrix of intBCs per cell for the heatmaps
        at_pivot_I = pd.pivot_table(
            allele_table,
            index="cellBC",
            columns="intBC",
            values="UMI",
            aggfunc="count",
        )
        at_pivot_I.fillna(value=0, inplace=True)
        at_pivot_I[at_pivot_I > 0] = 1

        logger.info("Producing pivot table heatmap...")
        lineage_utils.plot_overlap_heatmap(
            allele_table, at_pivot_I, output_directory
        )

        logger.info("Plotting filtered lineage group pivot table heatmap...")
        lineage_utils.plot_overlap_heatmap_lg(
            allele_table, at_pivot_I, output_directory
        )

    return allele_table
def find_top_lg(
    PIVOT_in: pd.DataFrame,
    iteration: int,
    min_intbc_prop: float = 0.2,
    kinship_thresh: float = 0.2,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Algorithm that creates a lineage group from a cellBC-intBC pivot table.

    First, identifies the top intBC, i.e. the column with the largest sum.
    Then, among the cells that have the top intBC, selects all intBCs that
    are present in a proportion of cells >= min_intbc_prop and defines that
    as the cluster set. Then finds all cells whose values summed over the
    cluster set are >= kinship_thresh and includes them in the cluster.
    Finally outputs the cluster as the lineage group.

    Args:
        PIVOT_in: The input pivot table with one row per cellBC and one
            column per intBC (presumably row-normalised UMI proportions, so
            that a row sum over a set of intBCs is a fraction of the cell's
            UMIs -- confirm against the caller)
        iteration: The zero-based iteration number of the iterative wrapper
            function; the cluster is labelled `iteration + 1`
        min_intbc_prop: In order for an intBC to be included in the cluster
            set, it must be present in at least this proportion of the cells
            that have the top intBC
        kinship_thresh: The minimum summed value over the cluster set
            ("kinship") that a cell needs in order to be included in that
            cluster

    Returns:
        A pivot table of cells labeled with the lineage group assignment in
        a new "lineageGrp" column, and a pivot table of the remaining
        unassigned cells

    Raises:
        IndexError: If PIVOT_in has no columns
    """
    # Calculate sum of observed intBCs, identify top intBC
    intBC_top = PIVOT_in.sum(0).sort_values(ascending=False).index[0]

    # Cells that have the top intBC
    top_cells = PIVOT_in[PIVOT_in[intBC_top] > 0]

    # Binarize without mutating a derived slice (avoids chained-assignment
    # warnings), then count per intBC how many of those cells carry it.
    # Missing values stay missing and are skipped by the sum.
    cells_per_intBC = top_cells.mask(top_cells > 0, 1).sum(0)

    # Define intBC set: intBCs seen in enough of the top-intBC cells
    total = cells_per_intBC[intBC_top]
    intBC_set = cells_per_intBC.index[cells_per_intBC >= min_intbc_prop * total]

    # Reduce PIV to only intBCs considered in set and calculate the summed
    # value within the set ("kinship") for each cell
    in_set = PIVOT_in.columns.isin(intBC_set)
    f_inset = PIVOT_in.iloc[:, in_set].sum(axis=1)

    # Define set of cells with good kinship
    LG_cells = f_inset.index[f_inset >= kinship_thresh]
    is_assigned = PIVOT_in.index.isin(LG_cells)

    # PIV with LG_cells removed, and PIV with LG_cells assigned
    PIV_noLG = PIVOT_in.iloc[~is_assigned, :]
    PIV_LG = PIVOT_in.iloc[is_assigned, :].copy()
    PIV_LG["lineageGrp"] = iteration + 1

    logger.debug(
        f"LG {iteration+1} Assignment: {PIV_LG.shape[0]} cells assigned")
    return PIV_LG, PIV_noLG