Example #1
0
def filter_molecule_table(
    input_df: pd.DataFrame,
    output_directory: str,
    min_umi_per_cell: int = 10,
    min_avg_reads_per_umi: float = 2.0,
    min_reads_per_umi: int = -1,
    intbc_prop_thresh: float = 0.5,
    intbc_umi_thresh: int = 10,
    intbc_dist_thresh: int = 1,
    doublet_threshold: float = 0.35,
    allow_allele_conflicts: bool = False,
    plot: bool = False,
) -> pd.DataFrame:
    """Filters and corrects a molecule table of cellBC-UMI pairs.

    Performs the following steps on the alignments in a DataFrame:
        1. Filters out UMIs with read count < `min_reads_per_umi`. If
            `min_reads_per_umi` is less than 0, a dynamic threshold is
            calculated as `(99th percentile of read counts) // 10` (or 0 when
            the table has no rows).
        2. Filters out cellBCs with unique UMIs < `min_umi_per_cell` and
            average read count per UMI < `min_avg_reads_per_umi`.
        3. Error corrects intBCs by changing intBCs with low UMI counts to
            intBCs with the same allele and a close sequence. This is only
            performed when `intbc_dist_thresh` is greater than 0.
        4. Filters out cellBCs that contain too much conflicting allele
            information as intra-lineage doublets. This is skipped when
            `doublet_threshold` is falsy (None or 0) or when
            `allow_allele_conflicts` is True.
        5. Chooses one allele for each cellBC-intBC pair, by selecting the most
            common. This is not performed when `allow_allele_conflicts` is True.

    Note:
        `input_df` is modified in place: a "status" column set to "good" is
        added and the rows are sorted by descending "readCount".

    Args:
        input_df: A molecule table, i.e. cellBC-UMI pairs. Note that
            each cellBC should only contain one instance of each UMI
        output_directory: The output directory path to store plots
        min_umi_per_cell: The threshold specifying the minimum number of UMIs
            in a cell needed to be retained during filtering
        min_avg_reads_per_umi: The threshold specifying the minimum coverage
            (i.e. average) reads per UMI in a cell needed in order for that
            cell to be retained during filtering
        min_reads_per_umi: The threshold specifying the minimum read count
            needed for a UMI to be retained during filtering. Set dynamically
            if value is < 0.
        intbc_prop_thresh: The threshold specifying the maximum proportion of
            the total UMI counts for a intBC to be corrected to another
        intbc_umi_thresh: The threshold specifying the maximum UMI count for
            an intBC needs to be corrected to another
        intbc_dist_thresh: The threshold specifying the maximum Levenshtein
            Distance between sequences for an intBC to be corrected to another
        doublet_threshold: The threshold specifying the maximum proportion of
            conflicting alleles information allowed to for an intBC to be
            retained in doublet filtering. Set to None to skip doublet filtering
        allow_allele_conflicts: Whether or not to allow multiple alleles to be
            assigned to each cellBC-intBC pair. For fully single-cell data,
            this option should be set to False, since each cell is expected to
            have a single allele state for each intBC. However, this option
            should be set to True for chemistries that may result in multiple
            physical cells being captured for each barcode.
        plot: Indicates whether to plot the change in intBC and cellBC counts
            across filtering stages

    Returns:
        A filtered and corrected allele table of cellBC-UMI-allele groups,
        with "readName" as the first column and a fresh integer index
    """
    # NOTE: both statements below modify the caller's DataFrame in place.
    input_df["status"] = "good"
    input_df.sort_values("readCount", ascending=False, inplace=True)
    rc_profile, upi_profile, upc_profile = {}, {}, {}

    def record_stage(stage: str, df: pd.DataFrame) -> None:
        """Stores the statistics of `df` under `stage` (only when plotting)."""
        if plot:
            (
                rc_profile[stage],
                upi_profile[stage],
                upc_profile[stage],
            ) = utilities.record_stats(df)

    logger.info("Logging initial stats...")
    record_stage("Init", input_df)

    if min_reads_per_umi < 0:
        read_counts = input_df["readCount"]
        # np.percentile cannot be evaluated on an empty column, so fall back
        # to a threshold of 0 in that case.
        if len(read_counts) > 0:
            min_reads_per_umi = np.percentile(read_counts, 99) // 10
        else:
            min_reads_per_umi = 0
    logger.info(f"Filtering UMIs with less than {min_reads_per_umi} reads...")
    filtered_df = utilities.filter_umis(input_df,
                                        min_reads_per_umi=min_reads_per_umi)
    record_stage("Filtered_UMI", filtered_df)

    logger.info(
        f"Filtering out cellBCs with fewer than {min_umi_per_cell} UMIs and "
        f"less than {min_avg_reads_per_umi} average reads per UMI...")
    filtered_df = utilities.filter_cells(
        filtered_df,
        min_umi_per_cell=min_umi_per_cell,
        min_avg_reads_per_umi=min_avg_reads_per_umi,
    )
    record_stage("CellFilter", filtered_df)

    if intbc_dist_thresh > 0:
        logger.info("Error correcting intBCs...")
        filtered_df = utilities.error_correct_intbc(
            filtered_df,
            prop=intbc_prop_thresh,
            umi_count_thresh=intbc_umi_thresh,
            dist_thresh=intbc_dist_thresh,
        )
    record_stage("Process_intBC", filtered_df)

    if doublet_threshold and not allow_allele_conflicts:
        logger.info(
            f"Filtering out intra-lineage group doublets with proportion {doublet_threshold}..."
        )
        filtered_df = doublet_utils.filter_intra_doublets(
            filtered_df, prop=doublet_threshold)

    if not allow_allele_conflicts:
        logger.info("Mapping remaining intBC conflicts...")
        filtered_df = map_utils.map_intbcs(filtered_df)
    record_stage("Final", filtered_df)

    # Count total filtered cellBCs
    cellBC_count = filtered_df["cellBC"].nunique()

    if plot:
        # Listed in the order in which the stages were actually executed so
        # that the legends read chronologically.
        stages = [
            "Init",
            "Filtered_UMI",
            "CellFilter",
            "Process_intBC",
            "Final",
        ]

        # Plot Read Per UMI Histogram
        plt.figure(figsize=(14, 10))
        for stage in stages:
            plt.hist(rc_profile[stage],
                     label=stage,
                     histtype="step",
                     log=True,
                     bins=200)
        plt.legend()
        plt.ylabel("Frequency")
        plt.xlabel("Number of Reads")
        plt.title("Reads Per UMI")
        plt.savefig(os.path.join(output_directory, "reads_per_umi.png"))
        plt.close()

        # Plot UMIs per cellBC rank curve
        plt.figure(figsize=(14, 10))
        for stage in stages:
            plt.plot(upc_profile[stage], label=stage)
        plt.legend()
        plt.ylabel("Number of UMIs")
        plt.xlabel("Rank Order")
        # Base 10 is the default of the log scale; the `basex`/`basey`
        # keywords are not accepted by recent Matplotlib releases.
        plt.xscale("log")
        plt.yscale("log")
        plt.title("UMIs per CellBC")
        plt.savefig(os.path.join(output_directory, "umis_per_cellbc.png"))
        plt.close()

        # Plot UMIs per intBC histogram
        plt.figure(figsize=(14, 10))
        for stage in stages:
            plt.hist(upi_profile[stage],
                     label=stage,
                     histtype="step",
                     log=True,
                     bins=200)
        plt.legend()
        plt.ylabel("Frequency")
        plt.xlabel("Number of UMIs")
        plt.title("UMIs per intBC")
        plt.savefig(os.path.join(output_directory, "umis_per_intbc.png"))
        plt.close()

    logger.info(
        f"Overall, filtered {cellBC_count} cells, with {filtered_df.shape[0]} UMIs."
    )

    # Round-trip through the index: moves "readName" to the first column and
    # leaves the table with a fresh integer index.
    filtered_df.set_index("readName", inplace=True)
    filtered_df.reset_index(inplace=True)

    return filtered_df
Example #2
0
def call_lineage_groups(
    input_df: pd.DataFrame,
    output_directory: str,
    min_umi_per_cell: int = 10,
    min_avg_reads_per_umi: float = 2.0,
    min_cluster_prop: float = 0.005,
    min_intbc_thresh: float = 0.05,
    inter_doublet_threshold: float = 0.35,
    kinship_thresh: float = 0.25,
    plot: bool = False,
) -> pd.DataFrame:
    """Assigns cells to their clonal populations.

    Performs multiple rounds of filtering and assigning to lineage groups:
        1. Iteratively generates putative lineage groups by forming intBC
        groups for each lineage group and then assigning cells based on how
        many intBCs they share with each intBC group (kinship).

        2. Refines these putative groups by removing non-informative intBCs
        and reassigning cells through kinship.

        3. Removes all inter-lineage doublets, defined as cells that have
        relatively equal kinship scores across multiple lineages and whose
        assignments are therefore ambiguous. This step is skipped when
        `inter_doublet_threshold` is falsy (None or 0).

        4. Finally, performs one more round of filtering non-informative intBCs
        and cellBCs with low UMI counts before returning a final table of
        lineage assignments, allele information, and read and umi counts for
        each sample.

    Args:
        input_df: The allele table of cellBC-UMI-allele groups to be annotated
            with lineage assignments
        output_directory: The folder handed to the plotting helpers; only used
            when `plot` is True
        min_umi_per_cell: The threshold specifying the minimum number of UMIs a
            cell needs in order to not be filtered during filtering
        min_avg_reads_per_umi: The threshold specifying the minimum coverage
            (i.e. average) reads per UMI in a cell needed in order for that
            cell not to be filtered during filtering
        min_cluster_prop: The minimum cluster size in the putative lineage
            assignment step, as a proportion of the number of cells
        min_intbc_thresh: The threshold specifying the minimum proportion of
            cells in a lineage group that need to have an intBC in order for it
            be retained during filtering. Also specifies the minimum proportion
            of cells that share an intBC with the most frequent intBC in
            forming putative lineage groups
        inter_doublet_threshold: The threshold specifying the minimum proportion
            of kinship a cell shares with its assigned lineage group out of all
            lineage groups for it to be retained during doublet filtering
        kinship_thresh: The threshold specifying the minimum proportion of
            intBCs shared between a cell and the intBC set of a lineage group
            needed to assign that cell to that lineage group in putative
            assignment
        plot: Indicates whether to generate plots

    Returns:
        The allele table annotated with lineage group assignments; the
        "lineageGrp" column is cast to int.
    """
    logger.info(
        f"{input_df.shape[0]} UMIs (rows), with {input_df.shape[1]} attributes (columns)"
    )
    n_cells = len(input_df["cellBC"].unique())
    logger.info(f"{n_cells} Cells")

    # Create a pivot_table of UMI counts per (cellBC, intBC), then normalize
    # each row so that it holds the proportion of the cell's UMIs per intBC.
    piv = pd.pivot_table(input_df,
                         index="cellBC",
                         columns="intBC",
                         values="UMI",
                         aggfunc="count")
    piv = piv.div(piv.sum(axis=1), axis=0)

    # Reorder piv columns by binarized intBC frequency, i.e. by the number of
    # cells in which each intBC is observed (most frequent first).
    pivbin = piv.copy()
    pivbin[pivbin > 0] = 1
    intBC_sums = pivbin.sum(0)
    ordered_intBCs = intBC_sums.sort_values(ascending=False).index.tolist()
    piv = piv[ordered_intBCs]
    min_clust_size = int(min_cluster_prop * piv.shape[0])

    logger.info("Assigning initial lineage groups...")
    logger.info(f"Clustering with minimum cluster size {min_clust_size}...")
    piv_assigned = lineage_utils.assign_lineage_groups(
        piv,
        min_clust_size,
        min_intbc_thresh=min_intbc_thresh,
        kinship_thresh=kinship_thresh,
    )

    logger.info("Refining lineage groups...")
    logger.info(
        "Redefining lineage groups by removing low proportion intBCs...")
    master_LGs, master_intBCs = lineage_utils.filter_intbcs_lg_sets(
        piv_assigned, min_intbc_thresh=min_intbc_thresh)

    logger.info("Reassigning cells to refined lineage groups by kinship...")
    kinship_scores = lineage_utils.score_lineage_kinships(
        piv_assigned, master_LGs, master_intBCs)

    logger.info("Annotating alignment table with refined lineage groups...")
    allele_table = lineage_utils.annotate_lineage_groups(
        input_df, kinship_scores, master_intBCs)
    if inter_doublet_threshold:
        logger.info(
            f"Filtering out inter-lineage group doublets with proportion {inter_doublet_threshold}..."
        )
        allele_table = doublet_utils.filter_inter_doublets(
            allele_table, rule=inter_doublet_threshold)

    logger.info(
        "Filtering out low proportion intBCs in finalized lineage groups...")
    filtered_lgs = lineage_utils.filter_intbcs_final_lineages(
        allele_table, min_intbc_thresh=min_intbc_thresh)

    allele_table = lineage_utils.filtered_lineage_group_to_allele_table(
        filtered_lgs)

    logger.debug("Final lineage group assignments:")
    # Group by the scalar column name: grouping by a one-element list yields
    # 1-tuples as keys on recent pandas, which would be logged as "LG (3,)".
    for lineage_group, members in allele_table.groupby("lineageGrp"):
        n_members = len(members["cellBC"].unique())
        logger.debug(f"LG {lineage_group}: {n_members} cells")

    logger.info("Filtering out low UMI cell barcodes...")
    allele_table = utilities.filter_cells(
        allele_table,
        min_umi_per_cell=int(min_umi_per_cell),
        min_avg_reads_per_umi=min_avg_reads_per_umi,
    )
    allele_table["lineageGrp"] = allele_table["lineageGrp"].astype(int)

    if plot:
        logger.info("Producing Plots...")
        # Binary cellBC x intBC presence/absence matrix for the heatmaps.
        at_pivot_I = pd.pivot_table(
            allele_table,
            index="cellBC",
            columns="intBC",
            values="UMI",
            aggfunc="count",
        )
        at_pivot_I.fillna(value=0, inplace=True)
        at_pivot_I[at_pivot_I > 0] = 1

        logger.info("Producing pivot table heatmap...")
        lineage_utils.plot_overlap_heatmap(allele_table, at_pivot_I,
                                           output_directory)

        logger.info("Plotting filtered lineage group pivot table heatmap...")
        lineage_utils.plot_overlap_heatmap_lg(allele_table, at_pivot_I,
                                              output_directory)

    return allele_table
Example #3
0
def resolve_umi_sequence(
    molecule_table: pd.DataFrame,
    output_directory: str,
    min_umi_per_cell: int = 10,
    min_avg_reads_per_umi: float = 2.0,
    plot: bool = True,
) -> pd.DataFrame:
    """Resolve a consensus sequence for each UMI.

    This procedure will perform UMI and cellBC filtering on the basis of reads
    per UMI and UMIs per cell and then assign the most abundant sequence to
    each UMI if there is a set of conflicting sequences per UMI.

    Note:
        A boolean "filter" column is added to `molecule_table` in place,
        marking the rows that were discarded. The returned table does not
        contain this column.

    Args:
        molecule_table: molecule table to resolve
        output_directory: Directory to store results (diagnostic plots; only
            used when `plot` is True)
        min_umi_per_cell: The threshold specifying the minimum number of UMIs
            in a cell needed to be retained during filtering
        min_avg_reads_per_umi: The threshold specifying the minimum coverage
            (i.e. average) reads per UMI in a cell needed for that cell to be
            retained during filtering
        plot: Indicates whether to save diagnostic plots to `output_directory`

    Returns:
        A molecule table with unique mappings between cellBC-UMI pairs.
    """
    if plot:
        # -------------------- Plot # of sequences per UMI -------------------- #
        seqs_per_umi = molecule_table.groupby(["cellBC",
                                               "UMI"])["grpFlag"].count()
        max_seqs = int(seqs_per_umi.max())

        plt.figure(figsize=(8, 5))
        # Edges 1, 2, ..., max + 1 give every integer count its own bin,
        # including the largest one.
        plt.hist(seqs_per_umi, bins=range(1, max_seqs + 2))
        plt.title("Unique Seqs per cellBC+UMI")
        # Base 10 is the default of the log scale; the `basey` keyword is not
        # accepted by recent Matplotlib releases.
        plt.yscale("log")
        plt.xlabel("Number of Unique Seqs")
        plt.ylabel("Count (Log)")
        plt.savefig(os.path.join(output_directory, "seqs_per_equivClass.png"))
        plt.close()

    # ----------------- Select most abundant sequence ------------------ #

    # readName -> True if the row must be dropped, False if it is kept
    mt_filter = {}
    # Per kept readName: total reads of its cellBC-UMI pair and reads of the
    # picked sequence (all cellBC-UMI pairs).
    total_numReads = {}
    top_reads = {}
    # Per kept readName: reads of the runner-up and of the picked sequence
    # (only cellBC-UMI pairs with more than one sequence).
    second_reads = {}
    first_reads = {}

    unique_pairs = molecule_table.groupby(["cellBC", "UMI"], sort=False)

    for _, group in tqdm(
            unique_pairs,
            total=len(unique_pairs.size()),
            desc="Resolving UMI sequences",
    ):

        # base case - only one sequence
        if group.shape[0] == 1:
            good_readName = group["readName"].iloc[0]
            # Store the scalar read count (not the one-row Series) so that
            # the values can be plotted alongside the multi-sequence case.
            read_count = group["readCount"].iloc[0]
            mt_filter[good_readName] = False
            total_numReads[good_readName] = read_count
            top_reads[good_readName] = read_count
            continue

        # more commonly - many sequences for a given UMI
        group_sort = group.sort_values("readCount",
                                       ascending=False,
                                       ignore_index=True)
        read_names = group_sort["readName"]
        read_counts = group_sort["readCount"]
        good_readName = read_names.iloc[0]

        # keep the first entry (highest readCount)
        mt_filter[good_readName] = False

        total_numReads[good_readName] = read_counts.sum()
        top_reads[good_readName] = read_counts.iloc[0]
        second_reads[good_readName] = read_counts.iloc[1]
        first_reads[good_readName] = read_counts.iloc[0]

        # mark remaining UMIs for filtering
        for bad_readName in read_names.iloc[1:]:
            mt_filter[bad_readName] = True

    # apply the filter using the hash table created above
    molecule_table["filter"] = molecule_table["readName"].map(mt_filter)
    n_filtered = molecule_table[molecule_table["filter"]].shape[0]

    logger.info(f"Filtered out {n_filtered} reads.")

    # filter based on status & reindex
    filt_molecule_table = molecule_table[~molecule_table["filter"]].copy()
    filt_molecule_table.drop(columns=["filter"], inplace=True)

    if plot:
        # ---------------- Plot Diagnostics after Resolving ---------------- #
        plt.figure(figsize=(14, 10))
        plt.plot(list(top_reads.values()), list(total_numReads.values()), "r.")
        plt.ylabel("Total Reads")
        plt.xlabel("Number Reads for Picked Sequence")
        plt.title("Total vs. Top Reads for Picked Sequence")
        plt.savefig(
            os.path.join(output_directory, "total_vs_top_reads_pickSeq.png"))
        plt.close()

        plt.figure(figsize=(14, 10))
        plt.plot(list(first_reads.values()), list(second_reads.values()), "r.")
        plt.ylabel("Number Reads for Second Best Sequence")
        plt.xlabel("Number Reads for Picked Sequence")
        plt.title("Second Best vs. Top Reads for Picked Sequence")
        plt.savefig(
            os.path.join(output_directory, "second_vs_top_reads_pickSeq.png"))
        plt.close()

    filt_molecule_table = utilities.filter_cells(
        filt_molecule_table,
        min_umi_per_cell=min_umi_per_cell,
        min_avg_reads_per_umi=min_avg_reads_per_umi,
    )
    return filt_molecule_table