Example #1
def load(kmers_fpath: str) -> pd.DataFrame:
    """Load in a previously counted k-mer frequencies table.

    Parameters
    ----------
    kmers_fpath : str
        Path to kmer frequency table

    Returns
    -------
    pd.DataFrame
        index='contig', cols=[kmer, kmer, ...]

    Raises
    -------
    FileNotFoundError
        `kmers_fpath` does not exist or is empty
    TableFormatError
        `kmers_fpath` file format is invalid

    """
    if not os.path.exists(kmers_fpath) or not os.path.getsize(kmers_fpath):
        raise FileNotFoundError(kmers_fpath)
    try:
        df = pd.read_csv(kmers_fpath, sep="\t", index_col="contig")
    except (ValueError, TypeError):
        raise TableFormatError(f"contig column not found in {kmers_fpath}!")
    return df
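# Usage sketch (illustration only, not part of the original module): write a tiny
# k-mer table to a temporary tab-separated file and read it back with `load`.
# The path and k-mer column names below are hypothetical.
import tempfile

_demo_path = os.path.join(tempfile.gettempdir(), "demo_kmers.tsv")
pd.DataFrame(
    {"contig": ["contig_1", "contig_2"], "AAAA": [3, 5], "AAAC": [1, 0]}
).to_csv(_demo_path, sep="\t", index=False)
kmers_df = load(_demo_path)  # DataFrame indexed by 'contig' with one column per k-mer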
Example #2
def parse(bed: str, out: str = None, force: bool = False) -> pd.DataFrame:
    """Calculate coverages from bed file.

    Parameters
    ----------
    bed : str
        </path/to/file.bed>

    out : str
        if provided will write to `out`. I.e. </path/to/coverage.tsv>

    force : bool
        force overwrite of `out` if it already exists (default is False).

    Returns
    -------
    pd.DataFrame
        index='contig', col='coverage'

    Raises
    -------
    TableFormatError
        `out` is incorrectly formatted to be read as a pandas DataFrame.
    FileNotFoundError
        `bed` does not exist

    """
    if out and os.path.exists(out) and os.path.getsize(out):
        try:
            cols = ["contig", "coverage"]
            return pd.read_csv(out, sep="\t", usecols=cols, index_col="contig")
        except ValueError:
            raise TableFormatError(out)
    if not os.path.exists(bed):
        raise FileNotFoundError(bed)
    names = ["contig", "depth", "bases", "length", "depth_fraction"]
    df = pd.read_csv(bed, sep="\t", names=names, index_col="contig")
    criterion1 = df.depth != 0
    criterion2 = df.index != "genome"
    df = df[criterion1 & criterion2]
    df = df.assign(depth_product=lambda x: x.depth * x.bases)
    dff = df.groupby("contig")[["depth_product", "bases"]].sum()
    dff = dff.assign(coverage=lambda x: x.depth_product / x.bases)
    if out and (not os.path.exists(out) or (os.path.exists(out) and force)):
        dff.to_csv(out, sep="\t", index=True, header=True)
        logger.debug(f"{out} written")
    msg = (
        f"{os.path.basename(out)} shape: {dff.shape}" if out else f"shape: {dff.shape}"
    )
    logger.debug(msg)
    return dff[["coverage"]]
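# Worked sketch of the coverage arithmetic above (illustration only): for a
# bedtools-genomecov-style histogram, per-contig coverage is the depth-weighted
# mean, sum(depth * bases) / sum(bases). The values below are made up.
_hist = pd.DataFrame(
    {
        "contig": ["contig_1", "contig_1", "contig_1"],
        "depth": [0, 5, 10],
        "bases": [100, 300, 100],
        "length": [500, 500, 500],
        "depth_fraction": [0.2, 0.6, 0.2],
    }
).set_index("contig")
_hist = _hist[_hist.depth != 0]  # zero-depth rows do not contribute
_coverage = (_hist.depth * _hist.bases).sum() / _hist.bases.sum()
# (5 * 300 + 10 * 100) / (300 + 100) = 2500 / 400 = 6.25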
Example #3
def main():
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )

    parser = argparse.ArgumentParser(
        description=
        "Autometa Large-data-mode binning by contig set selection using max-partition-size",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers",
        help="Path to k-mer counts table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--coverages",
        help="Path to metagenome coverages table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--gc-content",
        help="Path to metagenome GC contents table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help="Path to Autometa annotated markers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--taxonomy",
        metavar="filepath",
        help="Path to Autometa assigned taxonomies table",
        required=True,
    )
    parser.add_argument(
        "--output-binning",
        help="Path to write Autometa binning results",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after binning",
        metavar="filepath",
    )
    parser.add_argument(
        "--clustering-method",
        help="Clustering algorithm to use for recursive binning.",
        choices=["dbscan", "hdbscan"],
        default="dbscan",
    )
    parser.add_argument(
        "--completeness",
        help="completeness cutoff to retain cluster."
        " e.g. cluster completeness >= `completeness`",
        default=20.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--purity",
        help="purity cutoff to retain cluster. e.g. cluster purity >= `purity`",
        default=95.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--cov-stddev-limit",
        help="coverage standard deviation limit to retain cluster"
        " e.g. cluster coverage standard deviation <= `cov-stddev-limit`",
        default=25.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--gc-stddev-limit",
        help="GC content standard deviation limit to retain cluster"
        " e.g. cluster GC content standard deviation <= `gc-content-stddev-limit`",
        default=5.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--norm-method",
        help="kmer normalization method to use on kmer counts",
        default="am_clr",
        choices=[
            "am_clr",
            "ilr",
            "clr",
        ],
    )
    parser.add_argument(
        "--pca-dims",
        help=
        "PCA dimensions to reduce normalized kmer frequencies prior to embedding",
        default=50,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--embed-method",
        help="kmer embedding method to use on normalized kmer frequencies",
        default="bhsne",
        choices=[
            "bhsne",
            "umap",
            "sksne",
            "trimap",
        ],
    )
    parser.add_argument(
        "--embed-dims",
        help="Embedding dimensions to reduce normalized kmers table after PCA.",
        default=2,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--max-partition-size",
        help=
        "Maximum number of contigs to consider for a recursive binning batch.",
        default=10000,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--starting-rank",
        help="Canonical rank at which to begin subsetting taxonomy",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--reverse-ranks",
        action="store_true",
        default=False,
        help="Reverse order at which to split taxonomy by canonical-rank."
        " When `--reverse-ranks` is given, contigs will be split in order of"
        " species, genus, family, order, class, phylum, superkingdom.",
    )
    parser.add_argument(
        "--cache",
        help="Directory to store itermediate checkpoint files during binning"
        " (If this is provided and the job fails, the script will attempt to"
        " begin from the checkpoints in this cache directory).",
        metavar="dirpath",
    )
    parser.add_argument(
        "--binning-checkpoints",
        help="File path to store itermediate contig binning results"
        " (The `--cache` argument is required for this feature). If  "
        "`--cache` is provided without this argument, a binning checkpoints file will be created.",
        metavar="filepath",
    )
    parser.add_argument(
        "--rank-filter",
        help=
        "Taxonomy column canonical rank to subset by provided value of `--rank-name-filter`",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--rank-name-filter",
        help=
        "Only retrieve contigs with this name corresponding to `--rank-filter` column",
        default="bacteria",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="log debug information",
    )
    parser.add_argument(
        "--cpus",
        default=-1,
        metavar="int",
        type=int,
        help=
        "Number of cores for the clustering method to use (the default, -1, will try to use all available cores)",
    )
    args = parser.parse_args()

    counts_df = pd.read_csv(args.kmers, sep="\t", index_col="contig")
    # First check if we are performing binning with taxonomic partitioning
    if args.taxonomy:
        main_df = read_annotations(
            [args.coverages, args.gc_content, args.taxonomy])
        main_df = filter_taxonomy(df=main_df,
                                  rank=args.rank_filter,
                                  name=args.rank_name_filter)
    else:
        main_df = read_annotations([args.coverages, args.gc_content])
        embed_df = get_kmer_embedding(
            counts=counts_df,
            norm_method=args.norm_method,
            pca_dimensions=args.pca_dims,
            embed_dimensions=args.embed_dims,
            embed_method=args.embed_method,
            cache_fpath=None,
        )
        main_df = pd.merge(main_df,
                           embed_df,
                           how="left",
                           left_index=True,
                           right_index=True)

    # Prepare our markers dataframe
    markers_df = load_markers(args.markers, format="wide")

    # Ensure we have marker-containing contigs available to check binning quality...
    if main_df.loc[main_df.index.isin(markers_df.index)].empty:
        raise TableFormatError(
            "No markers for contigs in table. Unable to assess binning quality"
        )
    if main_df.shape[0] <= 1:
        raise BinningError("Not enough contigs in table for binning")

    contigs_containing_markers_count = main_df.index.isin(
        markers_df.index).sum()
    contigs_containing_markers_percent = (contigs_containing_markers_count /
                                          main_df.shape[0] * 100)
    logger.info(
        f"{contigs_containing_markers_count:,} sequences contain markers ({contigs_containing_markers_percent:.2f}% of total in binning features table)"
    )
    logger.info(f"Selected clustering method: {args.clustering_method}")

    main_out = cluster_by_taxon_partitioning(
        main=main_df,
        counts=counts_df,
        markers=markers_df,
        norm_method=args.norm_method,
        pca_dimensions=args.pca_dims,
        embed_dimensions=args.embed_dims,
        embed_method=args.embed_method,
        max_partition_size=args.max_partition_size,
        completeness=args.completeness,
        purity=args.purity,
        coverage_stddev=args.cov_stddev_limit,
        gc_content_stddev=args.gc_stddev_limit,
        starting_rank=args.starting_rank,
        method=args.clustering_method,
        reverse_ranks=args.reverse_ranks,
        cache=args.cache,
        binning_checkpoints_fpath=args.binning_checkpoints,
        n_jobs=args.cpus,
        verbose=args.verbose,
    )

    write_results(
        results=main_out,
        binning_output=args.output_binning,
        full_output=args.output_main,
    )
Example #4
def main():
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Perform marker gene guided binning of "
        "metagenome contigs using annotations (when available) of sequence "
        "composition, coverage and homology.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers",
        help="Path to embedded k-mers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--coverages",
        help="Path to metagenome coverages table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--gc-content",
        help="Path to metagenome GC contents table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help="Path to Autometa annotated markers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-binning",
        help="Path to write Autometa binning results",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after binning",
        metavar="filepath",
    )
    parser.add_argument(
        "--clustering-method",
        help="Clustering algorithm to use for recursive binning.",
        choices=["dbscan", "hdbscan"],
        default="dbscan",
    )
    parser.add_argument(
        "--completeness",
        help="completeness cutoff to retain cluster."
        " e.g. cluster completeness >= `completeness`",
        default=20.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--purity",
        help="purity cutoff to retain cluster. e.g. cluster purity >= `purity`",
        default=95.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--cov-stddev-limit",
        help="coverage standard deviation limit to retain cluster"
        " e.g. cluster coverage standard deviation <= `cov-stddev-limit`",
        default=25.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--gc-stddev-limit",
        help="GC content standard deviation limit to retain cluster"
        " e.g. cluster GC content standard deviation <= `gc-content-stddev-limit`",
        default=5.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--taxonomy",
        metavar="filepath",
        help="Path to Autometa assigned taxonomies table",
    )
    parser.add_argument(
        "--starting-rank",
        help="Canonical rank at which to begin subsetting taxonomy",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--reverse-ranks",
        action="store_true",
        default=False,
        help="Reverse order at which to split taxonomy by canonical-rank."
        " When `--reverse-ranks` is given, contigs will be split in order of"
        " species, genus, family, order, class, phylum, superkingdom.",
    )
    parser.add_argument(
        "--rank-filter",
        help=
        "Taxonomy column canonical rank to subset by provided value of `--rank-name-filter`",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--rank-name-filter",
        help=
        "Only retrieve contigs with this name corresponding to `--rank-filter` column",
        default="bacteria",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="log debug information",
    )
    parser.add_argument(
        "--cpus",
        default=-1,
        metavar="int",
        type=int,
        help=
        "Number of cores for the clustering method to use (the default, -1, will try to use all available cores)",
    )
    args = parser.parse_args()

    # First check if we are performing binning with taxonomic partitioning
    if args.taxonomy:
        main_df = read_annotations(
            [args.kmers, args.coverages, args.gc_content, args.taxonomy])
        main_df = filter_taxonomy(df=main_df,
                                  rank=args.rank_filter,
                                  name=args.rank_name_filter)
    else:
        main_df = read_annotations(
            [args.kmers, args.coverages, args.gc_content])

    # Prepare our markers dataframe
    markers_df = load_markers(args.markers, format="wide")

    # Ensure we have marker-containing contigs available to check binning quality...
    try:
        if main_df.loc[main_df.index.isin(markers_df.index)].empty:
            raise TableFormatError(
                "No markers for contigs in table. Unable to assess binning quality"
            )
        if main_df.shape[0] <= 1:
            raise BinningError("Not enough contigs in table for binning")
    except (TableFormatError, BinningError) as err:
        logger.warning(err)
        # Using an http error status code...
        # From: https://kinsta.com/blog/http-status-codes/#200-status-codes
        # 204: “No Content.”
        # This code means that the server has successfully processed the request
        # but is not going to return any content.
        sys.exit(204)
    logger.info(f"Selected clustering method: {args.clustering_method}")

    if args.taxonomy:
        main_out = taxon_guided_binning(
            main=main_df,
            markers=markers_df,
            completeness=args.completeness,
            purity=args.purity,
            coverage_stddev=args.cov_stddev_limit,
            gc_content_stddev=args.gc_stddev_limit,
            method=args.clustering_method,
            starting_rank=args.starting_rank,
            reverse_ranks=args.reverse_ranks,
            n_jobs=args.cpus,
            verbose=args.verbose,
        )
    else:
        # Perform clustering w/o taxonomy
        main_out = get_clusters(
            main=main_df,
            markers_df=markers_df,
            completeness=args.completeness,
            purity=args.purity,
            coverage_stddev=args.cov_stddev_limit,
            gc_content_stddev=args.gc_stddev_limit,
            method=args.clustering_method,
            n_jobs=args.cpus,
            verbose=args.verbose,
        )

    write_results(
        results=main_out,
        binning_output=args.output_binning,
        full_output=args.output_main,
    )
Example #5
def taxon_guided_binning(
    main: pd.DataFrame,
    markers: pd.DataFrame,
    completeness: float = 20.0,
    purity: float = 95.0,
    coverage_stddev: float = 25.0,
    gc_content_stddev: float = 5.0,
    starting_rank: str = "superkingdom",
    method: str = "dbscan",
    reverse_ranks: bool = False,
    n_jobs: int = -1,
    verbose: bool = False,
) -> pd.DataFrame:
    """Perform clustering of contigs by provided `method` and use metrics to
    filter clusters that should be retained via `completeness` and `purity`
    thresholds.

    Parameters
    ----------
    main : pd.DataFrame
        index=contig,
        cols=['x','y', 'coverage', 'gc_content']
        taxonomy columns must also be present,
        i.e. [taxid,superkingdom,phylum,class,order,family,genus,species]

    markers : pd.DataFrame
        wide format, i.e. index=contig cols=[marker,marker,...]

    completeness : float, optional
        completeness threshold to retain cluster (the default is 20.0).
        e.g. cluster completeness >= completeness

    purity : float, optional
        purity threshold to retain cluster (the default is 95.0).
        e.g. cluster purity >= purity_cutoff

    coverage_stddev : float, optional
        cluster coverage standard deviation threshold to retain cluster (the default is 25.0).
        e.g. cluster coverage standard deviation <= coverage_stddev

    gc_content_stddev : float, optional
        cluster GC content standard deviation threshold to retain cluster (the default is 5.0).
        e.g. cluster GC content standard deviation <= gc_content_stddev

    starting_rank : str, optional
        Starting canonical rank at which to begin subsetting taxonomy (the default is superkingdom).
        Choices are superkingdom, phylum, class, order, family, genus, species.

    method : str, optional
        Clustering `method` (the default is 'dbscan').
        choices = ['dbscan','hdbscan']

    reverse_ranks : bool, optional
        False - [superkingdom,phylum,class,order,family,genus,species] (Default)
        True - [species,genus,family,order,class,phylum,superkingdom]

    n_jobs : int, optional
        Number of parallel jobs to pass to the clustering backend (the default is -1, i.e. use all available CPUs).

    verbose : bool, optional
        log stats for each recursive_dbscan clustering iteration

    Returns
    -------
    pd.DataFrame
        main with ['cluster','completeness','purity'] columns added

    Raises
    -------
    TableFormatError
        No marker information is available for contigs to be binned.
    BinningError
        Not enough contigs are in the table for binning, or no clusters were recovered.
    """
    # First check needs to ensure we have markers available to check binning quality...
    if main.loc[main.index.isin(markers.index)].empty:
        raise TableFormatError(
            "No markers for contigs in table. Unable to assess binning quality"
        )
    if main.shape[0] <= 1:
        raise BinningError("Not enough contigs in table for binning")

    logger.info(f"Using {method} clustering method")
    if reverse_ranks:
        # species, genus, family, order, class, phylum, superkingdom
        ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
    else:
        # superkingdom, phylum, class, order, family, genus, species
        ranks = [
            rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"
        ]
    starting_rank_index = ranks.index(starting_rank)
    ranks = ranks[starting_rank_index:]
    logger.debug(f"Using ranks: {', '.join(ranks)}")
    clustered_contigs = set()
    num_clusters = 0
    clusters = []
    for rank in ranks:
        # TODO: We should account for novel taxa here instead of removing 'unclassified'
        unclassified_filter = main[rank] != "unclassified"
        main_grouped_by_rank = main.loc[unclassified_filter].groupby(rank)
        taxa_counts = main_grouped_by_rank[rank].count()
        n_contigs_in_taxa = taxa_counts.sum()
        n_taxa = taxa_counts.index.nunique()
        logger.info(
            f"Examining {rank}: {n_taxa:,} unique taxa ({n_contigs_in_taxa:,} contigs)"
        )
        # Group contigs by rank and find best clusters within subset
        for rank_name_txt, dff in main_grouped_by_rank:
            if dff.empty:
                continue
            # Only cluster contigs that have not already been assigned a bin.
            # First filter with 'cluster' column
            rank_df = (dff.loc[dff["cluster"].isna()]
                       if "cluster" in dff.columns else dff)
            # Second filter with previous clustering rounds' clustered contigs
            if clustered_contigs:
                rank_df = rank_df.loc[~rank_df.index.isin(clustered_contigs)]
            # After all of the filters, are there multiple contigs to cluster?
            if rank_df.empty:
                continue
            # Find best clusters
            logger.debug(
                f"Examining taxonomy: {rank} : {rank_name_txt} : {rank_df.shape}"
            )
            clusters_df = get_clusters(
                main=rank_df,
                markers_df=markers,
                completeness=completeness,
                purity=purity,
                coverage_stddev=coverage_stddev,
                gc_content_stddev=gc_content_stddev,
                method=method,
                n_jobs=n_jobs,
                verbose=verbose,
            )
            # Store clustered contigs
            is_clustered = clusters_df["cluster"].notnull()
            clustered = clusters_df.loc[is_clustered]
            if clustered.empty:
                continue
            clustered_contigs.update({contig for contig in clustered.index})
            translation = {
                c: f"bin_{1+i+num_clusters:04d}"
                for i, c in enumerate(clustered.cluster.unique())
            }
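            # e.g. with num_clusters == 0 and cluster labels [0, 1] found in this
            # round, `translation` is {0: "bin_0001", 1: "bin_0002"}; the
            # num_clusters offset keeps bin names unique across taxa and ranks.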

            def rename_cluster(c):
                return translation[c]

            clustered.cluster = clustered.cluster.map(rename_cluster)
            num_clusters += clustered.cluster.nunique()
            clusters.append(clustered)

    if not clusters:
        raise BinningError("Failed to recover any clusters from dataset")
    # At this point we've gone through all ranks and have clusters for each canonical rank
    # We place these into one table and then...
    clustered_df = pd.concat(clusters, sort=True)
    # create a dataframe for any contigs *not* in the clustered dataframe
    unclustered_df = main.loc[~main.index.isin(clustered_df.index)].copy()
    unclustered_df["cluster"] = pd.NA
    return pd.concat([clustered_df, unclustered_df], sort=True)
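# Sketch of the rank-ordering logic above (illustration only): assuming
# NCBI.CANONICAL_RANKS lists ranks from species up through superkingdom plus
# "root", the default (non-reversed) path walks superkingdom -> species and
# `starting_rank` trims the ranks preceding it.
_canonical_ranks = [
    "species", "genus", "family", "order", "class", "phylum", "superkingdom", "root",
]
_ranks = [rank for rank in reversed(_canonical_ranks) if rank != "root"]
# ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
_ranks = _ranks[_ranks.index("class"):]
# ['class', 'order', 'family', 'genus', 'species']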
Example #6
def run_dbscan(
    df: pd.DataFrame,
    eps: float,
    n_jobs: int = -1,
    dropcols: List[str] = [
        "cluster",
        "purity",
        "completeness",
        "coverage_stddev",
        "gc_content_stddev",
    ],
) -> pd.DataFrame:
    """Run clustering on `df` at provided `eps`.

    Notes
    -----

        * documentation for sklearn `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_
        * documentation for `HDBSCAN <https://hdbscan.readthedocs.io/en/latest/index.html>`_

    Parameters
    ----------
    df : pd.DataFrame
        Contigs with embedded k-mer frequencies as ['x_1','x_2',..., 'x_ndims'] columns and 'coverage' column

    eps : float
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function. See `DBSCAN docs <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_ for more details.

    dropcols : list, optional
        Drop columns in list from `df`
        (the default is ['cluster','purity','completeness','coverage_stddev','gc_content_stddev']).

    Returns
    -------
    pd.DataFrame
        `df` with 'cluster' column added

    Raises
    -------
    TableFormatError
        Dataframe is missing kmer/coverage annotations

    """
    # Ignore any errors raised from trying to drop columns that do not exist in our df.
    df.drop(columns=dropcols, inplace=True, errors="ignore")
    n_samples = df.shape[0]
    if n_samples == 1:
        clusters = pd.Series([pd.NA], index=df.index, name="cluster")
        return pd.merge(df,
                        clusters,
                        how="left",
                        left_index=True,
                        right_index=True)
    if np.any(df.isnull()):
        raise TableFormatError(
            f"df is missing {df.isnull().sum().sum()} kmer/coverage annotations"
        )
    # NOTE: all of our kmer embedded columns should correspond from "x_1" to "x_{embedding_dimensions}"
    cols = [col for col in df.columns if "x_" in col or col == "coverage"]
    # Subset what will go into clusterer to only kmer and coverage information
    X = df.loc[:, cols].to_numpy()
    # Perform clustering
    clusterer = DBSCAN(eps=eps, min_samples=1, n_jobs=n_jobs).fit(X)
    clusters = pd.Series(clusterer.labels_, index=df.index, name="cluster")
    return pd.merge(df,
                    clusters,
                    how="left",
                    left_index=True,
                    right_index=True)
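# Usage sketch (illustration only): cluster a tiny synthetic table following the
# embedded-kmer column convention ("x_1", "x_2") plus "coverage". Values are made up.
_demo = pd.DataFrame(
    {
        "x_1": [0.0, 0.1, 5.0, 5.1],
        "x_2": [0.0, 0.1, 5.0, 5.1],
        "coverage": [10.0, 10.2, 30.0, 30.1],
    },
    index=["contig_1", "contig_2", "contig_3", "contig_4"],
)
_clustered = run_dbscan(_demo, eps=0.5)
# _clustered gains a 'cluster' column of DBSCAN labels; with eps=0.5 the two
# tight pairs above should land in two separate clusters.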
Example #7
def run_hdbscan(
    df: pd.DataFrame,
    min_cluster_size: int,
    min_samples: int,
    cache_dir: str = None,
    core_dist_n_jobs: int = -1,
) -> pd.DataFrame:
    """Run clustering on `df` at provided `min_cluster_size`.

    Notes
    -----

        * Reasoning for parameter: `cluster_selection_method <https://hdbscan.readthedocs.io/en/latest/parameter_selection.html#leaf-clustering>`_
        * Reasoning for parameters: `min_cluster_size and min_samples <https://hdbscan.readthedocs.io/en/latest/parameter_selection.html>`_
        * Documentation for `HDBSCAN <https://hdbscan.readthedocs.io/en/latest/index.html>`_

    Parameters
    ----------
    df : pd.DataFrame
        Contigs with embedded k-mer frequencies as ['x_1', 'x_2', ...] columns and optionally a 'coverage' column

    min_cluster_size : int
        The minimum size of clusters; single linkage splits that contain
        fewer points than this will be considered points "falling out" of a
        cluster rather than a cluster splitting into two new clusters.

    min_samples : int
        The number of samples in a neighborhood for a point to be
        considered a core point.

    cache_dir : str, optional
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    core_dist_n_jobs: int
        Number of parallel jobs to run in core distance computations.
        For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    Returns
    -------
    pd.DataFrame
        `df` with 'cluster' column added

    Raises
    -------
    TableFormatError
        `df` is missing k-mer or coverage annotations.

    """
    # Ignore any errors raised from trying to drop previous 'cluster' in our df.
    df = df.drop(columns="cluster", errors="ignore")
    n_samples = df.shape[0]
    if n_samples == 1:
        clusters = pd.Series([pd.NA], index=df.index, name="cluster")
        return pd.merge(df,
                        clusters,
                        how="left",
                        left_index=True,
                        right_index=True)

    # NOTE: all of our kmer embedded columns should correspond from "x_1" to "x_{embedding_dimensions}"
    features_cols = [
        col for col in df.columns if "x_" in col or col == "coverage"
    ]
    # Subset what will go into clusterer to only features (kmer and coverage information)
    features_df = df[features_cols]
    if np.any(features_df.isnull()):
        raise TableFormatError(
            f"df is missing {features_df.isnull().sum().sum()} kmer/coverage annotations"
        )
    # Fit and predict clusters
    clusters = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_method="leaf",
        allow_single_cluster=True,
        memory=cache_dir,
        core_dist_n_jobs=core_dist_n_jobs,
    ).fit_predict(features_df.to_numpy())
    clusters = pd.Series(clusters, index=df.index, name="cluster")
    # NOTE: HDBSCAN labels outliers with -1
    outlier_label = -1
    clusters = clusters.loc[clusters.ne(outlier_label)]
    return pd.merge(df,
                    clusters,
                    how="left",
                    left_index=True,
                    right_index=True)
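# Usage sketch (illustration only): HDBSCAN labels outliers -1; those rows are
# dropped from `clusters`, so they reappear as NaN in the merged 'cluster' column.
# A temporary directory is supplied for HDBSCAN's `memory` caching. Values are made up.
import tempfile

_demo = pd.DataFrame(
    {
        "x_1": [0.0, 0.1, 0.2, 5.0, 5.1, 5.2, 20.0],
        "x_2": [0.0, 0.1, 0.2, 5.0, 5.1, 5.2, 20.0],
        "coverage": [10.0, 10.1, 10.2, 30.0, 30.1, 30.2, 90.0],
    },
    index=[f"contig_{i}" for i in range(1, 8)],
)
_clustered = run_hdbscan(
    _demo, min_cluster_size=2, min_samples=1, cache_dir=tempfile.mkdtemp()
)
# The two tight groups should receive cluster labels; the lone far-away contig
# will likely be labeled an outlier and come back as NaN.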
Example #8
def embed(
    kmers: Union[str, pd.DataFrame],
    out: str = None,
    force: bool = False,
    embed_dimensions: int = 2,
    pca_dimensions: int = 50,
    method: str = "bhsne",
    perplexity: float = 30.0,
    seed: int = 42,
    **method_args: Dict,
) -> pd.DataFrame:
    """Embed k-mers using provided `method`.

    Notes
    -----

        * `sklearn.manifold.TSNE <https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE>`_
        * `tsne.bh_sne <https://pypi.org/project/tsne/>`_
        * `UMAP <https://umap-learn.readthedocs.io/en/latest/>`_
        * `densMAP <https://umap-learn.readthedocs.io/en/latest/densmap_demo.html#better-preserving-local-density-with-densmap>`_
        * `TriMap <https://github.com/eamid/trimap>`_

    Parameters
    ----------
    kmers : str or pd.DataFrame
        </path/to/input/kmers.normalized.tsv>
    out : str, optional
        </path/to/output/kmers.out.tsv> If provided will write to `out`.
    force: bool, optional
        Whether to overwrite existing `out` file.
    embed_dimensions : int, optional
        Number of dimensions in which to embed the k-mer frequencies (the default is 2).
    pca_dimensions : int, optional
        Reduce k-mer frequencies dimensions to `pca_dimensions` (the default is 50).
        If zero, will skip this step.
    method : str, optional
        embedding method to use (the default is 'bhsne').
        choices include sksne, bhsne, umap, trimap and densmap.
    perplexity : float, optional
        hyperparameter used to tune sksne and bhsne (the default is 30.0).
    seed: int, optional
        Seed to use for `method`. Allows for reproducibility from random state.
    **method_args : dict, optional
        Other arguments to be supplied to respective `method`.

    Returns
    -------
    pd.DataFrame
        embedded dataframe with index='contig' and cols=['x_1', 'x_2', ..., 'x_{embed_dimensions}']

    Raises
    -------
    TypeError
        Provided `kmers` is not a str or pd.DataFrame.
    TableFormatError
        Provided `kmers` or `out` are not formatted correctly for use.
    ValueError
        Provided `method` is not an available choice.
    FileNotFoundError
        Provided `kmers` file or DataFrame contains no data.
    """
    if isinstance(kmers,
                  str) and os.path.exists(kmers) and os.path.getsize(kmers):
        try:
            df = pd.read_csv(kmers, sep="\t", index_col="contig")
        except ValueError:
            raise TableFormatError(f"contig column not found in {kmers}")
    elif isinstance(kmers, pd.DataFrame):
        df = kmers
    else:
        raise TypeError(kmers)
    if out and os.path.exists(out) and os.path.getsize(out) and not force:
        logger.debug(f"k-mers frequency embedding already exists {out}")
        try:
            return pd.read_csv(out, sep="\t", index_col="contig")
        except ValueError:
            raise TableFormatError(f"contig column not found in {out}")

    if df.empty:
        kmers_desc = f"kmers:{kmers} type:{type(kmers)}"
        embed_desc = f"out:{out} type:{type(out)}"
        requirements = f"Given pd.DataFrame is empty!"
        raise FileNotFoundError(f"{kmers_desc} {embed_desc} {requirements}")

    method = method.lower()
    choices = {"umap", "sksne", "bhsne", "densmap", "trimap"}
    if method not in choices:
        raise ValueError(
            f"{method} not in embedding methods. Choices: {', '.join(choices)}"
        )
    # PCA
    n_samples, n_components = df.shape
    # Drop any rows where all cols contain NaN. This may occur if the contig length is below the k-mer size
    X = df.dropna(axis="index", how="all").fillna(0).to_numpy()
    # Set random state using provided seed
    random_state = np.random.RandomState(seed)
    if isinstance(pca_dimensions, str):
        try:
            pca_dimensions = int(pca_dimensions)
        except ValueError:
            raise TypeError(
                f"pca_dimensions must be an integer! given: {pca_dimensions}")
    if n_components > pca_dimensions and pca_dimensions != 0:
        logger.debug(
            f"Performing decomposition with PCA (seed {seed}): {n_components} to {pca_dimensions} dims"
        )
        X = PCA(n_components=pca_dimensions,
                random_state=random_state).fit_transform(X)
        # X = PCA(n_components='mle').fit_transform(X)
        n_samples, n_components = X.shape

    logger.debug(
        f"{method}: {n_samples} data points and {n_components} dimensions")

    # Adjust perplexity according to the number of data points
    n_rows = n_samples - 1
    scaler = 3.0
    if n_rows < (scaler * perplexity):
        perplexity = (n_rows / scaler) - 1
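    # Worked example of the adjustment above: with 50 samples and the default
    # perplexity of 30, n_rows = 49 < 3.0 * 30 = 90, so perplexity is lowered to
    # 49 / 3.0 - 1 ~= 15.3, keeping roughly 3 * perplexity below the sample count.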

    # Forward any user-supplied **method_args to the underlying method.
    def do_sksne(**kwargs):
        return TSNE(
            n_components=embed_dimensions,
            perplexity=perplexity,
            random_state=random_state,
            **kwargs,
        ).fit_transform(X)

    def do_bhsne(**kwargs):
        return bh_sne(data=X,
                      d=embed_dimensions,
                      perplexity=perplexity,
                      random_state=random_state,
                      **kwargs)

    # def do_densne():
    #     return densne.run_densne(
    #         X, no_dims=embed_dimensions, perplexity=perplexity, rand_seed=random_state,
    #     )

    method_is_densmap = method == "densmap"

    def do_UMAP(**kwargs):
        return UMAP(
            n_neighbors=15,
            n_components=embed_dimensions,
            metric="euclidean",
            random_state=random_state,
            densmap=method_is_densmap,
            **kwargs,
        ).fit_transform(X)

    def do_trimap(**kwargs):
        return trimap.TRIMAP(n_dims=embed_dimensions,
                             verbose=False,
                             **kwargs).fit_transform(X)

    # TODO: Add "densne":do_densne() to dispatcher when easy install of densne is available.
    dispatcher = {
        "sksne": do_sksne,
        "bhsne": do_bhsne,
        "umap": do_UMAP,
        "densmap": do_UMAP,
        "trimap": do_trimap,
    }
    logger.debug(f"Performing embedding with {method} (seed {seed})")
    try:
        X = dispatcher[method](**method_args)
    except ValueError as err:
        if method == "sksne":
            logger.warning(
                f"embed_dimensions ({embed_dimensions}) is too high for sksne. Reducing to 3."
            )
            embed_dimensions = 3
            X = dispatcher[method](**method_args)
        else:
            raise err
    embedded_df = pd.DataFrame(X, index=df.index)
    # embedded kmers will follow columns of x_1 to x_{embed_dimensions}
    # Make 1-indexed instead of 0-index
    embedded_df.columns = embedded_df.columns.map(lambda x: f"x_{int(x)+1}")
    if out:
        embedded_df.to_csv(out, sep="\t", index=True, header=True)
        logger.debug(f"embedded.shape {embedded_df.shape} : Written {out}")
    return embedded_df
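# Usage sketch (illustration only): embed a small random "normalized k-mer" table
# with the scikit-learn t-SNE backend (method="sksne"); only numpy, pandas and
# scikit-learn are exercised. Contig names and values are made up.
_rng = np.random.default_rng(42)
_norm_df = pd.DataFrame(
    _rng.normal(size=(100, 136)),
    index=[f"contig_{i}" for i in range(100)],
)
_norm_df.index.name = "contig"
_embedded = embed(_norm_df, method="sksne", embed_dimensions=2, pca_dimensions=50)
# _embedded is indexed by contig with columns 'x_1' and 'x_2'.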
Example #9
def embed(
    kmers: Union[str, pd.DataFrame],
    out: str = None,
    force: bool = False,
    embed_dimensions: int = 2,
    pca_dimensions: int = 50,
    method: str = "bhsne",
    perplexity: float = 30.0,
    seed: int = 42,
    n_jobs: int = -1,
    **method_kwargs: Dict[str, Any],
) -> pd.DataFrame:
    """Embed k-mers using provided `method`.

    Notes
    -----

        * `sklearn.manifold.TSNE <https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE>`_
        * `tsne.bh_sne <https://pypi.org/project/tsne/>`_
        * `UMAP <https://umap-learn.readthedocs.io/en/latest/>`_
        * `densMAP <https://umap-learn.readthedocs.io/en/latest/densmap_demo.html#better-preserving-local-density-with-densmap>`_
        * `TriMap <https://github.com/eamid/trimap>`_

    Parameters
    ----------
    kmers : str or pd.DataFrame
        </path/to/input/kmers.normalized.tsv>

    out : str, optional
        </path/to/output/kmers.out.tsv> If provided will write to `out`.

    force: bool, optional
        Whether to overwrite existing `out` file.

    embed_dimensions : int, optional
        Number of dimensions in which to embed the k-mer frequencies (the default is 2).

        The output embedded kmers will follow columns of `x_1` to `x_{embed_dimensions}`

        NOTE: The columns are 1-indexed, i.e. at x_1 *not* x_0

    pca_dimensions : int, optional
        Reduce k-mer frequencies dimensions to `pca_dimensions` (the default is 50).
        If zero, will skip this step.

    method : str, optional
        embedding method to use (the default is 'bhsne').
        choices include sksne, bhsne, umap, trimap and densmap.

    perplexity : float, optional
        hyperparameter used to tune sksne and bhsne (the default is 30.0).

    seed: int, optional
        Seed to use for `method`. Allows for reproducibility from random state.

    n_jobs: int, optional

        Used with `sksne`, `densmap` and `umap`, (the default is -1 which will attempt to use all available CPUs)

        Note
        ----

        For n_jobs below -1, (CPUS + 1 + n_jobs) are used. For example with n_jobs=-2, all CPUs but one are used.

        * scikit-learn TSNE `n_jobs glossary <https://scikit-learn.org/stable/glossary.html#term-n_jobs>`_
        * UMAP and DensMAP's `invocation <https://github.com/lmcinnes/umap/blob/2c5232f7b946efab30e279c0b095b37f5648ed8b/umap/umap_.py#L328-L341>`_
          uses this with `pynndescent <https://github.com/lmcinnes/pynndescent/blob/cc6ed32e25f7afb14913bff04d3b01723b33e5b5/pynndescent/pynndescent_.py#L629-L632>`_


    **method_kwargs : Dict[str, Any], optional

        Other keyword arguments (kwargs) to be supplied to respective `method`.

        Examples
        --------

        Set UMAP(verbose=True, output_dens=True) using **method_kwargs
        >>> embed_df = kmers.embed(
        ...     norm_df,
        ...     method='densmap',
        ...     embed_dimensions=2,
        ...     n_jobs=None,
        ...     **{
        ...         'verbose': True,
        ...         'output_dens': True,
        ...     }
        ... )

        NOTE: Setting duplicate arguments will result in an error

        Here we specify ``UMAP(densmap=True)`` using ``method='densmap'``
        and also attempt to overwrite to ``UMAP(densmap=False)``
        with the method_kwargs, ``**{'densmap':False}``, resulting
        in a TypeError.

        >>> embed_df = kmers.embed(
        ...     df,
        ...     method='densmap',
        ...     embed_dimensions=2,
        ...     n_jobs=4,
        ...     **{'densmap': False}
        ... )
        TypeError: umap.umap_.UMAP() got multiple values for keyword argument 'densmap'

        Typically, you will not require the use of method_kwargs as this is only available
        for applying advanced parameter settings to any of the available embedding methods.

    Returns
    -------
    pd.DataFrame
        embedded dataframe with index='contig' and cols=['x_1', 'x_2', ..., 'x_{embed_dimensions}'],
        plus 'original_local_radius' and 'embedded_local_radius' columns when densMAP
        is run with output_dens=True

    Raises
    -------
    TypeError
        Provided `kmers` is not a str or pd.DataFrame.
    TableFormatError
        Provided `kmers` or `out` are not formatted correctly for use.
    ValueError
        Provided `method` is not an available choice.
    FileNotFoundError
        Provided `kmers` file or DataFrame contains no data.
    """
    if isinstance(kmers,
                  str) and os.path.exists(kmers) and os.path.getsize(kmers):
        try:
            df = pd.read_csv(kmers, sep="\t", index_col="contig")
        except ValueError:
            raise TableFormatError(f"contig column not found in {kmers}")
    elif isinstance(kmers, pd.DataFrame):
        df = kmers
    else:
        raise TypeError(kmers)
    if out and os.path.exists(out) and os.path.getsize(out) and not force:
        logger.debug(f"k-mers frequency embedding already exists {out}")
        try:
            return pd.read_csv(out, sep="\t", index_col="contig")
        except ValueError:
            raise TableFormatError(f"contig column not found in {out}")

    if df.empty:
        kmers_desc = f"kmers:{kmers} type:{type(kmers)}"
        embed_desc = f"out:{out} type:{type(out)}"
        requirements = f"Given pd.DataFrame is empty!"
        raise FileNotFoundError(f"{kmers_desc} {embed_desc} {requirements}")

    method = method.lower()
    choices = {"umap", "sksne", "bhsne", "densmap", "trimap"}
    if method not in choices:
        raise ValueError(
            f"{method} not in embedding methods. Choices: {', '.join(choices)}"
        )
    # PCA
    n_samples, n_components = df.shape
    # Drop any rows where all cols contain NaN. This may occur if the contig length is below the k-mer size
    X = df.dropna(axis="index", how="all").fillna(0).to_numpy()
    # Set random state using provided seed
    random_state = np.random.RandomState(seed)
    if isinstance(pca_dimensions, str):
        try:
            pca_dimensions = int(pca_dimensions)
        except ValueError:
            raise TypeError(
                f"pca_dimensions must be an integer! given: {pca_dimensions}")
    if n_components > pca_dimensions and pca_dimensions != 0:
        logger.debug(
            f"Performing decomposition with PCA (seed {seed}): {n_components} to {pca_dimensions} dims"
        )
        X = PCA(n_components=pca_dimensions,
                random_state=random_state).fit_transform(X)
        # X = PCA(n_components='mle').fit_transform(X)
        n_samples, n_components = X.shape

    logger.debug(
        f"{method}: {n_samples} data points and {n_components} dimensions")

    # Adjust perplexity according to the number of data points
    n_rows = n_samples - 1
    scaler = 3.0
    if n_rows < (scaler * perplexity):
        perplexity = (n_rows / scaler) - 1

    def do_sksne():
        return TSNE(
            n_components=embed_dimensions,
            perplexity=perplexity,
            random_state=random_state,
            n_jobs=n_jobs,
            **method_kwargs,
        ).fit_transform(X)

    def do_bhsne():
        return bh_sne(
            data=X,
            d=embed_dimensions,
            perplexity=perplexity,
            random_state=random_state,
            **method_kwargs,
        )

    # def do_densne():
    #     return densne.run_densne(
    #         X, no_dims=embed_dimensions, perplexity=perplexity, rand_seed=random_state, **method_kwargs
    #     )

    method_is_densmap = method == "densmap"

    def do_UMAP():
        return UMAP(
            n_neighbors=15,
            n_components=embed_dimensions,
            metric="euclidean",
            random_state=random_state,
            densmap=method_is_densmap,
            n_jobs=n_jobs,
            **method_kwargs,
        ).fit_transform(X)

    def do_trimap():
        return TRIMAP(n_dims=embed_dimensions, verbose=False,
                      **method_kwargs).fit_transform(X)

    # TODO: Add "densne":do_densne() to dispatcher when easy install of densne is available.
    dispatcher = {
        "sksne": do_sksne,
        "bhsne": do_bhsne,
        "umap": do_UMAP,
        "densmap": do_UMAP,
        "trimap": do_trimap,
    }
    logger.debug(f"Performing embedding with {method} (seed {seed})")
    try:
        X = dispatcher[method]()
    except ValueError as err:
        if method == "sksne":
            logger.warning(
                f"embed_dimensions ({embed_dimensions}) is too high for sksne. Reducing to 3."
            )
            embed_dimensions = 3
            X = dispatcher[method]()
        else:
            raise err

    embed_cols = [f"x_{col}" for col in range(1, embed_dimensions + 1)]
    if isinstance(X, tuple):
        # When method_kwargs = **{'output_dens': True}
        # X : tuple[np.ndarray, np.ndarray, np.ndarray]
        # X : tuple[embedding, original local radii, embedding local radii]
        output_dens_ndarray_cols = [
            embed_cols,
            ["original_local_radius"],
            ["embedded_local_radius"],
        ]
        embedded_df = pd.concat(
            [
                pd.DataFrame(result, index=df.index, columns=cols)
                for result, cols in zip(X, output_dens_ndarray_cols)
            ],
            axis=1,
        )
    elif isinstance(X, np.ndarray):
        embedded_df = pd.DataFrame(X, index=df.index, columns=embed_cols)
    else:
        logger.warning(
            f"Unrecognized {method} transform (method_kwargs={method_kwargs}) output type: {type(X)}"
        )
        embedded_df = pd.DataFrame(X, index=df.index, columns=embed_cols)
    if out:
        embedded_df.to_csv(out, sep="\t", index=True, header=True)
        logger.debug(f"embedded.shape {embedded_df.shape} : Written {out}")
    return embedded_df
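# Sketch of the tuple-handling branch above (illustration only): when densMAP is
# run with output_dens=True, fit_transform returns (embedding, original local
# radii, embedded local radii), which are concatenated into one frame. The arrays
# below are made-up stand-ins for that output.
_index = pd.Index(["contig_1", "contig_2"], name="contig")
_embedding = np.array([[0.1, 0.2], [0.3, 0.4]])
_original_radii = np.array([0.5, 0.6])
_embedded_radii = np.array([0.7, 0.8])
_demo = pd.concat(
    [
        pd.DataFrame(arr, index=_index, columns=cols)
        for arr, cols in zip(
            (_embedding, _original_radii, _embedded_radii),
            (["x_1", "x_2"], ["original_local_radius"], ["embedded_local_radius"]),
        )
    ],
    axis=1,
)
# _demo columns: ['x_1', 'x_2', 'original_local_radius', 'embedded_local_radius']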