Example #1

import logging
from pathlib import Path
from typing import Union

import pandas as pd

# load_markers, fragmentation_metric, and get_agg_stats are assumed to be
# provided by the surrounding Autometa module.
logger = logging.getLogger(__name__)

def get_metabin_stats(
    bin_df: pd.DataFrame,
    markers: Union[str, pd.DataFrame],
    cluster_col: str = "cluster",
) -> pd.DataFrame:
    """Retrieve statistics for all clusters recovered from Autometa binning.

    Parameters
    ----------
    bin_df : pd.DataFrame
        Autometa binning table. index=contig, cols=['cluster','length', 'gc_content', 'coverage', ...]
    markers : str or pd.DataFrame
        Path to or pd.DataFrame of markers table corresponding to contigs in `bin_df`
    cluster_col : str, optional
        Clustering column by which to group metabins

    Returns
    -------
    pd.DataFrame
        dataframe consisting of various metagenome-assembled genome statistics indexed by cluster.

    Raises
    ------
    TypeError
        markers should be a path to or pd.DataFrame of a markers table corresponding to contigs in `bin_df`

    ValueError
        One of the required columns (`cluster_col`, coverage, length, gc_content) was not found in `bin_df`
    """
    logger.info(f"Retrieving metabins' stats for {cluster_col}")
    if isinstance(markers, (str, Path)):
        markers_df = load_markers(markers)
    elif isinstance(markers, pd.DataFrame):
        markers_df = markers
    else:
        raise TypeError(
            f"`markers` should be a path to or pd.DataFrame of a markers table corresponding to contigs in `bin_df`. Provided: {type(markers)}, {markers}"
        )

    metabin_stat_cols = [cluster_col, "coverage", "length", "gc_content"]
    for col in metabin_stat_cols:
        if col not in bin_df.columns:
            raise ValueError(
                f"Required column ({col}) not in bin_df columns: {bin_df.columns}"
            )
    # If the indices do not match, marker calculations will fail
    if bin_df.index.name != "contig":
        raise ValueError(
            f"binning dataframe must be indexed by contig. given: {bin_df.index.name}."
            "\n\tTry:"
            "\n\t\tbin_df.set_index('contig', inplace=True)")

    df = bin_df[metabin_stat_cols].fillna(value={cluster_col: "unclustered"}).copy()

    clusters = df.join(markers_df, how="outer").groupby(cluster_col)

    percent_metagenome_size = clusters.length.sum() / df.length.sum() * 100
    percent_metagenome_seqs = clusters.size() / df.shape[0] * 100
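
    # Marker-based quality metrics: completeness is the percent of expected
    # unique markers found at least once in the cluster; purity is the percent
    # of found markers that are single-copy (present exactly once).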
    marker_counts = clusters[markers_df.columns].sum()
    cluster_marker_sum = marker_counts.sum(axis=1)
    redundant_marker_count = marker_counts.gt(1).sum(axis=1)
    single_copy_marker_count = marker_counts.eq(1).sum(axis=1)
    unique_marker_count = marker_counts.ge(1).sum(axis=1)
    expected_unique_marker_count = markers_df.shape[1]
    completeness = unique_marker_count / expected_unique_marker_count * 100
    purity = single_copy_marker_count / unique_marker_count * 100
    stats_df = pd.DataFrame({
        "nseqs": clusters.size(),
        "size (bp)": clusters.length.sum(),
        "completeness": completeness,
        "purity": purity,
        "marker_sum": cluster_marker_sum,
        "unique_marker_count": unique_marker_count,
        "single_copy_marker_count": single_copy_marker_count,
        "redundant_marker_count": redundant_marker_count,
        "expected_unique_marker_count": expected_unique_marker_count,
        "percent_of_metagenome_seqs": percent_metagenome_seqs,
        "percent_of_metagenome_size": percent_metagenome_size,
        "N90": clusters.apply(fragmentation_metric, quality_measure=0.9),
        "N50": clusters.apply(fragmentation_metric, quality_measure=0.5),
        "N10": clusters.apply(fragmentation_metric, quality_measure=0.1),
    })
    coverage_stats = get_agg_stats(clusters, "coverage")
    gc_content_stats = get_agg_stats(clusters, "gc_content")
    return (
        pd.concat([stats_df, coverage_stats, gc_content_stats], axis=1)
        .round(2)
        .convert_dtypes()
    )
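
A minimal usage sketch for get_metabin_stats (file paths are hypothetical
placeholders; the binning table must be indexed by contig, as the function
enforces):

bin_df = pd.read_csv("binning.tsv", sep="\t", index_col="contig")
stats_df = get_metabin_stats(bin_df=bin_df, markers="markers.tsv", cluster_col="cluster")
stats_df.to_csv("metabin_stats.tsv", sep="\t", index=True, header=True)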
Example #2
def main():
    import argparse
    import logging as logger

    import pandas as pd

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )

    parser = argparse.ArgumentParser(
        description="Autometa Large-data-mode binning by contig set selection using max-partition-size",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers",
        help="Path to k-mer counts table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--coverages",
        help="Path to metagenome coverages table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--gc-content",
        help="Path to metagenome GC contents table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help="Path to Autometa annotated markers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--taxonomy",
        metavar="filepath",
        help="Path to Autometa assigned taxonomies table",
        required=True,
    )
    parser.add_argument(
        "--output-binning",
        help="Path to write Autometa binning results",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after binning",
        metavar="filepath",
    )
    parser.add_argument(
        "--clustering-method",
        help="Clustering algorithm to use for recursive binning.",
        choices=["dbscan", "hdbscan"],
        default="dbscan",
    )
    parser.add_argument(
        "--completeness",
        help="completeness cutoff to retain cluster."
        " e.g. cluster completeness >= `completeness`",
        default=20.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--purity",
        help="purity cutoff to retain cluster. e.g. cluster purity >= `purity`",
        default=95.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--cov-stddev-limit",
        help="coverage standard deviation limit to retain cluster"
        " e.g. cluster coverage standard deviation <= `cov-stddev-limit`",
        default=25.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--gc-stddev-limit",
        help="GC content standard deviation limit to retain cluster"
        " e.g. cluster GC content standard deviation <= `gc-content-stddev-limit`",
        default=5.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--norm-method",
        help="kmer normalization method to use on kmer counts",
        default="am_clr",
        choices=[
            "am_clr",
            "ilr",
            "clr",
        ],
    )
    parser.add_argument(
        "--pca-dims",
        help="PCA dimensions to reduce normalized kmer frequencies prior to embedding",
        default=50,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--embed-method",
        help="kmer embedding method to use on normalized kmer frequencies",
        default="bhsne",
        choices=[
            "bhsne",
            "umap",
            "sksne",
            "trimap",
        ],
    )
    parser.add_argument(
        "--embed-dims",
        help="Embedding dimensions to reduce normalized kmers table after PCA.",
        default=2,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--max-partition-size",
        help="Maximum number of contigs to consider for a recursive binning batch.",
        default=10000,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--starting-rank",
        help="Canonical rank at which to begin subsetting taxonomy",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--reverse-ranks",
        action="store_true",
        default=False,
        help="Reverse order at which to split taxonomy by canonical-rank."
        " When `--reverse-ranks` is given, contigs will be split in order of"
        " species, genus, family, order, class, phylum, superkingdom.",
    )
    parser.add_argument(
        "--cache",
        help="Directory to store itermediate checkpoint files during binning"
        " (If this is provided and the job fails, the script will attempt to"
        " begin from the checkpoints in this cache directory).",
        metavar="dirpath",
    )
    parser.add_argument(
        "--binning-checkpoints",
        help="File path to store itermediate contig binning results"
        " (The `--cache` argument is required for this feature). If  "
        "`--cache` is provided without this argument, a binning checkpoints file will be created.",
        metavar="filepath",
    )
    parser.add_argument(
        "--rank-filter",
        help="Canonical rank (taxonomy table column) to subset using the value of `--rank-name-filter`",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--rank-name-filter",
        help="Only retain contigs whose `--rank-filter` column matches this name",
        default="bacteria",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="log debug information",
    )
    parser.add_argument(
        "--cpus",
        default=-1,
        metavar="int",
        type=int,
        help="Number of cores to use for the clustering method (default: use all available)",
    )
    args = parser.parse_args()

    counts_df = pd.read_csv(args.kmers, sep="\t", index_col="contig")
    # First check if we are performing binning with taxonomic partitioning
    # (note: --taxonomy is required above, so this branch is always taken as
    # written; the else branch would only apply if --taxonomy were optional)
    if args.taxonomy:
        main_df = read_annotations(
            [args.coverages, args.gc_content, args.taxonomy])
        main_df = filter_taxonomy(df=main_df,
                                  rank=args.rank_filter,
                                  name=args.rank_name_filter)
    else:
        main_df = read_annotations([args.coverages, args.gc_content])
        embed_df = get_kmer_embedding(
            counts=counts_df,
            norm_method=args.norm_method,
            pca_dimensions=args.pca_dims,
            embed_dimensions=args.embed_dims,
            embed_method=args.embed_method,
            cache_fpath=None,
        )
        main_df = pd.merge(main_df,
                           embed_df,
                           how="left",
                           left_index=True,
                           right_index=True)

    # Prepare our markers dataframe
    markers_df = load_markers(args.markers, format="wide")

    # Ensure we have marker-containing contigs available to check binning quality...
    if main_df.loc[main_df.index.isin(markers_df.index)].empty:
        raise TableFormatError(
            "No markers for contigs in table. Unable to assess binning quality"
        )
    if main_df.shape[0] <= 1:
        raise BinningError("Not enough contigs in table for binning")

    contigs_containing_markers_count = main_df.index.isin(
        markers_df.index).sum()
    contigs_containing_markers_percent = (contigs_containing_markers_count /
                                          main_df.shape[0] * 100)
    logger.info(
        f"{contigs_containing_markers_count:,} sequences contain markers ({contigs_containing_markers_percent:.2f}% of total in binning features table)"
    )
    logger.info(f"Selected clustering method: {args.clustering_method}")

    main_out = cluster_by_taxon_partitioning(
        main=main_df,
        counts=counts_df,
        markers=markers_df,
        norm_method=args.norm_method,
        pca_dimensions=args.pca_dims,
        embed_dimensions=args.embed_dims,
        embed_method=args.embed_method,
        max_partition_size=args.max_partition_size,
        completeness=args.completeness,
        purity=args.purity,
        coverage_stddev=args.cov_stddev_limit,
        gc_content_stddev=args.gc_stddev_limit,
        starting_rank=args.starting_rank,
        method=args.clustering_method,
        reverse_ranks=args.reverse_ranks,
        cache=args.cache,
        binning_checkpoints_fpath=args.binning_checkpoints,
        n_jobs=args.cpus,
        verbose=args.verbose,
    )

    write_results(
        results=main_out,
        binning_output=args.output_binning,
        full_output=args.output_main,
    )
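
A hedged smoke-test sketch for the CLI above, driving main() through sys.argv
(the script name and all file paths are hypothetical placeholders):

import sys

sys.argv = [
    "large_data_mode.py",
    "--kmers", "kmers.tsv",
    "--coverages", "coverages.tsv",
    "--gc-content", "gc_content.tsv",
    "--markers", "markers.tsv",
    "--taxonomy", "taxonomy.tsv",
    "--output-binning", "binning.tsv",
]
main()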
Example #3
def main():
    import argparse
    import logging as logger
    import sys

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Perform marker gene guided binning of "
        "metagenome contigs using annotations (when available) of sequence "
        "composition, coverage and homology.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers",
        help="Path to embedded k-mers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--coverages",
        help="Path to metagenome coverages table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--gc-content",
        help="Path to metagenome GC contents table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help="Path to Autometa annotated markers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-binning",
        help="Path to write Autometa binning results",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after binning",
        metavar="filepath",
    )
    parser.add_argument(
        "--clustering-method",
        help="Clustering algorithm to use for recursive binning.",
        choices=["dbscan", "hdbscan"],
        default="dbscan",
    )
    parser.add_argument(
        "--completeness",
        help="completeness cutoff to retain cluster."
        " e.g. cluster completeness >= `completeness`",
        default=20.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--purity",
        help="purity cutoff to retain cluster. e.g. cluster purity >= `purity`",
        default=95.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--cov-stddev-limit",
        help="coverage standard deviation limit to retain cluster"
        " e.g. cluster coverage standard deviation <= `cov-stddev-limit`",
        default=25.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--gc-stddev-limit",
        help="GC content standard deviation limit to retain cluster"
        " e.g. cluster GC content standard deviation <= `gc-content-stddev-limit`",
        default=5.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--taxonomy",
        metavar="filepath",
        help="Path to Autometa assigned taxonomies table",
    )
    parser.add_argument(
        "--starting-rank",
        help="Canonical rank at which to begin subsetting taxonomy",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--reverse-ranks",
        action="store_true",
        default=False,
        help="Reverse order at which to split taxonomy by canonical-rank."
        " When `--reverse-ranks` is given, contigs will be split in order of"
        " species, genus, family, order, class, phylum, superkingdom.",
    )
    parser.add_argument(
        "--rank-filter",
        help="Canonical rank (taxonomy table column) to subset using the value of `--rank-name-filter`",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--rank-name-filter",
        help="Only retain contigs whose `--rank-filter` column matches this name",
        default="bacteria",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="log debug information",
    )
    parser.add_argument(
        "--cpus",
        default=-1,
        metavar="int",
        type=int,
        help="Number of cores to use for the clustering method (default: use all available)",
    )
    args = parser.parse_args()

    # First check if we are performing binning with taxonomic partitioning
    if args.taxonomy:
        main_df = read_annotations(
            [args.kmers, args.coverages, args.gc_content, args.taxonomy])
        main_df = filter_taxonomy(df=main_df,
                                  rank=args.rank_filter,
                                  name=args.rank_name_filter)
    else:
        main_df = read_annotations(
            [args.kmers, args.coverages, args.gc_content])

    # Prepare our markers dataframe
    markers_df = load_markers(args.markers, format="wide")

    # Ensure we have marker-containing contigs available to check binning quality...
    try:
        if main_df.loc[main_df.index.isin(markers_df.index)].empty:
            raise TableFormatError(
                "No markers for contigs in table. Unable to assess binning quality"
            )
        if main_df.shape[0] <= 1:
            raise BinningError("Not enough contigs in table for binning")
    except (TableFormatError, BinningError) as err:
        logger.warning(err)
        # Exiting with an HTTP-like status code...
        # From: https://kinsta.com/blog/http-status-codes/#200-status-codes
        # 204: “No Content.”
        # This code means that the server has successfully processed the request
        # but is not going to return any content.
        sys.exit(204)
    logger.info(f"Selected clustering method: {args.clustering_method}")

    if args.taxonomy:
        main_out = taxon_guided_binning(
            main=main_df,
            markers=markers_df,
            completeness=args.completeness,
            purity=args.purity,
            coverage_stddev=args.cov_stddev_limit,
            gc_content_stddev=args.gc_stddev_limit,
            method=args.clustering_method,
            starting_rank=args.starting_rank,
            reverse_ranks=args.reverse_ranks,
            n_jobs=args.cpus,
            verbose=args.verbose,
        )
    else:
        # Perform clustering w/o taxonomy
        main_out = get_clusters(
            main=main_df,
            markers_df=markers_df,
            completeness=args.completeness,
            purity=args.purity,
            coverage_stddev=args.cov_stddev_limit,
            gc_content_stddev=args.gc_stddev_limit,
            method=args.clustering_method,
            n_jobs=args.cpus,
            verbose=args.verbose,
        )

    write_results(
        results=main_out,
        binning_output=args.output_binning,
        full_output=args.output_main,
    )
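
Because this entry point exits with status 204 when no marker-containing
contigs are available, a calling pipeline can treat that code as "no content"
rather than a hard failure. A sketch under that assumption (the script path
and arguments are hypothetical placeholders):

import subprocess

result = subprocess.run(["python", "binning.py", "--kmers", "kmers.tsv"])  # plus the other required args
if result.returncode == 204:
    print("No bins produced; skipping downstream steps")
elif result.returncode != 0:
    raise RuntimeError(f"binning failed with exit code {result.returncode}")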
Example #4
# fixture_markers appears to be a pytest fixture from a test module that was
# concatenated here; the decorator and import are assumed so it stands alone.
import pytest

@pytest.fixture
def fixture_markers(markers_fpath):
    return load_markers(markers_fpath)

def main():
    import argparse
    import logging as logger
    import sys

    import pandas as pd

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )

    parser = argparse.ArgumentParser(
        description="Recruit unclustered contigs given metagenome annotations and Autometa binning results."
        " Note: All tables must contain a 'contig' column to be used as the unique table index",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers", help="Path to normalized kmer frequencies table.", required=True
    )
    parser.add_argument("--coverage", help="Path to coverage table.", required=True)
    parser.add_argument(
        "--binning",
        help="Path to autometa binning output [will look for col='cluster']",
        required=True,
    )
    parser.add_argument(
        "--markers", help="Path to domain-specific markers table.", required=True
    )
    parser.add_argument(
        "--output-binning",
        help="Path to output unclustered recruitment table.",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after unclustered recruitment.",
        required=False,
    )
    parser.add_argument(
        "--output-features",
        help="Path to write Autometa features table used during unclustered recruitment.",
        required=False,
    )
    parser.add_argument("--taxonomy", help="Path to taxonomy table.")
    parser.add_argument(
        "--taxa-dimensions",
        help="Num of dimensions to reduce taxonomy encodings",
        type=int,
    )
    parser.add_argument(
        "--additional-features",
        help="Path to additional features with which to add to classifier training data.",
        nargs="*",
        default=[],
    )
    parser.add_argument(
        "--confidence",
        help="Percent confidence to allow classification (confidence = num. consistent predictions/num. classifications)",
        default=1.0,
        type=float,
    )
    parser.add_argument(
        "--num-classifications",
        help="Num classifications for predicting/validating contig cluster recruitment",
        default=10,
        type=int,
    )
    parser.add_argument(
        "--classifier",
        help="classifier to use for recruitment of contigs",
        default="decision_tree",
        choices=["decision_tree", "random_forest"],
    )
    parser.add_argument(
        "--kmer-dimensions",
        help="Num of dimensions to reduce normalized k-mer frequencies",
        default=50,
        type=int,
    )
    parser.add_argument(
        "--seed",
        help="Seed to use for RandomState when initializing classifiers.",
        default=42,
        type=int,
    )
    args = parser.parse_args()

    features = get_features(
        kmers=args.kmers,
        coverage=args.coverage,
        annotations=args.additional_features,
        taxonomy=args.taxonomy,
        kmer_dimensions=args.kmer_dimensions,
        taxa_dimensions=args.taxa_dimensions,
    )
    bin_df = pd.read_csv(
        args.binning, sep="\t", index_col="contig", usecols=["contig", "cluster"]
    )
    prev_num_unclustered = bin_df[bin_df.cluster.isnull()].shape[0]
    if not prev_num_unclustered:
        logger.warning("No unclustered contigs are available to recruit!")
        sys.exit(204)
    markers_df = load_markers(fpath=args.markers, format="wide")

    logger.debug(
        f"classifier={args.classifier}, seed={args.seed}, n.estimators={args.num_classifications}, confidence={args.confidence*100}%"
    )

    n_runs = 0
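    # Iteratively recruit: retrain on the updated binning each round and stop
    # once a round yields no confident, purity-preserving predictions.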
    while True:
        n_runs += 1

        train_data, test_data = train_test_split_and_subset(
            binning=bin_df, features=features, markers=markers_df
        )

        # Perform cross-validation with n. iterations (num. estimators)
        predictions_df = get_confidence_filtered_predictions(
            train_data=train_data,
            test_data=test_data,
            num_classifications=args.num_classifications,
            confidence=args.confidence,
            classifier=args.classifier,
            seed=args.seed,
        )
        # Filter out any predictions that would reduce cluster purity
        predictions_df = filter_contaminating_predictions(
            predictions=predictions_df, markers=markers_df, binning=bin_df
        )
        # Stop if no contigs are recruited to clusters
        if predictions_df.empty:
            break

        bin_df = add_predictions(binning=bin_df, predictions=predictions_df)

    # Unclustered recruitment finished
    # Determine the resulting number of unclustered contigs.
    now_num_unclustered = bin_df[bin_df.cluster.isnull()].shape[0]

    n_recruited = prev_num_unclustered - now_num_unclustered
    logger.info(
        f"unclustered {prev_num_unclustered} -> {now_num_unclustered} (recruited {n_recruited} contigs) in {n_runs} runs"
    )
    # Re-read in the binning dataframe to merge with the newly recruited contigs
    prev_bin_df = pd.read_csv(args.binning, sep="\t", index_col="contig")
    bin_df.rename(columns={"cluster": "recruited_cluster"}, inplace=True)
    binning_df = pd.merge(
        prev_bin_df[["cluster"]],
        bin_df[["recruited_cluster"]],
        left_index=True,
        right_index=True,
    )
    # Write unclustered recruitment results into binning df
    # index = 'contig', cols = ['cluster', 'recruited_cluster']
    binning_df.to_csv(
        args.output_binning, sep="\t", index=True, header=True, float_format="%.5f"
    )
    if args.output_main:
        main_df = pd.merge(
            prev_bin_df,
            bin_df[["recruited_cluster"]],
            left_index=True,
            right_index=True,
        )
        main_df.to_csv(
            args.output_main, sep="\t", index=True, header=True, float_format="%.5f"
        )
    if args.output_features:
        # Outputs features matrix used as input to recruitment algorithm
        features.to_csv(
            args.output_features, sep="\t", index=True, header=True, float_format="%.5f"
        )
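
A short follow-up sketch for inspecting the recruitment output written above
(the output path is a hypothetical placeholder; per the comment in the code,
the table is indexed by contig with 'cluster' and 'recruited_cluster' columns):

import pandas as pd

binning_df = pd.read_csv("recruitment_binning.tsv", sep="\t", index_col="contig")
newly_recruited = binning_df[binning_df.cluster.isnull() & binning_df.recruited_cluster.notnull()]
print(f"{newly_recruited.shape[0]:,} previously unclustered contigs were recruited")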