def get_metabin_stats(
    bin_df: pd.DataFrame,
    markers: Union[str, pd.DataFrame],
    cluster_col: str = "cluster",
) -> pd.DataFrame:
    """Retrieve statistics for all clusters recovered from Autometa binning.

    Parameters
    ----------
    bin_df : pd.DataFrame
        Autometa binning table. index=contig, cols=['cluster', 'length', 'gc_content', 'coverage', ...]
    markers : str, pd.DataFrame
        Path to or pd.DataFrame of markers table corresponding to contigs in `bin_df`
    cluster_col : str, optional
        Clustering column by which to group metabins

    Returns
    -------
    pd.DataFrame
        dataframe consisting of various metagenome-assembled genome statistics indexed by cluster.

    Raises
    ------
    TypeError
        markers should be a path to or pd.DataFrame of a markers table corresponding to contigs in `bin_df`
    ValueError
        One of the required columns (`cluster_col`, coverage, length, gc_content) was not found in `bin_df`
    """
    logger.info(f"Retrieving metabins' stats for {cluster_col}")
    if isinstance(markers, (str, Path)):
        markers_df = load_markers(markers)
    elif isinstance(markers, pd.DataFrame):
        markers_df = markers
    else:
        raise TypeError(
            f"`markers` should be a path to or pd.DataFrame of a markers table corresponding to contigs in `bin_df`. Provided: {type(markers)}, {markers}"
        )
    metabin_stat_cols = [cluster_col, "coverage", "length", "gc_content"]
    for col in metabin_stat_cols:
        if col not in bin_df.columns:
            raise ValueError(
                f"Required column ({col}) not in bin_df columns: {bin_df.columns}"
            )
    # If the indices do not match, marker calculations will fail
    if bin_df.index.name != "contig":
        raise ValueError(
            f"binning dataframe must be indexed by contig. given: {bin_df.index.name}."
            "\n\tTry:"
            "\n\t\tbin_df.set_index('contig', inplace=True)"
        )
    df = bin_df[metabin_stat_cols].fillna(value={cluster_col: "unclustered"}).copy()
    clusters = df.join(markers_df, how="outer").groupby(cluster_col)
    percent_metagenome_size = clusters.length.sum() / df.length.sum() * 100
    percent_metagenome_seqs = clusters.size() / df.shape[0] * 100
    marker_counts = clusters[markers_df.columns].sum()
    cluster_marker_sum = marker_counts.sum(axis=1)
    redundant_marker_count = marker_counts.gt(1).sum(axis=1)
    single_copy_marker_count = marker_counts.eq(1).sum(axis=1)
    unique_marker_count = marker_counts.ge(1).sum(axis=1)
    expected_unique_marker_count = markers_df.shape[1]
    completeness = unique_marker_count / expected_unique_marker_count * 100
    purity = single_copy_marker_count / unique_marker_count * 100
    stats_df = pd.DataFrame(
        {
            "nseqs": clusters.size(),
            "size (bp)": clusters.length.sum(),
            "completeness": completeness,
            "purity": purity,
            "marker_sum": cluster_marker_sum,
            "unique_marker_count": unique_marker_count,
            "single_copy_marker_count": single_copy_marker_count,
            "redundant_marker_count": redundant_marker_count,
            "expected_unique_marker_count": expected_unique_marker_count,
            "percent_of_metagenome_seqs": percent_metagenome_seqs,
            "percent_of_metagenome_size": percent_metagenome_size,
            "N90": clusters.apply(fragmentation_metric, quality_measure=0.9),
            "N50": clusters.apply(fragmentation_metric, quality_measure=0.5),
            "N10": clusters.apply(fragmentation_metric, quality_measure=0.1),
        }
    )
    coverage_stats = get_agg_stats(clusters, "coverage")
    gc_content_stats = get_agg_stats(clusters, "gc_content")
    return (
        pd.concat([stats_df, coverage_stats, gc_content_stats], axis=1)
        .round(2)
        .convert_dtypes()
    )
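
# Example usage of get_metabin_stats (a minimal sketch; the file paths are
# hypothetical). Any contig-indexed binning table containing 'cluster',
# 'length', 'coverage' and 'gc_content' columns should work:
#
#     bin_df = pd.read_csv("binning.tsv", sep="\t", index_col="contig")
#     stats_df = get_metabin_stats(bin_df=bin_df, markers="markers.tsv")
#     stats_df.to_csv("metabin_stats.tsv", sep="\t", index=True, header=True)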
def main():
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Autometa Large-data-mode binning by contig set selection using max-partition-size",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers",
        help="Path to k-mer counts table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--coverages",
        help="Path to metagenome coverages table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--gc-content",
        help="Path to metagenome GC content table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help="Path to Autometa annotated markers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--taxonomy",
        metavar="filepath",
        help="Path to Autometa assigned taxonomies table",
        required=True,
    )
    parser.add_argument(
        "--output-binning",
        help="Path to write Autometa binning results",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after binning",
        metavar="filepath",
    )
    parser.add_argument(
        "--clustering-method",
        help="Clustering algorithm to use for recursive binning.",
        choices=["dbscan", "hdbscan"],
        default="dbscan",
    )
    parser.add_argument(
        "--completeness",
        help="completeness cutoff to retain cluster."
        " e.g. cluster completeness >= `completeness`",
        default=20.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--purity",
        help="purity cutoff to retain cluster. e.g. cluster purity >= `purity`",
        default=95.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--cov-stddev-limit",
        help="coverage standard deviation limit to retain cluster"
        " e.g. cluster coverage standard deviation <= `cov-stddev-limit`",
        default=25.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--gc-stddev-limit",
        help="GC content standard deviation limit to retain cluster"
        " e.g. cluster GC content standard deviation <= `gc-stddev-limit`",
        default=5.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--norm-method",
        help="kmer normalization method to use on kmer counts",
        default="am_clr",
        choices=["am_clr", "ilr", "clr"],
    )
    parser.add_argument(
        "--pca-dims",
        help="PCA dimensions to reduce normalized kmer frequencies prior to embedding",
        default=50,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--embed-method",
        help="kmer embedding method to use on normalized kmer frequencies",
        default="bhsne",
        choices=["bhsne", "umap", "sksne", "trimap"],
    )
    parser.add_argument(
        "--embed-dims",
        help="Embedding dimensions to reduce normalized kmers table after PCA.",
        default=2,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--max-partition-size",
        help="Maximum number of contigs to consider for a recursive binning batch.",
        default=10000,
        metavar="int",
        type=int,
    )
    parser.add_argument(
        "--starting-rank",
        help="Canonical rank at which to begin subsetting taxonomy",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--reverse-ranks",
        action="store_true",
        default=False,
        help="Reverse order at which to split taxonomy by canonical-rank."
        " When `--reverse-ranks` is given, contigs will be split in order of"
        " species, genus, family, order, class, phylum, superkingdom.",
    )
    parser.add_argument(
        "--cache",
        help="Directory to store intermediate checkpoint files during binning"
        " (If this is provided and the job fails, the script will attempt to"
        " begin from the checkpoints in this cache directory).",
        metavar="dirpath",
    )
    parser.add_argument(
        "--binning-checkpoints",
        help="File path to store intermediate contig binning results"
        " (The `--cache` argument is required for this feature). If"
        " `--cache` is provided without this argument, a binning checkpoints file will be created.",
        metavar="filepath",
    )
    parser.add_argument(
        "--rank-filter",
        help="Taxonomy column canonical rank to subset by provided value of `--rank-name-filter`",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--rank-name-filter",
        help="Only retrieve contigs with this name corresponding to `--rank-filter` column",
        default="bacteria",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="log debug information",
    )
    parser.add_argument(
        "--cpus",
        default=-1,
        metavar="int",
        type=int,
        help="Number of cores to use by clustering method (default will try to use as many as are available)",
    )
    args = parser.parse_args()

    counts_df = pd.read_csv(args.kmers, sep="\t", index_col="contig")

    # First check if we are performing binning with taxonomic partitioning
    if args.taxonomy:
        main_df = read_annotations([args.coverages, args.gc_content, args.taxonomy])
        main_df = filter_taxonomy(
            df=main_df, rank=args.rank_filter, name=args.rank_name_filter
        )
    else:
        main_df = read_annotations([args.coverages, args.gc_content])

    embed_df = get_kmer_embedding(
        counts=counts_df,
        norm_method=args.norm_method,
        pca_dimensions=args.pca_dims,
        embed_dimensions=args.embed_dims,
        embed_method=args.embed_method,
        cache_fpath=None,
    )
    main_df = pd.merge(main_df, embed_df, how="left", left_index=True, right_index=True)

    # Prepare our markers dataframe
    markers_df = load_markers(args.markers, format="wide")

    # Ensure we have marker-containing contigs available to check binning quality...
    if main_df.loc[main_df.index.isin(markers_df.index)].empty:
        raise TableFormatError(
            "No markers for contigs in table. Unable to assess binning quality"
        )
    if main_df.shape[0] <= 1:
        raise BinningError("Not enough contigs in table for binning")

    contigs_containing_markers_count = main_df.index.isin(markers_df.index).sum()
    contigs_containing_markers_percent = (
        contigs_containing_markers_count / main_df.shape[0] * 100
    )
    logger.info(
        f"{contigs_containing_markers_count:,} sequences contain markers ({contigs_containing_markers_percent:.2f}% of total in binning features table)"
    )
    logger.info(f"Selected clustering method: {args.clustering_method}")

    main_out = cluster_by_taxon_partitioning(
        main=main_df,
        counts=counts_df,
        markers=markers_df,
        norm_method=args.norm_method,
        pca_dimensions=args.pca_dims,
        embed_dimensions=args.embed_dims,
        embed_method=args.embed_method,
        max_partition_size=args.max_partition_size,
        completeness=args.completeness,
        purity=args.purity,
        coverage_stddev=args.cov_stddev_limit,
        gc_content_stddev=args.gc_stddev_limit,
        starting_rank=args.starting_rank,
        method=args.clustering_method,
        reverse_ranks=args.reverse_ranks,
        cache=args.cache,
        binning_checkpoints_fpath=args.binning_checkpoints,
        n_jobs=args.cpus,
        verbose=args.verbose,
    )
    write_results(
        results=main_out,
        binning_output=args.output_binning,
        full_output=args.output_main,
    )
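
# Example invocation of this large-data-mode entrypoint (a sketch: the module
# path is hypothetical, but the flags match the parser defined above; only the
# required flags plus a few common tuning options are shown):
#
#   python -m autometa.binning.large_data_mode \
#       --kmers kmer_counts.tsv \
#       --coverages coverages.tsv \
#       --gc-content gc_content.tsv \
#       --markers markers.tsv \
#       --taxonomy taxonomy.tsv \
#       --output-binning binning.tsv \
#       --clustering-method dbscan \
#       --max-partition-size 10000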
def main():
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Perform marker gene guided binning of"
        " metagenome contigs using annotations (when available) of sequence"
        " composition, coverage and homology.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers",
        help="Path to embedded k-mers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--coverages",
        help="Path to metagenome coverages table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--gc-content",
        help="Path to metagenome GC content table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help="Path to Autometa annotated markers table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-binning",
        help="Path to write Autometa binning results",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after binning",
        metavar="filepath",
    )
    parser.add_argument(
        "--clustering-method",
        help="Clustering algorithm to use for recursive binning.",
        choices=["dbscan", "hdbscan"],
        default="dbscan",
    )
    parser.add_argument(
        "--completeness",
        help="completeness cutoff to retain cluster."
        " e.g. cluster completeness >= `completeness`",
        default=20.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--purity",
        help="purity cutoff to retain cluster. e.g. cluster purity >= `purity`",
        default=95.0,
        metavar="0 < float <= 100",
        type=float,
    )
    parser.add_argument(
        "--cov-stddev-limit",
        help="coverage standard deviation limit to retain cluster"
        " e.g. cluster coverage standard deviation <= `cov-stddev-limit`",
        default=25.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--gc-stddev-limit",
        help="GC content standard deviation limit to retain cluster"
        " e.g. cluster GC content standard deviation <= `gc-stddev-limit`",
        default=5.0,
        metavar="float",
        type=float,
    )
    parser.add_argument(
        "--taxonomy",
        metavar="filepath",
        help="Path to Autometa assigned taxonomies table",
    )
    parser.add_argument(
        "--starting-rank",
        help="Canonical rank at which to begin subsetting taxonomy",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--reverse-ranks",
        action="store_true",
        default=False,
        help="Reverse order at which to split taxonomy by canonical-rank."
        " When `--reverse-ranks` is given, contigs will be split in order of"
        " species, genus, family, order, class, phylum, superkingdom.",
    )
    parser.add_argument(
        "--rank-filter",
        help="Taxonomy column canonical rank to subset by provided value of `--rank-name-filter`",
        default="superkingdom",
        choices=[
            "superkingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "species",
        ],
    )
    parser.add_argument(
        "--rank-name-filter",
        help="Only retrieve contigs with this name corresponding to `--rank-filter` column",
        default="bacteria",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="log debug information",
    )
    parser.add_argument(
        "--cpus",
        default=-1,
        metavar="int",
        type=int,
        help="Number of cores to use by clustering method (default will try to use as many as are available)",
    )
    args = parser.parse_args()

    # First check if we are performing binning with taxonomic partitioning
    if args.taxonomy:
        main_df = read_annotations(
            [args.kmers, args.coverages, args.gc_content, args.taxonomy]
        )
        main_df = filter_taxonomy(
            df=main_df, rank=args.rank_filter, name=args.rank_name_filter
        )
    else:
        main_df = read_annotations([args.kmers, args.coverages, args.gc_content])

    # Prepare our markers dataframe
    markers_df = load_markers(args.markers, format="wide")

    # Ensure we have marker-containing contigs available to check binning quality...
    try:
        if main_df.loc[main_df.index.isin(markers_df.index)].empty:
            raise TableFormatError(
                "No markers for contigs in table. Unable to assess binning quality"
            )
        if main_df.shape[0] <= 1:
            raise BinningError("Not enough contigs in table for binning")
    except (TableFormatError, BinningError) as err:
        logger.warning(err)
        # Using an HTTP error status code...
        # From: https://kinsta.com/blog/http-status-codes/#200-status-codes
        # 204: "No Content."
        # This code means that the server has successfully processed the request
        # but is not going to return any content.
        sys.exit(204)

    logger.info(f"Selected clustering method: {args.clustering_method}")

    if args.taxonomy:
        main_out = taxon_guided_binning(
            main=main_df,
            markers=markers_df,
            completeness=args.completeness,
            purity=args.purity,
            coverage_stddev=args.cov_stddev_limit,
            gc_content_stddev=args.gc_stddev_limit,
            method=args.clustering_method,
            starting_rank=args.starting_rank,
            reverse_ranks=args.reverse_ranks,
            n_jobs=args.cpus,
            verbose=args.verbose,
        )
    else:
        # Perform clustering w/o taxonomy
        main_out = get_clusters(
            main=main_df,
            markers_df=markers_df,
            completeness=args.completeness,
            purity=args.purity,
            coverage_stddev=args.cov_stddev_limit,
            gc_content_stddev=args.gc_stddev_limit,
            method=args.clustering_method,
            n_jobs=args.cpus,
            verbose=args.verbose,
        )

    write_results(
        results=main_out,
        binning_output=args.output_binning,
        full_output=args.output_main,
    )
@pytest.fixture(name="markers")
def fixture_markers(markers_fpath):
    return load_markers(markers_fpath)
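
# A stand-in markers fixture for quick experiments without a markers file
# (a sketch: the wide format is inferred from how the binning code consumes
# markers -- contig-indexed, one column per marker, values = copy counts;
# assumes `pytest` and `pandas as pd` are imported in this module, and the
# marker/contig names below are hypothetical).
@pytest.fixture(name="toy_markers")
def fixture_toy_markers():
    return pd.DataFrame(
        {"PF00001": [1, 0, 1], "PF00002": [0, 2, 1]},
        index=pd.Index(["contig_1", "contig_2", "contig_3"], name="contig"),
    )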
def main():
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Recruit unclustered contigs given metagenome annotations and Autometa binning results."
        " Note: All tables must contain a 'contig' column to be used as the unique table index",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--kmers", help="Path to normalized kmer frequencies table.", required=True
    )
    parser.add_argument("--coverage", help="Path to coverage table.", required=True)
    parser.add_argument(
        "--binning",
        help="Path to autometa binning output [will look for col='cluster']",
        required=True,
    )
    parser.add_argument(
        "--markers", help="Path to domain-specific markers table.", required=True
    )
    parser.add_argument(
        "--output-binning",
        help="Path to output unclustered recruitment table.",
        required=True,
    )
    parser.add_argument(
        "--output-main",
        help="Path to write Autometa main table used during/after unclustered recruitment.",
        required=False,
    )
    parser.add_argument(
        "--output-features",
        help="Path to write Autometa features table used during unclustered recruitment.",
        required=False,
    )
    parser.add_argument("--taxonomy", help="Path to taxonomy table.")
    parser.add_argument(
        "--taxa-dimensions",
        help="Num of dimensions to reduce taxonomy encodings",
        type=int,
    )
    parser.add_argument(
        "--additional-features",
        help="Path to additional features with which to add to classifier training data.",
        nargs="*",
        default=[],
    )
    parser.add_argument(
        "--confidence",
        help="Percent confidence to allow classification (confidence = num. consistent predictions/num. classifications)",
        default=1.0,
        type=float,
    )
    parser.add_argument(
        "--num-classifications",
        help="Num classifications for predicting/validating contig cluster recruitment",
        default=10,
        type=int,
    )
    parser.add_argument(
        "--classifier",
        help="classifier to use for recruitment of contigs",
        default="decision_tree",
        choices=["decision_tree", "random_forest"],
    )
    parser.add_argument(
        "--kmer-dimensions",
        help="Num of dimensions to reduce normalized k-mer frequencies",
        default=50,
        type=int,
    )
    parser.add_argument(
        "--seed",
        help="Seed to use for RandomState when initializing classifiers.",
        default=42,
        type=int,
    )
    args = parser.parse_args()

    features = get_features(
        kmers=args.kmers,
        coverage=args.coverage,
        annotations=args.additional_features,
        taxonomy=args.taxonomy,
        kmer_dimensions=args.kmer_dimensions,
        taxa_dimensions=args.taxa_dimensions,
    )
    bin_df = pd.read_csv(
        args.binning, sep="\t", index_col="contig", usecols=["contig", "cluster"]
    )
    prev_num_unclustered = bin_df[bin_df.cluster.isnull()].shape[0]
    if not prev_num_unclustered:
        logger.warning("No unclustered contigs are available to recruit!")
        sys.exit(204)
    markers_df = load_markers(fpath=args.markers, format="wide")
    logger.debug(
        f"classifier={args.classifier}, seed={args.seed}, n.estimators={args.num_classifications}, confidence={args.confidence*100}%"
    )
    n_runs = 0
    while True:
        n_runs += 1
        train_data, test_data = train_test_split_and_subset(
            binning=bin_df, features=features, markers=markers_df
        )
        # Perform cross-validation with n. iterations (num. estimators)
        predictions_df = get_confidence_filtered_predictions(
            train_data=train_data,
            test_data=test_data,
            num_classifications=args.num_classifications,
            confidence=args.confidence,
            classifier=args.classifier,
            seed=args.seed,
        )
        # Filter out any predictions that would reduce cluster purity
        predictions_df = filter_contaminating_predictions(
            predictions=predictions_df, markers=markers_df, binning=bin_df
        )
        # Stop if no contigs are recruited to clusters
        if predictions_df.empty:
            break
        bin_df = add_predictions(binning=bin_df, predictions=predictions_df)

    # Unclustered recruitment finished.
    # Determine the resulting number of unclustered contigs.
    now_num_unclustered = bin_df[bin_df.cluster.isnull()].shape[0]
    n_recruited = prev_num_unclustered - now_num_unclustered
    logger.info(
        f"unclustered {prev_num_unclustered} -> {now_num_unclustered} (recruited {n_recruited} contigs) in {n_runs} runs"
    )
    # Re-read the binning dataframe to merge with the newly recruited contigs
    prev_bin_df = pd.read_csv(args.binning, sep="\t", index_col="contig")
    bin_df.rename(columns={"cluster": "recruited_cluster"}, inplace=True)
    binning_df = pd.merge(
        prev_bin_df[["cluster"]],
        bin_df[["recruited_cluster"]],
        left_index=True,
        right_index=True,
    )
    # Write unclustered recruitment results into binning df
    # index = 'contig', cols = ['cluster', 'recruited_cluster']
    binning_df.to_csv(
        args.output_binning, sep="\t", index=True, header=True, float_format="%.5f"
    )
    if args.output_main:
        main_df = pd.merge(
            prev_bin_df,
            bin_df[["recruited_cluster"]],
            left_index=True,
            right_index=True,
        )
        main_df.to_csv(
            args.output_main, sep="\t", index=True, header=True, float_format="%.5f"
        )
    if args.output_features:
        # Outputs the features matrix used as input to the recruitment algorithm
        features.to_csv(
            args.output_features, sep="\t", index=True, header=True, float_format="%.5f"
        )
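
# Example invocation of this unclustered-recruitment entrypoint (a sketch: the
# module path is hypothetical, but the flags match the parser defined above;
# note the singular --coverage here, unlike the binning entrypoints):
#
#   python -m autometa.binning.unclustered_recruitment \
#       --kmers kmers_normalized.tsv \
#       --coverage coverages.tsv \
#       --binning binning.tsv \
#       --markers markers.tsv \
#       --output-binning recruitment.tsv \
#       --classifier decision_tree \
#       --seed 42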