def evaluate_classification(
    predictions: Iterable,
    reference: str,
    ncbi: Union[str, NCBI],
    keep_averages=("weighted avg", "samples avg"),
) -> Tuple[pd.DataFrame, List[Dict[str, str]]]:
    """Evaluate classification `predictions` against provided `reference`

    Parameters
    ----------
    predictions : Iterable
        Paths to taxonomic predictions (tab-delimited files of contig and taxid columns)
    reference : str
        Path to ground truths (tab-delimited file containing at least contig and taxid columns)
    ncbi : Union[str, NCBI]
        Path to NCBI databases directory or instance of autometa NCBI class
    keep_averages : collection of str, optional
        Report keys (averages) to keep from each classification report,
        by default ("weighted avg", "samples avg"). Only membership is tested,
        so any container (list, tuple, set) is accepted.

    Returns
    -------
    Tuple[pd.DataFrame, List[dict]]
        Metrics dataframe indexed by dataset (one row per kept average) and the
        list of full classification reports (each with an added "dataset" key).

    Raises
    ------
    ValueError
        `ncbi` was not provided.
    """
    if not ncbi:
        raise ValueError("--ncbi is required for the classification benchmark!")
    # Read in community reference assignments
    reference = (
        pd.read_csv(
            reference, sep="\t", usecols=["contig", "taxid"], index_col="contig"
        )
        # Convert the taxid dtype to int
        .convert_dtypes()
        # Drop any contigs missing taxid classification
        .dropna(axis="index")
    )
    # Instantiate NCBI so we can coordinate taxids
    ncbi = NCBI(ncbi) if isinstance(ncbi, str) else ncbi
    all_metrics = []
    all_reports = []
    # Compute metrics for all provided predictions
    for prediction in predictions:
        dataset = os.path.basename(prediction)
        # convert and merge taxids of reference assignments and predictions
        labels = get_target_labels(
            prediction=prediction, reference=reference, ncbi=ncbi
        )
        # Compute metrics across all canonical ranks
        report = compute_classification_metrics(labels)
        report.update({"dataset": dataset})
        all_reports.append(report)
        for average, scores in report.items():
            if average not in keep_averages:
                continue
            # Copy the scores so the report handed back to the caller is not
            # polluted with the bookkeeping keys added for the metrics table.
            all_metrics.append({**scores, "average": average, "dataset": dataset})
    df = pd.DataFrame(all_metrics).set_index("dataset")
    return df, all_reports
def get_taxonomy(self, num_orfs: int = 2):
    """Build the taxonomy test data and store it under ``self.data["taxonomy"]``.

    The blastp table is subset to the hits of the first `num_orfs` unique ORFs,
    then to the accessions returned by ``subset_acc2taxids``. The stored entry
    holds the ORF sequences, the subset blastp table (as JSON), the
    accession-to-taxid mapping and the NCBI nodes/names/merged structures.

    Parameters
    ----------
    num_orfs : int, optional
        Number of unique ORFs (in blastp table order) whose hits are kept, by default 2
    """
    logger.info("Making taxonomy test data...")
    # Get diamond blastp output table
    orf_column = 0
    blastp = pd.read_csv(
        self.taxonomy_blastp, sep="\t", index_col=orf_column, header=None
    )
    # Get number of unique ORFs set by `num_orfs`, default is 2.
    # A list is used for label selection (pandas no longer accepts a set as an
    # indexer); the set is kept for fast membership tests below.
    orf_ids = blastp.index.unique().tolist()[:num_orfs]
    orf_hits = set(orf_ids)
    blastp = blastp.loc[orf_ids].reset_index()
    if num_orfs == 2:
        # NODE_38_length_280079_cov_224.186_1 and NODE_38_length_280079_cov_224.186_2
        # together have 400 hits
        assert blastp.shape == (
            400,
            12,
        ), f"shape: {blastp.shape}\ncolumns: {blastp.columns}"

    # NOTE(review): this keeps the ORFs that are *not* among the selected blastp
    # queries. The behavior is preserved from the original; confirm it is intended.
    blastp_query_orfs = {
        f">{record.id}": str(record.seq)
        for record in SeqIO.parse(self.taxonomy_orfs, "fasta")
        if record.id not in orf_hits
    }

    ncbi = NCBI(self.taxonomy_ncbi)
    # Get prot.accession2taxid datastructure and subset by taxids encountered in blastp output.
    sacc_column = 1
    blastp_accessions = set(blastp[sacc_column].unique().tolist())
    acc2taxids = subset_acc2taxids(blastp_accessions, ncbi)
    accessions = list(acc2taxids)
    blastp = blastp.set_index(sacc_column).loc[accessions].reset_index()
    # Round-trip through the index to move the ORF column back to the front
    blastp = blastp.set_index(orf_column).reset_index()

    assert blastp.shape[0] == len(
        acc2taxids
    ), f"blastp shape: {blastp.shape}\tnum. acc2taxids: {len(acc2taxids)}"
    # Get nodes.dmp, names.dmp and merged.dmp data structures.
    nodes = ncbi.nodes
    names = ncbi.names
    # Merged are only necessary if taxids have been deprecated or suppressed
    blastp_taxids = set(acc2taxids.values())
    merged = {old: new for old, new in ncbi.merged.items() if old in blastp_taxids}

    self.data["taxonomy"] = {
        "prot_orfs": blastp_query_orfs,
        "blastp": blastp.to_json(),
        "acc2taxid": acc2taxids,
        "merged": merged,
        "nodes": nodes,
        "names": names,
    }
def write_reports(reports: Iterable[Dict], outdir: str, ncbi: NCBI) -> None:
    """Write taxid multi-label classification reports in `reports`

    One gzipped, tab-delimited table is written per report to
    ``<outdir>/<dataset>_classification_report.tsv.gz``. The report dicts
    passed in are left unmodified.

    Parameters
    ----------
    reports : Iterable[Dict]
        Classification report dicts from each classification benchmarking
        evaluation. Each must contain a "dataset" key. Any iterable is
        accepted, including generators.
    outdir : str
        Directory path to write reports
    ncbi : NCBI
        autometa.taxonomy.ncbi.NCBI instance for taxid name and rank look-up.

    Returns
    -------
    NoneType
    """
    # First create the output directory if it does not exist
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
        logger.info(f"Created new directory: {outdir}")
    # Now format each report then write out to outdir
    num_written = 0
    for report in reports:
        # Get dataset to name report filepath
        dataset = report["dataset"]
        dataset = dataset.replace(".tsv", "").replace(".gz", "")
        dataset = f"{dataset}_classification_report.tsv.gz"
        report_filepath = os.path.join(outdir, dataset)
        # Keep only the per-taxid rows: drop the dataset label and any rows that
        # are averages of other rows (these can easily be recomputed later).
        # A filtered copy is built so the caller's report is not mutated.
        taxid_scores = {
            key: scores
            for key, scores in report.items()
            if key != "dataset" and " avg" not in key
        }
        # Reshape from wide to long
        report_df = pd.DataFrame(taxid_scores).transpose()
        # Add human-readable taxonomic information according to taxid classification benchmarks
        report_df["name"] = report_df.index.map(lambda taxid: ncbi.name(taxid))
        report_df["rank"] = report_df.index.map(lambda taxid: ncbi.rank(taxid))
        report_df.index.name = "taxid"
        report_df.to_csv(report_filepath, sep="\t", index=True, header=True)
        num_written += 1
    # Count while iterating: `reports` may be a generator with no len()
    logger.info(f"Wrote {num_written:,} report(s) to {outdir}")
def is_consistent_with_other_orfs(
    taxid: int, rank: str, rank_counts: Dict[str, Dict], ncbi: NCBI
) -> bool:
    """Check whether `taxid` agrees with most ORF hits at `rank` and above.

    Every taxid recorded in `rank_counts` at `rank`, or at any canonical rank
    listed after it in ``NCBI.CANONICAL_RANKS``, contributes its hit count to
    the "consistent" tally when ``ncbi.is_common_ancestor(rank_taxid, taxid)``
    is truthy and to the "inconsistent" tally otherwise.

    Parameters
    ----------
    taxid : int
        `taxid` to compare against the other taxids in `rank_counts`.
    rank : str
        Canonical rank to start from. Must be present in ``NCBI.CANONICAL_RANKS``.
    rank_counts : dict
        LCA canonical rank counts retrieved from ORFs respective to a contig.
        e.g. {canonical_rank: {taxid: num_hits, ...}, ...}
    ncbi : NCBI instance
        Instance or subclass of NCBI from autometa.taxonomy.ncbi.

    Returns
    -------
    boolean
        True if the consistent tally is strictly greater than the inconsistent
        tally, otherwise False.
    """
    first_rank_index = NCBI.CANONICAL_RANKS.index(rank)
    agreeing_hits = 0
    disagreeing_hits = 0
    for considered_rank in NCBI.CANONICAL_RANKS[first_rank_index:]:
        # Ranks without any recorded hits contribute nothing to either tally
        hits_at_rank = rank_counts.get(considered_rank, {})
        for other_taxid, num_hits in hits_at_rank.items():
            if ncbi.is_common_ancestor(other_taxid, taxid):
                agreeing_hits += num_hits
            else:
                disagreeing_hits += num_hits
    # COMBAK: See issue-#48: This could also return the ratio of consistent
    # to inconsistent to give the user an idea of the consistency of the
    # taxon assignments.
    return bool(agreeing_hits > disagreeing_hits)
def main():
    """Command-line entry point for benchmarking against reference assignments.

    Parses the arguments, runs the evaluation selected with ``--benchmark``
    and writes the resulting metrics table (plus the optional stacked table
    for clustering and the optional per-dataset classification reports).
    Invalid argument combinations are rejected via ``parser.error`` before any
    evaluation starts.
    """
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Benchmark classification, clustering or binning-classification against reference assignments for the provided simulated/synthetic community.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--benchmark",
        help="Type of benchmarking to perform",
        # A list (not a set) keeps the order shown in usage/help stable
        choices=["clustering", "classification", "binning-classification"],
        required=True,
    )
    parser.add_argument(
        "--predictions",
        help="Path to Autometa predictions (May specify multiple if they all correspond to the same `--reference` community ",
        metavar="filepath",
        nargs="*",
    )
    parser.add_argument(
        "--reference",
        help="Path to community reference assignments",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-wide",
        help="Path to write benchmarking evaluation metrics (each metric receives its own column) (Default: `benchmark_type`_benchmarks.tsv.gz",
        metavar="filepath",
        required=False,
    )
    parser.add_argument(
        "--output-long",
        help="Path to write clustering evaluation metrics (metrics are stacked into one 'metric' column)",
        metavar="filepath",
        required=False,
    )
    parser.add_argument(
        "--output-classification-reports",
        help="Path to write classification evaluation reports",
        metavar="dirpath",
        required=False,
    )
    parser.add_argument(
        "--ncbi",
        help="Path to NCBI databases directory (Required with --benchmark=classification)",
        metavar="dirpath",
        required=False,
    )
    args = parser.parse_args()

    # Fail early with a usage message instead of an obscure error deep inside
    # the evaluation functions.
    if not args.predictions:
        parser.error("--predictions requires at least one filepath")
    if args.benchmark == "classification" and not args.ncbi:
        parser.error("--ncbi is required with --benchmark=classification")

    logger.info(f"Evaluating {args.benchmark} benchmarks")
    if args.benchmark == "clustering":
        df = evaluate_clustering(predictions=args.predictions, reference=args.reference)
        if args.output_long:
            # Write out stacked dataframe for visualization with `plot-cluster-evaluation-metrics.R`
            dff = df.stack()
            dff.index.name = ("dataset", "metric")
            dff.name = "score"
            dff = (
                dff.to_frame()
                .reset_index(level=1)
                .rename(columns={"level_1": "metric"})
            )
            dff.to_csv(args.output_long, sep="\t", index=True, header=True)
            logger.info(
                f"Wrote {dff.index.nunique()} datasets (stacked) metrics to {args.output_long}"
            )
    elif args.benchmark == "classification":
        ncbi = NCBI(args.ncbi)
        df, reports = evaluate_classification(
            predictions=args.predictions,
            reference=args.reference,
            ncbi=ncbi,
        )
        if args.output_classification_reports:
            write_reports(
                reports=reports,
                outdir=args.output_classification_reports,
                ncbi=ncbi,
            )
    else:
        # args.benchmark == "binning-classification":
        df = evaluate_binning_classification(
            predictions=args.predictions, reference=args.reference
        )
    output_wide = (
        f"{args.benchmark}_benchmarks.tsv.gz"
        if not args.output_wide
        else args.output_wide
    )
    df.to_csv(output_wide, sep="\t", index=True, header=True)
    logger.info(f"Wrote {df.shape[0]} datasets metrics to {output_wide}")
def get_target_labels(
    prediction: str, reference: Union[str, pd.DataFrame], ncbi: Union[str, NCBI]
) -> namedtuple:
    """Retrieve taxid lineage as target labels from merge of `reference` and `prediction`.

    Note
    ----
    The exact label value matters for these metrics as we are looking at the
    available target labels for classification (not clustering)

    Parameters
    ----------
    prediction : str
        Path to contig taxid predictions (tab-delimited with contig and taxid columns)
    reference : Union[str, pd.DataFrame]
        Path to ground truth contig taxids, or an already loaded dataframe
        indexed by contig with a taxid column
    ncbi : Union[str, NCBI]
        Path to NCBI databases directory or instance of autometa NCBI class.

    Returns
    -------
    namedtuple
        Targets namedtuple with fields 'true', 'pred' and 'target_names'.
        'true' and 'pred' are binary-encoded lineage matrices sharing the same
        columns ('target_names') in the same order.

    Raises
    ------
    ValueError
        Provided reference is not a pd.DataFrame or path to reference assignments file.
    ValueError
        The provided reference community and predictions do not match
    """
    pred_df = pd.read_csv(
        prediction, sep="\t", index_col="contig", usecols=["contig", "taxid"]
    ).convert_dtypes()
    # Some taxon-profilers assign unclassified contigs to 0 or to the string
    # 'unclassified'; convert both (in that order) to the root taxid, 1.
    for placeholder in (0, "unclassified"):
        is_placeholder = pred_df.taxid.eq(placeholder)
        unclassified_contigs = pred_df[is_placeholder].index.unique().tolist()
        if not unclassified_contigs:
            continue
        logger.debug(unclassified_contigs)
        logger.debug(
            f"Converting {is_placeholder.sum():,} taxids from {placeholder!r} to 1"
        )
        pred_df.taxid = pred_df.taxid.map(
            lambda tid, placeholder=placeholder: 1 if tid == placeholder else tid
        )

    if isinstance(reference, pd.DataFrame):
        ref_df = reference
    elif isinstance(reference, str):
        ref_df = (
            pd.read_csv(
                reference,
                sep="\t",
                index_col="contig",
                usecols=["contig", "taxid"],
            )
            .dropna(axis="index")
            .convert_dtypes()
        )
    else:
        raise ValueError(f"reference is an invalid argument type: {type(reference)}")
    # Merge reference_assignments and predictions
    main_df = pd.merge(
        pred_df,
        ref_df,
        how="inner",
        left_index=True,
        right_index=True,
        suffixes=("_pred", "_true"),
    )
    if main_df.empty:
        raise ValueError(
            "The provided reference community and predictions do not match!"
        )
    # Convert any old taxids to new taxids from merged.dmp
    ncbi = NCBI(ncbi) if isinstance(ncbi, str) else ncbi
    main_df.taxid_pred = main_df.taxid_pred.map(
        lambda tid: ncbi.convert_taxid_dtype(tid)
    )
    main_df.taxid_true = main_df.taxid_true.map(
        lambda tid: ncbi.convert_taxid_dtype(tid)
    )

    # Many contigs share a taxid, so build each lineage string only once.
    lineage_labels = {}

    def lineage_label(taxid) -> str:
        """Return the lineage of `taxid` as 'taxid|taxid|...' (memoized)."""
        if taxid not in lineage_labels:
            lineage_labels[taxid] = "|".join(
                str(node.get("taxid"))
                for node in ncbi.lineage(taxid, canonical=False)
            )
        return lineage_labels[taxid]

    # Create binary encoded matrix for multi-label classification metrics
    # First join strings s.t. taxid|taxid|... to be used with pd.str.get_dummies(sep='|')
    main_df["true_lineage"] = main_df.taxid_true.map(lineage_label)
    main_df["pred_lineage"] = main_df.taxid_pred.map(lineage_label)
    # Now create our binary encoded matrices (NOTE: These are multi-label classification matrices)
    y_true = main_df.true_lineage.str.get_dummies()
    y_pred = main_df.pred_lineage.str.get_dummies()
    # Both matrices must share the same columns in the same order: the labels
    # seen in y_true first, followed by the labels that only occur in y_pred.
    # Labels absent from a matrix are filled with 0's.
    pred_only_cols = [col for col in y_pred.columns if col not in y_true.columns]
    all_cols = y_true.columns.tolist() + pred_only_cols
    y_true = y_true.reindex(columns=all_cols, fill_value=0)
    y_pred = y_pred.reindex(columns=all_cols, fill_value=0)
    return Targets(true=y_true, pred=y_pred, target_names=all_cols)
def main():
    """Command-line entry point for summarizing Autometa binning results.

    Writes one fasta per metabin, the metabin stats table and, when the
    binning table has a ``taxid`` column and ``--ncbi`` is given, the metabin
    taxonomies table. Exits with status 1 when the binning table is empty.
    """
    import argparse
    import logging as logger
    import sys

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Summarize Autometa results writing genome fastas and their respective"
        " taxonomies/assembly metrics for respective metagenomes",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--binning-main",
        help="Path to Autometa binning main table (output from --binning-main argument)",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help="Path to annotated markers respective to domain (bacteria or archaea) binned",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--metagenome",
        help="Path to metagenome assembly",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--ncbi",
        help="Path to user NCBI databases directory (Required for retrieving metabin taxonomies)",
        metavar="dirpath",
        required=False,
    )
    parser.add_argument(
        "--binning-column",
        help="Binning column to use for grouping metabins",
        metavar="str",
        required=False,
        default="cluster",
    )
    parser.add_argument(
        "--output-stats",
        help="Path to write metabins stats table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-taxonomy",
        help="Path to write metabins taxonomies table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-metabins",
        help="Path to output directory. (Directory must not exist. This directory will be created.)",
        metavar="dirpath",
        required=True,
    )
    args = parser.parse_args()

    bin_df = pd.read_csv(args.binning_main, sep="\t", index_col="contig")
    if bin_df.empty:
        # The parsed attribute is `binning_main` (from --binning-main)
        logger.error(f"{args.binning_main} empty...")
        sys.exit(1)
    # First write out directory with fasta files per each metabin
    write_cluster_records(
        bin_df=bin_df,
        metagenome=args.metagenome,
        outdir=args.output_metabins,
        cluster_col=args.binning_column,
    )
    # Now retrieve stats for each metabin
    metabin_stats_df = get_metabin_stats(
        bin_df=bin_df,
        markers=args.markers,
        cluster_col=args.binning_column,
    )
    metabin_stats_df.to_csv(args.output_stats, sep="\t", index=True, header=True)
    logger.info(f"Wrote metabin stats to {args.output_stats}")
    # Finally if taxonomy information is available then write out each metabin's
    # taxonomy by modified majority voting method.
    if "taxid" in bin_df.columns:
        if not args.ncbi:
            logger.warning(
                "taxid found in dataframe. --ncbi argument is required to retrieve metabin taxonomies. Skipping..."
            )
        else:
            ncbi = NCBI(dirpath=args.ncbi)
            taxa_df = get_metabin_taxonomies(
                bin_df=bin_df, ncbi=ncbi, cluster_col=args.binning_column
            )
            taxa_df.to_csv(args.output_taxonomy, sep="\t", index=True, header=True)
def get_metabin_taxonomies(
    bin_df: pd.DataFrame, ncbi: NCBI, cluster_col: str = "cluster"
) -> pd.DataFrame:
    """Retrieve taxonomies of all clusters recovered from Autometa binning.

    Parameters
    ----------
    bin_df : pd.DataFrame
        Autometa binning table. index=contig, cols=['cluster','length','taxid', *canonical_ranks]
    ncbi : autometa.taxonomy.ncbi.NCBI instance
        Autometa NCBI class instance
    cluster_col : str, optional
        Clustering column by which to group metabins

    Returns
    -------
    pd.DataFrame
        Dataframe consisting of cluster taxonomy with taxid and canonical rank.
        Indexed by cluster
    """
    logger.info(f"Retrieving metabin taxonomies for {cluster_col}")
    canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
    is_clustered = bin_df[cluster_col].notnull()
    bin_df = bin_df[is_clustered]
    outcols = [cluster_col, "length", "taxid", *canonical_ranks]
    taxonomies = {}
    # Here we prepare our datastructure for the majority_vote.rank_taxids(...) function:
    # {cluster: {canonical_rank: {taxid: total_length, ...}, ...}, ...}
    # Rows are iterated directly rather than round-tripped through delimited
    # text, which depended on a `to_csv` keyword that newer pandas removed and
    # lost trailing empty rank fields when stripping each line.
    for cluster, length, taxid, *ranks in bin_df[outcols].itertuples(
        index=False, name=None
    ):
        # Clusters were previously keyed by their text representation
        cluster = str(cluster)
        length = int(length)
        taxid = int(taxid)
        # Find the first canonical rank that is classified. When every rank is
        # 'unclassified' the last canonical rank is used.
        for rank, canonical_rank in zip(ranks, canonical_ranks):
            if rank != "unclassified":
                break
        rank_lengths = taxonomies.setdefault(cluster, {}).setdefault(
            canonical_rank, {}
        )
        rank_lengths[taxid] = rank_lengths.get(taxid, 0) + length
    cluster_taxonomies = majority_vote.rank_taxids(taxonomies, ncbi)
    # With our cluster taxonomies, let's place these into a dataframe for easy data accession
    cluster_taxa_df = pd.Series(data=cluster_taxonomies, name="taxid").to_frame()
    # With the list of taxids, we'll retrieve their complete canonical-rank information
    lineage_df = ncbi.get_lineage_dataframe(
        cluster_taxa_df.taxid.tolist(), fillna=True
    )
    # Now put it all together
    cluster_taxa_df = pd.merge(
        cluster_taxa_df, lineage_df, how="left", left_on="taxid", right_index=True
    )
    cluster_taxa_df.index.name = cluster_col
    return cluster_taxa_df
def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int:
    """Return the taxid holding a majority at the first canonical rank that has one.

    Each taxid in `rank_counts` casts one vote for itself and for every ancestor
    reached through ``ncbi.parent`` whose rank is canonical, until taxid 1 is
    reached. Canonical ranks from the taxid's own rank onward that were never
    hit on that walk receive an 'unclassified' vote instead. Ranks are then
    scanned in ``NCBI.CANONICAL_RANKS`` order and the first leading taxid with
    more than half of the votes at its rank is returned. 'unclassified' is
    never allowed to win.

    Parameters
    ----------
    rank_counts : dict
        {canonical_rank:{taxid:num_hits, ...}, rank2: {...}, ...}
    ncbi : NCBI instance
        NCBI object from autometa.taxonomy.ncbi

    Returns
    -------
    int
        Taxid above the lowest majority threshold, or 1 if no rank has a majority.
    """
    canonical_rank_names = set(NCBI.CANONICAL_RANKS)
    # votes: {canonical_rank: {taxid or 'unclassified': num_votes}}
    votes = {}
    for rank in NCBI.CANONICAL_RANKS:
        if rank not in rank_counts:
            continue
        ranks_to_consider = NCBI.CANONICAL_RANKS[NCBI.CANONICAL_RANKS.index(rank):]
        for taxid in rank_counts[rank]:
            # Track which canonical ranks this taxid's path touches so the
            # untouched ones can be marked 'unclassified' afterwards.
            hits_per_rank = dict.fromkeys(ranks_to_consider, 0)
            node_taxid, node_rank = taxid, rank
            # Walk up the taxonomy until the root (taxid 1) is reached
            while node_taxid != 1:
                if node_rank in canonical_rank_names:
                    hits_per_rank[node_rank] += 1
                    votes_at_rank = votes.setdefault(node_rank, {})
                    votes_at_rank[node_taxid] = votes_at_rank.get(node_taxid, 0) + 1
                node_taxid = ncbi.parent(node_taxid)
                node_rank = ncbi.rank(node_taxid)
            # Gaps in the taxonomy path count as 'unclassified' at that rank
            for considered_rank in ranks_to_consider:
                if hits_per_rank[considered_rank] == 0:
                    votes_at_rank = votes.setdefault(considered_rank, {})
                    votes_at_rank["unclassified"] = (
                        votes_at_rank.get("unclassified", 0) + 1
                    )
    # Determine the first rank at which a (classified) taxid holds a majority
    for rank in NCBI.CANONICAL_RANKS:
        if rank not in votes:
            continue
        votes_at_rank = votes[rank]
        total_votes = sum(votes_at_rank.values())
        # max() keeps the first-seen entry on ties, like a strict '>' scan
        leader, leader_votes = max(
            votes_at_rank.items(), key=lambda item: item[1], default=(None, 0)
        )
        if leader_votes > total_votes / 2 and leader != "unclassified":
            return leader
    # Just in case
    return 1
def fixture_ncbi(ncbi_dir):
    """Return an NCBI instance built from `ncbi_dir` with verbose output disabled.

    NOTE(review): presumably registered as a pytest fixture by a decorator
    outside this view; confirm against the surrounding file.
    """
    ncbi = NCBI(dirpath=ncbi_dir, verbose=False)
    return ncbi