Python NCBIの例、autometa.taxonomy.ncbi.NCBI Pythonの例

コード例 #1

0

ファイルを表示

ファイル: benchmark.py プロジェクト: jason-c-kwan/Autometa

def evaluate_classification(
    predictions: Iterable,
    reference: str,
    ncbi: Union[str, NCBI],
    keep_averages=("weighted avg", "samples avg"),
) -> Tuple[pd.DataFrame, List[Dict[str, str]]]:
    """Evaluate classification `predictions` against provided `reference`

    Parameters
    ----------
    predictions : Iterable
        Paths to taxonomic predictions (tab-delimited files of contig and taxid columns)
    reference : str
        Path to ground truths (tab-delimited file containing at least contig and taxid columns)
    ncbi : Union[str, NCBI]
        Path to NCBI databases directory or instance of autometa NCBI class
    keep_averages : Iterable[str], optional
        Keys of the averages to keep from each classification report,
        by default ("weighted avg", "samples avg")

    Returns
    -------
    Tuple[pd.DataFrame, List[dict]]
        Dataframe of the kept averages (one row per dataset and average,
        indexed by 'dataset') and the list of classification reports, each
        with an added 'dataset' key.

    Raises
    ------
    ValueError
        `ncbi` was not provided.
    """
    if not ncbi:
        raise ValueError("--ncbi is required for the classification benchmark!")
    # Read in community reference assignments
    reference = (
        pd.read_csv(
            reference, sep="\t", usecols=["contig", "taxid"], index_col="contig"
        )
        # Let pandas infer the best available dtypes for the columns
        .convert_dtypes()
        # Drop any contigs missing taxid classification
        .dropna(axis="index")
    )
    # Instantiate NCBI so we can coordinate taxids
    ncbi = NCBI(ncbi) if isinstance(ncbi, str) else ncbi
    all_metrics = []
    all_reports = []
    # Compute metrics for all provided predictions
    for prediction in predictions:
        dataset = os.path.basename(prediction)
        # convert and merge taxids of reference assignments and predictions
        labels = get_target_labels(
            prediction=prediction, reference=reference, ncbi=ncbi
        )
        # Compute metrics across all canonical ranks
        report = compute_classification_metrics(labels)
        report.update({"dataset": dataset})
        all_reports.append(report)
        for average, scores in report.items():
            if average not in keep_averages:
                continue
            # Build a new dict instead of updating `scores` in place, otherwise
            # the 'average' and 'dataset' keys leak into the report that is
            # handed back to the caller.
            all_metrics.append({**scores, "average": average, "dataset": dataset})
    df = pd.DataFrame(all_metrics).set_index("dataset")
    return df, all_reports

コード例 #2

0

ファイルを表示

ファイル: make_test_data.py プロジェクト: jason-c-kwan/Autometa

    def get_taxonomy(self, num_orfs: int = 2):
        """Assemble the taxonomy test data and store it in ``self.data["taxonomy"]``.

        Parameters
        ----------
        num_orfs : int, optional
            Number of unique ORFs (taken in order of first appearance in the
            blastp table) whose hits are kept, by default 2.

        Returns
        -------
        NoneType
            The result is stored under the "taxonomy" key of ``self.data``.
        """
        logger.info("Making taxonomy test data...")
        # Get diamond blastp output table
        orf_column = 0
        # NOTE(review): attribute is spelled `taxonmy_blastp` here; kept as-is
        # because the class definition is not visible from this block.
        blastp = pd.read_csv(
            self.taxonmy_blastp, sep="\t", index_col=orf_column, header=None
        )
        # Get number of unique ORFs set by `num_orfs`, default is 2.
        # A list (not a set) is used as the indexer: pandas >= 2.0 rejects set
        # indexers in `.loc`, and a list keeps the row order deterministic.
        orf_ids = blastp.index.unique().tolist()[:num_orfs]
        orf_hits = set(orf_ids)
        blastp = blastp.loc[orf_ids]
        blastp.reset_index(inplace=True)
        if num_orfs == 2:
            # NODE_38_length_280079_cov_224.186_1 and NODE_38_length_280079_cov_224.186_2
            # together have 400 hits
            assert blastp.shape == (
                400,
                12,
            ), f"shape: {blastp.shape}\ncolumns: {blastp.columns}"

        # NOTE(review): this keeps the ORFs that are NOT among the selected
        # blastp hits -- confirm this is intended and not an inverted filter.
        blastp_query_orfs = {
            f">{record.id}": str(record.seq)
            for record in SeqIO.parse(self.taxonomy_orfs, "fasta")
            if record.id not in orf_hits
        }

        ncbi = NCBI(self.taxonomy_ncbi)
        # Get prot.accession2taxid datastructure and subset by taxids encountered in blastp output.
        sacc_column = 1
        blastp_accessions = set(blastp[sacc_column].unique().tolist())
        acc2taxids = subset_acc2taxids(blastp_accessions, ncbi)
        # Again index with a list rather than a set (see above).
        accessions = list(acc2taxids)
        blastp = blastp.set_index(sacc_column).loc[accessions].reset_index()
        blastp = blastp.set_index(orf_column).reset_index()
        assert blastp.shape[0] == len(
            acc2taxids
        ), f"blastp shape: {blastp.shape}\tnum. acc2taxids: {len(acc2taxids)}"
        # Get nodes.dmp, names.dmp and merged.dmp data structures.
        nodes = ncbi.nodes
        names = ncbi.names
        # Merged are only necessary if taxids have been deprecated or suppressed
        # A set gives constant-time membership tests while filtering merged.dmp.
        blastp_taxids = set(acc2taxids.values())
        merged = {
            old: new
            for old, new in ncbi.merged.items()
            if old in blastp_taxids
        }

        self.data["taxonomy"] = {
            "prot_orfs": blastp_query_orfs,
            "blastp": blastp.to_json(),
            "acc2taxid": acc2taxids,
            "merged": merged,
            "nodes": nodes,
            "names": names,
        }

コード例 #3

0

ファイルを表示

ファイル: benchmark.py プロジェクト: jason-c-kwan/Autometa

def write_reports(reports: Iterable[Dict], outdir: str, ncbi: NCBI) -> None:
    """Write taxid multi-label classification reports in `reports`

    Each report is written to
    ``<outdir>/<dataset>_classification_report.tsv.gz`` where ``<dataset>`` is
    the report's "dataset" value with any ".tsv" and ".gz" removed.

    Parameters
    ----------
    reports : Iterable[Dict]
        Classification report dicts from each classification benchmarking
        evaluation. Each must contain a "dataset" key. The dicts are not
        modified.
    outdir : str
        Directory path to write reports (created if it does not exist)
    ncbi : NCBI
        autometa.taxonomy.ncbi.NCBI instance for taxid name and rank look-up.

    Returns
    -------
    NoneType

    """
    # First create the output directory if it does not exist
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
        logger.info(f"Created new directory: {outdir}")
    # Count as we go: `reports` is only required to be iterable, so it may not
    # support len() (e.g. a generator).
    num_written = 0
    # Now format each report then write out to outdir
    for report in reports:
        # Work on a shallow copy so the caller's report dict is left untouched
        report = dict(report)
        # Get dataset to name report filepath
        dataset = report.pop("dataset")
        dataset = dataset.replace(".tsv", "").replace(".gz", "")
        report_filepath = os.path.join(
            outdir, f"{dataset}_classification_report.tsv.gz"
        )
        # Remove any rows of the report that are averages of other rows
        # (These can easily be retrieved with DataFrame later if needed.)
        rows = {
            label: scores for label, scores in report.items() if " avg" not in label
        }
        # Reshape from wide to long
        report_df = pd.DataFrame(rows).transpose()
        # Add human-readable taxonomic information according to taxid classification benchmarks
        report_df["name"] = report_df.index.map(ncbi.name)
        report_df["rank"] = report_df.index.map(ncbi.rank)
        report_df.index.name = "taxid"
        report_df.to_csv(report_filepath, sep="\t", index=True, header=True)
        num_written += 1
    logger.info(f"Wrote {num_written:,} report(s) to {outdir}")

コード例 #4

0

ファイルを表示

def is_consistent_with_other_orfs(taxid: int, rank: str,
                                  rank_counts: Dict[str,
                                                    Dict], ncbi: NCBI) -> bool:
    """Check whether most ORF hits at `rank` or above agree with `taxid`.

    Every taxid recorded in `rank_counts` at `rank`, or at any canonical rank
    listed after it in ``NCBI.CANONICAL_RANKS``, is tested with
    ``ncbi.is_common_ancestor(rank_taxid, taxid)``. Its hit count is added to
    the consistent tally when the test passes and to the inconsistent tally
    otherwise.

    Parameters
    ----------
    taxid : int
        `taxid` to search against other taxids at `rank` in `rank_counts`.
    rank : str
        Canonical rank to search in `rank_counts`.
        Choices: species, genus, family, order, class, phylum, superkingdom.
    rank_counts : dict
        LCA canonical rank counts retrieved from ORFs respective to a contig.
        e.g. {canonical_rank: {taxid: num_hits, ...}, ...}
    ncbi : NCBI instance
        Instance or subclass of NCBI from autometa.taxonomy.ncbi.

    Returns
    -------
    boolean
        True when the consistent tally is strictly greater than the
        inconsistent tally, otherwise False.

    """
    first_rank = NCBI.CANONICAL_RANKS.index(rank)
    # Tally hit counts keyed by whether the hit agrees with `taxid`
    tally = {True: 0, False: 0}
    for considered_rank in NCBI.CANONICAL_RANKS[first_rank:]:
        if considered_rank in rank_counts:
            for other_taxid, num_hits in rank_counts[considered_rank].items():
                agrees = bool(ncbi.is_common_ancestor(other_taxid, taxid))
                tally[agrees] += num_hits
    # COMBAK: See issue-#48: This could also return the ratio of consistent
    # to inconsistent to give the user an idea of the consistency of the
    # taxon assignments.
    return bool(tally[True] > tally[False])

コード例 #5

0

ファイルを表示

ファイル: benchmark.py プロジェクト: jason-c-kwan/Autometa

def main():
    """Command-line entry point for benchmarking against reference assignments.

    Parses the command-line arguments, runs the benchmark selected with
    ``--benchmark`` (clustering, classification or binning-classification) on
    the provided predictions and writes the resulting metrics table. For the
    clustering benchmark a stacked table is additionally written when
    ``--output-long`` is given; for the classification benchmark per-dataset
    reports are written when ``--output-classification-reports`` is given.
    """
    import argparse
    import logging as logger

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Benchmark classification, clustering or binning-classification against reference assignments for the provided simulated/synthetic community.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--benchmark",
        help="Type of benchmarking to perform",
        # A tuple (rather than a set) keeps the order shown in --help stable
        choices=("clustering", "classification", "binning-classification"),
        required=True,
    )
    parser.add_argument(
        "--predictions",
        help="Path to Autometa predictions (May specify multiple if they all correspond to the same `--reference` community)",
        metavar="filepath",
        nargs="*",
    )
    parser.add_argument(
        "--reference",
        help="Path to community reference assignments",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-wide",
        help="Path to write benchmarking evaluation metrics (each metric receives its own column) (Default: `benchmark_type`_benchmarks.tsv.gz)",
        metavar="filepath",
        required=False,
    )
    parser.add_argument(
        "--output-long",
        help="Path to write clustering evaluation metrics (metrics are stacked into one 'metric' column)",
        metavar="filepath",
        required=False,
    )
    parser.add_argument(
        "--output-classification-reports",
        help="Path to write classification evaluation reports",
        metavar="dirpath",
        required=False,
    )
    parser.add_argument(
        "--ncbi",
        help="Path to NCBI databases directory (Required with --benchmark=classification)",
        metavar="dirpath",
        required=False,
    )
    args = parser.parse_args()

    # Fail early with a usage message instead of crashing further down when
    # iterating over missing predictions or instantiating NCBI with None.
    if not args.predictions:
        parser.error("--predictions requires at least one filepath")
    if args.benchmark == "classification" and not args.ncbi:
        parser.error("--ncbi is required with --benchmark=classification")

    logger.info(f"Evaluating {args.benchmark} benchmarks")
    if args.benchmark == "clustering":
        df = evaluate_clustering(predictions=args.predictions, reference=args.reference)
        if args.output_long:
            # Write out stacked dataframe for visualization with `plot-cluster-evaluation-metrics.R`
            dff = df.stack()
            # Name the stacked (second) index level so that it becomes the
            # 'metric' column once it is moved out of the index.
            dff.index = dff.index.set_names("metric", level=1)
            dff.name = "score"
            dff = dff.to_frame().reset_index(level=1)
            dff.to_csv(args.output_long, sep="\t", index=True, header=True)
            logger.info(
                f"Wrote {dff.index.nunique()} datasets (stacked) metrics to {args.output_long}"
            )
    elif args.benchmark == "classification":
        ncbi = NCBI(args.ncbi)
        df, reports = evaluate_classification(
            predictions=args.predictions,
            reference=args.reference,
            ncbi=ncbi,
        )
        if args.output_classification_reports:
            write_reports(
                reports=reports,
                outdir=args.output_classification_reports,
                ncbi=ncbi,
            )
    else:
        # args.benchmark == "binning-classification":
        df = evaluate_binning_classification(
            predictions=args.predictions, reference=args.reference
        )

    output_wide = (
        args.output_wide
        if args.output_wide
        else f"{args.benchmark}_benchmarks.tsv.gz"
    )
    df.to_csv(output_wide, sep="\t", index=True, header=True)
    logger.info(f"Wrote {df.shape[0]} datasets metrics to {output_wide}")

コード例 #6

0

ファイルを表示

ファイル: benchmark.py プロジェクト: jason-c-kwan/Autometa

def get_target_labels(
    prediction: str, reference: Union[str, pd.DataFrame], ncbi: Union[str, NCBI]
) -> namedtuple:
    """Build multi-label targets from the taxid lineages of `reference` and `prediction`.

    Predictions and reference are inner-joined on contig, each taxid is
    expanded to its full (non-canonical) lineage and the lineages are binary
    encoded so that the true and predicted matrices share the same columns.

    Note
    ----

    The exact label value matters for these metrics as we are
    looking at the available target labels for classification (not clustering)

    Parameters
    ----------
    prediction : str
        Path to contig taxid predictions
    reference : Union[str, pd.DataFrame]
        Path to ground truth contig taxids, or an already loaded table
        indexed by contig with a taxid column
    ncbi : Union[str, NCBI]
        Path to NCBI databases directory or instance of autometa NCBI class.

    Returns
    -------
    namedtuple
        Targets namedtuple with fields 'true', 'pred' and 'target_names'

    Raises
    ------
    ValueError
        Provided reference is not a pd.DataFrame or path to reference assignments file.
    ValueError
        The provided reference community and predictions do not match
    """
    pred_df = pd.read_csv(
        prediction, sep="\t", index_col="contig", usecols=["contig", "taxid"]
    ).convert_dtypes()

    # Some taxon-profilers assign unclassified contigs a taxid of 0: map to 1
    zero_contigs = pred_df[pred_df.taxid.eq(0)].index.unique().tolist()
    if zero_contigs:
        logger.debug(zero_contigs)
        logger.debug(f"Converting {pred_df.taxid.eq(0).sum():,} taxids from 0 to 1")
    pred_df.taxid = pred_df.taxid.map(lambda tid: 1 if tid == 0 else tid)

    # Others use the literal string 'unclassified': map these to 1 as well
    named_contigs = pred_df[pred_df.taxid.eq("unclassified")].index.unique().tolist()
    if named_contigs:
        logger.debug(named_contigs)
        logger.debug(
            f"Converting {pred_df.taxid.eq('unclassified').sum():,} taxids from 'unclassified' to 1"
        )
    pred_df.taxid = pred_df.taxid.map(lambda tid: 1 if tid == "unclassified" else tid)

    # Resolve the reference into a dataframe
    if isinstance(reference, pd.DataFrame):
        ref_df = reference
    elif isinstance(reference, str):
        ref_df = pd.read_csv(
            reference,
            sep="\t",
            index_col="contig",
            usecols=["contig", "taxid"],
        )
        ref_df = ref_df.dropna(axis="index").convert_dtypes()
    else:
        raise ValueError(f"reference is an invalid argument type: {type(reference)}")

    # Merge reference_assignments and predictions
    main_df = pd.merge(
        pred_df,
        ref_df,
        how="inner",
        left_index=True,
        right_index=True,
        suffixes=("_pred", "_true"),
    )
    if main_df.empty:
        raise ValueError(
            "The provided reference community and predictions do not match!"
        )

    # Convert any old taxids to new taxids from merged.dmp
    if isinstance(ncbi, str):
        ncbi = NCBI(ncbi)
    main_df["taxid_pred"] = main_df.taxid_pred.map(ncbi.convert_taxid_dtype)
    main_df["taxid_true"] = main_df.taxid_true.map(ncbi.convert_taxid_dtype)

    def lineage_label(taxid):
        # Join lineage taxids as taxid|taxid|... for pd.str.get_dummies(sep='|')
        lineage = ncbi.lineage(taxid, canonical=False)
        return "|".join(str(node.get("taxid")) for node in lineage)

    main_df["true_lineage"] = main_df.taxid_true.map(lineage_label)
    main_df["pred_lineage"] = main_df.taxid_pred.map(lineage_label)

    # Binary encoded matrices (NOTE: These are multi-label classification matrices)
    y_true = main_df.true_lineage.str.get_dummies()
    y_pred = main_df.pred_lineage.str.get_dummies()

    # Both matrices must end up with exactly the same columns.
    # Labels seen only in y_true / only in y_pred ...
    only_in_true = y_true.columns[~y_true.columns.isin(y_pred.columns)]
    only_in_pred = y_pred.columns[~y_pred.columns.isin(y_true.columns)]
    # ... are added to the other matrix as all-zero columns
    y_pred.loc[:, only_in_true] = 0
    y_true.loc[:, only_in_pred] = 0

    # Finally put the y_pred columns into the same order as y_true
    target_names = y_true.columns.tolist()
    y_pred = y_pred[target_names]
    return Targets(true=y_true, pred=y_pred, target_names=target_names)

コード例 #7

0

ファイルを表示

def main():
    """Command-line entry point for summarizing Autometa binning results.

    Reads the binning main table, writes the metabin fasta files to the
    ``--output-metabins`` directory and the metabin stats table to
    ``--output-stats``. When the binning table has a 'taxid' column and
    ``--ncbi`` is provided, the metabin taxonomies table is also written to
    ``--output-taxonomy``. Exits with status 1 if the binning table is empty.
    """
    import argparse
    import logging as logger
    import sys

    logger.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logger.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description=
        "Summarize Autometa results writing genome fastas and their respective"
        " taxonomies/assembly metrics for respective metagenomes",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--binning-main",
        help=
        "Path to Autometa binning main table (output from --binning-main argument)",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--markers",
        help=
        "Path to annotated markers respective to domain (bacteria or archaea) binned",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--metagenome",
        help="Path to metagenome assembly",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--ncbi",
        help=
        "Path to user NCBI databases directory (Required for retrieving metabin taxonomies)",
        metavar="dirpath",
        required=False,
    )
    parser.add_argument(
        "--binning-column",
        help="Binning column to use for grouping metabins",
        metavar="str",
        required=False,
        default="cluster",
    )
    parser.add_argument(
        "--output-stats",
        help="Path to write metabins stats table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-taxonomy",
        help="Path to write metabins taxonomies table",
        metavar="filepath",
        required=True,
    )
    parser.add_argument(
        "--output-metabins",
        help=
        "Path to output directory. (Directory must not exist. This directory will be created.)",
        metavar="dirpath",
        required=True,
    )
    args = parser.parse_args()

    bin_df = pd.read_csv(args.binning_main, sep="\t", index_col="contig")
    if bin_df.empty:
        # The parsed attribute is `binning_main`; `args.binning` does not exist
        # and raised AttributeError instead of reporting the empty table.
        logger.error(f"{args.binning_main} empty...")
        sys.exit(1)

    # First write out directory with fasta files per each metabin
    write_cluster_records(
        bin_df=bin_df,
        metagenome=args.metagenome,
        outdir=args.output_metabins,
        cluster_col=args.binning_column,
    )
    # Now retrieve stats for each metabin
    metabin_stats_df = get_metabin_stats(
        bin_df=bin_df,
        markers=args.markers,
        cluster_col=args.binning_column,
    )
    metabin_stats_df.to_csv(args.output_stats,
                            sep="\t",
                            index=True,
                            header=True)
    logger.info(f"Wrote metabin stats to {args.output_stats}")
    # Finally if taxonomy information is available then write out each metabin's taxonomy by modified majority voting method.
    if "taxid" not in bin_df.columns:
        return
    if not args.ncbi:
        # logging.warn is a deprecated alias of logging.warning
        logger.warning(
            "taxid found in dataframe. --ncbi argument is required to retrieve metabin taxonomies. Skipping..."
        )
        return
    ncbi = NCBI(dirpath=args.ncbi)
    taxa_df = get_metabin_taxonomies(bin_df=bin_df,
                                     ncbi=ncbi,
                                     cluster_col=args.binning_column)
    taxa_df.to_csv(args.output_taxonomy,
                   sep="\t",
                   index=True,
                   header=True)
    logger.info(f"Wrote metabin taxonomies to {args.output_taxonomy}")

コード例 #8

0

ファイルを表示

def get_metabin_taxonomies(bin_df: pd.DataFrame,
                           ncbi: NCBI,
                           cluster_col: str = "cluster") -> pd.DataFrame:
    """Retrieve taxonomies of all clusters recovered from Autometa binning.

    For every clustered contig, the contig length is added to the tally of its
    taxid under the first canonical rank column whose value is not
    "unclassified". These per-cluster tallies are passed to
    ``majority_vote.rank_taxids`` and the resulting taxids are joined with
    their lineage information.

    Parameters
    ----------
    bin_df : pd.DataFrame
        Autometa binning table. index=contig, cols=['cluster','length','taxid', *canonical_ranks]
    ncbi : autometa.taxonomy.ncbi.NCBI instance
        Autometa NCBI class instance
    cluster_col : str, optional
        Clustering column by which to group metabins

    Returns
    -------
    pd.DataFrame
        Dataframe consisting of cluster taxonomy with taxid and canonical rank.
        Indexed by cluster
    """
    logger.info(f"Retrieving metabin taxonomies for {cluster_col}")
    canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
    clustered_df = bin_df[bin_df[cluster_col].notnull()]
    outcols = [cluster_col, "length", "taxid", *canonical_ranks]
    taxonomies = {}
    # Here we prepare our datastructure for the majority_vote.rank_taxids(...) function.
    # Rows are iterated directly instead of round-tripping through a
    # tab-delimited string: DataFrame.to_csv(line_terminator=...) was removed
    # in pandas 2.0 and text parsing breaks on float-formatted integers.
    for cluster, length, taxid, *ranks in clustered_df[outcols].itertuples(
            index=False, name=None):
        # Cluster keys stay strings, as they were when parsed from text
        cluster = str(cluster)
        length = int(length)
        taxid = int(taxid)
        # Find the first canonical rank column holding a classification.
        # If every column is "unclassified", the last canonical rank is used.
        for rank, canonical_rank in zip(ranks, canonical_ranks):
            is_unclassified = isinstance(rank, str) and rank == "unclassified"
            if not is_unclassified:
                break
        rank_lengths = taxonomies.setdefault(cluster,
                                             {}).setdefault(canonical_rank, {})
        rank_lengths[taxid] = rank_lengths.get(taxid, 0) + length
    cluster_taxonomies = majority_vote.rank_taxids(taxonomies, ncbi)
    # With our cluster taxonomies, let's place these into a dataframe for easy data accession
    cluster_taxa_df = pd.Series(data=cluster_taxonomies,
                                name="taxid").to_frame()
    # With the list of taxids, we'll retrieve their complete canonical-rank information
    lineage_df = ncbi.get_lineage_dataframe(cluster_taxa_df.taxid.tolist(),
                                            fillna=True)
    # Now put it all together
    cluster_taxa_df = pd.merge(cluster_taxa_df,
                               lineage_df,
                               how="left",
                               left_on="taxid",
                               right_index=True)
    cluster_taxa_df.index.name = cluster_col
    return cluster_taxa_df

コード例 #9

0

ファイルを表示

def lowest_majority(rank_counts: Dict[str, Dict], ncbi: NCBI) -> int:
    """Find the most specific canonical rank at which one taxid holds a majority.

    Each taxid in `rank_counts` casts one vote for itself and one for every
    canonical-rank ancestor on its path towards the root (taxid 1). Canonical
    ranks from its starting rank onwards that are missing from that path
    receive an 'unclassified' vote instead. The ranks are then scanned in
    ``NCBI.CANONICAL_RANKS`` order and the first leading taxid holding more
    than half of the votes at its rank is returned. 'unclassified' never wins.

    Parameters
    ----------
    rank_counts : dict
        {canonical_rank:{taxid:num_hits, ...}, rank2: {...}, ...}
    ncbi : NCBI instance
        NCBI object from autometa.taxonomy.ncbi

    Returns
    -------
    int
        Taxid above the lowest majority threshold, or 1 when no rank has a
        majority.

    """
    canonical_ranks = NCBI.CANONICAL_RANKS
    canonical_lookup = set(canonical_ranks)
    # {rank: {taxid or 'unclassified': votes}}
    votes_by_rank = {}
    for rank in canonical_ranks:
        if rank not in rank_counts:
            continue
        considered_ranks = canonical_ranks[canonical_ranks.index(rank):]
        for taxid in rank_counts[rank]:
            # Number of times each considered rank is hit along this path, so
            # that the ranks that are never hit can be voted 'unclassified'.
            hits_in_path = dict.fromkeys(considered_ranks, 0)
            # Walk up the taxonomy until reaching the root
            node, node_rank = taxid, rank
            while node != 1:
                if node_rank in canonical_lookup:
                    hits_in_path[node_rank] += 1
                    rank_votes = votes_by_rank.setdefault(node_rank, {})
                    rank_votes[node] = rank_votes.get(node, 0) + 1
                node = ncbi.parent(node)
                node_rank = ncbi.rank(node)
            # Gaps in the path are counted as 'unclassified' at that rank
            for considered_rank in considered_ranks:
                if hits_in_path[considered_rank] != 0:
                    continue
                rank_votes = votes_by_rank.setdefault(considered_rank, {})
                rank_votes["unclassified"] = rank_votes.get("unclassified", 0) + 1
    # Now determine the first rank with a majority. 'unclassified' must never
    # be allowed to win (that would not tell us anything).
    for rank in canonical_ranks:
        if rank not in votes_by_rank:
            continue
        rank_votes = votes_by_rank[rank]
        total_votes = sum(rank_votes.values())
        # On ties the first taxid to reach the top number of votes leads
        leader = max(rank_votes, key=rank_votes.get)
        if rank_votes[leader] > total_votes / 2 and leader != "unclassified":
            return leader
    # Just in case
    return 1

コード例 #10

0

ファイルを表示

ファイル: conftest.py プロジェクト: jason-c-kwan/Autometa

def fixture_ncbi(ncbi_dir):
    """Return an NCBI instance built from `ncbi_dir` with verbose disabled."""
    ncbi = NCBI(dirpath=ncbi_dir, verbose=False)
    return ncbi