Example #1
0
def generic_compute_shaps(fasta_files, genotype, classifier, n_samples, verb):
    """
    Given a genotype file and/or a collection of possibly gzipped FASTA files as well as a
    phenotrex classifier, collect genotype information from both, get SHAP information about the
    genotypes using the classifier, and return a finished ShapHandler object as well as the list
    of GenotypeRecords created.

    :param fasta_files: A sized collection of FASTA file paths (may be empty).
    :param genotype: A genotype file path, or None if no genotype file is used.
    :param classifier: Path of the classifier file to load.
    :param n_samples: Forwarded as `n_samples` to the classifier's `get_shap` method.
    :param verb: Verbosity flag forwarded to FASTA annotation and classifier loading.
    :returns: A tuple `(sh, gr)`: the ShapHandler populated with the SHAP data, and the list of
              records it was computed on (records from FASTA files first, then records from
              the genotype file).
    :raises RuntimeError: If neither FASTA files nor a genotype file were supplied, or if
                          `get_shap` raised a TypeError, which is reported as the classifier
                          not being capable of generating SHAP explanations.
    """
    if not len(fasta_files) and genotype is None:
        raise RuntimeError(
            'Must either supply FASTA file(s) or single genotype file for prediction.'
        )

    if len(fasta_files):
        grs_from_fasta = fastas_to_grs(fasta_files, n_threads=None, verb=verb)
    else:
        grs_from_fasta = []
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)
    sh = ShapHandler.from_clf(model)
    try:
        fs, sv, bv = model.get_shap(gr, n_samples=n_samples)
    except TypeError as err:
        # Chain explicitly so the underlying TypeError stays attached as the cause and
        # remains visible in the traceback when debugging.
        raise RuntimeError(
            'This TrexClassifier is not capable of generating SHAP explanations.'
        ) from err
    sh.add_feature_data(sample_names=[x.identifier for x in gr],
                        features=fs,
                        shaps=sv,
                        base_value=bv)
    return sh, gr
Example #2
0
def compute_genotype(input, out, n_threads=None, verb=True):
    """
    Annotate FASTA files and store the result as a `phenotrex` .genotype file.

    The given (possibly gzipped) DNA or protein FASTA files are annotated with
    eggNOG5-tax-2 (bacterial eggNOG5) clusters; the resulting genotypes are written to `out`
    and can then be used for learning and inference with `phenotrex`.

    :param input: The FASTA files to annotate.
    :param out: Path of the .genotype file to write.
    :param n_threads: Forwarded as `n_threads` to `fastas_to_grs`.
    :param verb: Forwarded as `verb` to `fastas_to_grs`.
    """
    # Imported lazily, inside the function, exactly as in the original implementation.
    from phenotrex.io.flat import write_genotype_file
    from phenotrex.transforms import fastas_to_grs

    genotype_records = fastas_to_grs(input, n_threads=n_threads, verb=verb)
    write_genotype_file(genotypes=genotype_records, output_file=out)
def predict(input_files: List[str],
            classifier: str,
            min_proba=0.0,
            verb=True) -> pd.DataFrame:
    """
    Predict a trait for all genotypes found in the given input files.

    Input files are split into FASTA files and genotype files; records derived from the FASTA
    files come first, followed by the records of each genotype file.

    :param input_files: Paths of FASTA and/or genotype files; must not be empty.
    :param classifier: Path of the classifier file to load.
    :param min_proba: If the probability of the predicted class is below this value, the
                      'Trait Present' entry is NaN instead of the trait sign.
    :param verb: Verbosity flag forwarded to FASTA annotation and classifier loading.
    :returns: A DataFrame with the columns 'Genome', 'Model File', 'Feature Type',
              'Trait Name', 'Trait Present' and 'Confidence' (the probability rounded to four
              digits, as a string).
    :raises RuntimeError: If `input_files` is empty.
    """
    if len(input_files) == 0:
        raise RuntimeError('Must supply input file(s) for prediction.')

    fasta_files, genotype_files = _determine_file_types(input_files)
    records = []
    if fasta_files:
        records += fastas_to_grs(fasta_files, n_threads=None, verb=verb)
    for genotype_file in genotype_files:
        records += load_genotype_file(genotype_file)

    model = load_classifier(filename=classifier, verb=verb)
    labels, label_probas = model.predict(X=records)

    # Inverted view of the default mapping: trait id -> trait sign.
    sign_by_trait_id = {
        trait_id: sign for sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items()
    }

    rows = {}
    for record, label, label_proba in zip(records, labels, label_probas):
        confidence = label_proba[label]
        shown = np.nan if confidence < min_proba else sign_by_trait_id[label]
        rows[record.identifier] = {
            'Trait Present': shown,
            'Confidence': str(round(confidence, 4)),
        }

    # One row per genome: build column-wise from the nested dict, then transpose.
    df = pd.DataFrame.from_dict(rows).T
    df.index.name = 'Genome'
    df = df.reset_index()
    df['Trait Name'] = model.trait_name
    df['Feature Type'] = model.feature_type
    df['Model File'] = Path(classifier).name

    column_order = [
        'Genome',
        'Model File',
        'Feature Type',
        'Trait Name',
        'Trait Present',
        'Confidence',
    ]
    df = df[column_order]
    return df
Example #4
0
def predict(fasta_files=tuple(),
            genotype=None,
            classifier=None,
            min_proba=0.5,
            out_explain_per_sample=None,
            out_explain_summary=None,
            shap_n_samples=None,
            n_max_explained_features=None,
            verb=False):
    """
    Predict phenotype from a set of (possibly gzipped) DNA or protein FASTA files
    or a single genotype file. Optionally, compute SHAP explanations individually and/or summarily
    for the predicted samples. Predictions are printed to standard output as tab-separated lines.

    NB: Genotype computation is highly expensive and performed on the fly on FASTA files.
    For increased speed when predicting multiple phenotypes, create a .genotype file to reuse
    with the command `compute-genotype`.

    NB: As opposed to XGB models where they are trivially available, computing SHAP explanations
    on SVM models entails training a model-agnostic KernelExplainer which is highly costly (dozens
    to hundreds of seconds per sample if using a somewhat reasonable value for `shap_n_samples`).

    :param fasta_files: An iterable of fasta file paths
    :param genotype: A genotype file path
    :param classifier: A pickled classifier file path
    :param min_proba: Predictions whose probability is below this value are displayed as "N/A"
                      instead of the trait sign.
    :param out_explain_per_sample: Where to save the most influential features by SHAP for each
                                   predicted sample.
    :param out_explain_summary: Where to save the SHAP summary of the predictions.
    :param shap_n_samples: The n_samples parameter -
                           only used by models which incorporate a `shap.KernelExplainer`.
    :param n_max_explained_features: How many of the most influential features by SHAP to consider.
    :param verb: Whether to show progress of fasta file annotation.
    :raises RuntimeError: If neither FASTA files nor a genotype file were supplied, or if SHAP
                          output was requested and `get_shap` raised a TypeError.
    """
    if not len(fasta_files) and genotype is None:
        raise RuntimeError(
            'Must supply FASTA file(s) and/or single genotype file for prediction.'
        )

    if len(fasta_files):
        grs_from_fasta = fastas_to_grs(fasta_files, n_threads=None, verb=verb)
    else:
        grs_from_fasta = []
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)

    # SHAP values are only computed when at least one explanation output was requested.
    if out_explain_per_sample is not None or out_explain_summary is not None:
        try:
            fs, sv, bv = model.get_shap(gr, n_samples=shap_n_samples)
        except TypeError as err:
            # Chain explicitly so the underlying TypeError stays attached as the cause.
            raise RuntimeError(
                'This TrexClassifier is not capable of generating SHAP explanations.'
            ) from err
        sh = ShapHandler.from_clf(model)
        sh.add_feature_data(sample_names=[x.identifier for x in gr],
                            features=fs,
                            shaps=sv,
                            base_value=bv)
        if out_explain_per_sample is not None:
            # One table per sample, stacked row-wise into a single TSV.
            shap_df = pd.concat(
                [
                    sh.get_shap_force(x.identifier,
                                      n_max_features=n_max_explained_features)
                    for x in gr
                ],
                axis=0,
            )
            shap_df.to_csv(out_explain_per_sample, sep='\t', index=False)
        if out_explain_summary is not None:
            sum_df = sh.get_shap_summary(n_max_explained_features)
            sum_df.to_csv(out_explain_summary, sep='\t', index=False)

    preds, probas = model.predict(X=gr)
    # Inverted view of the default mapping: trait id -> trait sign.
    translate_output = {
        trait_id: trait_sign
        for trait_sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items()
    }
    print(f"# Trait: {model.trait_name}")
    print("Identifier\tTrait present\tConfidence")
    for record, result, probability in zip(gr, preds, probas):
        confidence = probability[result]
        if confidence < min_proba:
            result_disp = "N/A"
        else:
            result_disp = translate_output[result]
        print(f"{record.identifier}\t{result_disp}\t{str(round(confidence, 4))}")
Example #5
0
from pathlib import Path

from phenotrex.transforms import fastas_to_grs
# Probe whether FASTA-based genotype computation is available by calling `fastas_to_grs`
# without arguments and checking which exception the call raises.
# NOTE: if the call raises neither ImportError nor TypeError, FROM_FASTA is left undefined
# (any other exception propagates out of this module).
try:
    fastas_to_grs()
except ImportError:
    # NOTE(review): presumably raised when the optional dependencies needed for FASTA
    # annotation are not installed - confirm against phenotrex.transforms.
    FROM_FASTA = False
except TypeError:
    # NOTE(review): presumably the "missing required arguments" error of the real
    # implementation, i.e. FASTA annotation is usable - confirm against phenotrex.transforms.
    FROM_FASTA = True

# Locations of the test data directories, resolved relative to this file.
DATA_PATH = (Path(__file__).parent / 'test_data')
GENOMIC_PATH = DATA_PATH / 'genomic'
MODELS_PATH = DATA_PATH / 'models'
FLAT_PATH = DATA_PATH / 'flat'