def generic_compute_shaps(fasta_files, genotype, classifier, n_samples, verb):
    """
    Given a genotype file and/or a collection of possibly gzipped FASTA files
    as well as a phenotrex classifier, collect genotype information from both,
    get SHAP information about the genotypes using the classifier, and return
    a finished ShapHandler object as well as the list of GenotypeRecords created.

    :param fasta_files: A collection of FASTA file paths. May be empty or None
                        if `genotype` is given.
    :param genotype: A genotype file path, or None if only FASTA files are used.
    :param classifier: A classifier file path, passed on to `load_classifier`.
    :param n_samples: Forwarded to the model's `get_shap` as `n_samples`.
    :param verb: Verbosity flag, forwarded to FASTA annotation and classifier loading.
    :return: A tuple of (ShapHandler, list of GenotypeRecords).
    :raises RuntimeError: If neither FASTA files nor a genotype file were supplied,
                          or if `get_shap` of the loaded model raises a TypeError.
    """
    # Treat a missing FASTA collection like an empty one, so that callers which
    # only have a genotype file get the regular validation instead of a
    # TypeError from len(None).
    if fasta_files is None:
        fasta_files = ()

    if not len(fasta_files) and genotype is None:
        raise RuntimeError(
            'Must either supply FASTA file(s) or single genotype file for prediction.'
        )

    if len(fasta_files):
        grs_from_fasta = fastas_to_grs(fasta_files, n_threads=None, verb=verb)
    else:
        grs_from_fasta = []
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    # Records derived from FASTA files come first, followed by those from the genotype file.
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)
    sh = ShapHandler.from_clf(model)
    try:
        fs, sv, bv = model.get_shap(gr, n_samples=n_samples)
    except TypeError as err:
        # Keep the original TypeError attached as the explicit cause for debugging.
        raise RuntimeError(
            'This TrexClassifier is not capable of generating SHAP explanations.'
        ) from err
    sh.add_feature_data(
        sample_names=[x.identifier for x in gr],
        features=fs,
        shaps=sv,
        base_value=bv,
    )
    return sh, gr
def compute_genotype(input, out, n_threads=None, verb=True):
    """
    Create a genotype file suitable for learning and inference with `phenotrex`.

    Given a set of (possibly gzipped) DNA or protein FASTA files, perform
    annotation of eggNOG5-tax-2 (bacterial eggNOG5) clusters, and write
    the result to a .genotype file.

    :param input: The FASTA file(s) to annotate, passed on to `fastas_to_grs`.
    :param out: The output file, passed on to `write_genotype_file`.
    :param n_threads: Forwarded to `fastas_to_grs` as `n_threads`.
    :param verb: Forwarded to `fastas_to_grs` as `verb`.
    """
    # Imports are kept local to the function, as in the original implementation.
    from phenotrex.io.flat import write_genotype_file
    from phenotrex.transforms import fastas_to_grs

    annotated_records = fastas_to_grs(input, verb=verb, n_threads=n_threads)
    write_genotype_file(genotypes=annotated_records, output_file=out)
def predict(input_files: List[str], classifier: str, min_proba=0.0, verb=True) -> pd.DataFrame:
    """
    Predict a trait for the genomes contained in the given input files and
    return the results as a table.

    :param input_files: Paths of input files; they are split into FASTA and
                        genotype files by `_determine_file_types`.
    :param classifier: Path of the classifier file, passed on to `load_classifier`.
    :param min_proba: If the probability of the predicted class is below this value,
                      the 'Trait Present' entry is set to NaN.
    :param verb: Verbosity flag, forwarded to FASTA annotation and classifier loading.
    :return: A DataFrame with the columns 'Genome', 'Model File', 'Feature Type',
             'Trait Name', 'Trait Present' and 'Confidence'.
    :raises RuntimeError: If `input_files` is empty.
    """
    if len(input_files) == 0:
        raise RuntimeError('Must supply input file(s) for prediction.')

    fasta_files, genotype_files = _determine_file_types(input_files)

    # Records from FASTA files first, then those of each genotype file in order.
    records = []
    if fasta_files:
        records.extend(fastas_to_grs(fasta_files, n_threads=None, verb=verb))
    for genotype_file in genotype_files:
        records.extend(load_genotype_file(genotype_file))

    model = load_classifier(filename=classifier, verb=verb)
    preds, probas = model.predict(X=records)

    # Invert the sign -> id mapping to translate predicted ids back into signs.
    sign_by_trait_id = {}
    for trait_sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items():
        sign_by_trait_id[trait_id] = trait_sign

    rows = {}
    for record, label, class_probas in zip(records, preds, probas):
        confidence = class_probas[label]
        # The `<` comparison is kept as-is so NaN confidences are handled as before.
        shown = np.nan if confidence < min_proba else sign_by_trait_id[label]
        rows[record.identifier] = {
            'Trait Present': shown,
            'Confidence': str(round(confidence, 4)),
        }

    table = pd.DataFrame.from_dict(rows).T
    table.index.name = 'Genome'
    table = table.reset_index()
    table['Trait Name'] = model.trait_name
    table['Feature Type'] = model.feature_type
    table['Model File'] = Path(classifier).name

    column_order = [
        'Genome', 'Model File', 'Feature Type',
        'Trait Name', 'Trait Present', 'Confidence',
    ]
    return table[column_order]
def predict(fasta_files=tuple(), genotype=None, classifier=None, min_proba=0.5,
            out_explain_per_sample=None, out_explain_summary=None,
            shap_n_samples=None, n_max_explained_features=None, verb=False):
    """
    Predict phenotype from a set of (possibly gzipped) DNA or protein FASTA files
    or a single genotype file. Optionally, compute SHAP explanations individually
    and/or summarily for the predicted samples. Predictions are printed to stdout
    as tab-separated lines.

    NB: Genotype computation is highly expensive and performed on the fly on FASTA files.
    For increased speed when predicting multiple phenotypes, create a .genotype file to reuse
    with the command `compute-genotype`.

    NB: As opposed to XGB models where they are trivially available,
    computing SHAP explanations on SVM models entails training a model-agnostic
    KernelExplainer which is highly costly (dozens to hundreds of seconds per sample if using a
    somewhat reasonable value for `shap_n_samples`).

    :param fasta_files: An iterable of fasta file paths. May be empty or None
                        if `genotype` is given.
    :param genotype: A genotype file path
    :param classifier: A pickled classifier file path
    :param min_proba: If the probability of the predicted class is below this value,
                      "N/A" is printed instead of the predicted trait sign.
    :param out_explain_per_sample: Where to save the most influential features by SHAP
                                   for each predicted sample.
    :param out_explain_summary: Where to save the SHAP summary of the predictions.
    :param shap_n_samples: The n_samples parameter -
                           only used by models which incorporate a `shap.KernelExplainer`.
    :param n_max_explained_features: How many of the most influential features by SHAP to consider.
    :param verb: Whether to show progress of fasta file annotation.
    :raises RuntimeError: If neither FASTA files nor a genotype file were supplied,
                          or if SHAP output was requested and `get_shap` of the loaded
                          model raises a TypeError.
    """
    # Treat a missing FASTA collection like an empty one, so that the regular
    # validation applies instead of a TypeError from len(None).
    if fasta_files is None:
        fasta_files = ()

    if not len(fasta_files) and genotype is None:
        raise RuntimeError(
            'Must supply FASTA file(s) and/or single genotype file for prediction.'
        )

    if len(fasta_files):
        grs_from_fasta = fastas_to_grs(fasta_files, n_threads=None, verb=verb)
    else:
        grs_from_fasta = []
    grs_from_file = load_genotype_file(genotype) if genotype is not None else []
    gr = grs_from_fasta + grs_from_file

    model = load_classifier(filename=classifier, verb=verb)

    # SHAP values are only computed when at least one explanation output was requested.
    if out_explain_per_sample is not None or out_explain_summary is not None:
        try:
            fs, sv, bv = model.get_shap(gr, n_samples=shap_n_samples)
        except TypeError as err:
            # Keep the original TypeError attached as the explicit cause for debugging.
            raise RuntimeError(
                'This TrexClassifier is not capable of generating SHAP explanations.'
            ) from err
        sh = ShapHandler.from_clf(model)
        sh.add_feature_data(
            sample_names=[x.identifier for x in gr],
            features=fs,
            shaps=sv,
            base_value=bv,
        )
        if out_explain_per_sample is not None:
            shap_df = pd.concat([
                sh.get_shap_force(x.identifier, n_max_features=n_max_explained_features)
                for x in gr
            ], axis=0)
            shap_df.to_csv(out_explain_per_sample, sep='\t', index=False)
        if out_explain_summary is not None:
            sum_df = sh.get_shap_summary(n_max_explained_features)
            sum_df.to_csv(out_explain_summary, sep='\t', index=False)

    preds, probas = model.predict(X=gr)
    # Invert the sign -> id mapping to translate predicted ids back into signs.
    translate_output = {
        trait_id: trait_sign for trait_sign, trait_id in DEFAULT_TRAIT_SIGN_MAPPING.items()
    }
    print(f"# Trait: {model.trait_name}")
    print("Identifier\tTrait present\tConfidence")
    for record, result, probability in zip(gr, preds, probas):
        if probability[result] < min_proba:
            result_disp = "N/A"
        else:
            result_disp = translate_output[result]
        print(
            f"{record.identifier}\t{result_disp}\t{str(round(probability[result], 4))}"
        )
from pathlib import Path

from phenotrex.transforms import fastas_to_grs

# Probe whether FASTA annotation is usable by calling `fastas_to_grs` without arguments.
# An ImportError is taken to mean FASTA support is unavailable; a TypeError
# (presumably the function complaining about missing arguments - TODO confirm)
# is taken to mean that it is available.
try:
    fastas_to_grs()
except ImportError:
    FROM_FASTA = False
except TypeError:
    FROM_FASTA = True
else:
    # The call went through without raising: the function is callable, so make
    # sure FROM_FASTA is defined instead of leaving the name unbound.
    FROM_FASTA = True

# Locations of the test fixtures, relative to the directory containing this file.
DATA_PATH = (Path(__file__).parent / 'test_data')
GENOMIC_PATH = DATA_PATH / 'genomic'
MODELS_PATH = DATA_PATH / 'models'
FLAT_PATH = DATA_PATH / 'flat'