Beispiel #1
0
def generic_cccv(type, genotype, phenotype, folds, replicates, threads, comple_steps, conta_steps,
                 verb, groups=None, rank=None, optimize=False, out=None, n_features=None,
                 params_file=None,
                 *args, **kwargs):
    """
    Perform crossvalidation over a range of simulated completeness/contamination values,
    and save output.

    :param type: Classifier key looked up in CLF_MAPPER.
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param folds: Number of cross-validation folds.
    :param replicates: Number of cross-validation replicates.
    :param threads: Number of parallel jobs.
    :param comple_steps: Number of simulated completeness steps.
    :param conta_steps: Number of simulated contamination steps.
    :param verb: Verbosity flag passed through to loaders and classifier.
    :param groups: Not supported for CCCV; must be None.
    :param rank: Unused here; kept for interface symmetry with generic_cv.
    :param optimize: Not supported for CCCV; must be False.
    :param out: Output path for the CCCV accuracy file.
    :param n_features: If given, reduce the feature space to this many features.
    :param params_file: Optional file of pre-computed classifier parameters.
    """
    kwargs = _fix_uppercase(kwargs)
    # NOTE(review): assert is stripped under `python -O`; kept to preserve the
    # existing exception type (AssertionError) for any caller relying on it.
    assert groups is None, 'Usage of LOGO in CCCV not currently implemented.'
    assert not optimize, 'Parameter search over CCCV not currently implemented.'
    training_records, *_ = load_training_files(
        genotype_file=genotype, phenotype_file=phenotype, verb=verb
    )
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')  # fixed: f-string had no placeholders
        logger.info('\n' + pformat(loaded_params))
        # Loaded params take precedence over kwargs passed on the command line.
        kwargs = {**kwargs, **loaded_params}
    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)
    reduce_features = n_features is not None  # was: True if ... else False
    cccv = clf.crossvalidate_cc(records=training_records, cv=folds, n_replicates=replicates,
                                comple_steps=comple_steps, conta_steps=conta_steps,
                                n_jobs=threads, reduce_features=reduce_features,
                                n_features=n_features)
    write_cccv_accuracy_file(out, cccv)
def test_resampling(trait_name):
    """Fit a TrainingRecordResampler on a flat-file dataset and draw one resampled record."""
    trait_dir = FLAT_PATH / trait_name
    records, *_ = load_training_files(
        trait_dir / f'{trait_name}.genotype',
        trait_dir / f'{trait_name}.phenotype')
    resampler = TrainingRecordResampler(random_state=2, verb=True)
    resampler.fit(records)
    resampler.get_resampled(records[0], comple=.5, conta=.5)
Beispiel #3
0
def generic_cv(type, genotype, phenotype, folds, replicates, threads, verb, optimize=False,
               optimize_out=None, optimize_n_iter=None, groups=None, rank=None, out=None,
               n_features=None, params_file=None,
               *args, **kwargs):
    """
    Estimate model performance by cross-validation.
    Optionally, perform parameter search and save found parameters.

    :param type: Classifier key looked up in CLF_MAPPER.
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param folds: Number of cross-validation folds.
    :param replicates: Number of cross-validation replicates.
    :param threads: Number of parallel jobs.
    :param verb: Verbosity flag passed through to loaders and classifier.
    :param optimize: Whether to run a parameter search before cross-validating.
    :param optimize_out: Required savepath for found parameters when optimize=True.
    :param optimize_n_iter: Number of parameter search iterations.
    :param groups: Optional groups file enabling leave-one-group-out CV.
    :param rank: Taxonomic rank selected when loading grouped data.
    :param out: If given, write a misclassifications file to this path.
    :param n_features: If given, reduce the feature space to this many features.
    :param params_file: Optional file of pre-computed classifier parameters.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(
        genotype_file=genotype,
        phenotype_file=phenotype,
        groups_file=groups,
        selected_rank=rank,
        verb=verb
    )
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')  # fixed: f-string had no placeholders
        logger.info('\n' + pformat(loaded_params))
        # Loaded params take precedence over kwargs passed on the command line.
        kwargs = {**kwargs, **loaded_params}

    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)

    if optimize:
        assert optimize_out is not None, 'No savepath for found parameters passed.'
        logger.info('Optimizing parameters...')  # fixed: f-string had no placeholders
        found_params = clf.parameter_search(training_records, n_iter=optimize_n_iter)
        params = {**kwargs, **found_params}
        write_params_file(optimize_out, params)
        logger.info(f'Optimized parameters written to file {optimize_out}.')
        # Rebuild the classifier with the optimized parameter set.
        clf = CLF_MAPPER[type](verb=verb, *args, **params)

    reduce_features = n_features is not None  # was: True if ... else False
    use_groups = groups is not None
    logger.info('Running CV...')  # fixed: f-string had no placeholders
    score_mean, score_sd, misclass = clf.crossvalidate(records=training_records, cv=folds,
                                                       n_replicates=replicates, groups=use_groups,
                                                       n_jobs=threads,
                                                       reduce_features=reduce_features,
                                                       n_features=n_features,
                                                       demote=not verb)
    logger.info(f"CV score: {round(score_mean, 4)} +/- {round(score_sd, 4)}")
    if out is not None:
        write_misclassifications_file(out, training_records, misclass, use_groups=use_groups)
Beispiel #4
0
def generic_train(type,
                  genotype,
                  phenotype,
                  verb,
                  weights,
                  out,
                  n_features=None,
                  params_file=None,
                  *args,
                  **kwargs):
    """
    Train and save a TrexClassifier model.

    :param type: Classifier key looked up in CLF_MAPPER.
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param verb: Verbosity flag passed through to loaders and classifier.
    :param weights: If truthy, also write a feature-weights rank file next to the model.
    :param out: Output path for the saved classifier; rank file is f"{out}.rank".
    :param n_features: If given, reduce the feature space to this many features.
    :param params_file: Optional file of pre-computed classifier parameters.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(genotype_file=genotype,
                                               phenotype_file=phenotype,
                                               verb=verb)
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')  # fixed: f-string had no placeholders
        logger.info('\n' + pformat(loaded_params))
        # Loaded params take precedence over kwargs passed on the command line.
        kwargs = {**kwargs, **loaded_params}
    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)

    reduce_features = n_features is not None  # was: True if ... else False
    clf.train(records=training_records,
              reduce_features=reduce_features,
              n_features=n_features)
    if weights:
        # Renamed local to avoid rebinding the boolean `weights` parameter.
        feature_weights = clf.get_feature_weights()
        weights_file_name = f"{out}.rank"
        if clf.feature_type.startswith('eggNOG5'):
            # feature_type looks like 'eggNOG5-<taxon>'; annotate each ENOG id for that taxon.
            text_annotator = Eggnog5TextAnnotator()
            feature_taxon = int(clf.feature_type.split('-')[-1])
            annots = [
                text_annotator.annotate(taxon_id=feature_taxon, enog_id=x)[1]
                for x in feature_weights.keys()
            ]
        else:
            annots = None
        write_weights_file(weights_file=weights_file_name,
                           weights=feature_weights,
                           annots=annots)
    save_classifier(obj=clf, filename=out, verb=verb)
Beispiel #5
0
    def test_get_shap(self, trait_name, classifier_type):
        """
        Get ShapHandler and SHAP data from classifier and genotype file.

        :param trait_name:
        :param classifier_type:
        :return:
        """
        trait_dir = FLAT_PATH / trait_name
        training_records, genotype, phenotype, group = load_training_files(
            genotype_file=trait_dir / f"{trait_name}.genotype",
            phenotype_file=trait_dir / f"{trait_name}.phenotype",
            verb=True)
        # Only the first three records are needed for the SHAP sample.
        record_subset = training_records[:3]
        model_path = MODELS_PATH / trait_name / f'{trait_name}.{classifier_type.lower()}.pkl'
        clf = load_classifier(model_path, verb=True)
        handler = ShapHandler.from_clf(clf)
        feature_names, shap_values, base_values = clf.get_shap(record_subset, n_samples=50)
        return record_subset, handler, feature_names, shap_values, base_values
Beispiel #6
0
 def test_load_data(self, trait_name, do_write):
     """
     Test training data loading. Check/catch invalid file formats.
     :param trait_name:
     :return:
     """
     trait_dir = FLAT_PATH / trait_name
     training_records, genotype, phenotype, group = load_training_files(
         genotype_file=trait_dir / f"{trait_name}.genotype",
         phenotype_file=trait_dir / f"{trait_name}.phenotype",
         groups_file=trait_dir / f"{trait_name}.taxids",
         verb=True)
     if do_write:
         # Round-trip the genotype through the writer and confirm the file exists.
         with TemporaryDirectory() as tmpdir:
             out_path = Path(tmpdir) / 'gt.genotype'
             write_genotype_file(genotype, out_path)
             assert out_path.is_file()
     return training_records, genotype, phenotype, group
Beispiel #7
0
def generic_train(type, genotype, phenotype, verb, weights, out,
                  n_features=None, params_file=None, *args, **kwargs):
    """
    Train and save a TrexClassifier model.

    :param type: Classifier key looked up in CLF_MAPPER.
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param verb: Verbosity flag passed through to loaders and classifier.
    :param weights: If truthy, also write a feature-weights rank file next to the model.
    :param out: Output path for the saved classifier; rank file is f"{out}.rank".
    :param n_features: If given, reduce the feature space to this many features.
    :param params_file: Optional file of pre-computed classifier parameters.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(
        genotype_file=genotype, phenotype_file=phenotype, verb=verb
    )
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')  # fixed: f-string had no placeholders
        logger.info('\n' + pformat(loaded_params))
        # Loaded params take precedence over kwargs passed on the command line.
        kwargs = {**kwargs, **loaded_params}
    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)

    reduce_features = n_features is not None  # was: True if ... else False
    clf.train(records=training_records, reduce_features=reduce_features, n_features=n_features)
    if weights:
        # Renamed local to avoid rebinding the boolean `weights` parameter.
        feature_weights = clf.get_feature_weights()
        weights_file_name = f"{out}.rank"
        write_weights_file(weights_file=weights_file_name, weights=feature_weights)
    save_classifier(obj=clf, filename=out, verb=verb)
Beispiel #8
0
 def get_training_data(self):
     """Load and return the training records for the flat-file dataset.

     NOTE(review): ``trait_name`` is a free variable resolved from an enclosing
     or module scope not visible here — confirm it is defined at call time.
     """
     genotype_path = FLAT_PATH / trait_name / f'{trait_name}.genotype'
     phenotype_path = FLAT_PATH / trait_name / f'{trait_name}.phenotype'
     records, *_ = load_training_files(genotype_path, phenotype_path)
     return records