def test_populate_class_weight(self):
    # should populate the class weight of a pipeline
    weights = Artifact.import_data(
        'FeatureTable[RelativeFrequency]',
        self.get_data_path('class_weight.biom'),
        view_type='BIOMV100Format')
    table = weights.view(biom.Table)

    svc_spec = [['feat_ext',
                 {'__type__': 'feature_extraction.text.HashingVectorizer',
                  'analyzer': 'char_wb',
                  'n_features': 8192,
                  'ngram_range': [8, 8],
                  'alternate_sign': False}],
                ['classify',
                 {'__type__': 'naive_bayes.GaussianNB'}]]
    pipeline1 = pipeline_from_spec(svc_spec)
    populate_class_weight(pipeline1, table)

    # pair each class (observation id) with its weight, sort by class name,
    # and pass the sorted weights to GaussianNB explicitly as priors
    classes = table.ids('observation')
    class_weights = []
    for wts in table.iter_data():
        class_weights.append(zip(classes, wts))
    svc_spec[1][1]['priors'] = list(zip(*sorted(class_weights[0])))[1]
    pipeline2 = pipeline_from_spec(svc_spec)

    # populate_class_weight and the manually set priors should agree
    for a, b in zip(pipeline1.get_params()['classify__priors'],
                    pipeline2.get_params()['classify__priors']):
        self.assertAlmostEqual(a, b)
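
For reference, the sketch below restates what the test above checks using plain scikit-learn and NumPy only, without the package's populate_class_weight helper; the frequencies dict is a made-up stand-in for the weights the test reads from class_weight.biom.

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# hypothetical relative class frequencies (the test gets these from a
# FeatureTable[RelativeFrequency] artifact instead)
frequencies = {'k__Bacteria; p__Firmicutes': 0.7,
               'k__Bacteria; p__Proteobacteria': 0.3}

# sort by class name so the prior order matches GaussianNB's sorted classes_
classes, priors = zip(*sorted(frequencies.items()))

pipeline = Pipeline([
    ('feat_ext', HashingVectorizer(analyzer='char_wb', n_features=8192,
                                   ngram_range=(8, 8), alternate_sign=False)),
    ('classify', GaussianNB(priors=np.asarray(priors))),
])
print(pipeline.get_params()['classify__priors'])  # [0.7 0.3]
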
def test_pipeline_serialisation(self):
    # pipeline inflation and deflation should be inverse operations
    for name, spec in _specific_fitters:
        pipeline = pipeline_from_spec(spec)
        spec_one = spec_from_pipeline(pipeline)
        pipeline = pipeline_from_spec(spec_one)
        spec_two = spec_from_pipeline(pipeline)
        self.assertEqual(spec_one, spec_two)
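
The specs handled by this round trip are lists of [step_name, {'__type__': ..., **params}] pairs. pipeline_from_spec and spec_from_pipeline are the package's own helpers; the stand-in below is only a rough sketch of how such a spec could be inflated into an sklearn Pipeline and deflated back, not the actual implementation.

import importlib
from sklearn.pipeline import Pipeline


def inflate(spec):
    # [name, {'__type__': 'module.Class', **params}] -> (name, estimator)
    steps = []
    for name, params in spec:
        params = dict(params)
        module, cls_name = params.pop('__type__').rsplit('.', 1)
        cls = getattr(importlib.import_module('sklearn.' + module), cls_name)
        steps.append((name, cls(**params)))
    return Pipeline(steps)


def deflate(pipeline):
    # good enough for estimators defined in public sklearn modules,
    # e.g. naive_bayes.GaussianNB
    spec = []
    for name, est in pipeline.steps:
        module = type(est).__module__.split('.', 1)[1]  # drop 'sklearn.'
        spec.append([name, {'__type__': '.'.join([module, type(est).__name__]),
                            **est.get_params()}])
    return spec
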
import json
from collections import Counter

from imblearn.under_sampling import ClusterCentroids
from pandas import Series
from q2_types.feature_data import DNAIterator
from sklearn.base import TransformerMixin
# sklearn.neighbors._base in newer scikit-learn releases
from sklearn.neighbors.base import KNeighborsMixin

# pipeline_from_spec, _extract_reads, _default_feature_extractor and
# _default_knn_classifier are helpers defined elsewhere in this package.


def precalculate_nearest_neighbors(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        max_centroids_per_class: int=10,
        feature_extractor_specification: str=_default_feature_extractor,
        knn_classifier_specification: str=_default_knn_classifier,
        n_jobs: int=1, random_state: int=42) -> dict:
    spec = json.loads(feature_extractor_specification)
    feat_ext = pipeline_from_spec(spec)
    if not isinstance(feat_ext.steps[-1][-1], TransformerMixin):
        raise ValueError('feature_extractor_specification must specify a '
                         'transformer')
    spec = json.loads(knn_classifier_specification)
    nn = pipeline_from_spec(spec)
    if not isinstance(nn.steps[-1][-1], KNeighborsMixin):
        raise ValueError('knn_classifier_specification must specify a '
                         'KNeighbors classifier')

    # extract the reference reads, keep only those with a taxonomy entry, and
    # transform the sequences into feature vectors (transform without fit
    # assumes a stateless extractor such as HashingVectorizer)
    seq_ids, X = _extract_reads(reference_sequences)
    data = [(reference_taxonomy[s], x)
            for s, x in zip(seq_ids, X) if s in reference_taxonomy]
    y, X = list(zip(*data))
    X = feat_ext.transform(X)

    if max_centroids_per_class > 0:
        # undersample any class with more than max_centroids_per_class
        # members down to that many cluster centroids
        class_counts = Counter(y)
        undersample_classes = {t: max_centroids_per_class
                               for t, c in class_counts.items()
                               if c > max_centroids_per_class}
        # ratio= and fit_sample() are the older imbalanced-learn API; newer
        # releases call these sampling_strategy= and fit_resample()
        cc = ClusterCentroids(random_state=random_state, n_jobs=n_jobs,
                              ratio=undersample_classes, voting='hard')
        X_resampled, y_resampled = cc.fit_sample(X, y)
    else:
        X_resampled, y_resampled = X, y

    if 'n_jobs' in nn.steps[-1][-1].get_params():
        nn.steps[-1][-1].set_params(n_jobs=n_jobs)
    # fit the whole pipeline, passing y so a final KNeighborsClassifier can
    # fit, then query the bare KNeighbors estimator for each sample's
    # nearest neighbours
    nn.fit(X_resampled, y_resampled)
    nn = nn.steps[-1][-1]
    if n_jobs != 1 and hasattr(X_resampled, 'todense'):
        # densify sparse features before a parallel kneighbors query
        indices = nn.kneighbors(X_resampled.todense(), return_distance=False)
    else:
        indices = nn.kneighbors(X_resampled, return_distance=False)
    # list() rather than .tolist(): y_resampled is a plain tuple when no
    # undersampling was performed
    return {'neighbors': indices.tolist(), 'taxonomies': list(y_resampled)}
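
The ClusterCentroids call above targets the older imbalanced-learn interface. On a recent release the same undersampling step looks roughly like the sketch below, where sampling_strategy and fit_resample replace ratio and fit_sample; the toy data are invented.

from collections import Counter

import numpy as np
from imblearn.under_sampling import ClusterCentroids

# invented toy data: class 'a' is heavily over-represented
rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0, 1, (30, 4)), rng.normal(3, 1, (5, 4))])
y = np.array(['a'] * 30 + ['b'] * 5)

max_centroids_per_class = 10
strategy = {t: max_centroids_per_class
            for t, c in Counter(y).items() if c > max_centroids_per_class}

cc = ClusterCentroids(sampling_strategy=strategy, random_state=42,
                      voting='hard')
X_res, y_res = cc.fit_resample(X, y)
print(Counter(y_res))  # e.g. Counter({'a': 10, 'b': 5})
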