def test_populate_class_weight(self):
    # should populate the class weight of a pipeline
    weights = Artifact.import_data(
        'FeatureTable[RelativeFrequency]',
        self.get_data_path('class_weight.biom'),
        view_type='BIOMV100Format')
    table = weights.view(biom.Table)

    svc_spec = [['feat_ext',
                 {'__type__': 'feature_extraction.text.HashingVectorizer',
                  'analyzer': 'char_wb',
                  'n_features': 8192,
                  'ngram_range': [8, 8],
                  'alternate_sign': False}],
                ['classify', {'__type__': 'naive_bayes.GaussianNB'}]]
    pipeline1 = pipeline_from_spec(svc_spec)
    populate_class_weight(pipeline1, table)

    # independently derive the expected priors from the table: pair each
    # class with its weight, sort by class label, and keep the weights
    classes = table.ids('observation')
    class_weights = []
    for wts in table.iter_data():
        class_weights.append(zip(classes, wts))
    svc_spec[1][1]['priors'] = list(zip(*sorted(class_weights[0])))[1]
    pipeline2 = pipeline_from_spec(svc_spec)

    for a, b in zip(pipeline1.get_params()['classify__priors'],
                    pipeline2.get_params()['classify__priors']):
        self.assertAlmostEqual(a, b)
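
# Worked illustration of the prior derivation above (hypothetical values,
# not taken from class_weight.biom): if classes == ('k__A', 'k__B') and the
# first row of the table is (0.75, 0.25), then
#
#     class_weights[0] -> [('k__A', 0.75), ('k__B', 0.25)]
#     list(zip(*sorted(class_weights[0])))[1] -> (0.75, 0.25)
#
# i.e. the weights ordered by class label, matching the sorted class order
# scikit-learn uses for GaussianNB's `priors`.
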
def test_pipeline_serialisation(self):
    # pipeline inflation and deflation should be inverse operations
    for name, spec in _specific_fitters:
        pipeline = pipeline_from_spec(spec)
        spec_one = spec_from_pipeline(pipeline)
        pipeline = pipeline_from_spec(spec_one)
        spec_two = spec_from_pipeline(pipeline)
        self.assertEqual(spec_one, spec_two)
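
# Note: the assertion compares spec_one to spec_two rather than spec to
# spec_one because inflating a spec can fill in estimator defaults that a
# hand-written spec omits; only the normalised spec is expected to be a
# fixed point of the round trip. A hypothetical illustration:
#
#     spec = [['classify', {'__type__': 'naive_bayes.GaussianNB'}]]
#     spec_one = spec_from_pipeline(pipeline_from_spec(spec))
#     # spec_one may now list GaussianNB's defaults explicitly, so
#     # spec_one == spec can fail even though the round trip is stable.
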
def precalculate_nearest_neighbors(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        max_centroids_per_class: int = 10,
        feature_extractor_specification: str = _default_feature_extractor,
        knn_classifier_specification: str = _default_knn_classifier,
        n_jobs: int = 1, random_state: int = 42) -> dict:
    # inflate and validate the feature extraction pipeline
    spec = json.loads(feature_extractor_specification)
    feat_ext = pipeline_from_spec(spec)
    if not isinstance(feat_ext.steps[-1][-1], TransformerMixin):
        raise ValueError('feature_extractor_specification must specify a '
                         'transformer')
    # inflate and validate the nearest-neighbors pipeline
    spec = json.loads(knn_classifier_specification)
    nn = pipeline_from_spec(spec)
    if not isinstance(nn.steps[-1][-1], KNeighborsMixin):
        raise ValueError('knn_classifier_specification must specify a '
                         'KNeighbors classifier')

    # extract features for every reference sequence that has a taxonomy
    seq_ids, X = _extract_reads(reference_sequences)
    data = [(reference_taxonomy[s], x)
            for s, x in zip(seq_ids, X) if s in reference_taxonomy]
    y, X = list(zip(*data))
    X = feat_ext.transform(X)

    if max_centroids_per_class > 0:
        # undersample over-represented classes down to
        # max_centroids_per_class cluster centroids each
        class_counts = Counter(y)
        undersample_classes = {t: max_centroids_per_class
                               for t, c in class_counts.items()
                               if c > max_centroids_per_class}
        cc = ClusterCentroids(random_state=random_state, n_jobs=n_jobs,
                              ratio=undersample_classes, voting='hard')
        X_resampled, y_resampled = cc.fit_sample(X, y)
    else:
        X_resampled, y_resampled = X, y

    if 'n_jobs' in nn.steps[-1][-1].get_params():
        nn.steps[-1][-1].set_params(n_jobs=n_jobs)
    nn.fit(X_resampled)
    nn = nn.steps[-1][-1]
    # densify sparse input when querying in parallel
    if n_jobs != 1 and hasattr(X_resampled, 'todense'):
        indices = nn.kneighbors(X_resampled.todense(), return_distance=False)
    else:
        indices = nn.kneighbors(X_resampled, return_distance=False)
    # list() rather than .tolist(): y_resampled is a plain tuple when no
    # undersampling was performed
    return {'neighbors': indices.tolist(),
            'taxonomies': list(y_resampled)}
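
# Example usage (a sketch; the file names are hypothetical and the views
# assume FeatureData[Taxonomy] and FeatureData[Sequence] artifacts):
#
#     ref_tax = Artifact.load('ref-taxonomy.qza').view(Series)
#     ref_seqs = Artifact.load('ref-seqs.qza').view(DNAIterator)
#     result = precalculate_nearest_neighbors(
#         ref_tax, ref_seqs, max_centroids_per_class=10, n_jobs=4)
#     # result['neighbors'][i] holds the row indices of the nearest
#     # resampled references to reference i; result['taxonomies'][i] is
#     # that reference's taxonomy label.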