def test_feats_add():
    """Check ``+`` between Features objects, plain bag lists and bad operands."""
    bags = []
    for _ in xrange(20):
        n_pts = np.random.randint(10, 100)
        bags.append(np.random.normal(size=(n_pts, 10)))
    labels = np.arange(20)

    head_bags, tail_bags = bags[:15], bags[15:]
    first_15 = Features(head_bags, labels=labels[:15])
    last_5 = Features(tail_bags, labels=labels[15:])

    # Features + Features
    combined = first_15 + last_5
    assert len(combined) == 20
    assert combined[:15] == first_15

    # Features + list of bags: the result carries no metadata
    with_list = first_15 + tail_bags
    assert len(with_list) == 20
    assert not with_list.meta
    assert np.all(with_list[16] == bags[16])

    # Features + single-element list
    with_one = first_15 + [bags[18]]
    assert np.all(with_one[15] == bags[18])

    # reflected addition: list + Features
    list_first = tail_bags + first_15
    assert np.all(list_first[0] == bags[15])

    one_first = [bags[15]] + first_15
    assert np.all(one_first[0] == bags[15])
    assert one_first[1:] == first_15.bare()

    # operands that cannot be added
    # (wrong_type is defined elsewhere; presumably asserts a TypeError)
    wrong_type(lambda: first_15 + 12)
    wrong_type(lambda: 12 + first_15)
    assert_raises(ValueError, lambda: first_15 + np.asarray(bags))
def test_copy_constructor():
    """Check building a Features object from an existing Features object."""
    bags = []
    for _ in xrange(20):
        n_pts = np.random.randint(10, 100)
        bags.append(np.random.normal(size=(n_pts, 10)))

    unstacked = Features(bags, label=np.arange(20))

    # default: metadata and bag arrays are shared, not copied
    clone = Features(unstacked)
    assert clone.label is unstacked.label
    assert clone.features[0] is unstacked.features[0]
    assert clone == unstacked

    # bare=True drops the metadata
    bare_clone = Features(unstacked, bare=True)
    assert bare_clone == bags
    assert bare_clone.bare() == bare_clone

    # metadata passed explicitly overrides the source's
    relabeled = Features(unstacked, label=np.ones(20))
    assert np.all(relabeled.label == 1)

    # bare=True plus the same labels reproduces the source
    bare_relabeled = Features(unstacked, bare=True, label=np.arange(20))
    assert bare_relabeled == unstacked

    # copy=True must not share memory with the source bags
    copied = Features(unstacked, copy=True)
    assert copied == unstacked
    assert not np.may_share_memory(copied.features[0], unstacked.features[0])

    # a stacked source can be copy-constructed as well
    stacked = unstacked.copy()
    stacked.make_stacked()
    stacked_clone = Features(stacked)
    assert stacked_clone == stacked
def test_pca():
    """Exercise BagPCA: fit/transform, inversion and constructor validation."""
    bags = []
    for _ in xrange(50):
        n_pts = np.random.randint(10, 100)
        bags.append(np.random.normal(5, 3, size=(n_pts, 20)))
    feats = Features(bags, stack=True)

    # separate fit() and transform() down to 3 dimensions
    reducer = BagPCA(k=3)
    reducer.fit(bags)
    reduced = reducer.transform(bags)
    assert reduced.dim == 3

    # choosing the dimension by variance fraction just has to run
    BagPCA(varfrac=.3).fit_transform(bags)

    # keeping all 20 dimensions must be invertible back to the input
    full_pca = BagPCA(k=20)
    projected = full_pca.fit_transform(bags)
    restored = full_pca.inverse_transform(projected)
    restored.make_stacked()
    assert np.allclose(feats.stacked_features, restored.stacked_features)

    # randomized variant
    randomized = BagPCA(k=5, randomize=True).fit_transform(bags)
    assert randomized.dim == 5

    # argument combinations the constructor rejects
    assert_raises(TypeError, lambda: BagPCA(randomize=True))
    assert_raises(TypeError, lambda: BagPCA(mle_components=True, k=12))
    assert BagPCA(mle_components=True)
def test_bagofwords_basic():
    """Check BagOfWords with KMeans and MiniBatchKMeans quantizers.

    For both quantizers the transformed output must have one row per bag and
    one column per codeword, be non-negative, and have rows summing to the
    number of points in the corresponding bag.
    """
    n_codewords = 10
    dim = 5

    kmeans = KMeans(n_clusters=n_codewords, max_iter=100, n_init=3,
                    random_state=47)
    bow = BagOfWords(kmeans)

    np.random.seed(42)
    bags = [np.random.randn(np.random.randint(30, 100), dim)
            for _ in xrange(50)]
    bag_sizes = [b.shape[0] for b in bags]

    bowed = bow.fit_transform(bags)
    assert bowed.shape == (len(bags), n_codewords)
    assert bow.codewords_.shape == (n_codewords, dim)
    assert np.all(bowed >= 0)
    assert np.all(np.sum(bowed, 1) == bag_sizes)

    # fitting on a Features object and transforming the raw list must agree
    # with the fit_transform result above
    bow.fit(Features(bags))
    bowed2 = bow.transform(bags)
    assert np.all(bowed == bowed2)
    assert bow.codewords_.shape == (n_codewords, dim)

    minikmeans = MiniBatchKMeans(n_clusters=n_codewords, max_iter=100,
                                 random_state=47)
    minibow = BagOfWords(minikmeans)
    # transforming before fitting is an error
    assert_raises(AttributeError, lambda: minibow.transform(bags))

    minibowed = minibow.fit_transform(bags)
    assert minibowed.shape == bowed.shape
    # These two checks previously re-tested ``bowed``, so the mini-batch
    # output itself was never validated.
    assert np.all(minibowed >= 0)
    assert np.all(np.sum(minibowed, 1) == bag_sizes)
def test_knn_memory():
    """Check KNNDivergenceEstimator results when given a ``memory`` directory.

    After an initial fit_transform, a transform on the same bags must log
    nothing at INFO level, and a second fit_transform must not log any message
    starting with "Getting divergences"; all three results must be equal.
    """
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    # local import: only needed to clean up the temporary directory
    import shutil

    dim = 3
    n = 20
    np.random.seed(47)
    bags = Features(
        [np.random.randn(np.random.randint(30, 100), dim) for _ in xrange(n)])

    div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8')
    Ks = (3, 4)

    tdir = tempfile.mkdtemp()
    try:
        est = KNNDivergenceEstimator(div_funcs=div_funcs, Ks=Ks, memory=tdir)
        res1 = est.fit_transform(bags)

        with LogCapture('skl_groups.divergences.knn',
                        level=logging.INFO) as log:
            res2 = est.transform(bags)
            assert len(log.records) == 0
        assert np.all(res1 == res2)

        with LogCapture('skl_groups.divergences.knn',
                        level=logging.INFO) as log:
            res3 = est.fit_transform(bags)
            for record in log.records:
                assert not record.message.startswith("Getting divergences")
        assert np.all(res1 == res3)
    finally:
        # mkdtemp() never removes the directory itself; ignore_errors keeps a
        # cleanup problem from masking a real test failure.
        shutil.rmtree(tdir, ignore_errors=True)
def test_copying():
    """Check copy(), deepcopy(), .copy(stack=True) and pickling of Features.

    Each copy must compare equal to both the stacked and the unstacked
    original and keep the expected ``stacked`` flag. Shallow copies share the
    ``label`` array; deep copies and pickle round-trips do not.
    """
    bags = [np.random.normal(size=(np.random.randint(10, 100), 10))
            for _ in xrange(20)]

    unstacked = Features(bags, copy=False, stack=False, label=np.arange(20))
    stacked = Features(bags, stack=True, label=np.arange(20))
    assert unstacked == stacked

    unstacked_copy = copy(unstacked)
    assert not unstacked_copy.stacked
    assert stacked == unstacked_copy == unstacked
    assert unstacked_copy.label is unstacked.label
    assert not np.may_share_memory(unstacked[0], unstacked_copy[0])

    unstacked_deep = deepcopy(unstacked)
    assert not unstacked_deep.stacked
    assert stacked == unstacked_deep == unstacked
    assert unstacked_deep.label is not unstacked.label

    stacked_copy = copy(stacked)
    assert stacked_copy.stacked
    assert stacked == stacked_copy == unstacked
    assert stacked_copy.label is stacked.label

    stacked_deep = deepcopy(stacked)
    assert stacked_deep.stacked
    assert stacked == stacked_deep == unstacked
    assert stacked_deep.label is not stacked.label

    unstacked_stacked = unstacked.copy(stack=True)
    assert unstacked_stacked.stacked
    # Previously ``stacked == unstacked_stacked == stacked``, which never
    # compared the stacked copy against its unstacked source.
    assert stacked == unstacked_stacked == unstacked
    assert unstacked_stacked.label is unstacked.label

    unstacked_pickled = pickle.loads(pickle.dumps(unstacked))
    assert unstacked == unstacked_pickled
    assert not unstacked_pickled.stacked
    assert unstacked_pickled.label is not unstacked.label

    stacked_pickled = pickle.loads(pickle.dumps(stacked))
    assert stacked == stacked_pickled
    assert stacked_pickled.stacked
    assert stacked_pickled.label is not stacked.label
def distribution_divergence(X_s, X_l, k=10, n_jobs=4):
    """
    Compute the l2 and js divergences from samples of two distributions.

    The implementation uses `skl-groups`, which implements non-parametric
    estimation of divergences.

    Args:
        + X_s: a numpy array containing the point cloud in state space
        + X_l: a numpy array containing the point cloud in latent space
        + k: number of nearest neighbours passed to the estimator (as `Ks=[k]`)
        + n_jobs: number of parallel jobs passed to the estimator; defaults
          to 4, the value that used to be hard-coded

    Returns:
        A dict with keys 'l2_divergence' and 'js_divergence'. Both values are
        -1. when `X_s` has more than 50 columns, in which case nothing is
        estimated.
    """
    # We discard cases with too large dimensions
    if X_s.shape[1] > 50:
        return {'l2_divergence': -1., 'js_divergence': -1.}

    # We instantiate the divergence object
    div = KNNDivergenceEstimator(div_funcs=['l2', 'js'], Ks=[k],
                                 n_jobs=n_jobs, clamp=True)

    # We turn both data to float32
    X_s = X_s.astype(np.float32)
    X_l = X_l.astype(np.float32)

    # We generate Features: each point cloud is passed as one bag holding all
    # of its rows
    f_s = Features(X_s, n_pts=[X_s.shape[0]])
    f_l = Features(X_l, n_pts=[X_l.shape[0]])

    # We create the knn graph
    div.fit(X=f_s)

    # We compute the divergences; squeeze() is expected to leave exactly one
    # value per divergence function, in the order given to `div_funcs`
    l2, js = div.transform(X=f_l).squeeze()

    # We construct the returned dictionary
    return {'l2_divergence': l2, 'js_divergence': js}
def kNNdiv_Kernel(X_white, kernel, Knn=3, div_func='renyi:.5', Nref=None, compwise=True, njobs=1, W_ica_inv=None):
    ''' `div_func` kNN divergence estimate between some data X_white and a
    distribution specified by Kernel.

    Parameters
    ----------
    X_white : 2D array; its second axis indexes the components.
    kernel : object with a `sample(Nref)` method, or, when `compwise` is
        True, a sequence of such objects with one entry per column of
        `X_white`. `sample` may return an array or a tuple whose first
        element is the array of samples.
    Knn : int or sequence of ints, the k value(s) passed to the estimator.
    div_func : divergence name passed to KNNDivergenceEstimator.
    Nref : number of reference samples to draw. Required when `compwise` is
        True; otherwise forwarded unchanged to `kernel.sample`.
    compwise : if True, each column of the reference sample is drawn from its
        own kernel.
    njobs : passed to the estimator as `n_jobs`.
    W_ica_inv : optional matrix; when given, the reference sample is
        right-multiplied by its transpose. (Presumably an inverse ICA
        unmixing matrix -- confirm with caller.)

    Returns
    -------
    The estimate at index [0][0][0][1] of the estimator output when a single
    k is used, otherwise a 1D float array with one estimate per k.

    Raises
    ------
    TypeError : if `Knn` is neither an int nor a list/tuple/array.
    ValueError : if `compwise` is True and the number of kernels does not
        match the number of columns of `X_white`, or `Nref` is missing.
    '''
    # Normalize Knn to a list. Previously anything other than int/list left
    # `Knns` unbound and failed much later with a NameError.
    if isinstance(Knn, (int, np.integer)):
        Knns = [int(Knn)]
    elif isinstance(Knn, (list, tuple, np.ndarray)):
        Knns = list(Knn)
    else:
        raise TypeError(
            'Knn must be an int or a sequence of ints, got %s'
            % type(Knn).__name__)

    n_comp = X_white.shape[1]

    # construct reference "bag"
    if compwise:
        # if component wise there should be one kernel per component
        if n_comp != len(kernel):
            raise ValueError(
                'compwise=True requires one kernel per component: X_white '
                'has %d components but %d kernels were given'
                % (n_comp, len(kernel)))
        if Nref is None:
            raise ValueError('Nref must be specified when compwise=True')
        ref_dist = np.zeros((Nref, n_comp))
        for icomp in range(n_comp):
            samp = kernel[icomp].sample(Nref)
            if isinstance(samp, tuple):
                samp = samp[0]
            ref_dist[:, icomp] = samp.flatten()
    else:
        samp = kernel.sample(Nref)
        ref_dist = samp[0] if isinstance(samp, tuple) else samp

    if W_ica_inv is not None:
        ref_dist = np.dot(ref_dist, W_ica_inv.T)

    # estimate divergence between the data bag and the reference bag
    kNN = KNNDivergenceEstimator(div_funcs=[div_func], Ks=Knns,
                                 version='slow', clamp=False, n_jobs=njobs)
    feat = Features([X_white, ref_dist])
    div_knn = kNN.fit_transform(feat)

    # The output is indexed [div_func][K][bag_i][bag_j]; [0][1] is the
    # (X_white, reference) pair.
    if len(Knns) == 1:
        return div_knn[0][0][0][1]
    return np.array([div_knn[0][i][0][1] for i in range(len(Knns))],
                    dtype=float)
def test_basic():
    """Smoke-test the bag preprocessers on random bags.

    Covers BagStandardizer, BagMinMaxScaler, BagNormalizer, and a
    BagPreprocesser wrapping a transformer that changes the number of rows.
    """
    bags = []
    for _ in xrange(50):
        n_pts = np.random.randint(10, 100)
        bags.append(np.random.normal(5, 3, size=(n_pts, 20)))
    feats = Features(bags, stack=True)

    # standardizing: zero mean, unit std over the stacked points
    standardizer = BagStandardizer()
    standardized = standardizer.fit_transform(bags)
    standardized.make_stacked()
    stacked_std = standardized.stacked_features
    assert np.allclose(np.mean(stacked_std), 0)
    assert np.allclose(np.std(stacked_std), 1)

    # transforming a subset agrees with slicing the full result
    head = standardizer.transform(bags[:5])
    assert head == standardized[:5]

    # min-max scaling into [3, 7], checked per feature
    scaler = BagMinMaxScaler([3, 7])
    scaled = scaler.fit_transform(feats)
    scaled.make_stacked()
    assert np.allclose(np.min(scaled.stacked_features, 0), 3)
    assert np.allclose(np.max(scaled.stacked_features, 0), 7)

    # l1 normalization: every point has unit l1 norm
    l1_normer = BagNormalizer('l1')
    l1_normed = l1_normer.fit_transform(Features(bags))
    l1_normed.make_stacked()
    row_l1 = np.sum(np.abs(l1_normed.stacked_features), 1)
    assert np.allclose(row_l1, 1)

    class GetMean(BaseEstimator, TransformerMixin):
        """Transformer returning a single row, i.e. it changes the row count."""

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X.mean(axis=1)[None, :]

    wrapped = BagPreprocesser(GetMean())
    assert_raises(ValueError, lambda: wrapped.transform(bags))
def test_knn_sanity_slow():
    """Sanity checks for KNNDivergenceEstimator on random bags.

    Covers output shape and finiteness, the JS failure with a much larger
    bag, fit()/transform() with a different-sized test bag, and the rejection
    of duplicated divergence functions.
    """
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    dim = 3
    n = 20
    np.random.seed(47)
    raw_bags = []
    for _ in xrange(n):
        n_pts = np.random.randint(30, 100)
        raw_bags.append(np.random.randn(n_pts, dim))
    bags = Features(raw_bags)

    # just make sure it runs
    div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8')
    Ks = (3, 4)
    est = KNNDivergenceEstimator(div_funcs=div_funcs, Ks=Ks)
    res = est.fit_transform(bags)
    expected_shape = (len(div_funcs), len(Ks), n, n)
    assert res.shape == expected_shape
    assert np.all(np.isfinite(res))

    # test that JS blows up when there's a huge difference in bag sizes
    # (so that K is too low)
    with_huge_bag = bags + [np.random.randn(1000, dim)]
    assert_raises(ValueError, lambda: est.fit_transform(with_huge_bag))

    # test fit() and then transform() with JS, with different-sized test bags
    est = KNNDivergenceEstimator(div_funcs=('js', ), Ks=(5, ))
    est.fit(bags, get_rhos=True)
    with LogCapture('skl_groups.divergences.knn',
                    level=logging.WARNING) as captured:
        res = est.transform([np.random.randn(300, dim)])
        assert res.shape == (1, 1, 1, len(bags))
        assert len(captured.records) == 1
        first_msg = captured.records[0].message
        assert first_msg.startswith('Y_rhos had a lower max_K')

    # test that passing div func more than once raises
    def fit_with_duplicate(df):
        dup_est = KNNDivergenceEstimator(div_funcs=[df, df])
        return dup_est.fit(bags)

    for name in ('kl', 'renyi:.8', 'l2'):
        assert_raises(ValueError, lambda: fit_with_duplicate(name))
def kNNdiv_general(
        X,
        Y,
        Knn=3,
        div_func='kl',
        alpha=None,
        njobs=1,
):  #renyi:.5
    """ kNN divergence estimate for samples drawn from any two arbitrary
    distributions.

    Parameters
    ----------
    X, Y : 2D arrays of samples with the same number of columns.
    Knn : int or sequence of ints, the k value(s) passed to the estimator.
    div_func : divergence name passed to KNNDivergenceEstimator.
    alpha : optional; when given it is appended to `div_func` as
        '<div_func>:<alpha>'.
    njobs : passed to the estimator as `n_jobs`.

    Returns
    -------
    The estimate at index [0][0][0][1] of the estimator output when a single
    k is used, otherwise a 1D float array with one estimate per k.

    Raises
    ------
    ValueError : if X and Y have a different number of columns.
    TypeError : if `Knn` is neither an int nor a list/tuple/array.
    """
    if Y.shape[1] != X.shape[1]:
        # The message used to mention "X_white" and a Gaussian reference,
        # neither of which exists in this function.
        raise ValueError(
            'dimensions of X (%d) and Y (%d) do not match'
            % (X.shape[1], Y.shape[1]))

    # Normalize Knn to a list. Previously anything other than int/list left
    # `Knns` unbound and failed later with a NameError.
    if isinstance(Knn, (int, np.integer)):
        Knns = [int(Knn)]
    elif isinstance(Knn, (list, tuple, np.ndarray)):
        Knns = list(Knn)
    else:
        raise TypeError(
            'Knn must be an int or a sequence of ints, got %s'
            % type(Knn).__name__)

    if alpha is not None:
        div_func = div_func + ':%s' % alpha

    kNN = KNNDivergenceEstimator(div_funcs=[div_func], Ks=Knns,
                                 version='slow', clamp=False, n_jobs=njobs)
    feat = Features([X, Y])
    div_knn = kNN.fit_transform(feat)

    # The output is indexed [div_func][K][bag_i][bag_j]; [0][1] is the
    # (X, Y) pair.
    if len(Knns) == 1:
        return div_knn[0][0][0][1]
    return np.array([div_knn[0][i][0][1] for i in range(len(Knns))],
                    dtype=float)
def kNNdiv_gauss(X_white, cov_X, Knn=3, div_func='renyi:.5', gauss=None, Nref=None, njobs=1):
    ''' `div_func` kNN divergence estimate between X_white and a reference
    Gaussian with covariance matrix cov_X.

    Parameters
    ----------
    X_white : 2D array of samples.
    cov_X : covariance matrix of the zero-mean reference Gaussian; only used
        when `gauss` is None.
    Knn : int or sequence of ints, the k value(s) passed to the estimator.
    div_func : divergence name passed to KNNDivergenceEstimator.
    gauss : optional pre-drawn reference sample with the same number of
        columns as `X_white`. When None, `Nref` samples are drawn.
    Nref : number of reference samples to draw when `gauss` is None.
    njobs : passed to the estimator as `n_jobs`.

    Returns
    -------
    The estimate at index [0][0][0][1] of the estimator output when a single
    k is used, otherwise a 1D float array with one estimate per k.

    Raises
    ------
    ValueError : if neither `gauss` nor `Nref` is given, or the reference
        sample and `X_white` have a different number of columns.
    TypeError : if `Knn` is neither an int nor a list/tuple/array.
    '''
    if gauss is None:
        if Nref is None:
            raise ValueError(
                'either gauss or Nref must be specified to build the '
                'Gaussian reference distribution')
        # Gaussian reference distribution
        gauss = np.random.multivariate_normal(
            np.zeros(X_white.shape[1]), cov_X, size=Nref)

    if gauss.shape[1] != X_white.shape[1]:
        raise ValueError(
            'dimension between X_white and Gaussian reference distribution do not match'
        )

    # Normalize Knn to a list. Previously anything other than int/list left
    # `Knns` unbound and failed later with a NameError.
    if isinstance(Knn, (int, np.integer)):
        Knns = [int(Knn)]
    elif isinstance(Knn, (list, tuple, np.ndarray)):
        Knns = list(Knn)
    else:
        raise TypeError(
            'Knn must be an int or a sequence of ints, got %s'
            % type(Knn).__name__)

    kNN = KNNDivergenceEstimator(div_funcs=[div_func], Ks=Knns,
                                 version='slow', clamp=False, n_jobs=njobs)
    feat = Features([X_white, gauss])
    div_knn = kNN.fit_transform(feat)

    # The output is indexed [div_func][K][bag_i][bag_j]; [0][1] is the
    # (X_white, gauss) pair.
    if len(Knns) == 1:
        return div_knn[0][0][0][1]
    return np.array([div_knn[0][i][0][1] for i in range(len(Knns))],
                    dtype=float)
def computePairwiseSimilarities2(patients, y):
    """
    Compute the pairwise similarity between bags (patients).

    Inputs:
    - patients: the collection of patient features
    - y: labels (number of abnormal nodes) for each patient; attached to the
      Features object as metadata

    Returns:
    - sims: the output of the pipeline below. Per the original notes this is
      an NxN symmetric matrix, N being the number of patients.
    """
    # wrap the bags; the labels ride along as metadata only
    patientFeats = Features(patients, labels=y)

    # KL divergence estimated with 3 nearest neighbors
    klEstimator = KNNDivergenceEstimator(div_funcs=['kl'], Ks=[3],
                                         n_jobs=-1, version='fast')

    # NOTE(review): PairwisePicker((0, 0)) presumably selects the first
    # divergence function and first K from the estimator output -- confirm.
    stages = [
        ('divs', klEstimator),
        ('pick', PairwisePicker((0, 0))),
        ('symmetrize', Symmetrize()),
        ('rbf', RBFize(gamma=1, scale_by_median=True)),
        ('project', ProjectPSD()),
    ]

    # run the whole chain and hand back the resulting similarities
    return Pipeline(stages).fit_transform(patientFeats)
def test_knn_version_consistency():
    """Check that the 'fast' and 'slow' estimator versions agree.

    For 1- and 7-dimensional bags, every version must give finite output of
    the expected shape, 'fast' and 'slow' must match to a tolerance, and
    running with n_jobs=-1 must reproduce the earlier result exactly.
    """
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")
    if not have_accel:
        raise SkipTest("No skl-groups-accel, so skipping version consistency.")

    n = 20
    div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8')
    Ks = (3, 4)
    expected_shape = (len(div_funcs), len(Ks), n, n)

    for dim in [1, 7]:
        np.random.seed(47)
        raw_bags = []
        for _ in xrange(n):
            n_pts = np.random.randint(30, 100)
            raw_bags.append(np.random.randn(n_pts, dim))
        bags = Features(raw_bags)

        make_est = partial(KNNDivergenceEstimator, div_funcs=div_funcs, Ks=Ks)

        results = {}
        for version in ('fast', 'slow', 'best'):
            out = make_est(version=version).fit_transform(bags)
            results[version] = out
            assert out.shape == expected_shape
            assert np.all(np.isfinite(out))

        pairs = zip(div_funcs, results['fast'], results['slow'])
        for df, fast, slow in pairs:
            # TODO: debug JS differences
            places = 1 if df == 'js' else 5
            assert_array_almost_equal(
                fast, slow, decimal=places,
                err_msg="({}, dim {})".format(df, dim))

        # all-core runs must reproduce the default-n_jobs results exactly
        for version in ('fast', 'slow'):
            parallel = make_est(version=version, n_jobs=-1).fit_transform(bags)
            assert np.all(results[version] == parallel)
def computeSubjSubjKernel(subjects, div='KL', numNeighbors=3):
    """
    Compute a subject-by-subject kernel from pairwise divergences between
    subjects.

    For KL and HE, kNN divergence estimates are symmetrized, RBF-ized and
    projected onto the positive semi-definite cone. KL divergences are
    divided by their median first; HE divergences are used unscaled. For MMD,
    squared RBF-MMD distances are symmetrized and exponentiated.

    Inputs:
    - subjects: the collection of patient features
    - div: which divergence to use. Options are
        - 'KL': Kullback-Leibler divergence, 'kl' in the estimator (default)
        - 'HE': Hellinger divergence, 'hellinger' in the estimator
        - 'MMD': Maximum Mean Discrepancy
    - numNeighbors: how many neighbors to look at. Default is 3

    Returns:
    - kernel: the kernel calculated using the pairwise similarities between
              each subject, or -1 (after printing an error message) when
              `div` is not one of the options above
    * Note: kernel is a NxN symmetric matrix, where N is the number of subjects
    """
    # Validate the cheap argument first so an invalid `div` is reported
    # before any feature processing is done.
    if div not in ('KL', 'HE', 'MMD'):
        print("Error: divergence entered is not valid.")
        return -1

    # wrap the subjects in a Features object
    feats = Features(subjects)

    if div == 'KL':
        return _knnDivergenceKernel(feats, 'kl', numNeighbors,
                                    scaleByMedian=True)
    if div == 'HE':
        # the median scaling is deliberately not applied for Hellinger
        return _knnDivergenceKernel(feats, 'hellinger', numNeighbors,
                                    scaleByMedian=False)
    return _mmdKernel(feats, numNeighbors)


def _knnDivergenceKernel(feats, divFunc, numNeighbors, scaleByMedian):
    """Build a PSD kernel from symmetrized kNN estimates of `divFunc`."""
    # NOTE(review): PairwisePicker((0, 0)) presumably selects the single
    # (divergence function, K) slice of the estimator output -- confirm.
    distEstModel = Pipeline([
        ('divs', KNNDivergenceEstimator(div_funcs=[divFunc],
                                        Ks=[numNeighbors],
                                        n_jobs=-1, version='fast')),
        ('pick', PairwisePicker((0, 0))),
        ('symmetrize', Symmetrize()),
    ])
    sims = distEstModel.fit_transform(feats)

    # the scale-by-median step is done by hand rather than by RBFize
    if scaleByMedian:
        sims = sims / np.median(sims[np.triu_indices_from(sims)])
    rbfedSims = RBFize(gamma=1, scale_by_median=False).fit_transform(sims)

    # final step: project the rbf'ed similarities onto a positive
    # semi-definite space
    return ProjectPSD().fit_transform(rbfedSims)


def _mmdKernel(feats, numNeighbors):
    """Build a kernel by exponentiating symmetrized squared RBF-MMD distances."""
    # median pairwise squared distance over a random subset of at most 2000
    # points, used as a heuristic for the inner RBF kernel's bandwidth
    subset = np.vstack(feats)
    nPoints = subset.shape[0]
    subset = subset[np.random.choice(nPoints, min(2000, nPoints),
                                     replace=False)]
    subsetSquaredDists = euclidean_distances(subset, squared=True)
    # NOTE(review): numNeighbors is used as the diagonal offset `k` here and
    # below; an offset of 1 (skip only the diagonal) may have been intended.
    # Left unchanged to preserve results.
    featsMedianSquaredDist = np.median(
        subsetSquaredDists[np.triu_indices_from(subsetSquaredDists,
                                                k=numNeighbors)],
        overwrite_input=True)

    # gamma (the inverse of the bandwidth) is computed by hand instead of
    # relying on the library's own scaling
    firstGamma = 1 / featsMedianSquaredDist

    # squared MMDs; the X-diagonal returned alongside them is not used
    mmds, _ = mmd.rbf_mmd(feats, gammas=firstGamma, squared=True,
                          ret_X_diag=True)

    # turn the squared MMD distances into a kernel: symmetrize, then
    # exponentiate, scaled by the median squared MMD
    mmds = Symmetrize().fit_transform(mmds)
    mmdMedianSquaredDist = np.median(
        mmds[np.triu_indices_from(mmds, k=numNeighbors)])
    return np.exp(np.multiply(mmds, -1 / mmdMedianSquaredDist))
def test_features_basic():
    """Exercise the basic Features API: construction, indexing, stacking,
    copying and rejection of malformed constructor input.

    ``wrong_type`` is a helper defined elsewhere in the file; presumably it
    asserts that calling its argument raises TypeError.
    """
    bags = []
    for _ in xrange(20):
        n_pts = np.random.randint(10, 100)
        bags.append(np.random.normal(size=(n_pts, 10)))
    n_total = sum(bag.shape[0] for bag in bags)

    assert repr(Features([[[8, 9], [12, 12]]]))

    # ---- unstacked, sharing memory with the input bags -------------------
    feats = Features(bags, copy=False, stack=False, label=np.arange(20))
    assert len(feats) == 20
    assert feats.total_points == n_total
    assert np.all(feats[3] == bags[3])
    assert np.all(feats.label == np.arange(20))
    assert repr(feats)
    assert feats.dtype == np.float64
    assert feats != bags
    assert feats.bare() == bags

    # fancy indexing keeps the metadata aligned with the picked bags
    picked = feats[[5, 2]]
    assert np.all(picked.label == [5, 2])
    assert np.all(picked[0] == feats[5])
    assert np.all(picked[1] == feats[2])
    assert repr(picked)

    # copy=False: writing through feats is visible in the original bag
    feats[4][0, 0] = 1000
    assert bags[4][0, 0] == 1000

    # ---- after stacking in place -----------------------------------------
    feats.make_stacked()
    assert feats != bags
    assert feats.bare() == bags
    assert len(feats) == 20
    assert feats.total_points == n_total
    assert np.all(feats[3] == bags[3])
    assert np.all(feats.label == np.arange(20))

    # per-bag entries and the stacked array refer to the same data
    feats[0][0, 0] = -800
    assert feats.features[0][0, 0] == -800
    assert feats.stacked_features[0, 0] == -800
    assert repr(feats)

    # invalid index types
    wrong_type(lambda: feats['label'])
    wrong_type(lambda: feats[['label']])
    wrong_type(lambda: feats[[3, 'label']])

    # ---- copying ----------------------------------------------------------
    duplicate = feats.copy()
    assert duplicate == feats
    assert duplicate.stacked
    duplicate.make_stacked()
    assert duplicate == feats
    duplicate[0][0, 0] = 12
    assert duplicate != feats
    assert repr(duplicate)

    # ---- malformed constructor input ---------------------------------------
    def fs(*a, **kw):
        # deferred constructor call, to be invoked by the checking helper
        return partial(Features, *a, **kw)

    # a single 2D array with missing or inconsistent n_pts
    bags = np.random.normal(size=(10, 5))
    wrong_type(fs(bags))
    for bad_n_pts in ([[4], [12]], [], [-3, 13], [7.5, 2.5], [7, 2]):
        wrong_type(fs(bags, bad_n_pts))
    wrong_type(fs(np.zeros((10, 0)), [7, 3]))

    # a list of bags must not be combined with n_pts
    bags = [np.random.normal(size=(5, 8)), np.random.normal(size=(6, 8))]
    wrong_type(fs(bags, [5, 6]))

    # a 1D bag becomes a single-row bag
    row_feats = Features([[5, 6], [[7, 9], [0, 0]]])
    assert np.all(row_feats[0] == np.reshape([5, 6], (1, 2)))

    wrong_type(fs([ [[[5]]] ]))
    wrong_type(fs([["hello", "there"]]))
    wrong_type(fs([[np.arange(10, dtype=int)], [np.arange(10, dtype=float)]]))
    wrong_type(fs([np.random.randn(8, 7), np.random.randn(0, 7)]))

    tiny = Features([[[1, 2]], [[3, 4]]], stack=True)
    assert np.all(tiny.stacked_features == [[1, 2], [3, 4]])

    # metadata of the wrong length is a ValueError
    assert_raises(ValueError, fs(bags, labels=np.arange(3)))

    # this metadata name triggers exactly one warning
    with warnings.catch_warnings(record=True) as caught:
        Features(bags, total_points=[1, 2])
        assert len(caught) == 1