def test_perform_kde(self):
    # Checks the third element returned by Analyze.perform_kde (presumably
    # the bandwidth that was actually used -- confirm against perform_kde)
    # for each bandwidth setting, and that an unsupported value raises.
    me = Analyze()
    me.bw_steps = 10
    data = np.concatenate((self.dist_norm1, self.dist_norm2))

    # (bandwidth setting, expected third element of the result)
    cases = (
        ('grid', 0.21544346900318834),        # grid search
        ('silverman', 0.48713295460585126),   # Silverman's rule-of-thumb
        (0.5, 0.5),                           # fixed value
    )
    for setting, expected in cases:
        me.bandwidth = setting
        self.assertAlmostEqual(me.perform_kde(data)[2], expected)

    # invalid bandwidth
    me.bandwidth = 100
    with self.assertRaises(ValueError) as ctx:
        me.perform_kde(data)
    self.assertEqual(str(ctx.exception), 'Invalid bandwidth: 100.')
def test_predict_hgt(self):
    # Runs Analyze.predict_hgt on a synthetic score table, checks its return
    # value and the files it leaves in the output directory, then repeats
    # with a constant 'close' column.
    me = Analyze()

    # populate score table
    # NOTE: the random draws below are kept in their original order; the
    # expected count further down presumably relies on a seeded generator
    # (seeding is not visible in this method).
    n = 1000
    half, quarter, three_quarters = n // 2, n // 4, n * 3 // 4

    self_scores = np.random.choice(self.dist_gamma, n)

    close_low = np.random.choice(self.dist_norm1, half) / 3
    close_high = np.random.choice(self.dist_norm2, half)
    close_scores = np.concatenate((close_low, close_high))

    distal_main = np.random.choice(self.dist_lognorm, three_quarters)
    distal_rest = np.random.choice(self.dist_gamma, quarter) / 2
    distal_scores = np.concatenate((distal_main, distal_rest))

    me.df = pd.DataFrame({
        'sample': ['S1'] * n,
        'protein': [f'P{i}' for i in range(n)],
        'self': self_scores,
        'close': close_scores,
        'distal': distal_scores,
        'match': ['0'] * n,
    })

    # default setting (applied in the listed order)
    settings = {
        'output': self.tmpdir,
        'self_low': False,
        'bandwidth': 'auto',
        'bw_steps': 20,
        'low_part': 75,
        'fixed': 25,
        'noise': 50,
        'silhouette': 0.5,
        'taxdump': {},
        'donor_name': False,
        'donor_rank': None,
    }
    for attr, value in settings.items():
        setattr(me, attr, value)

    # run prediction
    self.assertEqual(me.predict_hgt(), 96)

    # a histogram is expected for every group except 'self'
    for group in ('close', 'distal'):
        hist = join(self.tmpdir, f'{group}.hist.png')
        self.assertTrue(isfile(hist))
        remove(hist)

    scatter = join(self.tmpdir, 'scatter.png')
    self.assertTrue(isfile(scatter))
    remove(scatter)

    hgt_dir = join(self.tmpdir, 'hgts')
    self.assertTrue(isfile(join(hgt_dir, 'S1.txt')))
    rmtree(hgt_dir)

    # constant values
    me.df['close'] = 1
    me.df.drop(columns='hgt', inplace=True)
    self.assertEqual(me.predict_hgt(), 0)
    self.assertNotIn('hgt', me.df.columns)
    remove(join(self.tmpdir, 'close.hist.png'))
def test_cluster_kde(self):
    # Checks the value returned by Analyze.cluster_kde for a two-component
    # sample under each bandwidth setting, then for a single-component
    # sample where 0 is expected.
    me = Analyze()
    mixed = np.concatenate((self.dist_norm1, self.dist_norm2))
    series = pd.Series(mixed, name='group')
    me.df = series.to_frame()
    me.bw_steps = 10
    me.noise = 50
    me.low_part = 75
    me.output = self.tmpdir

    # (bandwidth setting, expected return value)
    cases = (
        ('grid', 1.855525575742988),        # grid search
        ('silverman', 2.2279977615745703),  # Silverman's rule-of-thumb
        (0.5, 2.2507008281395433),          # fixed value
        ('auto', 2.1903958075763343),       # smart KDE
    )
    for setting, expected in cases:
        me.bandwidth = setting
        self.assertAlmostEqual(me.cluster_kde('group'), expected)

    # clean up
    remove(join(self.tmpdir, 'group.kde.png'))

    # cannot find threshold (unimodal distribution)
    unimodal = pd.Series(self.dist_norm1, name='group')
    me.df = unimodal.to_frame()
    me.bandwidth = 'silverman'
    self.assertEqual(me.cluster_kde('group'), 0)