def test_remove_orphans(self):
    """Check that `remove_orphans` drops the row with zeros in both columns.

    Rows that have a zero in only one of 'close' / 'distal' are expected
    to be kept, in their original order.
    """
    rows = [[1.0, 0.2],
            [0.5, 0.4],
            [0.0, 0.0],
            [0.8, 0.0],
            [0.0, 0.7]]
    me = Analyze()
    me.df = pd.DataFrame(rows, columns=['close', 'distal'])
    me.remove_orphans()

    # everything except the third row (index 2) should survive
    exp = rows[:2] + rows[3:]
    self.assertListEqual(me.df.values.tolist(), exp)
def test_remove_outliers(self):
    """Check the row count left by `remove_outliers` for each method.

    The same two-column ('close', 'distal') table is filtered once with
    the 'zscore' method and once with the 'boxplot' method; each run
    starts from a fresh copy of the table.
    """
    me = Analyze()
    me.self_low = False
    scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
    df = pd.DataFrame(scores, columns=['close', 'distal'])

    # (outlier removal method, expected number of remaining rows)
    cases = (('zscore', 781),
             ('boxplot', 710))
    for method, exp in cases:
        me.df = df.copy()
        me.outliers = method
        me.remove_outliers()
        self.assertEqual(me.df.shape[0], exp)
def test_plot_hgts(self):
    """Check that `plot_hgts` writes 'scatter.png' into the output directory.

    The generated file is deleted again once its existence is confirmed.
    """
    me = Analyze()
    me.output = self.tmpdir
    scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
    me.df = pd.DataFrame(scores, columns=['close', 'distal'])

    # flag rows with a low 'close' score and a high 'distal' score
    low_close = me.df['close'] < 2
    high_distal = me.df['distal'] > 2
    me.df['hgt'] = low_close & high_distal

    me.plot_hgts()
    plot_fp = join(self.tmpdir, 'scatter.png')
    self.assertTrue(isfile(plot_fp))
    remove(plot_fp)
def test_smart_kde(self):
    """Check the threshold returned by `smart_kde` and the plot it writes.

    Two scenarios are covered: the concatenation of two fixture
    distributions, for which a threshold is expected along with a
    'group.kde.png' file, and a single distribution with a very small
    `low_part`, for which 0 is expected.
    """
    me = Analyze()

    # typical case (bimodal distribution)
    bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
    me.df = pd.Series(bimodal, name='group').to_frame()
    settings = {'bw_steps': 10,
                'noise': 50,
                'low_part': 75,
                'output': self.tmpdir}
    for key, value in settings.items():
        setattr(me, key, value)
    threshold = me.smart_kde('group')
    self.assertAlmostEqual(threshold, 2.1903958075763343)
    plot_fp = join(self.tmpdir, 'group.kde.png')
    self.assertTrue(isfile(plot_fp))
    remove(plot_fp)

    # unable to determine threshold
    me.low_part = 0.001
    me.df = pd.Series(self.dist_norm1, name='group').to_frame()
    self.assertEqual(me.smart_kde('group'), 0)
def test_predict_hgt(self):
    """Run `predict_hgt` end to end on a randomly sampled score table.

    The first run uses a 1000-row table and checks the number of
    predictions plus the files written to the output directory. The
    second run replaces the 'close' column with a constant and expects
    no predictions and no 'hgt' column.

    NOTE(review): the expected count of 96 depends on the state of
    NumPy's global random generator, which is presumably seeded in the
    fixture set-up -- confirm. The sampling calls below must stay in
    this exact order.
    """
    me = Analyze()

    # populate score table; sampling order: self, close, distal
    n = 1000
    self_scores = np.random.choice(self.dist_gamma, n)
    close_scores = np.concatenate((
        np.random.choice(self.dist_norm1, n // 2) / 3,
        np.random.choice(self.dist_norm2, n // 2)))
    distal_scores = np.concatenate((
        np.random.choice(self.dist_lognorm, n * 3 // 4),
        np.random.choice(self.dist_gamma, n // 4) / 2))
    me.df = pd.DataFrame({
        'sample': ['S1'] * n,
        'protein': [f'P{i}' for i in range(n)],
        'self': self_scores,
        'close': close_scores,
        'distal': distal_scores,
        'match': ['0'] * n,
    })

    # default setting
    settings = {
        'output': self.tmpdir,
        'self_low': False,
        'bandwidth': 'auto',
        'bw_steps': 20,
        'low_part': 75,
        'fixed': 25,
        'noise': 50,
        'silhouette': 0.5,
        'taxdump': {},
        'donor_name': False,
        'donor_rank': None,
    }
    for key, value in settings.items():
        setattr(me, key, value)

    # run prediction
    self.assertEqual(me.predict_hgt(), 96)

    # one histogram per non-self group, then the scatter plot
    for fname in ('close.hist.png', 'distal.hist.png', 'scatter.png'):
        fp = join(self.tmpdir, fname)
        self.assertTrue(isfile(fp))
        remove(fp)

    # per-sample list of predictions
    hgt_dir = join(self.tmpdir, 'hgts')
    self.assertTrue(isfile(join(hgt_dir, 'S1.txt')))
    rmtree(hgt_dir)

    # constant values
    me.df['close'] = 1
    me.df.drop(columns=['hgt'], inplace=True)
    self.assertEqual(me.predict_hgt(), 0)
    self.assertNotIn('hgt', me.df.columns)
    remove(join(self.tmpdir, 'close.hist.png'))
def test_refine_cluster(self):
    """Check how many rows stay flagged after `refine_cluster`.

    The 'hgt' column is initialised with the same rule in both
    scenarios (close < 2 and distal > 2). `refine_cluster` is then fed
    the result of `calc_cluster_props`, first with two score columns and
    `self_low` off, then with three score columns and `self_low` on.
    """
    me = Analyze()
    me.silhouette = 0.5

    # only close and distal
    me.self_low = False
    two_groups = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
    me.df = pd.DataFrame(two_groups, columns=['close', 'distal'])
    low_close = me.df['close'] < 2
    high_distal = me.df['distal'] > 2
    me.df['hgt'] = low_close & high_distal
    props = me.calc_cluster_props()
    me.refine_cluster(props)
    self.assertEqual(me.df[me.df['hgt']].shape[0], 11)

    # all three groups
    me.self_low = True
    three_groups = np.array([self.dist_norm1[:800],
                             self.dist_gamma,
                             self.dist_lognorm[:800]]).T
    me.df = pd.DataFrame(three_groups, columns=['self', 'close', 'distal'])
    low_close = me.df['close'] < 2
    high_distal = me.df['distal'] > 2
    me.df['hgt'] = low_close & high_distal
    props = me.calc_cluster_props()
    me.refine_cluster(props)
    self.assertEqual(me.df[me.df['hgt']].shape[0], 4)
def test_calc_cluster_props(self):
    """Check the values returned by `calc_cluster_props` and its 'silh' column.

    The first two elements of the return value are compared against
    reference numbers. The 'silh' column found in the table after the
    call is checked through its mean, its standard deviation, and the
    number of flagged rows scoring below 0.5.
    """
    me = Analyze()
    me.self_low = False
    scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
    me.df = pd.DataFrame(scores, columns=['close', 'distal'])
    low_close = me.df['close'] < 2
    high_distal = me.df['distal'] > 2
    me.df['hgt'] = low_close & high_distal

    # returned properties
    props = me.calc_cluster_props()
    self.assertAlmostEqual(props[0], 1.094658052928843)
    self.assertAlmostEqual(props[1], 4.30076698399293)

    # summary of the 'silh' column
    stats = me.df['silh'].describe()
    self.assertAlmostEqual(stats['mean'], 0.312495082044277)
    self.assertAlmostEqual(stats['std'], 0.21945541659155993)

    # flagged rows with a low 'silh' value
    weak = me.df.query('hgt & silh < 0.5')
    self.assertEqual(weak.shape[0], 35)
def test_cluster_kde(self):
    """Check the threshold from `cluster_kde` under each bandwidth setting.

    Four settings are tried in turn on the concatenation of two fixture
    distributions: 'grid', 'silverman', the fixed number 0.5, and 'auto'.
    A single distribution is then analysed with 'silverman', for which
    0 is expected.
    """
    me = Analyze()
    bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
    me.df = pd.Series(bimodal, name='group').to_frame()
    me.bw_steps = 10
    me.noise = 50
    me.low_part = 75
    me.output = self.tmpdir

    # (bandwidth setting, expected threshold), evaluated in this order
    cases = (
        ('grid', 1.855525575742988),           # grid search
        ('silverman', 2.2279977615745703),     # Silverman's rule-of-thumb
        (0.5, 2.2507008281395433),             # fixed value
        ('auto', 2.1903958075763343),          # smart KDE
    )
    for bandwidth, exp in cases:
        me.bandwidth = bandwidth
        threshold = me.cluster_kde('group')
        self.assertAlmostEqual(threshold, exp)

    # clean up
    remove(join(self.tmpdir, 'group.kde.png'))

    # cannot find threshold (unimodal distribution)
    me.df = pd.Series(self.dist_norm1, name='group').to_frame()
    me.bandwidth = 'silverman'
    self.assertEqual(me.cluster_kde('group'), 0)
def test_write_hgt_list(self):
    """Check the per-sample text files produced by `write_hgt_list`.

    Four situations are covered: default output for sample S1, output
    for sample S2 (number formatting, and the row with hgt == False
    being left out), output with `donor_rank` set to 'family', and
    output with `donor_name` switched on.
    """
    me = Analyze()
    me.output = self.tmpdir
    hgt_dir = join(me.output, 'hgts')
    makedirs(hgt_dir, exist_ok=True)
    me.donor_name = False
    me.donor_rank = None
    me.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(me.taxdump)
    me.df = pd.DataFrame(
        [['S1', 'P1', 0.85, '562', True],
         ['S1', 'P2', 0.95, '622', True],
         ['S1', 'P3', 1.05, '0', True],
         ['S2', 'P4', 0.80, '766', True],
         ['S2', 'P5', 0.20, '0', False]],
        columns=['sample', 'protein', 'silh', 'match', 'hgt'])

    def written(sample):
        # write the list for one sample and return the file content
        me.write_hgt_list(sample)
        with open(join(hgt_dir, f'{sample}.txt'), 'r') as fh:
            return fh.read()

    # default
    exp = ('P1\t0.85\t562\n'
           'P2\t0.95\t622\n'
           'P3\t1.05\t0\n')
    self.assertEqual(written('S1'), exp)

    # number format and negative result
    self.assertEqual(written('S2'), 'P4\t0.8\t766\n')

    # raise to family
    me.donor_rank = 'family'
    exp = ('P1\t0.85\t543\n'
           'P2\t0.95\t543\n'
           'P3\t1.05\t0\n')
    self.assertEqual(written('S1'), exp)

    # report taxon name
    me.donor_rank = None
    me.donor_name = True
    exp = ('P1\t0.85\tEscherichia coli\n'
           'P2\t0.95\tShigella dysenteriae\n'
           'P3\t1.05\tN/A\n')
    self.assertEqual(written('S1'), exp)

    rmtree(hgt_dir)