def test_plot_hgts(self):
    """Check that plot_hgts writes ``scatter.png`` to the output directory."""
    analyzer = Analyze()
    analyzer.output = self.tmpdir

    # Stack the two distributions as rows, then transpose so that each one
    # becomes a column. The lognormal sample is cut to its first 800 values
    # (presumably to match the length of dist_gamma -- confirm in setUp).
    scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
    analyzer.df = pd.DataFrame(scores, columns=['close', 'distal'])

    # Mark rows whose close score is below 2 and distal score is above 2.
    low_close = analyzer.df['close'] < 2
    high_distal = analyzer.df['distal'] > 2
    analyzer.df['hgt'] = low_close & high_distal

    analyzer.plot_hgts()

    plot_path = join(self.tmpdir, 'scatter.png')
    self.assertTrue(isfile(plot_path))
    remove(plot_path)
def test_predict_hgt(self):
    """Run predict_hgt on a synthetic score table and verify its outputs."""
    analyzer = Analyze()

    # populate score table
    n = 1000
    half, quarter, three_quarters = n // 2, n // 4, n * 3 // 4

    # NOTE: the random draws below must stay in this order (self, close,
    # distal); the expected count asserted later depends on the sequence of
    # values produced by the random generator.
    self_scores = np.random.choice(self.dist_gamma, n)
    close_scores = np.concatenate((
        np.random.choice(self.dist_norm1, half) / 3,
        np.random.choice(self.dist_norm2, half)))
    distal_scores = np.concatenate((
        np.random.choice(self.dist_lognorm, three_quarters),
        np.random.choice(self.dist_gamma, quarter) / 2))

    analyzer.df = pd.DataFrame({
        'sample': ['S1'] * n,
        'protein': [f'P{i}' for i in range(n)],
        'self': self_scores,
        'close': close_scores,
        'distal': distal_scores,
        'match': ['0'] * n,
    })

    # default setting
    settings = {
        'output': self.tmpdir,
        'self_low': False,
        'bandwidth': 'auto',
        'bw_steps': 20,
        'low_part': 75,
        'fixed': 25,
        'noise': 50,
        'silhouette': 0.5,
        'taxdump': {},
        'donor_name': False,
        'donor_rank': None,
    }
    for attr, value in settings.items():
        setattr(analyzer, attr, value)

    # run prediction
    self.assertEqual(analyzer.predict_hgt(), 96)

    # histograms are checked for the close and distal groups only
    for group in ('close', 'distal'):
        hist_path = join(self.tmpdir, f'{group}.hist.png')
        self.assertTrue(isfile(hist_path))
        remove(hist_path)

    scatter_path = join(self.tmpdir, 'scatter.png')
    self.assertTrue(isfile(scatter_path))
    remove(scatter_path)

    hgt_dir = join(self.tmpdir, 'hgts')
    self.assertTrue(isfile(join(hgt_dir, 'S1.txt')))
    rmtree(hgt_dir)

    # constant values
    analyzer.df['close'] = 1
    analyzer.df.drop(columns='hgt', inplace=True)
    self.assertEqual(analyzer.predict_hgt(), 0)
    self.assertNotIn('hgt', analyzer.df.columns)
    remove(join(self.tmpdir, 'close.hist.png'))
def test_write_hgt_list(self):
    """Verify the per-sample lists written by write_hgt_list.

    Covers the default output, number formatting, exclusion of rows whose
    ``hgt`` flag is False, raising donors to a given rank, and reporting
    taxon names instead of IDs.
    """
    analyzer = Analyze()
    analyzer.output = self.tmpdir
    hgt_dir = join(analyzer.output, 'hgts')
    makedirs(hgt_dir, exist_ok=True)
    analyzer.donor_name = False
    analyzer.donor_rank = None
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)

    rows = [
        ['S1', 'P1', 0.85, '562', True],
        ['S1', 'P2', 0.95, '622', True],
        ['S1', 'P3', 1.05, '0', True],
        ['S2', 'P4', 0.80, '766', True],
        ['S2', 'P5', 0.20, '0', False],
    ]
    analyzer.df = pd.DataFrame(
        rows, columns=['sample', 'protein', 'silh', 'match', 'hgt'])

    def write_and_read(sample):
        """Write the list for one sample and return the file content."""
        analyzer.write_hgt_list(sample)
        with open(join(hgt_dir, sample + '.txt'), 'r') as fh:
            return fh.read()

    # default
    expected = ('P1\t0.85\t562\n'
                'P2\t0.95\t622\n'
                'P3\t1.05\t0\n')
    self.assertEqual(write_and_read('S1'), expected)

    # number format and negative result
    self.assertEqual(write_and_read('S2'), 'P4\t0.8\t766\n')

    # raise to family
    analyzer.donor_rank = 'family'
    expected = ('P1\t0.85\t543\n'
                'P2\t0.95\t543\n'
                'P3\t1.05\t0\n')
    self.assertEqual(write_and_read('S1'), expected)

    # report taxon name
    analyzer.donor_rank = None
    analyzer.donor_name = True
    expected = ('P1\t0.85\tEscherichia coli\n'
                'P2\t0.95\tShigella dysenteriae\n'
                'P3\t1.05\tN/A\n')
    self.assertEqual(write_and_read('S1'), expected)

    rmtree(hgt_dir)
def test_make_score_table(self):
    """Check the in-memory and on-disk tables built by make_score_table."""
    analyzer = Analyze()
    analyzer.output = self.tmpdir

    def record(pid, length, match, self_score, close, distal, n_hits):
        """Build one protein entry; ``hits`` is a dummy table of n_hits rows."""
        return {
            'id': pid,
            'length': length,
            'match': match,
            'self': self_score,
            'close': close,
            'distal': distal,
            'hits': pd.DataFrame([0] * n_hits),
        }

    analyzer.data = {
        'S1': [record('P1', 100, '0', 1.5, 0.75, 0.0, 3),
               record('P2', 120, '1224', 1.625, 0.225, 0.375, 5)],
        'S2': [record('P1', 225, '620', 2.35, 1.05, 0.75, 6)],
    }
    analyzer.make_score_table()

    # in-memory table
    expected = [
        ['S1', 'P1', 100, 3, 1.5, 0.75, 0, '0'],
        ['S1', 'P2', 120, 5, 1.625, 0.225, 0.375, '1224'],
        ['S2', 'P1', 225, 6, 2.35, 1.05, 0.75, '620'],
    ]
    self.assertListEqual(analyzer.df.values.tolist(), expected)

    # table written to disk; the first line is dropped before comparing
    # (presumably the column header)
    table_path = join(self.tmpdir, 'scores.tsv')
    with open(table_path, 'r') as fh:
        body_lines = fh.read().splitlines()[1:]
    observed = [line.split('\t') for line in body_lines]
    expected_text = [[str(value) for value in row] for row in expected]
    self.assertListEqual(observed, expected_text)
    remove(table_path)
def test_smart_kde(self):
    """Check smart_kde on a bimodal sample and on a sample with no threshold."""
    analyzer = Analyze()

    # typical case (bimodal distribution)
    bimodal = pd.Series(
        np.concatenate([self.dist_norm1, self.dist_norm2]), name='group')
    analyzer.df = bimodal.to_frame()
    analyzer.bw_steps = 10
    analyzer.noise = 50
    analyzer.low_part = 75
    analyzer.output = self.tmpdir

    threshold = analyzer.smart_kde('group')
    self.assertAlmostEqual(threshold, 2.1903958075763343)

    # a plot named after the column is expected in the output directory
    plot_path = join(self.tmpdir, 'group.kde.png')
    self.assertTrue(isfile(plot_path))
    remove(plot_path)

    # unable to determine threshold
    analyzer.low_part = 0.001
    unimodal = pd.Series(self.dist_norm1, name='group')
    analyzer.df = unimodal.to_frame()
    self.assertEqual(analyzer.smart_kde('group'), 0)
def test_cluster_kde(self):
    """Check cluster_kde under each supported bandwidth setting."""
    analyzer = Analyze()
    bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
    analyzer.df = pd.Series(bimodal, name='group').to_frame()
    analyzer.bw_steps = 10
    analyzer.noise = 50
    analyzer.low_part = 75
    analyzer.output = self.tmpdir

    # (bandwidth setting, expected return value), evaluated in this order
    cases = (
        ('grid', 1.855525575742988),        # grid search
        ('silverman', 2.2279977615745703),  # Silverman's rule-of-thumb
        (0.5, 2.2507008281395433),          # fixed value
        ('auto', 2.1903958075763343),       # smart KDE
    )
    for bandwidth, expected in cases:
        analyzer.bandwidth = bandwidth
        self.assertAlmostEqual(analyzer.cluster_kde('group'), expected)

    # clean up
    remove(join(self.tmpdir, 'group.kde.png'))

    # cannot find threshold (unimodal distribution)
    analyzer.df = pd.Series(self.dist_norm1, name='group').to_frame()
    analyzer.bandwidth = 'silverman'
    self.assertEqual(analyzer.cluster_kde('group'), 0)