コード例 #1
0
 def test_remove_orphans(self):
     """Check that remove_orphans drops the row that is zero in all groups.

     The input table has five rows over the 'close' and 'distal' columns;
     only the third one is zero in both, and it is the only row expected
     to be gone afterwards.
     """
     analyzer = Analyze()
     scores = [[1.0, 0.2], [0.5, 0.4], [0.0, 0.0], [0.8, 0.0], [0.0, 0.7]]
     analyzer.df = pd.DataFrame(scores, columns=['close', 'distal'])

     analyzer.remove_orphans()

     # rows with a single zero must survive; only [0.0, 0.0] is removed
     expected = [[1.0, 0.2], [0.5, 0.4], [0.8, 0.0], [0.0, 0.7]]
     remaining = analyzer.df.values.tolist()
     self.assertListEqual(remaining, expected)
コード例 #2
0
    def test_remove_outliers(self):
        """Check the number of rows left after each outlier-removal method.

        Both methods are applied to a fresh copy of the same two-column
        ('close', 'distal') table built from the fixture distributions.
        """
        analyzer = Analyze()
        analyzer.self_low = False
        scores = pd.DataFrame(
            np.array([self.dist_gamma, self.dist_lognorm[:800]]).T,
            columns=['close', 'distal'])

        # (value assigned to `outliers`, number of rows expected to remain)
        scenarios = (
            ('zscore', 781),   # Z-score
            ('boxplot', 710),  # boxplot
        )
        for method, rows_left in scenarios:
            # every method starts from an unmodified copy of the table
            analyzer.df = scores.copy()
            analyzer.outliers = method
            analyzer.remove_outliers()
            self.assertEqual(analyzer.df.shape[0], rows_left)
コード例 #3
0
 def test_plot_hgts(self):
     """Check that plot_hgts writes 'scatter.png' to the output directory."""
     analyzer = Analyze()
     analyzer.output = self.tmpdir
     scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
     analyzer.df = pd.DataFrame(scores, columns=['close', 'distal'])

     # flag rows whose 'close' score is below 2 and 'distal' score above 2
     low_close = analyzer.df['close'] < 2
     high_distal = analyzer.df['distal'] > 2
     analyzer.df['hgt'] = low_close & high_distal

     analyzer.plot_hgts()

     # the plot must exist; delete it so the directory stays clean
     plot_path = join(self.tmpdir, 'scatter.png')
     self.assertTrue(isfile(plot_path))
     remove(plot_path)
コード例 #4
0
    def test_smart_kde(self):
        """Check smart_kde on a bimodal sample and on an undecidable one.

        The first scenario expects a specific threshold and a KDE plot file
        in the output directory; the second expects 0 to be returned.
        """
        analyzer = Analyze()
        analyzer.bw_steps = 10
        analyzer.noise = 50
        analyzer.low_part = 75
        analyzer.output = self.tmpdir

        # typical case (bimodal distribution)
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
        analyzer.df = pd.Series(bimodal, name='group').to_frame()
        threshold = analyzer.smart_kde('group')
        self.assertAlmostEqual(threshold, 2.1903958075763343)
        plot_path = join(self.tmpdir, 'group.kde.png')
        self.assertTrue(isfile(plot_path))
        remove(plot_path)

        # unable to determine threshold
        analyzer.low_part = 0.001
        unimodal = pd.Series(self.dist_norm1, name='group')
        analyzer.df = unimodal.to_frame()
        self.assertEqual(analyzer.smart_kde('group'), 0)
コード例 #5
0
    def test_predict_hgt(self):
        """Run predict_hgt end to end on a synthetic score table.

        Checks the returned count, the files written to the output
        directory, and the behavior when the 'close' column is constant.

        NOTE(review): the table is built from unseeded ``np.random.choice``
        draws, yet an exact count (96) is asserted. This presumably relies
        on the random state being fixed in the fixture setup -- confirm
        before reordering or adding any random call in this test.
        """
        me = Analyze()

        # populate score table
        n = 1000
        data = {
            'sample': ['S1'] * n,
            'protein': [f'P{x}' for x in range(n)],
            # n draws from the gamma fixture
            'self':
            np.random.choice(self.dist_gamma, n),
            # half from norm1 (scaled down by 3), half from norm2
            'close':
            np.concatenate((np.random.choice(self.dist_norm1, int(n / 2)) / 3,
                            np.random.choice(self.dist_norm2, int(n / 2)))),
            # three quarters from lognorm, one quarter from gamma (halved)
            'distal':
            np.concatenate(
                (np.random.choice(self.dist_lognorm, int(n * 3 / 4)),
                 np.random.choice(self.dist_gamma, int(n / 4)) / 2)),
            'match': ['0'] * n
        }
        me.df = pd.DataFrame(data)

        # default setting
        me.output = self.tmpdir
        me.self_low = False
        me.bandwidth = 'auto'
        me.bw_steps = 20
        me.low_part = 75
        me.fixed = 25
        me.noise = 50
        me.silhouette = 0.5
        me.taxdump = {}
        me.donor_name = False
        me.donor_rank = None

        # run prediction
        self.assertEqual(me.predict_hgt(), 96)
        groups = ['self', 'close', 'distal']
        # only 'close' and 'distal' histograms are checked ('self' is skipped)
        for group in groups[1:]:
            fp = join(self.tmpdir, f'{group}.hist.png')
            self.assertTrue(isfile(fp))
            remove(fp)
        fp = join(self.tmpdir, 'scatter.png')
        self.assertTrue(isfile(fp))
        remove(fp)
        # a per-sample list is expected under the 'hgts' subdirectory
        fp = join(self.tmpdir, 'hgts')
        self.assertTrue(isfile(join(fp, 'S1.txt')))
        rmtree(fp)

        # constant values
        me.df['close'] = 1
        # discard the 'hgt' column left in the table by the first run
        me.df.drop('hgt', axis=1, inplace=True)
        self.assertEqual(me.predict_hgt(), 0)
        self.assertNotIn('hgt', me.df.columns)
        # this remove() raises if the 'close' histogram was not written again
        remove(join(self.tmpdir, 'close.hist.png'))
コード例 #6
0
    def test_refine_cluster(self):
        """Check how many rows stay flagged as 'hgt' after refine_cluster.

        Two scenarios are covered: a table with only 'close' and 'distal'
        columns, and a table that also has a 'self' column.
        """
        analyzer = Analyze()
        analyzer.silhouette = 0.5

        def flagged_after_refine(series, columns):
            """Build the table, flag initial hits, refine, count the hits."""
            analyzer.df = pd.DataFrame(np.array(series).T, columns=columns)
            low_close = analyzer.df['close'] < 2
            high_distal = analyzer.df['distal'] > 2
            analyzer.df['hgt'] = low_close & high_distal
            analyzer.refine_cluster(analyzer.calc_cluster_props())
            return analyzer.df[analyzer.df['hgt']].shape[0]

        # only close and distal
        analyzer.self_low = False
        n_hits = flagged_after_refine(
            [self.dist_gamma, self.dist_lognorm[:800]],
            ['close', 'distal'])
        self.assertEqual(n_hits, 11)

        # all three groups
        analyzer.self_low = True
        n_hits = flagged_after_refine(
            [self.dist_norm1[:800], self.dist_gamma,
             self.dist_lognorm[:800]],
            ['self', 'close', 'distal'])
        self.assertEqual(n_hits, 4)
コード例 #7
0
 def test_calc_cluster_props(self):
     """Check calc_cluster_props' return values and the 'silh' column.

     The first two entries of the result are compared with fixed values,
     then the mean and standard deviation of the 'silh' column (read from
     the table after the call) and the number of flagged rows with a
     'silh' value below 0.5 are checked.
     """
     analyzer = Analyze()
     analyzer.self_low = False
     scores = np.array([self.dist_gamma, self.dist_lognorm[:800]]).T
     analyzer.df = pd.DataFrame(scores, columns=['close', 'distal'])
     low_close = analyzer.df['close'] < 2
     high_distal = analyzer.df['distal'] > 2
     analyzer.df['hgt'] = low_close & high_distal

     props = analyzer.calc_cluster_props()
     self.assertAlmostEqual(props[0], 1.094658052928843)
     self.assertAlmostEqual(props[1], 4.30076698399293)

     # summary statistics of the 'silh' column
     silh_stats = analyzer.df['silh'].describe()
     self.assertAlmostEqual(silh_stats['mean'], 0.312495082044277)
     self.assertAlmostEqual(silh_stats['std'], 0.21945541659155993)

     # flagged rows whose 'silh' value is under 0.5
     weak_hits = analyzer.df.query('hgt & silh < 0.5')
     self.assertEqual(weak_hits.shape[0], 35)
コード例 #8
0
    def test_cluster_kde(self):
        """Check the threshold from cluster_kde for each bandwidth setting.

        A bimodal sample is tested with four bandwidth settings, then a
        unimodal sample is tested, for which 0 is expected.
        """
        analyzer = Analyze()
        bimodal = np.concatenate([self.dist_norm1, self.dist_norm2])
        analyzer.df = pd.Series(bimodal, name='group').to_frame()
        analyzer.bw_steps = 10
        analyzer.noise = 50
        analyzer.low_part = 75
        analyzer.output = self.tmpdir

        # (bandwidth setting, expected threshold), run in this order
        scenarios = (
            ('grid', 1.855525575742988),        # grid search
            ('silverman', 2.2279977615745703),  # Silverman's rule-of-thumb
            (0.5, 2.2507008281395433),          # fixed value
            ('auto', 2.1903958075763343),       # smart KDE
        )
        for bandwidth, expected in scenarios:
            analyzer.bandwidth = bandwidth
            threshold = analyzer.cluster_kde('group')
            self.assertAlmostEqual(threshold, expected)

        # clean up
        remove(join(self.tmpdir, 'group.kde.png'))

        # cannot find threshold (unimodal distribution)
        unimodal = pd.Series(self.dist_norm1, name='group')
        analyzer.df = unimodal.to_frame()
        analyzer.bandwidth = 'silverman'
        self.assertEqual(analyzer.cluster_kde('group'), 0)
コード例 #9
0
    def test_write_hgt_list(self):
        """Check the per-sample file produced by write_hgt_list.

        Covers the default output, number formatting with a non-flagged row
        left out, lifting the match to family rank, and reporting taxon
        names instead of IDs.
        """
        analyzer = Analyze()
        analyzer.output = self.tmpdir
        hgt_dir = join(analyzer.output, 'hgts')
        makedirs(hgt_dir, exist_ok=True)
        analyzer.donor_name = False
        analyzer.donor_rank = None
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        records = [
            ['S1', 'P1', 0.85, '562', True],
            ['S1', 'P2', 0.95, '622', True],
            ['S1', 'P3', 1.05, '0', True],
            ['S2', 'P4', 0.80, '766', True],
            ['S2', 'P5', 0.20, '0', False],
        ]
        analyzer.df = pd.DataFrame(
            records, columns=['sample', 'protein', 'silh', 'match', 'hgt'])

        def written_list(sample):
            """Write the list for one sample and return the file content."""
            analyzer.write_hgt_list(sample)
            with open(join(hgt_dir, f'{sample}.txt'), 'r') as handle:
                return handle.read()

        # default
        self.assertEqual(
            written_list('S1'),
            'P1\t0.85\t562\n'
            'P2\t0.95\t622\n'
            'P3\t1.05\t0\n')

        # number format and negative result
        self.assertEqual(written_list('S2'), 'P4\t0.8\t766\n')

        # raise to family
        analyzer.donor_rank = 'family'
        self.assertEqual(
            written_list('S1'),
            'P1\t0.85\t543\n'
            'P2\t0.95\t543\n'
            'P3\t1.05\t0\n')

        # report taxon name
        analyzer.donor_rank = None
        analyzer.donor_name = True
        self.assertEqual(
            written_list('S1'),
            'P1\t0.85\tEscherichia coli\n'
            'P2\t0.95\tShigella dysenteriae\n'
            'P3\t1.05\tN/A\n')
        rmtree(hgt_dir)