def setUp(self):
    """Load the two sparse, un-normalized fixture experiments."""
    super().setUp()
    # small test1 experiment, raw counts
    self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None)
    # larger timeseries experiment, also raw counts
    self.timeseries = ca.read(self.timeseries_biom, self.timeseries_samp,
                              normalize=None)
def test_save(self):
    """Round-trip an experiment through save() in the json biom format.

    The temporary directory is removed in a ``finally`` block so it is
    cleaned up even when an assertion fails (previously a failing assert
    leaked the directory).
    """
    exp = ca.read(self.test2_biom, self.test2_samp, normalize=None)
    d = mkdtemp()
    try:
        f = join(d, 'test1.save')
        # test the json biom format
        exp.save(f, fmt='json')
        newexp = ca.read(f + '.biom', f + '_sample.txt', normalize=None)
        assert_experiment_equal(newexp, exp,
                                ignore_md_fields=['#SampleID.1'])
    finally:
        shutil.rmtree(d)
def setUp(self):
    """Load test1 and test2 (with feature metadata), un-normalized."""
    super().setUp()
    self.test1 = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                         normalize=None)
    self.test2 = ca.read(self.test2_biom, self.test2_samp, self.test2_feat,
                         normalize=None)
def setUp(self):
    """Load test2 twice: once sparse (the default) and once dense."""
    super().setUp()
    files = (self.test2_biom, self.test2_samp, self.test2_feat)
    self.test2_sparse = ca.read(*files, normalize=None)
    self.test2_dense = ca.read(*files, sparse=False, normalize=None)
def setUp(self):
    """Load the simple (test1/test2) and complex (timeseries) fixtures."""
    super().setUp()
    # simple experiments, kept sparse and un-normalized
    self.test2 = ca.read(self.test2_biom, self.test2_samp, self.test2_feat,
                         normalize=None)
    self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None)
    # complex timeseries experiment, also sparse
    self.timeseries = ca.read(self.timeseries_biom, self.timeseries_samp,
                              normalize=None)
def setUp(self):
    """Load the simple, paired and complex fixture experiments."""
    super().setUp()
    # simple experiment, sparse and un-normalized
    self.test1 = ca.read(self.test1_biom, self.test1_samp, normalize=None)
    # paired-sample testing experiment
    self.test_paired = ca.read(self.test_paired_biom, self.test_paired_samp,
                               normalize=None)
    # complex experiment: drop low-read samples and normalize to 10k
    self.complex = ca.read_amplicon(self.timeseries_biom,
                                    self.timeseries_samp,
                                    min_reads=1000, normalize=10000)
def test_read_no_metadata(self):
    """Loading a biom table without a mapping file still reads all data."""
    # logging is disabled in setUp; re-enable it so assertLogs can capture
    logging.disable(logging.NOTSET)
    with self.assertLogs(level='INFO') as cm:
        # no sample metadata file supplied
        exp = ca.read(self.test1_biom, normalize=None)
        self.assertRegex(cm.output[0], 'loaded 21 samples, 12 features')
        self._validate_read(exp, validate_sample_metadata=False)
def test_save_biom(self):
    """Round-trip an experiment through save_biom in hdf5/txt/no-metadata modes.

    The temporary directory is removed in a ``finally`` block so it is
    cleaned up even when an assertion fails (previously a failing assert
    leaked the directory).
    """
    # NOTE: Currently not testing the save biom hdf with taxonomy
    # as there is a bug there!
    exp = ca.read_amplicon(self.test1_biom, self.test1_samp,
                           normalize=None, min_reads=None)
    d = mkdtemp()
    try:
        f = join(d, 'test1.save.biom')
        # test the hdf5 biom format
        exp.save_biom(f, fmt='hdf5')
        newexp = ca.read_amplicon(f, self.test1_samp,
                                  normalize=None, min_reads=None)
        assert_experiment_equal(newexp, exp)
        # test the txt biom format (taxonomy is not preserved)
        exp.save_biom(f, fmt='txt')
        newexp = ca.read_amplicon(f, self.test1_samp,
                                  normalize=None, min_reads=None)
        assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])
        # test the hdf5 biom format with no metadata at all
        exp.save_biom(f, add_metadata=None)
        newexp = ca.read(f, self.test1_samp, normalize=None)
        self.assertTrue('taxonomy' not in newexp.feature_metadata)
        assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])
    finally:
        shutil.rmtree(d)
def test_read_sample_kwargs(self):
    """sample_metadata_kwargs are forwarded to the metadata parser."""
    # re-enable logging because it is disabled in setUp
    logging.disable(logging.NOTSET)
    with self.assertLogs(level='INFO') as cm:
        # load the simple dataset as sparse, parsing the collection_date
        # column as datetimes
        exp = ca.read(
            self.test1_biom, self.test1_samp, self.test1_feat,
            normalize=None,
            sample_metadata_kwargs={'parse_dates': ['collection_date']})
        # the reader logs these messages, in this order
        patterns = [
            'loaded 21 samples, 12 features',
            "dropped \\(1\\): {'SAMPLE_NOT_EXIST'}",
            "These have data but do not have metadata: {'badsample'}",
            "dropped \\(1\\): {'FEATURE_NOT_EXIST'}",
            "These have data but do not have metadata: {'badfeature'}",
        ]
        for i, pattern in enumerate(patterns):
            self.assertRegex(cm.output[i], pattern)
        self.assertTrue(scipy.sparse.issparse(exp.data))
        self._validate_read(exp)
        # the last sample in the OTU table has no metadata row, so its
        # parsed date is NaT; the others all parse to the same timestamp
        obs_dates = exp.sample_metadata['collection_date'].tolist()
        exp_dates = [pd.Timestamp('2017-8-1')] * 20 + [pd.NaT]
        self.assertListEqual(obs_dates, exp_dates)
def test_read_feature_kwargs(self):
    """feature_metadata_kwargs are forwarded to the metadata parser."""
    # re-enable logging because it is disabled in setUp
    logging.disable(logging.NOTSET)
    with self.assertLogs(level='INFO') as cm:
        # load the simple dataset as sparse, forcing the 'ph' column to str
        exp = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                      normalize=None,
                      feature_metadata_kwargs={'dtype': {'ph': str}})
        # the reader logs these messages, in this order
        patterns = [
            'loaded 21 samples, 12 features',
            "dropped \\(1\\): {'SAMPLE_NOT_EXIST'}",
            "These have data but do not have metadata: {'badsample'}",
            "dropped \\(1\\): {'FEATURE_NOT_EXIST'}",
            "These have data but do not have metadata: {'badfeature'}",
        ]
        for i, pattern in enumerate(patterns):
            self.assertRegex(cm.output[i], pattern)
        self.assertTrue(scipy.sparse.issparse(exp.data))
        self._validate_read(exp)
        # 'ph' was read as str, not float
        self.assertEqual(exp.feature_metadata.loc['AA', 'ph'], '4.0')
def test_filter_samples_na(self):
    """filter_samples with values=None drops samples with NA in the field."""
    test1 = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                    normalize=None)
    # filter NA values in the 'group' column
    filtered = test1.filter_samples('group', None)
    self.assertEqual(filtered.shape, (20, 12))
    # the surviving samples are exactly those with no NA metadata
    expected_ids = test1.sample_metadata.dropna(axis=0).index.tolist()
    self.assertEqual(expected_ids, filtered.sample_metadata.index.tolist())
def test_read_not_sparse(self):
    """sparse=False loads the data matrix as a dense array."""
    logging.disable(logging.NOTSET)
    with self.assertLogs(level='INFO') as cm:
        # load the simple dataset as a dense matrix
        exp = ca.read(self.test1_biom, self.test1_samp, sparse=False,
                      normalize=None)
        self.assertFalse(scipy.sparse.issparse(exp.data))
        self._validate_read(exp, cm.output)
def test_create_biom_table_from_exp(self):
    """The biom table built from an experiment preserves ids, data and taxonomy."""
    exp = ca.read(self.test1_biom, self.test1_samp, normalize=None)
    biom_table = _create_biom_table_from_exp(exp)
    # sample/feature ids carry over
    self.assertCountEqual(biom_table.ids(axis='observation'),
                          exp.feature_metadata.index.values)
    self.assertCountEqual(biom_table.ids(axis='sample'),
                          exp.sample_metadata.index.values)
    # the data matrix is transposed (biom is features x samples)
    assert_array_almost_equal(biom_table.matrix_data.toarray(),
                              exp.get_data(sparse=False).transpose())
    # taxonomy metadata survives for a representative feature
    md = biom_table.metadata(id=exp.feature_metadata.index[1],
                             axis='observation')
    self.assertEqual(md['taxonomy'], exp.feature_metadata['taxonomy'].iloc[1])
def test_filter_by_metadata_na(self):
    """filter_by_metadata with select=None drops features whose value is NA.

    'B' is declared an NA value at read time, so features whose 'level1'
    metadata is 'B' are removed by the NA filter.
    """
    # use a local experiment; the original also rebound self.test2
    # (``test = self.test2 = ca.read(...)``), clobbering the shared fixture
    # for no benefit since the assertions compared the same object
    test = ca.read(self.test2_biom, self.test2_samp, self.test2_feat,
                   normalize=None,
                   feature_metadata_kwargs={'na_values': 'B'})
    test_drop = test.filter_by_metadata('level1', select=None, axis='f')
    # samples are untouched; only NA features are dropped
    self.assertEqual(test.sample_metadata.index.tolist(),
                     test_drop.sample_metadata.index.tolist())
    self.assertEqual(['AT', 'AG', 'AC', 'TA', 'TT', 'TC'],
                     test_drop.feature_metadata.index.tolist())
def test_read_amplicon(self):
    """read_amplicon equals a manual read + abundance filter + normalize."""
    # amplicon reader with built-in filtering and normalization
    exp1 = ca.read_amplicon(self.test1_biom, min_reads=1000, normalize=10000)
    # reproduce the same pipeline manually
    exp2 = ca.read(self.test1_biom, normalize=None)
    exp2.filter_by_data('abundance', axis=0, cutoff=1000, inplace=True,
                        mean_or_sum='sum')
    exp2.normalize(inplace=True)
    assert_experiment_equal(exp1, exp2)
    # the taxonomy column was picked up from the biom table
    self.assertIn('taxonomy', exp1.feature_metadata.columns)
def test_read(self):
    """Default read loads the simple dataset as a sparse matrix."""
    exp = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                  normalize=None)
    # sparse is the default representation
    self.assertTrue(scipy.sparse.issparse(exp.data))
    self._validate_read(exp)
def test_read_not_sparse(self):
    """sparse=False loads the simple dataset as a dense matrix."""
    exp = ca.read(self.test1_biom, self.test1_samp, sparse=False,
                  normalize=None)
    self.assertFalse(scipy.sparse.issparse(exp.data))
    self._validate_read(exp)
def test_filter_samples_edge_cases(self):
    """Filtering on a value no sample has keeps nothing (or, negated, everything)."""
    test1 = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                    normalize=None)
    # no sample has group '3' (the column dtype is object, hence the string)
    empty = test1.filter_samples('group', ['3'])
    self.assertEqual(empty.shape, (0, 12))
    # negating the same filter keeps every sample
    everything = test1.filter_samples('group', ['3'], negate=True)
    self.assert_experiment_equal(everything, test1)
def test_plot_core_features(self):
    """plot_core_features draws one line per step/iteration combination."""
    # fix the seed so the random subsampling is reproducible
    np.random.seed(12345)
    self.test1 = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                         normalize=100)
    self.test1.sparse = False
    subset = self.test1.filter_samples('group', ['1', '2'])
    ax = subset.plot_core_features(field='group', steps=(2, 12),
                                   iterations=2)
    # 2 groups x 2 steps... the plot ends up with 6 line artists
    self.assertEqual(len(ax.get_lines()), 6)
def test_read_openms_bucket_table_samples_are_rows(self):
    """A csv bucket table with samples in rows loads with the right shape/sums."""
    # no metadata files are supplied for this table
    exp = ca.read(self.openms_samples_rows_csv, data_file_type='csv',
                  sample_in_row=True, sparse=False, normalize=None)
    self.assertEqual(len(exp.sample_metadata), 9)
    self.assertEqual(len(exp.feature_metadata), 10)
    self.assertEqual(exp.shape, (9, 10))
    # spot-check one sample row and one feature column total
    self.assertEqual(exp.data[0, :].sum(), 8554202)
    self.assertEqual(exp.data[:, 1].sum(), 13795540)
    self.assertEqual(exp.sparse, False)
def test_filter_by_data_sample(self):
    """filter_by_data on samples drops the low-abundance sample, sparse or dense."""
    for sparse in (True, False):
        for inplace in (False, True):
            test2 = ca.read(self.test2_biom, self.test2_samp,
                            self.test2_feat, sparse=sparse, normalize=None)
            # abundance cutoff 1200 removes only the last sample
            obs = test2.filter_by_data('sum_abundance', axis=0,
                                       inplace=inplace, cutoff=1200)
            self.assertEqual(obs.shape, (8, 8))
            expected_files = [get_data_path(name) for name in
                              ('test2.biom.filter.sample',
                               'test2.sample', 'test2.feature')]
            expected = ca.read(*expected_files, normalize=None)
            assert_experiment_equal(obs, expected)
            # inplace returns the same object, otherwise a copy
            if inplace:
                self.assertIs(obs, test2)
            else:
                self.assertIsNot(obs, test2)
def test_filter_features(self):
    """filter_features keeps only features matching the metadata value."""
    for inplace in (True, False):
        test2 = ca.read(self.test2_biom, self.test2_samp, self.test2_feat,
                        normalize=None)
        obs = test2.filter_features('oxygen', ['anaerobic'], inplace=inplace)
        self.assertEqual(obs.shape, (9, 2))
        self.assertListEqual(obs.feature_metadata.index.tolist(),
                             ['TG', 'TC'])
        # inplace returns the same object, otherwise a copy
        if inplace:
            self.assertIs(obs, test2)
        else:
            self.assertIsNot(obs, test2)
def test_save_fasta(self):
    """save_fasta writes one sequence per feature id.

    The temporary directory is removed in a ``finally`` block so it is
    cleaned up even when an assertion fails (previously a failing assert
    leaked the directory).
    """
    exp = ca.read(self.test1_biom, self.test1_samp, normalize=None)
    d = mkdtemp()
    try:
        f = join(d, 'test1.fasta')
        exp.save_fasta(f)
        # read back the sequences and compare to the feature index
        seqs = [str(seq) for seq in skbio.read(f, format='fasta')]
        self.assertCountEqual(seqs, exp.feature_metadata.index.values)
    finally:
        shutil.rmtree(d)
def test_normalize_by_subset_features(self):
    """Normalizing on a feature subset scales the kept features to 10k/sample."""
    # standard mode: exclude a couple of features, normalize the rest to 10k
    exp = ca.read(self.test1_biom, self.test1_samp, normalize=None)
    excluded = [6, 7]
    excluded_ids = [exp.feature_metadata.index[pos] for pos in excluded]
    newexp = exp.normalize_by_subset_features(excluded_ids, 10000,
                                              negate=True, inplace=False)
    # the features used for normalization (all but 6,7) now sum to 10k
    kept = list(set(range(exp.data.shape[1])).difference(set(excluded)))
    assert_array_almost_equal(newexp.data[:, kept].sum(axis=1),
                              np.ones([exp.data.shape[0]]) * 10000)
    # the excluded features got scaled up relative to the original
    self.assertTrue(np.all(newexp.data[:, excluded] > exp.data[:, excluded]))
def test_read_amplicon(self):
    """read_amplicon equals a manual read + sum-abundance filter + normalize."""
    # amplicon reader with built-in filtering and normalization
    exp = ca.read_amplicon(self.test1_biom, filter_reads=1000,
                           normalize=10000)
    # reproduce the same pipeline manually
    exp2 = ca.read(self.test1_biom, normalize=None)
    exp2.filter_by_data('sum_abundance', cutoff=1000, inplace=True)
    exp2.normalize(inplace=True)
    assert_experiment_equal(exp, exp2)
    # the taxonomy column was picked up from the biom table
    self.assertIn('taxonomy', exp.feature_metadata)
def load(self):
    """Show the Load dialog and, on accept, read and register the experiment.

    Empty map/GNPS file names are treated as "not supplied" (None).  Any
    read failure is logged as a warning and the load is aborted.
    """
    win = LoadWindow()
    res = win.exec_()
    if res != QtWidgets.QDialog.Accepted:
        return
    tablefname = str(win.wTableFile.text())
    mapfname = str(win.wMapFile.text())
    if mapfname == '':
        mapfname = None
    gnpsfname = str(win.wGNPSFile.text())
    if gnpsfname == '':
        gnpsfname = None
    expname = str(win.wNewName.text())
    exptype = str(win.wType.currentText())
    if exptype == 'Amplicon':
        try:
            expdat = ca.read_amplicon(tablefname, mapfname,
                                      normalize=10000, filter_reads=1000)
        # was a bare ``except:``; narrowed so Ctrl-C / SystemExit propagate
        except Exception:
            logger.warn(
                'Load for amplicon biom table %s map %s failed'
                % (tablefname, mapfname))
            return
    elif exptype == 'Metabolomics (rows are samples)':
        try:
            expdat = ca.read_open_ms(tablefname, mapfname,
                                     gnps_file=gnpsfname, normalize=None,
                                     rows_are_samples=True)
        except Exception:
            logger.warn('Load for openms table %s map %s failed'
                        % (tablefname, mapfname))
            return
    elif exptype == 'Metabolomics (rows are features)':
        try:
            expdat = ca.read_open_ms(tablefname, mapfname,
                                     gnps_file=gnpsfname, normalize=None,
                                     rows_are_samples=False)
        except Exception:
            logger.warn('Load for openms table %s map %s failed'
                        % (tablefname, mapfname))
            return
    else:
        # was an unreachable duplicate 'Amplicon' branch; used here as the
        # generic fallback so expdat is always bound before use below
        # (previously any unlisted type raised NameError on expdat)
        try:
            expdat = ca.read(tablefname, mapfname)
        except Exception:
            logger.warn('Load for biom table %s map %s failed'
                        % (tablefname, mapfname))
            return
    expdat._studyname = expname
    self.addexp(expdat)
def test_filter_samples(self):
    """filter_samples keeps only the samples whose 'id' is in the list."""
    for inplace in (True, False):
        test1 = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                        normalize=None)
        # keep only samples with id 11..14
        obs = test1.filter_samples('id', list(range(11, 15)),
                                   inplace=inplace)
        self.assertEqual(obs.shape, (4, 12))
        self.assertEqual(obs.sample_metadata.index.tolist(),
                         ['S11', 'S12', 'S13', 'S14'])
        # inplace returns the same object, otherwise a copy
        if inplace:
            self.assertIs(obs, test1)
        else:
            self.assertIsNot(obs, test1)
def test_filter_by_metadata_feature(self):
    """filter_by_metadata on axis=1 keeps matching features, sparse or dense."""
    for sparse in (True, False):
        for inplace in (False, True):
            test2 = ca.read(self.test2_biom, self.test2_samp,
                            self.test2_feat, sparse=sparse, normalize=None)
            # keep only the anaerobic features
            obs = test2.filter_by_metadata('oxygen', ['anaerobic'], axis=1,
                                           inplace=inplace)
            self.assertEqual(obs.shape, (9, 2))
            self.assertListEqual(obs.feature_metadata.index.tolist(),
                                 ['TG', 'TC'])
            # inplace returns the same object, otherwise a copy
            if inplace:
                self.assertIs(obs, test2)
            else:
                self.assertIsNot(obs, test2)
def test_sort_samples(self):
    """Chained in-place sorts (minutes, hours, days) order samples by time."""
    obs = (self.timeseries
           .sort_samples('MINUTES', inplace=True)
           .sort_samples('HOUR', inplace=True)
           .sort_samples('DAY', inplace=True))
    # in-place sorting returns the same experiment object
    self.assertIs(obs, self.timeseries)
    expected = ca.read(join(self.test_data_dir, 'timeseries.sorted.time.biom'),
                       join(self.test_data_dir, 'timeseries.sample'),
                       normalize=None)
    assert_experiment_equal(obs, expected, almost_equal=True)
    # after sorting, the sample numbers run 1..95 in order
    self.assertListEqual(obs.sample_metadata['MF_SAMPLE_NUMBER'].tolist(),
                         list(range(1, 96)))
def test_sort_by_metadata_sample(self):
    """Chained stable sorts by minutes, hours then days order samples by time."""
    obs = (self.timeseries
           .sort_by_metadata(field='MINUTES', inplace=True)
           .sort_by_metadata(field='HOUR', inplace=True)
           .sort_by_metadata(field='DAY', inplace=True))
    # in-place sorting returns the same experiment object
    self.assertIs(obs, self.timeseries)
    expected = ca.read(join(self.test_data_dir, 'timeseries.sorted.time.biom'),
                       join(self.test_data_dir, 'timeseries.sample'),
                       normalize=None)
    assert_experiment_equal(obs, expected, almost_equal=True)
    # after sorting, the sample numbers run 1..95 in order
    self.assertListEqual(obs.sample_metadata['MF_SAMPLE_NUMBER'].tolist(),
                         list(range(1, 96)))