def test_group_taxa_data_table_alt(self):
    """group_taxa_present_alt returns a 200 JSON body with data/columns.

    ``TaxonomyModel.presence_data_table`` is patched to return a small
    three-row table, so only the response envelope is checked here: the
    status code, the top-level keys, the column descriptors and the keys
    carried by every data row.
    """
    field_names = ['sampleId', 'rank_1', 'rank_2']
    mocked_frame = pd.DataFrame({
        'sampleId': ['sample-1', 'sample-1', 'sample-2'],
        'rank_1': ['a', 'a', 'a'],
        'rank_2': ['b', 'f', 'b'],
    })

    with patch.object(TaxonomyModel, 'presence_data_table') as mock_model:
        mock_model.return_value = DataTable.from_dataframe(mocked_frame)
        request_body = {'sample_ids': ['sample-1', 'sample-2']}
        response, code = group_taxa_present_alt(
            request_body, "dataset1", "some-table")

    self.assertEqual(code, 200)

    payload = json.loads(response)
    self.assertCountEqual(['data', 'columns'], payload.keys())

    # Columns are serialized as one {'data': <name>} mapping per field.
    expected_columns = [{'data': name} for name in field_names]
    self.assertCountEqual(payload['columns'], expected_columns)

    for row in payload['data']:
        self.assertCountEqual(row.keys(), field_names)
def presence_data_table(self, ids: Iterable[str]) -> DataTable:
    """Build a DataTable with one row per non-zero (sample, feature) entry.

    The underlying table is filtered (without modifying it in place) to
    the requested ``ids`` and emptied of all-zero vectors. Every stored
    value of each remaining vector then becomes one row holding the
    sample id, that value under ``'relativeAbundance'`` and the formatted
    taxonomy names of the corresponding feature.

    Parameters
    ----------
    ids : Iterable[str]
        Identifiers to keep when filtering the table.

    Returns
    -------
    DataTable
        Table whose columns are ``'sampleId'``, the formatter labels and
        ``'relativeAbundance'``, in that order. Missing values are
        ``None``.
    """
    subset = self._table.filter(set(ids), inplace=False).remove_empty()
    feature_ids = subset.ids(axis='observation')

    # Sparse iteration: only the explicitly stored entries of each vector
    # are visited, via their positions (.indices) and values (.data).
    records = [
        {
            'sampleId': sample_id,
            'relativeAbundance': value,
            **self._formatted_taxa_names[feature_ids[position]],
        }
        for vector, sample_id, _ in subset.iter(dense=False)
        for position, value in zip(vector.indices, vector.data)
    ]

    # Passing the columns explicitly enforces the column order.
    column_order = (
        ['sampleId'] + self._formatter.labels + ['relativeAbundance']
    )
    frame = pd.DataFrame(records, columns=column_order)

    # Cast to object first: a column that is completely empty is filled
    # with NaN and defaults to a numeric dtype, in which NaN cannot be
    # replaced with None. None is required because it is valid for JSON,
    # whereas NaN is not.
    frame = frame.astype('object')
    frame[pd.isna(frame)] = None

    return DataTable.from_dataframe(frame)
def test_data_table(self):
    """DataTable.to_dict() serializes to the expected JSON string.

    Two entries with fields ``foo`` and ``bar`` are wrapped in a
    DataTable; the JSON dump of its dict form must equal the JSON dump
    of the hand-written expectation (same keys, same order).
    """
    DataEntry = create_data_entry(['foo', 'bar'])
    entries = [
        DataEntry(foo=foo, bar=bar)
        for foo, bar in (('baz', 'qux'), ('quuz', 'corge'))
    ]
    table = DataTable(data=entries, columns=['foo', 'bar'])

    expected = {
        'data': [
            {'foo': 'baz', 'bar': 'qux'},
            {'foo': 'quuz', 'bar': 'corge'},
        ],
        'columns': ['foo', 'bar'],
    }

    # Compare the serialized forms so that key order is part of the check.
    self.assertEqual(json.dumps(table.to_dict()), json.dumps(expected))
def test_data_table_from_dataframe(self):
    """DataTable.from_dataframe() round-trips a frame into dict form.

    The resulting dict is expected to hold the original records under
    ``'data'`` and one ``{'data': <column name>}`` mapping per column
    under ``'columns'``.
    """
    records = [
        {'foo': 'baz', 'bar': 'qux'},
        {'foo': 'quuz', 'bar': 'corge'},
    ]
    column_names = ['foo', 'bar']

    frame = pd.DataFrame(records, columns=column_names)
    table = DataTable.from_dataframe(frame)

    expected = {
        'data': records,
        'columns': [{'data': name} for name in column_names],
    }

    # Compare the serialized forms so that key order is part of the check.
    self.assertEqual(json.dumps(table.to_dict()), json.dumps(expected))
def test_group_taxa_data_table(self):
    """group_taxa_present returns a 200 JSON body with data/columns.

    Both the repo's ``tables`` property and
    ``TaxonomyModel.presence_data_table`` are patched, so only the
    response envelope is checked: the status code, the top-level keys,
    the column descriptors and the keys carried by every data row.
    """
    field_names = ['sampleId', 'rank_1', 'rank_2']
    mocked_frame = pd.DataFrame({
        'sampleId': ['sample-1', 'sample-1', 'sample-2'],
        'rank_1': ['a', 'a', 'a'],
        'rank_2': ['b', 'f', 'b'],
    })
    fake_tables = {
        'some-table': {
            'table': self.table,
            'feature-data-taxonomy': self.taxonomy_df,
            'variances': self.table_vars,
            'model': self.taxonomy_model,
        },
    }

    with patch('microsetta_public_api.repo._taxonomy_repo.TaxonomyRepo.'
               'tables', new_callable=PropertyMock) as mock_tables, \
            patch.object(TaxonomyModel,
                         'presence_data_table') as mock_model:
        mock_tables.return_value = fake_tables
        mock_model.return_value = DataTable.from_dataframe(mocked_frame)
        request_body = {'sample_ids': ['sample-1', 'sample-2']}
        response, code = group_taxa_present(request_body, "some-table")

    self.assertEqual(code, 200)

    payload = json.loads(response)
    self.assertCountEqual(['data', 'columns'], payload.keys())

    # Columns are serialized as one {'data': <name>} mapping per field.
    expected_columns = [{'data': name} for name in field_names]
    self.assertCountEqual(payload['columns'], expected_columns)

    for row in payload['data']:
        self.assertCountEqual(row.keys(), field_names)
def test_presence_data_table(self):
    """Taxonomy.presence_data_table yields the expected columns and rows.

    The column descriptors must match exactly and in order. Rows are
    compared without regard to order: every observed row that equals
    some expected row is discarded, and nothing may be left over.
    """
    taxonomy = Taxonomy(self.table, self.taxonomy_greengenes_df,
                        self.table_vars)
    obs = taxonomy.presence_data_table(['sample-1', 'sample-2'])

    exp_columns = [
        'sampleId', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family',
        'Genus', 'Species', 'relativeAbundance'
    ]
    # One tuple per expected row, values listed in exp_columns order.
    exp_rows = [
        ('sample-1', 'a', 'b', None, 'c', 'd', 'e', None, 2. / 5),
        ('sample-1', 'a', 'f', None, 'g', 'h', None, None, 3. / 5),
        ('sample-2', 'a', 'b', None, 'c', None, None, None, 1. / 5),
        ('sample-2', 'a', 'b', None, 'c', 'd', 'e', None, 4. / 5),
    ]

    DataEntry = create_data_entry(exp_columns)
    exp = DataTable(
        data=[DataEntry(**dict(zip(exp_columns, row))) for row in exp_rows],
        columns=exp_columns,
    )

    expected_descriptors = [{'data': col} for col in exp.columns]
    self.assertListEqual(expected_descriptors, obs.columns)

    # wouldn't want to do this on a huge dataframe..., but it pairs each
    # expected row with the first observed row equal to it and removes
    # that observed row from the leftovers.
    exp_df = pd.DataFrame(exp.data)
    obs_df = pd.DataFrame(obs.data)
    unmatched = obs_df.copy()
    for _, expected_row in exp_df.iterrows():
        for obs_index, observed_row in obs_df.iterrows():
            if expected_row.eq(observed_row).all():
                unmatched.drop(index=obs_index, inplace=True)
                break

    self.assertTrue(unmatched.empty)
def test_group_data_table(self):
    """POSTing sample ids to the group endpoint returns the expected table.

    The request must succeed with status 200 and the column descriptors
    must match exactly and in order. Rows are compared without regard to
    order: every observed row that equals some expected row is
    discarded, and nothing may be left over.
    """
    request_body = json.dumps({'sample_ids': ['sample-1', 'sample-2']})
    response = self.client.post(
        '/api/taxonomy/present/group/'
        'table2-greengenes',
        content_type='application/json',
        data=request_body,
    )
    self.assertEqual(response.status_code, 200)
    obs = json.loads(response.data)

    exp_columns = [
        'sampleId', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family',
        'Genus', 'Species', 'relativeAbundance'
    ]
    # One tuple per expected row, values listed in exp_columns order.
    exp_rows = [
        ('sample-1', 'a', 'b', None, 'c', 'd', 'e', None, 2. / 5),
        ('sample-1', 'a', 'f', None, 'g', 'h', None, None, 3. / 5),
        ('sample-2', 'a', 'b', None, 'c', None, None, None, 1. / 5),
        ('sample-2', 'a', 'b', None, 'c', 'd', 'e', None, 4. / 5),
    ]

    DataEntry = create_data_entry(exp_columns)
    exp_table = DataTable(
        data=[DataEntry(**dict(zip(exp_columns, row))) for row in exp_rows],
        columns=[{'data': col} for col in exp_columns],
    )
    exp = exp_table.to_dict()

    self.assertListEqual(exp['columns'], obs['columns'])

    # wouldn't want to do this on a huge dataframe..., but it pairs each
    # expected row with the first observed row equal to it and removes
    # that observed row from the leftovers.
    exp_df = pd.DataFrame(exp['data'])
    obs_df = pd.DataFrame(obs['data'])
    unmatched = obs_df.copy()
    for _, expected_row in exp_df.iterrows():
        for obs_index, observed_row in obs_df.iterrows():
            if expected_row.eq(observed_row).all():
                unmatched.drop(index=obs_index, inplace=True)
                break

    self.assertTrue(unmatched.empty)