def test_no_artifacts(self):
    # Merging two Metadata objects built from plain DataFrames is expected
    # to produce a result whose ``artifacts`` is an empty tuple.
    shared_ids = ['id1', 'id2']
    left = Metadata(pd.DataFrame(
        {'a': [1, 2]}, index=pd.Index(shared_ids, name='id')))
    right = Metadata(pd.DataFrame(
        {'b': [3, 4]}, index=pd.Index(shared_ids, name='id')))

    merged = left.merge(right)

    self.assertEqual(merged.artifacts, ())
def test_id_column_only(self):
    # Merge three Metadata objects that have IDs but no columns; only the
    # IDs present in every input ('id1' and 'id2') are expected to remain.
    def id_only(ids):
        # Build a column-less Metadata holding just the given IDs.
        return Metadata(pd.DataFrame({}, index=pd.Index(ids, name='id')))

    first = id_only(['id1', 'id2', 'id3'])
    second = id_only(['id2', 'X', 'id1'])
    third = id_only(['id1', 'id3', 'id2'])

    observed = first.merge(second, third)

    expected = id_only(['id1', 'id2'])
    self.assertEqual(observed, expected)
def test_empty_metadata(self):
    # Metadata without any IDs must be rejected, whether or not the frame
    # declares columns.
    column_variants = (
        {},                       # No index, no columns.
        {'columns': ['a', 'b']},  # No index, has columns.
    )
    for extra_kwargs in column_variants:
        empty_df = pd.DataFrame(
            [], index=pd.Index([], name='id'), **extra_kwargs)
        with self.assertRaisesRegex(ValueError,
                                    'Metadata.*at least one ID'):
            Metadata(empty_df)
def test_invalid_column_dtype_w_null(self):
    # A null column name (NaN or None) must raise a TypeError whose message
    # mentions the offending value.
    cases = [
        (float('nan'), 'non-string.*column name.*nan'),
        (None, 'non-string.*column name.*None'),
    ]
    for null_name, pattern in cases:
        columns = pd.Index(['a', null_name], dtype=object)
        with self.assertRaisesRegex(TypeError, pattern):
            Metadata(pd.DataFrame(
                [['val1', 'val2']],
                index=pd.Index(['x'], name='id'),
                columns=columns))
def test_merged_id_column_name(self):
    # The inputs name their ID column differently ('sample ID' versus
    # 'feature ID'); the merged result is expected to use the name 'id'.
    sample_md = Metadata(pd.DataFrame(
        {'a': [1, 2]},
        index=pd.Index(['id1', 'id2'], name='sample ID')))
    feature_md = Metadata(pd.DataFrame(
        {'b': [3, 4]},
        index=pd.Index(['id1', 'id2'], name='feature ID')))

    observed = sample_md.merge(feature_md)

    expected_df = pd.DataFrame(
        {'a': [1, 2], 'b': [3, 4]},
        index=pd.Index(['id1', 'id2'], name='id'))
    self.assertEqual(observed, Metadata(expected_df))
def test_inner_join(self):
    # Merging keeps only the IDs common to every input; the expected values
    # below show each input's columns lined up by ID.
    left = Metadata(pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6]},
        index=pd.Index(['id1', 'id2', 'id3'], name='id')))
    middle = Metadata(pd.DataFrame(
        {'c': [7, 8, 9], 'd': [10, 11, 12]},
        index=pd.Index(['id2', 'X', 'Y'], name='id')))
    right = Metadata(pd.DataFrame(
        {'e': [13, 14, 15], 'f': [16, 17, 18]},
        index=pd.Index(['X', 'id3', 'id2'], name='id')))

    # Single shared ID.
    observed = left.merge(middle, right)
    expected_df = pd.DataFrame(
        {'a': [2], 'b': [5], 'c': [7], 'd': [10], 'e': [15], 'f': [18]},
        index=pd.Index(['id2'], name='id'))
    self.assertEqual(observed, Metadata(expected_df))

    # Multiple shared IDs.
    observed = left.merge(right)
    expected_df = pd.DataFrame(
        {'a': [2, 3], 'b': [5, 6], 'e': [15, 14], 'f': [18, 17]},
        index=pd.Index(['id2', 'id3'], name='id'))
    self.assertEqual(observed, Metadata(expected_df))
def test_equality_without_artifact(self):
    # Two Metadata objects built independently from identical data must
    # satisfy assertReallyEqual.
    def build():
        frame = pd.DataFrame(
            {'a': '1', 'b': '3'}, index=pd.Index(['0'], name='id'))
        return Metadata(frame)

    first = build()
    second = build()

    self.assertReallyEqual(first, second)
def test_merging_two(self):
    # Two Metadata objects with identical IDs and distinct columns merge
    # into one object carrying all four columns.
    ids = ['id1', 'id2', 'id3']
    left_df = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6]},
        index=pd.Index(ids, name='id'))
    right_df = pd.DataFrame(
        {'c': [7, 8, 9], 'd': [10, 11, 12]},
        index=pd.Index(ids, name='id'))
    left = Metadata(left_df)
    right = Metadata(right_df)

    observed = left.merge(right)

    expected = Metadata(pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6],
         'c': [7, 8, 9], 'd': [10, 11, 12]},
        index=pd.Index(ids, name='id')))
    self.assertEqual(observed, expected)
def test_data_mismatch(self):
    # Metadata objects that differ in a single cell value (column 'b') must
    # satisfy assertReallyNotEqual.
    def build(b_value):
        frame = pd.DataFrame(
            {'a': '1', 'b': b_value}, index=pd.Index(['0'], name='id'))
        return Metadata(frame)

    first = build('3')
    second = build('2')

    self.assertReallyNotEqual(first, second)
def test_case_insensitive_duplicate_column_names(self):
    # Column names that differ only by letter case are accepted and both
    # show up in the resulting Metadata's columns.
    ids = pd.Index(['a', 'b', 'c'], name='id')
    frame = pd.DataFrame(
        {'column': ['1', '2', '3'], 'Column': ['4', '5', '6']},
        index=ids)

    observed_names = set(Metadata(frame).columns)

    self.assertEqual(observed_names, {'column', 'Column'})
def test_duplicate_indices(self):
    # A repeated ID ('b') must be rejected with a ValueError naming it.
    repeated_ids = pd.Index(['a', 'b', 'b'], name='id', dtype=object)
    frame = pd.DataFrame({'foo': [1, 2, 3]}, index=repeated_ids)

    expected_message = "IDs must be unique.*'b'"
    with self.assertRaisesRegex(ValueError, expected_message):
        Metadata(frame)
def test_invalid_index_dtype_w_null(self):
    # A null entry (NaN or None) among the IDs must raise a TypeError whose
    # message mentions the offending value.
    cases = [
        (['a', float('nan'), 'b'], 'non-string.*ID.*nan'),
        (['a', None, 'c'], 'non-string.*ID.*None'),
    ]
    for ids, pattern in cases:
        index = pd.Index(ids, name='id', dtype=object)
        with self.assertRaisesRegex(TypeError, pattern):
            Metadata(pd.DataFrame(
                {'x': [1, 2, 3], 'y': [4, 5, 6]}, index=index))
def test_various_numbers(self):
    # Save one numeric column covering zero, negative zero, a missing
    # value, and a range of magnitudes, then compare the exact file text.
    numbers = [
        0.0, -0.0, np.nan, 1.0, 42.0, -33.0, 1e-10, 1.5e15, 0.0003, -4.234,
        # This last number should be rounded because it exceeds 15 digits
        # of precision.
        12.34567891234567
    ]
    # 'id1' through 'id11', one ID per number above.
    ids = pd.Index(['id%d' % n for n in range(1, 12)], name='ID')
    frame = pd.DataFrame({'numbers': numbers}, index=ids)

    Metadata(frame).save(self.filepath)

    with open(self.filepath, 'r') as handle:
        written = handle.read()

    expected = "".join([
        "ID\tnumbers\n",
        "#q2:types\tnumeric\n",
        "id1\t0\n",
        "id2\t-0\n",
        "id3\t\n",
        "id4\t1\n",
        "id5\t42\n",
        "id6\t-33\n",
        "id7\t1e-10\n",
        "id8\t1.5e+15\n",
        "id9\t0.0003\n",
        "id10\t-4.234\n",
        "id11\t12.3456789123457\n",
    ])
    self.assertEqual(written, expected)
def test_ids_and_column_names_as_numeric_strings(self):
    # IDs and column names that look like numbers are expected to be
    # written back verbatim (e.g. '0.004000', '42.0'), while numeric cell
    # values are formatted as numbers (e.g. 2.0 -> '2').
    ids = pd.Index(
        ['0.000001', '0.004000', '0.000000'], dtype=object, name='id')
    column_names = ['42.0', '1000', '-4.2']
    rows = [
        [2.0, 'b', 2.5],
        [1.0, 'b', 4.2],
        [3.0, 'c', -9.999]
    ]
    frame = pd.DataFrame(rows, index=ids, columns=column_names)

    Metadata(frame).save(self.filepath)

    with open(self.filepath, 'r') as handle:
        written = handle.read()

    expected = "".join([
        "id\t42.0\t1000\t-4.2\n",
        "#q2:types\tnumeric\tcategorical\tnumeric\n",
        "0.000001\t2\tb\t2.5\n",
        "0.004000\t1\tb\t4.2\n",
        "0.000000\t3\tc\t-9.999\n",
    ])
    self.assertEqual(written, expected)
def test_duplicate_columns_self_merge(self):
    # Merging a Metadata object with itself must fail because every column
    # overlaps.
    frame = pd.DataFrame(
        {'a': [1, 2], 'b': [3, 4]},
        index=pd.Index(['id1', 'id2'], name='id'))
    metadata = Metadata(frame)

    expected_message = "columns overlap: 'a', 'b'"
    with self.assertRaisesRegex(ValueError, expected_message):
        metadata.merge(metadata)
def test_valid_metadata_id_column_only(self):
    # Metadata with IDs but no columns is valid: three IDs, zero columns.
    ids = pd.Index(['a', 'b', 'c'], name='ID', dtype=object)
    frame = pd.DataFrame({}, index=ids, dtype=object)

    md = Metadata(frame)

    self.assertEqual(md.id_count, 3)
    self.assertEqual(md.column_count, 0)
def test_non_standard_characters(self):
    # Integration-style check that the reader copes with non-standard
    # characters in IDs, column names, and cells. It is deliberately not
    # exhaustive (e.g. it doesn't cover every Unicode character; that would
    # be a nice additional test case to have in the future). Many of the
    # characters and their placement within the data file are based on
    # use-cases/bugs reported on the forum, Slack, etc. The data file has
    # comments explaining these test case choices in more detail.
    path = get_data_path('valid/non-standard-characters.tsv')
    loaded = Metadata.load(path)

    ids = pd.Index(
        ['©id##1', '((id))2', "'id_3<>'", '"id#4"', 'i d\r\t\n5'],
        name='id')
    column_names = [
        '↩c@l1™', 'col(#2)', "#col'3", '"<col_4>"', 'col\t \r\n5'
    ]
    # The final row is intentionally left exactly as short as it was
    # originally written.
    rows = [
        ['ƒoo', '(foo)', '#f o #o', 'fo\ro', np.nan],
        ["''2''", 'b#r', 'ba\nr', np.nan, np.nan],
        ['b"ar', 'c\td', '4\r\n2', np.nan, np.nan],
        ['b__a_z', '<42>', '>42', np.nan, np.nan],
        ['baz', np.nan, '42'],
    ]
    expected_df = pd.DataFrame(rows, index=ids, columns=column_names)
    expected = Metadata(expected_df)

    self.assertEqual(loaded, expected)
def test_simple_expression(self):
    # Single-column equality filters passed to get_ids, including one that
    # matches no rows.
    frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-1', 'subject-2'],
         'SampleType': ['gut', 'tongue', 'gut']},
        index=pd.Index(['S1', 'S2', 'S3'], name='id'))
    metadata = Metadata(frame)

    # (where clause, IDs expected back)
    cases = [
        ("Subject='subject-1'", {'S1', 'S2'}),
        ("Subject='subject-2'", {'S3'}),
        ("Subject='subject-3'", set()),
        ("SampleType='gut'", {'S1', 'S3'}),
        ("SampleType='tongue'", {'S2'}),
    ]
    for where, expected_ids in cases:
        actual_ids = metadata.get_ids(where)
        self.assertEqual(actual_ids, expected_ids)
def test_artifacts(self):
    # Metadata constructed from a DataFrame is expected to report an empty
    # ``artifacts`` tuple.
    ids = pd.Index(['a', 'b', 'c'], name='id', dtype=object)
    frame = pd.DataFrame(
        {'col1': ['2', '1', '3']}, index=ids, dtype=object)

    md = Metadata(frame)

    self.assertEqual(md.artifacts, ())
def test_duplicate_columns(self):
    # Both inputs define a column named 'b', so merging them must fail with
    # a ValueError naming that column.
    ids = ['id1', 'id2']
    left = Metadata(pd.DataFrame(
        {'a': [1, 2], 'b': [3, 4]},
        index=pd.Index(ids, name='id')))
    right = Metadata(pd.DataFrame(
        {'c': [5, 6], 'b': [7, 8]},
        index=pd.Index(ids, name='id')))

    expected_message = "columns overlap: 'b'"
    with self.assertRaisesRegex(ValueError, expected_message):
        left.merge(right)
def test_disjoint_indices(self):
    # Inputs that have no IDs in common cannot be merged.
    left_df = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6]},
        index=pd.Index(['id1', 'id2', 'id3'], name='id'))
    right_df = pd.DataFrame(
        {'c': [7, 8, 9], 'd': [10, 11, 12]},
        index=pd.Index(['X', 'Y', 'Z'], name='id'))
    left = Metadata(left_df)
    right = Metadata(right_df)

    with self.assertRaisesRegex(ValueError, 'no IDs shared'):
        left.merge(right)
def test_invalid_columns_dtype(self):
    # An integer column name (42) must raise a TypeError that mentions it.
    data = {'foo': ['a', 'b'], 42: ['c', 'd']}
    expected_message = 'non-string.*column name.*42'

    with self.assertRaisesRegex(TypeError, expected_message):
        Metadata(pd.DataFrame(
            data, index=pd.Index(['0', '1'], name='id')))
def test_merging_nothing(self):
    # Calling merge() with no arguments must raise a ValueError.
    frame = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6]},
        index=pd.Index(['id1', 'id2', 'id3'], name='id'))
    metadata = Metadata(frame)

    expected_message = 'At least one Metadata.*nothing to merge'
    with self.assertRaisesRegex(ValueError, expected_message):
        metadata.merge()
def test_duplicate_columns(self):
    # A DataFrame with a repeated column name ('foo') must be rejected with
    # a ValueError naming it.
    ids = pd.Index(['a', 'b'], name='id', dtype=object)
    frame = pd.DataFrame({'foo': [1, 2], 'bar': [3, 4]}, index=ids)
    # Rename after construction: a dict literal cannot hold two 'foo' keys.
    frame.columns = ['foo', 'foo']

    expected_message = "column names must be unique.*'foo'"
    with self.assertRaisesRegex(ValueError, expected_message):
        Metadata(frame)
def test_valid_metadata_str(self):
    # With ``dtype=str`` on both the index and the frame, the column is
    # expected to be typed as categorical.
    ids = pd.Index(['a', 'b', 'c'], name='sample id', dtype=str)
    frame = pd.DataFrame({'col1': ['2', '1', '3']}, index=ids, dtype=str)

    md = Metadata(frame)

    column_type = md.columns['col1'].type
    self.assertEqual(column_type, 'categorical')
def test_valid_metadata(self):
    # With ``dtype=object`` on both the index and the frame, the column of
    # strings is expected to be typed as categorical.
    ids = pd.Index(['a', 'b', 'c'], name='feature ID', dtype=object)
    frame = pd.DataFrame(
        {'col1': ['2', '1', '3']}, index=ids, dtype=object)

    md = Metadata(frame)

    column_type = md.columns['col1'].type
    self.assertEqual(column_type, 'categorical')
def test_invalid_where(self):
    # A where clause whose left-hand side is not one of the columns must
    # raise a ValueError.
    frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-1', 'subject-2'],
         'SampleType': ['gut', 'tongue', 'gut']},
        index=pd.Index(['S1', 'S2', 'S3'], name='sampleid'))
    md = Metadata(frame)

    bad_where = "not-a-column-name='subject-1'"
    with self.assertRaises(ValueError):
        md.get_ids(bad_where)
def test_query_by_id(self):
    # The where clause may filter on the ID column itself (named 'id').
    frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-1', 'subject-2'],
         'SampleType': ['gut', 'tongue', 'gut']},
        index=pd.Index(['S1', 'S2', 'S3'], name='id'))
    md = Metadata(frame)

    matched_ids = md.get_ids(where="id='S2' OR id='S1'")

    self.assertEqual(matched_ids, {'S1', 'S2'})
def test_single_column(self):
    # Load a file with a single column and compare it with the equivalent
    # Metadata built in memory.
    path = get_data_path('valid/single-column.tsv')
    loaded = Metadata.load(path)

    ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    expected = Metadata(
        pd.DataFrame({'col1': [1.0, 2.0, 3.0]}, index=ids))

    self.assertEqual(loaded, expected)
def test_no_columns(self):
    # Load a file that has IDs but no columns and compare it with the
    # equivalent column-less Metadata built in memory.
    path = get_data_path('valid/no-columns.tsv')
    loaded = Metadata.load(path)

    ids = pd.Index(['a', 'b', 'my-id'], name='id')
    expected = Metadata(pd.DataFrame({}, index=ids))

    self.assertEqual(loaded, expected)