def test_duplicate_columns_self_merge(self):
    md = Metadata(pd.DataFrame(
        {'a': [1, 2], 'b': [3, 4]},
        index=pd.Index(['id1', 'id2'], name='id')))

    with self.assertRaisesRegex(ValueError, "columns overlap: 'a', 'b'"):
        md.merge(md)

def test_index_and_column_merge_order(self):
    md1 = Metadata(pd.DataFrame(
        [[1], [2], [3], [4]],
        index=pd.Index(['id1', 'id2', 'id3', 'id4'], name='id'),
        columns=['a']))
    md2 = Metadata(pd.DataFrame(
        [[5], [6], [7]],
        index=pd.Index(['id4', 'id3', 'id1'], name='id'),
        columns=['b']))
    md3 = Metadata(pd.DataFrame(
        [[8], [9], [10]],
        index=pd.Index(['id1', 'id4', 'id3'], name='id'),
        columns=['c']))

    obs = md1.merge(md2, md3)

    exp = Metadata(pd.DataFrame(
        [[1, 7, 8], [3, 6, 10], [4, 5, 9]],
        index=pd.Index(['id1', 'id3', 'id4'], name='id'),
        columns=['a', 'b', 'c']))
    self.assertEqual(obs, exp)

    # Merging in a different order produces a different ID/column order.
    obs = md2.merge(md1, md3)

    exp = Metadata(pd.DataFrame(
        [[5, 4, 9], [6, 3, 10], [7, 1, 8]],
        index=pd.Index(['id4', 'id3', 'id1'], name='id'),
        columns=['b', 'a', 'c']))
    self.assertEqual(obs, exp)

def test_ids_and_column_names_as_numeric_strings(self):
    index = pd.Index(['0.000001', '0.004000', '0.000000'],
                     dtype=object, name='id')
    columns = ['42.0', '1000', '-4.2']
    data = [
        [2.0, 'b', 2.5],
        [1.0, 'b', 4.2],
        [3.0, 'c', -9.999]
    ]
    df = pd.DataFrame(data, index=index, columns=columns)
    md = Metadata(df)

    md.save(self.filepath)

    with open(self.filepath, 'r') as fh:
        obs = fh.read()

    exp = (
        "id\t42.0\t1000\t-4.2\n"
        "#q2:types\tnumeric\tcategorical\tnumeric\n"
        "0.000001\t2\tb\t2.5\n"
        "0.004000\t1\tb\t4.2\n"
        "0.000000\t3\tc\t-9.999\n"
    )

    self.assertEqual(obs, exp)

def test_invalid_header(self):
    fp = get_data_path('invalid/invalid-header.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'unrecognized ID column name.*'
                                'invalid_id_header'):
        Metadata.load(fp)

def test_column_types_unrecognized_column_name(self):
    fp = get_data_path('valid/simple.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'not_a_column.*column_types.*not a column '
                                'in the metadata file'):
        Metadata.load(fp, column_types={'not_a_column': 'numeric'})

def test_duplicate_column_names_with_whitespace(self):
    fp = get_data_path(
        'invalid/duplicate-column-names-with-whitespace.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'Column names must be unique.*col1'):
        Metadata.load(fp)

def test_directive_before_header(self):
    fp = get_data_path('invalid/directive-before-header.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'directive.*#q2:types.*searching for '
                                'header'):
        Metadata.load(fp)

def test_qiime1_empty_mapping_file(self):
    fp = pkg_resources.resource_filename(
        'qiime2.metadata.tests', 'data/qiime1-empty.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'at least one ID.*empty'):
        Metadata.load(fp)

def test_unrecognized_column_type_in_directive(self):
    fp = get_data_path('invalid/unrecognized-column-type.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'col2.*unrecognized column type.*foo.*'
                                '#q2:types directive'):
        Metadata.load(fp)

def test_unrecognized_directive(self):
    fp = get_data_path('invalid/unrecognized-directive.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'Unrecognized directive.*#q2:foo.*'
                                '#q2:types directive is supported'):
        Metadata.load(fp)

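# For reference, the directive tests above exercise the optional "#q2:types"
# row that immediately follows the header and labels each column as numeric or
# categorical. The layout sketched here (tab-separated) matches the output
# asserted verbatim in test_ids_and_column_names_as_numeric_strings above:
#
#   id          col1     col2
#   #q2:types   numeric  categorical
#   id1         1        a
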
def test_data_longer_than_header(self):
    fp = get_data_path('invalid/data-longer-than-header.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'row has 5 cells.*header declares 4 '
                                'cells'):
        Metadata.load(fp)

def test_path_is_directory(self):
    fp = get_data_path('valid')

    with self.assertRaisesRegex(MetadataFileError,
                                "path points to something other than a "
                                "file"):
        Metadata.load(fp)

def test_comments_and_empty_rows_only(self):
    fp = get_data_path('invalid/comments-and-empty-rows-only.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'locate header.*only of comments or empty '
                                'rows'):
        Metadata.load(fp)

def test_inner_join(self):
    md1 = Metadata(pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6]},
        index=pd.Index(['id1', 'id2', 'id3'], name='id')))
    md2 = Metadata(pd.DataFrame(
        {'c': [7, 8, 9], 'd': [10, 11, 12]},
        index=pd.Index(['id2', 'X', 'Y'], name='id')))
    md3 = Metadata(pd.DataFrame(
        {'e': [13, 14, 15], 'f': [16, 17, 18]},
        index=pd.Index(['X', 'id3', 'id2'], name='id')))

    # Single shared ID.
    obs = md1.merge(md2, md3)

    exp = Metadata(pd.DataFrame(
        {'a': [2], 'b': [5], 'c': [7], 'd': [10], 'e': [15], 'f': [18]},
        index=pd.Index(['id2'], name='id')))
    self.assertEqual(obs, exp)

    # Multiple shared IDs.
    obs = md1.merge(md3)

    exp = Metadata(pd.DataFrame(
        {'a': [2, 3], 'b': [5, 6], 'e': [15, 14], 'f': [18, 17]},
        index=pd.Index(['id2', 'id3'], name='id')))
    self.assertEqual(obs, exp)

def test_column_types_override_directive_not_convertible_to_numeric(self):
    fp = get_data_path('valid/simple-with-directive.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                "column 'col3' to numeric.*could not be "
                                "interpreted as numeric: 'bar', 'foo'"):
        Metadata.load(fp, column_types={'col3': 'numeric'})

def test_empty_file(self):
    fp = pkg_resources.resource_filename(
        'qiime2.metadata.tests', 'data/empty')

    with self.assertRaisesRegex(MetadataFileError,
                                'locate header.*file may be empty'):
        Metadata.load(fp)

def test_various_numbers(self):
    numbers = [
        0.0, -0.0, np.nan, 1.0, 42.0, -33.0, 1e-10, 1.5e15, 0.0003,
        -4.234,
        # This last number should be rounded because it exceeds 15 digits
        # of precision.
        12.34567891234567
    ]
    index = pd.Index(['id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7',
                      'id8', 'id9', 'id10', 'id11'], name='ID')
    md = Metadata(pd.DataFrame({'numbers': numbers}, index=index))

    md.save(self.filepath)

    with open(self.filepath, 'r') as fh:
        obs = fh.read()

    exp = (
        "ID\tnumbers\n"
        "#q2:types\tnumeric\n"
        "id1\t0\n"
        "id2\t-0\n"
        "id3\t\n"
        "id4\t1\n"
        "id5\t42\n"
        "id6\t-33\n"
        "id7\t1e-10\n"
        "id8\t1.5e+15\n"
        "id9\t0.0003\n"
        "id10\t-4.234\n"
        "id11\t12.3456789123457\n"
    )

    self.assertEqual(obs, exp)

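# The expected output above is consistent with rendering each float at 15
# significant digits and writing NaN as an empty cell. The helper below is an
# illustrative sketch of that formatting rule only (an assumption drawn from
# the expected strings, not necessarily how Metadata.save is implemented).
import math


def _sketch_format_numeric_cell(value):
    """Render a float the way the expected TSV output above renders it."""
    if math.isnan(value):
        return ''  # missing values are written as empty cells
    return format(value, '.15g')  # 15 significant digits, no trailing zeros


assert _sketch_format_numeric_cell(1.5e15) == '1.5e+15'
assert _sketch_format_numeric_cell(12.34567891234567) == '12.3456789123457'
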
def test_merging_nothing(self):
    md = Metadata(pd.DataFrame(
        {'a': [1, 2, 3], 'b': [4, 5, 6]},
        index=pd.Index(['id1', 'id2', 'id3'], name='id')))

    with self.assertRaisesRegex(ValueError,
                                'At least one Metadata.*nothing to merge'):
        md.merge()

def test_numeric_column(self):
    fp = get_data_path('valid/numeric-column.tsv')
    md1 = Metadata.load(fp)

    md1.save(self.filepath)
    md2 = Metadata.load(self.filepath)

    self.assertEqual(md1, md2)

def test_minimal_file(self):
    fp = get_data_path('valid/minimal.tsv')
    md1 = Metadata.load(fp)

    md1.save(self.filepath)
    md2 = Metadata.load(self.filepath)

    self.assertEqual(md1, md2)

def test_all_cells_padded(self):
    fp = get_data_path('valid/all-cells-padded.tsv')
    md1 = Metadata.load(fp)

    md1.save(self.filepath)
    md2 = Metadata.load(self.filepath)

    self.assertEqual(md1, md2)

def test_column_name_conflicts_with_id_header(self):
    fp = get_data_path(
        'invalid/column-name-conflicts-with-id-header.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                "column name 'featureid' conflicts.*ID "
                                "column header"):
        Metadata.load(fp)

def test_column_types_unrecognized_column_type(self):
    fp = get_data_path('valid/simple.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'col2.*column_types.*unrecognized column '
                                'type.*CATEGORICAL'):
        Metadata.load(fp, column_types={'col1': 'numeric',
                                        'col2': 'CATEGORICAL'})

def test_non_standard_characters(self):
    fp = get_data_path('valid/non-standard-characters.tsv')
    md1 = Metadata.load(fp)

    md1.save(self.filepath)
    md2 = Metadata.load(self.filepath)

    self.assertEqual(md1, md2)

def test_directive_after_directives_section(self):
    fp = get_data_path(
        'invalid/directive-after-directives-section.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                '#q2:types.*outside of the directives '
                                'section'):
        Metadata.load(fp)

def test_query_by_id(self):
    df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                       'SampleType': ['gut', 'tongue', 'gut']},
                      index=pd.Index(['S1', 'S2', 'S3'], name='id'))
    metadata = Metadata(df)

    actual = metadata.get_ids(where="id='S2' OR id='S1'")
    expected = {'S1', 'S2'}
    self.assertEqual(actual, expected)

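# The `where` argument exercised by the get_ids() tests is a SQL-style filter
# evaluated against the metadata. The helper below is an illustrative sketch
# of that idea using an in-memory SQLite table; the table name 'metadata' and
# the use of sqlite3 here are assumptions for demonstration, not a claim about
# how Metadata.get_ids is implemented.
import sqlite3


def _sketch_get_ids(df, where):
    """Return the set of IDs in ``df`` whose rows match a SQL WHERE clause."""
    conn = sqlite3.connect(':memory:')
    try:
        # Write the frame (including its ID index) to a throwaway table.
        df.to_sql('metadata', conn, index=True, index_label=df.index.name)
        rows = conn.execute(
            'SELECT "%s" FROM metadata WHERE %s' % (df.index.name, where))
        return {row[0] for row in rows}
    finally:
        conn.close()

# e.g. _sketch_get_ids(df, "Subject='subject-1'") -> {'S1', 'S2'} for the
# DataFrame used in test_query_by_id above.
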
def test_invalid_where(self):
    df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                       'SampleType': ['gut', 'tongue', 'gut']},
                      index=pd.Index(['S1', 'S2', 'S3'], name='sampleid'))
    metadata = Metadata(df)

    where = "not-a-column-name='subject-1'"
    with self.assertRaises(ValueError):
        metadata.get_ids(where)

def test_no_artifacts(self):
    md1 = Metadata(pd.DataFrame(
        {'a': [1, 2]}, index=pd.Index(['id1', 'id2'], name='id')))
    md2 = Metadata(pd.DataFrame(
        {'b': [3, 4]}, index=pd.Index(['id1', 'id2'], name='id')))

    metadata = md1.merge(md2)

    self.assertEqual(metadata.artifacts, ())

def test_duplicate_columns(self):
    md1 = Metadata(pd.DataFrame(
        {'a': [1, 2], 'b': [3, 4]},
        index=pd.Index(['id1', 'id2'], name='id')))
    md2 = Metadata(pd.DataFrame(
        {'c': [5, 6], 'b': [7, 8]},
        index=pd.Index(['id1', 'id2'], name='id')))

    with self.assertRaisesRegex(ValueError, "columns overlap: 'b'"):
        md1.merge(md2)

def test_empty_result(self):
    df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                       'SampleType': ['gut', 'tongue', 'gut']},
                      index=pd.Index(['S1', 'S2', 'S3'], name='id'))
    metadata = Metadata(df)

    where = "Subject='subject-3'"
    actual = metadata.get_ids(where)
    expected = set()
    self.assertEqual(actual, expected)

def test_header_only(self):
    fp = get_data_path('invalid/header-only.tsv')

    with self.assertRaisesRegex(MetadataFileError, 'at least one ID'):
        Metadata.load(fp)

def test_save_metadata_auto_extension(self):
    md = Metadata(
        pd.DataFrame(
            {
                'col1': [1.0, 2.0, 3.0],
                'col2': ['a', 'b', 'c'],
                'col3': ['foo', 'bar', '42']
            },
            index=pd.Index(['id1', 'id2', 'id3'], name='id')))

    # Filename ends with the extension's characters but lacks the period,
    # so the extension is still appended.
    fp = os.path.join(self.temp_dir, 'metadatatsv')
    obs_md = md.save(fp, '.tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadatatsv.tsv')

    # No period in filename; no extension included.
    fp = os.path.join(self.temp_dir, 'metadata')
    obs_md = md.save(fp)
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata')

    # No period in filename; no period in extension.
    fp = os.path.join(self.temp_dir, 'metadata')
    obs_md = md.save(fp, 'tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # No period in filename; multiple periods in extension.
    fp = os.path.join(self.temp_dir, 'metadata')
    obs_md = md.save(fp, '..tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # Single period in filename; no period in extension.
    fp = os.path.join(self.temp_dir, 'metadata.')
    obs_md = md.save(fp, 'tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # Single period in filename; single period in extension.
    fp = os.path.join(self.temp_dir, 'metadata.')
    obs_md = md.save(fp, '.tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # Single period in filename; multiple periods in extension.
    fp = os.path.join(self.temp_dir, 'metadata.')
    obs_md = md.save(fp, '..tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # Multiple periods in filename; single period in extension.
    fp = os.path.join(self.temp_dir, 'metadata..')
    obs_md = md.save(fp, '.tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # Multiple periods in filename; multiple periods in extension.
    fp = os.path.join(self.temp_dir, 'metadata..')
    obs_md = md.save(fp, '..tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # No extension in filename; no extension input.
    fp = os.path.join(self.temp_dir, 'metadata')
    obs_md = md.save(fp)
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata')

    # No extension in filename; extension input.
    fp = os.path.join(self.temp_dir, 'metadata')
    obs_md = md.save(fp, '.tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # Extension in filename; no extension input.
    fp = os.path.join(self.temp_dir, 'metadata.tsv')
    obs_md = md.save(fp)
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

    # Extension in filename; extension input (non-matching).
    fp = os.path.join(self.temp_dir, 'metadata.tsv')
    obs_md = md.save(fp, '.txt')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv.txt')

    # Extension in filename; extension input (matching).
    fp = os.path.join(self.temp_dir, 'metadata.tsv')
    obs_md = md.save(fp, '.tsv')
    obs_filename = os.path.basename(obs_md)
    self.assertEqual(obs_filename, 'metadata.tsv')

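# The assertions above imply a simple naming rule: normalize the requested
# extension to a single leading period, strip trailing periods from the
# filepath, and append the extension only if the filepath does not already
# end with it. The helper below is an illustrative sketch of that rule (an
# assumption drawn from the expected filenames, not the actual Metadata.save
# implementation).
def _sketch_with_extension(filepath, ext=None):
    """Reproduce the filename behavior asserted in the test above."""
    if not ext:
        return filepath                # no extension requested
    ext = '.' + ext.lstrip('.')        # 'tsv', '.tsv', '..tsv' -> '.tsv'
    filepath = filepath.rstrip('.')    # 'metadata..' -> 'metadata'
    if not filepath.endswith(ext):
        filepath += ext                # 'metadata.tsv' + '.txt' -> '.tsv.txt'
    return filepath


assert _sketch_with_extension('metadatatsv', '.tsv') == 'metadatatsv.tsv'
assert _sketch_with_extension('metadata..', '..tsv') == 'metadata.tsv'
assert _sketch_with_extension('metadata.tsv', '.tsv') == 'metadata.tsv'
assert _sketch_with_extension('metadata.tsv', '.txt') == 'metadata.tsv.txt'
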
def test_empty_id(self):
    fp = get_data_path('invalid/empty-id.tsv')

    with self.assertRaisesRegex(MetadataFileError, 'empty metadata ID'):
        Metadata.load(fp)

def test_whitespace_only_column_name(self):
    fp = get_data_path('invalid/whitespace-only-column-name.tsv')

    with self.assertRaisesRegex(MetadataFileError, 'column without a name'):
        Metadata.load(fp)

def test_duplicate_column_names(self):
    fp = get_data_path('invalid/duplicate-column-names.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'Column names must be unique.*col1'):
        Metadata.load(fp)

def test_utf_16_be_file(self):
    fp = get_data_path('invalid/simple-utf-16be.txt')

    with self.assertRaisesRegex(MetadataFileError, 'UTF-16 Unicode'):
        Metadata.load(fp)

def test_empty_file(self):
    fp = get_data_path('invalid/empty-file')

    with self.assertRaisesRegex(MetadataFileError,
                                'locate header.*file may be empty'):
        Metadata.load(fp)

def test_no_source_artifacts(self):
    fp = get_data_path('valid/simple.tsv')
    metadata = Metadata.load(fp)

    self.assertEqual(metadata.artifacts, ())

def test_path_does_not_exist(self):
    with self.assertRaisesRegex(MetadataFileError,
                                "Metadata file path doesn't exist"):
        Metadata.load(
            '/qiime2/unit/tests/hopefully/this/path/does/not/exist')

def test_leading_trailing_whitespace(self):
    fp = get_data_path('valid/leading-trailing-whitespace.tsv')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_with_empty_types_directive(self):
    fp = get_data_path('valid/empty-types-directive.tsv')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_different_file_extension(self):
    fp = get_data_path('valid/simple.txt')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_whitespace_only_id(self):
    fp = get_data_path('invalid/whitespace-only-id.tsv')

    with self.assertRaisesRegex(MetadataFileError, 'empty metadata ID'):
        Metadata.load(fp)

def test_duplicate_directives(self):
    fp = get_data_path('invalid/duplicate-directives.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'duplicate directive.*#q2:types'):
        Metadata.load(fp)

def test_empty_column_name(self):
    fp = get_data_path('invalid/empty-column-name.tsv')

    with self.assertRaisesRegex(MetadataFileError, 'column without a name'):
        Metadata.load(fp)

def test_id_conflicts_with_id_header(self):
    fp = get_data_path('invalid/id-conflicts-with-id-header.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                "ID 'id' conflicts.*ID column header"):
        Metadata.load(fp)

def test_trailing_columns(self):
    fp = get_data_path('valid/trailing-columns.tsv')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_duplicate_ids_with_whitespace(self):
    fp = get_data_path('invalid/duplicate-ids-with-whitespace.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'IDs must be unique.*id1'):
        Metadata.load(fp)

def test_no_newline_at_eof(self):
    fp = get_data_path('valid/no-newline-at-eof.tsv')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_empty_rows(self):
    fp = get_data_path('valid/empty-rows.tsv')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_comments(self):
    fp = get_data_path('valid/comments.tsv')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_qiime1_empty_mapping_file(self):
    fp = get_data_path('invalid/qiime1-empty.tsv')

    with self.assertRaisesRegex(MetadataFileError, 'at least one ID'):
        Metadata.load(fp)

def test_mac_line_endings(self):
    fp = get_data_path('valid/mac-line-endings.tsv')
    obs_md = Metadata.load(fp)

    self.assertEqual(obs_md, self.simple_md)

def test_header_only_with_comments_and_empty_rows(self):
    fp = get_data_path(
        'invalid/header-only-with-comments-and-empty-rows.tsv')

    with self.assertRaisesRegex(MetadataFileError, 'at least one ID'):
        Metadata.load(fp)

def test_non_utf_8_file(self):
    fp = get_data_path('invalid/non-utf-8.tsv')

    with self.assertRaisesRegex(MetadataFileError,
                                'encoded as UTF-8 or ASCII'):
        Metadata.load(fp)