def _filter_ids_helper(cls, df_or_series, ids, ids_to_keep): # `ids_to_keep` can be any iterable, so turn it into a list so that it # can be iterated over multiple times below (and length-checked). ids_to_keep = list(ids_to_keep) if len(ids_to_keep) == 0: raise ValueError("`ids_to_keep` must contain at least one ID.") duplicates = find_duplicates(ids_to_keep) if duplicates: raise ValueError( "`ids_to_keep` must contain unique IDs. The following IDs are " "duplicated: %s" % (', '.join(repr(e) for e in sorted(duplicates)))) ids_to_keep = set(ids_to_keep) missing_ids = ids_to_keep - ids if missing_ids: raise ValueError( "The following IDs are not present in the metadata: %s" % (', '.join(repr(e) for e in sorted(missing_ids)))) # While preserving order, get rid of any IDs not contained in # `ids_to_keep`. ids_to_discard = ids - ids_to_keep return df_or_series.drop(labels=ids_to_discard, axis='index', inplace=False, errors='raise')
def _filter_ids_helper(cls, df_or_series, ids, ids_to_keep): # `ids_to_keep` can be any iterable, so turn it into a list so that it # can be iterated over multiple times below (and length-checked). ids_to_keep = list(ids_to_keep) if len(ids_to_keep) == 0: raise ValueError("`ids_to_keep` must contain at least one ID.") duplicates = find_duplicates(ids_to_keep) if duplicates: raise ValueError( "`ids_to_keep` must contain unique IDs. The following IDs are " "duplicated: %s" % (', '.join(repr(e) for e in sorted(duplicates)))) ids_to_keep = set(ids_to_keep) missing_ids = ids_to_keep - ids if missing_ids: raise ValueError( "The following IDs are not present in the metadata: %s" % (', '.join(repr(e) for e in sorted(missing_ids)))) # While preserving order, get rid of any IDs not contained in # `ids_to_keep`. ids_to_discard = ids - ids_to_keep return df_or_series.drop(labels=ids_to_discard, axis='index', inplace=False, errors='raise')
def _read_header(self): header = None for row in self._reader: if self._is_header(row): header = row break elif self._is_comment(row): continue elif self._is_empty(row): continue elif self._is_directive(row): raise MetadataFileError( "Found directive %r while searching for header. " "Directives may only appear immediately after the header." % row[0]) else: raise MetadataFileError( "Found unrecognized ID column name %r while searching for " "header. The first column name in the header defines the " "ID column, and must be one of these values:\n\n%s" % (row[0], FORMATTED_ID_HEADERS)) if header is None: raise MetadataFileError( "Failed to locate header. The metadata file may be empty, or " "consists only of comments or empty rows.") # Trim trailing empty cells from header. data_extent = None for idx, cell in enumerate(header): if cell != '': data_extent = idx header = header[:data_extent+1] # Basic validation to 1) fail early before processing entire file; and # 2) make some basic guarantees about the header for things in this # class that use the header as part of reading the file. column_names = set(header) if '' in column_names: raise MetadataFileError( "Found at least one column without a name in the header. Each " "column must be named.") elif len(header) != len(column_names): duplicates = find_duplicates(header) raise MetadataFileError( "Column names must be unique. The following column names are " "duplicated: %s" % (', '.join(repr(e) for e in sorted(duplicates)))) # Skip the first element of the header because we know it is a valid ID # header. The other column names are validated to ensure they *aren't* # valid ID headers. for column_name in header[1:]: if is_id_header(column_name): raise MetadataFileError( "Metadata column name %r conflicts with a name reserved " "for the ID column header. Reserved ID column headers:" "\n\n%s" % (column_name, FORMATTED_ID_HEADERS)) return header
def filter_ids(self, ids_to_keep):
    """Return a copy of this MetadataColumn restricted to `ids_to_keep`.

    Parameters
    ----------
    ids_to_keep : iterable
        IDs to retain. Any iterable is accepted, including one-shot
        iterators and generators. The order of `ids_to_keep` does not
        affect the result: retained IDs keep the relative order they have
        in this object.

    Returns
    -------
    A new instance of ``self.__class__`` built from the filtered series,
    with this object's artifacts added to it.

    Raises
    ------
    ValueError
        If `ids_to_keep` contains duplicates, names an ID not returned by
        ``self.get_ids()``, or leaves no IDs after filtering.
    """
    # `ids_to_keep` may be a one-shot iterator. Materialize it before it is
    # measured with `len()` and possibly re-scanned for duplicates;
    # otherwise `set()` would exhaust it and `len()` would raise TypeError.
    ids_to_keep = list(ids_to_keep)
    ids_to_keep_set = set(ids_to_keep)
    if len(ids_to_keep) != len(ids_to_keep_set):
        duplicates = find_duplicates(ids_to_keep)
        raise ValueError(
            "`ids_to_keep` must consist of unique IDs. The following "
            "ID(s) are duplicated: %s" %
            (', '.join(repr(e) for e in sorted(duplicates))))

    current_ids = self.get_ids()
    missing_ids = ids_to_keep_set - current_ids
    if missing_ids:
        raise ValueError(
            "The following ID(s) are not present in the MetadataColumn: %s"
            % (', '.join(repr(e) for e in sorted(missing_ids))))

    # While preserving order, get rid of any IDs not contained in
    # `ids_to_keep`.
    ids_to_discard = current_ids - ids_to_keep_set
    filtered_series = self._series.drop(
        labels=ids_to_discard, axis='index', inplace=False, errors='raise')

    # Not using Series.empty because empty columns are allowed in
    # Metadata.
    # TODO instead of erroring here, just check that `ids_to_keep` isn't
    # empty at the start of this method.
    if filtered_series.index.empty:
        raise ValueError(
            "All IDs were filtered out of the MetadataColumn, resulting "
            "in an empty MetadataColumn object.")

    filtered_mdc = self.__class__(filtered_series)
    filtered_mdc._add_artifacts(self.artifacts)
    return filtered_mdc
def filter_ids(self, ids_to_keep):
    """Return a copy of this MetadataColumn restricted to `ids_to_keep`.

    Parameters
    ----------
    ids_to_keep : iterable
        IDs to retain. Any iterable is accepted, including one-shot
        iterators and generators. The order of `ids_to_keep` does not
        affect the result: retained IDs keep the relative order they have
        in this object.

    Returns
    -------
    A new instance of ``self.__class__`` built from the filtered series,
    with this object's artifacts added to it.

    Raises
    ------
    ValueError
        If `ids_to_keep` contains duplicates, names an ID not returned by
        ``self.get_ids()``, or leaves no IDs after filtering.
    """
    # `ids_to_keep` may be a one-shot iterator. Materialize it before it is
    # measured with `len()` and possibly re-scanned for duplicates;
    # otherwise `set()` would exhaust it and `len()` would raise TypeError.
    ids_to_keep = list(ids_to_keep)
    ids_to_keep_set = set(ids_to_keep)
    if len(ids_to_keep) != len(ids_to_keep_set):
        duplicates = find_duplicates(ids_to_keep)
        raise ValueError(
            "`ids_to_keep` must consist of unique IDs. The following "
            "ID(s) are duplicated: %s" %
            (', '.join(repr(e) for e in sorted(duplicates))))

    current_ids = self.get_ids()
    missing_ids = ids_to_keep_set - current_ids
    if missing_ids:
        raise ValueError(
            "The following ID(s) are not present in the MetadataColumn: %s"
            % (', '.join(repr(e) for e in sorted(missing_ids))))

    # While preserving order, get rid of any IDs not contained in
    # `ids_to_keep`.
    ids_to_discard = current_ids - ids_to_keep_set
    filtered_series = self._series.drop(
        labels=ids_to_discard, axis='index', inplace=False, errors='raise')

    # Not using Series.empty because empty columns are allowed in
    # Metadata.
    # TODO instead of erroring here, just check that `ids_to_keep` isn't
    # empty at the start of this method.
    if filtered_series.index.empty:
        raise ValueError(
            "All IDs were filtered out of the MetadataColumn, resulting "
            "in an empty MetadataColumn object.")

    filtered_mdc = self.__class__(filtered_series)
    filtered_mdc._add_artifacts(self.artifacts)
    return filtered_mdc
def _read_header(self): header = None for row in self._reader: if self._is_header(row): header = row break elif self._is_comment(row): continue elif self._is_empty(row): continue elif self._is_directive(row): raise MetadataFileError( "Found directive %r while searching for header. " "Directives may only appear immediately after the header." % row[0]) else: raise MetadataFileError( "Found unrecognized ID column name %r while searching for " "header. The first column name in the header defines the " "ID column, and must be one of these values:\n\n%s" % (row[0], FORMATTED_ID_HEADERS)) if header is None: raise MetadataFileError( "Failed to locate header. The metadata file may be empty, or " "consists only of comments or empty rows.") # Trim trailing empty cells from header. data_extent = None for idx, cell in enumerate(header): if cell != '': data_extent = idx header = header[:data_extent+1] # Basic validation to 1) fail early before processing entire file; and # 2) make some basic guarantees about the header for things in this # class that use the header as part of reading the file. column_names = set(header) if '' in column_names: raise MetadataFileError( "Found at least one column without a name in the header. Each " "column must be named.") elif len(header) != len(column_names): duplicates = find_duplicates(header) raise MetadataFileError( "Column names must be unique. The following column names are " "duplicated: %s" % (', '.join(repr(e) for e in sorted(duplicates)))) # Skip the first element of the header because we know it is a valid ID # header. The other column names are validated to ensure they *aren't* # valid ID headers. for column_name in header[1:]: if is_id_header(column_name): raise MetadataFileError( "Metadata column name %r conflicts with a name reserved " "for the ID column header. Reserved ID column headers:" "\n\n%s" % (column_name, FORMATTED_ID_HEADERS)) return header
def test_different_hashables(self):
    # Mixed hashable types (str, int, float, tuple): 'foo', 42 and
    # ('a', 'b') each occur more than once; the other values occur once.
    values = [
        'foo', 42, -9.999, 'baz', ('a', 'b'), 42, 'foo', ('a', 'b', 'c'),
        ('a', 'b'),
    ]
    expected = {'foo', 42, ('a', 'b')}

    # Pass a one-shot iterator rather than the list itself.
    observed = util.find_duplicates(iter(values))

    self.assertEqual(observed, expected)
def filter_ids(self, ids_to_keep):
    """Filter metadata by IDs.

    Parameters
    ----------
    ids_to_keep : iterable of str
        IDs that should be retained in the filtered ``Metadata`` object.
        Any iterable is accepted, including one-shot iterators and
        generators. If any IDs in `ids_to_keep` are not contained in this
        ``Metadata`` object, a ``ValueError`` will be raised. The filtered
        ``Metadata`` object will retain the same relative ordering of IDs
        in this ``Metadata`` object. Thus, the ordering of IDs in
        `ids_to_keep` does not determine the ordering of IDs in the
        filtered ``Metadata`` object.

    Returns
    -------
    Metadata
        The metadata filtered by IDs.

    Raises
    ------
    ValueError
        If `ids_to_keep` contains duplicates, names an ID not returned by
        ``self.get_ids()``, or leaves no IDs after filtering.

    """
    # `ids_to_keep` may be a one-shot iterator. Materialize it before it is
    # measured with `len()` and possibly re-scanned for duplicates;
    # otherwise `set()` would exhaust it and `len()` would raise TypeError.
    ids_to_keep = list(ids_to_keep)
    ids_to_keep_set = set(ids_to_keep)
    if len(ids_to_keep) != len(ids_to_keep_set):
        duplicates = find_duplicates(ids_to_keep)
        raise ValueError(
            "`ids_to_keep` must consist of unique IDs. The following "
            "ID(s) are duplicated: %s" %
            (', '.join(repr(e) for e in sorted(duplicates))))

    current_ids = self.get_ids()
    missing_ids = ids_to_keep_set - current_ids
    if missing_ids:
        raise ValueError(
            "The following ID(s) are not present in the Metadata: %s" %
            (', '.join(repr(e) for e in sorted(missing_ids))))

    # While preserving order, get rid of any IDs not contained in
    # `ids_to_keep`.
    ids_to_discard = current_ids - ids_to_keep_set
    filtered_df = self._dataframe.drop(labels=ids_to_discard, axis='index',
                                       inplace=False, errors='raise')

    # Not using DataFrame.empty because empty columns are allowed in
    # Metadata.
    # TODO instead of erroring here, just check that `ids_to_keep` isn't
    # empty at the start of this method.
    if filtered_df.index.empty:
        raise ValueError(
            "All IDs were filtered out of the Metadata, resulting in an "
            "empty Metadata object.")

    filtered_md = self.__class__(filtered_df)
    filtered_md._add_artifacts(self.artifacts)
    return filtered_md
def filter_ids(self, ids_to_keep):
    """Filter metadata by IDs.

    Parameters
    ----------
    ids_to_keep : iterable of str
        IDs that should be retained in the filtered ``Metadata`` object.
        Any iterable is accepted, including one-shot iterators and
        generators. If any IDs in `ids_to_keep` are not contained in this
        ``Metadata`` object, a ``ValueError`` will be raised. The filtered
        ``Metadata`` object will retain the same relative ordering of IDs
        in this ``Metadata`` object. Thus, the ordering of IDs in
        `ids_to_keep` does not determine the ordering of IDs in the
        filtered ``Metadata`` object.

    Returns
    -------
    Metadata
        The metadata filtered by IDs.

    Raises
    ------
    ValueError
        If `ids_to_keep` contains duplicates, names an ID not returned by
        ``self.get_ids()``, or leaves no IDs after filtering.

    """
    # `ids_to_keep` may be a one-shot iterator. Materialize it before it is
    # measured with `len()` and possibly re-scanned for duplicates;
    # otherwise `set()` would exhaust it and `len()` would raise TypeError.
    ids_to_keep = list(ids_to_keep)
    ids_to_keep_set = set(ids_to_keep)
    if len(ids_to_keep) != len(ids_to_keep_set):
        duplicates = find_duplicates(ids_to_keep)
        raise ValueError(
            "`ids_to_keep` must consist of unique IDs. The following "
            "ID(s) are duplicated: %s" %
            (', '.join(repr(e) for e in sorted(duplicates))))

    current_ids = self.get_ids()
    missing_ids = ids_to_keep_set - current_ids
    if missing_ids:
        raise ValueError(
            "The following ID(s) are not present in the Metadata: %s" %
            (', '.join(repr(e) for e in sorted(missing_ids))))

    # While preserving order, get rid of any IDs not contained in
    # `ids_to_keep`.
    ids_to_discard = current_ids - ids_to_keep_set
    filtered_df = self._dataframe.drop(labels=ids_to_discard, axis='index',
                                       inplace=False, errors='raise')

    # Not using DataFrame.empty because empty columns are allowed in
    # Metadata.
    # TODO instead of erroring here, just check that `ids_to_keep` isn't
    # empty at the start of this method.
    if filtered_df.index.empty:
        raise ValueError(
            "All IDs were filtered out of the Metadata, resulting in an "
            "empty Metadata object.")

    filtered_md = self.__class__(filtered_df)
    filtered_md._add_artifacts(self.artifacts)
    return filtered_md
def __init__(self, *choices): if not choices: raise ValueError("'Choices' cannot be instantiated with an empty" " set.") # Backwards compatibility with old Choices({1, 2, 3}) syntax if len(choices) == 1: if not isinstance(choices[0], (bool, str)): choices = choices[0] self.choices = choices = tuple(choices) if len(choices) != len(set(choices)): raise ValueError("Duplicates found in choices: %r" % util.find_duplicates(choices))
def _read_header(self): header = None for record in self._reader: if self._is_header(record): header = record break elif self._is_comment(record): continue elif self._is_empty(record): continue elif self._is_directive(record): raise MetadataFileError( "Found directive %r when searching for header. Directives " "may only appear immediately after the header." % record[0]) else: # TODO better error message to hint at what to do raise MetadataFileError("Invalid header: %r" % record) if header is None: raise MetadataFileError( "Failed to locate header. The metadata file may be empty, or " "consists only of comments or empty records.") # Trim trailing empty cells from header. data_extent = None for idx, cell in enumerate(header): if cell != '': data_extent = idx header = header[:data_extent+1] # Basic validation to 1) fail early before processing entire file; and # 2) make some basic guarantees about the header for things in this # class that use the header as part of reading the file. column_names = set(header) if '' in column_names: raise MetadataFileError( "Found at least one column without a name in the header. Each " "column must be named.") elif len(header) != len(column_names): duplicates = find_duplicates(header) raise MetadataFileError( "Column names must be unique. The following column name(s) " "are duplicated: %s" % (', '.join(repr(e) for e in sorted(duplicates)))) return header
def _validate_index(cls, index, *, axis): if axis == 'id': label = 'ID' elif axis == 'column': label = 'column name' else: raise NotImplementedError for value in index: if not isinstance(value, str): raise TypeError( "Detected non-string metadata %s of type %r: %r" % (label, type(value), value)) if not value: raise ValueError( "Detected empty metadata %s. %ss must consist of at least " "one character." % (label, label)) if value != value.strip(): raise ValueError( "Detected metadata %s with leading or trailing " "whitespace characters: %r" % (label, value)) if axis == 'id' and value.startswith('#'): raise ValueError( "Detected metadata %s that begins with a pound sign " "(#): %r" % (label, value)) if is_id_header(value): raise ValueError( "Detected metadata %s %r that conflicts with a name " "reserved for the ID header. Reserved ID headers:\n\n%s" % (label, value, FORMATTED_ID_HEADERS)) if len(index) != len(set(index)): duplicates = find_duplicates(index) raise ValueError( "Metadata %ss must be unique. The following %ss are " "duplicated: %s" % (label, label, ', '.join(repr(e) for e in sorted(duplicates))))
def _validate_index(cls, index, *, axis): if axis == 'id': label = 'ID' elif axis == 'column': label = 'column name' else: raise NotImplementedError for value in index: if not isinstance(value, str): raise TypeError( "Detected non-string metadata %s of type %r: %r" % (label, type(value), value)) if not value: raise ValueError( "Detected empty metadata %s. %ss must consist of at least " "one character." % (label, label)) if value != value.strip(): raise ValueError( "Detected metadata %s with leading or trailing " "whitespace characters: %r" % (label, value)) if axis == 'id' and value.startswith('#'): raise ValueError( "Detected metadata %s that begins with a pound sign " "(#): %r" % (label, value)) if is_id_header(value): raise ValueError( "Detected metadata %s %r that conflicts with a name " "reserved for the ID header. Reserved ID headers:\n\n%s" % (label, value, FORMATTED_ID_HEADERS)) if len(index) != len(set(index)): duplicates = find_duplicates(index) raise ValueError( "Metadata %ss must be unique. The following %ss are " "duplicated: %s" % (label, label, ', '.join(repr(e) for e in sorted(duplicates))))
def _validate_pandas_index(self, index, label): for value in index: # TODO raise a better error message for "missing values" # (e.g. np.nan, None), right now users will get a "non-string # metadata ID" error message, which isn't the most intuitive. if not isinstance(value, str): raise TypeError( "Detected non-string metadata %s: %r" % (label, value)) if not value: raise ValueError( "Detected empty metadata %s. %ss must consist of at least " "one character." % (label, label)) if value != value.strip(): raise ValueError( "Detected metadata %s with leading or trailing " "whitespace characters: %r" % (label, value)) # HACK: don't use label as a conditional here if label == 'ID' and value.startswith('#'): raise ValueError( "Detected metadata %s that begins with the pound sign " "(#): %r" % (label, value)) try: self._assert_valid_id_header(value) except ValueError: pass else: raise ValueError( "Detected metadata %s that conflicts with a name reserved " "for ID headers: %r" % (label, value)) if len(index) != len(set(index)): duplicates = find_duplicates(index) raise ValueError( "Metadata %ss must be unique. The following %ss are " "duplicated: %s" % (label, label, ', '.join(repr(e) for e in sorted(duplicates))))
def _validate_pandas_index(self, index, label): for value in index: # TODO raise a better error message for "missing values" # (e.g. np.nan, None), right now users will get a "non-string # metadata ID" error message, which isn't the most intuitive. if not isinstance(value, str): raise TypeError( "Detected non-string metadata %s: %r" % (label, value)) if not value: raise ValueError( "Detected empty metadata %s: %r" % (label, value)) if value != value.strip(): raise ValueError( "Detected metadata %s with leading or trailing " "whitespace characters: %r" % (label, value)) # HACK: don't use label as a conditional here if label == 'ID' and value.startswith('#'): raise ValueError( "Detected metadata %s that begins with the pound sign " "(#): %r" % (label, value)) try: self._assert_valid_id_header(value) except ValueError: pass else: raise ValueError( "Detected metadata %s that conflicts with a name reserved " "for ID headers: %r" % (label, value)) if len(index) != len(set(index)): duplicates = find_duplicates(index) raise ValueError( "Metadata %ss must be unique. The following %ss are " "duplicated: %s" % (label, label, ', '.join(repr(e) for e in sorted(duplicates))))
def test_single_value(self):
    # A lone element has nothing to collide with, so no duplicates are
    # expected.
    lone_value = iter(['foo'])

    self.assertEqual(util.find_duplicates(lone_value), set())
def test_all_duplicates(self):
    # Every distinct value appears exactly twice, so all of them are
    # expected in the result.
    values = ['foo', 'bar', 'baz', 'baz', 'bar', 'foo']
    expected = {'foo', 'bar', 'baz'}

    observed = util.find_duplicates(iter(values))

    self.assertEqual(observed, expected)
def test_different_hashables(self):
    # Mixed hashable types (str, int, float, tuple): 'foo', 42 and
    # ('a', 'b') each occur more than once; the other values occur once.
    values = [
        'foo', 42, -9.999, 'baz', ('a', 'b'), 42, 'foo', ('a', 'b', 'c'),
        ('a', 'b'),
    ]
    expected = {'foo', 42, ('a', 'b')}

    # Pass a one-shot iterator rather than the list itself.
    observed = util.find_duplicates(iter(values))

    self.assertEqual(observed, expected)
def test_all_duplicates(self):
    # Every distinct value appears exactly twice, so all of them are
    # expected in the result.
    values = ['foo', 'bar', 'baz', 'baz', 'bar', 'foo']
    expected = {'foo', 'bar', 'baz'}

    observed = util.find_duplicates(iter(values))

    self.assertEqual(observed, expected)
def test_multiple_values_no_duplicates(self):
    # Two distinct values: nothing repeats, so the result should be empty.
    distinct_values = iter(['foo', 'bar'])

    self.assertEqual(util.find_duplicates(distinct_values), set())
def test_single_value(self):
    # A lone element has nothing to collide with, so no duplicates are
    # expected.
    lone_value = iter(['foo'])

    self.assertEqual(util.find_duplicates(lone_value), set())
def test_empty_iterable(self):
    # An iterator that yields nothing should produce an empty set.
    nothing = iter([])

    self.assertEqual(util.find_duplicates(nothing), set())
def test_multiple_values_no_duplicates(self):
    # Two distinct values: nothing repeats, so the result should be empty.
    distinct_values = iter(['foo', 'bar'])

    self.assertEqual(util.find_duplicates(distinct_values), set())
def test_empty_iterable(self):
    # An iterator that yields nothing should produce an empty set.
    nothing = iter([])

    self.assertEqual(util.find_duplicates(nothing), set())