Python find_duplicatesの例、qiime2.core.util.find_duplicates Pythonの例

コード例 #1

0

ファイルを表示

ファイル: metadata.py プロジェクト: zhuangwb/qiime2

    def _filter_ids_helper(cls, df_or_series, ids, ids_to_keep):
        """Drop every entry of `df_or_series` whose ID is not requested.

        Parameters
        ----------
        df_or_series
            Object whose ``drop`` method is used to remove the unwanted IDs
            along the ``'index'`` axis.
        ids
            The IDs currently present; must support set difference with a
            ``set``.
        ids_to_keep : iterable
            IDs to retain. Any iterable is accepted.

        Raises
        ------
        ValueError
            If `ids_to_keep` is empty, contains duplicates, or names IDs that
            are not in `ids`.
        """
        # Materialize once: the requested IDs are emptiness-checked, scanned
        # for duplicates, and turned into a set, which a one-shot iterator
        # could not support.
        requested = list(ids_to_keep)
        if not requested:
            raise ValueError("`ids_to_keep` must contain at least one ID.")

        repeated = find_duplicates(requested)
        if repeated:
            listing = ', '.join(map(repr, sorted(repeated)))
            raise ValueError(
                "`ids_to_keep` must contain unique IDs. The following IDs are "
                "duplicated: %s" % listing)

        wanted = set(requested)
        absent = wanted - ids
        if absent:
            listing = ', '.join(map(repr, sorted(absent)))
            raise ValueError(
                "The following IDs are not present in the metadata: %s"
                % listing)

        # Dropping the complement (instead of selecting `wanted`) leaves the
        # surviving entries in their existing order.
        unwanted = ids - wanted
        return df_or_series.drop(labels=unwanted, axis='index',
                                 inplace=False, errors='raise')

コード例 #2

0

ファイルを表示

ファイル: metadata.py プロジェクト: jakereps/qiime2

    def _filter_ids_helper(cls, df_or_series, ids, ids_to_keep):
        """Remove from `df_or_series` all IDs that are not in `ids_to_keep`.

        `ids` holds the IDs currently present and must support set difference
        with a ``set``. `ids_to_keep` may be any iterable of IDs.

        Raises
        ------
        ValueError
            If `ids_to_keep` is empty, contains duplicates, or names IDs that
            are not in `ids`.
        """
        # A list is needed because the IDs are inspected several times below;
        # a generator would be exhausted after the first pass.
        keep_list = list(ids_to_keep)
        if not keep_list:
            raise ValueError("`ids_to_keep` must contain at least one ID.")

        dupes = find_duplicates(keep_list)
        if dupes:
            dupes_repr = ', '.join([repr(dup) for dup in sorted(dupes)])
            raise ValueError(
                "`ids_to_keep` must contain unique IDs. The following IDs are "
                "duplicated: %s" % dupes_repr)

        keep_set = set(keep_list)
        unknown = keep_set - ids
        if unknown:
            unknown_repr = ', '.join([repr(u) for u in sorted(unknown)])
            raise ValueError(
                "The following IDs are not present in the metadata: %s"
                % unknown_repr)

        # Order of the retained IDs is preserved by dropping everything else
        # rather than re-selecting the kept IDs.
        return df_or_series.drop(labels=ids - keep_set, axis='index',
                                 inplace=False, errors='raise')

コード例 #3

0

ファイルを表示

    def _read_header(self):
        """Find, trim, and validate the header row read from ``self._reader``.

        Rows are consumed until one is recognized as the header; comment and
        empty rows encountered before it are skipped.

        Returns
        -------
        The header row with trailing empty cells removed.

        Raises
        ------
        MetadataFileError
            If a directive or an unrecognized row precedes the header, if no
            header is found, if a column is unnamed, if column names repeat,
            or if a non-ID column uses a name reserved for the ID column.
        """
        header = None
        for row in self._reader:
            # The header test must come first, as in the original check order.
            if self._is_header(row):
                header = row
                break
            if self._is_comment(row) or self._is_empty(row):
                continue
            if self._is_directive(row):
                raise MetadataFileError(
                    "Found directive %r while searching for header. "
                    "Directives may only appear immediately after the header."
                    % row[0])
            raise MetadataFileError(
                "Found unrecognized ID column name %r while searching for "
                "header. The first column name in the header defines the "
                "ID column, and must be one of these values:\n\n%s" %
                (row[0], FORMATTED_ID_HEADERS))

        if header is None:
            raise MetadataFileError(
                "Failed to locate header. The metadata file may be empty, or "
                "consists only of comments or empty rows.")

        # Trim trailing empty cells: keep everything up to and including the
        # last non-empty cell.
        named_positions = [
            idx for idx, cell in enumerate(header) if cell != '']
        data_extent = named_positions[-1] if named_positions else None
        header = header[:data_extent + 1]

        # Basic validation to 1) fail early before processing entire file; and
        # 2) make some basic guarantees about the header for things in this
        # class that use the header as part of reading the file.
        unique_names = set(header)
        if '' in unique_names:
            raise MetadataFileError(
                "Found at least one column without a name in the header. Each "
                "column must be named.")
        if len(unique_names) != len(header):
            repeated = find_duplicates(header)
            raise MetadataFileError(
                "Column names must be unique. The following column names are "
                "duplicated: %s" %
                ', '.join(map(repr, sorted(repeated))))

        # The first cell is already known to be a valid ID header; every other
        # column name must *not* be one.
        for column_name in header[1:]:
            if not is_id_header(column_name):
                continue
            raise MetadataFileError(
                "Metadata column name %r conflicts with a name reserved "
                "for the ID column header. Reserved ID column headers:"
                "\n\n%s" % (column_name, FORMATTED_ID_HEADERS))

        return header

コード例 #4

0

ファイルを表示

ファイル: metadata.py プロジェクト: rnandety/qiime2

    def filter_ids(self, ids_to_keep):
        """Filter this metadata column down to the given IDs.

        Parameters
        ----------
        ids_to_keep : iterable of str
            IDs to retain. Any iterable is accepted, including one-shot
            iterators and generators. The order of `ids_to_keep` does not
            affect the order of the result: retained IDs keep the relative
            order they have in this object.

        Returns
        -------
        An instance of this object's class built from the filtered series,
        with this object's artifacts added to it.

        Raises
        ------
        ValueError
            If `ids_to_keep` contains duplicates, contains IDs that are not
            present in this object, or leaves no IDs after filtering.
        """
        # Materialize first: the IDs are length-checked and traversed more
        # than once below, which would fail (`len`) or silently see nothing
        # (second pass) for a generator or iterator.
        ids_to_keep = list(ids_to_keep)
        ids_to_keep_set = set(ids_to_keep)
        if len(ids_to_keep) != len(ids_to_keep_set):
            duplicates = find_duplicates(ids_to_keep)
            raise ValueError(
                "`ids_to_keep` must consist of unique IDs. The following "
                "ID(s) are duplicated: %s"
                % (', '.join(repr(e) for e in sorted(duplicates))))

        current_ids = self.get_ids()
        missing_ids = ids_to_keep_set - current_ids
        if missing_ids:
            raise ValueError(
                "The following ID(s) are not present in the MetadataColumn: %s"
                % (', '.join(repr(e) for e in sorted(missing_ids))))

        # While preserving order, get rid of any IDs not contained in
        # `ids_to_keep`.
        ids_to_discard = current_ids - ids_to_keep_set
        filtered_series = self._series.drop(
            labels=ids_to_discard, axis='index', inplace=False, errors='raise')

        # Not using Series.empty because empty columns are allowed in
        # Metadata.
        # TODO instead of erroring here, just check that `ids_to_keep` isn't
        # empty at the start of this method.
        if filtered_series.index.empty:
            raise ValueError(
                "All IDs were filtered out of the MetadataColumn, resulting "
                "in an empty MetadataColumn object.")

        filtered_mdc = self.__class__(filtered_series)
        filtered_mdc._add_artifacts(self.artifacts)
        return filtered_mdc

コード例 #5

0

ファイルを表示

ファイル: metadata.py プロジェクト: gregcaporaso/qiime2

    def filter_ids(self, ids_to_keep):
        """Return a copy of this metadata column restricted to `ids_to_keep`.

        Parameters
        ----------
        ids_to_keep : iterable of str
            IDs to retain; may be any iterable, including a generator. The
            retained IDs keep the relative order they have in this object,
            regardless of their order in `ids_to_keep`.

        Returns
        -------
        An instance of this object's class built from the filtered series,
        with this object's artifacts added to it.

        Raises
        ------
        ValueError
            If `ids_to_keep` contains duplicates, contains IDs that are not
            present in this object, or leaves no IDs after filtering.
        """
        # `ids_to_keep` can be any iterable, so turn it into a list so that it
        # can be length-checked and iterated over more than once. Without
        # this, generators raise TypeError on `len()`.
        ids_to_keep = list(ids_to_keep)
        ids_to_keep_set = set(ids_to_keep)
        if len(ids_to_keep) != len(ids_to_keep_set):
            duplicates = find_duplicates(ids_to_keep)
            raise ValueError(
                "`ids_to_keep` must consist of unique IDs. The following "
                "ID(s) are duplicated: %s"
                % (', '.join(repr(e) for e in sorted(duplicates))))

        current_ids = self.get_ids()
        missing_ids = ids_to_keep_set - current_ids
        if missing_ids:
            raise ValueError(
                "The following ID(s) are not present in the MetadataColumn: %s"
                % (', '.join(repr(e) for e in sorted(missing_ids))))

        # While preserving order, get rid of any IDs not contained in
        # `ids_to_keep`.
        ids_to_discard = current_ids - ids_to_keep_set
        filtered_series = self._series.drop(
            labels=ids_to_discard, axis='index', inplace=False, errors='raise')

        # Not using Series.empty because empty columns are allowed in
        # Metadata.
        # TODO instead of erroring here, just check that `ids_to_keep` isn't
        # empty at the start of this method.
        if filtered_series.index.empty:
            raise ValueError(
                "All IDs were filtered out of the MetadataColumn, resulting "
                "in an empty MetadataColumn object.")

        filtered_mdc = self.__class__(filtered_series)
        filtered_mdc._add_artifacts(self.artifacts)
        return filtered_mdc

コード例 #6

0

ファイルを表示

ファイル: io.py プロジェクト: jakereps/qiime2

    def _read_header(self):
        """Read rows from ``self._reader`` until the header is found.

        Comment and empty rows before the header are ignored. The header is
        returned with trailing empty cells removed, after being validated.

        Raises
        ------
        MetadataFileError
            If a directive or an unrecognized row appears before the header,
            the reader is exhausted without a header, a column has no name,
            column names are duplicated, or a non-ID column uses a name
            reserved for the ID column.
        """
        header = None
        for candidate in self._reader:
            # Header detection takes precedence over every other row kind.
            if self._is_header(candidate):
                header = candidate
                break
            if self._is_comment(candidate) or self._is_empty(candidate):
                continue
            if self._is_directive(candidate):
                raise MetadataFileError(
                    "Found directive %r while searching for header. "
                    "Directives may only appear immediately after the header."
                    % candidate[0])
            raise MetadataFileError(
                "Found unrecognized ID column name %r while searching for "
                "header. The first column name in the header defines the "
                "ID column, and must be one of these values:\n\n%s" %
                (candidate[0], FORMATTED_ID_HEADERS))

        if header is None:
            raise MetadataFileError(
                "Failed to locate header. The metadata file may be empty, or "
                "consists only of comments or empty rows.")

        # Trim trailing empty cells from header by locating the index of the
        # last cell that actually holds a name.
        last_named = None
        for position, cell in enumerate(header):
            if cell == '':
                continue
            last_named = position
        header = header[:last_named + 1]

        # Fail early, before the rest of the file is processed, and guarantee
        # downstream readers a header with named, unique columns.
        distinct = set(header)
        if '' in distinct:
            raise MetadataFileError(
                "Found at least one column without a name in the header. Each "
                "column must be named.")
        if len(distinct) != len(header):
            repeated_names = find_duplicates(header)
            joined = ', '.join([repr(n) for n in sorted(repeated_names)])
            raise MetadataFileError(
                "Column names must be unique. The following column names are "
                "duplicated: %s" % joined)

        # header[0] is known to be a valid ID header; the remaining names are
        # checked to ensure they *aren't* valid ID headers.
        for name in header[1:]:
            if is_id_header(name):
                raise MetadataFileError(
                    "Metadata column name %r conflicts with a name reserved "
                    "for the ID column header. Reserved ID column headers:"
                    "\n\n%s" % (name, FORMATTED_ID_HEADERS))

        return header

コード例 #7

0

ファイルを表示

    def test_different_hashables(self):
        # Strings, numbers, and tuples are interleaved; only the values that
        # occur more than once are expected back.
        values = ['foo', 42, -9.999, 'baz', ('a', 'b'), 42, 'foo',
                  ('a', 'b', 'c'), ('a', 'b')]
        expected = {'foo', 42, ('a', 'b')}

        # Pass an iterator so the function is exercised on one-shot input.
        observed = util.find_duplicates(iter(values))

        self.assertEqual(observed, expected)

コード例 #8

0

ファイルを表示

    def filter_ids(self, ids_to_keep):
        """Filter metadata by IDs.

        Parameters
        ----------
        ids_to_keep : iterable of str
            IDs that should be retained in the filtered ``Metadata`` object.
            Any iterable is accepted, including one-shot iterators and
            generators. If any IDs in `ids_to_keep` are not contained in this
            ``Metadata`` object, a ``ValueError`` will be raised. The filtered
            ``Metadata`` object will retain the same relative ordering of IDs
            in this ``Metadata`` object. Thus, the ordering of IDs in
            `ids_to_keep` does not determine the ordering of IDs in the
            filtered ``Metadata`` object.

        Returns
        -------
        Metadata
            The metadata filtered by IDs.

        Raises
        ------
        ValueError
            If `ids_to_keep` contains duplicates, contains IDs that are not
            present in this object, or leaves no IDs after filtering.

        """
        # Materialize first: the IDs are length-checked and traversed more
        # than once below, which would fail (`len`) for a generator or
        # iterator even though the documented contract allows any iterable.
        ids_to_keep = list(ids_to_keep)
        ids_to_keep_set = set(ids_to_keep)
        if len(ids_to_keep) != len(ids_to_keep_set):
            duplicates = find_duplicates(ids_to_keep)
            raise ValueError(
                "`ids_to_keep` must consist of unique IDs. The following "
                "ID(s) are duplicated: %s" %
                (', '.join(repr(e) for e in sorted(duplicates))))

        current_ids = self.get_ids()
        missing_ids = ids_to_keep_set - current_ids
        if missing_ids:
            raise ValueError(
                "The following ID(s) are not present in the Metadata: %s" %
                (', '.join(repr(e) for e in sorted(missing_ids))))

        # While preserving order, get rid of any IDs not contained in
        # `ids_to_keep`.
        ids_to_discard = current_ids - ids_to_keep_set
        filtered_df = self._dataframe.drop(labels=ids_to_discard,
                                           axis='index',
                                           inplace=False,
                                           errors='raise')

        # Not using DataFrame.empty because empty columns are allowed in
        # Metadata.
        # TODO instead of erroring here, just check that `ids_to_keep` isn't
        # empty at the start of this method.
        if filtered_df.index.empty:
            raise ValueError(
                "All IDs were filtered out of the Metadata, resulting in an "
                "empty Metadata object.")

        filtered_md = self.__class__(filtered_df)
        filtered_md._add_artifacts(self.artifacts)
        return filtered_md

コード例 #9

0

ファイルを表示

ファイル: metadata.py プロジェクト: gregcaporaso/qiime2

    def filter_ids(self, ids_to_keep):
        """Filter metadata by IDs.

        Parameters
        ----------
        ids_to_keep : iterable of str
            IDs that should be retained in the filtered ``Metadata`` object.
            Any iterable is accepted, including one-shot iterators and
            generators. If any IDs in `ids_to_keep` are not contained in this
            ``Metadata`` object, a ``ValueError`` will be raised. The filtered
            ``Metadata`` object will retain the same relative ordering of IDs
            in this ``Metadata`` object. Thus, the ordering of IDs in
            `ids_to_keep` does not determine the ordering of IDs in the
            filtered ``Metadata`` object.

        Returns
        -------
        Metadata
            The metadata filtered by IDs.

        Raises
        ------
        ValueError
            If `ids_to_keep` contains duplicates, contains IDs that are not
            present in this object, or leaves no IDs after filtering.

        """
        # `ids_to_keep` can be any iterable, so turn it into a list so that it
        # can be length-checked and iterated over more than once. Without
        # this, generators raise TypeError on `len()`.
        ids_to_keep = list(ids_to_keep)
        ids_to_keep_set = set(ids_to_keep)
        if len(ids_to_keep) != len(ids_to_keep_set):
            duplicates = find_duplicates(ids_to_keep)
            raise ValueError(
                "`ids_to_keep` must consist of unique IDs. The following "
                "ID(s) are duplicated: %s"
                % (', '.join(repr(e) for e in sorted(duplicates))))

        current_ids = self.get_ids()
        missing_ids = ids_to_keep_set - current_ids
        if missing_ids:
            raise ValueError(
                "The following ID(s) are not present in the Metadata: %s"
                % (', '.join(repr(e) for e in sorted(missing_ids))))

        # While preserving order, get rid of any IDs not contained in
        # `ids_to_keep`.
        ids_to_discard = current_ids - ids_to_keep_set
        filtered_df = self._dataframe.drop(labels=ids_to_discard, axis='index',
                                           inplace=False, errors='raise')

        # Not using DataFrame.empty because empty columns are allowed in
        # Metadata.
        # TODO instead of erroring here, just check that `ids_to_keep` isn't
        # empty at the start of this method.
        if filtered_df.index.empty:
            raise ValueError(
                "All IDs were filtered out of the Metadata, resulting in an "
                "empty Metadata object.")

        filtered_md = self.__class__(filtered_df)
        filtered_md._add_artifacts(self.artifacts)
        return filtered_md

コード例 #10

0

ファイルを表示

    def __init__(self, *choices):
        """Store the allowed choices as a tuple on ``self.choices``.

        Choices may be given as separate positional arguments
        (``Choices(1, 2, 3)``) or, for backwards compatibility, as a single
        iterable (``Choices({1, 2, 3})``). A lone ``str`` or ``bool`` argument
        is treated as one choice rather than being unpacked.

        Raises
        ------
        ValueError
            If no choices are supplied (including an empty iterable passed
            via the legacy single-argument syntax), or if the choices contain
            duplicates.
        """
        # Backwards compatibility with old Choices({1, 2, 3}) syntax
        if len(choices) == 1 and not isinstance(choices[0], (bool, str)):
            choices = choices[0]

        choices = tuple(choices)
        # Checked *after* unwrapping so that an empty collection passed using
        # the legacy syntax (e.g. ``Choices(set())``) is rejected as well,
        # instead of silently producing an object with no choices.
        if not choices:
            raise ValueError("'Choices' cannot be instantiated with an empty"
                             " set.")

        self.choices = choices
        if len(choices) != len(set(choices)):
            # Wrapped in a 1-tuple so the value is always formatted as a
            # single `%r` argument regardless of its type.
            raise ValueError("Duplicates found in choices: %r"
                             % (util.find_duplicates(choices),))

コード例 #11

0

ファイルを表示

    def _read_header(self):
        header = None
        for record in self._reader:
            if self._is_header(record):
                header = record
                break
            elif self._is_comment(record):
                continue
            elif self._is_empty(record):
                continue
            elif self._is_directive(record):
                raise MetadataFileError(
                    "Found directive %r when searching for header. Directives "
                    "may only appear immediately after the header."
                    % record[0])
            else:
                # TODO better error message to hint at what to do
                raise MetadataFileError("Invalid header: %r" % record)

        if header is None:
            raise MetadataFileError(
                "Failed to locate header. The metadata file may be empty, or "
                "consists only of comments or empty records.")

        # Trim trailing empty cells from header.
        data_extent = None
        for idx, cell in enumerate(header):
            if cell != '':
                data_extent = idx
        header = header[:data_extent+1]

        # Basic validation to 1) fail early before processing entire file; and
        # 2) make some basic guarantees about the header for things in this
        # class that use the header as part of reading the file.
        column_names = set(header)
        if '' in column_names:
            raise MetadataFileError(
                "Found at least one column without a name in the header. Each "
                "column must be named.")
        elif len(header) != len(column_names):
            duplicates = find_duplicates(header)
            raise MetadataFileError(
                "Column names must be unique. The following column name(s) "
                "are duplicated: %s" %
                (', '.join(repr(e) for e in sorted(duplicates))))

        return header

コード例 #12

0

ファイルを表示

ファイル: metadata.py プロジェクト: jakereps/qiime2

    def _validate_index(cls, index, *, axis):
        """Validate the labels of a metadata index.

        Parameters
        ----------
        index
            Sized iterable of labels to validate.
        axis : {'id', 'column'}
            Which axis `index` belongs to. Selects the wording used in error
            messages and enables the ID-only pound sign check.

        Raises
        ------
        NotImplementedError
            If `axis` is neither ``'id'`` nor ``'column'``.
        TypeError
            If a label is not a ``str``.
        ValueError
            If a label is empty, has leading or trailing whitespace, begins
            with ``#`` (IDs only), conflicts with a reserved ID header name,
            or appears more than once.
        """
        validating_ids = axis == 'id'
        if validating_ids:
            label = 'ID'
        elif axis == 'column':
            label = 'column name'
        else:
            raise NotImplementedError

        for entry in index:
            # The type check must come first: the checks below call `str`
            # methods on the entry.
            if not isinstance(entry, str):
                raise TypeError(
                    "Detected non-string metadata %s of type %r: %r" %
                    (label, type(entry), entry))

            if not entry:
                raise ValueError(
                    "Detected empty metadata %s. %ss must consist of at least "
                    "one character." % (label, label))

            if entry.strip() != entry:
                raise ValueError(
                    "Detected metadata %s with leading or trailing "
                    "whitespace characters: %r" % (label, entry))

            if validating_ids and entry.startswith('#'):
                raise ValueError(
                    "Detected metadata %s that begins with a pound sign "
                    "(#): %r" % (label, entry))

            if is_id_header(entry):
                raise ValueError(
                    "Detected metadata %s %r that conflicts with a name "
                    "reserved for the ID header. Reserved ID headers:\n\n%s" %
                    (label, entry, FORMATTED_ID_HEADERS))

        # Only compute the duplicates when the cheap size comparison shows
        # that some label repeats.
        if len(set(index)) != len(index):
            repeated = find_duplicates(index)
            listing = ', '.join(map(repr, sorted(repeated)))
            raise ValueError(
                "Metadata %ss must be unique. The following %ss are "
                "duplicated: %s" % (label, label, listing))

コード例 #13

0

ファイルを表示

ファイル: metadata.py プロジェクト: zhuangwb/qiime2

    def _validate_index(cls, index, *, axis):
        """Check that every label in `index` is a legal metadata label.

        `axis` must be ``'id'`` or ``'column'``; it selects the wording of the
        error messages and, for ``'id'``, additionally rejects labels that
        begin with a pound sign.

        Raises
        ------
        NotImplementedError
            If `axis` is neither ``'id'`` nor ``'column'``.
        TypeError
            If a label is not a ``str``.
        ValueError
            If a label is empty, has leading or trailing whitespace, begins
            with ``#`` (IDs only), conflicts with a reserved ID header name,
            or appears more than once.
        """
        is_id_axis = axis == 'id'
        if not is_id_axis and axis != 'column':
            raise NotImplementedError
        label = 'ID' if is_id_axis else 'column name'

        for item in index:
            # Non-strings are rejected before any `str` method is used.
            if not isinstance(item, str):
                raise TypeError(
                    "Detected non-string metadata %s of type %r: %r" %
                    (label, type(item), item))

            if not item:
                raise ValueError(
                    "Detected empty metadata %s. %ss must consist of at least "
                    "one character." % (label, label))

            if item != item.strip():
                raise ValueError(
                    "Detected metadata %s with leading or trailing "
                    "whitespace characters: %r" % (label, item))

            if is_id_axis and item.startswith('#'):
                raise ValueError(
                    "Detected metadata %s that begins with a pound sign "
                    "(#): %r" % (label, item))

            if is_id_header(item):
                raise ValueError(
                    "Detected metadata %s %r that conflicts with a name "
                    "reserved for the ID header. Reserved ID headers:\n\n%s" %
                    (label, item, FORMATTED_ID_HEADERS))

        if len(index) == len(set(index)):
            return

        # Some label repeats; report exactly which ones.
        dupes = find_duplicates(index)
        raise ValueError(
            "Metadata %ss must be unique. The following %ss are "
            "duplicated: %s" %
            (label, label, ', '.join([repr(d) for d in sorted(dupes)])))

コード例 #14

0

ファイルを表示

ファイル: metadata.py プロジェクト: rnandety/qiime2

    def _validate_pandas_index(self, index, label):
        """Validate every value of `index`, using `label` in error messages.

        Raises
        ------
        TypeError
            If a value is not a ``str``.
        ValueError
            If a value is empty, has leading or trailing whitespace, begins
            with ``#`` (only when `label` is ``'ID'``), is accepted by
            ``self._assert_valid_id_header``, or appears more than once.
        """
        for entry in index:
            # TODO raise a better error message for "missing values"
            # (e.g. np.nan, None), right now users will get a "non-string
            # metadata ID" error message, which isn't the most intuitive.
            if not isinstance(entry, str):
                raise TypeError(
                    "Detected non-string metadata %s: %r" % (label, entry))

            if not entry:
                raise ValueError(
                    "Detected empty metadata %s. %ss must consist of at least "
                    "one character." % (label, label))

            if entry.strip() != entry:
                raise ValueError(
                    "Detected metadata %s with leading or trailing "
                    "whitespace characters: %r" % (label, entry))

            # HACK: don't use label as a conditional here
            if label == 'ID' and entry.startswith('#'):
                raise ValueError(
                    "Detected metadata %s that begins with the pound sign "
                    "(#): %r" % (label, entry))

            # The check is inverted on purpose: a ValueError from the
            # assertion means the entry is *not* an ID header name, which is
            # the acceptable outcome here.
            try:
                self._assert_valid_id_header(entry)
            except ValueError:
                continue
            raise ValueError(
                "Detected metadata %s that conflicts with a name reserved "
                "for ID headers: %r" % (label, entry))

        if len(set(index)) != len(index):
            repeated = find_duplicates(index)
            listing = ', '.join(map(repr, sorted(repeated)))
            raise ValueError(
                "Metadata %ss must be unique. The following %ss are "
                "duplicated: %s" % (label, label, listing))

コード例 #15

0

ファイルを表示

ファイル: metadata.py プロジェクト: gregcaporaso/qiime2

    def _validate_pandas_index(self, index, label):
        """Validate every value of `index`, using `label` in error messages.

        Raises
        ------
        TypeError
            If a value is not a ``str``.
        ValueError
            If a value is empty, has leading or trailing whitespace, begins
            with ``#`` (only when `label` is ``'ID'``), is accepted by
            ``self._assert_valid_id_header``, or appears more than once.
        """
        for item in index:
            # TODO raise a better error message for "missing values"
            # (e.g. np.nan, None), right now users will get a "non-string
            # metadata ID" error message, which isn't the most intuitive.
            if not isinstance(item, str):
                raise TypeError(
                    "Detected non-string metadata %s: %r" % (label, item))

            if not item:
                raise ValueError(
                    "Detected empty metadata %s: %r" % (label, item))

            if item.strip() != item:
                raise ValueError(
                    "Detected metadata %s with leading or trailing "
                    "whitespace characters: %r" % (label, item))

            # HACK: don't use label as a conditional here
            if label == 'ID' and item.startswith('#'):
                raise ValueError(
                    "Detected metadata %s that begins with the pound sign "
                    "(#): %r" % (label, item))

            # Inverted on purpose: the assertion raising ValueError means the
            # item is not an ID header name, so it is acceptable.
            try:
                self._assert_valid_id_header(item)
            except ValueError:
                continue
            raise ValueError(
                "Detected metadata %s that conflicts with a name reserved "
                "for ID headers: %r" % (label, item))

        if len(index) == len(set(index)):
            return

        # Some value repeats; report exactly which ones.
        dupes = find_duplicates(index)
        raise ValueError(
            "Metadata %ss must be unique. The following %ss are "
            "duplicated: %s" %
            (label, label, ', '.join([repr(d) for d in sorted(dupes)])))

コード例 #16

0

ファイルを表示

ファイル: test_util.py プロジェクト: thermokarst/qiime2

    def test_single_value(self):
        # A lone element has nothing to collide with, so no duplicates are
        # expected.
        single = iter(['foo'])
        observed = util.find_duplicates(single)

        self.assertEqual(observed, set())

コード例 #17

0

ファイルを表示

ファイル: test_util.py プロジェクト: thermokarst/qiime2

    def test_all_duplicates(self):
        # Every value appears exactly twice, so each one must be reported.
        values = ['foo', 'bar', 'baz', 'baz', 'bar', 'foo']
        expected = {'foo', 'bar', 'baz'}

        observed = util.find_duplicates(iter(values))

        self.assertEqual(observed, expected)

コード例 #18

0

ファイルを表示

ファイル: test_util.py プロジェクト: thermokarst/qiime2

    def test_different_hashables(self):
        # Mixed hashable types (str, int, float, tuple); only the values that
        # occur more than once are expected back.
        values = [
            'foo', 42, -9.999, 'baz', ('a', 'b'), 42, 'foo',
            ('a', 'b', 'c'), ('a', 'b'),
        ]
        expected = {'foo', 42, ('a', 'b')}

        observed = util.find_duplicates(iter(values))

        self.assertEqual(observed, expected)

コード例 #19

0

ファイルを表示

    def test_all_duplicates(self):
        # Each of the three values is repeated, so all three are expected.
        repeated_values = iter(['foo', 'bar', 'baz', 'baz', 'bar', 'foo'])
        observed = util.find_duplicates(repeated_values)

        self.assertEqual(observed, {'foo', 'bar', 'baz'})

コード例 #20

0

ファイルを表示

    def test_multiple_values_no_duplicates(self):
        # Two distinct values: nothing repeats, so the result must be empty.
        distinct_values = iter(['foo', 'bar'])
        observed = util.find_duplicates(distinct_values)

        self.assertEqual(observed, set())

コード例 #21

0

ファイルを表示

    def test_single_value(self):
        # One value cannot be duplicated; expect the empty set.
        expected = set()

        observed = util.find_duplicates(iter(['foo']))

        self.assertEqual(observed, expected)

コード例 #22

0

ファイルを表示

    def test_empty_iterable(self):
        # No input values at all: the result must be the empty set.
        nothing = iter([])
        observed = util.find_duplicates(nothing)

        self.assertEqual(observed, set())

コード例 #23

0

ファイルを表示

ファイル: test_util.py プロジェクト: thermokarst/qiime2

    def test_multiple_values_no_duplicates(self):
        # Several values, none repeated; expect the empty set.
        values = ['foo', 'bar']
        expected = set()

        observed = util.find_duplicates(iter(values))

        self.assertEqual(observed, expected)

コード例 #24

0

ファイルを表示

ファイル: test_util.py プロジェクト: thermokarst/qiime2

    def test_empty_iterable(self):
        # An exhausted iterator yields no values, hence no duplicates.
        expected = set()

        observed = util.find_duplicates(iter([]))

        self.assertEqual(observed, expected)