Example #1
0
 def test_file_closed(self):
     """A file opened by ``open_file`` from a path is closed on exit."""
     # Manage the temporary file with ``with`` so that its own handle is
     # released deterministically instead of waiting for garbage
     # collection.
     with tempfile.NamedTemporaryFile('r') as f:
         with open_file(f.name) as fh:
             pass
         self.assertTrue(fh.closed)
Example #2
0
 def test_filehandle(self):
     """An already-open file handle is passed through and left open."""
     with tempfile.TemporaryFile('r') as fh:
         with open_file(fh) as ffh:
             # ``assertIs`` reports both objects on failure, unlike
             # ``assertTrue(a is b)``.
             self.assertIs(ffh, fh)
         # Leaving the ``open_file`` block must not close a handle that
         # was handed in by the caller.
         self.assertFalse(fh.closed)
Example #3
0
 def test_file_closed(self):
     """A file opened by ``open_file`` from a path is closed on exit."""
     # Manage the temporary file with ``with`` so that its own handle is
     # released deterministically instead of waiting for garbage
     # collection.
     with tempfile.NamedTemporaryFile('r') as f:
         with open_file(f.name) as fh:
             pass
         self.assertTrue(fh.closed)
Example #4
0
 def test_filehandle(self):
     """An already-open file handle is passed through and left open."""
     with tempfile.TemporaryFile('r') as fh:
         with open_file(fh) as ffh:
             # ``assertIs`` reports both objects on failure, unlike
             # ``assertTrue(a is b)``.
             self.assertIs(ffh, fh)
         # Leaving the ``open_file`` block must not close a handle that
         # was handed in by the caller.
         self.assertFalse(fh.closed)
Example #5
0
    def to_file(self, out_f, delimiter='\t'):
        """Save the dissimilarity matrix to file in delimited text format.

        The first line written is the ID header produced by
        ``self._format_ids(delimiter)``. Each following line holds one ID
        from ``self.ids`` followed by the matching row of ``self.data``,
        with all elements separated by `delimiter`.

        Parameters
        ----------
        out_f : file-like object or filename
            File-like object to write serialized data to, or name of
            file. If it's a file-like object, it must have a ``write``
            method, and it won't be closed. Else, it is opened and
            closed after writing.
        delimiter : str, optional
            Delimiter used to separate elements in output format.

        See Also
        --------
        from_file

        """
        # Bind the handle to a new name rather than rebinding the
        # ``out_f`` parameter.
        with open_file(out_f, 'w') as fh:
            fh.write(self._format_ids(delimiter))
            fh.write('\n')

            for id_, vals in zip(self.ids, self.data):
                fh.write(id_)
                fh.write(delimiter)
                # Use the builtin ``str`` as dtype: the ``np.str`` alias
                # was deprecated in NumPy 1.20 and removed in 1.24.
                fh.write(delimiter.join(np.asarray(vals, dtype=str)))
                fh.write('\n')
Example #6
0
    def to_file(self, out_f, delimiter='\t'):
        """Save the dissimilarity matrix to file in delimited text format.

        The first line written is the ID header produced by
        ``self._format_ids(delimiter)``. Each following line holds one ID
        from ``self.ids`` followed by the matching row of ``self.data``,
        with all elements separated by `delimiter`.

        Parameters
        ----------
        out_f : file-like object or filename
            File-like object to write serialized data to, or name of
            file. If it's a file-like object, it must have a ``write``
            method, and it won't be closed. Else, it is opened and
            closed after writing.
        delimiter : str, optional
            Delimiter used to separate elements in output format.

        See Also
        --------
        from_file

        """
        # Bind the handle to a new name rather than rebinding the
        # ``out_f`` parameter.
        with open_file(out_f, 'w') as fh:
            fh.write(self._format_ids(delimiter))
            fh.write('\n')

            for id_, vals in zip(self.ids, self.data):
                fh.write(id_)
                fh.write(delimiter)
                # Use the builtin ``str`` as dtype: the ``np.str`` alias
                # was deprecated in NumPy 1.20 and removed in 1.24.
                fh.write(delimiter.join(np.asarray(vals, dtype=str)))
                fh.write('\n')
Example #7
0
 def test_file_closed_harder(self):
     """File gets closed by ``open_file`` even if the body raises."""
     # Manage the temporary file with ``with`` so that its own handle is
     # released deterministically instead of waiting for garbage
     # collection.
     with tempfile.NamedTemporaryFile('r') as f:
         try:
             with open_file(f.name) as fh:
                 raise TypeError
         except TypeError:
             self.assertTrue(fh.closed)
         else:
             # Reaching this branch means nothing escaped the ``with``
             # block, i.e. the context manager swallowed the exception.
             # ``self.fail`` reports that as a test failure rather than
             # as an unexpected error.
             self.fail("`open_file` didn't propagate exceptions")
Example #8
0
 def test_file_closed_harder(self):
     """File gets closed by ``open_file`` even if the body raises."""
     # Manage the temporary file with ``with`` so that its own handle is
     # released deterministically instead of waiting for garbage
     # collection.
     with tempfile.NamedTemporaryFile('r') as f:
         try:
             with open_file(f.name) as fh:
                 raise TypeError
         except TypeError:
             self.assertTrue(fh.closed)
         else:
             # Reaching this branch means nothing escaped the ``with``
             # block, i.e. the context manager swallowed the exception.
             # ``self.fail`` reports that as a test failure rather than
             # as an unexpected error.
             self.fail("`open_file` didn't propagate exceptions")
Example #9
0
    def parser(lines):
        """Yield records (lists of items) read from `lines`.

        Each input line is decoded from UTF-8 when it has a ``decode``
        method, passed through ``constructor`` when one is set, and
        dropped when ``ignore`` returns a true value. A line for which
        ``is_label_line`` is true closes the record collected so far (if
        it is non-empty) and becomes the first item of the next one.
        """
        with open_file(lines) as handle:
            record = []
            for raw in handle:
                try:
                    raw = str(raw.decode('utf-8'))
                except AttributeError:
                    # Objects without a ``decode`` method are used as-is.
                    pass

                item = raw if constructor is None else constructor(raw)
                if ignore(item):
                    continue

                # A label line marks the start of a new record, so hand
                # back whatever has been collected up to this point.
                if is_label_line(item) and record:
                    yield record
                    record = []
                record.append(item)

            # The final record has no label line after it to flush it.
            if record:
                yield record
Example #10
0
    def parser(lines):
        """Yield records (lists of items) read from `lines`.

        Each input line is decoded from UTF-8 when it has a ``decode``
        method, passed through ``constructor`` when one is set, and
        dropped when ``ignore`` returns a true value. A line for which
        ``is_label_line`` is true closes the record collected so far (if
        it is non-empty) and becomes the first item of the next one.
        """
        with open_file(lines) as handle:
            record = []
            for raw in handle:
                try:
                    raw = str(raw.decode('utf-8'))
                except AttributeError:
                    # Objects without a ``decode`` method are used as-is.
                    pass

                item = raw if constructor is None else constructor(raw)
                if ignore(item):
                    continue

                # A label line marks the start of a new record, so hand
                # back whatever has been collected up to this point.
                if is_label_line(item) and record:
                    yield record
                    record = []
                record.append(item)

            # The final record has no label line after it to flush it.
            if record:
                yield record
Example #11
0
    def from_file(cls, dm_f, delimiter='\t'):
        """Load dissimilarity matrix from a delimited text file or file path.

        Builds an instance of `cls` from a dissimilarity matrix that was
        serialized as delimited text.

        The input starts with a header line listing the ID of every object.
        Each later line holds one ID followed by the dissimilarities (floats)
        between that object and all objects, in header order. A 2x2 matrix
        with IDs ``'a'`` and ``'b'`` might look like::

            <del>a<del>b
            a<del>0.0<del>1.0
            b<del>1.0<del>0.0

        where ``<del>`` is the delimiter between elements.

        Parameters
        ----------
        dm_f : iterable of str or str
            Iterable of strings (e.g., open file handle, file-like object, list
            of strings, etc.) or a file path (a string) containing a serialized
            dissimilarity matrix.
        delimiter : str, optional
            String delimiting elements in `dm_f`.

        Returns
        -------
        DissimilarityMatrix
            Instance of type `cls` containing the parsed contents of `dm_f`.

        Raises
        ------
        DissimilarityMatrixFormatError
            If there are more or fewer data rows than IDs in the header, if a
            row does not hold exactly one value per header ID, or if a row
            label differs from the header ID at the same position.

        Notes
        -----
        Whitespace-only lines can occur anywhere throughout the "file" and are
        ignored. Lines starting with ``#`` are treated as comments and ignored.
        These comments can only occur *before* the ID header.

        IDs will have any leading/trailing whitespace removed when they are
        parsed.

        .. note::
            File-like objects passed to this method will not be closed upon the
            completion of the parsing, it is responsibility of the owner of the
            object to perform this operation.

        """
        # np.loadtxt is deliberately avoided: it uses *way* too much memory
        # (e.g, a 2GB matrix eats up 10GB, which then isn't freed after parsing
        # has finished). See:
        # http://mail.scipy.org/pipermail/numpy-tickets/2012-August/006749.html

        # NOTE(review): if `open_file` forwards the mode to the builtin
        # ``open``, 'U' is rejected by Python 3.11+ -- confirm against
        # `open_file` before changing it.
        with open_file(dm_f, 'U') as handle:
            # One shared iterator gives a single pass over the input and
            # keeps our position after the header has been consumed (this
            # matters for inputs such as a list of strings).
            rows = iter(handle)

            # Header first, then one matrix row per non-blank input line.
            ids = cls._parse_ids(rows, delimiter)
            num_ids = len(ids)
            data = np.empty((num_ids, num_ids), dtype=np.float64)

            # Counted by hand rather than with enumerate(): blank lines may
            # be interleaved with the data and must not advance the index.
            curr_row_idx = 0
            for raw in rows:
                stripped = raw.strip()
                if not stripped:
                    continue

                if curr_row_idx >= num_ids:
                    # The matrix is already full, so this non-blank line is
                    # surplus data that must not be silently dropped.
                    raise DissimilarityMatrixFormatError(
                        "Encountered extra rows without corresponding IDs in"
                        " the header.")

                tokens = stripped.split(delimiter)

                # The leading token is the row label, not a value.
                num_values = len(tokens) - 1
                if num_values != num_ids:
                    raise DissimilarityMatrixFormatError(
                        "There are %d values in row number %d, which is not"
                        " equal to the number of IDs in the header (%d)."
                        % (num_values, curr_row_idx + 1, num_ids))

                curr_id = tokens[0].strip()
                expected_id = ids[curr_row_idx]
                if curr_id != expected_id:
                    raise DissimilarityMatrixFormatError(
                        "Encountered mismatched IDs while parsing the "
                        "dissimilarity matrix file. Found '%s' but expected "
                        "'%s'. Please ensure that the IDs match between the "
                        "dissimilarity matrix header (first row) and the row "
                        "labels (first column)." % (curr_id, expected_id))

                data[curr_row_idx, :] = np.asarray(tokens[1:], dtype=float)
                curr_row_idx += 1

        if curr_row_idx != num_ids:
            raise DissimilarityMatrixFormatError(
                "Expected %d row(s) of data, but found %d." % (num_ids,
                                                               curr_row_idx))

        return cls(data, ids)
Example #12
0
 def test_BytesIO(self):
     """A ``BytesIO`` object (handy in tests) is passed through unchanged."""
     f = BytesIO(b"File contents")
     with open_file(f) as fh:
         # ``assertIs`` reports both objects on failure, unlike
         # ``assertTrue(a is b)``.
         self.assertIs(fh, f)
Example #13
0
    def from_file(cls, ord_res_f):
        r"""Load ordination results from text file.

        Builds an instance of `cls` from ordination results that were
        serialized as text.

        The sections are read in a fixed order (eigvals, proportion
        explained, species, site, biplot, site constraints), each one
        followed by an empty line except the last. The expected layout is::

            Eigvals<tab>2
            0.096<tab>0.040

            Proportion explained<tab>2
            0.512<tab>0.488

            Species<tab>3<tab>2
            Species1<tab>0.408<tab>0.069
            Species2<tab>-0.115<tab>-0.299
            Species3<tab>-0.309<tab>0.187

            Site<tab>3<tab>2
            Site1<tab>-0.848<tab>0.882
            Site2<tab>-0.220<tab>-1.344
            Site3<tab>1.666<tab>0.470

            Biplot<tab>4<tab>3
            0.422<tab>-0.559<tab>-0.713
            0.988<tab>0.150<tab>-0.011
            -0.556<tab>0.817<tab>0.147
            -0.404<tab>-0.905<tab>-0.127

            Site constraints<tab>3<tab>2
            Site1<tab>-0.848<tab>0.882
            Site2<tab>-0.220<tab>-1.344
            Site3<tab>1.666<tab>0.470

        A result attribute that is not present (e.g. Biplot) must still be
        declared, with its dimensions given as 0::

            Biplot<tab>0<tab>0

        Parameters
        ----------
        ord_res_f : iterable of str or str
            Iterable of strings (e.g., open file handle, file-like object, list
            of strings, etc.) or a file path (a string) containing the
            serialized ordination results.

        Returns
        -------
        OrdinationResults
            Instance of type `cls` containing the parsed contents of
            `ord_res_f`.

        Raises
        ------
        ValueError
            if the shapes of the different sections of the file are not
            consistent
        FileFormatError
            if the format of the file is not recognized

        Examples
        --------
        Assume we have the following tab-delimited text file storing the
        ordination results::

            Eigvals\t2
            0.0961330159181\t0.0409418140138

            Proportion explained\t0

            Species\t3\t2
            Species1\t0.408869425742\t0.0695518116298
            Species2\t-0.1153860437\t-0.299767683538
            Species3\t-0.309967102571\t0.187391917117

            Site\t3\t2
            Site1\t-0.848956053187\t0.882764759014
            Site2\t-0.220458650578\t-1.34482000302
            Site3\t1.66697179591\t0.470324389808

            Biplot\t0\t0

            Site constraints\t0\t0

        Load the ordination results from the file:

        >>> from StringIO import StringIO
        >>> from skbio.math.stats.ordination import OrdinationResults
        >>> or_f = StringIO("Eigvals\t2\n"
        ...                 "0.0961330159181\t0.0409418140138\n"
        ...                 "\n"
        ...                 "Proportion explained\t0\n"
        ...                 "\n"
        ...                 "Species\t3\t2\n"
        ...                 "Species1\t0.408869425742\t0.0695518116298\n"
        ...                 "Species2\t-0.1153860437\t-0.299767683538\n"
        ...                 "Species3\t-0.309967102571\t0.187391917117\n"
        ...                 "\n"
        ...                 "Site\t3\t2\n"
        ...                 "Site1\t-0.848956053187\t0.882764759014\n"
        ...                 "Site2\t-0.220458650578\t-1.34482000302\n"
        ...                 "Site3\t1.66697179591\t0.470324389808\n"
        ...                 "\n"
        ...                 "Biplot\t0\t0\n"
        ...                 "\n"
        ...                 "Site constraints\t0\t0\n")
        >>> ord_res = OrdinationResults.from_file(or_f)
        """

        # NOTE(review): if `open_file` forwards the mode to the builtin
        # ``open``, 'U' is rejected by Python 3.11+ -- confirm against
        # `open_file` before changing it.
        with open_file(ord_res_f, 'U') as handle:
            # One shared iterator so that every section parser resumes
            # where the previous one stopped.
            line_iter = iter(handle)

            # Section 1: eigvals, starting at the very first line.
            eigvals = cls._parse_eigvals(line_iter)
            cls._check_empty_line(line_iter)

            # Section 2: proportion explained.
            prop_expl = cls._parse_proportion_explained(line_iter)
            if prop_expl is not None and len(prop_expl) != len(eigvals):
                raise ValueError(
                    'There should be as many proportion explained'
                    ' values as eigvals: %d != %d' %
                    (len(prop_expl), len(eigvals)))
            cls._check_empty_line(line_iter)

            # Section 3: species coordinates.
            species, species_ids = cls._parse_coords(line_iter, 'Species')
            if species is not None and len(species[0]) != len(eigvals):
                raise ValueError(
                    'There should be as many coordinates per'
                    ' species as eigvals: %d != %d' %
                    (len(species[0]), len(eigvals)))
            cls._check_empty_line(line_iter)

            # Section 4: site coordinates.
            site, site_ids = cls._parse_coords(line_iter, 'Site')
            if site is not None and len(site[0]) != len(eigvals):
                raise ValueError(
                    'There should be as many coordinates per'
                    ' site as eigvals: %d != %d' %
                    (len(site[0]), len(eigvals)))
            cls._check_empty_line(line_iter)

            # Section 5: biplot.
            biplot = cls._parse_biplot(line_iter)
            cls._check_empty_line(line_iter)

            # Section 6: site constraints, whose ids must agree with the
            # site ids whenever both are available.
            cons, cons_ids = cls._parse_coords(line_iter, 'Site constraints')
            both_have_ids = cons_ids is not None and site_ids is not None
            if both_have_ids and cons_ids != site_ids:
                raise ValueError(
                    'Site constraints ids and site ids must be'
                    ' equal: %s != %s' % (cons_ids, site_ids))

        parsed = dict(eigvals=eigvals,
                      species=species,
                      site=site,
                      biplot=biplot,
                      site_constraints=cons,
                      proportion_explained=prop_expl,
                      species_ids=species_ids,
                      site_ids=site_ids)
        return cls(**parsed)
Example #14
0
    def to_file(self, out_f):
        """Save the ordination results to file in text format.

        Sections are written in this order: eigvals, proportion explained,
        species, site, biplot and site constraints. An attribute that is
        ``None`` is written as a section header declaring zero dimensions.

        Parameters
        ----------
        out_f : file-like object or filename
            File-like object to write serialized data to, or name of
            file. If it's a file-like object, it must have a ``write``
            method, and it won't be closed. Else, it is opened and
            closed after writing.

        See Also
        --------
        from_file

        """
        def tab_join(values):
            # Use the builtin ``str`` as dtype: the ``np.str`` alias was
            # deprecated in NumPy 1.20 and removed in 1.24.
            return '\t'.join(np.asarray(values, dtype=str))

        # Bind the handle to a new name rather than rebinding the
        # ``out_f`` parameter.
        with open_file(out_f, 'w') as fh:
            # Write eigvals
            fh.write("Eigvals\t%d\n" % self.eigvals.shape)
            fh.write("%s\n\n" % tab_join(self.eigvals))

            # Write proportion explained
            if self.proportion_explained is None:
                fh.write("Proportion explained\t0\n\n")
            else:
                fh.write("Proportion explained\t%d\n" %
                         self.proportion_explained.shape)
                fh.write("%s\n\n" % tab_join(self.proportion_explained))

            # Write species
            if self.species is None:
                fh.write("Species\t0\t0\n\n")
            else:
                fh.write("Species\t%d\t%d\n" % self.species.shape)
                for id_, vals in zip(self.species_ids, self.species):
                    fh.write("%s\t%s\n" % (id_, tab_join(vals)))
                fh.write("\n")

            # Write site
            if self.site is None:
                fh.write("Site\t0\t0\n\n")
            else:
                fh.write("Site\t%d\t%d\n" % self.site.shape)
                for id_, vals in zip(self.site_ids, self.site):
                    fh.write("%s\t%s\n" % (id_, tab_join(vals)))
                fh.write("\n")

            # Write biplot (rows carry no id column)
            if self.biplot is None:
                fh.write("Biplot\t0\t0\n\n")
            else:
                fh.write("Biplot\t%d\t%d\n" % self.biplot.shape)
                for vals in self.biplot:
                    fh.write("%s\n" % tab_join(vals))
                fh.write("\n")

            # Write site-constraints; rows are labelled with the site ids
            if self.site_constraints is None:
                fh.write("Site constraints\t0\t0\n")
            else:
                fh.write("Site constraints\t%d\t%d\n" %
                         self.site_constraints.shape)
                for id_, vals in zip(self.site_ids, self.site_constraints):
                    fh.write("%s\t%s\n" % (id_, tab_join(vals)))
Example #15
0
    def to_file(self, out_f):
        """Save the ordination results to file in text format.

        Sections are written in this order: eigvals, proportion explained,
        species, site, biplot and site constraints. An attribute that is
        ``None`` is written as a section header declaring zero dimensions.

        Parameters
        ----------
        out_f : file-like object or filename
            File-like object to write serialized data to, or name of
            file. If it's a file-like object, it must have a ``write``
            method, and it won't be closed. Else, it is opened and
            closed after writing.

        See Also
        --------
        from_file

        """
        def tab_join(values):
            # Use the builtin ``str`` as dtype: the ``np.str`` alias was
            # deprecated in NumPy 1.20 and removed in 1.24.
            return '\t'.join(np.asarray(values, dtype=str))

        # Bind the handle to a new name rather than rebinding the
        # ``out_f`` parameter.
        with open_file(out_f, 'w') as fh:
            # Write eigvals
            fh.write("Eigvals\t%d\n" % self.eigvals.shape)
            fh.write("%s\n\n" % tab_join(self.eigvals))

            # Write proportion explained
            if self.proportion_explained is None:
                fh.write("Proportion explained\t0\n\n")
            else:
                fh.write("Proportion explained\t%d\n" %
                         self.proportion_explained.shape)
                fh.write("%s\n\n" % tab_join(self.proportion_explained))

            # Write species
            if self.species is None:
                fh.write("Species\t0\t0\n\n")
            else:
                fh.write("Species\t%d\t%d\n" % self.species.shape)
                for id_, vals in zip(self.species_ids, self.species):
                    fh.write("%s\t%s\n" % (id_, tab_join(vals)))
                fh.write("\n")

            # Write site
            if self.site is None:
                fh.write("Site\t0\t0\n\n")
            else:
                fh.write("Site\t%d\t%d\n" % self.site.shape)
                for id_, vals in zip(self.site_ids, self.site):
                    fh.write("%s\t%s\n" % (id_, tab_join(vals)))
                fh.write("\n")

            # Write biplot (rows carry no id column)
            if self.biplot is None:
                fh.write("Biplot\t0\t0\n\n")
            else:
                fh.write("Biplot\t%d\t%d\n" % self.biplot.shape)
                for vals in self.biplot:
                    fh.write("%s\n" % tab_join(vals))
                fh.write("\n")

            # Write site-constraints; rows are labelled with the site ids
            if self.site_constraints is None:
                fh.write("Site constraints\t0\t0\n")
            else:
                fh.write("Site constraints\t%d\t%d\n" %
                         self.site_constraints.shape)
                for id_, vals in zip(self.site_ids, self.site_constraints):
                    fh.write("%s\t%s\n" % (id_, tab_join(vals)))
Example #16
0
    def from_file(cls, ord_res_f):
        r"""Load ordination results from text file.

        Builds an instance of `cls` from ordination results that were
        serialized as text.

        The sections are read in a fixed order (eigvals, proportion
        explained, species, site, biplot, site constraints), each one
        followed by an empty line except the last. The expected layout is::

            Eigvals<tab>2
            0.096<tab>0.040

            Proportion explained<tab>2
            0.512<tab>0.488

            Species<tab>3<tab>2
            Species1<tab>0.408<tab>0.069
            Species2<tab>-0.115<tab>-0.299
            Species3<tab>-0.309<tab>0.187

            Site<tab>3<tab>2
            Site1<tab>-0.848<tab>0.882
            Site2<tab>-0.220<tab>-1.344
            Site3<tab>1.666<tab>0.470

            Biplot<tab>4<tab>3
            0.422<tab>-0.559<tab>-0.713
            0.988<tab>0.150<tab>-0.011
            -0.556<tab>0.817<tab>0.147
            -0.404<tab>-0.905<tab>-0.127

            Site constraints<tab>3<tab>2
            Site1<tab>-0.848<tab>0.882
            Site2<tab>-0.220<tab>-1.344
            Site3<tab>1.666<tab>0.470

        A result attribute that is not present (e.g. Biplot) must still be
        declared, with its dimensions given as 0::

            Biplot<tab>0<tab>0

        Parameters
        ----------
        ord_res_f : iterable of str or str
            Iterable of strings (e.g., open file handle, file-like object, list
            of strings, etc.) or a file path (a string) containing the
            serialized ordination results.

        Returns
        -------
        OrdinationResults
            Instance of type `cls` containing the parsed contents of
            `ord_res_f`.

        Raises
        ------
        ValueError
            if the shapes of the different sections of the file are not
            consistent
        FileFormatError
            if the format of the file is not recognized

        Examples
        --------
        Assume we have the following tab-delimited text file storing the
        ordination results::

            Eigvals\t2
            0.0961330159181\t0.0409418140138

            Proportion explained\t0

            Species\t3\t2
            Species1\t0.408869425742\t0.0695518116298
            Species2\t-0.1153860437\t-0.299767683538
            Species3\t-0.309967102571\t0.187391917117

            Site\t3\t2
            Site1\t-0.848956053187\t0.882764759014
            Site2\t-0.220458650578\t-1.34482000302
            Site3\t1.66697179591\t0.470324389808

            Biplot\t0\t0

            Site constraints\t0\t0

        Load the ordination results from the file:

        >>> from StringIO import StringIO
        >>> from skbio.math.stats.ordination import OrdinationResults
        >>> or_f = StringIO("Eigvals\t2\n"
        ...                 "0.0961330159181\t0.0409418140138\n"
        ...                 "\n"
        ...                 "Proportion explained\t0\n"
        ...                 "\n"
        ...                 "Species\t3\t2\n"
        ...                 "Species1\t0.408869425742\t0.0695518116298\n"
        ...                 "Species2\t-0.1153860437\t-0.299767683538\n"
        ...                 "Species3\t-0.309967102571\t0.187391917117\n"
        ...                 "\n"
        ...                 "Site\t3\t2\n"
        ...                 "Site1\t-0.848956053187\t0.882764759014\n"
        ...                 "Site2\t-0.220458650578\t-1.34482000302\n"
        ...                 "Site3\t1.66697179591\t0.470324389808\n"
        ...                 "\n"
        ...                 "Biplot\t0\t0\n"
        ...                 "\n"
        ...                 "Site constraints\t0\t0\n")
        >>> ord_res = OrdinationResults.from_file(or_f)
        """

        # NOTE(review): if `open_file` forwards the mode to the builtin
        # ``open``, 'U' is rejected by Python 3.11+ -- confirm against
        # `open_file` before changing it.
        with open_file(ord_res_f, 'U') as handle:
            # One shared iterator so that every section parser resumes
            # where the previous one stopped.
            line_iter = iter(handle)

            # Section 1: eigvals, starting at the very first line.
            eigvals = cls._parse_eigvals(line_iter)
            cls._check_empty_line(line_iter)

            # Section 2: proportion explained.
            prop_expl = cls._parse_proportion_explained(line_iter)
            if prop_expl is not None and len(prop_expl) != len(eigvals):
                raise ValueError(
                    'There should be as many proportion explained'
                    ' values as eigvals: %d != %d' %
                    (len(prop_expl), len(eigvals)))
            cls._check_empty_line(line_iter)

            # Section 3: species coordinates.
            species, species_ids = cls._parse_coords(line_iter, 'Species')
            if species is not None and len(species[0]) != len(eigvals):
                raise ValueError(
                    'There should be as many coordinates per'
                    ' species as eigvals: %d != %d' %
                    (len(species[0]), len(eigvals)))
            cls._check_empty_line(line_iter)

            # Section 4: site coordinates.
            site, site_ids = cls._parse_coords(line_iter, 'Site')
            if site is not None and len(site[0]) != len(eigvals):
                raise ValueError(
                    'There should be as many coordinates per'
                    ' site as eigvals: %d != %d' %
                    (len(site[0]), len(eigvals)))
            cls._check_empty_line(line_iter)

            # Section 5: biplot.
            biplot = cls._parse_biplot(line_iter)
            cls._check_empty_line(line_iter)

            # Section 6: site constraints, whose ids must agree with the
            # site ids whenever both are available.
            cons, cons_ids = cls._parse_coords(line_iter, 'Site constraints')
            both_have_ids = cons_ids is not None and site_ids is not None
            if both_have_ids and cons_ids != site_ids:
                raise ValueError(
                    'Site constraints ids and site ids must be'
                    ' equal: %s != %s' % (cons_ids, site_ids))

        parsed = dict(eigvals=eigvals,
                      species=species,
                      site=site,
                      biplot=biplot,
                      site_constraints=cons,
                      proportion_explained=prop_expl,
                      species_ids=species_ids,
                      site_ids=site_ids)
        return cls(**parsed)
Example #17
0
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If strict is true a FastqParse error will be
        raised if the seq and qual labels don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception will be raised if a
        quality score outside the range [0, 62] is detected
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to integers

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        yields the label, sequence and quality for each entry

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    FastqParseError
        If the last record is incomplete, if `strict` is set and the
        sequence and quality labels differ, or if `enforce_qual_range` is
        set and a converted quality score falls outside [0, 62].

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]
    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as data:
        # The same iterator repeated four times makes zip_longest pull four
        # consecutive lines per step, i.e. one FASTQ record at a time.
        iters = [iter(data)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()
            # If the file simply ended in a blankline, do not error.
            # Truthiness is used instead of ``is ''`` so the check also holds
            # for the empty bytes object produced by binary-mode files.
            if not seqid:
                continue
            # Error if an incomplete record is found
            # Note: seqid cannot be None, because if all 4 values were None,
            # then the loop condition would be false, and we could not have
            # gotten to this point
            if seq is None or qualid is None or qual is None:
                raise FastqParseError("Incomplete FASTQ record found at end "
                                      "of file")

            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()

            seqid = _drop_id_marker(seqid)

            # Lines read in binary mode are decoded; objects without a
            # ``decode`` method are passed through unchanged.
            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                pass

            qualid = _drop_id_marker(qualid)
            if strict:
                if seqid != qualid:
                    raise FastqParseError('ID mismatch: {} != {}'.format(
                        seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = phred_f(qual)
            if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                raise FastqParseError("Failed qual conversion for seq id: %s. "
                                      "This may be because you passed an "
                                      "incorrect value for phred_offset." %
                                      seqid)

            yield (seqid, seq, qual)
Example #18
0
def parse_qseq(infile, phred_offset=33):
    r"""Iterate over the records of a qseq file.

    Parameters
    ----------
    infile : open file object or str
        Either an already opened qseq file or the path to one.
    phred_offset : {33, 64}, optional
        Phred offset applied when turning quality symbols into integers.

    Returns
    -------
    four-item tuple: (str, str, np.array(dtype=int), namedtuple)
        For every line, yields the sequence id, the sequence, the quality
        array and a namedtuple holding the remaining fields. The sequence
        id is laid out as:
        <Machine name>_<Run number>:<Lane number>:<Tile number>:<x>:<y>#
        <Index>/<Read number>.  The namedtuple exposes the attributes
        machine_name, run, lane, tile, x, y, index, read and filtered.

    Examples
    --------
    Assume we have a qseq-formatted file with the following contents::

        CRESSIA       242     1       2204    1453    1918    0       1
            .TTAATAAGAATGTCTGTTGTGGCTTAAAA  B[[[W][Y[Zccccccccc\cccac_____  1
        CRESSIA       242     1       2204    1490    1921    0       2
            ..GTAAAACCCATATATTGAAAACTACAAA  BWUTWcXVXXcccc_cccccccccc_cccc  1

    >>> from future.utils.six import StringIO
    >>> qseq_f = StringIO('CRESSIA\t242\t1\t2204\t1453\t1918\t0\t1\t'
    ...   '.TTAATAAGAATGTCTGTTGTGGCTTAAAA\tB[[[W][Y[Zccccccccc\cccac_____\t1\n'
    ...                   'CRESSIA\t242\t1\t2204\t1490\t1921\t0\t2\t'
    ...   '..GTAAAACCCATATATTGAAAACTACAAA\tBWUTWcXVXXcccc_cccccccccc_cccc\t1\n'
    ... )

    We can parse this as follows:

    >>> from skbio import parse_qseq
    >>> for seq_id, seq, qual, record in parse_qseq(qseq_f, phred_offset=64):
    ...     print(seq_id)
    ...     print(seq)
    ...     print(qual[:10])
    ...     print(record.run)
    ...     print(record.lane)
    CRESSIA_242:1:2204:1453:1918#0/1
    .TTAATAAGAATGTCTGTTGTGGCTTAAAA
    [ 2 27 27 27 23 29 27 25 27 26]
    242
    1
    CRESSIA_242:1:2204:1490:1921#0/2
    ..GTAAAACCCATATATTGAAAACTACAAA
    [ 2 23 21 20 23 35 24 22 24 24]
    242
    1
    """
    # Reject unsupported offsets up front, then pick the converter.
    if phred_offset not in (33, 64):
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)
    to_phred = ascii_to_phred33 if phred_offset == 33 else ascii_to_phred64

    # Container for every field that is not the sequence or its quality.
    Record = collections.namedtuple(
        'Record',
        'machine_name run lane tile x y index read filtered')

    with open_file(infile) as handle:
        for raw in handle:
            # Decode when the line offers ``decode``; otherwise use as-is.
            try:
                raw = str(raw.decode('utf-8'))
            except AttributeError:
                pass

            # A record is exactly eleven whitespace-separated fields.
            try:
                (machine_name, run, lane, tile, x, y, index, read, seq, qual,
                 filtered) = raw.split()
            except ValueError:
                raise QseqParseError("Invalid QSEQ record found.")

            # The first eight fields make up the sequence identifier.
            id_parts = (machine_name, run, lane, tile, x, y, index, read)
            seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % id_parts

            # Quality symbols become an integer array.
            scores = to_phred(qual)

            # Remaining fields, numeric ones converted, in declaration order.
            record = Record(machine_name, int(run), int(lane), int(tile),
                            int(x), int(y), int(index), int(read),
                            bool(int(filtered)))

            yield seq_id, seq, scores, record
Example #19
0
def parse_qseq(infile, phred_offset=33):
    r"""Generate (seq id, seq, qual, record) tuples from a qseq file.

    Parameters
    ----------
    infile : open file object or str
        An open qseq file, or the path of a qseq file.
    phred_offset : {33, 64}, optional
        The Phred offset used to convert quality score symbols to
        integers.

    Returns
    -------
    four-item tuple: (str, str, np.array(dtype=int), namedtuple)
        One tuple per entry containing the sequence id, the sequence, the
        qual array and the other record information.  Sequence ids follow
        the format:
        <Machine name>_<Run number>:<Lane number>:<Tile number>:<x>:<y>#
        <Index>/<Read number>.  Attributes of the namedtuple are:
        machine_name, run, lane, tile, x, y, index, read and filtered.

    Examples
    --------
    Assume we have a qseq-formatted file with the following contents::

        CRESSIA       242     1       2204    1453    1918    0       1
            .TTAATAAGAATGTCTGTTGTGGCTTAAAA  B[[[W][Y[Zccccccccc\cccac_____  1
        CRESSIA       242     1       2204    1490    1921    0       2
            ..GTAAAACCCATATATTGAAAACTACAAA  BWUTWcXVXXcccc_cccccccccc_cccc  1

    >>> from future.utils.six import StringIO
    >>> qseq_f = StringIO('CRESSIA\t242\t1\t2204\t1453\t1918\t0\t1\t'
    ...   '.TTAATAAGAATGTCTGTTGTGGCTTAAAA\tB[[[W][Y[Zccccccccc\cccac_____\t1\n'
    ...                   'CRESSIA\t242\t1\t2204\t1490\t1921\t0\t2\t'
    ...   '..GTAAAACCCATATATTGAAAACTACAAA\tBWUTWcXVXXcccc_cccccccccc_cccc\t1\n'
    ... )

    We can parse this as follows:

    >>> from skbio import parse_qseq
    >>> for seq_id, seq, qual, record in parse_qseq(qseq_f, phred_offset=64):
    ...     print(seq_id)
    ...     print(seq)
    ...     print(qual[:10])
    ...     print(record.run)
    ...     print(record.lane)
    CRESSIA_242:1:2204:1453:1918#0/1
    .TTAATAAGAATGTCTGTTGTGGCTTAAAA
    [ 2 27 27 27 23 29 27 25 27 26]
    242
    1
    CRESSIA_242:1:2204:1490:1921#0/2
    ..GTAAAACCCATATATTGAAAACTACAAA
    [ 2 23 21 20 23 35 24 22 24 24]
    242
    1
    """
    if phred_offset == 33:
        convert_qual = ascii_to_phred33
    elif phred_offset == 64:
        convert_qual = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    # Names of the fields carried by the per-record namedtuple.
    record_fields = ['machine_name', 'run', 'lane', 'tile', 'x', 'y',
                     'index', 'read', 'filtered']
    Record = collections.namedtuple('Record', record_fields)

    with open_file(infile) as qseq_lines:
        for entry in qseq_lines:
            try:
                entry = str(entry.decode('utf-8'))
            except AttributeError:
                # No ``decode`` method: the line is used unchanged.
                pass

            # Split the line into its eleven columns; any other column count
            # fails the unpacking and is reported as an invalid record.
            try:
                (machine_name, run, lane, tile, x, y, index, read, seq, qual,
                 filtered) = entry.split()
            except ValueError:
                raise QseqParseError("Invalid QSEQ record found.")

            # Columns one to eight form the sequence ID.
            seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % (
                machine_name, run, lane, tile, x, y, index, read)

            # Convert the quality string before the numeric columns so a
            # conversion failure surfaces in the same order as before.
            qual_scores = convert_qual(qual)

            # Integer columns are converted in field order, followed by the
            # boolean ``filtered`` flag.
            as_ints = [int(col)
                       for col in (run, lane, tile, x, y, index, read)]
            is_filtered = bool(int(filtered))
            record = Record(*([machine_name] + as_ints + [is_filtered]))

            yield seq_id, seq, qual_scores, record
Example #20
0
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If strict is true a FastqParse error will be
        raised if the seq and qual labels don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception will be raised if a
        quality score outside the range [0, 62] is detected
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to integers

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        yields the label, sequence and quality for each entry

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    FastqParseError
        If the last record is incomplete, if `strict` is set and the
        sequence and quality labels differ, or if `enforce_qual_range` is
        set and a converted quality score falls outside [0, 62].

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]
    """
    if phred_offset == 33:
        phred_f = _ascii_to_phred33
    elif phred_offset == 64:
        phred_f = _ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as data:
        # The same iterator repeated four times makes zip_longest pull four
        # consecutive lines per step, i.e. one FASTQ record at a time.
        iters = [iter(data)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()
            # If the file simply ended in a blankline, do not error.
            # Truthiness is used instead of ``is ''`` so the check also holds
            # for the empty bytes object produced by binary-mode files.
            if not seqid:
                continue
            # Error if an incomplete record is found
            # Note: seqid cannot be None, because if all 4 values were None,
            # then the loop condition would be false, and we could not have
            # gotten to this point
            if seq is None or qualid is None or qual is None:
                raise FastqParseError("Incomplete FASTQ record found at end "
                                      "of file")

            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()

            seqid = _drop_id_marker(seqid)

            # Lines read in binary mode are decoded; objects without a
            # ``decode`` method are passed through unchanged.
            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                pass

            qualid = _drop_id_marker(qualid)
            if strict:
                if seqid != qualid:
                    raise FastqParseError('ID mismatch: {} != {}'.format(
                        seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = phred_f(qual)
            if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                raise FastqParseError("Failed qual conversion for seq id: %s. "
                                      "This may be because you passed an "
                                      "incorrect value for phred_offset." %
                                      seqid)

            yield (seqid, seq, qual)
Example #21
0
    def from_file(cls, dm_f, delimiter='\t'):
        """Build a dissimilarity matrix from delimited text.

        Parses a serialized dissimilarity matrix, supplied either as a
        file-like object / iterable of lines or as a file path, and returns
        an instance of `cls`.

        The first line (header) lists the ID of every object. Each later
        line holds an ID followed by the dissimilarities (floats) between
        that object and all objects, in header order. A 2x2 matrix with IDs
        ``'a'`` and ``'b'`` could look like::

            <del>a<del>b
            a<del>0.0<del>1.0
            b<del>1.0<del>0.0

        with ``<del>`` standing for the delimiter.

        Parameters
        ----------
        dm_f : iterable of str or str
            Lines of a serialized dissimilarity matrix (open file handle,
            file-like object, list of strings, ...) or the path (a string)
            of a file containing one.
        delimiter : str, optional
            String separating the elements within a line of `dm_f`.

        Returns
        -------
        DissimilarityMatrix
            Instance of type `cls` holding the parsed contents of `dm_f`.

        Raises
        ------
        DissimilarityMatrixFormatError
            If there are more or fewer data rows than header IDs, if a row
            does not hold one value per ID, or if a row label differs from
            the header ID at the same position.

        Notes
        -----
        Whitespace-only lines may appear anywhere and are skipped. Lines
        starting with ``#`` are treated as comments and skipped; they may
        only appear *before* the ID header.

        Leading and trailing whitespace is removed from IDs while parsing.

        .. note::
            A file-like object handed to this method is not closed once
            parsing is done; closing it remains the responsibility of its
            owner.

        """
        # np.loadtxt is deliberately avoided because it uses *way* too much
        # memory (e.g, a 2GB matrix eats up 10GB, which then isn't freed
        # after parsing has finished). See:
        # http://mail.scipy.org/pipermail/numpy-tickets/2012-August/006749.html

        with open_file(dm_f, 'U') as handle:
            # A single iterator is shared by the header search and the row
            # loop so the row loop resumes right after the header (needed
            # for inputs such as a list of strings).
            rows = iter(handle)

            # Header first, then an uninitialised square array that is
            # filled one row at a time.
            ids = cls._parse_ids(rows, delimiter)
            num_ids = len(ids)
            data = np.empty((num_ids, num_ids), dtype=np.float64)

            # Counts real data rows only; blank lines are skipped, which is
            # why enumerate() cannot be used here.
            curr_row_idx = 0
            for raw_line in rows:
                content = raw_line.strip()
                if not content:
                    continue

                # Any further non-empty line after the matrix is full is
                # extra data and must not be silently dropped.
                if curr_row_idx >= num_ids:
                    raise DissimilarityMatrixFormatError(
                        "Encountered extra rows without corresponding IDs in"
                        " the header.")

                fields = content.split(delimiter)

                # The first field is the row label, the rest are values.
                num_values = len(fields) - 1
                if num_values != num_ids:
                    raise DissimilarityMatrixFormatError(
                        "There are %d values in row number %d, which is not"
                        " equal to the number of IDs in the header (%d)." %
                        (num_values, curr_row_idx + 1, num_ids))

                row_id = fields[0].strip()
                header_id = ids[curr_row_idx]
                if row_id != header_id:
                    raise DissimilarityMatrixFormatError(
                        "Encountered mismatched IDs while parsing the "
                        "dissimilarity matrix file. Found '%s' but expected "
                        "'%s'. Please ensure that the IDs match between the "
                        "dissimilarity matrix header (first row) and the row "
                        "labels (first column)." % (row_id, header_id))

                data[curr_row_idx, :] = np.asarray(fields[1:], dtype=float)
                curr_row_idx += 1

        # Fewer rows than header IDs means the matrix is truncated.
        if curr_row_idx != num_ids:
            raise DissimilarityMatrixFormatError(
                "Expected %d row(s) of data, but found %d." %
                (num_ids, curr_row_idx))

        return cls(data, ids)
Example #22
0
 def test_BytesIO(self):
     """A BytesIO object is handed back by open_file unchanged."""
     stream = BytesIO(b"File contents")
     with open_file(stream) as handle:
         # Identity, not equality: the very same object must come back.
         same_object = handle is stream
         self.assertTrue(same_object)
Example #23
0
def parse_fastq(data, strict=False, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.

    strict : bool
        If strict is true a FastqParse error will be raised if the seq and qual
        labels don't match.

    phred_offset : {33, 64}, optional
        Phred offset used when converting qual score symbols to integers.
        Defaults to 33; any other value than 33 or 64 raises ``ValueError``.

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        yields the label, sequence and quality for each entry

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    FastqParseError
        If `strict` is set and the sequence and quality labels differ, or
        if a converted quality score falls outside the range [0, 62].

    Notes
    -----
    An empty input yields nothing. A final record that is cut short is
    still yielded, with ``None`` in place of the parts that were missing.

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]

    """
    # line number for modulus operation
    SEQUENCEID = 0
    SEQUENCE = 1
    QUALID = 2
    QUAL = 3

    # Validate the offset before touching the input so a bad argument is
    # reported regardless of what the file contains.
    if phred_offset == 33:
        phred_f = _ascii_to_phred33
    elif phred_offset == 64:
        phred_f = _ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as data:
        data = iter(data)

        # A bare next() would raise StopIteration on empty input, which
        # surfaces as RuntimeError inside a generator on Python 3.7+.
        first_line = next(data, None)
        if first_line is None:
            return
        first_line = first_line.strip()

        seqid = _drop_id_marker(first_line)
        seq = None
        qualid = None
        qual = None

        for idx, line in enumerate(data):
            # +1 due to fetch of line prior to loop
            lineno = idx + 1
            linetype = lineno % 4
            line = line.strip()

            if linetype == SEQUENCEID:
                # A new header line means the previous record is complete.
                yield seqid, seq, qual

                seqid = _drop_id_marker(line)
                seq = None
                qualid = None
                qual = None
            elif linetype == SEQUENCE:
                seq = line
                # Lines read in binary mode are decoded; objects without a
                # ``decode`` method are kept unchanged.
                try:
                    seq = str(seq.decode("utf-8"))
                except AttributeError:
                    pass
            elif linetype == QUALID:
                qualid = _drop_id_marker(line)
                if strict:
                    if seqid != qualid:
                        raise FastqParseError('ID mismatch: {} != {}'.format(
                            seqid, qualid))
            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            elif linetype == QUAL:
                qual = phred_f(line)
                if (qual < 0).any() or (qual > 62).any():
                    raise FastqParseError("Failed qual conversion for seq "
                                          "id: %s. This may be because you "
                                          "passed an incorrect value for "
                                          "phred_offset." % seqid)
        # Emit the last record; an empty id means the input ended on a
        # blank line and there is nothing left to report.
        if seqid:
            yield (seqid, seq, qual)