def test_file_closed(self):
    """A handle that `open_file` opened from a path is closed on exit."""
    # Hold the temporary file in a context manager so its own handle is
    # released when the test ends instead of lingering until garbage
    # collection.
    with tempfile.NamedTemporaryFile('r') as f:
        filepath = f.name
        with open_file(filepath) as fh:
            pass
        self.assertTrue(fh.closed)
def test_filehandle(self):
    """An already-open handle is handed back as-is and left open."""
    with tempfile.TemporaryFile('r') as original:
        with open_file(original) as passed_through:
            same_object = original is passed_through
            self.assertTrue(same_object)
        # Leaving the `open_file` block must not close a handle that the
        # caller owns.
        still_closed = original.closed
        self.assertFalse(still_closed)
def to_file(self, out_f, delimiter='\t'):
    """Save the dissimilarity matrix to file in delimited text format.

    The first line written is the ID header produced by
    ``self._format_ids(delimiter)``; each following line holds one ID
    followed by that object's row of values.

    Parameters
    ----------
    out_f : file-like object or filename
        File-like object to write serialized data to, or name of
        file. If it's a file-like object, it must have a ``write``
        method, and it won't be closed. Else, it is opened and
        closed after writing.
    delimiter : str, optional
        Delimiter used to separate elements in output format.

    See Also
    --------
    from_file

    """
    with open_file(out_f, 'w') as out_f:
        out_f.write(self._format_ids(delimiter))
        out_f.write('\n')

        for id_, vals in zip(self.ids, self.data):
            out_f.write(id_)
            out_f.write(delimiter)
            # Builtin ``str`` is what the ``np.str`` alias pointed to; the
            # alias itself no longer exists in current NumPy releases.
            out_f.write(delimiter.join(np.asarray(vals, dtype=str)))
            out_f.write('\n')
def test_file_closed_harder(self):
    """The handle is closed even when the managed block raises."""
    # Hold the temporary file in a context manager so its own handle is
    # released when the test ends instead of lingering until garbage
    # collection.
    with tempfile.NamedTemporaryFile('r') as f:
        filepath = f.name
        try:
            with open_file(filepath) as fh:
                raise TypeError
        except TypeError:
            self.assertTrue(fh.closed)
        else:
            # If we're here, no exception escaped the `with` block, so
            # the context manager swallowed it. No good.
            raise Exception("`open_file` didn't propagate exceptions")
def parser(lines):
    """Yield records, each a list of lines that starts at a label line.

    `lines` is handed to `open_file`. The names ``constructor``,
    ``ignore`` and ``is_label_line`` are not defined here; they are looked
    up in the enclosing scope.
    """
    with open_file(lines) as lines:
        record = []
        for raw in lines:
            # Decode when the object offers ``decode``; anything without
            # that method is used unchanged.
            try:
                raw = str(raw.decode('utf-8'))
            except AttributeError:
                pass

            line = raw if constructor is None else constructor(raw)
            if ignore(line):
                continue

            # A label line opens a new record, so hand back the one that
            # has been collected so far (if any).
            if is_label_line(line) and record:
                yield record
                record = []
            record.append(line)

        # The last record has no following label line to flush it.
        if record:
            yield record
def from_file(cls, dm_f, delimiter='\t'):
    """Build a dissimilarity matrix from delimited text.

    `dm_f` is either an iterable of lines or a file path. The first
    (header) line lists the IDs of all objects. Every later line holds an
    ID followed by the dissimilarities (floats) between that object and
    all objects, in header order. A 2x2 matrix with IDs ``'a'`` and
    ``'b'`` might look like::

        <del>a<del>b
        a<del>0.0<del>1.0
        b<del>1.0<del>0.0

    where ``<del>`` stands for the delimiter.

    Parameters
    ----------
    dm_f : iterable of str or str
        Iterable of strings (e.g., open file handle, file-like object,
        list of strings, etc.) or a file path (a string) containing a
        serialized dissimilarity matrix.
    delimiter : str, optional
        String delimiting elements in `dm_f`.

    Returns
    -------
    DissimilarityMatrix
        Instance of type `cls` containing the parsed contents of `dm_f`.

    Raises
    ------
    DissimilarityMatrixFormatError
        If there are more or fewer data rows than header IDs, if a row
        does not have one value per ID, or if a row label differs from
        the header ID at the same position.

    Notes
    -----
    Whitespace-only lines may appear anywhere and are skipped. Lines
    starting with ``#`` are comments and are skipped too, but they may
    only appear *before* the ID header. Leading/trailing whitespace is
    stripped from IDs.

    .. note::
        File-like objects passed to this method are not closed when
        parsing finishes; closing them is the responsibility of the
        object's owner.

    """
    # np.loadtxt is deliberately not used: it needs far too much memory
    # (e.g. a 2GB matrix eats up 10GB, which is not freed once parsing is
    # done). See:
    # http://mail.scipy.org/pipermail/numpy-tickets/2012-August/006749.html
    with open_file(dm_f, 'U') as dm_f:
        # One shared iterator gives a single pass over the input and keeps
        # our position once the header has been consumed (this matters
        # for inputs such as a list of strings).
        rows = iter(dm_f)

        ids = cls._parse_ids(rows, delimiter)
        num_ids = len(ids)
        data = np.empty((num_ids, num_ids), dtype=np.float64)

        # Counted by hand rather than with enumerate(): blank lines may be
        # scattered through the matrix and must not count as data rows.
        rows_filled = 0
        for raw in rows:
            stripped = raw.strip()
            if not stripped:
                continue

            if rows_filled >= num_ids:
                # Nonempty line after the matrix is already full; extra
                # data must not be silently dropped.
                raise DissimilarityMatrixFormatError(
                    "Encountered extra rows without corresponding IDs in"
                    " the header.")

            tokens = stripped.split(delimiter)
            # The first token is the row's ID, not a value.
            num_vals = len(tokens) - 1
            if num_vals != num_ids:
                raise DissimilarityMatrixFormatError(
                    "There are %d values in row number %d, which is not"
                    " equal to the number of IDs in the header (%d)."
                    % (num_vals, rows_filled + 1, num_ids))

            curr_id = tokens[0].strip()
            expected_id = ids[rows_filled]
            if curr_id != expected_id:
                raise DissimilarityMatrixFormatError(
                    "Encountered mismatched IDs while parsing the "
                    "dissimilarity matrix file. Found '%s' but expected "
                    "'%s'. Please ensure that the IDs match between the "
                    "dissimilarity matrix header (first row) and the row "
                    "labels (first column)." % (curr_id, expected_id))

            data[rows_filled, :] = np.asarray(tokens[1:], dtype=float)
            rows_filled += 1

        if rows_filled != num_ids:
            raise DissimilarityMatrixFormatError(
                "Expected %d row(s) of data, but found %d."
                % (num_ids, rows_filled))

    return cls(data, ids)
def test_BytesIO(self):
    """An in-memory byte stream comes back as the very same object."""
    stream = BytesIO(b"File contents")
    with open_file(stream) as fh:
        same_object = fh is stream
        self.assertTrue(same_object)
def from_file(cls, ord_res_f):
    r"""Build ordination results from their text serialization.

    `ord_res_f` is passed to `open_file` and then read line by line. The
    sections must appear in this order, separated by one empty line::

        Eigvals<tab>2
        0.096<tab>0.040

        Proportion explained<tab>2
        0.512<tab>0.488

        Species<tab>3<tab>2
        Species1<tab>0.408<tab>0.069
        Species2<tab>-0.115<tab>-0.299
        Species3<tab>-0.309<tab>0.187

        Site<tab>3<tab>2
        Site1<tab>-0.848<tab>0.882
        Site2<tab>-0.220<tab>-1.344
        Site3<tab>1.666<tab>0.470

        Biplot<tab>4<tab>3
        0.422<tab>-0.559<tab>-0.713
        0.988<tab>0.150<tab>-0.011
        -0.556<tab>0.817<tab>0.147
        -0.404<tab>-0.905<tab>-0.127

        Site constraints<tab>3<tab>2
        Site1<tab>-0.848<tab>0.882
        Site2<tab>-0.220<tab>-1.344
        Site3<tab>1.666<tab>0.470

    A result attribute that is absent (e.g. Biplot) must still be
    declared, with its dimensions given as 0::

        Biplot<tab>0<tab>0

    Parameters
    ----------
    ord_res_f : iterable of str or str
        Iterable of strings (e.g., open file handle, file-like object,
        list of strings, etc.) or a file path (a string) containing the
        serialized ordination results.

    Returns
    -------
    OrdinationResults
        Instance of type `cls` containing the parsed contents of
        `ord_res_f`.

    Raises
    ------
    ValueError
        if the shapes of the different sections of the file are not
        consistent
    FileFormatError
        if the format of the file is not recognized

    Examples
    --------
    Assume we have the following tab-delimited text file storing the
    ordination results::

        Eigvals\t2
        0.0961330159181\t0.0409418140138

        Proportion explained\t0

        Species\t3\t2
        Species1\t0.408869425742\t0.0695518116298
        Species2\t-0.1153860437\t-0.299767683538
        Species3\t-0.309967102571\t0.187391917117

        Site\t3\t2
        Site1\t-0.848956053187\t0.882764759014
        Site2\t-0.220458650578\t-1.34482000302
        Site3\t1.66697179591\t0.470324389808

        Biplot\t0\t0

        Site constraints\t0\t0

    Load the ordination results from the file:

    >>> from StringIO import StringIO
    >>> from skbio.math.stats.ordination import OrdinationResults
    >>> or_f = StringIO("Eigvals\t2\n"
    ...                 "0.0961330159181\t0.0409418140138\n"
    ...                 "\n"
    ...                 "Proportion explained\t0\n"
    ...                 "\n"
    ...                 "Species\t3\t2\n"
    ...                 "Species1\t0.408869425742\t0.0695518116298\n"
    ...                 "Species2\t-0.1153860437\t-0.299767683538\n"
    ...                 "Species3\t-0.309967102571\t0.187391917117\n"
    ...                 "\n"
    ...                 "Site\t3\t2\n"
    ...                 "Site1\t-0.848956053187\t0.882764759014\n"
    ...                 "Site2\t-0.220458650578\t-1.34482000302\n"
    ...                 "Site3\t1.66697179591\t0.470324389808\n"
    ...                 "\n"
    ...                 "Biplot\t0\t0\n"
    ...                 "\n"
    ...                 "Site constraints\t0\t0\n")
    >>> ord_res = OrdinationResults.from_file(or_f)

    """
    with open_file(ord_res_f, 'U') as fd:
        # Every section parser consumes from this one shared iterator.
        line_iter = iter(fd)

        # Section 1: eigenvalues, starting at the very first line.
        eigvals = cls._parse_eigvals(line_iter)
        cls._check_empty_line(line_iter)

        # Section 2: proportion explained.
        prop_expl = cls._parse_proportion_explained(line_iter)
        if prop_expl is not None and len(prop_expl) != len(eigvals):
            raise ValueError(
                'There should be as many proportion explained'
                ' values as eigvals: %d != %d' %
                (len(prop_expl), len(eigvals)))
        cls._check_empty_line(line_iter)

        # Section 3: species coordinates.
        species, species_ids = cls._parse_coords(line_iter, 'Species')
        if species is not None and len(species[0]) != len(eigvals):
            raise ValueError(
                'There should be as many coordinates per'
                ' species as eigvals: %d != %d' %
                (len(species[0]), len(eigvals)))
        cls._check_empty_line(line_iter)

        # Section 4: site coordinates.
        site, site_ids = cls._parse_coords(line_iter, 'Site')
        if site is not None and len(site[0]) != len(eigvals):
            raise ValueError(
                'There should be as many coordinates per'
                ' site as eigvals: %d != %d' %
                (len(site[0]), len(eigvals)))
        cls._check_empty_line(line_iter)

        # Section 5: biplot.
        biplot = cls._parse_biplot(line_iter)
        cls._check_empty_line(line_iter)

        # Section 6: site constraints, whose ids must match the site ids
        # whenever both are present.
        cons, cons_ids = cls._parse_coords(line_iter, 'Site constraints')
        both_present = cons_ids is not None and site_ids is not None
        if both_present and cons_ids != site_ids:
            raise ValueError(
                'Site constraints ids and site ids must be'
                ' equal: %s != %s' % (cons_ids, site_ids))

    return cls(eigvals=eigvals, species=species, site=site,
               biplot=biplot, site_constraints=cons,
               proportion_explained=prop_expl,
               species_ids=species_ids, site_ids=site_ids)
def to_file(self, out_f):
    """Save the ordination results to file in text format.

    Sections are written in the order eigvals, proportion explained,
    species, site, biplot and site constraints. An attribute that is
    ``None`` is written as a header with zero dimensions.

    Parameters
    ----------
    out_f : file-like object or filename
        File-like object to write serialized data to, or name of
        file. If it's a file-like object, it must have a ``write``
        method, and it won't be closed. Else, it is opened and
        closed after writing.

    See Also
    --------
    from_file

    """
    def _tabbed(values):
        # Tab-join one row of values. Builtin ``str`` is what the
        # ``np.str`` alias pointed to; the alias itself no longer exists
        # in current NumPy releases.
        return '\t'.join(np.asarray(values, dtype=str))

    with open_file(out_f, 'w') as out_f:
        # Write eigvals
        out_f.write("Eigvals\t%d\n" % self.eigvals.shape)
        out_f.write("%s\n\n" % _tabbed(self.eigvals))

        # Write proportion explained
        if self.proportion_explained is None:
            out_f.write("Proportion explained\t0\n\n")
        else:
            out_f.write("Proportion explained\t%d\n" %
                        self.proportion_explained.shape)
            out_f.write("%s\n\n" % _tabbed(self.proportion_explained))

        # Write species
        if self.species is None:
            out_f.write("Species\t0\t0\n\n")
        else:
            out_f.write("Species\t%d\t%d\n" % self.species.shape)
            for id_, vals in zip(self.species_ids, self.species):
                out_f.write("%s\t%s\n" % (id_, _tabbed(vals)))
            out_f.write("\n")

        # Write site
        if self.site is None:
            out_f.write("Site\t0\t0\n\n")
        else:
            out_f.write("Site\t%d\t%d\n" % self.site.shape)
            for id_, vals in zip(self.site_ids, self.site):
                out_f.write("%s\t%s\n" % (id_, _tabbed(vals)))
            out_f.write("\n")

        # Write biplot (rows carry no id column)
        if self.biplot is None:
            out_f.write("Biplot\t0\t0\n\n")
        else:
            out_f.write("Biplot\t%d\t%d\n" % self.biplot.shape)
            for vals in self.biplot:
                out_f.write("%s\n" % _tabbed(vals))
            out_f.write("\n")

        # Write site-constraints; rows are labelled with the site ids
        if self.site_constraints is None:
            out_f.write("Site constraints\t0\t0\n")
        else:
            out_f.write("Site constraints\t%d\t%d\n" %
                        self.site_constraints.shape)
            for id_, vals in zip(self.site_ids, self.site_constraints):
                out_f.write("%s\t%s\n" % (id_, _tabbed(vals)))
def to_file(self, out_f):
    """Save the ordination results to file in text format.

    Sections are written in the order eigvals, proportion explained,
    species, site, biplot and site constraints. An attribute that is
    ``None`` is written as a header with zero dimensions.

    Parameters
    ----------
    out_f : file-like object or filename
        File-like object to write serialized data to, or name of
        file. If it's a file-like object, it must have a ``write``
        method, and it won't be closed. Else, it is opened and
        closed after writing.

    See Also
    --------
    from_file

    """
    def _tabbed(values):
        # Tab-join one row of values. Builtin ``str`` is what the
        # ``np.str`` alias pointed to; the alias itself no longer exists
        # in current NumPy releases.
        return '\t'.join(np.asarray(values, dtype=str))

    with open_file(out_f, 'w') as out_f:
        # Write eigvals
        out_f.write("Eigvals\t%d\n" % self.eigvals.shape)
        out_f.write("%s\n\n" % _tabbed(self.eigvals))

        # Write proportion explained
        if self.proportion_explained is None:
            out_f.write("Proportion explained\t0\n\n")
        else:
            out_f.write("Proportion explained\t%d\n" %
                        self.proportion_explained.shape)
            out_f.write("%s\n\n" % _tabbed(self.proportion_explained))

        # Write species
        if self.species is None:
            out_f.write("Species\t0\t0\n\n")
        else:
            out_f.write("Species\t%d\t%d\n" % self.species.shape)
            for id_, vals in zip(self.species_ids, self.species):
                out_f.write("%s\t%s\n" % (id_, _tabbed(vals)))
            out_f.write("\n")

        # Write site
        if self.site is None:
            out_f.write("Site\t0\t0\n\n")
        else:
            out_f.write("Site\t%d\t%d\n" % self.site.shape)
            for id_, vals in zip(self.site_ids, self.site):
                out_f.write("%s\t%s\n" % (id_, _tabbed(vals)))
            out_f.write("\n")

        # Write biplot (rows carry no id column)
        if self.biplot is None:
            out_f.write("Biplot\t0\t0\n\n")
        else:
            out_f.write("Biplot\t%d\t%d\n" % self.biplot.shape)
            for vals in self.biplot:
                out_f.write("%s\n" % _tabbed(vals))
            out_f.write("\n")

        # Write site-constraints; rows are labelled with the site ids
        if self.site_constraints is None:
            out_f.write("Site constraints\t0\t0\n")
        else:
            out_f.write("Site constraints\t%d\t%d\n" %
                        self.site_constraints.shape)
            for id_, vals in zip(self.site_ids, self.site_constraints):
                out_f.write("%s\t%s\n" % (id_, _tabbed(vals)))
def from_file(cls, ord_res_f):
    r"""Build ordination results from their text serialization.

    `ord_res_f` is passed to `open_file` and then read line by line. The
    sections must appear in this order, separated by one empty line::

        Eigvals<tab>2
        0.096<tab>0.040

        Proportion explained<tab>2
        0.512<tab>0.488

        Species<tab>3<tab>2
        Species1<tab>0.408<tab>0.069
        Species2<tab>-0.115<tab>-0.299
        Species3<tab>-0.309<tab>0.187

        Site<tab>3<tab>2
        Site1<tab>-0.848<tab>0.882
        Site2<tab>-0.220<tab>-1.344
        Site3<tab>1.666<tab>0.470

        Biplot<tab>4<tab>3
        0.422<tab>-0.559<tab>-0.713
        0.988<tab>0.150<tab>-0.011
        -0.556<tab>0.817<tab>0.147
        -0.404<tab>-0.905<tab>-0.127

        Site constraints<tab>3<tab>2
        Site1<tab>-0.848<tab>0.882
        Site2<tab>-0.220<tab>-1.344
        Site3<tab>1.666<tab>0.470

    A result attribute that is absent (e.g. Biplot) must still be
    declared, with its dimensions given as 0::

        Biplot<tab>0<tab>0

    Parameters
    ----------
    ord_res_f : iterable of str or str
        Iterable of strings (e.g., open file handle, file-like object,
        list of strings, etc.) or a file path (a string) containing the
        serialized ordination results.

    Returns
    -------
    OrdinationResults
        Instance of type `cls` containing the parsed contents of
        `ord_res_f`.

    Raises
    ------
    ValueError
        if the shapes of the different sections of the file are not
        consistent
    FileFormatError
        if the format of the file is not recognized

    Examples
    --------
    Assume we have the following tab-delimited text file storing the
    ordination results::

        Eigvals\t2
        0.0961330159181\t0.0409418140138

        Proportion explained\t0

        Species\t3\t2
        Species1\t0.408869425742\t0.0695518116298
        Species2\t-0.1153860437\t-0.299767683538
        Species3\t-0.309967102571\t0.187391917117

        Site\t3\t2
        Site1\t-0.848956053187\t0.882764759014
        Site2\t-0.220458650578\t-1.34482000302
        Site3\t1.66697179591\t0.470324389808

        Biplot\t0\t0

        Site constraints\t0\t0

    Load the ordination results from the file:

    >>> from StringIO import StringIO
    >>> from skbio.math.stats.ordination import OrdinationResults
    >>> or_f = StringIO("Eigvals\t2\n"
    ...                 "0.0961330159181\t0.0409418140138\n"
    ...                 "\n"
    ...                 "Proportion explained\t0\n"
    ...                 "\n"
    ...                 "Species\t3\t2\n"
    ...                 "Species1\t0.408869425742\t0.0695518116298\n"
    ...                 "Species2\t-0.1153860437\t-0.299767683538\n"
    ...                 "Species3\t-0.309967102571\t0.187391917117\n"
    ...                 "\n"
    ...                 "Site\t3\t2\n"
    ...                 "Site1\t-0.848956053187\t0.882764759014\n"
    ...                 "Site2\t-0.220458650578\t-1.34482000302\n"
    ...                 "Site3\t1.66697179591\t0.470324389808\n"
    ...                 "\n"
    ...                 "Biplot\t0\t0\n"
    ...                 "\n"
    ...                 "Site constraints\t0\t0\n")
    >>> ord_res = OrdinationResults.from_file(or_f)

    """
    with open_file(ord_res_f, 'U') as fd:
        # Every section parser consumes from this one shared iterator.
        line_iter = iter(fd)

        # Section 1: eigenvalues, starting at the very first line.
        eigvals = cls._parse_eigvals(line_iter)
        cls._check_empty_line(line_iter)

        # Section 2: proportion explained.
        prop_expl = cls._parse_proportion_explained(line_iter)
        if prop_expl is not None and len(prop_expl) != len(eigvals):
            raise ValueError(
                'There should be as many proportion explained'
                ' values as eigvals: %d != %d' %
                (len(prop_expl), len(eigvals)))
        cls._check_empty_line(line_iter)

        # Section 3: species coordinates.
        species, species_ids = cls._parse_coords(line_iter, 'Species')
        if species is not None and len(species[0]) != len(eigvals):
            raise ValueError(
                'There should be as many coordinates per'
                ' species as eigvals: %d != %d' %
                (len(species[0]), len(eigvals)))
        cls._check_empty_line(line_iter)

        # Section 4: site coordinates.
        site, site_ids = cls._parse_coords(line_iter, 'Site')
        if site is not None and len(site[0]) != len(eigvals):
            raise ValueError(
                'There should be as many coordinates per'
                ' site as eigvals: %d != %d' %
                (len(site[0]), len(eigvals)))
        cls._check_empty_line(line_iter)

        # Section 5: biplot.
        biplot = cls._parse_biplot(line_iter)
        cls._check_empty_line(line_iter)

        # Section 6: site constraints, whose ids must match the site ids
        # whenever both are present.
        cons, cons_ids = cls._parse_coords(line_iter, 'Site constraints')
        both_present = cons_ids is not None and site_ids is not None
        if both_present and cons_ids != site_ids:
            raise ValueError(
                'Site constraints ids and site ids must be'
                ' equal: %s != %s' % (cons_ids, site_ids))

    return cls(eigvals=eigvals, species=species, site=site,
               biplot=biplot, site_constraints=cons,
               proportion_explained=prop_expl,
               species_ids=species_ids, site_ids=site_ids)
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If strict is true a FastqParse error will be
        raised if the seq and qual labels don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception will be raised if a
        quality score outside the range [0, 62] is detected
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to
        integers

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        yields the label, sequence and quality for each entry

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    FastqParseError
        If the last record is incomplete, if `strict` is set and the
        sequence and quality labels differ, or if `enforce_qual_range` is
        set and a quality score falls outside [0, 62].

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]

    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as data:
        # Four references to ONE iterator: each zip_longest step pulls four
        # consecutive lines, i.e. one FASTQ record.
        iters = [iter(data)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()

            # If the file simply ended in a blankline, do not error.
            # Truthiness is used rather than an identity test against ''
            # so that an empty byte string (binary-mode input) is
            # recognised as well.
            if not seqid:
                continue

            # Error if an incomplete record is found
            # Note: seqid cannot be None, because if all 4 values were None,
            # then the loop condition would be false, and we could not have
            # gotten to this point
            if seq is None or qualid is None or qual is None:
                raise FastqParseError("Incomplete FASTQ record found at end "
                                      "of file")

            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()

            seqid = _drop_id_marker(seqid)

            # Decode when the object offers ``decode``; otherwise keep it.
            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                pass

            qualid = _drop_id_marker(qualid)
            if strict:
                if seqid != qualid:
                    raise FastqParseError('ID mismatch: {} != {}'.format(
                        seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = phred_f(qual)
            if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                raise FastqParseError("Failed qual conversion for seq id: %s. "
                                      "This may be because you passed an "
                                      "incorrect value for phred_offset." %
                                      seqid)

            yield (seqid, seq, qual)
def parse_qseq(infile, phred_offset=33):
    r"""Generate seq ids, seqs, quals and record details from a qseq file.

    Parameters
    ----------
    infile : open file object or str
        An open qseq file or a path to a qseq file.
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to
        integers.

    Returns
    -------
    four-item tuple: (str, str, np.array(dtype=int), namedtuple)
        yields the sequence id, sequence, qual array and other record
        information for each entry. The sequence ID format is:
        <Machine name>_<Run number>:<Lane number>:<Tile number>:<x>:<y>#
        <Index>/<Read number>. The namedtuple attributes are: machine_name,
        run, lane, tile, x, y, index, read and filtered.

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    QseqParseError
        If a line does not split into exactly eleven fields.

    Examples
    --------
    Assume we have a qseq-formatted file with the following contents::

        CRESSIA 242 1 2204 1453 1918 0 1
            .TTAATAAGAATGTCTGTTGTGGCTTAAAA B[[[W][Y[Zccccccccc\cccac_____ 1
        CRESSIA 242 1 2204 1490 1921 0 2
            ..GTAAAACCCATATATTGAAAACTACAAA BWUTWcXVXXcccc_cccccccccc_cccc 1

    >>> from future.utils.six import StringIO
    >>> qseq_f = StringIO('CRESSIA\t242\t1\t2204\t1453\t1918\t0\t1\t'
    ...     '.TTAATAAGAATGTCTGTTGTGGCTTAAAA\tB[[[W][Y[Zccccccccc\cccac_____\t1\n'
    ... 'CRESSIA\t242\t1\t2204\t1490\t1921\t0\t2\t'
    ...     '..GTAAAACCCATATATTGAAAACTACAAA\tBWUTWcXVXXcccc_cccccccccc_cccc\t1\n'
    ... )

    We can parse this as follows:

    >>> from skbio import parse_qseq
    >>> for seq_id, seq, qual, record in parse_qseq(qseq_f, phred_offset=64):
    ...     print(seq_id)
    ...     print(seq)
    ...     print(qual[:10])
    ...     print(record.run)
    ...     print(record.lane)
    CRESSIA_242:1:2204:1453:1918#0/1
    .TTAATAAGAATGTCTGTTGTGGCTTAAAA
    [ 2 27 27 27 23 29 27 25 27 26]
    242
    1
    CRESSIA_242:1:2204:1490:1921#0/2
    ..GTAAAACCCATATATTGAAAACTACAAA
    [ 2 23 21 20 23 35 24 22 24 24]
    242
    1

    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    # Container for everything in a record besides id, sequence and qual.
    field_names = ['machine_name', 'run', 'lane', 'tile', 'x', 'y',
                   'index', 'read', 'filtered']
    Record = collections.namedtuple('Record', field_names)

    with open_file(infile) as lines:
        for rec in lines:
            # Decode when the object offers ``decode``; otherwise keep it.
            try:
                rec = str(rec.decode('utf-8'))
            except AttributeError:
                pass

            # A record is exactly eleven whitespace-separated fields.
            fields = rec.split()
            if len(fields) != 11:
                raise QseqParseError("Invalid QSEQ record found.")
            (machine_name, run, lane, tile, x, y, index, read, seq, qual,
             filtered) = fields

            # The first eight fields make up the sequence ID.
            id_parts = (machine_name, run, lane, tile, x, y, index, read)
            seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % id_parts

            # Quality symbols become an array of ints.
            qual = phred_f(qual)

            # Numeric fields are converted in field order; ``filtered`` is
            # a 0/1 flag turned into a bool.
            numbers = [int(v) for v in (run, lane, tile, x, y, index, read)]
            record = Record(machine_name, *numbers,
                            filtered=bool(int(filtered)))

            yield seq_id, seq, qual, record
def parse_qseq(infile, phred_offset=33):
    r"""Generate seq ids, seqs, quals and record details from a qseq file.

    Parameters
    ----------
    infile : open file object or str
        An open qseq file or a path to a qseq file.
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to
        integers.

    Returns
    -------
    four-item tuple: (str, str, np.array(dtype=int), namedtuple)
        yields the sequence id, sequence, qual array and other record
        information for each entry. The sequence ID format is:
        <Machine name>_<Run number>:<Lane number>:<Tile number>:<x>:<y>#
        <Index>/<Read number>. The namedtuple attributes are: machine_name,
        run, lane, tile, x, y, index, read and filtered.

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    QseqParseError
        If a line does not split into exactly eleven fields.

    Examples
    --------
    Assume we have a qseq-formatted file with the following contents::

        CRESSIA 242 1 2204 1453 1918 0 1
            .TTAATAAGAATGTCTGTTGTGGCTTAAAA B[[[W][Y[Zccccccccc\cccac_____ 1
        CRESSIA 242 1 2204 1490 1921 0 2
            ..GTAAAACCCATATATTGAAAACTACAAA BWUTWcXVXXcccc_cccccccccc_cccc 1

    >>> from future.utils.six import StringIO
    >>> qseq_f = StringIO('CRESSIA\t242\t1\t2204\t1453\t1918\t0\t1\t'
    ...     '.TTAATAAGAATGTCTGTTGTGGCTTAAAA\tB[[[W][Y[Zccccccccc\cccac_____\t1\n'
    ... 'CRESSIA\t242\t1\t2204\t1490\t1921\t0\t2\t'
    ...     '..GTAAAACCCATATATTGAAAACTACAAA\tBWUTWcXVXXcccc_cccccccccc_cccc\t1\n'
    ... )

    We can parse this as follows:

    >>> from skbio import parse_qseq
    >>> for seq_id, seq, qual, record in parse_qseq(qseq_f, phred_offset=64):
    ...     print(seq_id)
    ...     print(seq)
    ...     print(qual[:10])
    ...     print(record.run)
    ...     print(record.lane)
    CRESSIA_242:1:2204:1453:1918#0/1
    .TTAATAAGAATGTCTGTTGTGGCTTAAAA
    [ 2 27 27 27 23 29 27 25 27 26]
    242
    1
    CRESSIA_242:1:2204:1490:1921#0/2
    ..GTAAAACCCATATATTGAAAACTACAAA
    [ 2 23 21 20 23 35 24 22 24 24]
    242
    1

    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    # Container for everything in a record besides id, sequence and qual.
    field_names = ['machine_name', 'run', 'lane', 'tile', 'x', 'y',
                   'index', 'read', 'filtered']
    Record = collections.namedtuple('Record', field_names)

    with open_file(infile) as lines:
        for rec in lines:
            # Decode when the object offers ``decode``; otherwise keep it.
            try:
                rec = str(rec.decode('utf-8'))
            except AttributeError:
                pass

            # A record is exactly eleven whitespace-separated fields.
            fields = rec.split()
            if len(fields) != 11:
                raise QseqParseError("Invalid QSEQ record found.")
            (machine_name, run, lane, tile, x, y, index, read, seq, qual,
             filtered) = fields

            # The first eight fields make up the sequence ID.
            id_parts = (machine_name, run, lane, tile, x, y, index, read)
            seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % id_parts

            # Quality symbols become an array of ints.
            qual = phred_f(qual)

            # Numeric fields are converted in field order; ``filtered`` is
            # a 0/1 flag turned into a bool.
            numbers = [int(v) for v in (run, lane, tile, x, y, index, read)]
            record = Record(machine_name, *numbers,
                            filtered=bool(int(filtered)))

            yield seq_id, seq, qual, record
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool, optional
        Defaults to ``False``. If strict is true a FastqParse error will be
        raised if the seq and qual labels don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception will be raised if a
        quality score outside the range [0, 62] is detected
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to
        integers

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        yields the label, sequence and quality for each entry

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    FastqParseError
        If the last record is incomplete, if `strict` is set and the
        sequence and quality labels differ, or if `enforce_qual_range` is
        set and a quality score falls outside [0, 62].

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]

    """
    if phred_offset == 33:
        phred_f = _ascii_to_phred33
    elif phred_offset == 64:
        phred_f = _ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as data:
        # Four references to ONE iterator: each zip_longest step pulls four
        # consecutive lines, i.e. one FASTQ record.
        iters = [iter(data)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()

            # If the file simply ended in a blankline, do not error.
            # Truthiness is used rather than an identity test against ''
            # so that an empty byte string (binary-mode input) is
            # recognised as well.
            if not seqid:
                continue

            # Error if an incomplete record is found
            # Note: seqid cannot be None, because if all 4 values were None,
            # then the loop condition would be false, and we could not have
            # gotten to this point
            if seq is None or qualid is None or qual is None:
                raise FastqParseError("Incomplete FASTQ record found at end "
                                      "of file")

            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()

            seqid = _drop_id_marker(seqid)

            # Decode when the object offers ``decode``; otherwise keep it.
            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                pass

            qualid = _drop_id_marker(qualid)
            if strict:
                if seqid != qualid:
                    raise FastqParseError('ID mismatch: {} != {}'.format(
                        seqid, qualid))

            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            qual = phred_f(qual)
            if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                raise FastqParseError("Failed qual conversion for seq id: %s. "
                                      "This may be because you passed an "
                                      "incorrect value for phred_offset." %
                                      seqid)

            yield (seqid, seq, qual)
def parse_fastq(data, strict=False, phred_offset=33):
    r"""yields label, seq, and qual from a fastq file.

    Parameters
    ----------
    data : open file object or str
        An open fastq file (opened in binary mode) or a path to it.
    strict : bool
        If strict is true a FastqParse error will be raised if the seq
        and qual labels don't match.
    phred_offset : {33, 64}, optional
        What Phred offset to use when converting qual score symbols to
        integers. Any other value raises ``ValueError``; the offset is
        not inferred.

    Returns
    -------
    label, seq, qual : (str, bytes, np.array)
        yields the label, sequence and quality for each entry

    Raises
    ------
    ValueError
        If `phred_offset` is neither 33 nor 64.
    FastqParseError
        If `strict` is set and the sequence and quality labels differ, or
        if a quality score falls outside [0, 62].

    Examples
    --------
    Assume we have a fastq formatted file with the following contents::

        @seq1
        AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
        +
        ````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF
        @seq2
        TATGTATATATAACATATACATATATACATACATA
        +
        ]KZ[PY]_[YY^```ac^\\`bT``c`\aT``bbb

    We can use the following code:

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_fastq
    >>> fastq_f = StringIO('@seq1\n'
    ...                     'AACACCAAACTTCTCCACCACGTGAGCTACAAAAG\n'
    ...                     '+\n'
    ...                     '````Y^T]`]c^cabcacc`^Lb^ccYT\T\Y\WF\n'
    ...                     '@seq2\n'
    ...                     'TATGTATATATAACATATACATATATACATACATA\n'
    ...                     '+\n'
    ...                     ']KZ[PY]_[YY^```ac^\\\`bT``c`\\aT``bbb\n')
    >>> for label, seq, qual in parse_fastq(fastq_f, phred_offset=64):
    ...     print(label)
    ...     print(seq)
    ...     print(qual)
    seq1
    AACACCAAACTTCTCCACCACGTGAGCTACAAAAG
    [32 32 32 32 25 30 20 29 32 29 35 30 35 33 34 35 33 35 35 32 30 12 34 30 35
     35 25 20 28 20 28 25 28 23  6]
    seq2
    TATGTATATATAACATATACATATATACATACATA
    [29 11 26 27 16 25 29 31 27 25 25 30 32 32 32 33 35 30 28 28 32 34 20 32 32
     35 32 28 33 20 32 32 34 34 34]

    """
    # line number for modulus operation
    SEQUENCEID = 0
    SEQUENCE = 1
    QUALID = 2
    QUAL = 3

    # Validate the offset up front so a bad value is reported regardless
    # of what the input contains.
    if phred_offset == 33:
        phred_f = _ascii_to_phred33
    elif phred_offset == 64:
        phred_f = _ascii_to_phred64
    else:
        raise ValueError("Unknown PHRED offset of %s" % phred_offset)

    with open_file(data, 'rb') as data:
        data = iter(data)

        # Use a default with next(): a StopIteration escaping from inside
        # a generator body is converted to RuntimeError (PEP 479), so
        # empty input must end the generator explicitly.
        first_line = next(data, None)
        if first_line is None:
            return
        first_line = first_line.strip()

        seqid = _drop_id_marker(first_line)
        seq = None
        qualid = None
        qual = None

        for idx, line in enumerate(data):
            # +1 due to fetch of line prior to loop
            lineno = idx + 1
            linetype = lineno % 4
            line = line.strip()

            if linetype == SEQUENCEID:
                # A new record starts here: emit the finished one first.
                yield seqid, seq, qual

                seqid = _drop_id_marker(line)
                seq = None
                qualid = None
                qual = None
            elif linetype == SEQUENCE:
                seq = line
                # Decode when the object offers ``decode``; else keep it.
                try:
                    seq = str(seq.decode("utf-8"))
                except AttributeError:
                    pass
            elif linetype == QUALID:
                qualid = _drop_id_marker(line)
                if strict:
                    if seqid != qualid:
                        raise FastqParseError('ID mismatch: {} != {}'.format(
                            seqid, qualid))
            # bounds based on illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            elif linetype == QUAL:
                qual = phred_f(line)
                if (qual < 0).any() or (qual > 62).any():
                    raise FastqParseError("Failed qual conversion for seq "
                                          "id: %s. This may be because you "
                                          "passed an incorrect value for "
                                          "phred_offset." % seqid)

        # Emit the last record; an empty id means the input merely ended
        # with a blank line.
        if seqid:
            yield (seqid, seq, qual)