def _label_line_parser(record, splitter, strict=True): """Returns dict mapping list of data to labels, plus list with field order. Field order contains labels in order encountered in file. NOTE: doesn't care if lines are out of order in different blocks. This should never happen anyway, but it's possible that this behavior should be changed to tighten up validation. """ labels = [] result = {} for line in record: try: key, val = splitter(line.rstrip()) except: if strict: raise RecordError( "Failed to extract key and value from line %s" % line) else: continue # just skip the line if not strict if key in result: result[key].append(val) else: result[key] = [val] labels.append(key) return result, labels
def parse_qual(infile, full_header=False):
    r"""yields label and qual from a qual file.

    Parameters
    ----------
    infile : open file object or str
        An open qual file or path to it.
    full_header : bool
        Return the full header or just the id

    Returns
    -------
    label : str
        The quality label. When ``full_header`` is ``False`` this is the
        first whitespace-separated field of the header, or an empty string
        if the header has no fields.
    qual : array
        The quality at each position, as an integer array

    Raises
    ------
    RecordError
        If a quality score cannot be converted to an integer.

    Examples
    --------
    Assume we have a qual formatted file with the following contents::

        >seq1
        10 20 30 40
        >seq2
        1 2 3 4

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_qual
    >>> qual_f = StringIO('>seq1\n'
    ...                   '10 20 30 40\n'
    ...                   '>seq2\n'
    ...                   '1 2 3 4\n')
    >>> for label, qual in parse_qual(qual_f):
    ...     print(label)
    ...     print(qual)
    seq1
    [10 20 30 40]
    seq2
    [1 2 3 4]

    """
    for rec in FastaFinder(infile):
        # drop the leading label character from the header line
        curr_id = rec[0][1:]
        # scores may be wrapped over several lines; join before splitting
        curr_qual = ' '.join(rec[1:])
        try:
            parts = np.asarray(curr_qual.split(), dtype=int)
        except ValueError:
            raise RecordError(
                "Invalid qual file. Check the format of the qual file: each "
                "quality score must be convertible to an integer.")

        if full_header:
            curr_pid = curr_id
        else:
            # An empty header has no fields; yield '' (as full_header=True
            # would) instead of failing with IndexError.
            fields = curr_id.split()
            curr_pid = fields[0] if fields else ''
        yield (curr_pid, parts)
def parser(lines):
    """Yield consecutive fixed-size groups of the non-ignored lines.

    ``constructor``, ``ignore`` and ``num`` are taken from the enclosing
    scope. Each line is passed through ``constructor`` (unless it is
    ``None``), dropped if ``ignore`` returns a true value for it, and
    otherwise collected; every time ``num`` lines have been collected they
    are yielded as one list.

    Raises ``RecordError`` if lines are left over after the last complete
    group.
    """
    chunk = []
    for raw in lines:
        item = raw if constructor is None else constructor(raw)
        if ignore(item):
            continue
        chunk.append(item)
        if len(chunk) == num:
            yield chunk
            chunk = []
    # anything still buffered here is an incomplete group
    if chunk:
        raise RecordError("Non-blank lines not even multiple of %s" % num)
def verify_valid_fasta_format(input_fasta_fp):
    """Tests fasta filepath to determine if valid format

    The whole file is run through ``parse_fasta``; nothing is returned when
    it parses cleanly.

    Parameters
    ----------
    input_fasta_fp : str
        fasta filepath

    Raises
    ------
    RecordError
        If ``parse_fasta`` raises ``RecordError``. The message reports the
        last label and sequence that were parsed successfully before the
        failure (both ``None`` if the very first record is invalid).
    """
    # Pre-bind so the error message can still be built when the failure
    # happens before any record was yielded.
    label = seq = None

    # NOTE(review): "U" (universal newlines) is kept for Python 2
    # compatibility; Python 3.11+ rejects this mode -- confirm the
    # supported interpreter versions.
    with open(input_fasta_fp, "U") as fasta_f:
        # the context manager closes the file on the error path as well
        try:
            for label, seq in parse_fasta(fasta_f):
                pass
        except RecordError:
            raise RecordError("Input fasta file not valid fasta format. Error " +
                              "found at %s label and %s sequence " %
                              (label, seq))
def parser(lines):
    """Yield records that each end with a tail line.

    ``constructor``, ``ignore``, ``is_tail_line`` and ``strict`` are taken
    from the enclosing scope. Each line is passed through ``constructor``
    (unless it is ``None``) and dropped if ``ignore`` returns a true value
    for it. Remaining lines are collected, and the collected list is
    yielded as soon as ``is_tail_line`` is true for a line (the tail line
    is included in the record).

    Lines left over after the last tail line raise ``RecordError`` when
    ``strict`` is true; otherwise they are yielded as a final record.
    """
    pending = []
    for raw in lines:
        item = raw if constructor is None else constructor(raw)
        if ignore(item):
            continue
        pending.append(item)
        # a tail line closes the record collected so far
        if is_tail_line(item):
            yield pending
            pending = []

    # input exhausted with nothing buffered: done
    if not pending:
        return
    if strict:
        raise RecordError('lines exist after the last tail_line '
                          'or no tail_line at all')
    yield pending
def parser(lines):
    """Yield records separated by a delimiter line.

    ``constructor``, ``ignore``, ``delimiter``, ``keep_delimiter`` and
    ``strict`` are taken from the enclosing scope. Each line is passed
    through ``constructor`` (unless it is ``None``) and dropped if
    ``ignore`` returns a true value for it. A line equal to ``delimiter``
    ends the current record, which is then yielded; the delimiter itself
    is included only when ``keep_delimiter`` is true.

    Lines left over after the last delimiter raise ``RecordError`` when
    ``strict`` is true; otherwise they are yielded as a final record.
    """
    record = []
    for raw in lines:
        item = raw if constructor is None else constructor(raw)
        if ignore(item):
            continue
        if item == delimiter:
            # delimiter reached: emit what has been collected so far
            if keep_delimiter:
                record.append(item)
            yield record
            record = []
        else:
            record.append(item)

    # input exhausted with nothing buffered: done
    if not record:
        return
    if strict:
        raise RecordError("Found additional data after records: %s" %
                          (record))
    yield record
def parse_fasta(infile, strict=True, label_to_name=None, finder=FastaFinder,
                label_characters='>', ignore_comment=False):
    r"""Generator of labels and sequences from a fasta file.

    Parameters
    ----------
    infile : open file object or str
        An open fasta file or a path to a fasta file.
    strict : bool
        If ``True`` a ``RecordError`` will be raised if there is a fasta label
        line with no associated sequence, or a sequence with no associated
        label line (in other words, if there is a partial record). If
        ``False``, partial records will be skipped.
    label_to_name : function
        A function to apply to the sequence label (i.e., text on the header
        line) before yielding it. By default, the sequence label is returned
        with no processing. This function must take a single string as input
        and return a single string as output.
    finder : function
        The function to apply to find records in the fasta file. In general
        you should not have to change this.
    label_characters : str
        String used to indicate the beginning of a new record. In general you
        should not have to change this.
    ignore_comment : bool
        If `True`, split the sequence label on spaces, and return the label
        only as the first space separated field (i.e., the sequence
        identifier). An empty label is returned as an empty string. Note: if
        both ``ignore_comment`` and ``label_to_name`` are passed,
        ``ignore_comment`` is ignored (both operate on the label, so there is
        potential for things to get messy otherwise).

    Returns
    -------
    two-item tuple of str
        yields the label and sequence for each entry.

    Raises
    ------
    RecordError
        If ``strict == True``, raises a ``RecordError`` if there is a fasta
        label line with no associated sequence, or a sequence with no
        associated label line (in other words, if there is a partial record).

    Examples
    --------
    Assume we have a fasta-formatted file with the following contents::

        >seq1 db-accession-149855
        CGATGTCGATCGATCGATCGATCAG
        >seq2 db-accession-34989
        CATCGATCGATCGATGCATGCATGCATG

    >>> from StringIO import StringIO
    >>> fasta_f = StringIO('>seq1 db-accession-149855\n'
    ...                    'CGATGTCGATCGATCGATCGATCAG\n'
    ...                    '>seq2 db-accession-34989\n'
    ...                    'CATCGATCGATCGATGCATGCATGCATG\n')

    We can parse this as follows:

    >>> from skbio.parse.sequences import parse_fasta
    >>> for label, seq in parse_fasta(fasta_f):
    ...     print(label, seq)
    seq1 db-accession-149855 CGATGTCGATCGATCGATCGATCAG
    seq2 db-accession-34989 CATCGATCGATCGATGCATGCATGCATG

    The sequence label or header line in a fasta file is defined as containing
    two separate pieces of information, delimited by a space. The first space-
    separated entry is the sequence identifier, and everything following the
    first space is considered additional information (e.g., comments about the
    source of the sequence or the molecule that it encodes). Often we don't
    care about that information within our code. If you want to just return the
    sequence identifier from that line, you can pass ``ignore_comment=True``:

    >>> from StringIO import StringIO
    >>> fasta_f = StringIO('>seq1 db-accession-149855\n'
    ...                    'CGATGTCGATCGATCGATCGATCAG\n'
    ...                    '>seq2 db-accession-34989\n'
    ...                    'CATCGATCGATCGATGCATGCATGCATG\n')

    >>> from skbio.parse.sequences import parse_fasta
    >>> for label, seq in parse_fasta(fasta_f, ignore_comment=True):
    ...     print(label, seq)
    seq1 CGATGTCGATCGATCGATCGATCAG
    seq2 CATCGATCGATCGATGCATGCATGCATG

    """
    for rec in finder(infile):
        # first line must be a label line; an empty first line cannot be one
        # (and indexing it would fail), so it is treated the same way
        if not rec[0] or rec[0][0] not in label_characters:
            if strict:
                raise RecordError(
                    "Found Fasta record without label line: %s" % rec)
            continue

        # record must have at least one sequence
        if len(rec) < 2:
            if strict:
                raise RecordError(
                    "Found label line without sequences: %s" % rec)
            continue

        # remove the label character from the beginning of the label
        label = rec[0][1:].strip()

        if label_to_name is not None:
            # a user-supplied label_to_name takes precedence over
            # ignore_comment
            label = label_to_name(label)
        elif ignore_comment:
            # keep only the first space separated field (i.e., the sequence
            # identifier); an empty label has no fields and stays empty
            # rather than failing with IndexError
            fields = label.split()
            label = fields[0] if fields else ''

        # join the sequence lines into a single string
        seq = ''.join(rec[1:])

        yield label, seq
def parse_qual(infile, full_header=False):
    r"""yields label and qual from a qual file.

    .. note:: Deprecated in scikit-bio 0.2.0-dev
       ``parse_qual`` will be removed in scikit-bio 0.3.0. It is replaced by
       ``read``, which is a more general method for deserializing
       FASTA/QUAL-formatted files. ``read`` supports multiple file formats,
       automatic file format detection, etc. by taking advantage of
       scikit-bio's I/O registry system. See :mod:`skbio.io` for more
       details.

    Parameters
    ----------
    infile : open file object or str
        An open qual file or path to it.
    full_header : bool
        Return the full header or just the id

    Returns
    -------
    label : str
        The quality label. When ``full_header`` is ``False`` this is the
        first whitespace-separated field of the header, or an empty string
        if the header has no fields.
    qual : array
        The quality at each position, as an integer array

    Raises
    ------
    RecordError
        If a quality score cannot be converted to an integer.

    Examples
    --------
    Assume we have a qual formatted file with the following contents::

        >seq1
        10 20 30 40
        >seq2
        1 2 3 4

    >>> from StringIO import StringIO
    >>> from skbio.parse.sequences import parse_qual
    >>> qual_f = StringIO('>seq1\n'
    ...                   '10 20 30 40\n'
    ...                   '>seq2\n'
    ...                   '1 2 3 4\n')
    >>> for label, qual in parse_qual(qual_f):
    ...     print(label)
    ...     print(qual)
    seq1
    [10 20 30 40]
    seq2
    [1 2 3 4]

    """
    warnings.warn(
        "`parse_qual` is deprecated and will be removed in scikit-bio 0.3.0. "
        "Please update your code to use "
        "`skbio.io.read(fasta_fh, qual=qual_fh, format='fasta')` to obtain a "
        "generator of `BiologicalSequence` objects (or subclasses, see the "
        "`constructor` parameter) with quality scores.", DeprecationWarning)

    for rec in FastaFinder(infile):
        # drop the leading label character from the header line
        curr_id = rec[0][1:]
        # scores may be wrapped over several lines; join before splitting
        curr_qual = ' '.join(rec[1:])
        try:
            parts = np.asarray(curr_qual.split(), dtype=int)
        except ValueError:
            raise RecordError(
                "Invalid qual file. Check the format of the qual file: each "
                "quality score must be convertible to an integer.")

        if full_header:
            curr_pid = curr_id
        else:
            # An empty header has no fields; yield '' (as full_header=True
            # would) instead of failing with IndexError.
            fields = curr_id.split()
            curr_pid = fields[0] if fields else ''
        yield (curr_pid, parts)