def MinimalRdbParser(infile, strict=True):
    """Yield successive sequences as (headerLines, sequence) tuples.

    If strict is True (default) raises RecordError when 'seq' label is
    missing and if the record doesn't contain any sequences.
    """
    for rec in RdbFinder(infile):
        index = None
        # locate the line following the last 'seq:' label line; enumerate
        # avoids rec.index(line), which is quadratic and returns the wrong
        # position when an identical line occurs earlier in the record
        for i, line in enumerate(rec):
            if is_seq_label(line):
                index = i + 1  # index of first sequence line
        # if there is no line that starts with 'seq:' throw error or skip
        if not index:
            if strict:
                raise RecordError(
                    "Found Rdb record without seq label " + "line: %s" % rec[0]
                )
            else:
                continue
        headerLines = rec[:index]
        sequence = "".join(rec[index:-1])  # strip off the delimiter
        if sequence.endswith("*"):
            sequence = sequence[:-1]  # strip off '*'
        # if there are no sequences throw error or skip
        if not sequence:
            if strict:
                raise RecordError("Found Rdb record without sequences: %s" % rec[0])
            else:
                continue
        yield headerLines, sequence
def MinimalFastaParser(
    infile, strict=True, label_to_name=str, finder=FastaFinder, label_characters=">"
):
    """Yields successive sequences from infile as (label, seq) tuples.

    If strict is True (default), raises RecordError when label or seq missing.
    """
    try:
        infile = open_(infile)
        close_at_end = True
    except (TypeError, AttributeError):
        # already an open file / iterable of lines; caller owns its lifetime
        close_at_end = False
    try:
        for rec in finder(infile):
            # first line must be a label line
            if rec[0][0] not in label_characters:
                if strict:
                    raise RecordError(
                        "Found Fasta record without label line: %s" % rec
                    )
                continue
            # record must have at least one sequence
            if len(rec) < 2:
                if strict:
                    raise RecordError("Found label line without sequences: %s" % rec)
                continue
            label = rec[0][1:].strip()
            label = label_to_name(label)
            seq = "".join(rec[1:])
            yield label, seq
    finally:
        # close the file we opened even on error or early generator close;
        # the original only closed it after a clean, complete iteration
        if close_at_end:
            infile.close()
def FastaParser(infile, seq_maker=None, info_maker=MinimalInfo, strict=True):
    """Yields successive sequences from infile as (name, sequence) tuples.

    Constructs the sequence using seq_maker(seq, info=Info(info_maker(label))).

    If strict is True (default), raises RecordError when label or seq missing.
    Also raises RecordError if seq_maker fails.

    It is info_maker's responsibility to raise the appropriate RecordError or
    FieldError on failure.

    Result of info_maker need not actually be an info object, but can just be
    a dict or other data that Info can use in its constructor.
    """
    if seq_maker is None:
        seq_maker = Sequence
    for label, seq in MinimalFastaParser(infile, strict=strict):
        if strict:
            # need to do error checking when constructing info and sequence
            try:
                name, info = info_maker(label)  # will raise exception if bad
                yield name, seq_maker(seq, name=name, info=info)
            except Exception as e:
                # chain the original error so the real cause isn't lost
                raise RecordError(
                    "Sequence construction failed on record with label %s" % label
                ) from e
        else:
            # not strict: just skip any record that raises an exception
            try:
                name, info = info_maker(label)
                yield (name, seq_maker(seq, name=name, info=info))
            except Exception:
                continue
def LabelLineParser(record, splitter, strict=True):
    """Returns dict mapping list of data to labels, plus list with field order.

    Field order contains labels in order encountered in file.

    NOTE: doesn't care if lines are out of order in different blocks. This
    should never happen anyway, but it's possible that this behavior should
    be changed to tighten up validation.
    """
    labels = []
    result = {}
    for line in record:
        try:
            key, val = splitter(line.rstrip())
        except Exception as e:
            # narrowed from a bare except: (which also caught SystemExit etc.)
            if strict:
                raise RecordError(
                    "Failed to extract key and value from line %s" % line
                ) from e
            else:
                continue  # just skip the line if not strict
        if key in result:
            result[key].append(val)
        else:
            result[key] = [val]
            labels.append(key)
    return result, labels
def MinimalNexusAlignParser(align_path):
    """Yields (label, seq) pairs from the DATA/CHARACTERS block of a nexus file.

    align_path: a file path, an open file, or a sequence of lines.

    Raises ValueError if the input is not a nexus file or lacks a data block,
    RecordError if the block has no 'matrix' line.
    """
    # isinstance is the correct type test (was: type(align_path) == str)
    if isinstance(align_path, str):
        infile = open_(align_path)
    else:
        infile = align_path

    isblock = re.compile(r"begin\s+(data|characters)").search
    inblock = False
    block = []
    index = None
    try:
        try:
            line = infile.readline()
        except AttributeError:
            # guessing it's a list of strings from a nexus file
            line = infile.pop(0)

        if not line.lower().startswith("#nexus"):
            raise ValueError("not a nexus file")

        for line in infile:
            if isblock(line.lower()):
                inblock = True
            elif inblock and line.lower().startswith("end;"):
                break
            elif inblock:
                line = line.strip()
                if line.lower().startswith("matrix"):
                    # sequences start after the 'matrix' keyword
                    index = len(block)
                elif not line.startswith(";"):
                    block.append(line)
    finally:
        # close even when validation raised; the original leaked the handle
        # on the "not a nexus file" path
        if hasattr(infile, "close"):
            infile.close()

    if not block:
        raise ValueError("not found DATA or CHARACTER block")
    elif index is None:
        raise RecordError("malformed block, no 'matrix' line")

    block = block[index:]
    seqs = defaultdict(list)
    for line in block:
        if not line or (line.startswith("[") and line.endswith("]")):
            # blank or comment line
            continue
        line = line.split()
        seqs[line[0]].append("".join(line[1:]))

    for n, s in seqs.items():
        yield n, "".join(s)
def NcbiFastaLabelParser(line):
    """Creates an Info object and populates it with the line contents.

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.
    """
    info = Info()
    fields = line.split("|", 4)
    # a well-formed NCBI label has exactly five pipe-delimited fields
    if len(fields) != 5:
        raise RecordError("Unable to parse label line %s" % line)
    ignore, gi, db, db_ref, description = [strip(f) for f in fields]
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
def parser(lines):
    """Yield fixed-size groups of (optionally constructed) non-ignored lines."""
    group = []
    for raw in lines:
        item = constructor(raw) if constructor else raw
        if ignore(item):
            continue
        group.append(item)
        # emit a full group and start collecting the next one
        if len(group) == num:
            yield group
            group = []
    # leftover lines mean the input wasn't an exact multiple of num
    if group:
        raise RecordError("Non-blank lines not even multiple of %s" % num)
def parser(lines):
    """Yield records of lines, each record terminated by a tail line."""
    record = []
    for raw in lines:
        item = constructor(raw) if constructor else raw
        if ignore(item):
            continue
        record.append(item)
        # a tail line closes the current record
        if is_tail_line(item):
            yield record
            record = []
    # leftover lines: either data after the last tail line, or none at all
    if record:
        if strict:
            raise RecordError(
                "lines exist after the last tail_line or no tail_line at all"
            )
        else:
            yield record
def parser(lines):
    """Yield records of lines split on a delimiter line."""
    record = []
    for raw in lines:
        item = constructor(raw) if constructor else raw
        # ignore blank lines
        if ignore(item):
            continue
        if item != delimiter:
            record.append(item)
            continue
        # found the delimiter: optionally keep it, then emit the record
        if keep_delimiter:
            record.append(item)
        yield record
        record = []
    # trailing lines after the final delimiter
    if record:
        if strict:
            raise RecordError("Found additional data after records: %s" % (record))
        else:
            yield record
def RdbParser(
    lines, SeqConstructor=RnaSequence, LabelConstructor=InfoMaker, strict=True
):
    """Yield sequences from the Rdb record.

    lines: a stream of Rdb records.
    SeqConstructor: constructor function to create the final sequence object
    LabelConstructor: function that creates Info dictionary from label lines
    strict: boolean, when True, an error is raised when one occurs, when False,
    the record is ignored when an error occurs.

    This function returns proper RnaSequence objects when possible. It strips
    out the secondary structure information, and it replaces 'o' by '?'. The
    original sequence is stored in the info dictionary under 'OriginalSeq'.
    If the original sequence is the desired end product, use MinimalRdbParser.
    """
    for header, sequence in MinimalRdbParser(lines, strict=strict):
        info = LabelConstructor(header)
        clean_seq = create_acceptable_sequence(sequence)
        # add original raw sequence to info
        info["OriginalSeq"] = sequence
        if strict:
            # need to do error checking while constructing info and sequence
            try:
                yield SeqConstructor(clean_seq, info=info)
            except AlphabetError as e:
                # chain the cause so the alphabet failure isn't lost
                raise RecordError(
                    "Sequence construction failed on record with reference %s."
                    % (info.Refs)
                ) from e
        else:
            # not strict: just skip any record that raises an exception;
            # narrowed from a bare except: (which also caught KeyboardInterrupt)
            try:
                yield SeqConstructor(clean_seq, info=info)
            except Exception:
                continue
def DndParser(lines, constructor=PhyloNode, unescape_name=False):
    """Returns tree from the Clustal .dnd file format, and anything equivalent.

    Tree is made up of cogent3.base.tree.PhyloNode objects, with branch lengths
    (by default, although you can pass in an alternative constructor
    explicitly).

    lines: newick string or sequence of lines (joined into one string).
    constructor: node factory used for every tree node.
    unescape_name: if True, strips surrounding quotes from quoted names and
        replaces '_' with ' ' in unquoted names.

    Raises RecordError on unbalanced parentheses, on an inconsistent parser
    state, or if parsing does not return to the root.
    """
    if isinstance(lines, str):
        data = lines
    else:
        data = "".join(lines)
    # skip arb comment stuff if present: start at first paren
    paren_index = data.find("(")
    data = data[paren_index:]
    # cheap sanity check before tokenizing: parens must balance
    left_count = data.count("(")
    right_count = data.count(")")
    if left_count != right_count:
        raise RecordError(
            "Found %s left parens but %s right parens." % (left_count, right_count)
        )
    tokens = DndTokenizer(data)
    curr_node = None
    # state tracks whether the next data token is a name ("PreColon") or a
    # branch length ("PostColon"); state1 tracks whether we just closed a
    # group, in which case a name token labels the internal node.
    state = "PreColon"
    state1 = "PreClosed"
    last_token = None
    for t in tokens:
        if t == ":":  # expecting branch length
            state = "PostColon"
            # prevent state reset
            last_token = t
            continue
        if t == ")" and (last_token == "," or last_token == "("):
            # node without name
            new_node = _new_child(curr_node, constructor)
            new_node.name = None
            curr_node = new_node.parent
            state1 = "PostClosed"
            last_token = t
            continue
        if t == ")":  # closing the current node
            curr_node = curr_node.parent
            state1 = "PostClosed"
            last_token = t
            continue
        if t == "(":  # opening a new node
            curr_node = _new_child(curr_node, constructor)
        elif t == ";":  # end of data
            last_token = t
            break
        # node without name
        elif t == "," and (last_token == "," or last_token == "("):
            new_node = _new_child(curr_node, constructor)
            new_node.name = None
            curr_node = new_node.parent
        elif t == ",":  # separator: next node adds to this node's parent
            curr_node = curr_node.parent
        elif state == "PreColon" and state1 == "PreClosed":
            # data for the current node: create a new leaf and name it
            new_node = _new_child(curr_node, constructor)
            if unescape_name:
                if t.startswith("'") and t.endswith("'"):
                    # strip all nested layers of surrounding quotes
                    while t.startswith("'") and t.endswith("'"):
                        t = t[1:-1]
                else:
                    if "_" in t:
                        t = t.replace("_", " ")
            new_node.name = t
            curr_node = new_node
        elif state == "PreColon" and state1 == "PostClosed":
            # name token immediately after ')': label the internal node
            if unescape_name:
                while t.startswith("'") and t.endswith("'"):
                    t = t[1:-1]
            curr_node.name = t
        elif state == "PostColon":
            # length data for the current node
            curr_node.length = float(t)
        else:
            # can't think of a reason to get here
            raise RecordError("Incorrect PhyloNode state? %s" % t)
        state = "PreColon"  # get here for any non-colon token
        state1 = "PreClosed"
        last_token = t
    if curr_node is not None and curr_node.parent is not None:
        raise RecordError("Didn't get back to root of tree.")
    if curr_node is None:  # no data -- return empty node
        return constructor()
    return curr_node  # this should be the root of the tree
def dnastrict(x, **kwargs):
    """Returns a Dna sequence with alphabet checking; wraps failures in RecordError."""
    try:
        return Dna(x, check=True, **kwargs)
    except Exception as e:
        # chain the cause so the underlying validation error isn't lost
        raise RecordError("Could not convert sequence") from e
def dnastrict(x, **kwargs):
    """Returns a DnaSequence; wraps construction failures in RecordError."""
    try:
        return DnaSequence(x, **kwargs)
    except Exception as e:
        # chain the cause so the underlying construction error isn't lost
        raise RecordError("Could not convert sequence") from e
def MinimalPhylipParser(data, id_map=None, interleaved=True):
    """Yields successive sequences from data as (label, seq) tuples.

    **Need to implement id map.

    **NOTE if using phylip interleaved format, will cache entire file in
    memory before returning sequences. If phylip file not interleaved then
    will yield each successive sequence.

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels - not sure
        if we're going to implement this

    returns (id, sequence) tuples
    """
    # NOTE(review): seq_cache starts as a dict (used keyed-by-index in the
    # interleaved branch) but is rebound to a list [id, seq...] in the
    # non-interleaved branch — fragile but intentional dual use.
    seq_cache = {}
    interleaved_id_map = {}
    # labels occupy the first 10 columns only in the first interleaved pass
    id_offset = 10
    curr_ct = -1
    for line in data:
        if curr_ct == -1:
            # get header info; header may override the interleaved flag
            num_seqs, seq_len, interleaved = _get_header_info(line)
            if not num_seqs or not seq_len:
                # unusable header: nothing to yield
                return
            curr_ct += 1
            continue
        curr_id, curr_seq = _split_line(line, id_offset)
        # skip blank lines
        if not curr_id and not curr_seq:
            continue
        if not interleaved:
            if curr_id:
                # a new label starts a new record; flush the previous one
                if seq_cache:
                    yield seq_cache[0], "".join(seq_cache[1:])
                seq_cache = [curr_id, curr_seq]
            else:
                # continuation line of the current sequence
                seq_cache.append(curr_seq)
        else:
            # position of this line's sequence within the current pass
            curr_id_ix = curr_ct % num_seqs
            if (curr_ct + 1) % num_seqs == 0:
                # after the first full pass labels are no longer present
                id_offset = 0
            if curr_id_ix not in interleaved_id_map:
                interleaved_id_map[curr_id_ix] = curr_id
                seq_cache[curr_id_ix] = []
            seq_cache[curr_id_ix].append(curr_seq)
        curr_ct += 1
    # return joined sequences if interleaved
    if interleaved:
        for curr_id_ix, seq_parts in list(seq_cache.items()):
            join_seq = "".join(seq_parts)
            # validate each assembled sequence against the header length
            if len(join_seq) != seq_len:
                raise RecordError(
                    "Length of sequence '%s' is not the same as in header "
                    "Found %d, Expected %d"
                    % (interleaved_id_map[curr_id_ix], len(join_seq), seq_len)
                )
            yield interleaved_id_map[curr_id_ix], join_seq
    # return last seq if not interleaved
    else:
        if seq_cache:
            yield seq_cache[0], "".join(seq_cache[1:])
def check_tree_info(tree_info):
    """makes sure that there is a tree section in the file

    Raises RecordError when tree_info is empty/falsy.
    """
    # guard clause instead of the original `if x: pass / else: raise`
    if not tree_info:
        raise RecordError("not a valid Nexus Tree File")