def CutgParser(infile, strict=True, constructor=CodonUsage):
    """Yields successive sequences from infile as CodonUsage objects.

    If strict is True (default), raises RecordError when label or seq missing.
    """
    if not strict:  # not much error checking needed: following makes logic clear
        for rec in CutgFinder(infile):
            try:
                label, counts = rec
                if not is_cutg_label(label):
                    continue
                info = InfoFromLabel(label)
                freqs = constructor(list(zip(codon_order,
                    list(map(int, counts.split())))), Info=info)
                yield freqs
            except Exception:
                continue
    else:  # need to do more detailed error checking
        for rec in CutgSpeciesFinder(infile) if False else CutgFinder(infile):
            try:
                label, counts = rec
            except ValueError:  # can't have got any counts
                raise RecordError("Found label without sequences: %s" % rec)
            if not is_cutg_label(label):
                raise RecordError("Found CUTG record without label: %s" % rec)
            info = InfoFromLabel(label)
            try:
                freqs = constructor(list(zip(codon_order,
                    list(map(int, counts.split())))), Info=info)
            except (ValueError, NotImplementedError):
                # int() on malformed counts raises ValueError, so catching
                # only NotImplementedError would miss the common failure
                raise RecordError("Unable to convert counts: %s" % counts)
            yield freqs
def MinimalFastaParser(infile, strict=True, label_to_name=str,
                       finder=FastaFinder, is_label=None, label_characters='>'):
    """Yields successive sequences from infile as (label, seq) tuples.

    If strict is True (default), raises RecordError when label or seq missing.
    """
    for rec in finder(infile):
        # first line must be a label line
        if not rec[0][0] in label_characters:
            if strict:
                raise RecordError("Found Fasta record without label line: %s"
                                  % rec)
            else:
                continue
        # record must have at least one sequence
        if len(rec) < 2:
            if strict:
                raise RecordError("Found label line without sequences: %s"
                                  % rec)
            else:
                continue
        label = rec[0][1:].strip()
        label = label_to_name(label)
        seq = ''.join(rec[1:])
        yield label, seq
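# Hedged usage sketch for MinimalFastaParser. It assumes the default
# FastaFinder accepts any iterable of lines (an open file, a list, etc.);
# the record names and sequences below are made up for illustration.
def _example_minimal_fasta_parser():
    lines = ['>seq_1 first record', 'ACGT', 'ACGT', '>seq_2', 'GGCC']
    for label, seq in MinimalFastaParser(lines):
        # expected output, if FastaFinder groups lines on '>' headers:
        #   seq_1 first record ACGTACGT
        #   seq_2 GGCC
        print(label, seq)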
def MinimalRfamParser(infile, strict=True, seq_constructor=ChangedRnaSequence):
    """Yield successive sequences as (header, sequences, structure) tuples.

    header is a list of header lines
    sequences is an Alignment object. Sequences are objects keyed by the
        original labels in the database.
    structure is a WussStructure
    """
    for record in RfamFinder(infile):
        header = []
        sequences = []
        structure = []
        for line in record:
            if is_header_line(line):
                header.append(line.strip())
            elif is_seq_line(line):
                sequences.append(line)
            elif is_structure_line(line):
                structure.append(line)
            else:
                continue
        # sequence and structure are required, e.g. when looking at the
        # Stockholm format of just one family
        if not sequences or not structure:
            if strict:
                error = 'Found record with missing element(s): '
                if not sequences:
                    error += 'sequences '
                if not structure:
                    error += 'structure '
                raise RecordError(error)
            else:
                continue
        # join all sequence parts together, construct label
        try:
            sequences = load_from_clustal(sequences, strict=strict,
                                          seq_constructor=seq_constructor)
        except (DataError, RecordError) as e:
            if strict:
                raise RecordError(str(e))
            else:
                continue
        # construct the structure
        try:
            res = load_from_clustal(structure, strict=strict)
            assert len(res.NamedSeqs) == 1  # otherwise multiple keys
            structure = res.NamedSeqs['#=GC SS_cons']
        except (RecordError, KeyError, AssertionError) as e:
            if strict:
                raise RecordError("Can't parse structure of family: %s"
                                  % str(header))
            else:
                structure = None
        yield header, sequences, structure
def LabelLineParser(record, splitter, strict=True):
    """Returns dict mapping labels to lists of data, plus the field order.

    Field order contains labels in the order encountered in the file.

    NOTE: doesn't care if lines are out of order in different blocks. This
    should never happen anyway, but it's possible that this behavior should
    be changed to tighten up validation.
    """
    labels = []
    result = {}
    for line in record:
        try:
            key, val = splitter(line.rstrip())
        except Exception:
            if strict:
                raise RecordError(
                    "Failed to extract key and value from line %s" % line)
            else:
                continue  # just skip the line if not strict
        if key in result:
            result[key].append(val)
        else:
            result[key] = [val]
            labels.append(key)
    return result, labels
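# Hedged usage sketch for LabelLineParser. The splitter below is a local
# assumption: any callable that returns exactly (key, value) per line works.
def _example_label_line_parser():
    record = ['seq_1 ACGU', 'seq_2 GGCC', 'seq_1 AAAA']
    splitter = lambda line: line.split(None, 1)
    result, labels = LabelLineParser(record, splitter)
    # result == {'seq_1': ['ACGU', 'AAAA'], 'seq_2': ['GGCC']}
    # labels == ['seq_1', 'seq_2']  (order of first appearance)
    return result, labels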
def FastaParser(infile, seq_maker=None, info_maker=MinimalInfo, strict=True):
    """Yields successive sequences from infile as (name, sequence) tuples.

    Constructs the sequence using seq_maker(seq, Name=name, Info=info), where
    name and info come from info_maker(label).

    If strict is True (default), raises RecordError when label or seq missing.
    Also raises RecordError if seq_maker fails.

    It is info_maker's responsibility to raise the appropriate RecordError or
    FieldError on failure.

    Result of info_maker need not actually be an info object, but can just be
    a dict or other data that Info can use in its constructor.
    """
    if seq_maker is None:
        seq_maker = Sequence
    for label, seq in MinimalFastaParser(infile, strict=strict):
        if strict:
            # need to do error checking when constructing info and sequence
            try:
                name, info = info_maker(label)  # will raise exception if bad
                yield name, seq_maker(seq, Name=name, Info=info)
            except Exception:
                raise RecordError(
                    "Sequence construction failed on record with label %s"
                    % label)
        else:
            # not strict: just skip any record that raises an exception
            try:
                name, info = info_maker(label)
                yield name, seq_maker(seq, Name=name, Info=info)
            except Exception:
                continue
def GcToInfo(gc_lines, strict=True):
    """Returns a dict constructed from the GC lines.

    gc_lines is a list of lines that contain per-column annotation.

    Fields that (should) occur only once are stored as a single value
    """
    # construct temporary dictionary containing all original information
    initial_info = defaultdict(list)
    for line in gc_lines:
        line = line.strip()
        if not line:
            continue
        try:
            init, feature, content = line.split(None, 2)
            if not init == '#=GC':
                raise RecordError
        except Exception:
            if strict:
                raise RecordError("Failed to extract feature and content "
                                  "information from line %s" % line)
            else:
                continue
        initial_info[feature].append(content.strip())
    # transform initial dict into final one;
    # throw away useless information, group information
    final_info = {}
    for key in list(initial_info.keys()):
        name = _gc_field_names.get(key, key)
        value = initial_info[key]
        final_info[name] = ''.join(value)
    return final_info
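# Hedged usage sketch for GcToInfo: wrapped '#=GC' annotations for the same
# feature are concatenated. The final key depends on _gc_field_names;
# unmapped features like the made-up example below pass through unchanged.
def _example_gc_to_info():
    gc_lines = ['#=GC SS_cons <<<<....', '#=GC SS_cons >>>>']
    info = GcToInfo(gc_lines)
    # e.g. {'SS_cons': '<<<<....>>>>'} if 'SS_cons' is not remapped
    return info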
def RdbParser(lines, SeqConstructor=RnaSequence, LabelConstructor=InfoMaker,
              strict=True):
    """Yield sequences from the Rdb record.

    lines: a stream of Rdb records.
    SeqConstructor: constructor function to create the final sequence object
    LabelConstructor: function that creates Info dictionary from label lines
    strict: boolean; when True, an error is raised when one occurs, when
        False, the record is ignored when an error occurs.

    This function returns proper RnaSequence objects when possible. It strips
    out the secondary structure information, and it replaces 'o' by '?'. The
    original sequence is stored in the info dictionary under 'OriginalSeq'.
    If the original sequence is the desired end product, use MinimalRdbParser.
    """
    for header, sequence in MinimalRdbParser(lines, strict=strict):
        info = LabelConstructor(header)
        clean_seq = create_acceptable_sequence(sequence)
        # add original raw sequence to info
        info['OriginalSeq'] = sequence
        if strict:
            # need to do error checking while constructing info and sequence
            try:
                yield SeqConstructor(clean_seq, Info=info)
            except AlphabetError:
                raise RecordError(
                    "Sequence construction failed on record with reference %s."
                    % info.Refs)
        else:
            # not strict: just skip any record that raises an exception
            try:
                yield SeqConstructor(clean_seq, Info=info)
            except Exception:
                continue
def StockholmParser(lines, seq_constructor=Rna,
                    info_constructor_dict=AllToInfo,
                    struct_constructor=WussStructure, strict=True):
    """Yields (family_info, sequences, structure).

    Treats lines as a stream of Stockholm records.

    Family_info is the general information about the alignment.
    Sequences is an Alignment object. Each sequence has its own Info object
    with Genbank ID etc. Sequences are keyed by the original label in the
    database.
    Structure is the consensus structure of the alignment, in Wuss format
    """
    for annotation, alignment, structure in MinimalStockholmParser(
            lines, strict=strict, seq_constructor=seq_constructor):
        family_info = {}
        if strict:
            for k, v in list(annotation.items()):
                label_constructor = info_constructor_dict[k]
                try:
                    family_info[k] = label_constructor(v, strict=strict)
                except Exception:
                    raise RecordError("Info construction failed on "
                                      "record on the %s annotation" % k)
            try:
                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
                alignment.Info.update(family_info)
                alignment.Info.update({'Struct': structure})
                yield alignment
            except Exception:
                raise RecordError(
                    "Sequence construction failed on record with reference %s"
                    % family_info['GF'].get('AccessionNumber', None))
        else:
            try:
                for k, v in list(annotation.items()):
                    label_constructor = info_constructor_dict[k]
                    family_info[k] = label_constructor(v, strict=strict)
                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
                alignment.Info.update(family_info)
                alignment.Info.update({'Struct': structure})
                yield alignment
            except Exception:
                continue
def NameToInfo(sequence, strict=True):
    """Returns an Info object constructed from the sequence Name.

    sequence: Sequence object with a Name attribute

    The label will be split on the GenBank acc. no. and the sequence
    coordinates. The start coordinate is shifted one position, since in
    Python the first position is 0.
    """
    # adjust label
    label = sequence.Name
    try:
        gb, pos = label.split('/', 1)  # split genbank label and pos
        if not gb:
            gb = None
        if not pos:
            pos = None
    except ValueError:  # unable to split, so string doesn't contain '/'
        if strict:
            raise RecordError("Failed to extract genbank id and positions"
                              " from label %s" % label)
        else:
            gb = None
            pos = None
    if pos:
        try:
            start, end = pos.split('-', 1)  # split start and end pos
        except ValueError:
            if strict:
                raise RecordError("Failed to extract genbank id and "
                                  "positions from label %s" % label)
            else:
                start = None
                end = None
    else:
        start = None
        end = None
    if start:
        # adjust start position to do the correct thing in python
        # see comment in docstring
        start = int(start) - 1
    if end:
        end = int(end)
    info = Info({'GenBank': gb, 'Start': start, 'End': end})
    return info
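# Hedged usage sketch for NameToInfo: any object with a Name attribute in
# 'accession/start-end' form works; the Rfam-style label below is made up.
def _example_name_to_info():
    class _FakeSeq(object):
        Name = 'AB012345/15-80'
    info = NameToInfo(_FakeSeq())
    # info['GenBank'] == 'AB012345', info['Start'] == 14 (zero-based),
    # info['End'] == 80
    return info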
def RfamParser(lines, seq_constructor=ChangedRnaSequence,
               label_constructor=HeaderToInfo,
               struct_constructor=WussStructure, strict=True, verbose=False):
    """Yields (family_info, sequences, structure).

    Treats lines as a stream of Rfam records.

    Family_info is the general information about the alignment.
    Sequences is an Alignment object. Each sequence has its own Info object
    with Genbank ID etc. Sequences are keyed by the original label in the
    database.
    Structure is the consensus structure of the alignment, in Wuss format
    """
    for header, alignment, structure in MinimalRfamParser(
            lines, strict=strict, seq_constructor=seq_constructor):
        if strict:
            try:
                family_info = label_constructor(header, strict=strict)
            except Exception:
                raise RecordError("Info construction failed on "
                                  "record with header %s" % header)
            try:
                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
                yield family_info, alignment, structure
            except Exception:
                raise RecordError(
                    "Sequence construction failed on record with reference %s"
                    % family_info.Refs)
        else:
            try:
                family_info = label_constructor(header, strict=strict)
                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
                yield family_info, alignment, structure
            except Exception as e:
                if verbose:
                    print(type(e), e)
                continue
def NcbiFastaLabelParser(line):
    """Creates an Info object and populates it with the line contents.

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.
    """
    info = Info()
    try:
        ignore, gi, db, db_ref, description = list(
            map(strip, line.split('|', 4)))
    except ValueError:  # probably got wrong number of fields
        raise RecordError("Unable to parse label line %s" % line)
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
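# Hedged usage sketch for NcbiFastaLabelParser: the label below follows the
# 'gi|<id>|<db>|<accession>|<description>' layout the docstring describes,
# and assumes NcbiLabels knows the 'gb' database tag.
def _example_ncbi_label_parser():
    gi, info = NcbiFastaLabelParser(
        'gi|12345|gb|AB012345.1|hypothetical protein')
    # gi == '12345'; info.Description == 'hypothetical protein'
    return gi, info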
def parser(lines):
    curr = []
    for line in lines:
        if constructor:
            line = constructor(line)
        if ignore(line):
            continue
        curr.append(line)
        # once num lines have accumulated, yield them as one record
        if len(curr) == num:
            yield curr
            curr = []
    if curr:
        raise RecordError("Non-blank lines not even multiple of %s" % num)
def CutgSpeciesParser(infile, strict=True, constructor=CodonUsage):
    """Yields successive sequences from infile as CodonUsage objects.

    If strict is True (default), raises RecordError when label or seq missing.
    """
    if not strict:  # easier to see logic without detailed error handling
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
                if not is_cutg_species_label(label):
                    continue
                species, genes = species_label_splitter(label)
                info = Info({'Species': species, 'NumGenes': int(genes)})
                freqs = constructor(list(zip(codon_order,
                    list(map(int, counts.split())))), Info=info)
                yield freqs
            except Exception:
                continue
    else:
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
            except ValueError:  # can't have got any counts
                raise RecordError("Found label without sequences: %s" % rec)
            if not is_cutg_species_label(label):
                raise RecordError("Found CUTG record without label: %s" % rec)
            species, genes = species_label_splitter(label)
            info = Info({'Species': species, 'NumGenes': int(genes)})
            try:
                d = list(zip(codon_order, list(map(int, counts.split()))))
                freqs = constructor(d, Info=info)
            except Exception:
                raise RecordError("Unable to convert counts: %s" % counts)
            yield freqs
def verify_valid_fasta_format(input_fasta_fp):
    """Tests fasta filepath to determine if valid format

    input_fasta_fp: fasta filepath
    """
    fasta_f = open(input_fasta_fp, "U")
    # initialize so the error message works even if the first record fails
    label, seq = None, None
    try:
        for label, seq in MinimalFastaParser(fasta_f):
            continue
    except RecordError:
        # label/seq hold the last successfully parsed record, if any
        raise RecordError("Input fasta file not valid fasta format. Error "
                          "found after label %s and sequence %s" % (label, seq))
    finally:
        fasta_f.close()
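# Hedged usage sketch for verify_valid_fasta_format: writes a throwaway
# fasta file and checks it; only the standard library is assumed.
def _example_verify_fasta():
    import os
    import tempfile
    fd, path = tempfile.mkstemp(suffix='.fasta')
    with os.fdopen(fd, 'w') as f:
        f.write('>seq_1\nACGT\n')
    try:
        verify_valid_fasta_format(path)  # raises RecordError if malformed
    finally:
        os.remove(path)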
def InfoFromLabel(line):
    """Takes a CUTG codon description line and returns an Info object.

    Raises RecordError if wrong number of fields etc.
    """
    try:
        raw_fields = line.split('\\')
        result = Info(dict(list(zip(field_order,
                                    list(map(strip, raw_fields[1:]))))))
        # extra processing for first field
        first = raw_fields[0]
        if '#' in first:
            locus, cds_num = list(map(strip, raw_fields[0].split('#')))
        else:
            locus, cds_num = first, '1'
        result['Locus'] = locus[1:]  # remove leading '>'
        result['CdsNumber'] = cds_num
        # additional processing for last field: mostly key="value" pairs
        description = result['Description']
        descrs = description.split('/')
        for d in descrs:
            if '=' in d:  # assume key-value pair
                key, val = list(map(strip, d.split('=', 1)))  # might be '=' in value
                # cut off leading and trailing " if present, but _not_ internal!
                if val.startswith('"'):
                    val = val[1:]
                if val.endswith('"'):
                    val = val[:-1]
                if key == 'db_xref':  # handle cross-refs specially
                    try:
                        key, val = val.split(':')
                    except ValueError:  # missing actual reference?
                        continue  # just skip the bad db records
                    try:
                        if result[key]:
                            result[key].append(val)
                        else:
                            result[key] = [val]
                    except (KeyError, TypeError):  # didn't recognize database
                        result[key] = val
                else:
                    # remember to convert the key to MixedCase naming convention
                    result[cfu(key)] = val
        return result
    except Exception:
        raise RecordError("Failed to read label line:\n%s" % line)
def HeaderToInfo(header, strict=True):
    """Returns an Info object constructed from the header lines.

    Header is a list of lines that contain header information.

    Fields that can occur multiple times in a header are stored in a list.
    Fields that (should) occur only once are stored as a single value.
    Comments are joined by ' ' into one field.
    Fields concerning the references are ignored, except for the MedLine ID.
    """
    # construct temporary dictionary containing all original information
    initial_info = {}
    for line in header:
        line = line.strip()
        if not line:
            continue
        try:
            init, label, content = line.split(' ', 2)
            if not init == '#=GF' or len(label) != 2:
                raise RecordError
        except Exception:
            if strict:
                raise RecordError("Failed to extract label and content "
                                  "information from line %s" % line)
            else:
                continue
        if label in ['BM', 'DR', 'RM', 'CC']:
            if label in initial_info:
                initial_info[label].append(content.strip())
            else:
                initial_info[label] = [content.strip()]
        else:
            initial_info[label] = content.strip()
    # transform initial dict into final one;
    # throw away useless information, group information
    final_info = {}
    for key in list(initial_info.keys()):
        name = _field_names.get(key, key)
        if name == 'Comment':
            value = ' '.join(initial_info[key])
        else:
            value = initial_info[key]
        final_info[name] = value
    return Info(final_info)
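# Hedged usage sketch for HeaderToInfo: '#=GF' lines with two-letter labels,
# as in Rfam headers. Final key names depend on the _field_names mapping.
def _example_header_to_info():
    header = ['#=GF AC   RF00001',
              '#=GF CC   first part of a comment',
              '#=GF CC   second part of a comment']
    info = HeaderToInfo(header)
    # the 'CC' lines are collected and joined by ' ' under 'Comment'
    # (assuming _field_names maps 'CC' to 'Comment')
    return info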
def GfToInfo(gf_lines, strict=True):
    """Returns a dict constructed from the GF lines.

    gf_lines is a list of lines that contain per-file annotation.

    Fields that can occur multiple times in a header are stored in a list.
    Fields that (should) occur only once are stored as a single value.
    Comments are joined by ' ' into one field.
    Fields concerning the references are ignored, except for the MedLine ID.
    """
    # construct temporary dictionary containing all original information
    initial_info = {}
    for line in gf_lines:
        line = line.strip()
        if not line:
            continue
        try:
            init, feature, content = line.split(None, 2)
            if not init == '#=GF':
                raise RecordError
        except Exception:
            if strict:
                raise RecordError("Failed to extract feature and content "
                                  "information from line %s" % line)
            else:
                continue
        if feature in ['BM', 'DR', 'RM', 'CC', 'FT']:
            if feature in initial_info:
                initial_info[feature].append(content.strip())
            else:
                initial_info[feature] = [content.strip()]
        else:
            initial_info[feature] = content.strip()
    # transform initial dict into final one;
    # throw away useless information, group information
    final_info = {}
    for key in list(initial_info.keys()):
        name = _gf_field_names.get(key, key)
        if name == 'Comment':
            value = ' '.join(initial_info[key])
        else:
            value = initial_info[key]
        final_info[name] = value
    return final_info
def GsToInfo(gs_lines, strict=True):
    """Returns a dict constructed from the GS lines.

    gs_lines is a list of lines that contain per-sequence annotation.

    Fields that can occur multiple times in a header are stored in a list.
    Fields that (should) occur only once are stored as a single value
    """
    # construct temporary dictionary containing all original information
    initial_info = {}
    for line in gs_lines:
        line = line.strip()
        if not line:
            continue
        try:
            init, seqname, feature, content = line.split(None, 3)
            if not init == '#=GS':
                raise RecordError
        except Exception:
            if strict:
                raise RecordError("Failed to extract feature and content "
                                  "information from line %s" % line)
            else:
                continue
        if feature in ['DE', 'DR', 'BP']:
            # multi-valued features: collect a list per sequence name.
            # setdefault also covers a new seqname under a known feature,
            # which would otherwise raise KeyError on the append below
            initial_info.setdefault(feature, {}).setdefault(
                seqname, []).append(content.strip())
        elif feature not in initial_info:
            initial_info[feature] = {seqname: content.strip()}
        else:
            initial_info[feature][seqname] = content.strip()
    # transform initial dict into final one;
    # throw away useless information, group information
    final_info = {}
    for key in list(initial_info.keys()):
        name = _gs_field_names.get(key, key)
        value = initial_info[key]
        final_info[name] = value
    return final_info
def GrToInfo(gr_lines, strict=True):
    """Returns a dict constructed from the GR lines.

    gr_lines is a list of lines that contain per-sequence AND per-column
    annotation.

    Fields that can occur multiple times in a header are stored in a list.
    Fields that (should) occur only once are stored as a single value
    """
    # construct temporary dictionary containing all original information
    initial_info = defaultdict(dict)
    for line in gr_lines:
        line = line.strip()
        if not line:
            continue
        try:
            init, seqname, feature, content = line.split(None, 3)
            if not init == '#=GR':
                raise RecordError
        except Exception:
            if strict:
                raise RecordError("Failed to extract feature and content "
                                  "information from line %s" % line)
            else:
                continue
        # defaultdict creates the per-feature dict on first access, so a
        # single membership test covers both new features and new seqnames
        if seqname not in initial_info[feature]:
            initial_info[feature][seqname] = []
        initial_info[feature][seqname].append(content.strip())
    # transform initial dict into final one;
    # throw away useless information, group information
    final_info = {}
    for feature in list(initial_info.keys()):
        name = _gr_field_names.get(feature, feature)
        value = initial_info[feature]
        for k, v in list(value.items()):
            value[k] = ''.join(v)
        final_info[name] = value
    return final_info
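# Hedged usage sketch for GrToInfo: per-sequence '#=GR' annotations whose
# wrapped parts are concatenated per sequence name. The final key depends
# on _gr_field_names; unmapped features pass through unchanged.
def _example_gr_to_info():
    gr_lines = ['#=GR seq_1 SS <<<<....', '#=GR seq_1 SS >>>>']
    info = GrToInfo(gr_lines)
    # e.g. {'SS': {'seq_1': '<<<<....>>>>'}} if 'SS' is not remapped
    return info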
def parser(lines):
    curr = []
    for line in lines:
        if constructor:
            line = constructor(line)
        if ignore(line):
            continue
        curr.append(line)
        # if we find the tail line, yield the current record
        if is_tail_line(line):
            yield curr
            curr = []
    # any lines left over were not closed by a tail line
    if curr:
        if strict:
            raise RecordError('lines exist after the last tail_line '
                              'or no tail_line at all')
        else:
            yield curr
def parser(lines):
    curr = []
    for line in lines:
        if constructor:
            line = constructor(line)
        # ignore blank lines
        if ignore(line):
            continue
        # if we find the delimiter, yield the record; otherwise, keep the line
        if line == delimiter:
            if keep_delimiter:
                curr.append(line)
            yield curr
            curr = []
        else:
            curr.append(line)
    if curr:
        if strict:
            raise RecordError("Found additional data after records: %s"
                              % curr)
        else:
            yield curr
def check_tree_info(tree_info):
    """Makes sure that there is a tree section in the file."""
    if not tree_info:
        raise RecordError("not a valid Nexus Tree File")
def dnastrict(x, **kwargs):
    """Returns a DnaSequence, raising RecordError on conversion failure."""
    try:
        return DnaSequence(x, **kwargs)
    except Exception:
        raise RecordError("Could not convert sequence")
def MinimalPhylipParser(data, id_map=None, interleaved=True):
    """Yields successive sequences from data as (label, seq) tuples.

    **Need to implement id map.

    **NOTE: if using phylip interleaved format, the entire file is cached in
    memory before sequences are returned. If the phylip file is not
    interleaved, each successive sequence is yielded as it completes.

    data: sequence of lines in phylip format (an open file, list, etc.)
    id_map: optional id mapping from external ids to phylip labels - not
        sure if we're going to implement this

    returns (id, sequence) tuples
    """
    seq_cache = {}
    interleaved_id_map = {}
    id_offset = 10
    curr_ct = -1
    for line in data:
        if curr_ct == -1:
            # get header info
            num_seqs, seq_len, interleaved = _get_header_info(line)
            if not num_seqs or not seq_len:
                return
            curr_ct += 1
            continue
        curr_id, curr_seq = _split_line(line, id_offset)
        # skip blank lines
        if not curr_id and not curr_seq:
            continue
        if not interleaved:
            if curr_id:
                if seq_cache:
                    yield seq_cache[0], ''.join(seq_cache[1:])
                seq_cache = [curr_id, curr_seq]
            else:
                seq_cache.append(curr_seq)
        else:
            curr_id_ix = curr_ct % num_seqs
            if (curr_ct + 1) % num_seqs == 0:
                id_offset = 0
            if curr_id_ix not in interleaved_id_map:
                interleaved_id_map[curr_id_ix] = curr_id
                seq_cache[curr_id_ix] = []
            seq_cache[curr_id_ix].append(curr_seq)
        curr_ct += 1
    # return joined sequences if interleaved
    if interleaved:
        for curr_id_ix, seq_parts in seq_cache.items():
            join_seq = ''.join(seq_parts)
            if len(join_seq) != seq_len:
                raise RecordError(
                    "Length of sequence '%s' is not the same as in header. "
                    "Found %d, expected %d" % (
                        interleaved_id_map[curr_id_ix], len(join_seq),
                        seq_len))
            yield interleaved_id_map[curr_id_ix], join_seq
    # return last seq if not interleaved
    else:
        if seq_cache:
            yield seq_cache[0], ''.join(seq_cache[1:])
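# Hedged usage sketch for MinimalPhylipParser. It assumes _get_header_info
# reads the conventional '<num_seqs> <seq_len>' phylip header (with an
# optional 'I' marking interleaved data) and that labels occupy the standard
# 10-character field; the sequences below are made up.
def _example_minimal_phylip_parser():
    data = ['2 8 I',
            'seq_a     ACGT',
            'seq_b     TGCA',
            '',
            'ACGT',
            'TGCA']
    for label, seq in MinimalPhylipParser(data):
        # expected: ('seq_a', 'ACGTACGT'), ('seq_b', 'TGCATGCA')
        print(label, seq)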
def dnastrict(x, **kwargs):
    """Returns a checked Dna sequence, raising RecordError on failure."""
    try:
        return Dna(x, check=True, **kwargs)
    except Exception:
        raise RecordError("Could not convert sequence")
def DndParser(lines, constructor=PhyloNode, unescape_name=False):
    """Returns tree from the Clustal .dnd file format, and anything equivalent.

    Tree is made up of cogent.base.tree.PhyloNode objects, with branch
    lengths (by default, although you can pass in an alternative constructor
    explicitly).
    """
    if isinstance(lines, str):
        data = lines
    else:
        data = ''.join(lines)
    # skip arb comment stuff if present: start at first paren
    paren_index = data.find('(')
    data = data[paren_index:]
    left_count = data.count('(')
    right_count = data.count(')')
    if left_count != right_count:
        raise RecordError("Found %s left parens but %s right parens."
                          % (left_count, right_count))
    tokens = DndTokenizer(data)
    curr_node = None
    state = 'PreColon'
    state1 = 'PreClosed'
    last_token = None
    for t in tokens:
        if t == ':':  # expecting branch length
            state = 'PostColon'
            # prevent state reset
            last_token = t
            continue
        if t == ')' and (last_token == ',' or last_token == '('):
            # node without name
            new_node = _new_child(curr_node, constructor)
            new_node.Name = None
            curr_node = new_node.Parent
            state1 = 'PostClosed'
            last_token = t
            continue
        if t == ')':  # closing the current node
            curr_node = curr_node.Parent
            state1 = 'PostClosed'
            last_token = t
            continue
        if t == '(':  # opening a new node
            curr_node = _new_child(curr_node, constructor)
        elif t == ';':  # end of data
            last_token = t
            break
        elif t == ',' and (last_token == ',' or last_token == '('):
            # node without name
            new_node = _new_child(curr_node, constructor)
            new_node.Name = None
            curr_node = new_node.Parent
        elif t == ',':  # separator: next node adds to this node's parent
            curr_node = curr_node.Parent
        elif state == 'PreColon' and state1 == 'PreClosed':
            # data for the current node
            new_node = _new_child(curr_node, constructor)
            if unescape_name:
                if t.startswith("'") and t.endswith("'"):
                    while t.startswith("'") and t.endswith("'"):
                        t = t[1:-1]
                else:
                    if '_' in t:
                        t = t.replace('_', ' ')
            new_node.Name = t
            curr_node = new_node
        elif state == 'PreColon' and state1 == 'PostClosed':
            # name for the node we just closed
            if unescape_name:
                while t.startswith("'") and t.endswith("'"):
                    t = t[1:-1]
            curr_node.Name = t
        elif state == 'PostColon':
            # length data for the current node
            curr_node.Length = float(t)
        else:
            # can't think of a reason to get here
            raise RecordError("Incorrect PhyloNode state? %s" % t)
        state = 'PreColon'  # get here for any non-colon token
        state1 = 'PreClosed'
        last_token = t
    if curr_node is not None and curr_node.Parent is not None:
        raise RecordError("Didn't get back to root of tree.")
    if curr_node is None:
        # no data -- return empty node
        return constructor()
    return curr_node  # this should be the root of the tree
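# Hedged usage sketch for DndParser with a small hand-written newick string;
# the tip names and branch lengths are made up.
def _example_dnd_parser():
    tree = DndParser('((a:1.0,b:2.0):0.5,c:3.0);')
    # tree is the root PhyloNode; its tips should be named 'a', 'b', 'c'
    # and carry Length attributes of 1.0, 2.0, 3.0 respectively
    return tree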
def MinimalStockholmParser(infile, strict=True, seq_constructor=Rna):
    """Yield successive records as ({'GF': gf, 'GC': gc, 'GS': gs,
    'GR': gr}, sequences, structure) tuples.

    gf is a list of GF lines
    gc is a list of GC lines
    gs is a list of GS lines
    gr is a list of GR lines
    sequences is an Alignment object. Sequences are Rna objects keyed by the
        original labels in the database.
    structure is a WussStructure
    """
    for record in StockholmFinder(infile):
        gf = []
        gc = []
        gs = []
        gr = []
        sequences = []
        structure = []
        for line in record:
            if is_gf_line(line):
                gf.append(line.strip())
            elif is_gc_line(line):
                gc.append(line.strip())
                if is_structure_line(line):
                    structure.append(line)
            elif is_gs_line(line):
                gs.append(line.strip())
            elif is_gr_line(line):
                gr.append(line.strip())
            elif is_seq_line(line):
                sequences.append(line)
            else:
                continue
        # sequences are required, e.g. when looking at the Stockholm format
        # of just one family
        if not sequences:
            if strict:
                raise RecordError(
                    'Found record with missing element(s): sequences')
            else:
                continue
        # join all sequence parts together, construct label
        try:
            sequences = load_from_clustal(sequences, strict=strict,
                                          seq_constructor=seq_constructor)
        except (DataError, RecordError) as e:
            if strict:
                raise RecordError(str(e))
            else:
                continue
        # construct the structure
        if structure:
            try:
                res = load_from_clustal(structure, strict=strict,
                                        gap_char='.')
                assert len(res.NamedSeqs) == 1  # otherwise multiple keys
                structure = res.NamedSeqs['#=GC SS_cons']
            except (RecordError, KeyError, AssertionError) as e:
                if strict:
                    raise RecordError("Can't parse structure of family")
                structure = None
        yield {'GF': gf, 'GC': gc, 'GS': gs, 'GR': gr}, sequences, structure
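# Hedged usage sketch for MinimalStockholmParser with a minimal, made-up
# Stockholm record. It assumes StockholmFinder accepts any iterable of lines
# and that the default line classifiers key off the '#=GF'/'#=GC' prefixes
# and the '//' terminator.
def _example_minimal_stockholm_parser():
    lines = ['# STOCKHOLM 1.0',
             '#=GF AC   RF99999',
             'seq_1         AACCGG',
             '#=GC SS_cons  <<..>>',
             '//']
    for annotation, sequences, structure in MinimalStockholmParser(lines):
        # annotation['GF'] should hold the '#=GF' line; structure should be
        # the parsed SS_cons annotation
        print(annotation['GF'], structure)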