def parse(fileName):
    """Read a Pfam-style alignment file.

    Leading lines whose first whitespace-separated field is exactly two
    characters long are treated as header and skipped.  Every non-blank
    line after that must consist of exactly two fields: a sequence name
    and the sequence contents.

    Returns a 3-tuple ``(sequences, {}, {})``; the two dictionaries are
    always empty for this format.

    Raises WrongFileTypeError if a data line does not have exactly two
    fields, or if no sequences were found at all.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    # try/finally so the file is closed even when we bail out with an error
    try:
        inHeader = True
        for line in f:
            fields = line.split()
            if not fields:
                # blank (or whitespace-only) line
                continue
            if inHeader:
                # skip header crap
                if len(fields[0]) == 2:
                    continue
                inHeader = False
            if len(fields) != 2:
                # since this format lacks mandatory headers,
                # cannot be sure it _is_ a Pfam file and therefore
                # cannot raise FormatSyntaxError
                raise WrongFileTypeError()
            seq = Sequence(makeReadable(fields[0]))
            seq.extend(fields[1])
            sequences.append(seq)
    finally:
        f.close()
    if not sequences:
        raise WrongFileTypeError()
    return sequences, {}, {}
def parse(fileName):
    """Read a FASTA-style file.

    A line whose first character is '>' starts a new sequence; the rest of
    that line is the sequence name.  Following lines are appended (stripped)
    to the current sequence until a blank line or the next '>' line.  Lines
    that appear outside of a sequence and do not start with '>' are ignored.

    Returns a 3-tuple ``(sequences, {}, {})``; the two dictionaries are
    always empty for this format.

    Raises FormatSyntaxError if a new '>' line is reached while the previous
    sequence is still empty, and WrongFileTypeError if the file contains no
    sequences.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    # try/finally so the file is closed even when a syntax error is raised
    try:
        inSequence = False
        for line in f:
            if line.startswith('>'):
                # a header line always starts a new sequence, whether or
                # not we were in the middle of one
                if sequences and len(sequences[-1]) == 0:
                    raise FormatSyntaxError("No sequence"
                        " found for %s" % sequences[-1].name)
                inSequence = True
                sequences.append(Sequence(makeReadable(line[1:])))
            elif inSequence:
                if line.isspace():
                    # blank line terminates the current sequence
                    inSequence = False
                else:
                    sequences[-1].extend(line.strip())
            # NOTE(review): the empty-sequence check above only fires when
            # another '>' line follows; a trailing header with no residues
            # is accepted -- confirm whether that is intended
    finally:
        f.close()
    if not sequences:
        raise WrongFileTypeError()
    return sequences, {}, {}
def parse(fileName):
    """Read a CLUSTAL-style alignment file.

    Blank lines may precede a header line that must start with "CLUSTAL".
    After the header, each line that does not start with whitespace must be
    "name contents" or "name contents length" (the third field is ignored).
    Lines that are blank or start with whitespace separate blocks.  The
    first block defines the sequences; lines in later blocks are appended
    to those sequences in the same order.

    Returns a 3-tuple ``(sequences, {}, {})``; the two dictionaries are
    always empty for this format.

    Raises WrongFileTypeError if non-blank text precedes the header or no
    sequences are found, and FormatSyntaxError for a malformed sequence
    line or a block containing more lines than the first block.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    # try/finally so the file is closed even when an error is raised
    try:
        inHeader = True
        firstBlock = True
        expect = 0  # index of the sequence the next block line belongs to
        lineNum = 0
        for line in f:
            lineNum += 1
            if inHeader:
                if line.startswith("CLUSTAL"):
                    inHeader = False
                elif line.strip() != "":
                    raise WrongFileTypeError()
                continue
            if not line or line[0].isspace():
                # block separator; only meaningful once a block has
                # actually been read
                if sequences:
                    firstBlock = False
                    expect = 0
                continue
            fields = line.split()
            if len(fields) not in (2, 3):
                raise FormatSyntaxError("Line %d is not "
                    "sequence name followed by sequence "
                    " contents and optional ungapped length"
                    % lineNum)
            seqName, seqBlock = fields[0], fields[1]
            if firstBlock:
                sequences.append(Sequence(makeReadable(seqName)))
                sequences[-1].append(seqBlock)
                continue
            try:
                seq = sequences[expect]
            except IndexError:
                raise FormatSyntaxError("Sequence on line %d not in"
                    " initial sequence block" % lineNum)
            expect += 1
            seq.append(seqBlock)
    finally:
        f.close()
    if not sequences:
        raise WrongFileTypeError()
    return sequences, {}, {}
def parse(fileName):
    """Read a CLUSTAL-style alignment file.

    Blank lines may precede a header line that must start with "CLUSTAL".
    After the header, each line that does not start with whitespace must be
    "name contents" or "name contents length" (the third field is ignored).
    Lines that are blank or start with whitespace separate blocks.  The
    first block defines the sequences; lines in later blocks are appended
    to those sequences in the same order.

    Returns a 3-tuple ``(sequences, {}, {})``; the two dictionaries are
    always empty for this format.

    Raises WrongFileTypeError if non-blank text precedes the header or no
    sequences are found, and FormatSyntaxError for a malformed sequence
    line or a block containing more lines than the first block.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    # try/finally so the file is closed even when an error is raised
    try:
        inHeader = True
        firstBlock = True
        expect = 0  # index of the sequence the next block line belongs to
        lineNum = 0
        for line in f:
            lineNum += 1
            if inHeader:
                if line.startswith("CLUSTAL"):
                    inHeader = False
                elif line.strip() != "":
                    raise WrongFileTypeError()
                continue
            if not line or line[0].isspace():
                # block separator; only meaningful once a block has
                # actually been read
                if sequences:
                    firstBlock = False
                    expect = 0
                continue
            fields = line.split()
            if len(fields) not in (2, 3):
                raise FormatSyntaxError(
                    "Line %d is not "
                    "sequence name followed by sequence "
                    " contents and optional ungapped length"
                    % lineNum)
            seqName, seqBlock = fields[0], fields[1]
            if firstBlock:
                sequences.append(Sequence(makeReadable(seqName)))
                sequences[-1].append(seqBlock)
                continue
            try:
                seq = sequences[expect]
            except IndexError:
                raise FormatSyntaxError("Sequence on line %d not in"
                    " initial sequence block" % lineNum)
            expect += 1
            seq.append(seqBlock)
    finally:
        f.close()
    if not sequences:
        raise WrongFileTypeError()
    return sequences, {}, {}
def parse(fileName):
    """Read a PIR-style sequence file.

    Each entry is a ">XX;name" line (XX is a two-character type code),
    followed by one description line, followed by sequence lines; the
    entry ends at the first sequence line whose last character is '*'.
    Lines between entries that do not look like an entry start are ignored.

    Each sequence gets ``nucleic`` (0 for type codes "P1" and "F1", else 1),
    ``PIRtype``, ``descript`` and ``PIRdescript`` attributes.

    Returns a 3-tuple ``(sequences, {}, {})``; the two dictionaries are
    always empty for this format.

    Raises WrongFileTypeError if no entries are found, and
    FormatSyntaxError if the last entry has no terminating '*'.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    want = 'init'
    # try/finally so the file is closed even if reading fails part-way
    try:
        for line in f:
            line = line.strip()
            if want == 'init':
                if len(line) < 4 or line[0] != '>' or line[3] != ';':
                    continue
                seq = Sequence(makeReadable(line[4:]))
                sequences.append(seq)
                pirType = line[1:3]
                if pirType in ("P1", "F1"):
                    seq.nucleic = 0
                else:
                    seq.nucleic = 1
                seq.PIRtype = pirType
                want = 'descript'
            elif want == 'descript':
                sequences[-1].descript = line
                sequences[-1].PIRdescript = line
                want = 'sequence'
            elif want == 'sequence':
                if not line:
                    continue
                if line[-1] == '*':
                    # '*' terminates the entry and is not part of it
                    want = 'init'
                    line = line[:-1]
                # drop embedded whitespace; join so that a plain string
                # is handed to extend() regardless of Python version
                sequences[-1].extend("".join(
                    [c for c in line if c not in string.whitespace]))
    finally:
        f.close()
    if not sequences:
        raise WrongFileTypeError()
    if want != 'init':
        raise FormatSyntaxError("Could not find end of sequence '%s'"
            % sequences[-1].name)
    return sequences, {}, {}
def parse(fileName):
    """Read a PIR-style sequence file.

    Each entry is a ">XX;name" line (XX is a two-character type code),
    followed by one description line, followed by sequence lines; the
    entry ends at the first sequence line whose last character is '*'.
    Lines between entries that do not look like an entry start are ignored.

    Each sequence gets ``nucleic`` (0 for type codes "P1" and "F1", else 1),
    ``PIRtype``, ``descript`` and ``PIRdescript`` attributes.

    Returns a 3-tuple ``(sequences, {}, {})``; the two dictionaries are
    always empty for this format.

    Raises WrongFileTypeError if no entries are found, and
    FormatSyntaxError if the last entry has no terminating '*'.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    want = 'init'
    # try/finally so the file is closed even if reading fails part-way
    try:
        for line in f:
            line = line.strip()
            if want == 'init':
                if len(line) < 4 or line[0] != '>' or line[3] != ';':
                    continue
                seq = Sequence(makeReadable(line[4:]))
                sequences.append(seq)
                pirType = line[1:3]
                if pirType in ("P1", "F1"):
                    seq.nucleic = 0
                else:
                    seq.nucleic = 1
                seq.PIRtype = pirType
                want = 'descript'
            elif want == 'descript':
                sequences[-1].descript = line
                sequences[-1].PIRdescript = line
                want = 'sequence'
            elif want == 'sequence':
                if not line:
                    continue
                if line[-1] == '*':
                    # '*' terminates the entry and is not part of it
                    want = 'init'
                    line = line[:-1]
                # drop embedded whitespace; join so that a plain string
                # is handed to extend() regardless of Python version
                sequences[-1].extend("".join(
                    [c for c in line if c not in string.whitespace]))
    finally:
        f.close()
    if not sequences:
        raise WrongFileTypeError()
    if want != 'init':
        raise FormatSyntaxError("Could not find end of sequence '%s'"
            % sequences[-1].name)
    return sequences, {}, {}
def _readSequences(self, f):
    """Build self.sequenceList from the header section of *f*.

    Lines are consumed from *f* up to and including the '//' separator
    line.  Every line matched by MSF._Sum contributes one Sequence whose
    ``attrs`` dictionary holds the matched 'MSF length', 'MSF check' and
    'MSF weight' groups.

    Raises FormatSyntaxError if the file ends before the separator, or
    if the separator is reached without any sequence having been found.
    """
    self.sequenceList = []
    while True:
        line = f.readline()
        if not line:
            # end of file before the '//' line
            raise FormatSyntaxError('no alignment separator')
        if line in ('//\n', '//\r\n'):
            break
        match = MSF._Sum.match(line)
        if match is None:
            continue
        name, length, check, weight = match.group(1, 2, 3, 4)
        seq = Sequence(makeReadable(name))
        self.sequenceList.append(seq)
        seq.attrs = {}
        seq.attrs['MSF length'] = length
        seq.attrs['MSF check'] = check
        seq.attrs['MSF weight'] = weight
    if not self.sequenceList:
        raise FormatSyntaxError('No sequences found in header')
def _readSequences(self, f):
    """Build self.sequenceList from the header section of *f*.

    Lines are consumed from *f* up to and including the '//' separator
    line.  Every line matched by MSF._Sum contributes one Sequence whose
    ``attrs`` dictionary holds the matched 'MSF length', 'MSF check' and
    'MSF weight' groups.

    Raises FormatSyntaxError if the file ends before the separator, or
    if the separator is reached without any sequence having been found.
    """
    self.sequenceList = []
    while True:
        line = f.readline()
        if not line:
            # end of file before the '//' line
            raise FormatSyntaxError(
                'no alignment separator')
        if line in ('//\n', '//\r\n'):
            break
        match = MSF._Sum.match(line)
        if match is None:
            continue
        name, length, check, weight = match.group(1, 2, 3, 4)
        seq = Sequence(makeReadable(name))
        self.sequenceList.append(seq)
        seq.attrs = {}
        seq.attrs['MSF length'] = length
        seq.attrs['MSF check'] = check
        seq.attrs['MSF weight'] = weight
    if not self.sequenceList:
        raise FormatSyntaxError('No sequences found in header')
def parse(fileName):
    """Read the alignment contained in an HSSP file.

    The first line must start with "hssp" (case-insensitive).  Lines
    starting with "##" switch sections; only the PROTEINS and ALIGNMENTS
    sections are interpreted:

    * PROTEINS: each line starting with a digit defines a sequence whose
      name is the third whitespace-separated field.
    * ALIGNMENTS: the "## ALIGNMENTS begin - end" line gives the (1-based,
      inclusive) range of sequences the section covers.  A "SeqNo" line
      marks, with its first '.', the column where alignment characters
      start.  On every other line, the characters from that column on are
      appended one each to sequences begin..end; short lines are padded
      with spaces.

    Returns a 3-tuple ``(sequences, {}, {})``; the two dictionaries are
    always empty for this format.

    Raises WrongFileTypeError if the first line is not an HSSP header, and
    FormatSyntaxError for malformed PROTEINS / ALIGNMENTS content.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    doing = None
    sequences = []
    headerOK = False
    lineNum = 0
    alignStartIndex = None
    # try/finally so the file is closed even when an error is raised
    try:
        for line in f:
            if doing == 'alignments':
                # don't strip() alignment section since it has significant
                # leading spaces
                line = line.rstrip()
            else:
                line = line.strip()
            lineNum += 1
            if not headerOK:
                if line.lower().startswith('hssp'):
                    headerOK = True
                    continue
                raise WrongFileTypeError("No initial HSSP header line")
            if line.startswith('##'):
                if doing == 'proteins' and not sequences:
                    raise FormatSyntaxError(
                        "No entries in PROTEINS section")
                try:
                    doing = line.split()[1].lower()
                except IndexError:
                    doing = None
                if doing == 'alignments':
                    try:
                        hashes, alignments, begin, dash, end = \
                            line.strip().split()
                        begin = int(begin)
                        end = int(end)
                    except ValueError:
                        raise FormatSyntaxError(
                            "ALIGNMENTS line (line #%d) not of "
                            "the form: ## ALIGNMENTS (number) - (number)"
                            % lineNum)
                continue
            if doing == 'proteins':
                # blank lines would otherwise fail on line[0]
                if not line or not line[0].isdigit():
                    continue
                try:
                    seqName = line.split()[2]
                except IndexError:
                    # the header has been seen, so this is a syntax error
                    # in an HSSP file; use the same exception class as the
                    # other syntax errors raised here
                    raise FormatSyntaxError(
                        "Line %d in PROTEINS section does not "
                        "start with [integer] : [sequence name]"
                        % lineNum)
                sequences.append(Sequence(makeReadable(seqName)))
            elif doing == 'alignments':
                if line.lstrip().lower().startswith('seqno'):
                    try:
                        alignStartIndex = line.index('.')
                    except ValueError:
                        raise FormatSyntaxError(
                            "No indication of alignment "
                            " starting column ('.' character) in SeqNo line "
                            " in ALIGNMENTS section")
                    continue
                if alignStartIndex is None:
                    raise FormatSyntaxError("No initial SeqNo line in "
                        "ALIGNMENTS section")
                block = line[alignStartIndex:]
                if not block:
                    raise FormatSyntaxError(
                        "No alignment block given on line %d" % lineNum)
                blockLen = end - begin + 1
                if len(block) > blockLen:
                    raise FormatSyntaxError(
                        "Too many characters (%d, only %d "
                        " sequences) in alignment block given on line %d"
                        % (len(block), blockLen, lineNum))
                # trailing blanks were removed by rstrip(); restore them so
                # every covered sequence receives a character
                block = block + ' ' * (blockLen - len(block))
                for seq, c in zip(sequences[begin - 1:end], block):
                    seq.append(c)
    finally:
        f.close()
    return sequences, {}, {}
def parse(fileName):
    """Read a GCG RSF ("rich sequence format") file.

    The first line must start with "!!RICH_SEQUENCE".  Lines up to the
    ".." line are collected as file comments.  After that, each sequence
    is a "{ ... }" record: attribute lines (indented lines continue the
    previous attribute), optional "feature" lines, a "sequence" line, and
    then the sequence data up to the closing "}".

    Attribute names are stored in the sequence's ``attrs`` dictionary
    prefixed with "RSF " and with underscores replaced by spaces.  The
    "name" attribute becomes the sequence name, "descrip" is stored under
    "description", and "offset" is consumed to left-pad the sequence with
    '.' characters.  Sequences are right-padded with '.' so that all have
    the length of the longest one.

    Returns a 3-tuple ``(sequences, fileAttrs, {})`` where fileAttrs may
    hold a "comments" entry; the last dictionary is always empty.

    Raises WrongFileTypeError if the first line is not the RSF marker, and
    FormatSyntaxError for structural problems in the file.
    """
    IN_HEADER = 0
    START_ATTRS = 1
    IN_ATTRS = 2
    IN_FEATURES = 3
    IN_SEQ = 4
    state = IN_HEADER

    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    lineNum = 0
    hasOffset = 0
    longest = None
    fileAttrs = {}
    # try/finally so the file is closed even when an error is raised
    try:
        for line in f:
            line = line.rstrip()  # remove trailing whitespace/newline
            lineNum += 1
            if lineNum == 1:
                if line.startswith("!!RICH_SEQUENCE"):
                    continue
                raise WrongFileTypeError()
            if state == IN_HEADER:
                if line.strip() == "..":
                    state = START_ATTRS
                    continue
                if "comments" in fileAttrs:
                    fileAttrs["comments"] += "\n" + line
                else:
                    fileAttrs["comments"] = line
                continue
            if not line.strip():
                continue
            if state == START_ATTRS:
                if line.strip() == "{":
                    state = IN_ATTRS
                    curAttr = None
                    attrs = {}
                elif line:
                    raise FormatSyntaxError("Unexpected text before"
                        " start of sequence on line %d" % lineNum)
                continue
            if state == IN_ATTRS or state == IN_FEATURES:
                if line.strip() == "sequence" and line[0] == "s":
                    if "RSF name" not in attrs:
                        raise FormatSyntaxError("sequence on "
                            "line %d has no name" % lineNum)
                    state = IN_SEQ
                    seq = Sequence(makeReadable(attrs["RSF name"]))
                    del attrs["RSF name"]
                    seq.attrs = attrs
                    if "RSF descrip" in attrs:
                        attrs["description"] = attrs["RSF descrip"]
                        del attrs["RSF descrip"]
                    sequences.append(seq)
                    if "RSF offset" in attrs:
                        seq.extend("." * int(attrs["RSF offset"]))
                        hasOffset = 1
                        del attrs["RSF offset"]
                    continue
                if line.startswith("feature"):
                    if state == IN_ATTRS:
                        attrs["RSF features"] = [[line[8:]]]
                    else:
                        attrs["RSF features"].append([line[8:]])
                    state = IN_FEATURES
                    continue
            if state == IN_ATTRS:
                if line[0].isspace():
                    # continuation
                    if not curAttr:
                        raise FormatSyntaxError("Bogus "
                            "indentation at line %d" % lineNum)
                    if attrs[curAttr]:
                        attrs[curAttr] += "\n" + line
                    else:
                        attrs[curAttr] = line
                    continue
                if " " in line.strip():
                    curAttr, val = line.split(None, 1)
                    # str.replace returns a new string; keep the result so
                    # that names are normalized the same way as in the
                    # value-less branch below
                    curAttr = "RSF " + curAttr.replace("_", " ")
                    attrs[curAttr] = val.strip()
                else:
                    curAttr = "RSF " + line.strip().replace("_", " ")
                    attrs[curAttr] = ""
                continue
            if state == IN_FEATURES:
                attrs["RSF features"][-1].append(line)
                continue
            # state is IN_SEQ from here on
            if line.strip() == "}":
                state = START_ATTRS
                if not longest:
                    longest = len(seq)
                elif len(seq) < longest:
                    seq.extend("." * (longest - len(seq)))
                elif len(seq) > longest:
                    # new longest sequence: pad all earlier ones to match
                    longest = len(seq)
                    for s in sequences[:-1]:
                        s.extend("." * (longest - len(s)))
                continue
            seq.extend(line.strip())
            if not seq[0].isalpha():
                hasOffset = 1
    finally:
        f.close()
    if state == IN_HEADER:
        raise FormatSyntaxError(
            "No end to header (i.e. '..' line) found")
    if state == IN_ATTRS or state == IN_FEATURES:
        if "RSF name" in attrs:
            raise FormatSyntaxError(
                "No sequence data found for sequence %s"
                % attrs["RSF name"])
        raise FormatSyntaxError("Sequence without sequence data")
    if state == IN_SEQ:
        # "RSF name" was removed from attrs when the sequence was created,
        # so report the name carried by the sequence itself
        raise FormatSyntaxError("No terminating brace for sequence %s"
            % seq.name)
    if not sequences:
        raise FormatSyntaxError("No sequences found")
    if not hasOffset:
        from chimera import replyobj
        replyobj.warning("No offset fields in RSF file;"
            " assuming zero offset\n")
    return sequences, fileAttrs, {}
def parse(fileName):
    """Read a Stockholm-format alignment file.

    The first line must start with "# STOCKHOLM".  "#=GF" and "#=GS" lines
    become file and per-sequence attributes, "#=GC" and "#=GR" lines become
    per-column and per-residue markups, other "#" lines are collected as
    file comments, and "//" lines are skipped.  Every remaining non-empty
    line is "name contents"; contents for the same name are concatenated.

    Annotations given for a name that is not a sequence name are merged
    into the single sequence called "name/<suffix>" (suffix containing no
    further '/'), if there is one.

    Returns a 3-tuple ``(sequences, fileAttrs, fileMarkups)`` with the
    sequences in order of first appearance.

    Raises WrongFileTypeError if the first line is not the Stockholm
    marker, and FormatSyntaxError for malformed lines, missing sequences,
    annotations for unknown sequences, or column markup of wrong length.
    """
    from OpenSave import osOpen
    from chimera import replyobj

    def trySplit(markup, numSplit, markupType, lineNum):
        # split markup into numSplit+1 fields, allowing an empty last one
        fields = markup.split(None, numSplit)
        if len(fields) == numSplit:
            # value is empty
            fields.append("")
        if len(fields) != numSplit + 1:
            raise FormatSyntaxError("Not enough"
                " arguments after #=%s markup"
                " on line %d" % (markupType, lineNum))
        return fields

    f = osOpen(fileName, "r")
    lineNum = 0
    fileAttrs = {}
    fileMarkups = {}
    seqAttrs = {}
    seqMarkups = {}
    sequences = {}
    seqSequence = []  # sequence names in order of first appearance
    # try/finally so the file is closed even when an error is raised
    try:
        for line in f:
            # drop only the line terminator; slicing off the last
            # character would eat data when the final line has no newline
            line = line.rstrip("\r\n")
            lineNum += 1
            if lineNum == 1:
                if line.startswith("# STOCKHOLM"):
                    continue
                raise WrongFileTypeError()
            if not line:
                continue
            if line.startswith('#='):
                markupType = line[2:4]
                markup = line[5:].strip()
                if markupType == "GF":
                    tag, val = trySplit(markup, 1, markupType, lineNum)
                    tag = tag.replace("_", " ")
                    tag = genericFileAttrs.get(tag, "Stockholm " + tag)
                    if tag in fileAttrs:
                        fileAttrs[tag] += '\n' + val
                    else:
                        fileAttrs[tag] = val
                elif markupType == "GS":
                    seqName, tag, val = trySplit(markup, 2, markupType,
                        lineNum)
                    tag = tag.replace("_", " ")
                    attrs = seqAttrs.setdefault(seqName, {})
                    tag = genericSeqAttrs.get(tag, "Stockholm " + tag)
                    if tag in attrs:
                        attrs[tag] += '\n' + val
                    else:
                        attrs[tag] = val
                elif markupType == "GC":
                    tag, val = trySplit(markup, 1, markupType, lineNum)
                    tag = tag.replace("_", " ")
                    fileMarkups[tag] = fileMarkups.get(tag, "") + val
                elif markupType == "GR":
                    seqName, tag, val = trySplit(markup, 2, markupType,
                        lineNum)
                    tag = tag.replace("_", " ")
                    seqMarkups.setdefault(seqName, {}).setdefault(tag, "")
                    seqMarkups[seqName][tag] += val
                # ignore other types
                continue
            elif line.startswith('#'):
                # unstructured comment
                if 'comments' in fileAttrs:
                    fileAttrs['comments'] += "\n" + line[1:]
                else:
                    fileAttrs['comments'] = line[1:]
                continue
            elif line.strip() == "//":
                # end of sequence alignment blocks, but comments
                # may follow this, so keep going...
                continue
            # sequence info...
            try:
                seqName, block = line.split(None, 1)
            except ValueError:
                raise FormatSyntaxError("Sequence info not in name/"
                    "contents format on line %d" % lineNum)
            if seqName not in sequences:
                sequences[seqName] = Sequence(makeReadable(seqName))
                seqSequence.append(seqName)
            sequences[seqName].extend(block)
    finally:
        f.close()

    if not sequences:
        raise FormatSyntaxError("No sequences found")

    # Fold annotations keyed by a short name into the matching full name
    # first, so that the merged result is what gets attached (and length
    # checked) below.
    for seqInfo, label in [(seqAttrs, "sequence"), (seqMarkups, "residue")]:
        # iterate over a snapshot: entries are added/removed in the loop
        for seqName in list(seqInfo.keys()):
            if seqName in sequences:
                continue
            # might be sequence name without trailing '/start-end'
            for fullName in sequences.keys():
                if fullName.startswith(seqName) \
                and fullName[len(seqName)] == '/' \
                and '/' not in fullName[len(seqName)+1:]:
                    break
            else:
                raise FormatSyntaxError(
                    "%s annotations "
                    "provided for non-existent sequence %s"
                    % (label.capitalize(), seqName))
            replyobj.info("Updating %s %s annotions with %s "
                "annotations\n" % (fullName, label, seqName))
            # the full name may have no annotations of its own yet
            seqInfo.setdefault(fullName, {}).update(seqInfo[seqName])
            del seqInfo[seqName]

    for seqName, seq in sequences.items():
        if seqName in seqAttrs:
            seq.attrs = seqAttrs[seqName]
        if seqName in seqMarkups:
            seq.markups = seqMarkups[seqName]
            # snapshot the items: bad markups are deleted while looping
            for tag, markup in list(seq.markups.items()):
                if len(markup) != len(seq):
                    replyobj.warning("Markup %s for"
                        " sequence %s is wrong length;"
                        " ignoring\n" % (tag, seqName))
                    del seq.markups[tag]

    for tag, markup in fileMarkups.items():
        if len(markup) != len(sequences[seqSequence[0]]):
            raise FormatSyntaxError("Column annotation %s is"
                " wrong length" % tag)
    return [sequences[name] for name in seqSequence], \
        fileAttrs, fileMarkups