Ejemplo n.º 1
0
def parse(fileName):
    """Read a CLUSTAL-format alignment file.

    Only blank lines may precede the header, which is the first line that
    starts with "CLUSTAL".  After the header, every line that does not begin
    with whitespace must consist of two or three whitespace-separated fields:
    sequence name, sequence contents and an optional third field (ignored).
    The rows seen before the first blank/indented line define the sequences,
    in order.  Rows of later groups are appended to those sequences purely by
    their position within the group; the names on those rows are not checked.

    Parameters:
        fileName -- passed unchanged to OpenSave.osOpen.

    Returns:
        (sequences, {}, {}) where sequences is the list of Sequence objects
        in file order; the two dictionaries are always empty.

    Raises:
        WrongFileTypeError -- non-blank text precedes the "CLUSTAL" header,
            or no sequence rows were found at all.
        FormatSyntaxError -- a row does not have two or three fields, or a
            later group contains more rows than the first group.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    inHeader = 1
    firstBlock = 1
    expect = 0
    sequences = []
    lineNum = 0
    # try/finally: the file must be closed even when a format error is raised
    # part-way through the loop.
    try:
        for line in f:
            lineNum += 1
            if inHeader:
                if line.startswith("CLUSTAL"):
                    inHeader = 0
                elif line.strip() != "":
                    raise WrongFileTypeError()
                continue
            if not line or line[0].isspace():
                # A blank or indented line separates groups of rows.  Once
                # at least one sequence exists the first group is complete
                # and the next data row belongs to sequence 0 again.
                if sequences:
                    firstBlock = 0
                    expect = 0
                continue
            fields = line.split()
            if len(fields) not in (2, 3):
                raise FormatSyntaxError(
                    "Line %d is not "
                    "sequence name followed by sequence "
                    " contents and optional ungapped length" % lineNum)
            # a third field, when present, is not used
            seqName, seqBlock = fields[0], fields[1]
            if firstBlock:
                sequences.append(Sequence(makeReadable(seqName)))
                sequences[-1].append(seqBlock)
                continue
            try:
                seq = sequences[expect]
            except IndexError:
                raise FormatSyntaxError("Sequence on line %d not in"
                                        " initial sequence block" % lineNum)
            expect += 1
            seq.append(seqBlock)
    finally:
        f.close()
    if not sequences:
        raise WrongFileTypeError()
    return sequences, {}, {}
 def trySplit(numSplit):
     """Split the enclosing scope's `markup` into numSplit + 1 fields.

     At most numSplit whitespace splits are performed.  When exactly
     numSplit fields come back, the trailing value is taken to be empty and
     "" is added.  Any other shortfall raises FormatSyntaxError, reporting
     the enclosing scope's `markupType` and `lineNum`.
     """
     wanted = numSplit + 1
     parts = markup.split(None, numSplit)
     if len(parts) == wanted - 1:
         # everything but the value was supplied: value is empty
         parts.append("")
     if len(parts) == wanted:
         return parts
     raise FormatSyntaxError("Not enough"
                             " arguments after #=%s markup"
                             " on line %d" %
                             (markupType, lineNum))
 def _readSequences(self, f):
     """Read header lines from f up to the '//' separator line.

     self.sequenceList is reset to an empty list.  For every line that
     MSF._Sum matches, a Sequence named after group 1 is appended and groups
     2-4 are stored in its attrs under 'MSF length', 'MSF check' and
     'MSF weight'.  Lines that do not match are skipped.

     Raises FormatSyntaxError if the file ends before the separator line or
     if no matching line was found before it.
     """
     self.sequenceList = []
     terminators = ('//\n', '//\r\n')
     line = f.readline()
     while line not in terminators:
         if not line:
             # readline() returned '' : end of file
             raise FormatSyntaxError('no alignment separator')
         m = MSF._Sum.match(line)
         if m is not None:
             name, length, check, weight = m.group(1, 2, 3, 4)
             s = Sequence(makeReadable(name))
             self.sequenceList.append(s)
             s.attrs = {
                 'MSF length': length,
                 'MSF check': check,
                 'MSF weight': weight,
             }
         line = f.readline()
     if not self.sequenceList:
         raise FormatSyntaxError('No sequences found in header')
 def _readBlock(self, f):
     line = f.readline()
     if not line:
         return 0
     if line == '\n' or line == '\r\n':
         return 1  # ignore empty line
     # check (and skip) any column numbering
     if "".join(line.split()).isdigit():
         line = f.readline()
         if not line:
             raise FormatSyntaxError('unexpected EOF')
     seqIndex = 0
     while 1:
         if line.isspace():
             break
         field = line.split()
         try:
             seq = self.sequenceList[seqIndex]
         except IndexError:
             raise FormatSyntaxError('more sequences'
                                     ' in actual alignment than in header')
         #try:
         #	seq = self.sequenceDict[field[0]]
         #except KeyError:
         #	raise FormatSyntaxError(
         #		'unexpected sequence ' + field[0])
         for block in field[1:]:
             seq.append(block)
         line = f.readline()
         if not line:
             # allow for files that don't end in newline
             if self.sequenceList[-1] == seq \
             and len(seq) == int(seq.attrs['MSF length']):
                 return 0
             raise FormatSyntaxError('unexpected EOF')
         seqIndex += 1
     return 1
Ejemplo n.º 5
0
def parse(fileName):
    """Read a PIR-style sequence file.

    Each entry is expected as: a header line of the form ">XX;name" (XX is
    stored as the two-character type), one description line, then sequence
    lines up to and including a line ending in '*'.  Lines that are not
    headers are skipped while looking for the next entry.

    Returns (sequences, {}, {}); both dictionaries are always empty.

    Raises WrongFileTypeError when no entry header was found and
    FormatSyntaxError when the last entry has no terminating '*'.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    want = 'init'
    sequences = []
    for rawLine in f.readlines():
        line = rawLine.strip()
        if want == 'init':
            # need at least ">XX;" with '>' first and ';' in fourth position
            if len(line) < 4 or line[0] != '>' or line[3] != ';':
                continue
            pirType = line[1:3]
            sequences.append(Sequence(makeReadable(line[4:])))
            entry = sequences[-1]
            # types P1 and F1 are flagged non-nucleic, everything else nucleic
            entry.nucleic = 0 if pirType in ("P1", "F1") else 1
            entry.PIRtype = pirType
            want = 'descript'
        elif want == 'descript':
            # the same text is stored under both attribute names
            entry = sequences[-1]
            entry.descript = line
            entry.PIRdescript = line
            want = 'sequence'
        elif want == 'sequence':
            if not line:
                continue
            if line.endswith('*'):
                # '*' terminates the entry; it is not sequence data
                want = 'init'
                line = line[:-1]
            sequences[-1].extend(
                filter(lambda ch: ch not in string.whitespace, line))
    f.close()
    if not sequences:
        raise WrongFileTypeError()
    if want != 'init':
        raise FormatSyntaxError("Could not find end of sequence '%s'" %
                                sequences[-1].name)
    return sequences, {}, {}
Ejemplo n.º 6
0
def parse(fileName):
    """Read the aligned sequences from an HSSP file.

    The first line must start with "hssp" (any case).  Lines starting with
    "##" switch sections; the section name is the lower-cased second word.
    Only two sections are interpreted:

    * proteins   -- every line starting with a digit contributes one
                    Sequence, named by the line's third whitespace-separated
                    field.
    * alignments -- the section line must read
                    "## ALIGNMENTS (number) - (number)".  A "SeqNo" line
                    fixes the alignment start column at its first '.'.
                    Every other line supplies one character for each of the
                    sequences numbered begin..end (1-based); short lines are
                    padded with spaces.

    Parameters:
        fileName -- passed unchanged to OpenSave.osOpen.

    Returns:
        (sequences, {}, {}); both dictionaries are always empty.

    Raises:
        WrongFileTypeError -- the first line is not an HSSP header.
        FormatSyntaxError -- any structural problem described in the
            individual messages below.
    """
    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    doing = None
    sequences = []
    headerOK = False
    lineNum = 0
    alignStartIndex = None
    # try/finally: close the file even when a format error aborts parsing
    try:
        for line in f:
            if doing == 'alignments':
                # don't strip() alignment section since it has significant
                # leading spaces
                line = line.rstrip()
            else:
                line = line.strip()
            lineNum += 1
            if not headerOK:
                if line.lower().startswith('hssp'):
                    headerOK = True
                    continue
                raise WrongFileTypeError("No initial HSSP header line")
            if line.startswith('##'):
                if doing == 'proteins' and not sequences:
                    raise FormatSyntaxError("No entries in PROTEINS section")
                try:
                    doing = line.split()[1].lower()
                except IndexError:
                    doing = None
                if doing == 'alignments':
                    try:
                        hashes, alignments, begin, dash, end = \
                            line.strip().split()
                        begin = int(begin)
                        end = int(end)
                    except ValueError:
                        raise FormatSyntaxError(
                            "ALIGNMENTS line (line #%d) not of "
                            "the form: ## ALIGNMENTS (number) - (number)" %
                            lineNum)
                continue
            if doing == 'proteins':
                # Blank lines used to crash on line[0]; skip them like any
                # other line that does not start with an entry number.
                if not line or not line[0].isdigit():
                    continue
                try:
                    seqName = line.split()[2]
                except IndexError:
                    # Raised as FormatSyntaxError like every other syntax
                    # problem here; the previously used name
                    # (WrongFormatError) occurs nowhere else in this code.
                    raise FormatSyntaxError(
                        "Line %d in PROTEINS section does not "
                        "start with [integer] : [sequence name]" % lineNum)
                sequences.append(Sequence(makeReadable(seqName)))
            elif doing == 'alignments':
                if line.lstrip().lower().startswith('seqno'):
                    try:
                        alignStartIndex = line.index('.')
                    except ValueError:
                        raise FormatSyntaxError(
                            "No indication of alignment "
                            " starting column ('.' character) in SeqNo line "
                            " in ALIGNMENTS section")
                    continue
                if alignStartIndex is None:
                    raise FormatSyntaxError("No initial SeqNo line in "
                                            "ALIGNMENTS section")
                block = line[alignStartIndex:]
                if not block:
                    raise FormatSyntaxError(
                        "No alignment block given on line %d" % lineNum)
                blockLen = end - begin + 1
                if len(block) > blockLen:
                    raise FormatSyntaxError(
                        "Too many characters (%d, only %d "
                        " sequences) in alignment block given on line %d" %
                        (len(block), blockLen, lineNum))
                # trailing blanks were removed by rstrip(); restore them so
                # every sequence in the range receives a character
                block = block + ' ' * (blockLen - len(block))
                for seq, c in zip(sequences[begin - 1:end], block):
                    seq.append(c)
    finally:
        f.close()
    return sequences, {}, {}
def parse(fileName):
    """Read an RSF ("!!RICH_SEQUENCE") file.

    Layout understood by this parser:

    * line 1 must start with "!!RICH_SEQUENCE";
    * following lines up to a line containing only ".." are collected as
      the file-level "comments" attribute;
    * each sequence is enclosed in "{" ... "}".  Inside, attribute lines
      ("key value", with indented continuation lines) come first,
      optionally followed by "feature" entries, then a line reading
      "sequence" followed by the sequence data.

    Attribute keys are stored as "RSF <key>" with underscores replaced by
    spaces.  "RSF name" becomes the Sequence name, "RSF descrip" is stored
    as "description", and "RSF offset" is turned into that many leading '.'
    characters.  Shorter sequences are padded at the end with '.' so that
    all sequences have the same length.

    Parameters:
        fileName -- passed unchanged to OpenSave.osOpen.

    Returns:
        (sequences, fileAttrs, {}); the last dictionary is always empty.

    Raises:
        WrongFileTypeError -- the first line is not the RSF signature.
        FormatSyntaxError -- structural problems (see the messages below).
    """
    IN_HEADER = 0
    START_ATTRS = 1
    IN_ATTRS = 2
    IN_FEATURES = 3
    IN_SEQ = 4

    state = IN_HEADER

    from OpenSave import osOpen
    f = osOpen(fileName, "r")
    sequences = []
    lineNum = 0
    hasOffset = 0
    longest = None
    fileAttrs = {}
    # try/finally: close the file even when a format error aborts parsing
    try:
        for line in f:
            line = line.rstrip()  # remove trailing whitespace/newline
            lineNum += 1
            if lineNum == 1:
                if line.startswith("!!RICH_SEQUENCE"):
                    continue
                raise WrongFileTypeError()
            if state == IN_HEADER:
                if line.strip() == "..":
                    state = START_ATTRS
                    continue
                if "comments" in fileAttrs:
                    fileAttrs["comments"] += "\n" + line
                else:
                    fileAttrs["comments"] = line
                continue
            if not line.strip():
                continue
            if state == START_ATTRS:
                if line.strip() == "{":
                    state = IN_ATTRS
                    curAttr = None
                    attrs = {}
                else:
                    raise FormatSyntaxError("Unexpected text before"
                                            " start of sequence on line %d"
                                            % lineNum)
                continue
            if state == IN_ATTRS or state == IN_FEATURES:
                # the keyword must start in the first column
                if line.strip() == "sequence" and line[0] == "s":
                    if "RSF name" not in attrs:
                        raise FormatSyntaxError("sequence on "
                                                "line %d has no name"
                                                % lineNum)
                    state = IN_SEQ
                    seq = Sequence(makeReadable(attrs["RSF name"]))
                    del attrs["RSF name"]
                    seq.attrs = attrs
                    if "RSF descrip" in attrs:
                        attrs["description"] = attrs["RSF descrip"]
                        del attrs["RSF descrip"]
                    sequences.append(seq)
                    if "RSF offset" in attrs:
                        seq.extend("." * int(attrs["RSF offset"]))
                        hasOffset = 1
                        del attrs["RSF offset"]
                    continue
                if line.startswith("feature"):
                    # each feature is a list: first the text after
                    # "feature ", then any lines that follow it
                    if state == IN_ATTRS:
                        attrs["RSF features"] = [[line[8:]]]
                    else:
                        attrs["RSF features"].append([line[8:]])
                    state = IN_FEATURES
                    continue

            if state == IN_ATTRS:
                if line[0].isspace():
                    # continuation
                    if not curAttr:
                        raise FormatSyntaxError("Bogus "
                                                "indentation at line %d"
                                                % lineNum)
                    if attrs[curAttr]:
                        attrs[curAttr] += "\n" + line
                    else:
                        attrs[curAttr] = line
                    continue
                if " " in line.strip():
                    curAttr, val = line.split(None, 1)
                    # str.replace returns a new string; its result used to
                    # be thrown away, so keys with a value kept their
                    # underscores while value-less keys (below) did not.
                    curAttr = "RSF " + curAttr.replace("_", " ")
                    attrs[curAttr] = val.strip()
                else:
                    curAttr = "RSF " + line.strip().replace("_", " ")
                    attrs[curAttr] = ""
                continue

            if state == IN_FEATURES:
                attrs["RSF features"][-1].append(line)
                continue
            # only IN_SEQ remains
            if line.strip() == "}":
                state = START_ATTRS
                if not longest:
                    longest = len(seq)
                else:
                    if len(seq) < longest:
                        seq.extend("." * (longest - len(seq)))
                    elif len(seq) > longest:
                        # new maximum: pad every earlier sequence up to it
                        longest = len(seq)
                        for s in sequences[:-1]:
                            s.extend("." * (longest - len(s)))
                continue
            seq.extend(line.strip())
            if not seq[0].isalpha():
                # a sequence starting with a non-letter is treated as
                # already carrying its offset
                hasOffset = 1
    finally:
        f.close()

    if state == IN_HEADER:
        raise FormatSyntaxError(
            "No end to header (i.e. '..' line) found")
    if state == IN_ATTRS or state == IN_FEATURES:
        if "RSF name" in attrs:
            raise FormatSyntaxError(
                "No sequence data found for sequence %s"
                % attrs["RSF name"])
        raise FormatSyntaxError("Sequence without sequence data")

    if state == IN_SEQ:
        # "RSF name" was removed from attrs when the Sequence was created,
        # so the name has to come from the Sequence itself.
        raise FormatSyntaxError("No terminating brace for sequence %s"
                                % seq.name)

    if not sequences:
        raise FormatSyntaxError("No sequences found")
    if not hasOffset:
        from chimera import replyobj
        replyobj.warning("No offset fields in RSF file;"
                         " assuming zero offset\n")
    return sequences, fileAttrs, {}
def parse(fileName):
    """Read a Stockholm-format alignment file.

    The first line must start with "# STOCKHOLM".  Afterwards:

    * "#=GF tag value"      -> file attribute (repeated tags are joined
                               with newlines);
    * "#=GS seq tag value"  -> per-sequence attribute (same joining rule);
    * "#=GC tag value"      -> per-column markup (pieces concatenated);
    * "#=GR seq tag value"  -> per-residue markup (pieces concatenated);
    * other "#=.." lines are ignored, other "#" lines are collected in the
      "comments" file attribute;
    * "//" lines are skipped;
    * anything else must be "name contents" and extends that sequence.

    Underscores in tags are replaced by spaces.  GF/GS tags are mapped
    through genericFileAttrs / genericSeqAttrs and otherwise prefixed with
    "Stockholm ".  Annotations given for a name that matches a sequence
    only after dropping one trailing "/..." component are merged into that
    sequence's annotations.  Per-residue markups whose length differs from
    the sequence length are dropped with a warning.

    Parameters:
        fileName -- passed unchanged to OpenSave.osOpen.

    Returns:
        (sequences, fileAttrs, fileMarkups) with sequences as a list in
        order of first appearance in the file.

    Raises:
        WrongFileTypeError -- the first line is not the Stockholm signature.
        FormatSyntaxError -- malformed markup or sequence line, no
            sequences, annotations for an unknown sequence, or a column
            markup whose length differs from the first sequence's.
    """
    from OpenSave import osOpen
    from chimera import replyobj
    f = osOpen(fileName, "r")
    lineNum = 0
    fileAttrs = {}
    fileMarkups = {}
    seqAttrs = {}
    seqMarkups = {}
    sequences = {}
    seqSequence = []
    # try/finally: close the file even when a format error aborts parsing
    try:
        for line in f:
            # Strip only the line terminator.  Slicing off the last
            # character would eat real data on a final line without a
            # newline (turning a closing "//" into "/") and would leave a
            # '\r' from "\r\n" endings inside sequence data.
            line = line.rstrip("\r\n")
            lineNum += 1
            if lineNum == 1:
                if line.startswith("# STOCKHOLM"):
                    continue
                raise WrongFileTypeError()
            if not line:
                continue
            if line.startswith('#='):
                markupType = line[2:4]
                markup = line[5:].strip()

                def trySplit(numSplit):
                    # split markup into numSplit + 1 fields, allowing the
                    # final (value) field to be missing
                    fields = markup.split(None, numSplit)
                    if len(fields) == numSplit:
                        # value is empty
                        fields.append("")
                    if len(fields) != numSplit + 1:
                        raise FormatSyntaxError("Not enough"
                                                " arguments after #=%s markup"
                                                " on line %d" %
                                                (markupType, lineNum))
                    return fields

                if markupType == "GF":
                    tag, val = trySplit(1)
                    tag = tag.replace("_", " ")
                    tag = genericFileAttrs.get(tag, "Stockholm " + tag)
                    if tag in fileAttrs:
                        fileAttrs[tag] += '\n' + val
                    else:
                        fileAttrs[tag] = val
                elif markupType == "GS":
                    seqName, tag, val = trySplit(2)
                    tag = tag.replace("_", " ")
                    attrs = seqAttrs.setdefault(seqName, {})
                    tag = genericSeqAttrs.get(tag, "Stockholm " + tag)
                    if tag in attrs:
                        attrs[tag] += '\n' + val
                    else:
                        attrs[tag] = val
                elif markupType == "GC":
                    tag, val = trySplit(1)
                    tag = tag.replace("_", " ")
                    fileMarkups[tag] = fileMarkups.get(tag, "") + val
                elif markupType == "GR":
                    seqName, tag, val = trySplit(2)
                    tag = tag.replace("_", " ")
                    seqMarkups.setdefault(seqName, {}).setdefault(tag, "")
                    seqMarkups[seqName][tag] += val
                # ignore other types
                continue
            elif line.startswith('#'):
                # unstructured comment
                if 'comments' in fileAttrs:
                    fileAttrs['comments'] += "\n" + line[1:]
                else:
                    fileAttrs['comments'] = line[1:]
                continue
            elif line.strip() == "//":
                # end of sequence alignment blocks, but comments
                # may follow this, so keep going...
                continue
            # sequence info...
            try:
                seqName, block = line.split(None, 1)
            except ValueError:
                raise FormatSyntaxError("Sequence info not in name/"
                                        "contents format on line %d"
                                        % lineNum)
            if seqName not in sequences:
                sequences[seqName] = Sequence(makeReadable(seqName))
                seqSequence.append(seqName)
            sequences[seqName].extend(block)
    finally:
        f.close()

    if not sequences:
        raise FormatSyntaxError("No sequences found")

    # Step 1: fold annotations given under a short name into the entry for
    # the matching full sequence name.  This runs before the annotations
    # are attached to the sequences so that merged values are attached and
    # length-checked as well.
    for seqInfo, label in [(seqAttrs, "sequence"), (seqMarkups, "residue")]:
        # iterate over a snapshot: entries are added and removed below
        for seqName in list(seqInfo.keys()):
            if seqName in sequences:
                continue
            # might be sequence name without trailing '/start-end'
            for fullName in sequences:
                if fullName.startswith(seqName) \
                and fullName[len(seqName)] == '/' \
                and '/' not in fullName[len(seqName)+1:]:
                    break
            else:
                raise FormatSyntaxError(
                    "%s annotations "
                    "provided for non-existent sequence %s" %
                    (label.capitalize(), seqName))
            replyobj.info("Updating %s %s annotions with %s "
                          "annotations\n" % (fullName, label, seqName))
            # the full name may have no annotations of its own yet
            seqInfo.setdefault(fullName, {}).update(seqInfo[seqName])
            del seqInfo[seqName]

    # Step 2: attach annotations to the sequences.
    for seqName, seq in sequences.items():
        if seqName in seqAttrs:
            seq.attrs = seqAttrs[seqName]
        if seqName in seqMarkups:
            seq.markups = seqMarkups[seqName]
            # snapshot: wrong-length markups are deleted while looping
            for tag, markup in list(seq.markups.items()):
                if len(markup) != len(seq):
                    replyobj.warning("Markup %s for"
                                     " sequence %s is wrong length;"
                                     " ignoring\n" % (tag, seqName))
                    del seq.markups[tag]

    for tag, markup in fileMarkups.items():
        if len(markup) != len(sequences[seqSequence[0]]):
            raise FormatSyntaxError("Column annotation %s is"
                                    " wrong length" % tag)

    # a real list (not a lazy map object) in file order
    return [sequences[name] for name in seqSequence], \
        fileAttrs, fileMarkups
 def _readAlignment(self, f):
     """Consume the alignment section of f.

     One line is read first and discarded; if the file is already at its
     end, FormatSyntaxError is raised.  self._readBlock(f) is then called
     repeatedly until it returns a false value.
     """
     if not f.readline():
         raise FormatSyntaxError('no alignment data')
     moreBlocks = self._readBlock(f)
     while moreBlocks:
         moreBlocks = self._readBlock(f)