def read_block(self, stream):
    """Read and parse the next ``<instance>`` element from *stream*.

    Lines are consumed one at a time with ``stream.readline()``.  Any
    ``<lexelt ...>`` line seen on the way updates the current lexical
    element; the first time a given lexelt is encountered its name and
    the stream offset just after its opening line are appended to
    ``self._lexelts`` and ``self._lexelt_starts``.

    :param stream: a seekable, line-oriented stream supporting
        ``readline()`` and ``tell()``.
    :return: a one-element list holding the result of
        ``self._parse_instance(inst, lexelt)`` for the next instance, or
        an empty list when the end of the stream is reached.
    :raises AssertionError: if the stream ends inside an instance, a
        ``<lexelt>`` line has no ``item=...`` attribute, a lexelt does
        not match the one previously recorded at that index, or an
        ``<instance>`` opens while another is still open.  These are
        raised explicitly (rather than via ``assert``) so the checks are
        still performed under ``python -O``.
    """
    # Decide which lexical element we're in: the last recorded lexelt
    # whose start offset is <= the current stream position.
    lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
    lexelt = self._lexelts[lexelt_num]

    instance_lines = []
    in_instance = False
    while True:
        line = stream.readline()
        if line == '':
            # End of stream: only legal between instances.
            if instance_lines:
                raise AssertionError(
                    'end of stream reached inside an <instance> element'
                )
            return []

        stripped = line.lstrip()

        # Start of a lexical element?
        if stripped.startswith('<lexelt'):
            lexelt_num += 1
            m = re.search(r"""item=("[^"]+"|'[^']+')""", line)
            if m is None:
                raise AssertionError("<lexelt> has no 'item=...'")
            # Drop the surrounding quote characters.
            lexelt = m.group(1)[1:-1]
            if lexelt_num < len(self._lexelts):
                # Already indexed on an earlier pass: must agree.
                if lexelt != self._lexelts[lexelt_num]:
                    raise AssertionError(
                        'lexelt %r does not match previously recorded %r'
                        % (lexelt, self._lexelts[lexelt_num])
                    )
            else:
                self._lexelts.append(lexelt)
                self._lexelt_starts.append(stream.tell())

        # Start of an instance?
        if stripped.startswith('<instance'):
            if instance_lines:
                raise AssertionError(
                    '<instance> opened while a previous instance is '
                    'still open'
                )
            in_instance = True

        # Body of an instance?
        if in_instance:
            instance_lines.append(line)

        # End of an instance?
        if stripped.startswith('</instance'):
            # NOTE: readline() keeps each line's trailing newline, so this
            # join puts an extra newline between lines.  Kept unchanged so
            # the text handed to _fixXML is exactly what it was before.
            xml_block = '\n'.join(instance_lines)
            xml_block = _fixXML(xml_block)
            inst = ElementTree.fromstring(xml_block)
            return [self._parse_instance(inst, lexelt)]
def read_block(self, stream):
    """Read and parse the next ``<instance>`` element from *stream*.

    Lines are consumed one at a time with ``stream.readline()``.  Any
    ``<lexelt ...>`` line seen on the way updates the current lexical
    element; the first time a given lexelt is encountered its name and
    the stream offset just after its opening line are appended to
    ``self._lexelts`` and ``self._lexelt_starts``.

    :param stream: a seekable, line-oriented stream supporting
        ``readline()`` and ``tell()``.
    :return: a one-element list holding the result of
        ``self._parse_instance(inst, lexelt)`` for the next instance, or
        an empty list when the end of the stream is reached.
    :raises AssertionError: if the stream ends inside an instance, a
        ``<lexelt>`` line has no ``item=...`` attribute, a lexelt does
        not match the one previously recorded at that index, or an
        ``<instance>`` opens while another is still open.  These are
        raised explicitly (rather than via ``assert``) so the checks are
        still performed under ``python -O``.
    """
    # Decide which lexical element we're in: the last recorded lexelt
    # whose start offset is <= the current stream position.
    lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
    lexelt = self._lexelts[lexelt_num]

    instance_lines = []
    in_instance = False
    while True:
        line = stream.readline()
        if line == '':
            # End of stream: only legal between instances.
            if instance_lines:
                raise AssertionError(
                    'end of stream reached inside an <instance> element'
                )
            return []

        stripped = line.lstrip()

        # Start of a lexical element?
        if stripped.startswith('<lexelt'):
            lexelt_num += 1
            m = re.search(r"""item=("[^"]+"|'[^']+')""", line)
            if m is None:
                raise AssertionError("<lexelt> has no 'item=...'")
            # Drop the surrounding quote characters.
            lexelt = m.group(1)[1:-1]
            if lexelt_num < len(self._lexelts):
                # Already indexed on an earlier pass: must agree.
                if lexelt != self._lexelts[lexelt_num]:
                    raise AssertionError(
                        'lexelt %r does not match previously recorded %r'
                        % (lexelt, self._lexelts[lexelt_num])
                    )
            else:
                self._lexelts.append(lexelt)
                self._lexelt_starts.append(stream.tell())

        # Start of an instance?
        if stripped.startswith('<instance'):
            if instance_lines:
                raise AssertionError(
                    '<instance> opened while a previous instance is '
                    'still open'
                )
            in_instance = True

        # Body of an instance?
        if in_instance:
            instance_lines.append(line)

        # End of an instance?
        if stripped.startswith('</instance'):
            # NOTE: readline() keeps each line's trailing newline, so this
            # join puts an extra newline between lines.  Kept unchanged so
            # the text handed to _fixXML is exactly what it was before.
            xml_block = '\n'.join(instance_lines)
            xml_block = _fixXML(xml_block)
            inst = ElementTree.fromstring(xml_block)
            return [self._parse_instance(inst, lexelt)]
def parse_from_string(self, string):
    """Parse *string* as XML and pass its root element to ``parse_tree``.

    :param string: XML text accepted by ``ElementTree.fromstring``.
    :return: ``None``; whatever ``self.parse_tree`` returns is discarded.
    """
    self.parse_tree(ElementTree.fromstring(string))
def fromstring(s):
    """Parse the XML text *s* and return ``CRFInfo._read`` of its root.

    :param s: XML text accepted by ``ElementTree.fromstring``.
    :return: whatever ``CRFInfo._read`` returns for the parsed root element.
    """
    root = ElementTree.fromstring(s)
    return CRFInfo._read(root)
def fromstring(s):
    """Parse the XML text *s* and return ``CRFInfo._read`` of its root.

    :param s: XML text accepted by ``ElementTree.fromstring``.
    :return: whatever ``CRFInfo._read`` returns for the parsed root element.
    """
    parsed_root = ElementTree.fromstring(s)
    info = CRFInfo._read(parsed_root)
    return info