Esempio n. 1
0
    def from_nersuite(cls, line):
        """Build a Token from a single line in NERsuite format.

        The line holds tab-separated fields: tag, start offset, end
        offset and token text, optionally followed by further fields.
        Any further fields are handed to the constructor as a list.

        Raises FormatError when fewer than four fields are present, when
        the offsets are not integers, or when end - start does not equal
        the length of the token text.
        """

        # Error messages report the line without its trailing newline(s).
        stripped = line.rstrip('\n')
        columns = stripped.split('\t')

        if len(columns) < 4:
            raise FormatError(
                'NERsuite format: too few fields ("%s")' % stripped)
        label, begin, finish, surface = columns[:4]

        try:
            begin = int(begin)
            finish = int(finish)
        except ValueError:
            raise FormatError(
                'NERsuite format: non-int start/end ("%s")' % stripped)

        if len(surface) != finish - begin:
            raise FormatError(
                'NERsuite format: length mismatch ("%s")' % stripped)

        extra = columns[4:]
        return cls(surface, begin, label, extra)
Esempio n. 2
0
def read_documents(flo, label, config=defaults):
    """Load documents from file-like object, return list of Document objects.

    Each line of flo is stripped of surrounding whitespace, tokenized
    with tokenize(line, config) and turned into one document via
    make_document(token_texts, label).

    Raises FormatError if any line is empty after stripping. The message
    gives the 1-based line number and the name of flo, or '<unknown>'
    when flo has no name attribute.
    """
    documents = []
    for ln, line in enumerate(flo, start=1):
        line = line.strip()
        if not line:
            # Not every file-like object has a .name (e.g. io.StringIO);
            # fall back to a placeholder so the intended FormatError is
            # raised rather than an AttributeError.
            source = getattr(flo, 'name', '<unknown>')
            raise FormatError('empty line {} in {}'.format(ln, source))
        token_texts = tokenize(line, config)
        documents.append(make_document(token_texts, label))
    return documents
Esempio n. 3
0
 def from_str(cls, string, discont_rule=None):
     """Create an annotation from its standoff string representation.

     The string is split on tabs (at most twice) into an ID, a
     "TYPE OFFSETS" field and the annotation text. The type is separated
     from the offsets at the first space, and the offsets are parsed
     with cls._parse_offsets().

     If parsing does not give exactly one (start, end) span, the spans
     and text are passed to cls._resolve_discontinuous() together with
     discont_rule, and the first span of the result is used. In that
     case skip_validation is set on the returned annotation unless
     discont_rule equals LAST_SPAN; otherwise skip_validation is False.

     Raises FormatError if a ValueError occurs anywhere during parsing
     or construction.
     """
     try:
         id_, type_offsets, text = string.split('\t', 2)
         type_, offsets = type_offsets.split(' ', 1)
         offsets = cls._parse_offsets(offsets)
         was_discontinuous = False
         if len(offsets) != 1:
             offsets, text = cls._resolve_discontinuous(offsets, text,
                                                        discont_rule)
             was_discontinuous = True
         start, end = offsets[0]
         ann = cls(id_, type_, start, end, text)
         ann.skip_validation = (was_discontinuous and
                                discont_rule != LAST_SPAN)
         return ann
     except ValueError:
         # "except ValueError, e" is Python-2-only syntax and the bound
         # exception was never used; this form parses on 2 and 3 alike.
         raise FormatError('Standoff: failed to parse %s' % string)
Esempio n. 4
0
def verify_textbounds(textbounds, text):
    """Verify that given textbounds are valid with reference to given text.

    Textbounds whose skip_validation flag is set are not checked:
    their text attribute is overwritten with text[t.start:t.end] and a
    notice is printed to stderr. All other textbounds are checked by
    asserting t.is_valid(text).

    Return True on success, raise FormatError on any issue.

    NOTE(review): no explicit return statement is visible in this part
    of the function -- confirm that True is actually returned.
    """

    for t in textbounds:
        if t.skip_validation:
            # TODO fix: hack around the constraint that discontinuous
            # annotations don't have access to the full text
            print >> sys.stderr, 'Resolve discontinuous "%s" to full span "%s"' \
            % (t.text, text[t.start:t.end])
            # Replace the annotation's own text with the full span
            # taken from the reference text.
            t.text = text[t.start:t.end]
        else:
            try:
                # NOTE: assert statements are removed when Python runs
                # with -O, so this check is skipped in optimized mode.
                assert t.is_valid(text)
            except Exception, e:
                # Any exception (a failed assert or one raised inside
                # is_valid) is reported as a FormatError; the message is
                # built as unicode and encoded to UTF-8 before raising.
                s = u'Error verifying textbound %s: %s' % (t, e)
                raise FormatError(s.encode('utf-8'))