Example #1
 def reconcile(self, sents):
     """
     Update this document with the newly annotated tokens.
     """
     # TIMEXes need unique IDs
     all_ts = set()
     for sent in sents:
         for (tok, pos, ts) in sent:
             for t in ts:
                 all_ts.add(t)
     add_timex_ids(all_ts)
     self._sents = copy.deepcopy(sents)
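From the loop above, each sentence is a list of (token, POS tag, set-of-Timex) triples. A minimal usage sketch, assuming a hypothetical document instance `doc` (not part of the example above):

     # Hypothetical input: one sentence, with a Timex on the last token
     sents = [[("John", "NNP", set()),
               ("arrived", "VBD", set()),
               ("yesterday", "NN", set([Timex()]))]]
     doc.reconcile(sents)  # 'doc' is assumed to be an instance of this class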
Example #2
 def test_assign_IDs(self):
     # Get some sample IDs
     ts = set([Timex(), Timex(), Timex()])
     add_timex_ids(ts)
     
     # Get the assigned IDs
     tids = set()
     for t in ts:
         tids.add(t.id)
     
     # Should be exactly 3 unique IDs
     self.assertEqual(len(tids), 3)
     
     # Should be consecutive
     self.assertTrue(1 in tids)
     self.assertTrue(2 in tids)
     self.assertTrue(3 in tids)
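The same behaviour outside a test harness: per the assertions above, add_timex_ids assigns consecutive integer IDs starting at 1. A minimal sketch:

     ts = set([Timex(), Timex(), Timex()])
     add_timex_ids(ts)
     print(sorted(t.id for t in ts))  # expected: [1, 2, 3]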
Example #3
    def get_extents(self):
        """
        Return a string in the format suitable for timex-extents.tab
        """

        # TIMEXes need unique IDs
        all_ts = set()
        for sent in self._sents:
            for (tok, pos, ts) in sent:
                for t in ts:
                    all_ts.add(t)
        add_timex_ids(all_ts)

        s = ""
        for i in range(len(self._sents)):
            for j in range(len(self._sents[i])):
                for timex in self._sents[i][j][2]:
                    s += self._get_timex_line(i, j, timex) + "\n"

        return s
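Since get_extents returns the tab-separated string rather than printing it, the caller writes it out. A sketch, again assuming a hypothetical document instance `doc`:

     # Write the extents in timex-extents.tab format to disk
     with open("timex-extents.tab", "w") as f:
         f.write(doc.get_extents())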
Example #4
 def test_assign_IDs_consecutive(self):
     # Get some sample IDs
     ts = set([Timex(), Timex(), Timex()])
     at = Timex()
     at.id = 2
     ts.add(at)
     add_timex_ids(ts)
     
     # Get the assigned IDs
     tids = set()
     for t in ts:
         tids.add(t.id)
     
     # Should be exactly 4 unique IDs, and the pre-assigned one is unchanged
     self.assertEqual(len(tids), 4)
     self.assertEqual(2, at.id)
     
     # Should be consecutive for new ones
     self.assertTrue(1 in tids)
     self.assertTrue(2 in tids)
     self.assertTrue(3 in tids)
     self.assertTrue(4 in tids)
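The equivalent outside a test harness: pre-assigned IDs are preserved, and new timexes are numbered around them, per the assertions above. A sketch:

     at = Timex()
     at.id = 2                    # pre-assign an ID
     ts = set([Timex(), Timex(), Timex(), at])
     add_timex_ids(ts)
     print(sorted(t.id for t in ts))  # expected: [1, 2, 3, 4], with at.id still 2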
Example #5
    def reconcile(self, sents, add_S=False, add_LEX=False, pos_attr=False):
        """
        Reconciles this document against the new internal representation. If
        add_S is set to anything other than False, sentence boundaries are
        marked with tags whose name is the value of add_S. add_LEX does the
        same for token boundaries, and pos_attr is the name of the attribute
        which holds the POS tag for each token. This is mainly useful for
        transforming TERN documents into something that GUTime can parse.

        If your document already contains S and LEX tags, and add_S/add_LEX is
        set to add them, the old S/LEX tags will be stripped first. If pos_attr
        is set and differs from the old POS attribute name on the lex tag, the
        old attribute will be removed.

        Sentence/token boundaries will not be altered in the final document
        unless add_S/add_LEX is set. If you have changed the token boundaries
        in the internal representation from the original form, but are not
        adding them back in, reconciliation may give undefined results.

        Some inputs cannot yield valid XML. For example, if this document has
        elements which span parts of multiple sentences without covering them
        wholly, the sentence tags cannot be added while keeping the XML
        well-formed, and failure will occur in unexpected ways.

        If you are adding LEX tags, and your XML document contains tags
        internal to tokens, then reconciliation will fail, as it expects each
        token to be a contiguous piece of text.
        """

        # First, add S tags if need be.
        if add_S:
            # First, strip any old ones
            if self._has_S:
                self._strip_tags(self._xml_doc, self._has_S, self._xml_body)

            # Then add the new ones
            leftover = self._add_S_tags(self._xml_body, sents, add_S)
            if len(leftover) > 1:
                raise NestingError(
                    'Unable to add all S tags, possibly due to bad tag nesting: '
                    + str(leftover))

            # Update what we consider to be our S tags
            self._has_S = add_S

        # Now, get a list of the S nodes, which are used to reconcile individual
        # tokens
        if self._has_S:
            s_nodes = self._xml_body.getElementsByTagName(self._has_S)
        else:
            # There are no S tags in the text, so going forward, treat the
            # text as a single sentence belonging to the root node
            s_nodes = [self._xml_body]
            new_sent = []
            for sent in sents:
                for part in sent:
                    new_sent.append(part)
            sents = [new_sent]

        # Now, add LEX tags if need be
        if add_LEX:
            # First, strip any old ones
            if self._has_LEX:
                self._strip_tags(self._xml_doc, self._has_LEX, self._xml_body)

            # Now add those LEX tokens
            for i in range(len(sents)):
                self._add_LEX_tags(s_nodes[i], sents[i], add_LEX)

            # Update what we consider to be our LEX tags
            self._has_LEX = add_LEX

        # Now, add the POS attribute
        if pos_attr and self._has_LEX:
            # Get each LEX tag and add the attribute
            for i in range(len(sents)):
                lex_tags = s_nodes[i].getElementsByTagName(self._has_LEX)
                for j in range(len(sents[i])):
                    # Strip the existing attribute if need be
                    try:
                        lex_tags[j].removeAttribute(self._pos_attr)
                    except xml.dom.NotFoundErr:
                        pass

                    # Now set the new POS attr
                    lex_tags[j].setAttribute(pos_attr, sents[i][j][1])

            # Update what we think is the pos attr
            self._pos_attr = pos_attr

        # Strip old TIMEXes to avoid duplicates
        self.strip_timexes()

        # For XML documents, TIMEXes need unique IDs
        all_ts = set()
        for sent in sents:
            for (tok, pos, ts) in sent:
                for t in ts:
                    all_ts.add(t)
        add_timex_ids(all_ts)

        # Now iterate over each sentence
        for i in range(len(sents)):
            # Get all timexes in this sentence
            timexes = set()
            for (word, pos, ts) in sents[i]:
                for t in ts:
                    timexes.add(t)

            # Now, for each timex, add it to the sentence
            for timex in timexes:
                try:
                    self._add_timex(timex, sents[i], s_nodes[i])
                except NestingError:
                    LOGGER.exception("Error whilst attempting to add TIMEX")
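The docstring above points at the main use case: preparing a TERN document for GUTime by adding sentence and token markup. A hedged sketch, where the tag and attribute names ('s', 'lex', 'pos') are illustrative assumptions, not values mandated by the library:

     # Hypothetical: wrap sentences in <s>, tokens in <lex pos="...">
     doc.reconcile(sents, add_S="s", add_LEX="lex", pos_attr="pos")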