def apply(self, timex, cur_context, dct, body, before, after):
    """
    Applies this rule to this timex, where body is the full extent covered
    by this timex, before is the preceding text in the sentence, and after
    is the following text in the sentence, all in
    [(token, POS, timex-set), ...] form.

    Returns a (success, context) pair: a boolean indicating whether or not
    application was successful, and the (possibly updated) time context.
    The timex itself may be modified in place.
    """

    # Check this rule type matches the timex type
    if self._type is not None and timex.type.lower() != self._type.lower():
        return (False, cur_context)

    # Check before, after, body and whole-sentence guards
    if not self._check_guards(self._toks_to_str(before), self._before_guards):
        return (False, cur_context)

    if not self._check_guards(self._toks_to_str(after), self._after_guards):
        return (False, cur_context)

    if not self._check_guards(self._toks_to_str(body), self._guards):
        return (False, cur_context)

    if not self._check_guards(self._toks_to_str(before + body + after), self._sent_guards):
        return (False, cur_context)

    # Now, check if we match. _tokenise is either the literal True (use the
    # standard tokenised form) or a delimiter string to join the raw tokens
    # with, so the explicit comparison against True is deliberate.
    if self._tokenise == True:
        senttext = self._toks_to_str(body)
        if self._deliminate_numbers:
            senttext = self._do_deliminate_numbers(senttext)
    else:
        senttext = self._tokenise.join([tok for (tok, pos, ts) in body])

    match = self._match.search(senttext)

    # If we do, then calculate attributes for the timex. The rule
    # expressions are evaluated in this scope, so they can refer to match,
    # cur_context and dct.
    if match:
        if self._DEBUG:
            timex.comment = self.id
        try:
            if self._value_exp is not None:
                timex.value = eval(self._value_exp)
            if self._type_exp is not None:
                timex.type = eval(self._type_exp)
            if self._freq_exp is not None:
                timex.freq = eval(self._freq_exp)
            if self._quant_exp is not None:
                timex.quant = eval(self._quant_exp)
            if self._mod_exp is not None:
                timex.mod = eval(self._mod_exp)
        except Exception as e:
            ternip.warn("Malformed rule expression", e)

        # Need to update current time context, if necessary
        return (True, cur_context)
    else:
        # Rule did not match
        return (False, cur_context)
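# Illustrative sketch of the calling convention above, not taken from any
# real rule file: `rule` is assumed to be an already-loaded rule object,
# the dct and token triples are made up, and the timex constructor is
# assumed to accept a type keyword. Each sentence part is a
# (token, POS, timex-set) triple, and apply() returns a (success, context)
# pair.
import ternip

t = ternip.timex(type='date')                  # assumed constructor
dct = '20100101'                               # made-up document creation time
body = [('next', 'JJ', set([t])), ('week', 'NN', set([t]))]
before = [('see', 'VB', set()), ('you', 'PRP', set())]
after = [('.', '.', set())]

# Start with the context equal to the dct, purely for illustration
(success, context) = rule.apply(t, dct, dct, body, before, after)
if success:
    print t.value  # set by the rule's value expression, if it has one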
#!/usr/bin/env python

from glob import glob
import sys
import os
import os.path

sys.path.append('..')
import ternip
import ternip.formats

if not os.path.isdir('preprocessed'):
    os.mkdir('preprocessed')

for fpath in glob(os.path.normpath('../sample_data/tern/data/english/ace_2004/*/*.sgm')):
    with open(fpath) as fd:
        try:
            doc = ternip.formats.tern(fd.read())
            print "Pre-processing", os.path.basename(fpath)
            doc.reconcile_dct(doc.get_dct_sents(), add_S='s', add_LEX='lex', pos_attr='pos')
            doc.reconcile(doc.get_sents(), add_S='s', add_LEX='lex', pos_attr='pos')
            with open(os.path.join('preprocessed', os.path.basename(fpath)), 'w') as ppfd:
                # Drop the 22-character XML declaration (<?xml version="1.0" ?>)
                # that serialisation prepends
                ppfd.write(str(doc)[22:])
        except Exception as e:
            ternip.warn('Cannot load document ' + os.path.basename(fpath), e)
def reconcile(self, sents, add_S=False, add_LEX=False, pos_attr=False):
    """
    Reconciles this document against the new internal representation. If
    add_S is set to anything other than False, tags are added to indicate
    the sentence boundaries, with the tag name being the value of add_S.
    add_LEX is the same, but for marking token boundaries, and pos_attr is
    the name of the attribute which holds the POS tag for that token. This
    is mainly useful for transforming TERN documents into something that
    GUTime can parse.

    If your document already contains S and LEX tags, and add_S/add_LEX is
    set to add them, the old S/LEX tags will be stripped first. If pos_attr
    is set and the attribute name differs from the old POS attribute name
    on the LEX tag, then the old attribute will be removed.

    Sentence/token boundaries will not be altered in the final document
    unless add_S/add_LEX is set. If you have changed the token boundaries
    in the internal representation from the original form, but are not then
    adding them back in, reconciliation may give undefined results.

    Some inputs cannot produce valid XML. For example, if this document has
    elements which span parts of multiple sentences, but not the whole of
    them, then the S tags cannot be added without breaking well-formedness,
    and failure will occur in unexpected ways.

    If you are adding LEX tags, and your XML document contains tags
    internal to tokens, then reconciliation will fail, as each token is
    expected to be a contiguous run of text delimited by whitespace.
    """

    # First, add S tags if need be
    if add_S:
        # First, strip any old ones
        if self._has_S:
            self._strip_tags(self._xml_doc, self._has_S, self._xml_body)

        # Then add the new ones
        leftover = self._add_S_tags(self._xml_body, sents, add_S)
        if len(leftover) > 1:
            raise nesting_error('Unable to add all S tags, possibly due to bad tag nesting: ' + str(leftover))

        # Update what we consider to be our S tags
        self._has_S = add_S

    # Now, get a list of the S nodes, which are used to reconcile
    # individual tokens
    if self._has_S:
        s_nodes = self._xml_body.getElementsByTagName(self._has_S)
    else:
        # There are no S tags in the text, so from here on treat the whole
        # text as a single sentence belonging to the root node
        s_nodes = [self._xml_body]
        new_sent = []
        for sent in sents:
            for part in sent:
                new_sent.append(part)
        sents = [new_sent]

    # Now, add LEX tags if need be
    if add_LEX:
        # First, strip any old ones
        if self._has_LEX:
            self._strip_tags(self._xml_doc, self._has_LEX, self._xml_body)

        # Now add those LEX tokens, sentence by sentence
        for i in range(len(sents)):
            self._add_LEX_tags(s_nodes[i], sents[i], add_LEX)

        # Update what we consider to be our LEX tags
        self._has_LEX = add_LEX

    # Now, add the POS attribute
    if pos_attr and self._has_LEX:
        # Get each LEX tag and add the attribute
        for i in range(len(sents)):
            lex_tags = s_nodes[i].getElementsByTagName(self._has_LEX)
            for j in range(len(sents[i])):
                # Strip the existing attribute if need be
                try:
                    lex_tags[j].removeAttribute(self._pos_attr)
                except xml.dom.NotFoundErr:
                    pass

                # Now set the new POS attribute
                lex_tags[j].setAttribute(pos_attr, sents[i][j][1])

        # Update what we think is the POS attribute
        self._pos_attr = pos_attr

    # Strip old TIMEXes to avoid duplicates
    self.strip_timexes()

    # For XML documents, TIMEXes need unique IDs
    all_ts = set()
    for sent in sents:
        for (tok, pos, ts) in sent:
            for t in ts:
                all_ts.add(t)
    ternip.add_timex_ids(all_ts)

    # Now iterate over each sentence
    for i in range(len(sents)):
        # Get all timexes in this sentence
        timexes = set()
        for (word, pos, ts) in sents[i]:
            for t in ts:
                timexes.add(t)

        # Now, for each timex, add it to the sentence
        for timex in timexes:
            try:
                self._add_timex(timex, sents[i], s_nodes[i])
            except nesting_error as e:
                ternip.warn("Error whilst attempting to add TIMEX", e)
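# Illustrative round-trip through reconcile(), mirroring the preprocessing
# script above. The file name is a placeholder, and 's'/'lex'/'pos' are just
# the tag and attribute names a GUTime-style consumer expects.
import ternip.formats

with open('sample.sgm') as fd:                 # placeholder path
    doc = ternip.formats.tern(fd.read())
sents = doc.get_sents()   # [[(token, POS, timex-set), ...], ...]
# ... annotate timexes on the token triples here, e.g. via the rule engine ...
doc.reconcile(sents, add_S='s', add_LEX='lex', pos_attr='pos')
print str(doc)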