# Example no. 1 (scraper artifact: "Esempio n. 1", vote count 0)
 def __init__(self, word, sent, start, end, tokens, label, pos=True):
     """Build a labeled span record for *word* inside sentence *sent*.

     Parameters:
         word: the surface form of the span.
         sent: the containing sentence (passed verbatim to the tagger).
         start, end: span offsets, stored as given.
         tokens: iterable of sequences; entries whose last element is
             falsy are dropped.
         label: label string; trailing/embedded newlines are stripped.
         pos: truthy value used both as an "enable tagging" flag and as
             the tagger object itself (``pos.tag(sent)`` is called on it).
             NOTE(review): the default ``True`` has no ``.tag`` method, so
             with the default the tagging attempt always fails silently —
             callers appear expected to pass a tagger or a falsy value.
     """
     self.word = word
     self.sent = sent
     self.start = start
     self.end = end
     self.posTokens = []
     # Keep only token entries whose final element is truthy.
     self.tokens = [x for x in tokens if x[-1]]
     if pos:
         try:
             self.posTokens = pos.tag(sent)
         except Exception:
             # Best-effort tagging: any tagger failure leaves posTokens
             # empty. Narrowed from a bare except so SystemExit and
             # KeyboardInterrupt are no longer swallowed.
             pass
     self.label = label.replace('\n', '')
     self.pos = ''
     self.posIndex = None
     self.dict = {}
# Example no. 2 (scraper artifact: "Esempio n. 2", vote count 0)
 def process(self):
     """Re-annotate each gold XML file and write the result to a new path.

     For every file in ``self.xmls``: read its text, parse it with
     ``Space_Document``, walk sentences/lexes in document order while
     regex-matching the same sentences/lex-attribute spans in the raw
     text, rebuild each lex's attribute string via ``Lex`` (adding label,
     word, gold-tag attributes, parser/POS/NER features and the custom
     ``self.feature_functions``), splice it back into the text with
     ``str.replace``, and write the rewritten text to the destination
     path. Relies on module-level globals: ``setup_newdir``,
     ``mkparentdirs``, ``Space_Document``, ``sentence_pattern``,
     ``lex_attrs_pattern``, ``u2ascii``, ``pos``, ``ner``, ``p``,
     ``Lex``, ``binary_search``, ``ledge`` — presumably taggers/parsers
     configured elsewhere (TODO confirm).
     """
     for xml in self.xmls:
         # Compute the output path; a falsy path means "skip this file"
         # (e.g. already up to date — depends on setup_newdir's contract).
         path = setup_newdir(xml, self.golddir, self.newdir,
                             self.suffix, self.renew)
         if not path:
             continue
         mkparentdirs(path)
         with open(xml, 'r') as oldfile:
             text = oldfile.read()
         doc = Space_Document(xml)
         # Only tags that carry a 'start' attribute are candidate matches.
         tags = [tag for tag in doc.tags if 'start' in tag.attrib]
         new_text = text
         # Sentence i of the regex scan is assumed to line up with
         # doc.sents[i] — the two views must stay in lockstep.
         for (i,m) in enumerate(re.finditer(sentence_pattern, text)):
             sentence = doc.sents[i]
             doc_lexes = sentence.getchildren()
             xml_sentence = m.group()
             # ASCII-fold each lex's text via the u2ascii map, then
             # encode to UTF-8 bytes (Python 2 str).
             tokens = [''.join([c if ord(c) < 128
                                else u2ascii[c]
                                for c in x.text]).encode('utf-8')
                       for x in doc_lexes]
             (pos_tags, ner_tags, edges) = ([], [], [])
             if self.heavy:
                 # "Heavy" mode: run the external POS/NER taggers and the
                 # sentence-level parse; failures leave edges empty.
                 pos_tags = pos.tag(tokens)
                 ner_tags = ner.tag(tokens)
                 try:
                     if self.debug:
                         print ' '.join([x for x in tokens])
                     edges = p(' '.join([x for x in tokens]), split=True)
                 except:
                     # NOTE(review): bare except with a no-op string
                     # expression — deliberately best-effort, but it also
                     # hides real errors; consider logging here.
                     'somehow got here'
             # NOTE(review): c is never incremented; pos_tags/ner_tags are
             # consumed queue-style because matched entries are removed,
             # so index 0 always points at the next unconsumed tag.
             c = 0
             # Lex-attribute match j is assumed to correspond to
             # doc_lexes[j] within this sentence.
             for (j, n) in enumerate(re.finditer(lex_attrs_pattern,
                                                 xml_sentence)):
                 doc_lex = doc_lexes[j]
                 new_lex = Lex(doc_lex.text, doc_lex.attrib)
                 attributes = n.group()
                 # Look up the gold tag covering this lex by (begin, end,
                 # text); returns None-like when no tag matches.
                 tag = binary_search((int(doc_lex.attrib['begin']),
                                      int(doc_lex.attrib['end']),
                                      doc_lex.text), tags)
                 label = 'None'
                 if type(tag) != type(None):
                     label = tag.tag
                 new_lex.add(('label', label))
                 new_lex.add(('word', new_lex.text.encode('utf-8')))
                 if type(tag) != type(None):
                         # Copy every gold-tag attribute onto the new lex.
                         new_lex.addAll([(key.encode('utf-8'), tag.attrib[key].encode('utf-8')) for key in tag.attrib])
                 # Single-token ("greedy") parse; its key/values are added
                 # with an 'L' prefix to distinguish them from the
                 # sentence-level parse features below.
                 greedyEdge = p(tokens[j], split=True)
                 if greedyEdge:
                     gedge = greedyEdge[0]
                     if gedge.keyvalues and gedge.m:
                         keyvalues = gedge.keyvalues[gedge.keyvalues.keys()[0]]
                         new_lex.addAll([('L' + key, keyvalues[key]) for key in keyvalues])
                 if pos_tags:
                     # Attach POS only when the head of the tag queue
                     # matches this token, then consume it.
                     if tokens[j] == pos_tags[c][0]:
                         new_lex.add(('pos', pos_tags[c][1]))
                         pos_tags.remove(pos_tags[c])
                 if ner_tags: #this error case comes up for RFC/Durango.xml
                     if tokens[j] == ner_tags[c][0]:
                         new_lex.add(('ner', ner_tags[c][1]))
                         ner_tags.remove(ner_tags[c])
                 if edges:
                     # Sentence-level parse features for this token, if any.
                     sparser_edge = ledge(edges, tokens[j])
                     if sparser_edge:
                         if sparser_edge.keyvalues:
                             keyvalues = sparser_edge.keyvalues[sparser_edge.keyvalues.keys()[0]]
                             new_lex.addAll([(key, keyvalues[key]) for key in keyvalues])
                 # User-supplied feature functions, then splice the rebuilt
                 # attribute string back into the output text. NOTE(review):
                 # str.replace substitutes the first occurrence of
                 # `attributes` anywhere in new_text — assumes the matched
                 # attribute string is unique enough; verify.
                 new_lex.addAll([function(new_lex) for function in self.feature_functions])
                 new_text = new_text.replace(attributes, str(new_lex))
         w = open(path, 'w')
         print>>w, new_text
         w.close()