def __init__(self, word, sent, start, end, tokens, label, pos=True):
    """Initialise the instance from a word, its sentence and its label.

    Args:
        word: stored unchanged as ``self.word``.
        sent: stored unchanged as ``self.sent``; also handed to
            ``pos.tag`` when tagging is attempted.
        start: stored unchanged as ``self.start``.
        end: stored unchanged as ``self.end``.
        tokens: iterable of indexable items; only items whose last
            element is truthy are kept in ``self.tokens``.
        label: string label; every newline character is removed.
        pos: when truthy, ``pos.tag(sent)`` is attempted and its result
            stored in ``self.posTokens``.  The default ``True`` has no
            ``tag`` attribute, so with the default the attempt fails and
            ``self.posTokens`` stays an empty list.
            NOTE(review): this parameter shadows any module-level name
            ``pos`` -- confirm whether the default was meant to select a
            shared tagger.
    """
    self.word = word
    self.sent = sent
    self.start = start
    self.end = end
    self.posTokens = []
    # Drop tokens whose last field is falsy.
    self.tokens = [x for x in tokens if x[-1]]
    if pos:
        try:
            self.posTokens = pos.tag(sent)
        except Exception:
            # Tagging is best-effort: on failure keep the empty list.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit are no longer silently swallowed.
            pass
    self.label = label.replace('\n', '')
    self.pos = ''
    self.posIndex = None
    self.dict = {}
def process(self):
    """Rewrite every document in ``self.xmls`` with enriched lex attributes.

    For each input path:

    1. ``setup_newdir`` picks the output path; a falsy result skips the
       document.
    2. The raw text is read and the document is parsed with
       ``Space_Document``.
    3. For every ``sentence_pattern`` match, each ``lex_attrs_pattern``
       match inside it is paired by position with the parsed lex element
       and a ``Lex`` is built carrying: the label of the tag found by
       ``binary_search`` (``'None'`` when there is none), the word, the
       tag's attributes, ``'L'``-prefixed key/values from ``p`` run on
       the single token, POS / NER tags and sentence-level edge
       key/values (only populated when ``self.heavy``), and the results
       of ``self.feature_functions``.
    4. The matched attribute string is replaced in the document text by
       ``str(new_lex)`` and the text is written to the output path
       followed by a newline.

    Returns:
        None.

    NOTE(review): the nesting below was rebuilt from a
    whitespace-collapsed copy of this method.  In particular, confirm
    that the sentence-level parse belongs under ``if self.heavy``, that
    tags are only consumed on an exact token match, and that the output
    file is written once per document.
    """
    for xml in self.xmls:
        path = setup_newdir(xml, self.golddir, self.newdir, self.suffix, self.renew)
        if not path:
            continue
        mkparentdirs(path)
        with open(xml, 'r') as oldfile:
            text = oldfile.read()
        doc = Space_Document(xml)
        tags = [tag for tag in doc.tags if 'start' in tag.attrib]
        new_text = text
        for (i, m) in enumerate(re.finditer(sentence_pattern, text)):
            doc_lexes = doc.sents[i].getchildren()
            xml_sentence = m.group()
            # Non-ASCII characters are mapped through u2ascii before the
            # token is encoded.
            tokens = [
                ''.join([ch if ord(ch) < 128 else u2ascii[ch] for ch in x.text]).encode('utf-8')
                for x in doc_lexes
            ]
            (pos_tags, ner_tags, edges) = ([], [], [])
            if self.heavy:
                pos_tags = pos.tag(tokens)
                ner_tags = ner.tag(tokens)
                try:
                    joined = ' '.join(tokens)
                    if self.debug:
                        print(joined)
                    edges = p(joined, split=True)
                except Exception:
                    # Best-effort: a failed sentence parse leaves
                    # ``edges`` empty.  The message used to be a bare
                    # string expression (a no-op); it is now reported
                    # when debugging.
                    if self.debug:
                        print('somehow got here')
            for (j, n) in enumerate(re.finditer(lex_attrs_pattern, xml_sentence)):
                doc_lex = doc_lexes[j]
                new_lex = Lex(doc_lex.text, doc_lex.attrib)
                attributes = n.group()
                tag = binary_search(
                    (int(doc_lex.attrib['begin']), int(doc_lex.attrib['end']), doc_lex.text),
                    tags,
                )
                new_lex.add(('label', tag.tag if tag is not None else 'None'))
                new_lex.add(('word', new_lex.text.encode('utf-8')))
                if tag is not None:
                    new_lex.addAll([
                        (key.encode('utf-8'), tag.attrib[key].encode('utf-8'))
                        for key in tag.attrib
                    ])
                greedyEdge = p(tokens[j], split=True)
                if greedyEdge:
                    gedge = greedyEdge[0]
                    if gedge.keyvalues and gedge.m:
                        # Only the first entry of the mapping is used.
                        keyvalues = gedge.keyvalues[next(iter(gedge.keyvalues))]
                        new_lex.addAll([('L' + key, keyvalues[key]) for key in keyvalues])
                # The tag lists act as queues: the head is consumed only
                # when it belongs to the current token.
                if pos_tags:
                    if tokens[j] == pos_tags[0][0]:
                        new_lex.add(('pos', pos_tags[0][1]))
                        pos_tags.pop(0)
                if ner_tags:  # this error case comes up for RFC/Durango.xml
                    if tokens[j] == ner_tags[0][0]:
                        new_lex.add(('ner', ner_tags[0][1]))
                        ner_tags.pop(0)
                if edges:
                    sparser_edge = ledge(edges, tokens[j])
                    if sparser_edge:
                        if sparser_edge.keyvalues:
                            keyvalues = sparser_edge.keyvalues[next(iter(sparser_edge.keyvalues))]
                            new_lex.addAll([(key, keyvalues[key]) for key in keyvalues])
                new_lex.addAll([function(new_lex) for function in self.feature_functions])
                new_text = new_text.replace(attributes, str(new_lex))
        # The context manager closes the output even if writing fails.
        with open(path, 'w') as w:
            w.write(new_text)
            w.write('\n')