def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner

    For every FILENAME element in the XML at tmp_ner_path, the next
    stream item is pulled from i_chunk, its 'lingpipe' tagging and
    sentences are attached, and tokens that belong to a coref chain
    whose names contain both 'john' and 'smith' receive a Label built
    from the item's first rating.  The resulting Chunk is serialized
    with str() and written to tmp_done_path.

    :param tmp_ner_path: path to the XML file of tagger output; each
      FILENAME element must carry a 'docid' attribute
    :param i_chunk: iterable of stream items, in the same order as the
      FILENAME elements
    :param tmp_done_path: path of the file to (over)write with the
      serialized output chunk

    :raises AssertionError: if a docid does not match the stream_id of
      the corresponding stream item, or the item's first rating has no
      mentions
    :raises StopIteration: if i_chunk runs out before the XML does
    '''
    tagger_id = 'lingpipe'
    o_chunk = Chunk()
    input_iter = iter(i_chunk)

    ## give minidom the path rather than an open file object, so that
    ## it opens the file itself and closes it when parsing finishes
    all_ner = xml.dom.minidom.parse(tmp_ner_path)

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        stream_item = next(input_iter)

        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s' % (stream_id, stream_item.stream_id)

        tagging = Tagging()
        tagging.tagger_id = tagger_id
        ## get this one file out of its FILENAME tags; the code takes
        ## element [1] of the first result as the tagged document
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        rating = stream_item.ratings[0]
        assert rating.mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = rating.annotator
        john_smith_label.target_id = rating.target_id

        ## first map all coref chains to their (cleansed) words
        equiv_ids = collections.defaultdict(set)
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## substring test on the joined names, so that 'smith' is
            ## also detected in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        ## single-argument form prints the same thing on python 2 and 3
        print(len(johnsmiths))

        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]

        stream_item.body.sentences[tagger_id] = sentences

        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file, making sure the
    ## handle is flushed and closed before we report success
    with open(tmp_done_path, 'wb') as done_file:
        done_file.write(str(o_chunk))

    ## replace this with log.info()
    print('created %s' % tmp_done_path)
def align_chunk_with_ner(self, ner_xml_path, i_chunk, o_chunk):
    '''
    iterate through ner_xml_path to fuse with i_chunk into o_chunk

    For every FILENAME element in the XML at ner_xml_path, the next
    stream item is pulled from i_chunk.  Items that have a body get a
    Tagging plus the sentences, relations and attributes returned by
    self.get_sentences(), all stored under self.tagger_id, and are
    then added to o_chunk.

    :param ner_xml_path: path to the XML file of tagger output; each
      FILENAME element must carry a 'stream_id' attribute
    :param i_chunk: iterable of stream items, in the same order as the
      FILENAME elements
    :param o_chunk: output chunk; only its add() method is used here

    :raises AssertionError: if the XML and i_chunk are out of sync, or
      'align_labels_by' is configured without 'aligner_data'
    :raises StopIteration: if i_chunk runs out before the XML does
    :raises PipelineOutOfMemory: if o_chunk.add() raises MemoryError
    '''
    ## prepare to iterate over the input chunk
    input_iter = iter(i_chunk)

    ## give minidom the path rather than an open file object, so that
    ## it opens the file itself and closes it when parsing finishes
    all_ner = xml.dom.minidom.parse(ner_xml_path)

    ## this converts our UTF-8 data into unicode strings, so when
    ## we want to compute byte offsets or construct tokens, we
    ## must .encode('utf8')
    for ner_dom in all_ner.getElementsByTagName('FILENAME'):
        stream_item = next(input_iter)

        ## get stream_id out of the XML
        stream_id = ner_dom.attributes.get('stream_id').value
        if stream_item.stream_id is None:
            assert not stream_id, 'out of sync: None != %r' % stream_id
            logger.critical('si.stream_id is None... ignoring')
            ## NOTE(review): the item is dropped, i.e. never reaches
            ## o_chunk -- confirm that this is intended
            continue
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s' % (stream_id, stream_item.stream_id)

        if not stream_item.body:
            ## the XML better have had an empty clean_visible too...
            ## NOTE(review): body-less items are also dropped from
            ## the output rather than passed through untagged
            continue

        ## tagging.raw_tagging is deliberately left unset here
        tagging = Tagging()
        tagging.tagger_id = self.tagger_id  # pylint: disable=E1101
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[self.tagger_id] = tagging  # pylint: disable=E1101

        ## could consume lots of memory here by instantiating everything
        sentences, relations, attributes = self.get_sentences(ner_dom)
        stream_item.body.sentences[self.tagger_id] = sentences  # pylint: disable=E1101
        stream_item.body.relations[self.tagger_id] = relations  # pylint: disable=E1101
        stream_item.body.attributes[self.tagger_id] = attributes  # pylint: disable=E1101

        logger.debug('finished aligning tokens %s', stream_item.stream_id)

        if 'align_labels_by' in self.config and self.config['align_labels_by']:
            assert 'aligner_data' in self.config, 'config missing "aligner_data"'
            aligner = AlignmentStrategies[self.config['align_labels_by']]
            aligner(stream_item, self.config['aligner_data'])

        ## forcibly collect dereferenced objects
        gc.collect()

        try:
            o_chunk.add(stream_item)
        except MemoryError:
            ## format_exc() takes an optional *limit*, not the exception
            ## object; it always formats the exception being handled
            msg = traceback.format_exc()
            msg += make_memory_info_msg()
            logger.critical(msg)
            raise PipelineOutOfMemory(msg)