def __call__(self, s1, context):
    '''
    Convert a kba-2012 StreamItem *s1* into the current StreamItem
    format and return the converted item.

    :param s1: old-format StreamItem (kba-2012 thrift)
    :param context: pipeline context (unused here, part of the stage API)
    :returns: new-format StreamItem with body, taggings, and
        other_content populated from *s1*
    '''
    s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
    s2.schost = s1.schost
    s2.source = s1.source
    ## preserve the entire old source_metadata under a namespaced key
    s2.source_metadata['kba-2012'] = s1.source_metadata

    logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))

    s2.body = ContentItem(
        raw=s1.body.raw,
        encoding=s1.body.encoding,
        ## default, might get overwritten below
        media_type='text/html',
        taggings={
            'stanford': Tagging(
                tagger_id='stanford',
                raw_tagging=s1.body.ner,
                generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                tagger_config='annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, properties: pos.maxlen=100',
                tagger_version='Stanford CoreNLP ver 1.2.0',
            )
        })

    if self.config['keep_old_cleansed_as_clean_visible']:
        s2.body.clean_visible = s1.body.cleansed

    if s1.source == 'social':
        s2.body.media_type = 'text/plain'
        ## the separation of content items in the social stream
        ## was artificial and annoying, so smoosh them together
        s2.body.clean_visible = '\n\n'.join(
            [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

        changed_body_raw = False
        if s1.title and s1.title.raw:
            s2.body.raw = s1.title.raw
            ## bug fix: was r'\n\n', which appended the literal four
            ## characters backslash-n-backslash-n; use a real blank-line
            ## separator, consistent with the clean_visible join above
            s2.body.raw += '\n\n'
            changed_body_raw = True

        if s1.anchor and s1.anchor.raw:
            s2.body.raw += s1.anchor.raw
            s2.body.raw += '\n\n'
            changed_body_raw = True

        if changed_body_raw:
            ## title/anchor were prepended, so append the original body
            s2.body.raw += s1.body.raw

    ## keep title and anchor around as separate content items too
    if s1.title:
        ci = ContentItem(
            raw=s1.title.raw,
            encoding=s1.title.encoding,
            clean_visible=s1.title.cleansed,
        )
        s2.other_content['title'] = ci

    if s1.anchor:
        ci = ContentItem(
            raw=s1.anchor.raw,
            encoding=s1.anchor.encoding,
            clean_visible=s1.anchor.cleansed,
        )
        s2.other_content['anchor'] = ci

    return s2
def align_chunk_with_ner(self, ner_xml_path, i_chunk, o_chunk): ''' iterate through ner_xml_path to fuse with i_chunk into o_chunk ''' ## prepare to iterate over the input chunk input_iter = i_chunk.__iter__() all_ner = xml.dom.minidom.parse(open(ner_xml_path)) ## this converts our UTF-8 data into unicode strings, so when ## we want to compute byte offsets or construct tokens, we ## must .encode('utf8') for ner_dom in all_ner.getElementsByTagName('FILENAME'): #for stream_id, raw_ner in files(open(ner_xml_path).read().decode('utf8')): stream_item = input_iter.next() ## get stream_id out of the XML stream_id = ner_dom.attributes.get('stream_id').value if stream_item.stream_id is None: assert not stream_id, 'out of sync: None != %r' % stream_id logger.critical('si.stream_id is None... ignoring') continue assert stream_id and stream_id == stream_item.stream_id, \ '%s != %s' % (stream_id, stream_item.stream_id) if not stream_item.body: ## the XML better have had an empty clean_visible too... 
#assert not ner_dom....something continue tagging = Tagging() tagging.tagger_id = self.tagger_id # pylint: disable=E1101 ''' ## get this one file out of its FILENAME tags tagged_doc_parts = list(files(ner_dom.toxml())) if not tagged_doc_parts: continue tagged_doc = tagged_doc_parts[0][1] ## hack hope_original = make_clean_visible(tagged_doc, '') open(ner_xml_path + '-clean', 'wb').write(hope_original.encode('utf-8')) print ner_xml_path + '-clean' ''' #tagging.raw_tagging = tagged_doc tagging.generation_time = streamcorpus.make_stream_time() stream_item.body.taggings[self.tagger_id] = tagging # pylint: disable=E1101 ## could consume lots of memory here by instantiating everything sentences, relations, attributes = self.get_sentences(ner_dom) stream_item.body.sentences[self.tagger_id] = sentences # pylint: disable=E1101 stream_item.body.relations[self.tagger_id] = relations # pylint: disable=E1101 stream_item.body.attributes[self.tagger_id] = attributes # pylint: disable=E1101 logger.debug('finished aligning tokens %s' % stream_item.stream_id) ''' for num, sent in enumerate(sentences): for tok in sent.tokens: print '%d\t%d\t%s' % (num, tok.offsets[OffsetType.LINES].first, repr(tok.token)) ''' if 'align_labels_by' in self.config and self.config[ 'align_labels_by']: assert 'aligner_data' in self.config, 'config missing "aligner_data"' aligner = AlignmentStrategies[self.config['align_labels_by']] aligner(stream_item, self.config['aligner_data']) ## forcibly collect dereferenced objects gc.collect() try: o_chunk.add(stream_item) except MemoryError, exc: msg = traceback.format_exc(exc) msg += make_memory_info_msg() logger.critical(msg) raise PipelineOutOfMemory(msg)
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path): ''' iterate through the i_chunk and tmp_ner_path to generate a new Chunk with body.ner ''' o_chunk = Chunk() input_iter = i_chunk.__iter__() ner = '' stream_id = None all_ner = xml.dom.minidom.parse(open(tmp_ner_path)) for raw_ner in all_ner.getElementsByTagName('FILENAME'): stream_item = input_iter.next() ## get stream_id out of the XML stream_id = raw_ner.attributes.get('docid').value assert stream_id and stream_id == stream_item.stream_id, \ '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner) tagger_id = 'lingpipe' tagging = Tagging() tagging.tagger_id = tagger_id ## get this one file out of its FILENAME tags tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1] tagging.raw_tagging = tagged_doc tagging.generation_time = streamcorpus.make_stream_time() stream_item.body.taggings[tagger_id] = tagging sentences = list(lingpipe.sentences(tagged_doc)) ## make JS labels on individual tokens assert stream_item.ratings[0].mentions, stream_item.stream_id john_smith_label = Label() john_smith_label.annotator = stream_item.ratings[0].annotator john_smith_label.target_id = stream_item.ratings[0].target_id # first map all corefchains to their words equiv_ids = collections.defaultdict(lambda: set()) for sent in sentences: for tok in sent.tokens: if tok.entity_type is not None: equiv_ids[tok.equiv_id].add(cleanse(tok.token)) ## find all the chains that are John Smith johnsmiths = set() for equiv_id, names in equiv_ids.items(): ## detect 'smith' in 'smithye' _names = cleanse(' '.join(names)) if 'john' in _names and 'smith' in _names: johnsmiths.add(equiv_id) print len(johnsmiths) ## now apply the label for sent in sentences: for tok in sent.tokens: if tok.equiv_id in johnsmiths: tok.labels = [john_smith_label] stream_item.body.sentences[tagger_id] = sentences o_chunk.add(stream_item) ## put the o_chunk bytes into the specified file open(tmp_done_path, 'wb').write(str(o_chunk)) ## replace this with 
log.info() print 'created %s' % tmp_done_path
def align_chunk_with_ner(self, ner_xml_path, i_chunk, o_chunk): ''' iterate through ner_xml_path to fuse with i_chunk into o_chunk ''' ## prepare to iterate over the input chunk input_iter = i_chunk.__iter__() all_ner = xml.dom.minidom.parse(open(ner_xml_path)) ## this converts our UTF-8 data into unicode strings, so when ## we want to compute byte offsets or construct tokens, we ## must .encode('utf8') for ner_dom in all_ner.getElementsByTagName('FILENAME'): #for stream_id, raw_ner in files(open(ner_xml_path).read().decode('utf8')): stream_item = input_iter.next() ## get stream_id out of the XML stream_id = ner_dom.attributes.get('stream_id').value if stream_item.stream_id is None: assert not stream_id, 'out of sync: None != %r' % stream_id logger.critical('si.stream_id is None... ignoring') continue assert stream_id and stream_id == stream_item.stream_id, \ '%s != %s' % (stream_id, stream_item.stream_id) if not stream_item.body: ## the XML better have had an empty clean_visible too... 
#assert not ner_dom....something continue tagging = Tagging() tagging.tagger_id = self.tagger_id # pylint: disable=E1101 ''' ## get this one file out of its FILENAME tags tagged_doc_parts = list(files(ner_dom.toxml())) if not tagged_doc_parts: continue tagged_doc = tagged_doc_parts[0][1] ## hack hope_original = make_clean_visible(tagged_doc, '') open(ner_xml_path + '-clean', 'wb').write(hope_original.encode('utf-8')) print ner_xml_path + '-clean' ''' #tagging.raw_tagging = tagged_doc tagging.generation_time = streamcorpus.make_stream_time() stream_item.body.taggings[self.tagger_id] = tagging # pylint: disable=E1101 ## could consume lots of memory here by instantiating everything sentences, relations, attributes = self.get_sentences(ner_dom) stream_item.body.sentences[self.tagger_id] = sentences # pylint: disable=E1101 stream_item.body.relations[self.tagger_id] = relations # pylint: disable=E1101 stream_item.body.attributes[self.tagger_id] = attributes # pylint: disable=E1101 logger.debug('finished aligning tokens %s' % stream_item.stream_id) ''' for num, sent in enumerate(sentences): for tok in sent.tokens: print '%d\t%d\t%s' % (num, tok.offsets[OffsetType.LINES].first, repr(tok.token)) ''' if 'align_labels_by' in self.config and self.config['align_labels_by']: assert 'aligner_data' in self.config, 'config missing "aligner_data"' aligner = AlignmentStrategies[ self.config['align_labels_by'] ] aligner( stream_item, self.config['aligner_data'] ) ## forcibly collect dereferenced objects gc.collect() try: o_chunk.add(stream_item) except MemoryError, exc: msg = traceback.format_exc(exc) msg += make_memory_info_msg() logger.critical(msg) raise PipelineOutOfMemory(msg)