Example #1
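## NOTE: this snippet is excerpted from a larger streamcorpus pipeline
## module; it assumes module-level imports roughly like the following
## (the streamcorpus names are real, the helpers come from the surrounding
## pipeline package):
##   import collections, gc, logging, traceback, xml.dom.minidom
##   import streamcorpus
##   from streamcorpus import make_stream_item, make_stream_time, \
##       ContentItem, Tagging, Chunk, Label, OffsetType
##   logger = logging.getLogger(__name__)
## make_memory_info_msg, PipelineOutOfMemory, AlignmentStrategies, cleanse,
## and lingpipe are pipeline-level helpers, not part of streamcorpus itself.
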
    def __call__(self, s1, context):
        s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
        s2.schost = s1.schost
        s2.source = s1.source
        s2.source_metadata['kba-2012'] = s1.source_metadata

        logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))

        #logger.critical(repr(s2))

        s2.body = ContentItem(
            raw=s1.body.raw,
            encoding=s1.body.encoding,
            ## default, might get overwritten below
            media_type='text/html',
            taggings={
                'stanford':
                Tagging(
                    tagger_id='stanford',
                    raw_tagging=s1.body.ner,
                    generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                    tagger_config=
                    'annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, properties: pos.maxlen=100',
                    tagger_version='Stanford CoreNLP ver 1.2.0',
                )
            })

        if self.config['keep_old_cleansed_as_clean_visible']:
            s2.body.clean_visible = s1.body.cleansed

        if s1.source == 'social':
            s2.body.media_type = 'text/plain'
            ## the separation of content items in the social stream
            ## was artificial and annoying, so smoosh them together
            s2.body.clean_visible = '\n\n'.join(
                [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

            ## rebuild body.raw from whichever parts are present; joining a
            ## list avoids duplicating s1.body.raw (already copied into
            ## s2.body.raw above) when only the anchor exists, and separates
            ## the parts with real blank lines instead of literal r'\n\n'
            raw_parts = []
            if s1.title and s1.title.raw:
                raw_parts.append(s1.title.raw)
            if s1.anchor and s1.anchor.raw:
                raw_parts.append(s1.anchor.raw)
            if raw_parts:
                raw_parts.append(s1.body.raw)
                s2.body.raw = '\n\n'.join(raw_parts)

        if s1.title:
            ci = ContentItem(
                raw=s1.title.raw,
                encoding=s1.title.encoding,
                clean_visible=s1.title.cleansed,
            )
            s2.other_content['title'] = ci
        if s1.anchor:
            ci = ContentItem(raw=s1.anchor.raw,
                             encoding=s1.anchor.encoding,
                             clean_visible=s1.anchor.cleansed)
            s2.other_content['anchor'] = ci
        return s2
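
    ## Usage sketch: a pipeline applies this transform to every StreamItem
    ## while copying an old kba-2012 chunk into the current format.  Only the
    ## streamcorpus Chunk API below is real; the class and config names are
    ## assumed for illustration:
    ##
    ##   upgrader = UpgradeKba2012(config={
    ##       'keep_old_cleansed_as_clean_visible': True})
    ##   o_chunk = Chunk(path='upgraded.sc', mode='wb')
    ##   for s1 in Chunk(path='kba-2012-chunk.sc'):
    ##       o_chunk.add(upgrader(s1, context={}))
    ##   o_chunk.close()
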
    def align_chunk_with_ner(self, ner_xml_path, i_chunk, o_chunk):
        ''' iterate through ner_xml_path to fuse with i_chunk into o_chunk '''
        ## prepare to iterate over the input chunk
        input_iter = iter(i_chunk)

        all_ner = xml.dom.minidom.parse(ner_xml_path)

        ## this converts our UTF-8 data into unicode strings, so when
        ## we want to compute byte offsets or construct tokens, we
        ## must .encode('utf8')
        for ner_dom in all_ner.getElementsByTagName('FILENAME'):
            #for stream_id, raw_ner in files(open(ner_xml_path).read().decode('utf8')):

            stream_item = input_iter.next()

            ## get stream_id out of the XML
            stream_id = ner_dom.attributes.get('stream_id').value
            if stream_item.stream_id is None:
                assert not stream_id, 'out of sync: None != %r' % stream_id
                logger.critical('si.stream_id is None... ignoring')
                continue
            assert stream_id and stream_id == stream_item.stream_id, \
                '%s != %s' % (stream_id, stream_item.stream_id)

            if not stream_item.body:
                ## the XML better have had an empty clean_visible too...
                #assert not ner_dom....something
                continue

            tagging = Tagging()
            tagging.tagger_id = self.tagger_id  # pylint: disable=E1101
            '''
            ## get this one file out of its FILENAME tags
            tagged_doc_parts = list(files(ner_dom.toxml()))
            if not tagged_doc_parts:
                continue

            tagged_doc = tagged_doc_parts[0][1]

            ## hack
            hope_original = make_clean_visible(tagged_doc, '')
            open(ner_xml_path + '-clean', 'wb').write(hope_original.encode('utf-8'))
            print ner_xml_path + '-clean'
            '''

            #tagging.raw_tagging = tagged_doc
            tagging.generation_time = streamcorpus.make_stream_time()
            stream_item.body.taggings[self.tagger_id] = tagging  # pylint: disable=E1101

            ## could consume lots of memory here by instantiating everything
            sentences, relations, attributes = self.get_sentences(ner_dom)
            stream_item.body.sentences[self.tagger_id] = sentences  # pylint: disable=E1101
            stream_item.body.relations[self.tagger_id] = relations  # pylint: disable=E1101
            stream_item.body.attributes[self.tagger_id] = attributes  # pylint: disable=E1101

            logger.debug('finished aligning tokens %s' % stream_item.stream_id)
            '''
            for num, sent in enumerate(sentences):
                for tok in sent.tokens:
                    print '%d\t%d\t%s' % (num, tok.offsets[OffsetType.LINES].first, repr(tok.token))
            '''

            if self.config.get('align_labels_by'):
                assert 'aligner_data' in self.config, 'config missing "aligner_data"'
                aligner = AlignmentStrategies[self.config['align_labels_by']]
                aligner(stream_item, self.config['aligner_data'])

            ## forcibly collect dereferenced objects
            gc.collect()

            try:
                o_chunk.add(stream_item)
            except MemoryError:
                ## capture the traceback plus a memory snapshot, log it, and
                ## re-raise as the pipeline's own out-of-memory exception
                msg = traceback.format_exc()
                msg += make_memory_info_msg()
                logger.critical(msg)
                raise PipelineOutOfMemory(msg)
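
## Sketch of an alignment strategy, inferred only from the call site above:
## AlignmentStrategies maps the config string to a callable that takes
## (stream_item, aligner_data).  The function below is hypothetical and only
## illustrates that contract, mirroring the John Smith labelling loop further
## down; the real strategies live elsewhere in the pipeline.
def _align_labels_by_names_sketch(stream_item, aligner_data):
    '''hypothetical aligner: label every token whose cleansed text matches
    one of the names listed in aligner_data (an assumed key)'''
    if not stream_item.ratings:
        return
    label = Label()
    label.annotator = stream_item.ratings[0].annotator
    label.target_id = stream_item.ratings[0].target_id
    target_names = set(cleanse(name) for name in aligner_data.get('names', []))
    for sentences in stream_item.body.sentences.values():
        for sent in sentences:
            for tok in sent.tokens:
                if cleanse(tok.token) in target_names:
                    tok.labels = [label]
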
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = iter(i_chunk)
    ner = ''
    stream_id = None

    all_ner = xml.dom.minidom.parse(tmp_ner_path)

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        
        stream_item = input_iter.next()
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id
        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        ## first map each coref chain (equiv_id) to the set of its cleansed words
        equiv_ids = collections.defaultdict(lambda: set())
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## substring test, so 'smith' also matches inside e.g. 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        print len(johnsmiths)
        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]                

        stream_item.body.sentences[tagger_id] = sentences
        
        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file
    open(tmp_done_path, 'wb').write(str(o_chunk))
    ## replace this with log.info()
    print 'created %s' % tmp_done_path
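
## Usage sketch (paths are placeholders): this function is meant to run
## after LingPipe has written its NER output for a chunk, e.g.
##
##   align_chunk_with_ner('/tmp/foo.ner.xml',
##                        Chunk(path='/tmp/foo.sc'),
##                        '/tmp/foo.done.sc')
##
## i_chunk must yield StreamItems in the same order as the FILENAME elements
## in the NER XML, or the stream_id assertion above will fail.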