Python Tagging.tagger_id Examples

Programming Language: Python

Namespace/Package Name: streamcorpus

Class/Type: Tagging

Method/Function: tagger_id

Examples at hotexamples.com: 2

Python Tagging.tagger_id - 2 examples found. These are the top-rated real-world Python examples of streamcorpus.Tagging.tagger_id extracted from open-source projects. You can rate examples to help us improve the quality of the examples.

Frequently Used Methods

Show Hide

Tagging(3)

tagger_id(2)

Frequently Used Methods

Tagging (3)

tagger_id (2)

Example #1

Show file

File: _run_lingpipe.py Project: naimdjon/streamcorpus-pipeline

def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner

    For every FILENAME element in the XML at tmp_ner_path, the next
    stream item is pulled from i_chunk and its stream_id is asserted
    to equal the element's "docid" attribute.  A Tagging with
    tagger_id 'lingpipe' and the parsed sentences are attached to the
    item's body, tokens whose coref chain mentions both 'john' and
    'smith' receive a Label built from the item's first rating, and
    the item is added to a fresh Chunk.

    :param tmp_ner_path: path to the XML file holding the NER output
    :param i_chunk: iterable of stream items, in the same order as
      the FILENAME elements in the XML
    :param tmp_done_path: path that receives str() of the new Chunk

    :raises AssertionError: if the XML and i_chunk are out of sync,
      or a stream item's first rating has no mentions
    '''
    o_chunk = Chunk()
    input_iter = iter(i_chunk)
    ## never populated; only interpolated into the out-of-sync message
    ner = ''

    ## parse() builds the whole DOM in memory, so the file can be
    ## closed as soon as it returns
    with open(tmp_ner_path) as ner_file:
        all_ner = xml.dom.minidom.parse(ner_file)

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):

        stream_item = next(input_iter)
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id
        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        ## first map all corefchains to their words
        equiv_ids = collections.defaultdict(set)
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## substring test on purpose: detect 'smith' in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        ## parenthesized form prints identically under python 2 and 3
        print(len(johnsmiths))
        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]

        stream_item.body.sentences[tagger_id] = sentences

        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file; the context
    ## manager guarantees the data is flushed and the handle closed
    with open(tmp_done_path, 'wb') as done_file:
        done_file.write(str(o_chunk))
    ## replace this with log.info()
    print('created %s' % tmp_done_path)

Example #2

Show file

File: _taggers.py Project: naimdjon/streamcorpus-pipeline

    def align_chunk_with_ner(self, ner_xml_path, i_chunk, o_chunk):
        ''' iterate through ner_xml_path to fuse with i_chunk into o_chunk

        Each FILENAME element in the XML is paired, in order, with the
        next stream item from i_chunk.  Items whose stream_id is None
        or whose body is empty are skipped and are not added to
        o_chunk.  Every other item gets a Tagging plus the sentences,
        relations and attributes returned by self.get_sentences(),
        stored under self.tagger_id, and is then added to o_chunk.

        :param ner_xml_path: path to the tagger's XML output
        :param i_chunk: iterable of stream items in the same order as
          the FILENAME elements
        :param o_chunk: object whose add() receives each fused item

        :raises AssertionError: if the XML and i_chunk are out of
          sync, or label alignment is configured without
          "aligner_data"
        :raises PipelineOutOfMemory: if o_chunk.add() raises
          MemoryError
        '''
        ## prepare to iterate over the input chunk
        input_iter = iter(i_chunk)

        ## parse() builds the whole DOM in memory, so the file can be
        ## closed as soon as it returns
        with open(ner_xml_path) as ner_file:
            all_ner = xml.dom.minidom.parse(ner_file)

        ## this converts our UTF-8 data into unicode strings, so when
        ## we want to compute byte offsets or construct tokens, we
        ## must .encode('utf8')
        for ner_dom in all_ner.getElementsByTagName('FILENAME'):

            stream_item = next(input_iter)

            ## get stream_id out of the XML
            stream_id = ner_dom.attributes.get('stream_id').value
            if stream_item.stream_id is None:
                assert not stream_id, 'out of sync: None != %r' % stream_id
                logger.critical('si.stream_id is None... ignoring')
                continue
            assert stream_id and stream_id == stream_item.stream_id, \
                '%s != %s' % (stream_id, stream_item.stream_id)

            if not stream_item.body:
                ## the XML better have had an empty clean_visible too...
                continue

            tagging = Tagging()
            tagging.tagger_id = self.tagger_id  # pylint: disable=E1101

            ## tagging.raw_tagging is deliberately left unset here
            tagging.generation_time = streamcorpus.make_stream_time()
            stream_item.body.taggings[self.tagger_id] = tagging       # pylint: disable=E1101

            ## could consume lots of memory here by instantiating everything
            sentences, relations, attributes = self.get_sentences(ner_dom)
            stream_item.body.sentences[self.tagger_id] = sentences    # pylint: disable=E1101
            stream_item.body.relations[self.tagger_id] = relations    # pylint: disable=E1101
            stream_item.body.attributes[self.tagger_id] = attributes  # pylint: disable=E1101

            ## lazy %-args: the message is only built if DEBUG is enabled
            logger.debug('finished aligning tokens %s', stream_item.stream_id)

            if 'align_labels_by' in self.config and self.config['align_labels_by']:
                assert 'aligner_data' in self.config, 'config missing "aligner_data"'
                aligner = AlignmentStrategies[self.config['align_labels_by']]
                aligner(stream_item, self.config['aligner_data'])

            ## forcibly collect dereferenced objects
            gc.collect()

            try:
                o_chunk.add(stream_item)
            except MemoryError:
                ## format_exc() reads the active exception itself; its
                ## only positional argument is a frame limit, so the
                ## exception object must not be passed in
                msg = traceback.format_exc()
                msg += make_memory_info_msg()
                logger.critical(msg)
                raise PipelineOutOfMemory(msg)