def __call__(self, i_str):
        # Read in the entire contents as text; we will need to
        # save it away later
        with open(i_str, 'r') as f:
            serifxml = f.read()

        fname = os.path.basename(i_str)
        stream_time = None
        date_m = date_in_file_name_re.match(fname)
        if date_m:
            year = int(date_m.group('year'))
            month = int(date_m.group('month'))
            day = int(date_m.group('day'))
            try:
                stream_time = streamcorpus.make_stream_time(
                    zulu_timestamp = '%d-%02d-%02dT00:00:01.000000Z' % (year, month, day))
            except Exception as exc:
                logger.info('trapped failed parsing of file name to make stream_time',
                            exc_info=True)
                stream_time = None
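
## The fragment above depends on a module-level date_in_file_name_re pattern
## that is not shown here.  A minimal sketch of what it might look like,
## assuming dates such as 2012-05-01 embedded in the file name; the code
## above only requires the named groups 'year', 'month', and 'day'.
import re

date_in_file_name_re = re.compile(
    r'.*?(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})')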
Example #3
    def __call__(self, s1, context):
        s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
        s2.schost = s1.schost
        s2.source = s1.source
        s2.source_metadata['kba-2012'] = s1.source_metadata

        logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))

        #logger.critical(repr(s2))

        s2.body = ContentItem(
            raw=s1.body.raw,
            encoding=s1.body.encoding,
            ## default, might get overwritten below
            media_type='text/html',
            taggings={
                'stanford':
                Tagging(
                    tagger_id='stanford',
                    raw_tagging=s1.body.ner,
                    generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                    tagger_config=
                    'annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, properties: pos.maxlen=100',
                    tagger_version='Stanford CoreNLP ver 1.2.0',
                )
            })

        if self.config['keep_old_cleansed_as_clean_visible']:
            s2.body.clean_visible = s1.body.cleansed

        if s1.source == 'social':
            s2.body.media_type = 'text/plain'
            ## the separation of content items in the social stream
            ## was artificial and annoying, so smoosh them together
            s2.body.clean_visible = '\n\n'.join(
                [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

            changed_body_raw = False
            if s1.title and s1.title.raw:
                s2.body.raw = s1.title.raw
                ## note: r'\n\n' is a raw string, so this appends the literal
                ## characters backslash-n rather than real blank lines
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if s1.anchor and s1.anchor.raw:
                s2.body.raw += s1.anchor.raw
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if changed_body_raw:
                s2.body.raw += s1.body.raw

        if s1.title:
            ci = ContentItem(
                raw=s1.title.raw,
                encoding=s1.title.encoding,
                clean_visible=s1.title.cleansed,
            )
            s2.other_content['title'] = ci
        if s1.anchor:
            ci = ContentItem(raw=s1.anchor.raw,
                             encoding=s1.anchor.encoding,
                             clean_visible=s1.anchor.cleansed)
            s2.other_content['anchor'] = ci
        return s2
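
## A hypothetical driver for the transform above, not part of the original
## example.  Here UpgradeKba2012 stands in for whatever class owns the
## __call__ shown above, and constructing it directly with a config dict is
## an assumption; streamcorpus_pipeline normally builds transforms from its
## YAML configuration.
from streamcorpus import Chunk

upgrade = UpgradeKba2012(config={'keep_old_cleansed_as_clean_visible': False})
for old_si in Chunk(path='kba-2012-old.sc'):
    new_si = upgrade(old_si, context={})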
Example #4
class from_serifxml(Configured):
    '''Read a Serif XML intermediate file as the input to the pipeline.

    This is a specialized reader for unusual circumstances; you will
    still need to run :class:`~streamcorpus_pipeline._serif.serif`
    with special settings to complete the tagging.  This expects to
    find serifxml flat files in a directory and creates a
    :class:`~streamcorpus.Tagging` with
    :attr:`~streamcorpus.Tagging.raw_tagging` holding the serifxml
    string.  This :class:`~streamcorpus.Tagging` is stored in
    :attr:`~streamcorpus.StreamItem.body.taggings`.

    This also fills in the :attr:`~streamcorpus.ContentItem.raw` field.

    This has one configuration option, which can usually be left at
    its default value:

    .. code-block:: yaml

        streamcorpus_pipeline:
          from_serifxml:
            tagger_id: serif

    `tagger_id` is the tagger name in the generated
    :class:`~streamcorpus.StreamItem`.

    To obtain :attr:`~streamcorpus.StreamItem.body.sentences`, one
    must run Serif in the special `read_serifxml` mode:

    .. code-block:: yaml

        streamcorpus_pipeline:
          third_dir_path: /third
          tmp_dir_path: tmp
          reader: from_serifxml
          incremental_transforms:
          - language
          - guess_media_type
          - clean_html
          - clean_visible
          - title
          batch_transforms:
          - serif
          language:
            force:
              name: English
              code: en
          guess_media_type:
            fallback_media_type: text/plain
          serif:
            path_in_third: serif/serif-latest
            serif_exe: bin/x86_64/Serif
            par: streamcorpus_read_serifxml
            par_additions:
              streamcorpus_read_serifxml:
              - "# example additional line"
          writer: to_local_chunks
          to_local_chunks:
            output_type: otherdir
            output_path: test_output
            output_name: "%(input_fname)s"

    '''
    config_name = 'from_serifxml'
    default_config = {
        'tagger_id': 'serif',
    }

    def __call__(self, i_str):
        # Read in the entire contents as text; we will need to
        # save it away later
        with open(i_str, 'r') as f:
            serifxml = f.read()

        fname = os.path.basename(i_str)
        stream_time = None
        date_m = date_in_file_name_re.match(fname)
        if date_m:
            year = int(date_m.group('year'))
            month = int(date_m.group('month'))
            day = int(date_m.group('day'))
            try:
                stream_time = streamcorpus.make_stream_time(
                    zulu_timestamp='%d-%02d-%02dT00:00:01.000000Z' %
                    (year, month, day))
            except Exception as exc:
                logger.info(
                    'trapped failed parsing of file name to make stream_time',
                    exc_info=True)
                stream_time = None

        if not stream_time:
            ## fall back to using the present moment on this system
            epoch_ticks = time.time()  ### NOT IN THE SERIFXML FILE
            stream_time = streamcorpus.make_stream_time(
                epoch_ticks=epoch_ticks)

        # Parse the XML
        root = etree.fromstring(serifxml)

        # Get some key parts
        doc_id = root.xpath('string(/SerifXML/Document/@docid)')
        source = root.xpath('string(/SerifXML/Document/@source_type)')
        raw = root.xpath('string(/SerifXML/Document/OriginalText/Contents)')

        # Build the streamitem
        tagging = streamcorpus.Tagging(
            tagger_id=self.config['tagger_id'],
            raw_tagging=serifxml,
        )
        body = streamcorpus.ContentItem(
            raw=raw,
            taggings={
                self.config['tagger_id']: tagging,
            },
        )
        si = streamcorpus.StreamItem(
            version=streamcorpus.Versions.v0_3_0,
            doc_id=doc_id,
            abs_url=fname,
            source=source,
            body=body,
            stream_id='%d-%s' % (stream_time.epoch_ticks, doc_id),
            stream_time=stream_time,
        )
        yield si
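
## Hypothetical usage sketch for the reader above.  Readers in this pipeline
## are callables that yield StreamItems for an input string, here assumed to
## be the path to a flat serifxml file; constructing the reader directly with
## a config dict is an assumption, since the pipeline normally does this from
## its YAML configuration.
reader = from_serifxml(config={'tagger_id': 'serif'})
for si in reader('/data/serifxml/news-2012-05-01.serifxml'):
    print(si.stream_id)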
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = i_chunk.__iter__()
    ner = ''
    stream_id = None

    all_ner = xml.dom.minidom.parse(open(tmp_ner_path))

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        
        stream_item = input_iter.next()
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id
        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        # first map all coref chains to their words
        equiv_ids = collections.defaultdict(lambda: set())
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## detect 'smith' in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        ## debug output: number of coref chains identified as John Smith
        print(len(johnsmiths))
        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]                

        stream_item.body.sentences[tagger_id] = sentences
        
        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file
    open(tmp_done_path, 'wb').write(str(o_chunk))
    ## replace this with log.info()
    print('created %s' % tmp_done_path)
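
## Hypothetical invocation sketch for the function above; the file names are
## made up, and Chunk is the streamcorpus chunk reader used elsewhere in
## these examples.
from streamcorpus import Chunk

i_chunk = Chunk(path='john-smith-input.sc', mode='rb')
align_chunk_with_ner('lingpipe-ner-output.xml', i_chunk, 'john-smith-tagged.sc.done')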
    def align_chunk_with_ner(self, ner_xml_path, i_chunk, o_chunk):
        ''' iterate through ner_xml_path to fuse with i_chunk into o_chunk '''
        ## prepare to iterate over the input chunk
        input_iter = i_chunk.__iter__()

        all_ner = xml.dom.minidom.parse(open(ner_xml_path))

        ## this converts our UTF-8 data into unicode strings, so when
        ## we want to compute byte offsets or construct tokens, we
        ## must .encode('utf8')
        for ner_dom in all_ner.getElementsByTagName('FILENAME'):
            #for stream_id, raw_ner in files(open(ner_xml_path).read().decode('utf8')):

            stream_item = input_iter.next()

            ## get stream_id out of the XML
            stream_id = ner_dom.attributes.get('stream_id').value
            if stream_item.stream_id is None:
                assert not stream_id, 'out of sync: None != %r' % stream_id
                logger.critical('si.stream_id is None... ignoring')
                continue
            assert stream_id and stream_id == stream_item.stream_id, \
                '%s != %s' % (stream_id, stream_item.stream_id)

            if not stream_item.body:
                ## the XML better have had an empty clean_visible too...
                #assert not ner_dom....something
                continue

            tagging = Tagging()
            tagging.tagger_id = self.tagger_id  # pylint: disable=E1101
            '''
            ## get this one file out of its FILENAME tags
            tagged_doc_parts = list(files(ner_dom.toxml()))
            if not tagged_doc_parts:
                continue

            tagged_doc = tagged_doc_parts[0][1]

            ## hack
            hope_original = make_clean_visible(tagged_doc, '')
            open(ner_xml_path + '-clean', 'wb').write(hope_original.encode('utf-8'))
            print ner_xml_path + '-clean'
            '''

            #tagging.raw_tagging = tagged_doc
            tagging.generation_time = streamcorpus.make_stream_time()
            stream_item.body.taggings[self.tagger_id] = tagging  # pylint: disable=E1101

            ## could consume lots of memory here by instantiating everything
            sentences, relations, attributes = self.get_sentences(ner_dom)
            stream_item.body.sentences[self.tagger_id] = sentences  # pylint: disable=E1101
            stream_item.body.relations[self.tagger_id] = relations  # pylint: disable=E1101
            stream_item.body.attributes[self.tagger_id] = attributes  # pylint: disable=E1101

            logger.debug('finished aligning tokens %s' % stream_item.stream_id)
            '''
            for num, sent in enumerate(sentences):
                for tok in sent.tokens:
                    print '%d\t%d\t%s' % (num, tok.offsets[OffsetType.LINES].first, repr(tok.token))
            '''

            if 'align_labels_by' in self.config and self.config[
                    'align_labels_by']:
                assert 'aligner_data' in self.config, 'config missing "aligner_data"'
                aligner = AlignmentStrategies[self.config['align_labels_by']]
                aligner(stream_item, self.config['aligner_data'])

            ## forcibly collect dereferenced objects
            gc.collect()

            try:
                o_chunk.add(stream_item)
            except MemoryError as exc:
                msg = traceback.format_exc()
                msg += make_memory_info_msg()
                logger.critical(msg)
                raise PipelineOutOfMemory(msg)
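
## The label-alignment step above is driven by two configuration keys.  A
## sketch of what that configuration might contain; the strategy name and the
## aligner_data payload are assumptions for illustration only.
config = {
    'align_labels_by': 'names_in_chains',        # assumed key into AlignmentStrategies
    'aligner_data': {'annotator_id': 'author'},  # assumed payload handed to the aligner
}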
        count = 0
        for i_str in args.purge:
            if not args.quiet:
                print ("purging: %r" % i_str.strip())
            tq.purge(i_str)
            count += 1

        print ("Done purging %d strs" % count)

    if args.counts:
        if args.detailed:
            counts = tq.counts_detailed
        else:
            counts = tq.counts
        print ("\ncounts for %s at %s" % (namespace, tq.addresses))
        print (repr(make_stream_time()))
        print ("\n".join(["\t%s:\t%s" % (k, v) for k, v in counts.items()]))

        available_pending_completed = sum([counts[k] for k in ["available", "pending", "completed"]])
        print ("%d len(available+pending+completed)" % available_pending_completed)
        print ("%d len(tasks)" % counts["tasks"])
        print ("%d missing" % (counts["tasks"] - available_pending_completed))

    if args.list_details:
        data = tq.details(args.list_details)
        # print '\n'.join(data['results'])
        print(data)

    if args.list_completed:
        for completed in tq.completed:
            print("#%s\n%s" % (completed["i_str"], "\n".join(completed["results"])))
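
## make_stream_time, used throughout these examples, accepts a zulu_timestamp
## string or epoch_ticks (or no argument, meaning "now") and returns a
## stream-time object whose zulu_timestamp and epoch_ticks fields are
## populated consistently with each other.
from streamcorpus import make_stream_time

st = make_stream_time(zulu_timestamp='2012-05-01T00:00:01.000000Z')
print('%s -> %r' % (st.zulu_timestamp, st.epoch_ticks))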