Code example #1
def test_title():
    
    stage = title({})
    cv = clean_visible({})

    si = make_stream_item(0, '')
    si.body.clean_html = '''Then there
was a
<tag>   ...  <title>TITLE
HERE
  </title>
'''
    si = cv(si, {})
    si = stage(si)
    assert si.other_content['title'].clean_visible == 'TITLE HERE'
    
    si = make_stream_item(0, '')
    si.body.clean_html = '''Then there
was a
  that went <tag>   ...  <title>TITLE
HERE%s
  </title>
''' % ('*' * 80)
    si = cv(si, {})
    si = stage(si)
    assert si.other_content['title'].clean_visible == 'TITLE HERE' + '*' * 50 + '...'
Code example #2
def test_matcher():

    config = dict(
        ## command to run
        fpat_path="cat"
    )

    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")

    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()

    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")

    SIs = list(ch)

    ## verify the si has expected things
    for si in SIs:
        assert len(si.body.labels) == 1

    for i in range(2):
        print SIs[i].ratings
Code example #3
def test_filter_domains(tmpdir):

    domains_path = tmpdir.join('domains_path.txt')
    domains_path.write('cats.com\nhttp://birds.com/')
    
    stage = filter_domains(dict(
            include_domains = ['dogs.com'],
            include_domains_path = str(domains_path),
            ))

    assert stage.domains == set(['dogs.com', 'cats.com', 'birds.com'])

    si = make_stream_item(0, 'http://dogs.com/')
    assert stage(si) is not None

    si = make_stream_item(0, 'http://cats.com/')
    assert stage(si) is not None

    si = make_stream_item(0, 'http://birds.com/')
    assert stage(si) is not None

    si = make_stream_item(0, 'http://things.com/')
    assert stage(si) is None

    si = make_stream_item(0, 'http://things.com/')
    si.schost = 'https://birds.com'
    assert domain_name_cleanse(si.schost) == 'birds.com'
    assert stage(si) is not None
Code example #4
def test_title():

    stage = title({})
    cv = clean_visible({})

    si = make_stream_item(0, '')
    si.body.clean_html = '''Then there
was a
<tag>   ...  <title>TITLE
HERE
  </title>
'''
    si = cv(si, {})
    si = stage(si)
    assert si.other_content['title'].clean_visible == 'TITLE HERE'

    si = make_stream_item(0, '')
    si.body.clean_html = '''Then there
was a
  that went <tag>   ...  <title>TITLE
HERE%s
  </title>
''' % ('*' * 80)
    si = cv(si, {})
    si = stage(si)
    assert si.other_content[
        'title'].clean_visible == 'TITLE HERE' + '*' * 50 + '...'
Code example #5
def test_ft_with_stream_item():
    si = streamcorpus.make_stream_item('2005-01-01T05:06:07.0Z', 'abc')
    tokens = [
        streamcorpus.Token(offsets={
            streamcorpus.OffsetType.XPATH_CHARS: streamcorpus.Offset(
                type=streamcorpus.OffsetType.XPATH_CHARS,
                xpath='/html[1]/body[1]/p[1]/b[1]/text()[1]',
                first=0,
                xpath_end='/html[1]/body[1]/p[1]/text()[1]',
                xpath_end_offset=2),
        }),
        streamcorpus.Token(offsets={
            streamcorpus.OffsetType.XPATH_CHARS: streamcorpus.Offset(
                type=streamcorpus.OffsetType.XPATH_CHARS,
                xpath='/html[1]/body[1]/p[1]/b[2]/text()[1]',
                first=0,
                xpath_end='/html[1]/body[1]/p[1]/text()[2]',
                xpath_end_offset=4),
        }),
    ]
    si.body.sentences = {'test': [streamcorpus.Sentence(tokens=tokens)]}
    si.body.clean_html = '<html><body><p><b>T</b>om ' \
                         '<b>B</b>rady</p></body></html>'
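    # the two XPATH_CHARS offsets above span the bold '<b>T</b>' / '<b>B</b>'
    # elements plus the start of the following text nodes, so slicing
    # clean_html by xpath recovers 'Tom' and 'Brady'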

    ft = FeatureTokens()
    ft['tom brady'].append([('test', 0, 0), ('test', 0, 1)])
    assert next(ft.xpath_slices(si, 'tom brady')) == 'Tom Brady'
Code example #6
def test_fix_text(test_data_dir):
    fpath = path.join(test_data_dir, 'test/microsoft-quotes.txt')
    si = make_stream_item(None, 'test')
    si.body = ContentItem(raw=open(fpath).read())
    fixer = fix_text(config={'read_from': 'raw', 'write_to': 'clean_visible'})
    fixer(si, {})
    assert(si.body.clean_visible.strip() == 'Do not "quote me."')
Code example #7
    def _make_stream_item(cls, path, metadata, abs_url, entities):
        '''
        
        '''
        ## Every StreamItem has a stream_time property.  It usually comes
        ## from the document creation time.
        creation_time = os.path.getctime(path)

        ## make stream item
        stream_item = streamcorpus.make_stream_item(
            creation_time,
            abs_url)

        stream_item.source = metadata.get('source')

        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        body.media_type = magic.from_file(path, mime=True)
        
        logger.info('opening %r', path)
        with open(path) as f:
            body.raw = f.read()

        ## attach the content_item to the stream_item
        stream_item.body = body

        ## annotations
        anno = streamcorpus.Annotator()
        anno.annotator_id = metadata['annotator_id']
        anno.annotation_time = stream_item.stream_time

        num_ratings = 0
        for entity, is_profile in entities:
            num_ratings += 1

            ## pull out target id and mention tokens
            target_id = str(entity['target_id'])

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id = target_id)
            rating.contains_mention = True

            if is_profile:
                rating.flags = [streamcorpus.FlagType.PROFILE]

            ## parse slots in yaml file
            slots = cls._parse_slots(entity['slots'])

            ## heuristically split the slots string on white space and
            ## use each token as a separate mention.
            rating.mentions = [cleanse(unicode(slot[1], 'utf-8')) for slot in slots]

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

        ## provide this stream_item to the pipeline
        logger.info('created StreamItem(num ratings=%d, abs_url=%r)', num_ratings, stream_item.abs_url)
        return stream_item
Code example #8
    def process_response(self, resp):
        logger.info('retrieved %d bytes for %r', len(resp.content), resp.url)

        last_modified = resp.headers.get('last-modified')
        if last_modified:
            try:
                last_modified = dateutil.parser.parse(last_modified)
                last_modified = int(last_modified.strftime('%s'))
            except Exception:
                last_modified = None
        if not last_modified:
            last_modified = int(time.time())
        si = streamcorpus.make_stream_item(last_modified, resp.url)
        # don't try to convert it... e.g. if we got a PDF
        si.original_url = resp.url
        si.body.raw = resp.content
        media_type = resp.headers.get('content-type')
        try:
            media_type = (media_type.decode('utf8',
                                            'ignore').encode('utf8', 'ignore'))
        except Exception:
            media_type = repr(media_type)
        si.body.media_type = media_type
        si.body.encoding = resp.apparent_encoding

        return si
Code example #9
def generate_stream_items_from_kba_json(json_file_path):
    ## iterate over gzip'ed file of JSON lines
    data = gzip.GzipFile(fileobj=open(json_file_path, 'rb'), mode='rb').read()
    for line in data.splitlines():
        try: 
            doc = json.loads(line)
        except Exception, exc: 
            print('trapped: %s' % traceback.format_exc(exc))
            print('continuing')
            continue

        assert doc['source'] == 'social', doc['source']

        ## make a StreamItem with valid StreamTime computed from
        ## zulu_timestamp.  This will fix the four-hour offsets in
        ## some of the KBA 2012 files.
        stream_item = make_stream_item(
            doc['stream_time']['zulu_timestamp'],
            bytes(doc['abs_url'].encode('utf-8'))
            )

        ## capture schost and source
        stream_item.schost = doc.pop('schost')
        stream_item.source = doc.pop('source')

        ## assemble source_metadata
        stream_item.source_metadata['kba-2012'] = json.dumps(doc.pop('source_metadata'))
        
        ## might have a funky original URL
        stream_item.original_url = doc['original_url'] and \
            bytes(doc['original_url'].encode('utf-8')) or b''

        ## get the three possible ContentItems
        body   = doc.pop('body',   {}).pop('raw', '').decode('string-escape')
        title  = doc.pop('title',  {}).pop('raw', '').decode('string-escape')
        anchor = doc.pop('anchor', {}).pop('raw', '').decode('string-escape')

        stream_item.body = ContentItem(
            raw = b''.join(['<p>', anchor, '</p>',
                            '<p>', title, '</p>',
                            body]),
            media_type = 'text/html',
            encoding = 'UTF-8',
            )

        if title:
            stream_item.other_content['title']  = ContentItem(
                raw = title,
                media_type = 'text/html',
                encoding = 'UTF-8',
                )

        if anchor:
            stream_item.other_content['anchor']  = ContentItem(
                raw = anchor,
                media_type = 'text/html',
                encoding = 'UTF-8',
                )

        yield stream_item
Code example #10
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'
    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
                Token(token='This'),
                Token(token='-LRB-big-RRB- dog'),
                Token(token='Jake'),
                Token(token='has'),
                Token(token='no'),
                Token(token=u'\u1F601'.encode('utf8')),
                Token(token='...'),
                Token(token='Teeth'),
                ])]
    rating = Rating(annotator=Annotator(annotator_id=annotator_id),
           target=Target(target_id=target_id),
           mentions=['Big dog! Jake... ', u'\u1F601 Teeth'.encode('utf8')],
           )
    add_annotation(si, rating)
    aligner_data = dict(
        tagger_id = tagger_id,
        annotator_id = annotator_id,
        )
                               
    multi_token_match(si, aligner_data)

    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
Code example #11
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = '''
Flying dogs are amazing.
The flight of the super dog Sam Vroomvroom is often cited as the first such flying dog.
'''

    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        canonical_selector=topic_name.lower(
        ),  # this is the key for making it appear for a profile of this title
        offsets={
            OffsetType.CHARS:
            Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
Code example #12
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'
    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
            Token(token='This'),
            Token(token='-LRB-big-RRB- dog'),
            Token(token='Jake'),
            Token(token='has'),
            Token(token='no'),
            Token(token=u'\u1F601'.encode('utf8')),
            Token(token='...'),
            Token(token='Teeth'),
        ])
    ]
    rating = Rating(
        annotator=Annotator(annotator_id=annotator_id),
        target=Target(target_id=target_id),
        mentions=['Big dog! Jake... ', u'\u1F601 Teeth'.encode('utf8')],
    )
    add_annotation(si, rating)
    aligner_data = dict(
        tagger_id=tagger_id,
        annotator_id=annotator_id,
    )

    multi_token_match(si, aligner_data)

    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
Code example #13
def test_kvlayer_simple(configurator, tmpdir):
    si = streamcorpus.make_stream_item('2000-01-01T12:34:00.000123Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'stream_items': 2})
        print repr(list(kvlclient.scan_keys('stream_items')))
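        # the stream_items key is a two-part UUID: 946730040 is the epoch_ticks
        # of 2000-01-01T12:34:00Z, and the hex UUID is presumably the md5 of
        # the abs_url 'test://test.stream.item/'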
        for (k,v) in kvlclient.get(
                'stream_items',
                (uuid.UUID(int=946730040),
                 uuid.UUID(hex='985c1e3ed73256cd9a399919fe93cf76'))):
            assert v is not None

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
Code example #14
def test_ft_with_stream_item():
    si = streamcorpus.make_stream_item('2005-01-01T05:06:07.0Z', 'abc')
    tokens = [
        streamcorpus.Token(
            offsets={
                streamcorpus.OffsetType.XPATH_CHARS:
                streamcorpus.Offset(
                    type=streamcorpus.OffsetType.XPATH_CHARS,
                    xpath='/html[1]/body[1]/p[1]/b[1]/text()[1]',
                    first=0,
                    xpath_end='/html[1]/body[1]/p[1]/text()[1]',
                    xpath_end_offset=2),
            }),
        streamcorpus.Token(
            offsets={
                streamcorpus.OffsetType.XPATH_CHARS:
                streamcorpus.Offset(
                    type=streamcorpus.OffsetType.XPATH_CHARS,
                    xpath='/html[1]/body[1]/p[1]/b[2]/text()[1]',
                    first=0,
                    xpath_end='/html[1]/body[1]/p[1]/text()[2]',
                    xpath_end_offset=4),
            }),
    ]
    si.body.sentences = {'test': [streamcorpus.Sentence(tokens=tokens)]}
    si.body.clean_html = '<html><body><p><b>T</b>om ' \
                         '<b>B</b>rady</p></body></html>'

    ft = FeatureTokens()
    ft['tom brady'].append([('test', 0, 0), ('test', 0, 1)])
    assert next(ft.xpath_slices(si, 'tom brady')) == 'Tom Brady'
Code example #15
File: fetcher.py  Project: mrG7/dossier.models
    def process_response(self, resp):
        logger.info("retrieved %d bytes for %r", len(resp.content), resp.url)

        last_modified = resp.headers.get("last-modified")
        if last_modified:
            try:
                last_modified = dateutil.parser.parse(last_modified)
                last_modified = int(last_modified.strftime("%s"))
            except Exception:
                last_modified = None
        if not last_modified:
            last_modified = int(time.time())
        si = streamcorpus.make_stream_item(last_modified, resp.url)
        # don't try to convert it... e.g. if we got a PDF
        si.original_url = resp.url
        si.body.raw = resp.content
        media_type = resp.headers.get("content-type")
        try:
            media_type = media_type.decode("utf8", "ignore").encode("utf8", "ignore")
        except Exception:
            media_type = repr(media_type)
        si.body.media_type = media_type
        si.body.encoding = resp.apparent_encoding

        return si
Code example #16
def fetch_all(urls, out_dir):
    session = requests.Session()
    roller = streamcorpus.ChunkRoller(out_dir)
    start = time.time()
    try:
        for idx, url in enumerate(urls):
            print('# starting fetch of %r' % url)
            sys.stdout.flush()
            resp = requests.get(url, headers=headers)
            last_modified = resp.headers.get('last-modified')
            if last_modified:
                try:
                    last_modified = int(last_modified)
                except:
                    dt = dateutil.parser.parse(last_modified)
                    last_modified = int(dt.strftime('%s'))
            si = streamcorpus.make_stream_item(last_modified or time.time(), url)
            si.body.raw = resp.content
            si.body.media_type = resp.headers.get('content-type')
            si.body.encoding = resp.encoding
            roller.add(si)
            print('fetched %d bytes for %s with last_modified=%r' % (len(si.body.raw), url, last_modified))
            if idx % 10 == 0:
                elapsed = time.time() - start
                rate = (idx + 1) / elapsed
                remaining = (len(urls) - 1 - idx) / rate / 3600
                print('%d of %d done in %.3f seconds --> %.3f per second --> %.3f hours remaining' % ((idx + 1), len(urls), elapsed, rate, remaining))
                sys.stdout.flush()
    except:
        roller.close()
        raise
    roller.close()
Code example #17
def make_test_stream_item():
    stream_item = make_stream_item(None, 'http://nytimes.com/')
    stream_item.body = ContentItem()
    path = os.path.dirname(__file__)
    path = os.path.join( path, _TEST_DATA_ROOT, 'test', 
                         'nytimes-index-clean-stable.html')
    stream_item.body.clean_html = open(path).read()
    return stream_item
Code example #18
def test_get_name_info(tmpdir):

    path = str(tmpdir.join('test_path'))
    c = Chunk(path, mode='wb')
    c.add(make_stream_item(28491, 'abs_url'))

    name_info = get_name_info(path, i_str='foo')
    assert name_info['date_now'] == name_info['date_time_now'][:10]
    assert name_info['date_now'] + '-' + name_info['time_now'] == name_info['date_time_now']
Code example #19
def test_langauge(test_data_dir):
    path = os.path.join(test_data_dir, 'test/raw-unicode-issues.html')
    si = make_stream_item(None, 'test')
    si.body = ContentItem(raw=open(path).read())
    context = {}
    lang = language(config={})
    lang(si, context)

    assert si.body.language.name == 'Japanese'
    assert si.body.language.code == 'ja'
Code example #20
def setup_nltk(text, run_extractor=True):
    si = make_stream_item(0, '')
    si.body.clean_visible = text
    nt = nltk_tokenizer({})
    nt(si, {})

    regex_extractor = structured_features(structured_features_config)
    if run_extractor:
        regex_extractor(si)
    return si
Code example #21
def test_langauge(test_data_dir):
    path = os.path.join(test_data_dir, 'test/raw-unicode-issues.html')
    si = make_stream_item(None, 'test')
    si.body = ContentItem(raw=open(path).read())
    context = {}
    lang = language(config={})
    lang(si, context)

    assert si.body.language.name == 'Japanese'
    assert si.body.language.code == 'ja'
Code example #22
def _make_stream_item(entry):
    """Given a single spinn3r feed entry, produce a single StreamItem.

    Returns 'None' if a complete item can't be constructed.

    """
    # get standard metadata, assuming it's present...
    if not hasattr(entry, 'permalink_entry'):
        return None
    pe = entry.permalink_entry

    # ...and create a streamitem...
    si = streamcorpus.make_stream_item(
        pe.date_found[:-1] + '.0Z',
        pe.canonical_link.href.encode('utf8'))
    if not si.stream_time:
        logger.debug('failed to generate stream_time from {0!r}'
                     .format(pe.date_found))
        return None
    if not si.abs_url:
        logger.debug('failed to generate abs_url from {0!r}'
                     .format(pe.canonical_link.href))
        return None

    # ...filling in the actual data
    si.body = _make_content_item(
        pe.content,
        alternate_data=entry.feed_entry.content.data)
    if not si.body: return None
    if not si.body.raw: return None

    if pe.content_extract.data:
        si.other_content['extract'] = _make_content_item(pe.content_extract)
    si.other_content['title'] = streamcorpus.ContentItem(
        raw=pe.title.encode('utf8'),
        media_type=pe.content_extract.mime_type,
        encoding='UTF-8')
    si.other_content['feed_entry_title'] = streamcorpus.ContentItem(
        raw=entry.feed_entry.title.encode('utf8'),
        media_type=entry.feed_entry.content.mime_type,
        encoding='UTF-8')
    if entry.feed_entry.content.data:
        si.other_content['feed_entry'] = _make_content_item(
            entry.feed_entry.content)
    si.source_metadata['lang'] = pe.lang[0].code
    si.source_metadata['author'] = json.dumps( 
        dict(
            name = pe.author[0].name,
            email = pe.author[0].email,
            link = pe.author[0].link[0].href,
        )
    )
    si.source = entry.source.publisher_type
    return si
Code example #23
 def _get_stream_item(self, item):
     stream_item = streamcorpus.make_stream_item(time.time(), item['url'])
     stream_item.body.raw = self._encode(item.get('body', u''))
     stream_item.body.media_type = self._get_media_type(item)
     stream_item.body.encoding = self.encoding
     stream_item.original_url = item.get('source_url')
     meta = self._get_metadata(item)
     stream_item.body.language = streamcorpus.Language(
         code=meta.get('language_code', '?'),
         name=meta.get('language_name', '?'))
     stream_item.source_metadata = meta
     return stream_item
Code example #24
def _generate_stream_items(protobuf_data):
    '''
    converts all of the protobuf_data spinn3r protoStream format into
    StreamItems, which it yields as a generator
    '''
    ## iterate over entry objects and bytes from which they came
    for num, (entry, delimited_bytes) in enumerate(delimited_messages(protobuf_data)):
        #print num
        if entry is None:
            ## hit end of data
            continue

        if not hasattr(entry, 'permalink_entry'):
            #print 'missing permalink_entry'
            continue

        pe = entry.permalink_entry

        ## verify our understanding of the kludgy link data
        #assert pe.link[0].href[:len(pe.link[0].resource)] == pe.link[0].resource, \
        #    (pe.link[0].href, pe.link[0].resource)
        #assert pe.link[0].href == pe.canonical_link.href
        #assert pe.canonical_link.href.startswith(pe.canonical_link.resource), \
        #    (pe.canonical_link.href, pe.canonical_link.resource)

        ## create a StreamItem for this date_found, canonical_link
        si = make_stream_item(
            pe.date_found[:-1] + '.0Z',
            pe.canonical_link.href.encode('utf8'))

        if not si.stream_time:
            print 'failed to generate stream_time from: %r' % pe.date_found
            continue

        if not si.abs_url:
            print 'failed to generate abs_url from: %r' % pe.canonical_link.href
            continue

        if not pe.content.data:
            continue

        try:
            raw = zlib.decompress(pe.content.data)
            assert raw
        except Exception, exc:
            #print('failed to get anything from decompressing pe.content.data')
            try:
                raw = zlib.decompress(entry.feed_entry.content.data)
                assert raw
            except Exception, exc:
                #print('failed to get anything from decompressing entry.feed_entry.content.data')
                #print('empty entry? %s' % entry)
                continue
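
        ## --- not part of the original snippet, which is cut off above; an
        ## assumed continuation following the other spinn3r examples in this
        ## collection (ContentItem assumed imported from streamcorpus) ---
        si.body = ContentItem(raw=raw)
        yield si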
Code example #25
 def _get_stream_item(self, item):
     stream_item = streamcorpus.make_stream_item(time.time(), item['url'])
     stream_item.body.raw = self._encode(item.get('body', u''))
     stream_item.body.media_type = self._get_media_type(item)
     stream_item.body.encoding = self.encoding
     stream_item.original_url = item.get('source_url')
     meta = self._get_metadata(item)
     stream_item.body.language = streamcorpus.Language(
         code=meta.get('language_code', '?'),
         name=meta.get('language_name', '?'))
     stream_item.source_metadata = meta
     return stream_item
Code example #26
def test_langauge():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, "test/raw-unicode-issues.html")
    si = make_stream_item(None, "test")
    si.body = ContentItem(raw=open(path).read())

    lang = _init_stage("language", {})
    context = {}
    lang(si, context)

    assert si.body.language.name == "Japanese"
    assert si.body.language.code == "ja"
Code example #27
def _make_stream_item(entry):
    """Given a single spinn3r feed entry, produce a single StreamItem.

    Returns 'None' if a complete item can't be constructed.

    """
    # get standard metadata, assuming it's present...
    if not hasattr(entry, 'permalink_entry'):
        return None
    pe = entry.permalink_entry

    # ...and create a streamitem...
    si = streamcorpus.make_stream_item(pe.date_found[:-1] + '.0Z',
                                       pe.canonical_link.href.encode('utf8'))
    if not si.stream_time:
        logger.debug('failed to generate stream_time from {0!r}'.format(
            pe.date_found))
        return None
    if not si.abs_url:
        logger.debug('failed to generate abs_url from {0!r}'.format(
            pe.canonical_link.href))
        return None

    # ...filling in the actual data
    si.body = _make_content_item(pe.content,
                                 alternate_data=entry.feed_entry.content.data)
    if not si.body: return None
    if not si.body.raw: return None

    if pe.content_extract.data:
        si.other_content['extract'] = _make_content_item(pe.content_extract)
    si.other_content['title'] = streamcorpus.ContentItem(
        raw=pe.title.encode('utf8'),
        media_type=pe.content_extract.mime_type,
        encoding='UTF-8')
    si.other_content['feed_entry_title'] = streamcorpus.ContentItem(
        raw=entry.feed_entry.title.encode('utf8'),
        media_type=entry.feed_entry.content.mime_type,
        encoding='UTF-8')
    if entry.feed_entry.content.data:
        si.other_content['feed_entry'] = _make_content_item(
            entry.feed_entry.content)
    si.source_metadata['lang'] = pe.lang[0].code
    si.source_metadata['author'] = json.dumps(
        dict(
            name=pe.author[0].name,
            email=pe.author[0].email,
            link=pe.author[0].link[0].href,
        ))
    si.source = entry.source.publisher_type
    return si
Code example #28
def test_chunk_roller(tmpdir):

    cr = ChunkRoller(str(tmpdir), chunk_max=10)

    for i in range(25):
        si = make_stream_item(i, str(i))
        
        cr.add(si)

    cr.close()

    files = []
    for fname in os.listdir(str(tmpdir)):
        assert 'tmp' not in fname
        count = int(fname.split('-')[0])
        files.append(count)

    assert sorted(files) == [5, 10, 10]
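    # 25 items rolled with chunk_max=10 yield two full chunks of 10 plus a
    # final chunk of 5; the item count is the first '-'-separated field of
    # each chunk file name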
Code example #29
def test_kvlayer_negative(configurator, tmpdir):
    si = streamcorpus.make_stream_item('1969-07-20T20:18:00.000000Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
Code example #30
def extract_user_names(clean_visible):
    '''also renamed to usernames2'''

    if isinstance(clean_visible, unicode):
        clean_visible = clean_visible.encode('utf8')

    si = make_stream_item(0, '')
    si.body.clean_visible = clean_visible
    
    xform = nltk_tokenizer({})
    xform.process_item(si)

    classifier = Classifier('naivebayes')
    sc = classifier.build_feature(si)

    logger.info('found usernames: %r', sc)
    open('/tmp/found.txt', 'wb').write('\n'.join(map(lambda x: x.encode('utf8'), sc.keys())))

    return sc
Code example #31
    def __call__(self, s1, context):
        s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
        s2.schost = s1.schost
        s2.source = s1.source
        s2.source_metadata['kba-2012'] = s1.source_metadata

        logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))

        #logger.critical(repr(s2))

        s2.body = ContentItem(
            raw=s1.body.raw,
            encoding=s1.body.encoding,
            ## default, might get overwritten below
            media_type='text/html',
            taggings={
                'stanford':
                Tagging(
                    tagger_id='stanford',
                    raw_tagging=s1.body.ner,
                    generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                    tagger_config=
                    'annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, properties: pos.maxlen=100',
                    tagger_version='Stanford CoreNLP ver 1.2.0',
                )
            })

        if self.config['keep_old_cleansed_as_clean_visible']:
            s2.body.clean_visible = s1.body.cleansed

        if s1.source == 'social':
            s2.body.media_type = 'text/plain'
            ## the separation of content items in the social stream
            ## was artificial and annoying, so smoosh them together
            s2.body.clean_visible = '\n\n'.join(
                [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

            changed_body_raw = False
            if s1.title and s1.title.raw:
                s2.body.raw = s1.title.raw
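                ## note: r'\n\n' is a raw string, so the next line appends a
                ## literal backslash-n pair rather than blank lines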
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if s1.anchor and s1.anchor.raw:
                s2.body.raw += s1.anchor.raw
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if changed_body_raw:
                s2.body.raw += s1.body.raw

        if s1.title:
            ci = ContentItem(
                raw=s1.title.raw,
                encoding=s1.title.encoding,
                clean_visible=s1.title.cleansed,
            )
            s2.other_content['title'] = ci
        if s1.anchor:
            ci = ContentItem(raw=s1.anchor.raw,
                             encoding=s1.anchor.encoding,
                             clean_visible=s1.anchor.cleansed)
            s2.other_content['anchor'] = ci
        return s2
Code example #32
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
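    ## 915148799 is the Unix epoch time corresponding to 1998-12-31T23:59:59Z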
    correct_time = 915148799

    if not path_to_original.startswith('/'):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):

        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time, 
                ## make up an abs_url
                os.path.join(
                    'john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException('wrong stream_time construction: %r-->%r != %r'\
                                            % (creation_time, stream_item.stream_time.epoch_ticks,
                                               correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en', name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id = str(label_id)) # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
Code example #33
def make_test_stream_item(test_data_dir):
    stream_item = make_stream_item(None, "http://nytimes.com/")
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, "test", "nytimes-index-clean-stable.html")
    stream_item.body.clean_html = open(str(path)).read()
    return stream_item
Code example #34
    def _make_stream_item(self, dir_path, fname):

        ## could use dirpath as the label.  Instead, we illustrate
        ## using a TSV file to lookup the ground truth using the fname.
        assert fname in self.ground_truth, (dir_path, fname)

        ## "mention" is the name string from the text
        ## "target_id" is the label
        mention, target_id = self.ground_truth[fname]

        ## Every StreamItem has a stream_time property.  It usually comes
        ## from the document creation time.  Here, we assume the JS corpus
        ## was created at one moment at the end of 1998:
        creation_time = "1998-12-31T23:59:59.999999Z"

        stream_item = streamcorpus.make_stream_item(
            creation_time,
            ## make up an abs_url
            os.path.join("john-smith-corpus", target_id, fname),
        )

        ## These docs came from the authors of the paper cited above.
        stream_item.source = "bagga-and-baldwin"

        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        raw_string = open(os.path.join(dir_path, fname)).read()
        ## We know that this is already clean and has nothing
        ## tricky in it, because we manually cleansed it.  To
        ## illustrate how we stick all strings into thrift, we
        ## convert this to unicode (which introduces no changes)
        ## and then encode it as utf-8, which also introduces no
        ## changes.  Thrift stores strings as 8-bit character
        ## strings.
        # http://www.mail-archive.com/[email protected]/msg00210.html
        body.clean_visible = unicode(raw_string).encode("utf8")

        ## attach the content_item to the stream_item
        stream_item.body = body

        stream_item.body.language = streamcorpus.Language(code="en", name="ENGLISH")

        ## The authors also annotated the corpus
        anno = streamcorpus.Annotator()
        anno.annotator_id = "bagga-and-baldwin"
        anno.annotation_time = stream_item.stream_time

        ## build a Label for the doc-level label:
        rating = streamcorpus.Rating()
        rating.annotator = anno
        rating.target = streamcorpus.Target(target_id=target_id)
        rating.contains_mention = True

        ## heuristically split the mentions string on white space and
        ## use each token as a separate mention.  For other corpora,
        ## this might need to be more sophisticated.
        rating.mentions = map(cleanse, mention.decode("utf8").split())

        ## put this one label in the array of labels
        streamcorpus.add_annotation(stream_item, rating)

        ## provide this stream_item to the pipeline
        return stream_item
Code example #35
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not os.path.isabs(path_to_original):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):

        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join('john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException('wrong stream_time construction: %r-->%r != %r'\
                                            % (creation_time, stream_item.stream_time.epoch_ticks,
                                               correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en',
                                                              name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(
                target_id=str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
Code example #36
    def _make_stream_item(cls, path, metadata, abs_url, entities):
        '''
        
        '''
        ## Every StreamItem has a stream_time property.  It usually comes
        ## from the document creation time.
        creation_time = os.path.getctime(path)

        ## make stream item
        stream_item = streamcorpus.make_stream_item(creation_time, abs_url)

        stream_item.source = metadata.get('source')

        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        body.media_type = magic.from_file(path, mime=True)

        logger.info('opening %r', path)
        with open(path) as f:
            body.raw = f.read()

        ## attach the content_item to the stream_item
        stream_item.body = body

        ## annotations
        anno = streamcorpus.Annotator()
        anno.annotator_id = metadata['annotator_id']
        anno.annotation_time = stream_item.stream_time

        num_ratings = 0
        for entity, is_profile in entities:
            num_ratings += 1

            ## pull out target id and mention tokens
            target_id = str(entity['target_id'])

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id=target_id)
            rating.contains_mention = True

            if is_profile:
                rating.flags = [streamcorpus.FlagType.PROFILE]

            ## parse slots in yaml file
            slots = cls._parse_slots(entity['slots'])

            ## heuristically split the slots string on white space and
            ## use each token as a separate mention.
            rating.mentions = [
                cleanse(unicode(slot[1], 'utf-8')) for slot in slots
            ]

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

        ## provide this stream_item to the pipeline
        logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                    num_ratings, stream_item.abs_url)
        return stream_item
Code example #37
    def __call__(self, s1, context):
        s2 = make_stream_item(s1.stream_time.zulu_timestamp,
                              s1.abs_url)
        s2.schost = s1.schost
        s2.source = s1.source
        s2.source_metadata['kba-2012'] = s1.source_metadata

        logger.debug('len(original .body.raw) = %d' % len( s1.body.raw ))

        #logger.critical(repr(s2))

        s2.body = ContentItem(
            raw = s1.body.raw,
            encoding = s1.body.encoding,
            ## default, might get overwritten below
            media_type = 'text/html',
            taggings = {'stanford': Tagging(
                    tagger_id = 'stanford',
                    raw_tagging = s1.body.ner,
                    generation_time = make_stream_time('2012-06-01T00:00:00.0Z'),
                    tagger_config = 'annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, properties: pos.maxlen=100',
                    tagger_version = 'Stanford CoreNLP ver 1.2.0',
                    )}
            )

        if self.config['keep_old_cleansed_as_clean_visible']:
            s2.body.clean_visible = s1.body.cleansed

        if s1.source == 'social':
            s2.body.media_type = 'text/plain'
            ## the separation of content items in the social stream
            ## was artificial and annoying, so smoosh them together
            s2.body.clean_visible = '\n\n'.join([
                    s1.title.cleansed,
                    s1.anchor.cleansed,
                    s1.body.cleansed])

            changed_body_raw = False
            if s1.title and s1.title.raw:
                s2.body.raw = s1.title.raw
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if s1.anchor and s1.anchor.raw:
                s2.body.raw += s1.anchor.raw
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if changed_body_raw:
                s2.body.raw += s1.body.raw

        if s1.title:
            ci = ContentItem(
                raw = s1.title.raw,
                encoding = s1.title.encoding,
                clean_visible = s1.title.cleansed,
                )
            s2.other_content['title'] = ci
        if s1.anchor:
            ci = ContentItem(
                raw = s1.anchor.raw,
                encoding = s1.anchor.encoding,
                clean_visible = s1.anchor.cleansed
                )
            s2.other_content['anchor'] = ci
        return s2
Code example #38
def make_test_stream_item(test_data_dir):
    stream_item = make_stream_item(None, 'http://nytimes.com/')
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test', 'nytimes-index-clean-stable.html')
    stream_item.body.clean_html = open(str(path)).read()
    return stream_item
Code example #39
ch = Chunk(output_path, mode='wb')

for file_path in input_files:

    ## get the text
    text = open(file_path).read()

    ## every StreamItem has a timestamp, which ideally is the creation
    ## time of the text
    zulu_timestamp = '2013-04-18T18:18:20.000000Z'

    ## every StreamItem has an absolute URL, which ideally points to
    ## the real text on the Web
    abs_url = 'http://nytimes.com/index.html'

    si = make_stream_item(zulu_timestamp, abs_url)

    assert si.version == Versions.v0_3_0, 'new streamcorpus collections should be built using the latest version'

    ## StreamItem.source must be a string without spaces that
    ## identifies the origin of the content.  Existing source names
    ## are 'social', 'news', 'linking', 'arxiv', 'FORUMS', and a few
    ## others.  Make up an appropriate source name for this content,
    ## it should be human readable and make sense as the name of the
    ## corpus.  Typically, when naming chunk files, we use
    ## "<date-hour>/<source>-<md5>.sc.xz"
    si.source = 'news'
    ## all of the StreamItems in a chunk file **must** have the same
    ## source string

    ## if the text is raw from the web and might contain control
Code example #40
ch = Chunk(output_path, mode='wb')

for file_path in input_files:

    ## get the text
    text = open(file_path).read()

    ## every StreamItem has a timestamp, which ideally is the creation
    ## time of the text
    zulu_timestamp = '2013-04-18T18:18:20.000000Z'

    ## every StreamItem has an absolute URL, which ideally points to
    ## the real text on the Web
    abs_url = 'http://nytimes.com/index.html'

    si = make_stream_item(zulu_timestamp, abs_url)

    assert si.version == Versions.v0_3_0, 'new streamcorpus collections should be built using the latest version'

    ## StreamItem.source must be a string without spaces that
    ## identifies the origin of the content.  Existing source names
    ## are 'social', 'news', 'linking', 'arxiv', 'FORUMS', and a few
    ## others.  Make up an appropriate source name for this content,
    ## it should be human readable and make sense as the name of the
    ## corpus.  Typically, when naming chunk files, we use
    ## "<date-hour>/<source>-<md5>.sc.xz"
    si.source = 'news'
    ## all of the StreamItems in a chunk file **must** have the same
    ## source string
    
    ## if the text is raw from the web and might contain control
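
    ## --- the original snippet is cut off above; a minimal, assumed
    ## continuation following the other examples in this collection
    ## (e.g. the fetcher and ChunkRoller snippets) ---
    si.body.raw = text
    ch.add(si)

ch.close()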
Code example #41
    def __call__(self, si, context=None):
        if si.version == streamcorpus.Versions.v0_3_0:
            return si

        if not hasattr(si, 'version'):
            raise NotImplementedError('upgrade_streamcorpus_v0_3_0 does not support upgrading from v0_1_0; see "_upgrade_streamcorpus.py"')

        si3 = streamcorpus.make_stream_item(
            zulu_timestamp=si.stream_time.zulu_timestamp,
            abs_url=si.abs_url)

        if si3.stream_id != si.stream_id:
            si3.external_ids['kba-2013'] = {si3.stream_id: si.stream_id}

        ## copy everything 
        for attr in ['original_url', 'ratings', 'schost', 'source', 'source_metadata',
                     'ratings', ]:
            setattr(si3, attr, copy.deepcopy(getattr(si, attr)))

        si3.body = streamcorpus.ContentItem()

        for name, ci in si.other_content.items():
            ci3 = streamcorpus.ContentItem()
            si3.other_content[name] = ci3
            for attr in content_item_attrs:
                setattr(ci3, attr, copy.deepcopy(getattr(ci, attr)))
            upgrade_labels(ci, ci3)

        for attr in content_item_attrs:
            setattr(si3.body, attr, copy.deepcopy(getattr(si.body, attr)))

        upgrade_labels(si.body, si3.body)

        ## fix the body.sentences['lingpipe'] mention_id ranges
        next_global_mention_id = 0
        ## mapping from (sentence_id, mention_id) --> global_mention_id
        mention_ids = {}
        si3.body.sentences['lingpipe'] = []
        for sentence_id, sentence in enumerate(si.body.sentences.get('lingpipe', [])):
            new_sent = streamcorpus.Sentence()
            si3.body.sentences['lingpipe'].append(new_sent)

            for token_id, token in enumerate(sentence.tokens):

                new_token = streamcorpus.Token()
                new_sent.tokens.append(new_token)

                for attr in ['token_num', 'token', 'offsets', 'sentence_pos', 'lemma', 'pos', 'entity_type', 'mention_id', 'equiv_id', 'parent_id', 'dependency_path']:
                    setattr(new_token, attr, copy.deepcopy(getattr(token, attr)))

                upgrade_labels(token, new_token)

                if token.mention_id not in [-1, None]:
                    key = (sentence_id, token.mention_id)
                    if key in mention_ids:
                        new_mention_id = mention_ids[key]

                    else:
                        new_mention_id = next_global_mention_id
                        next_global_mention_id += 1

                        ## save it for later
                        mention_ids[key] = new_mention_id

                    new_token.mention_id = new_mention_id
                    logger.debug('new_mention_id = %d' % new_mention_id)

                    if token.entity_type in [3, 4]:
                        ## convert FEMALE/MALE_PRONOUN
                        new_token.mention_type = streamcorpus.MentionType.PRO
                        new_token.entity_type  = streamcorpus.EntityType.PER
                        
                        if token.entity_type == 3:
                            gender_value = 1
                        else:
                            gender_value = 0

                        attr = streamcorpus.Attribute(
                            attribute_type = streamcorpus.AttributeType.PER_AGE,
                            evidence = token.token,
                            value = str(gender_value),
                            sentence_id = sentence_id,
                            mention_id = token.mention_id)

                        if 'lingpipe' not in si3.body.attributes:
                            si3.body.attributes['lingpipe'] = []
                        si3.body.attributes['lingpipe'].append(attr)

                    else:
                        new_token.mention_type = streamcorpus.MentionType.NAME

        ## return our newly manufactured v0_3_0 StreamItem
        return si3
Code example #42
    def __call__(self, si, context=None):
        if si.version == streamcorpus.Versions.v0_3_0:
            return si

        if not hasattr(si, 'version'):
            raise NotImplementedError(
                'upgrade_streamcorpus_v0_3_0 does not support upgrading from v0_1_0; see "_upgrade_streamcorpus.py"'
            )

        si3 = streamcorpus.make_stream_item(
            zulu_timestamp=si.stream_time.zulu_timestamp, abs_url=si.abs_url)

        if si3.stream_id != si.stream_id:
            si3.external_ids['kba-2013'] = {si3.stream_id: si.stream_id}

        ## copy everything
        for attr in [
                'original_url',
                'ratings',
                'schost',
                'source',
                'source_metadata',
                'ratings',
        ]:
            setattr(si3, attr, copy.deepcopy(getattr(si, attr)))

        si3.body = streamcorpus.ContentItem()

        for name, ci in si.other_content.items():
            ci3 = streamcorpus.ContentItem()
            si3.other_content[name] = ci3
            for attr in content_item_attrs:
                setattr(ci3, attr, copy.deepcopy(getattr(ci, attr)))
            upgrade_labels(ci, ci3)

        for attr in content_item_attrs:
            setattr(si3.body, attr, copy.deepcopy(getattr(si.body, attr)))

        upgrade_labels(si.body, si3.body)

        ## fix the body.sentences['lingpipe'] mention_id ranges
        next_global_mention_id = 0
        ## mapping from (sentence_id, mention_id) --> global_mention_id
        mention_ids = {}
        si3.body.sentences['lingpipe'] = []
        for sentence_id, sentence in enumerate(
                si.body.sentences.get('lingpipe', [])):
            new_sent = streamcorpus.Sentence()
            si3.body.sentences['lingpipe'].append(new_sent)

            for token_id, token in enumerate(sentence.tokens):

                new_token = streamcorpus.Token()
                new_sent.tokens.append(new_token)

                for attr in [
                        'token_num', 'token', 'offsets', 'sentence_pos',
                        'lemma', 'pos', 'entity_type', 'mention_id',
                        'equiv_id', 'parent_id', 'dependency_path'
                ]:
                    setattr(new_token, attr,
                            copy.deepcopy(getattr(token, attr)))

                upgrade_labels(token, new_token)

                if token.mention_id not in [-1, None]:
                    key = (sentence_id, token.mention_id)
                    if key in mention_ids:
                        new_mention_id = mention_ids[key]

                    else:
                        new_mention_id = next_global_mention_id
                        next_global_mention_id += 1

                        ## save it for later
                        mention_ids[key] = new_mention_id

                    new_token.mention_id = new_mention_id
                    logger.debug('new_mention_id = %d' % new_mention_id)

                    if token.entity_type in [3, 4]:
                        ## convert FEMALE/MALE_PRONOUN
                        new_token.mention_type = streamcorpus.MentionType.PRO
                        new_token.entity_type = streamcorpus.EntityType.PER

                        if token.entity_type == 3:
                            gender_value = 1
                        else:
                            gender_value = 0

                        attr = streamcorpus.Attribute(
                            attribute_type=streamcorpus.AttributeType.PER_AGE,
                            evidence=token.token,
                            value=str(gender_value),
                            sentence_id=sentence_id,
                            mention_id=token.mention_id)

                        if 'lingpipe' not in si3.body.attributes:
                            si3.body.attributes['lingpipe'] = []
                        si3.body.attributes['lingpipe'].append(attr)

                    else:
                        new_token.mention_type = streamcorpus.MentionType.NAME

        ## return our newly manufactured v0_3_0 StreamItem
        return si3