## assumed imports for this snippet: stdlib modules plus the StreamItem
## helpers, which come from the streamcorpus package
import gzip
import json
import traceback

from streamcorpus import ContentItem, make_stream_item


def generate_stream_items_from_kba_json(json_file_path):
    ## iterate over a gzip'ed file of JSON lines
    data = gzip.GzipFile(fileobj=open(json_file_path, 'rb'), mode='rb').read()
    for line in data.splitlines():
        try: 
            doc = json.loads(line)
        except Exception:
            print('trapped: %s' % traceback.format_exc())
            print('continuing')
            continue

        assert doc['source'] == 'social', doc['source']

        ## make a StreamItem with valid StreamTime computed from
        ## zulu_timestamp.  This will fix the four-hour offsets in
        ## some of the KBA 2012 files.
        stream_item = make_stream_item(
            doc['stream_time']['zulu_timestamp'],
            bytes(doc['abs_url'].encode('utf-8'))
            )

        ## capture schost and source
        stream_item.schost = doc.pop('schost')
        stream_item.source = doc.pop('source')

        ## assemble source_metadata
        stream_item.source_metadata['kba-2012'] = json.dumps(doc.pop('source_metadata'))
        
        ## might have a funky original URL
        stream_item.original_url = (
            doc['original_url'].encode('utf-8') if doc['original_url'] else b'')

        ## get the three possible ContentItems
        body   = doc.pop('body',   {}).pop('raw', '').decode('string-escape')
        title  = doc.pop('title',  {}).pop('raw', '').decode('string-escape')
        anchor = doc.pop('anchor', {}).pop('raw', '').decode('string-escape')

        stream_item.body = ContentItem(
            raw = b''.join(['<p>', anchor, '</p>',
                            '<p>', title, '</p>',
                            body]),
            media_type = 'text/html',
            encoding = 'UTF-8',
            )

        if title:
            stream_item.other_content['title']  = ContentItem(
                raw = title,
                media_type = 'text/html',
                encoding = 'UTF-8',
                )

        if anchor:
            stream_item.other_content['anchor']  = ContentItem(
                raw = anchor,
                media_type = 'text/html',
                encoding = 'UTF-8',
                )

        yield stream_item
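A minimal usage sketch for the generator above; the input file name is hypothetical, and writing the resulting items out is left to whatever chunk-writing the surrounding pipeline provides:

## hypothetical driver for generate_stream_items_from_kba_json; the file
## name below is an assumption, not taken from the source
if __name__ == '__main__':
    count = 0
    for si in generate_stream_items_from_kba_json('kba-2012-social.json.gz'):
        count += 1
    print('built %d StreamItems' % count)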
Example #2
def test_speed(parser_type, test_data_dir):
    stream_items = []
    for i in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        path = os.path.join(test_data_dir, 'test' )
        stream_item.body.clean_html = open(
            os.path.join(path, 'nytimes-index-clean.html')).read()
        stream_items.append( stream_item )

    context = {}
    start = time.time()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': [parser_type],
    })
    for si in stream_items:
        si = hl(si, context)
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    logger.debug('OffsetType: {}'.format(OffsetType))
    logger.info('{:.1f} per second for {}'.format(rate, parser_type))
Example #3
def test_target_parsing(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html( test_html )

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible( html )

    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage( si, context )
    html2 = si.body.clean_html

    visible2 = make_clean_visible( html2 )

    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
Example #4
def test_language(test_data_dir):
    path = os.path.join(test_data_dir, 'test/raw-unicode-issues.html')
    si = make_stream_item(None, 'test')
    si.body = ContentItem(raw=open(path).read())
    context = {}
    lang = language(config={})
    lang(si, context)

    assert si.body.language.name == 'Japanese'
    assert si.body.language.code == 'ja'
Example #5
def test_stage(test_data_dir):
    stage = clean_html({}) # NB: not even defaults
    path = os.path.join(test_data_dir, 'test')
    with open(os.path.join(path, 'nytimes-index.html'), 'r') as f:
        raw = f.read().decode('utf8')
    si = StreamItem(body=ContentItem(raw=raw, media_type='text/html'))
    si = stage(si, {})

    with open(os.path.join(path, 'nytimes-index-clean-stable.html'), 'r') as f:
        stable = f.read()
    assert si.body.clean_html == stable
Example #6
def test_long_doc(parser_type, test_data_dir):
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test' )
    stream_item.body.clean_html = open(
        os.path.join(path, 'company-test.html')).read()

    context = {}
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [parser_type],
    })
    hl(stream_item, context)
Example #7
    def __call__(self, s1, context):
        s2 = make_stream_item(s1.stream_time.zulu_timestamp, s1.abs_url)
        s2.schost = s1.schost
        s2.source = s1.source
        s2.source_metadata['kba-2012'] = s1.source_metadata

        logger.debug('len(original .body.raw) = %d' % len(s1.body.raw))

        #logger.critical(repr(s2))

        s2.body = ContentItem(
            raw=s1.body.raw,
            encoding=s1.body.encoding,
            ## default, might get overwritten below
            media_type='text/html',
            taggings={
                'stanford':
                Tagging(
                    tagger_id='stanford',
                    raw_tagging=s1.body.ner,
                    generation_time=make_stream_time('2012-06-01T00:00:00.0Z'),
                    tagger_config=
                    'annotators: {tokenize, cleanxml, ssplit, pos, lemma, ner}, properties: pos.maxlen=100',
                    tagger_version='Stanford CoreNLP ver 1.2.0',
                )
            })

        if self.config['keep_old_cleansed_as_clean_visible']:
            s2.body.clean_visible = s1.body.cleansed

        if s1.source == 'social':
            s2.body.media_type = 'text/plain'
            ## the separation of content items in the social stream
            ## was artificial and annoying, so smoosh them together
            s2.body.clean_visible = '\n\n'.join(
                [s1.title.cleansed, s1.anchor.cleansed, s1.body.cleansed])

            changed_body_raw = False
            if s1.title and s1.title.raw:
                s2.body.raw = s1.title.raw
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if s1.anchor and s1.anchor.raw:
                ## start over from the anchor text if the title did not
                ## already replace body.raw; otherwise append, so the
                ## original body is not duplicated below
                if changed_body_raw:
                    s2.body.raw += s1.anchor.raw
                else:
                    s2.body.raw = s1.anchor.raw
                s2.body.raw += r'\n\n'
                changed_body_raw = True

            if changed_body_raw:
                s2.body.raw += s1.body.raw

        if s1.title:
            ci = ContentItem(
                raw=s1.title.raw,
                encoding=s1.title.encoding,
                clean_visible=s1.title.cleansed,
            )
            s2.other_content['title'] = ci
        if s1.anchor:
            ci = ContentItem(raw=s1.anchor.raw,
                             encoding=s1.anchor.encoding,
                             clean_visible=s1.anchor.cleansed)
            s2.other_content['anchor'] = ci
        return s2
Example #8
def make_test_stream_item(test_data_dir):
    stream_item = make_stream_item(None, 'http://nytimes.com/')
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test', 'nytimes-index-clean-stable.html')
    stream_item.body.clean_html = open(str(path)).read()
    return stream_item
Example #9
def add_content_item(stream_item, title_m):
    title = whitespace_re.sub(' ', title_m.group('title')).strip()
    if len(title) > 60:
        title = title[:60] + '...'
    stream_item.other_content['title'] = ContentItem(clean_visible=title)
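For context, a small sketch of how this helper might be driven; the whitespace_re definition and the <title> pattern below are assumptions rather than code from the source module:

import re

whitespace_re = re.compile(r'\s+')   # assumed definition of the module-level regex
## hypothetical pattern that yields a match object with a 'title' group
title_re = re.compile(r'<title>(?P<title>.*?)</title>', re.I | re.S)

def maybe_add_title(stream_item, raw_html):
    ## attach the page <title> as an other_content item when one is found
    title_m = title_re.search(raw_html)
    if title_m is not None:
        add_content_item(stream_item, title_m)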