def test_speed(parser_type):
    """Print how many stream items per second ``hyperlink_labels`` handles.

    Ten ``StreamItem`` objects are built, each with ``body.clean_html`` set
    to the contents of the ``nytimes-index-clean.html`` fixture.  Each item
    is then passed through a freshly constructed ``hyperlink_labels`` stage
    while the wall clock is running, and the resulting rate is printed.

    :param parser_type: value used as the single entry of the
        ``offset_types`` config option given to ``hyperlink_labels``.
    """
    path = os.path.join(os.path.dirname(__file__), _TEST_DATA_ROOT, 'test')

    # Read the fixture once, outside the timed section, and close the
    # handle deterministically instead of leaking one per iteration.
    with open(os.path.join(path, 'nytimes-index-clean.html')) as fixture:
        fixture_html = fixture.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = fixture_html
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    for si in stream_items:
        # The stage is deliberately rebuilt for every item, as before, so
        # the measured time still includes stage construction.
        hyperlink_labels(
            {'require_abs_url': True,
             'domain_substrings': ['nytimes.com'],
             'all_domains': False,
             'offset_types': [parser_type]}
            )(si, context)
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and also parses under Python 3.
    print(OffsetType)
    print('\n\n%.1f per second for %s' % (rate, parser_type))
def test_speed(parser_type, test_data_dir):
    """Log how many stream items per second ``hyperlink_labels`` handles.

    Ten ``StreamItem`` objects are built, each with ``body.clean_html`` set
    to the contents of the ``nytimes-index-clean.html`` fixture.  A single
    ``hyperlink_labels`` stage is constructed and applied to every item;
    the elapsed wall-clock time (including stage construction) is turned
    into a rate and logged.

    :param parser_type: value used as the single entry of the
        ``offset_types`` config option.
    :param test_data_dir: directory that contains the ``test``
        sub-directory holding the HTML fixture.
    """
    path = os.path.join(test_data_dir, "test")

    # Read the fixture once and close the handle deterministically instead
    # of leaking one open file per iteration.
    with open(os.path.join(path, "nytimes-index-clean.html")) as fixture:
        fixture_html = fixture.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = fixture_html
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    hl = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": False,
            "domain_substrings": ["nytimes.com"],
            "offset_types": [parser_type],
        }
    )
    for si in stream_items:
        hl(si, context)
    # Measure once, after the whole batch: recomputing this inside the loop
    # was redundant and left ``elapsed`` undefined for an empty batch.
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    logger.debug("OffsetType: {}".format(OffsetType))
    logger.info("{:.1f} per second for {}".format(rate, parser_type))
Beispiel #3
0
def test_speed(parser_type, test_data_dir):
    """Log the throughput of ``hyperlink_labels`` on a fixed HTML fixture.

    Builds ten ``StreamItem`` objects whose ``body.clean_html`` holds the
    contents of ``nytimes-index-clean.html``, constructs one
    ``hyperlink_labels`` stage, applies it to each item, and logs the number
    of items processed per second of wall-clock time (stage construction is
    inside the timed region).

    :param parser_type: value used as the single entry of the
        ``offset_types`` config option.
    :param test_data_dir: directory that contains the ``test``
        sub-directory holding the HTML fixture.
    """
    path = os.path.join(test_data_dir, 'test')

    # One read, closed deterministically; previously the file was opened
    # ten times and none of the handles were closed explicitly.
    with open(os.path.join(path, 'nytimes-index-clean.html')) as fixture:
        fixture_html = fixture.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = fixture_html
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': [parser_type],
    })
    for si in stream_items:
        hl(si, context)
    # Take the end timestamp once, after the loop, rather than on every
    # iteration; only the final value was ever used.
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    logger.debug('OffsetType: {}'.format(OffsetType))
    logger.info('{:.1f} per second for {}'.format(rate, parser_type))
def test_long_doc(parser_type, test_data_dir):
    """Run ``hyperlink_labels`` over the ``company-test.html`` fixture.

    The test makes no assertions of its own; it passes when the stage
    processes the document without raising.

    :param parser_type: value used as the single entry of the
        ``offset_types`` config option.
    :param test_data_dir: directory that contains the ``test``
        sub-directory holding the HTML fixture.
    """
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, "test")
    # Use a context manager so the fixture handle is closed promptly.
    with open(os.path.join(path, "company-test.html")) as fixture:
        stream_item.body.clean_html = fixture.read()

    context = {}
    hl = hyperlink_labels(config={"require_abs_url": True, "all_domains": True, "offset_types": [parser_type]})
    hl(stream_item, context)
Beispiel #5
0
def test_long_doc(parser_type, test_data_dir):
    """Check that ``hyperlink_labels`` can process ``company-test.html``.

    There are no explicit assertions: the test succeeds if applying the
    stage to the document does not raise.

    :param parser_type: value used as the single entry of the
        ``offset_types`` config option.
    :param test_data_dir: directory that contains the ``test``
        sub-directory holding the HTML fixture.
    """
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test')
    # Close the fixture file deterministically rather than leaking it.
    with open(os.path.join(path, 'company-test.html')) as fixture:
        stream_item.body.clean_html = fixture.read()

    context = {}
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [parser_type],
    })
    hl(stream_item, context)
def test_long_doc(parser_type):
    """Run ``hyperlink_labels`` over the ``company-test.html`` fixture.

    The fixture is located relative to this file via ``_TEST_DATA_ROOT``.
    The test has no assertions; it passes when the stage handles the
    document without raising.

    :param parser_type: value used as the single entry of the
        ``offset_types`` config option.
    """
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(os.path.dirname(__file__), _TEST_DATA_ROOT, 'test')
    # Close the fixture file deterministically rather than leaking it.
    with open(os.path.join(path, 'company-test.html')) as fixture:
        stream_item.body.clean_html = fixture.read()

    context = {}
    hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         ## original author's note: will fail if set to bytes
         ## NOTE(review): not verifiable from this file -- confirm
         'offset_types': [parser_type]}
        )(stream_item, context)
Beispiel #7
0
def test_target_parsing(test_data_dir):
    """Check that 'logo' and 'target' never reach the visible text.

    The ``target-test.html`` fixture is cleaned with ``make_clean_html``
    and must still contain both strings.  The output of
    ``make_clean_visible`` must contain neither, both before and after a
    ``hyperlink_labels`` stage (``LINES`` offsets) has processed the
    clean HTML.

    :param test_data_dir: directory that contains the ``test``
        sub-directory holding the HTML fixture.
    """
    path = os.path.join(test_data_dir, 'test')
    # Close the fixture file deterministically rather than leaking it.
    with open(os.path.join(path, 'target-test.html')) as fixture:
        test_html = fixture.read()

    html = make_clean_html(test_html)

    # Both strings survive HTML cleaning ...
    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    # ... but are absent from the visible-text rendering.
    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage(si, context)
    # Re-read clean_html from the item after the stage has run on it.
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)

    assert 'target' not in visible2
    assert 'logo' not in visible2
Beispiel #8
0
def test_stage(test_data_dir):
    """Compare ``clean_html`` stage output against a stored reference.

    The raw ``nytimes-index.html`` fixture is decoded as UTF-8, wrapped in
    a ``StreamItem`` with media type ``text/html``, and run through a
    ``clean_html`` stage built from an empty config.  The resulting
    ``body.clean_html`` must equal the contents of
    ``nytimes-index-clean-stable.html``.

    :param test_data_dir: directory that contains the ``test``
        sub-directory holding both HTML fixtures.
    """
    # NB: empty config -- not even defaults
    transform = clean_html({})

    fixture_dir = os.path.join(test_data_dir, 'test')
    raw_path = os.path.join(fixture_dir, 'nytimes-index.html')
    with open(raw_path, 'r') as raw_file:
        raw_html = raw_file.read().decode('utf8')

    body = ContentItem(raw=raw_html, media_type='text/html')
    result = transform(StreamItem(body=body), {})

    expected_path = os.path.join(fixture_dir, 'nytimes-index-clean-stable.html')
    with open(expected_path, 'r') as expected_file:
        expected = expected_file.read()

    assert result.body.clean_html == expected