def test_speed(parser_type, test_data_dir):
    stream_items = []
    for i in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        path = os.path.join(test_data_dir, "test")
        stream_item.body.clean_html = open(os.path.join(path, "nytimes-index-clean.html")).read()
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    hl = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": False,
            "domain_substrings": ["nytimes.com"],
            "offset_types": [parser_type],
        }
    )
    for si in stream_items:
        si = hl(si, context)
        elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    logger.debug("OffsetType: {}".format(OffsetType))
    logger.info("{:.1f} per second for {}".format(rate, parser_type))
def test_speed(parser_type):
    stream_items = []
    for i in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        path = os.path.dirname(__file__)
        path = os.path.join( path, _TEST_DATA_ROOT, 'test' )
        stream_item.body.clean_html = open(
            os.path.join(path, 'nytimes-index-clean.html')).read()
        stream_items.append( stream_item )

    context = {}
    start = time.time()
    ## run it with a byte state machine
    for si in stream_items:
        si = hyperlink_labels(
            {'require_abs_url': True, 
             'domain_substrings': ['nytimes.com'],
             'all_domains': False,
             'offset_types': [parser_type]}
            )(si, context)
    elapsed = time.time() - start
    
    rate = len(stream_items) / elapsed

    print OffsetType
    print '\n\n%.1f per second for %s' % (rate, parser_type)
Exemple #3
0
def test_speed(parser_type, test_data_dir):
    stream_items = []
    for i in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        path = os.path.join(test_data_dir, 'test' )
        stream_item.body.clean_html = open(
            os.path.join(path, 'nytimes-index-clean.html')).read()
        stream_items.append( stream_item )

    context = {}
    start = time.time()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': [parser_type],
    })
    for si in stream_items:
        si = hl(si, context)
        elapsed = time.time() - start

    rate = len(stream_items) / elapsed

    logger.debug('OffsetType: {}'.format(OffsetType))
    logger.info('{:.1f} per second for {}'.format(rate, parser_type))
def test_long_doc(parser_type, test_data_dir):
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, "test")
    stream_item.body.clean_html = open(os.path.join(path, "company-test.html")).read()

    context = {}
    hl = hyperlink_labels(config={"require_abs_url": True, "all_domains": True, "offset_types": [parser_type]})
    hl(stream_item, context)
Exemple #5
0
def test_long_doc(parser_type, test_data_dir):
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test' )
    stream_item.body.clean_html = open(
        os.path.join(path, 'company-test.html')).read()

    context = {}
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [parser_type],
    })
    hl(stream_item, context)
def test_long_doc(parser_type):
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.dirname(__file__)
    path = os.path.join( path, _TEST_DATA_ROOT, 'test' )
    stream_item.body.clean_html = open(
        os.path.join(path, 'company-test.html')).read()

    context = {}
    ## run it with a byte state machine
    hyperlink_labels(
        {'require_abs_url': True, 
         'all_domains': True,
         ## will fail if set to bytes
         'offset_types': [parser_type]}
        )(stream_item, context)