def test_speed(parser_type, test_data_dir):
    """Time ``hyperlink_labels`` over ten stream items and log the rate.

    Every stream item carries the same ``clean_html``, read from
    ``<test_data_dir>/test/nytimes-index-clean.html``.  The test makes no
    assertions; it passes as long as nothing raises.

    :param parser_type: the single value placed in the transform's
        ``offset_types`` config list
    :param test_data_dir: directory containing the ``test`` data
        sub-directory (presumably supplied by a pytest fixture -- confirm
        against conftest)
    """
    html_path = os.path.join(test_data_dir, "test", "nytimes-index-clean.html")
    # Read the fixture once and close the handle deterministically; the
    # content is identical for every stream item built below.
    with open(html_path) as html_file:
        clean_html = html_file.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append(stream_item)

    context = {}
    # The clock starts before the transform is built, so the reported rate
    # includes the one-off construction cost.
    start = time.time()
    hl = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": False,
            "domain_substrings": ["nytimes.com"],
            "offset_types": [parser_type],
        }
    )
    for si in stream_items:
        hl(si, context)
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed
    logger.debug("OffsetType: {}".format(OffsetType))
    logger.info("{:.1f} per second for {}".format(rate, parser_type))
def test_speed(parser_type):
    """Time ``hyperlink_labels`` over ten stream items and print the rate.

    Every stream item carries the same ``clean_html``, read from
    ``nytimes-index-clean.html`` under ``_TEST_DATA_ROOT/test`` relative to
    this file's directory.  The test makes no assertions; it passes as long
    as nothing raises.

    :param parser_type: the single value placed in the transform's
        ``offset_types`` config list
    """
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    # Read the fixture once and close the handle deterministically; the
    # content is identical for every stream item built below.
    with open(os.path.join(path, 'nytimes-index-clean.html')) as html_file:
        clean_html = html_file.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append(stream_item)

    context = {}
    start = time.time()
    # A fresh transform is deliberately built for every item inside the
    # timed region, so the reported rate includes construction cost.
    for si in stream_items:
        hyperlink_labels(
            {'require_abs_url': True,
             'domain_substrings': ['nytimes.com'],
             'all_domains': False,
             'offset_types': [parser_type]}
        )(si, context)
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed
    # Single-argument parenthesized form prints the same text whether
    # ``print`` is a statement or a function.
    print(OffsetType)
    print('\n\n%.1f per second for %s' % (rate, parser_type))
def test_speed(parser_type, test_data_dir):
    """Time ``hyperlink_labels`` over ten stream items and log the rate.

    Every stream item carries the same ``clean_html``, read from
    ``<test_data_dir>/test/nytimes-index-clean.html``.  The test makes no
    assertions; it passes as long as nothing raises.

    :param parser_type: the single value placed in the transform's
        ``offset_types`` config list
    :param test_data_dir: directory containing the ``test`` data
        sub-directory (presumably supplied by a pytest fixture -- confirm
        against conftest)
    """
    path = os.path.join(test_data_dir, 'test')
    # Read the fixture once and close the handle deterministically; the
    # content is identical for every stream item built below.
    with open(os.path.join(path, 'nytimes-index-clean.html')) as html_file:
        clean_html = html_file.read()

    stream_items = []
    for _ in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()
        stream_item.body.clean_html = clean_html
        stream_items.append(stream_item)

    context = {}
    # The clock starts before the transform is built, so the reported rate
    # includes the one-off construction cost.
    start = time.time()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': False,
        'domain_substrings': ['nytimes.com'],
        'offset_types': [parser_type],
    })
    for si in stream_items:
        hl(si, context)
    elapsed = time.time() - start

    rate = len(stream_items) / elapsed
    logger.debug('OffsetType: {}'.format(OffsetType))
    logger.info('{:.1f} per second for {}'.format(rate, parser_type))
def test_long_doc(parser_type, test_data_dir):
    """Run ``hyperlink_labels`` once over the ``company-test.html`` document.

    The test makes no assertions; it passes as long as the transform
    completes without raising.

    :param parser_type: the single value placed in the transform's
        ``offset_types`` config list
    :param test_data_dir: directory containing the ``test`` data
        sub-directory (presumably supplied by a pytest fixture -- confirm
        against conftest)
    """
    html_path = os.path.join(test_data_dir, "test", "company-test.html")

    stream_item = StreamItem()
    stream_item.body = ContentItem()
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(html_path) as html_file:
        stream_item.body.clean_html = html_file.read()

    context = {}
    hl = hyperlink_labels(
        config={
            "require_abs_url": True,
            "all_domains": True,
            "offset_types": [parser_type],
        }
    )
    hl(stream_item, context)
def test_long_doc(parser_type, test_data_dir):
    """Run ``hyperlink_labels`` once over the ``company-test.html`` document.

    The test makes no assertions; it passes as long as the transform
    completes without raising.

    :param parser_type: the single value placed in the transform's
        ``offset_types`` config list
    :param test_data_dir: directory containing the ``test`` data
        sub-directory (presumably supplied by a pytest fixture -- confirm
        against conftest)
    """
    stream_item = StreamItem()
    stream_item.body = ContentItem()

    path = os.path.join(test_data_dir, 'test')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(os.path.join(path, 'company-test.html')) as html_file:
        stream_item.body.clean_html = html_file.read()

    context = {}
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [parser_type],
    })
    hl(stream_item, context)
def test_long_doc(parser_type):
    """Run ``hyperlink_labels`` once over the ``company-test.html`` document.

    The document is read from ``_TEST_DATA_ROOT/test`` relative to this
    file's directory.  The test makes no assertions; it passes as long as
    the transform completes without raising.

    :param parser_type: the single value placed in the transform's
        ``offset_types`` config list
    """
    stream_item = StreamItem()
    stream_item.body = ContentItem()

    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(os.path.join(path, 'company-test.html')) as html_file:
        stream_item.body.clean_html = html_file.read()

    context = {}
    # The offset type exercised here is whatever the caller passes as
    # parser_type.
    # NOTE(review): the original carried the remark "will fail if set to
    # bytes" next to this config -- presumably about 'offset_types' on this
    # long document; confirm before relying on it.
    hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         'offset_types': [parser_type]}
    )(stream_item, context)