# Expected link-related columns and the number of links each should extract
# from tests/data/crawl_testing/test_content.html.
# NOTE(review): a later section of this module declares the same mapping with
# counts of 14 for the links_* keys — confirm which matches the fixture.
links_columns = {
    'links_url': 12,
    'links_text': 12,
    'links_nofollow': 12,
    'nav_links_url': 3,
    'nav_links_text': 3,
    'header_links_url': 3,
    'header_links_text': 3,
    'footer_links_url': 3,
    'footer_links_text': 3,
}

# Crawl the local test page; robots.txt checks are disabled because the
# target is a file:// URL.
links_file = os.path.abspath('tests/data/crawl_testing/test_content.html')
crawl('file://' + links_file, 'links_crawl.jl',
      custom_settings={'ROBOTSTXT_OBEY': False})
crawl_df = pd.read_json('links_crawl.jl', lines=True)
# Remove the output once loaded, consistent with the other crawl fixtures in
# this module — otherwise a later crawl into the same .jl filename would
# append to this file's rows.
os.remove('links_crawl.jl')

# Crawl a page containing duplicated links.
dup_links_file = os.path.abspath(
    'tests/data/crawl_testing/duplicate_links.html')
crawl('file://' + dup_links_file, 'dup_links_crawl.jl',
      custom_settings={'ROBOTSTXT_OBEY': False})
dup_crawl_df = pd.read_json('dup_links_crawl.jl', lines=True)
os.remove('dup_links_crawl.jl')


def test_link_columns_all_exist():
    """Every expected link column should be present in the crawl output."""
    assert set(links_columns).difference(crawl_df.columns.tolist()) == set()
from advertools.spider import crawl

# Expected number of extracted links per column for test_content.html.
links_columns = {
    'links_url': 14,
    'links_text': 14,
    'links_nofollow': 14,
    'nav_links_url': 3,
    'nav_links_text': 3,
    'header_links_url': 3,
    'header_links_text': 3,
    'footer_links_url': 3,
    'footer_links_text': 3,
}

# Crawl the local fixture page with robots.txt checking turned off (the
# target is a file:// URL) and load the jsonlines output into a DataFrame.
links_file = os.path.abspath('tests/data/crawl_testing/test_content.html')
crawl('file://' + links_file, 'links_crawl.jl',
      custom_settings={'ROBOTSTXT_OBEY': False})
crawl_df = pd.read_json('links_crawl.jl', lines=True)
os.remove('links_crawl.jl')

# Re-crawl the same page with link following enabled to verify that URLs
# containing query parameters are followed.
crawl('file://' + links_file, 'follow_url_params.jl',
      allowed_domains=[links_file, 'example.com'],
      custom_settings={'ROBOTSTXT_OBEY': False},
      follow_links=True)
follow_url_params_df = pd.read_json('follow_url_params.jl', lines=True)
os.remove('follow_url_params.jl')


def test_follow_url_params_followed():
    """At least one crawled URL should carry a query string ('?')."""
    assert follow_url_params_df['url'].str.contains('?', regex=False).any()
def test_crawling_bad_url_directly_is_handled():
    """An invalid URL in the list is skipped; valid URLs are still crawled."""
    crawl(['wrong_url', 'https://example.com'], 'bad_url.jl')
    try:
        bad_url_df = pd.read_json('bad_url.jl', lines=True)
        # Only the valid URL should produce a row.
        assert len(bad_url_df) == 1
        assert bad_url_df['url'][0] == 'https://example.com'
    finally:
        # Clean up even when an assertion fails; a stale jsonlines file
        # would otherwise be appended to on the next run.
        os.remove('bad_url.jl')
def test_crawl_raises_on_wrong_file_extension():
    """Output files must use the .jl extension; anything else is rejected."""
    with pytest.raises(ValueError):
        crawl('https://example.com', 'myfile.wrong',
              allowed_domains='example.com')