Esempio n. 1
0
# Link-related columns that the crawl output is expected to contain.
# NOTE(review): the integer values look like the expected number of links per
# column on the fixture page -- confirm against the tests that consume them.
links_columns = {
    'links_url': 12,
    'links_text': 12,
    'links_nofollow': 12,
    'nav_links_url': 3,
    'nav_links_text': 3,
    'header_links_url': 3,
    'header_links_text': 3,
    'footer_links_url': 3,
    'footer_links_text': 3,
}

# Crawl the local fixture page through a file:// URL with robots.txt checks
# disabled, then load the resulting jsonlines output into a DataFrame.
links_file = os.path.abspath('tests/data/crawl_testing/test_content.html')
crawl('file://' + links_file,
      'links_crawl.jl',
      custom_settings={'ROBOTSTXT_OBEY': False})
crawl_df = pd.read_json('links_crawl.jl', lines=True)
# Delete the output once it is loaded. A file left over from an earlier run
# would presumably be appended to by the next crawl (verify against the
# crawler's feed-export behaviour), leaving stale rows in ``crawl_df``.
os.remove('links_crawl.jl')

# Same procedure for the fixture page that contains duplicate links.
dup_links_file = os.path.abspath(
    'tests/data/crawl_testing/duplicate_links.html')
crawl('file://' + dup_links_file,
      'dup_links_crawl.jl',
      custom_settings={'ROBOTSTXT_OBEY': False})
dup_crawl_df = pd.read_json('dup_links_crawl.jl', lines=True)
os.remove('dup_links_crawl.jl')


def test_link_columns_all_exist():
    """Check that every key of ``links_columns`` is a column of ``crawl_df``."""
    expected_columns = set(links_columns)
    present_columns = set(crawl_df.columns)
    missing_columns = expected_columns - present_columns
    assert not missing_columns

Esempio n. 2
0
from advertools.spider import crawl

# Link-related columns that the crawl output is expected to contain.
# NOTE(review): the integer values look like the expected number of links per
# column on the fixture page -- confirm against the tests that consume them.
links_columns = {
    'links_url': 14,
    'links_text': 14,
    'links_nofollow': 14,
    'nav_links_url': 3,
    'nav_links_text': 3,
    'header_links_url': 3,
    'header_links_text': 3,
    'footer_links_url': 3,
    'footer_links_text': 3,
}

# First crawl: the fixture page alone, robots.txt checks disabled. The output
# file is loaded into a DataFrame and removed straight away.
links_file = os.path.abspath('tests/data/crawl_testing/test_content.html')
crawl(
    f'file://{links_file}',
    'links_crawl.jl',
    custom_settings={'ROBOTSTXT_OBEY': False},
)
crawl_df = pd.read_json('links_crawl.jl', lines=True)
os.remove('links_crawl.jl')

# Second crawl: same start page, this time following links, with the fixture
# path and example.com passed as allowed domains.
crawl(
    f'file://{links_file}',
    'follow_url_params.jl',
    follow_links=True,
    allowed_domains=[links_file, 'example.com'],
    custom_settings={'ROBOTSTXT_OBEY': False},
)
follow_url_params_df = pd.read_json('follow_url_params.jl', lines=True)
os.remove('follow_url_params.jl')


def test_follow_url_params_followed():
    """Check that at least one crawled URL contains a literal '?'."""
    crawled_urls = follow_url_params_df['url']
    has_question_mark = crawled_urls.str.contains('?', regex=False)
    assert has_question_mark.any()

Esempio n. 3
0
def test_crawling_bad_url_directly_is_handled():
    """Crawl a list holding one invalid and one valid URL.

    The output is expected to contain exactly one row, and that row's URL
    must be the valid one. The output file is removed whether or not the
    assertions pass, so a failing run cannot leave a stale ``bad_url.jl``
    behind for the next run to pick up.
    """
    output_file = 'bad_url.jl'
    try:
        crawl(['wrong_url', 'https://example.com'], output_file)
        bad_url_df = pd.read_json(output_file, lines=True)
        assert len(bad_url_df) == 1
        assert bad_url_df['url'][0] == 'https://example.com'
    finally:
        # Guard the removal so a crawl that produced no file does not mask
        # the original failure with FileNotFoundError.
        if os.path.exists(output_file):
            os.remove(output_file)
Esempio n. 4
0
def test_crawl_raises_on_wrong_file_extension():
    """Check that crawling into 'myfile.wrong' raises ``ValueError``."""
    bad_output_file = 'myfile.wrong'
    with pytest.raises(ValueError):
        crawl(
            'https://example.com',
            bad_output_file,
            allowed_domains='example.com',
        )