def test_crawl_basic_bookkeeping(storage, simple_feed):
    crawl.crawl(simple_feed)

    feed_dir = join(storage, 'feeds', 'http-com-example-feed')

    # The conditional-get state file records the ETag and the Last-Modified
    # date from the response, one per line.
    with open(join(feed_dir, 'conditional-get')) as conditional_get_state:
        assert 'the-etag\nthe date\n' == conditional_get_state.read()

    # Every entry URL seen so far is recorded in the entries database.
    with dbm.open(join(feed_dir, 'entries.db')) as entry_url_db:
        assert 'http://example.com/entry-1' in entry_url_db
        assert 'http://example.com/entry-2' in entry_url_db
def test_crawl_basic_file_structure(storage, simple_feed):
    crawl.crawl(simple_feed)

    assert isdir(storage)
    assert isdir(join(storage, 'feeds'))

    # Each feed gets a directory named after a slug of its URL.
    feed_dir = join(storage, 'feeds', 'http-com-example-feed')
    assert isdir(feed_dir)
    assert isfile(join(feed_dir, 'conditional-get'))
    assert isfile(join(feed_dir, 'entries.db'))

    # Each entry gets a directory named by its timestamp and a slug of its URL.
    assert isdir(join(feed_dir, '2015-05-10T00:00:00Z-http-com-example-entry-1'))
    assert isdir(join(feed_dir, '2015-05-11T00:00:00Z-http-com-example-entry-2'))

    entry_dir = join(feed_dir, '2015-05-10T00:00:00Z-http-com-example-entry-1')
    assert isfile(join(entry_dir, 'entry.json'))
def test_crawl_conditional_get_restore(storage, simple_feed):
    crawl.crawl(simple_feed)

    with mock.patch('skim.crawl.save_feed', side_effect=Exception):
        # it should not attempt to save anything in the case of a 304
        crawl.crawl(simple_feed)
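
# ---------------------------------------------------------------------------
# The `storage` and `simple_feed` fixtures used above are assumed to be
# provided by conftest.py (not shown in this section).  The sketch below is
# only an illustration of what such fixtures could look like; the fixture
# bodies, the 'SKIM_STORAGE' setting, and the feed contents are assumptions
# for illustration, not the project's actual code:
#
#     import pytest
#
#     @pytest.fixture
#     def storage(tmpdir, monkeypatch):
#         # Point skim at a throwaway storage root for the duration of a test
#         # (hypothetical configuration mechanism).
#         monkeypatch.setenv('SKIM_STORAGE', str(tmpdir))
#         return str(tmpdir)
#
#     @pytest.fixture
#     def simple_feed():
#         # A feed URL whose stubbed responses contain two entries
#         # (entry-1 dated 2015-05-10, entry-2 dated 2015-05-11), an ETag of
#         # 'the-etag' and a Last-Modified of 'the date', and which answers
#         # 304 Not Modified when those values are sent back.
#         return 'http://example.com/feed'
# ---------------------------------------------------------------------------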