def test_scraper_banned(self):
    """Scraper.run() must raise ScraperError when the server answers with a banned code.

    Shortcode sequence number 3 is served with HTTP 420, which is listed in
    ``banned_codes``; with ``max_try_count=1`` the scraper gets no retries and
    has to surface the ban as an error.
    """
    scraper = Scraper(
        {
            'alphabet': 'abcdefghijklmnopqrstuvwxyz',
            'url_template': self.get_url('/{shortcode}'),
            'request_delay': 0.1,
            'redirect_codes': [301, 200],
            'no_redirect_codes': [303],
            'unavailable_codes': [],
            'banned_codes': [420],
            'body_regex': r'id="contlink" href="([^"]+)',
            'custom_code_required': False,
            'method': 'get',
            'name': 'blah',
        },
        [3],
        max_try_count=1,
    )
    # assertRaises replaces the try/except/else + self.fail() pattern and
    # gives a clearer failure message if no exception is raised.
    with self.assertRaises(ScraperError):
        scraper.run()
def test_scraper(self):
    """Run the scraper over shortcodes 0, 1, 2 and 4 and check the two hits.

    Shortcodes 'a' and 'b' are expected to resolve; their target URLs must
    appear in ``scraper.results``.
    """
    project_settings = {
        'alphabet': 'abcdefghijklmnopqrstuvwxyz',
        'url_template': self.get_url('/{shortcode}'),
        'request_delay': 0.1,
        'redirect_codes': [301, 200],
        'no_redirect_codes': [303],
        'unavailable_codes': [],
        'banned_codes': [420],
        'body_regex': r'id="contlink" href="([^"]+)',
        'location_anti_regex': r'^/404.php$',
        'custom_code_required': False,
        'method': 'get',
        'name': 'blah',
    }

    scraper = Scraper(project_settings, [0, 1, 2, 4])
    scraper.run()

    self.assertEqual(2, len(scraper.results))
    self.assertEqual('http://archive.land', scraper.results['a']['url'])
    self.assertEqual('http://yahoo.city', scraper.results['b']['url'])
def main():
    """CLI entry point: fetch a work item from the tracker, scrape it, upload results.

    Expects six positional arguments:
        tracker_host client_version username bind_address user_agent scheme

    On any scraping failure the traceback is reported back to the tracker
    before the exception is re-raised.
    """
    logging.basicConfig(level=logging.INFO)

    # Positional CLI arguments (indexed individually so a missing argument
    # fails with the same IndexError as before).
    tracker_host = sys.argv[1]
    client_version = sys.argv[2]
    username = sys.argv[3]
    bind_address = sys.argv[4]
    user_agent = sys.argv[5]
    scheme = sys.argv[6]

    tracker_client = TrackerClient(
        tracker_host,
        username,
        version=client_version,
        bind_address=bind_address,
        user_agent=user_agent,
        scheme=scheme,
    )

    print('Getting item from tracker.')
    sys.stdout.flush()

    item_info = try_with_tracker(tracker_client.get_item)

    # Inclusive range of sequence numbers to scrape for this item.
    todo_list = range(
        item_info['lower_sequence_num'],
        item_info['upper_sequence_num'] + 1,
    )
    scraper_client = Scraper(item_info['project'], todo_list)

    try:
        result = scraper_client.run()
    except Exception:
        # Best-effort error report to the tracker, then propagate.
        try_with_tracker(
            tracker_client.report_error,
            item_info['id'],
            item_info['tamper_key'],
            str(traceback.format_exc()),
        )
        raise

    try_with_tracker(
        tracker_client.upload_item,
        item_info['id'],
        item_info['tamper_key'],
        result,
    )
def test_scraper_banned(self):
    """Scraper.run() must raise ScraperError when the server answers with a banned code.

    Shortcode sequence number 3 is served with HTTP 420, which is listed in
    ``banned_codes``; with ``max_try_count=1`` the scraper gets no retries and
    has to surface the ban as an error.
    """
    scraper = Scraper(
        {
            'alphabet': 'abcdefghijklmnopqrstuvwxyz',
            'url_template': self.get_url('/{shortcode}'),
            'request_delay': 0.1,
            'redirect_codes': [301, 200],
            'no_redirect_codes': [303],
            'unavailable_codes': [],
            'banned_codes': [420],
            'body_regex': r'id="contlink" href="([^"]+)',
            'custom_code_required': False,
            'method': 'get',
            'name': 'blah',
        },
        [3],
        max_try_count=1,
    )
    # assertRaises replaces the try/except/else + self.fail() pattern and
    # gives a clearer failure message if no exception is raised.
    with self.assertRaises(ScraperError):
        scraper.run()
def test_scraper(self):
    """Run the scraper over shortcodes 0, 1 and 2 and check the two hits.

    Shortcodes 'a' and 'b' are expected to resolve; their target URLs must
    appear in ``scraper.results``.
    """
    project_settings = {
        'alphabet': 'abcdefghijklmnopqrstuvwxyz',
        'url_template': self.get_url('/{shortcode}'),
        'request_delay': 0.1,
        'redirect_codes': [301, 200],
        'no_redirect_codes': [303],
        'unavailable_codes': [],
        'banned_codes': [420],
        'body_regex': r'id="contlink" href="([^"]+)',
        'custom_code_required': False,
        'method': 'get',
        'name': 'blah',
    }

    scraper = Scraper(project_settings, [0, 1, 2])
    scraper.run()

    self.assertEqual(2, len(scraper.results))
    self.assertEqual('http://archive.land', scraper.results['a']['url'])
    self.assertEqual('http://yahoo.city', scraper.results['b']['url'])