# Example 1
    def test_scraper_banned(self):
        """A response with a banned status code must make run() raise ScraperError."""
        # 420 is configured as the banned code; shortcode 3 triggers it.
        config = {
            'alphabet': 'abcdefghijklmnopqrstuvwxyz',
            'url_template': self.get_url('/{shortcode}'),
            'request_delay': 0.1,
            'redirect_codes': [301, 200],
            'no_redirect_codes': [303],
            'unavailable_codes': [],
            'banned_codes': [420],
            'body_regex': r'id="contlink" href="([^"]+)',
            'custom_code_required': False,
            'method': 'get',
            'name': 'blah',
        }
        scraper = Scraper(config, [3], max_try_count=1)

        # assertRaises fails the test if ScraperError is not raised,
        # and lets any other exception propagate unchanged.
        with self.assertRaises(ScraperError):
            scraper.run()
# Example 2
    def test_scraper(self):
        """Happy path: scraping shortcodes 0-2 and 4 should yield exactly two links."""
        # Shortcode 3 (the banned one) is deliberately excluded here.
        settings = {
            'alphabet': 'abcdefghijklmnopqrstuvwxyz',
            'url_template': self.get_url('/{shortcode}'),
            'request_delay': 0.1,
            'redirect_codes': [301, 200],
            'no_redirect_codes': [303],
            'unavailable_codes': [],
            'banned_codes': [420],
            'body_regex': r'id="contlink" href="([^"]+)',
            'location_anti_regex': r'^/404.php$',
            'custom_code_required': False,
            'method': 'get',
            'name': 'blah',
        }
        scraper = Scraper(settings, [0, 1, 2, 4])

        scraper.run()

        results = scraper.results
        self.assertEqual(2, len(results))
        self.assertEqual('http://archive.land', results['a']['url'])
        self.assertEqual('http://yahoo.city', results['b']['url'])
# Example 3
def main():
    """Command-line entry point.

    Claims a work item from the tracker, scrapes the sequence range it
    describes, and uploads the result. If scraping fails, the error is
    reported to the tracker before the exception is re-raised.
    """
    logging.basicConfig(level=logging.INFO)

    # Positional command-line arguments (all required).
    tracker_host = sys.argv[1]
    client_version = sys.argv[2]
    username = sys.argv[3]
    bind_address = sys.argv[4]
    user_agent = sys.argv[5]
    scheme = sys.argv[6]

    tracker = TrackerClient(
        tracker_host,
        username,
        version=client_version,
        bind_address=bind_address,
        user_agent=user_agent,
        scheme=scheme,
    )

    print('Getting item from tracker.')
    sys.stdout.flush()

    item_info = try_with_tracker(tracker.get_item)

    # Inclusive range of sequence numbers assigned by the tracker.
    lower = item_info['lower_sequence_num']
    upper = item_info['upper_sequence_num']
    scraper = Scraper(item_info['project'], range(lower, upper + 1))

    try:
        result = scraper.run()
    except Exception:
        # Let the tracker know this item failed, then surface the error.
        try_with_tracker(
            tracker.report_error, item_info['id'],
            item_info['tamper_key'], str(traceback.format_exc()))
        raise

    try_with_tracker(tracker.upload_item, item_info['id'],
                     item_info['tamper_key'], result)
# Example 4
    def test_scraper_banned(self):
        """run() must raise ScraperError when the only shortcode returns the banned code."""
        options = {
            'alphabet': 'abcdefghijklmnopqrstuvwxyz',
            'url_template': self.get_url('/{shortcode}'),
            'request_delay': 0.1,
            'redirect_codes': [301, 200],
            'no_redirect_codes': [303],
            'unavailable_codes': [],
            'banned_codes': [420],
            'body_regex': r'id="contlink" href="([^"]+)',
            'custom_code_required': False,
            'method': 'get',
            'name': 'blah',
        }
        # max_try_count=1 so the ban surfaces immediately instead of retrying.
        scraper = Scraper(options, [3], max_try_count=1)

        raised = False
        try:
            scraper.run()
        except ScraperError:
            raised = True
        if not raised:
            self.fail()
# Example 5
    def test_scraper(self):
        """Scraping shortcodes 0-2 should produce the two expected result URLs."""
        scraper = Scraper(
            {
                'alphabet': 'abcdefghijklmnopqrstuvwxyz',
                'url_template': self.get_url('/{shortcode}'),
                'request_delay': 0.1,
                'redirect_codes': [301, 200],
                'no_redirect_codes': [303],
                'unavailable_codes': [],
                'banned_codes': [420],
                'body_regex': r'id="contlink" href="([^"]+)',
                'custom_code_required': False,
                'method': 'get',
                'name': 'blah',
            }, [0, 1, 2])

        scraper.run()

        # Exactly two shortcodes resolve; check each harvested URL.
        found = scraper.results
        self.assertEqual(2, len(found))
        self.assertEqual('http://archive.land', found['a']['url'])
        self.assertEqual('http://yahoo.city', found['b']['url'])