Example #1
 def test_task_clone_grab_config_and_url(self):
     g = build_grab()
     g.setup(url='http://foo.com/')
     task = Task('foo', grab=g)
     task2 = task.clone(url='http://bar.com/')
     self.assertEqual(task2.url, 'http://bar.com/')
     self.assertEqual(task2.grab_config['url'], 'http://bar.com/')
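Taken together with Example #37 at the end of this page, the test above pins down Task.clone() keyword handling: url= rewrites both the task attribute and the copied grab config, while plain task options such as cache_timeout are simply replaced on the copy. A minimal sketch of that behavior, as a hypothetical reimplementation for illustration (not the library source):

    import copy

    def clone_task(task, **kwargs):
        # Hypothetical sketch of Task.clone() keyword handling.
        new_task = copy.deepcopy(task)
        url = kwargs.pop('url', None)
        if url is not None:
            new_task.url = url
            if getattr(new_task, 'grab_config', None) is not None:
                new_task.grab_config['url'] = url  # keep grab config in sync
        for key, value in kwargs.items():
            setattr(new_task, key, value)  # e.g. cache_timeout in Example #37
        return new_task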
Example #2
    def test_task_useragent(self):
        bot = SimpleSpider()
        bot.setup_queue()

        g = Grab()
        g.setup(url=SERVER.BASE_URL)
        g.setup(user_agent='Foo')

        task = Task('baz', grab=g)
        bot.add_task(task.clone())
        bot.run()
        self.assertEqual(SERVER.REQUEST['headers']['User-Agent'], 'Foo')
Example #3
    def test_task_useragent(self):
        bot = build_spider(SimpleSpider)
        bot.setup_queue()

        g = Grab()
        g.setup(url=self.server.get_url())
        g.setup(user_agent='Foo')

        task = Task('baz', grab=g)
        bot.add_task(task.clone())
        bot.run()
        self.assertEqual(self.server.request['headers']['User-Agent'], 'Foo')
Example #4
    def test_update_grab_instance(self):
        class TestSpider(Spider):
            def update_grab_instance(self, grab):
                grab.setup(timeout=77)

            def task_generator(self):
                yield Task('page', url=self.meta['server'].get_url())
                yield Task('page',
                           grab=Grab(url=self.meta['server'].get_url(),
                                     timeout=1))

            def task_page(self, grab, task):
                self.stat.collect('points', grab.config['timeout'])

        bot = build_spider(TestSpider, meta={'server': self.server})
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.add_task(
            Task('page', grab=Grab(url=self.server.get_url(), timeout=1)))
        bot.run()
        self.assertEqual(set([77]), set(bot.stat.collections['points']))
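Note on the assertion: the update_grab_instance() hook is applied to every Grab object the spider processes, including the instances passed in explicitly with timeout=1, so the timeout is overwritten everywhere and the collected set contains only 77. Contrast this with create_grab_instance() in Example #28, which leaves explicitly supplied Grab objects untouched.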
Example #5
    def test_task_queue_clear(self):
        class TestSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                self.stop()

        bot = build_spider(TestSpider)
        bot.setup_queue()
        for _ in six.moves.range(5):
            bot.add_task(Task('page', url=self.server.get_url()))
        self.assertEqual(5, bot.task_queue.size())
        bot.run()
        self.assertEqual(0, bot.task_queue.size())
Example #6
    def test_network_limit(self):
        class CustomSimpleSpider(SimpleSpider):
            def create_grab_instance(self):
                return Grab(connect_timeout=1, timeout=1)

        self.server.response['get.data'] = 'Hello spider!'
        self.server.response['sleep'] = 1.1

        bot = build_spider(CustomSimpleSpider, network_try_limit=1)
        bot.setup_queue()
        #bot.setup_grab(connect_timeout=1, timeout=1)
        bot.add_task(Task('baz', self.server.get_url()))
        bot.run()
        self.assertEqual(bot.stat.counters['spider:request-network'], 1)

        bot = build_spider(CustomSimpleSpider, network_try_limit=2)
        bot.setup_queue()
        #bot.setup_grab(connect_timeout=1, timeout=1)
        bot.add_task(Task('baz', self.server.get_url()))
        bot.run()
        self.assertEqual(bot.stat.counters['spider:request-network'], 2)
Example #7
    def test_timeout(self):
        self.server.response['get.data'] = ContentGenerator(self.server)
        bot = build_spider(SimpleSpider, meta={'server': self.server})
        self.setup_cache(bot)
        bot.cache_pipeline.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('one', self.server.get_url()))
        bot.add_task(Task('one', self.server.get_url(), delay=2))
        bot.run()
        self.assertEqual(2, bot.stat.counters['spider:request'])
        self.assertEqual(1, bot.stat.counters['spider:request-cache'])
        self.assertEqual([1, 1], bot.stat.collections['resp_counters'])

        bot = build_spider(SimpleSpider, meta={'server': self.server},
                           parser_pool_size=1)
        self.setup_cache(bot)
        # Do not clear the cache
        # bot.cache_pipeline.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('one', self.server.get_url(), priority=1))
        bot.add_task(Task('one', self.server.get_url(),
                          priority=2, cache_timeout=0, delay=1))
        bot.add_task(Task('one', self.server.get_url(),
                          priority=3, cache_timeout=10, delay=3))
        bot.add_task(Task('one', self.server.get_url(),
                          priority=4, cache_timeout=0, delay=4))
        bot.run()
        self.assertEqual([1, 2, 2, 3], bot.stat.collections['resp_counters'])
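Reading the second assertion: the cache is deliberately left warm from the first run, the four tasks execute in priority order, and cache_timeout decides whether the cached copy is acceptable. cache_timeout=0 forces a network refetch (bumping the server-side counter), while cache_timeout=10 accepts the copy cached moments earlier, giving the counters 1, 2, 2, 3. A rough sketch of the decision the cache pipeline appears to make (hypothetical names, illustration only):

    import time

    def should_serve_from_cache(cached_item, task):
        # cached_item and its 'timestamp' key are hypothetical illustration.
        if cached_item is None:
            return False
        if task.cache_timeout is None:
            return True  # any cached copy is acceptable
        age = time.time() - cached_item['timestamp']
        return age <= task.cache_timeout  # cache_timeout=0 forces a refetch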
Example #8
    def test_timeout(self):
        bot = SimpleSpider()
        self.setup_cache(bot)
        bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('one', SERVER.BASE_URL))
        bot.add_task(Task('one', SERVER.BASE_URL, delay=0.5))
        bot.run()
        self.assertEqual([1, 1], bot.resp_counters)

        bot = SimpleSpider()
        self.setup_cache(bot)
        # Do not clear the cache
        #bot.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('one', SERVER.BASE_URL, priority=1))
        bot.add_task(
            Task('one', SERVER.BASE_URL, priority=2, cache_timeout=0, delay=1))
        bot.add_task(
            Task('one',
                 SERVER.BASE_URL,
                 priority=3,
                 cache_timeout=10,
                 delay=1.1))
        bot.add_task(
            Task('one',
                 SERVER.BASE_URL,
                 priority=4,
                 cache_timeout=0,
                 delay=1.2))
        bot.run()
        self.assertEqual([1, 2, 2, 3], bot.resp_counters)
Example #9
    def test_task_clone(self):
        bot = build_spider(SimpleSpider)
        bot.setup_queue()

        task = Task('baz', url='http://xxx.com')
        bot.add_task(task.clone())

        # Pass grab to clone
        task = Task('baz', url='http://xxx.com')
        grab = Grab()
        grab.setup(url='zzz')
        bot.add_task(task.clone(grab=grab))

        # Pass grab_config to clone
        task = Task('baz', url='http://xxx.com')
        grab = Grab()
        grab.setup(url='zzz')
        bot.add_task(task.clone(grab_config=grab.config))
Example #10
    def test_task_clone(self):
        bot = SimpleSpider()
        bot.setup_queue()

        task = Task('baz', url='xxx')
        bot.add_task(task.clone())

        # Pass grab to clone
        task = Task('baz', url='xxx')
        g = Grab()
        g.setup(url='zzz')
        bot.add_task(task.clone(grab=g))

        # Pass grab_config to clone
        task = Task('baz', url='xxx')
        g = Grab()
        g.setup(url='zzz')
        bot.add_task(task.clone(grab_config=g.config))
Example #11
    def test_task_get_fallback_handler(self):
        class TestSpider(Spider):
            def do_smth(self, task):
                pass

            def task_bar_fallback(self, task):
                pass

        task1 = Task('foo', url='http://foo.com/', fallback_name='do_smth')
        task2 = Task('bar', url='http://foo.com/')
        task3 = Task(url='http://foo.com/')

        bot = build_spider(TestSpider)

        self.assertEqual(task1.get_fallback_handler(bot), bot.do_smth)
        self.assertEqual(task2.get_fallback_handler(bot),
                         bot.task_bar_fallback)
        self.assertEqual(task3.get_fallback_handler(bot), None)
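The resolution order exercised here: an explicit fallback_name wins, otherwise a method named task_<name>_fallback is used when present, otherwise there is no fallback handler. A hypothetical reimplementation matching the three assertions (illustration only, not the library source):

    def get_fallback_handler(task, spider):
        # An explicit fallback_name takes precedence.
        if getattr(task, 'fallback_name', None):
            return getattr(spider, task.fallback_name)
        # Otherwise use the task_<name>_fallback naming convention.
        if getattr(task, 'name', None):
            return getattr(spider, 'task_%s_fallback' % task.name, None)
        return None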
Example #12
    def test_exception_from_data_handler(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                yield Data('foo', num=1)

            def data_foo(self, num):
                1 / 0

        bot = TestSpider()
        bot.setup_queue()
        bot.add_task(Task('page', url=SERVER.BASE_URL))
        bot.run()
        self.assertTrue('data_foo' in bot.items['fatal'][0])
Example #13
    def test_exception_from_data_handler(self):
        class TestSpider(Spider):
            def task_page(self, dummy_grab, dummy_task):
                yield Data('foo', num=1)

            def data_foo(self, num): # pylint: disable=unused-argument
                raise Exception('Shit happens!')

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertTrue('data_foo' in bot.stat.collections['fatal'][0])
Example #14
    def task_level_4(self, grab, task):
        """
        Наконец то список продукции где также есть ссылки на карточку компании
        """
        if not chek_loading(grab.response.body):
            yield task.clone(refresh_cache=True, priority=50)
            return

        url_level_5_list = grab.doc.select(
            '//a[@class="supplierTit"]').attr_list('href')
        for url_level_5 in url_level_5_list:
            url_level_5 = grab.make_url_absolute(url_level_5)

            if url_level_5 not in self.parsed_url:  # skip company profile links we already extracted
                yield Task('level_5', url=url_level_5, priority=45)

        for next_page_url in grab.doc.select(
                '//p[@class="pagination mt5"]/a').attr_list('href'):
            next_page_url = grab.make_url_absolute(next_page_url)
            if next_page_url not in self.parsed_url:
                yield Task('level_4', url=next_page_url, priority=55)
                self.parsed_url.append(next_page_url)
Example #15
 def task_handle_author(self, grab, task):
     try:
         articles_selector = r'//div[@class="mw-parser-output"]//li//a[not(contains(@class,"external text"))]'
         author_articles = grab.doc.select(articles_selector)
         for article in author_articles:
             try:
                 href = article.select('@href').text()
                 new_url = self.base_url + ('' if href.startswith('/wiki/') else '/wiki/') + href
                 yield Task('handle_article', url=new_url, art_path=task.art_path)
             except IndexError:
                 warnings.warn('Invalid article "{}" from author page: {}'.format(article.text(), task.url))
     except NameError:
         pass
Example #16
    def task_level_1(self, grab, task):
        """
        Получаем ссылки на категории
        """
        if not chek_loading(grab.response.body):
            yield task.clone(refresh_cache=True, priority=90)
            return

        for url_level_2 in grab.doc.select(
                '//div[@class="browse-ttl"]/a').attr_list('href'):
            yield Task('level_2',
                       url=grab.make_url_absolute(url_level_2),
                       priority=85)
Example #17
    def test_task_queue_clear(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                self.stop()

            def task_keyboard_interrupt_page(self, grab, task):
                raise KeyboardInterrupt

        bot = build_spider(TestSpider)
        bot.setup_queue()
        for _ in six.moves.range(5):
            bot.add_task(Task('page', url=self.server.get_url()))
        self.assertEqual(5, bot.task_queue.size())
        bot.run()
        self.assertEqual(0, bot.task_queue.size())

        for _ in six.moves.range(5):
            bot.add_task(
                Task('keyboard_interrupt_page', url=self.server.get_url()))
        self.assertEqual(5, bot.task_queue.size())
        bot.run()
        self.assertEqual(0, bot.task_queue.size())
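The second half of the test is the interesting part: even when the handler raises KeyboardInterrupt mid-run, the spider is expected to drain the task queue on shutdown, so an interrupted run still leaves no pending tasks behind.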
Example #18
    def test_exception_from_data_handler(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                yield Data('foo', num=1)

            def data_foo(self, num):
                1 / 0

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertTrue('data_foo' in bot.stat.collections['fatal'][0])
Example #19
    def test_handler_result_invalid(self):
        class TestSpider(Spider):
            def prepare(self):
                self.points = []

            def task_page(self, grab, task):
                yield 1

        bot = TestSpider()
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        self.assertEqual(1, bot.counters['error-spidererror'])
Example #20
    def test_only_cache_task(self):
        self.server.response['get.data'] = ContentGenerator(self.server)
        class TestSpider(Spider):
            def task_page(self, dummy_grab, dummy_task):
                self.stat.collect('points', 1)

        bot = build_spider(TestSpider, only_cache=True)
        self.setup_cache(bot)
        bot.cache_pipeline.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('page', self.server.get_url()))
        bot.run()
        self.assertEqual(bot.stat.collections['points'], [])
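With only_cache=True the spider never touches the network, so a task whose URL is absent from the just-cleared cache is simply dropped and task_page never runs; hence the empty 'points' collection.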
Example #21
    def test_handler_result_none(self):
        class TestSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.points = []

            def task_page(self, dummy_grab, dummy_task):
                yield None

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
Example #22
    def test_check_task_limits_invalid_value(self):
        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

            def check_task_limits(self, task):
                return False, 'zz'

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url(),
                          fallback_name='fallback_zz'))
        self.assertRaises(SpiderError, bot.run)
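What the test pins down: check_task_limits() returns an (is_ok, reason) pair, and here it rejects every task with the made-up reason 'zz' while the task names a fallback handler, fallback_zz, that TestSpider does not define; the run is therefore expected to abort with SpiderError rather than silently drop the task.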
Example #23
    def test_cache_size(self):
        self.server.response['get.data'] = ContentGenerator(self.server)
        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

        bot = build_spider(TestSpider)
        self.setup_cache(bot)
        bot.cache_pipeline.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('page', self.server.get_url()))
        bot.run()
        self.assertEqual(bot.cache_pipeline.cache.size(), 1)
Example #24
 def task_getcategory(self, grab, task):
     print(task.url)
     response = json.loads(grab.response.unicode_body())
     with open('category.json', 'at', encoding='cp1251',
               errors='ignore') as f:
         f.write(
             json.dumps(response, ensure_ascii=False, sort_keys=True) +
             '\n')
         f.flush()
     for child in response.get('children', []):  # tolerate a missing 'children' key
         child_url = 'https://supl.biz/api/v1.0/suppliers-catalog/categories/' + str(
             child['id']) + '/menu/'
         yield Task('getcategory', url=child_url)
Example #25
    def task_level_3(self, grab, task):
        """
        Получаем ссылки на подкатегории
        """
        if not chek_loading(grab.response.body):
            yield task.clone(refresh_cache=True, priority=70)
            return

        for url_level_4 in grab.doc.select(
                '//div[@class="category-top"]/a').attr_list('href'):
            url_level_4 = grab.make_url_absolute(url_level_4)
            yield Task('level_4', url=url_level_4, priority=65)
            self.parsed_url.append(url_level_4)
Example #26
    def task_table(self, grab, task):
        try:
            term_url = grab.doc.select(
                '//script[contains(text(),"open_terms()")]').text()
            term_url = term_url.split("window.open(")[1].split('"')[1]
        except Exception:
            # Dealing with a common error (Terms page formatting):
            try:
                term_url = grab.doc.select(
                    '//a[contains(text(), "Terms")]/@href').text()
            except Exception:
                try:
                    # Dealing with a common error (no direct link to Terms):
                    # the Terms page can be reached via the "Selections" section.
                    selection_url = grab.doc.select(
                        '//a[contains(text(), "Selections")]//@href').text()
                    root = urlparse(task.url).hostname
                    # Use the task_selection handler (defined below) for this URL:
                    yield Task(
                        "selection",
                        url="http://" + root + selection_url,
                        lasturl=task.url,
                    )
                except Exception as e:
                    self.term_url_error.add(task.url + " " +
                                            getattr(e, "message", str(e)))

        # Just the useful part of the auction's URL:
        root = urlparse(task.url).hostname

        try:
            # Use the task_term handler for this URL:
            yield Task("term",
                       url="http://" + root + term_url,
                       lasturl=task.url)
        except Exception as e:
            self.table_error.add(task.url + " " +
                                 getattr(e, "message", str(e)))
Example #27
 def task_initial(self, grab, task):
     self.base_url = self.initial_urls[0]
     if self.args.c == 'm':
         if self.args.t == 'l':
             yield Task('get_ranks', url='http://live-tennis.eu/')
         elif self.args.t == 'o':
             yield Task('get_ranks',
                        url='http://live-tennis.eu/official_atp_ranking')
         else:
             sys.stderr.write('Invalid value for option -t\n')
     elif self.args.c == 'f':
         if self.args.t == 'l':
             yield Task('get_ranks',
                        url='http://live-tennis.eu/wta-live-ranking')
         elif self.args.t == 'o':
             yield Task('get_ranks',
                        url='http://live-tennis.eu/official-wta-ranking')
         else:
             sys.stderr.write('Invalid value for option -t\n')
     else:
         sys.stderr.write('Invalid value for option -c\n')
Example #28
    def test_create_grab_instance(self):
        class TestSpider(Spider):
            def create_grab_instance(self, **kwargs):
                grab = super(TestSpider, self).create_grab_instance(**kwargs)
                grab.setup(timeout=77)
                return grab

            def task_generator(self):
                yield Task('page', url=self.meta['server'].get_url())
                yield Task('page', grab=Grab(url=self.meta['server'].get_url(),
                                             timeout=76))

            def task_page(self, grab, dummy_task):
                self.stat.collect('points', grab.config['timeout'])

        bot = build_spider(TestSpider, meta={'server': self.server})
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.add_task(Task('page', grab=Grab(url=self.server.get_url(),
                                            timeout=75)))
        bot.run()
        self.assertEqual(set([77, 76, 75]),
                         set(bot.stat.collections['points']))
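Comparing this test with Example #4 clarifies the two hooks: create_grab_instance() is consulted only when the spider has to build a Grab itself, so the explicitly constructed instances keep their own timeouts (76 and 75 survive alongside 77), whereas update_grab_instance() is applied to every Grab and overrides them all.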
Example #29
 def test_basic_priority(self):
     bot = self.SimpleSpider()
     self.setup_queue(bot)
     bot.taskq.clear()
     requested_urls = {}
     for priority in (4, 2, 1, 5):
         url = SERVER.BASE_URL + '?p=%d' % priority
         requested_urls[priority] = url
         bot.add_task(Task('page', url=url,
                           priority=priority))
     bot.run()
     urls = [x[1] for x in sorted(requested_urls.items(), 
                                  key=lambda x: x[0])]
     self.assertEqual(urls, bot.url_history)
Example #30
    def task_speciality(self, grab, task):
        # Parse the page listing specialities
        for lnk in grab.doc.select(
                '//ul[@class="list-unstyled flat-list"]//a/@href'):
            url = str(task.url) + lnk.text()

            yield Task('schedule', url=url, lpu=task.lpu_obj)

            # For debugging/testing
            #
            # g = Grab()
            # g.go(url + lnk.text())
            # self.task_schedule(g, org)
        return
Example #31
    def task_subcategory(self, grab, task):
        cat_description = grab.css_list(u'td.txt b')
        page_list = grab.doc.select('//div[@class="list_pages"]//a')

        cur_page = re.search(r'page=\d+', grab.response.url)
        if cur_page:
            cur_page = int(cur_page.group().replace('page=', ''))
        else:
            cur_page = 1
        if page_list.exists():
            for a in page_list:
                a_text = a.text()
                if a_text == u'Далее':  # u'Далее' is the "Next" page link
                    break
                if int(a_text) == (cur_page + 1):
                    url = self.rebuild_url_to_city(a.attr('href'))
                    yield Task('subcategory', url=url)

        if cat_description[1].text:
            for place in grab.css_list('div.orgl.lc a'):
                yield Task('place', url=place.get('href'))
            for place in grab.css_list('div.orgl div.lc a'):
                yield Task('place', url=place.get('href'))
Example #32
    def test_render_stats(self):
        class TestSpider(Spider):
            def prepare(self):
                self.stat.logging_period = 0
                self.stat.inc('foo')

            def task_page(self, grab, task):
                pass

        bot = build_spider(TestSpider)
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.run()
        bot.render_stats()
Example #33
    def test_has_item(self):
        self.server.response['get.data'] = ContentGenerator(self.server)
        class TestSpider(Spider):
            def task_page(self, grab, task):
                pass

        bot = build_spider(TestSpider)
        self.setup_cache(bot)
        bot.cache_pipeline.cache.clear()
        bot.setup_queue()
        bot.add_task(Task('page', url=self.server.get_url()))
        bot.add_task(Task('page', url=self.server.get_url('/foo')))
        bot.run()
        self.assertTrue(bot.cache_pipeline.cache
                        .has_item(self.server.get_url()))
        self.assertTrue(bot.cache_pipeline.cache
                        .has_item(self.server.get_url(), timeout=100))
        self.assertFalse(bot.cache_pipeline.cache
                         .has_item(self.server.get_url(), timeout=0))
        self.assertTrue(bot.cache_pipeline.cache
                        .has_item(self.server.get_url('/foo')))
        self.assertFalse(bot.cache_pipeline.cache
                         .has_item(self.server.get_url('/bar')))
Example #34
    def test_task_clone(self):
        bot = build_spider(SimpleSpider)
        bot.setup_queue()

        task = Task('baz', url='xxx')
        bot.add_task(task.clone())

        # Pass grab to clone
        task = Task('baz', url='xxx')
        g = Grab()
        g.setup(url='zzz')
        bot.add_task(task.clone(grab=g))

        # Pass grab_config to clone
        task = Task('baz', url='xxx')
        g = Grab()
        g.setup(url='zzz')
        bot.add_task(task.clone(grab_config=g.config))
Example #35
    def test_task_get_fallback_handler(self):
        class TestSpider(Spider):
            def zz(self, task):
                pass

            def task_bar_fallback(self, task):
                pass

        t1 = Task('foo', url='http://foo.com/', fallback_name='zz')
        t2 = Task('bar', url='http://foo.com/')
        t3 = Task(url='http://foo.com/')

        bot = build_spider(TestSpider)

        self.assertEqual(t1.get_fallback_handler(bot), bot.zz)
        self.assertEqual(t2.get_fallback_handler(bot), bot.task_bar_fallback)
        self.assertEqual(t3.get_fallback_handler(bot), None)
Example #36
 def test_task_clone_with_url_param(self):
     task = Task('baz', url='xxx')
     task.clone(url='http://yandex.ru/')
Example #37
 def test_task_clone_kwargs(self):
     g = build_grab()
     g.setup(url='http://foo.com/')
     task = Task('foo', grab=g, cache_timeout=1)
     task2 = task.clone(cache_timeout=2)
     self.assertEqual(2, task2.cache_timeout)