Example #1
 def task_generator(self):
     for _ in six.moves.range(1111):
         yield Task('page', url=server.get_url())
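For context, a minimal self-contained sketch of the spider class that these task_generator snippets belong to, assuming the grab.spider API; the spider name and target URL are placeholders:

    from grab.spider import Spider, Task

    class ExampleSpider(Spider):
        def task_generator(self):
            # initial tasks; the task name 'page' routes responses to task_page below
            yield Task('page', url='http://example.com/')

        def task_page(self, grab, task):
            # grab carries the downloaded response, task carries the metadata
            print(task.url)

    # bot = ExampleSpider(thread_number=2)
    # bot.run()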
Example #2
 def task_generator(self):
     self.done_counter = 0
     yield Task('page', url=server.get_url())
Example #3
 def task_generator(self):
     g = self.create_grab_instance()
     g.setup(url='http://h.wrttn.me/status/503', log_dir='log', debug=True)
     yield Task('initial', grab=g)
Example #4
 def task_page(self, grab, task):
     print('Start parse olx')
     for elem in grab.xpath_list(
             '//a[@class="marginright5 link linkWithHash detailsLink"]'):
         yield Task('olxpost', url=elem.get('href'))
Example #5
    def task_parse_items(self, grab, task):
        self.logger.info('[{}] Start: {}'.format(task.name, task.url))

        if self._check_body_errors(grab, task):
            if task.task_try_count < self.err_limit:
                self.logger.error(
                    '[{}] Restart task with url {}, attempt {}'.format(
                        task.name, task.url, task.task_try_count))
                yield Task('parse_items',
                           url=task.url,
                           priority=105,
                           task_try_count=task.task_try_count + 1,
                           raw=True)
            else:
                self.logger.error(
                    '[{}] Skip task with url {}, attempt {}'.format(
                        task.name, task.url, task.task_try_count))

            return

        try:
            # parse pagination numbers
            if not task.get('d_skip_page_check'):
                items = grab.doc.select('//a[contains(@href, "{}")]'.format(
                    Config.get('SITE_PAGE_PARAM')))
                max_page = get_max_page(items, 1)
                self.logger.info('[{}] Find max page: {}'.format(
                    task.name, max_page))

                url_gen = UrlGenerator(task.url, Config.get('SITE_PAGE_PARAM'))

                # schedule the remaining pages, starting from page 2 (if needed)
                for p in range(2, max_page + 1):
                    url = url_gen.get_page(p)
                    yield Task('parse_items',
                               url=url,
                               priority=100,
                               d_skip_page_check=True,
                               raw=True)

            # parse items
            items_list = grab.doc.select(
                '//div[@class="cart_table"]/div/div/table/tbody/tr')

            for index, row in enumerate(items_list):
                try:
                    # NAME
                    item_name = row.select(
                        './td[1]//div[@class="description"]/div/a').text(
                        ).strip()

                    # UNIT
                    unit = row.select('./td[2]').text().strip()
                    if unit == '':
                        unit = 'ед.'

                    # PRICE
                    price_raw = row.select(
                        './td[6]//meta[@itemprop="lowprice"]').attr('content')
                    match = Ree.float.match(price_raw)
                    # validate the raw price and normalize the decimal separator
                    if not match:
                        self.logger.warning(
                            '[{}] Skip item, because price is {} (line: {})'.
                            format(task.name, price_raw, index))
                        continue

                    price = match.groupdict()['price'].replace(',', '.')

                    # COUNT
                    count = row.select('./td[5]')
                    count_text = count.text().strip()

                    # case 1: the literal text 'распродано' (sold out)
                    if count_text == 'распродано':
                        item_count = self.const_price_on_request
                        item_place = self.const_default_place

                        # OUTPUT
                        self.logger.debug(
                            '[{}] Item added, index {} at url {}'.format(
                                task.name, index, task.url))
                        self.result.append({
                            'name': item_name,
                            'count': item_count,
                            'unit': unit,
                            'price': price,
                            'place': item_place
                        })

                    # case 2: the literal text 'под заказ' (on request)
                    elif count_text == 'под заказ':
                        item_count = self.const_stock_zero
                        item_place = self.const_default_place
                        # OUTPUT
                        self.logger.debug(
                            '[{}] Item added, index {} at url {}'.format(
                                task.name, index, task.url))
                        self.result.append({
                            'name': item_name,
                            'count': item_count,
                            'unit': unit,
                            'price': price,
                            'place': item_place
                        })

                    # case 3: nested table with per-place stock counts
                    else:
                        count_rows = count.select(
                            './/div[@class="layer_info"]/table/tbody/tr')

                        for count_row in count_rows:
                            item_place = count_row.select(
                                './td[1]').text().strip()
                            item_count = 0

                            # add stock
                            place_count_stock = count_row.select(
                                './td[1]').text().strip()
                            if Ree.float.match(place_count_stock):
                                item_count += float(place_count_stock)

                            # add expo
                            place_count_expo = count_row.select(
                                './td[2]').text().strip()
                            if Ree.float.match(place_count_expo):
                                item_count += float(place_count_expo)

                            if item_count > 0:
                                # OUTPUT
                                self.logger.debug(
                                    '[{}] Item added, index {} at url {}'.format(
                                        task.name, index, task.url))
                                self.result.append({
                                    'name': item_name,
                                    # 3.140 -> 3.14; 3.0 -> 3
                                    'count': '{0:g}'.format(item_count),
                                    'unit': unit,
                                    'price': price,
                                    'place': item_place
                                })
                except IndexError as e:
                    self.logger.warning('[{}] Skip item: {}, {}'.format(
                        task.name,
                        type(e).__name__, task.url))

        except Exception as e:
            self._process_error(grab, task, e)

        finally:
            self.logger.info('[{}] Finish: {}'.format(task.name, task.url))
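The retry branch at the top of Example #5 can be read as a standalone pattern: re-yield the same URL with an incremented counter until a limit is reached. A minimal sketch of that pattern, where is_bad_response and MAX_ATTEMPTS are hypothetical placeholders rather than Grab API names:

    def task_fetch(self, grab, task):
        if is_bad_response(grab):              # hypothetical error check
            attempt = task.get('attempt', 1)   # extra Task kwargs are readable via task.get()
            if attempt < MAX_ATTEMPTS:         # hypothetical retry limit
                yield Task('fetch', url=task.url, attempt=attempt + 1)
            return
        # ...normal parsing continues here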
Example #6
 def task_generator(self):
     for x in xrange(2):
         yield Task('page', 'http://dumpz.org/%d/' % x)
Example #7
 def task_foo(self, grab, task):
     grab.setup(url=SERVER.BASE_URL)
     yield Task('bar', grab=grab)
Example #8
    def test_setup_proxylist(self):
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        open('/tmp/__proxy.txt', 'w').write(content)

        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                           debug=True)))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

        # By default auto_change is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

        # Do the same test with the load_proxylist method
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

        # Disable auto_change
        # By default auto_init is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt',
                           'text_file',
                           auto_change=False,
                           auto_init=False)
        bot.setup_queue()
        for x in six.moves.range(10):
            bot.add_task(Task('baz', self.server.get_url()))
        bot.run()

        self.assertEqual(self.server.request['headers'].get('host'),
                         '%s:%s' % (ADDRESS, self.server.port))
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))
        self.assertEqual(bot.stat.collections['ports'][0], self.server.port)
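As a usage note for the proxy tests above: the behaviour being exercised is controlled by the source type plus two flags, auto_init (apply a proxy from the very first request) and auto_change (pick a new proxy for each request). A minimal sketch with a placeholder proxy file:

    bot = SimpleSpider(thread_number=1)
    # 'text_file' expects one proxy per line, e.g. host:port
    bot.load_proxylist('proxies.txt', 'text_file',
                       auto_init=True,    # use a proxy from the first request
                       auto_change=True)  # rotate proxies between requests
    bot.setup_queue()
    bot.run()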
Example #9
 def task_generator(self):
     for category in CATEGORIES:
         url = category % REGION[self.city]
         addition = {'category': category}
         yield Task('collect_adv_data', url, addition=addition)
Example #10
 def task_generator(self):
     yield Task('page', 'http://dumpz.org/100/')
     yield Task('page', 'http://dumpz.org/101/', disable_cache=True)
Example #11
 def task_generator(self):
     print('//////////////task generator///////////////')
     for proxy in self.proxy_list:
         yield Task('check_proxy', url='http://ci.ua', delay=3,
                    network_try_limit=1, raw=True)
Example #12
    def test_setup_proxylist(self):
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        open('/tmp/__proxy.txt', 'w').write(content)

        # Simple test, one task
        bot = SimpleSpider(thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru', debug=True)))
        bot.run()

        self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) == 1)

        # By default auto_change is True
        bot = SimpleSpider(thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) > 1)

        # Do the same test with the load_proxylist method
        bot = SimpleSpider(thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) > 1)

        # Disable auto_change
        # By default auto_init is True
        bot = SimpleSpider(thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        self.assertEqual(SERVER.REQUEST['headers']['host'], 'yandex.ru')
        self.assertTrue(len(bot.ports) == 1)

        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        bot = SimpleSpider(thread_number=1)
        bot.load_proxylist('/tmp/__proxy.txt', 'text_file',
                           auto_change=False, auto_init=False)
        bot.setup_queue()
        for x in xrange(10):
            bot.add_task(Task('baz', SERVER.BASE_URL))
        bot.run()

        self.assertEqual(SERVER.REQUEST['headers'].get('host'),
                         '%s:%s' % ('localhost', SERVER.PORT))
        self.assertTrue(len(bot.ports) == 1)
        self.assertEqual(list(bot.ports)[0], SERVER.PORT)
Example #13
 def data_foo(self, count):
     self.data_processed.append(count)
     if count == 1:
         yield Data('foo', count=666)
         yield Task('page', url=server.get_url(),
                    count=count + 1)
Example #14
    def task_level_5(self, grab, task):
        """
        Parse the company profile card.
        """
        if not chek_loading(grab.response.body, 'manufacturers'):
            yield task.clone(refresh_cache=True, priority=80)
            return

        company_info = grab.doc.select('//div[@class="companyInfo"]').one()
        company = comp_db.Company()
        try:
            company.name = company_info.select('//*[@class="mt10"]').text()
        except IndexError:
            yield task.clone(refresh_cache=True)
            return

        company.url_card = task.url
        company.site = '; '.join(
            company_info.select(
                'p[contains(text(), "Homepage Address")]/following-sibling::p[1]'
            ).text_list())

        try:
            country_and_index = ''.join(
                company_info.select('text()').text_list())
        except IndexError:
            country_and_index = ''
        try:
            company.country, company.address_index = INDEX_PATTERN.search(
                country_and_index).group(1, 2)
        except AttributeError:
            company.country = country_and_index

        try:
            company.city = company_info.select(
                'p[contains(text(), "Tel")]/preceding-sibling::p[2]').text()
        except IndexError:
            pass

        try:
            company.province = company_info.select(
                'p[contains(text(), "Tel")]/preceding-sibling::p[1]').text()
        except IndexError:
            pass

        try:
            company.address = company_info.select('p[2]').text()
        except IndexError:
            pass

        try:
            company.fax = company_info.select(
                'p[contains(text(), "Fax")]').text().replace('Fax:',
                                                             '').strip()
        except IndexError:
            pass

        try:
            company.tel = company_info.select(
                'p[contains(text(), "Tel")]').text().replace('Tel:',
                                                             '').strip()
        except IndexError:
            pass

        try:
            company.about = grab.doc.select(
                '//div[@class="commonBox userContent"]').text().replace(
                    '... more >>', '')
        except IndexError:
            pass

        try:
            company.email_img_url = company_info.select(
                'p[contains(text(), "-mail")]/img').attr('src')
            company.email_img_url = grab.make_url_absolute(
                company.email_img_url)
        except IndexError:
            pass

        try:
            company.person = grab.doc.select(
                '//div[@class="companyInfo"][2]/p[2]').text()
        except IndexError:
            pass

        try:
            company.stars = grab.doc.select(
                '//p[@class="supplierInfo_main"]/a').text()
        except IndexError:
            pass

        company.importer = 'Import' in grab.doc.select(
            '//div[@class="CoProfile"]').text(smart=True)
        company.exporter = 'Export' in grab.doc.select(
            '//div[@class="CoProfile"]').text(smart=True)

        if company.email_img_url:
            yield Task('ocr_image', url=company.email_img_url, priority=35)
        comp_db.session.add(company)
        comp_db.session.commit()
Example #15
 def task_initial(self, grab, task):
     yield Task('more', url=server.get_url())
Example #16
 def task_generator(self):
     yield Task('page', url=server.get_url(), delay=1.5, num=3)
     yield Task('page', url=server.get_url(), delay=4.5, num=2)
     yield Task('page', url=server.get_url(), delay=3, num=4)
     yield Task('page', url=server.get_url(), num=1)
Example #17
 def task_generator(self):
     yield Task('page', url=self.url)
Example #18
 def task_initial(self, grab, task):
     for cat in grab.css_list(u'.index_rubrics a'):
         yield Task('category', url=cat.get('href'))
Example #19
 def task_generator(self):
     for category in CATEGORIES:
         url = '%s%s' % (DOMEN, category)
         addition = {'category': category}
         yield Task('collect_adv_data', url, addition=addition)
Example #20
 def task_category(self, grab, task):
     for cat in grab.css_list(u'ul.list_rubrics li a'):
         if HeroCategory.objects.filter(hero_name=cat.text.lower()).count():
             url = self.rebuild_url_to_city(cat.get('href'))
             yield Task('subcategory', url=url)
Example #21
 def task_foo(self, grab, dummy_task):
     grab.setup(url=server.get_url())
     yield Task('bar', grab=grab)
Example #22
 def task_generator(self):
     for x in xrange(1111):
         yield Task('page', url=SERVER.BASE_URL)
Example #23
 def task_generator(self):
     yield Task('initial', url='')
Example #24
 def task_initial(self, grab, task):
     yield Task('parse', grab=grab)
Example #25
 def task_generator(self):
     yield Task('page', url=INVALID_URL)
Example #26
 def task_generator(self):
     grab = Grab()
     grab.setup(url=server.get_url(), timeout=1)
     yield Task('page', grab=grab, raw=True)
Example #27
 def task_generator(self):
     for query, tag in settings.QUERY_LIST:
         g = Grab()
         g.setup(url=self.build_query_url(query), content_type='xml')
         yield Task('feed', grab=g, query=query, tag=tag)
Example #28
 def task_generator(self):
     # pylint: disable=attribute-defined-outside-init
     self.done_counter = 0
     # pylint: enable=attribute-defined-outside-init
     yield Task('page', url=server.get_url())
Example #29
 def task_generator(self):
     grab = Grab(url=server.get_url(), timeout=1)
     yield Task('page', grab=grab)
Example #30
 def task_initial(self, grab, task):
     items = grab.xpath_list('//h5[@class]')
     for item in items:
         link = item.getparent()
         url = 'http://www.immobilienscout24.de' + link.attrib['href']
         self.add_task(Task(name='get_data', url=url))
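Example #30 schedules follow-up work with self.add_task instead of yielding; both routes put a Task onto the spider queue. A small sketch of the two equivalent styles, with placeholder handler names and URLs:

    def task_initial(self, grab, task):
        # style 1: yield the new task from the handler
        yield Task('get_data', url='http://example.com/item/1')

    def task_listing(self, grab, task):
        # style 2: push the task onto the queue explicitly
        self.add_task(Task(name='get_data', url='http://example.com/item/2'))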