def test_keep_single_value(self):
    """A value set on the initial item must survive ``load_item()``."""
    seed = self.item_class(name='foo')
    result = ItemLoader(item=seed).load_item()

    # The loader must hand back the same item type it was given.
    self.assertIsInstance(result, self.item_class)
    # The asserted output holds the scalar wrapped in a list.
    self.assertEqual(dict(result), {'name': ['foo']})
Ejemplo n.º 2
0
    def download_files(self, response):
        """Yield one item per linked page, carrying that page's archive URL.

        Every link matching the navigation XPath on ``response`` is fetched
        with ``requests.get``. On each fetched page, the fourth link matching
        the same XPath is added to the ``file_urls`` field of a new
        ``MaliciousFileCrawlerItem``, which is then yielded.

        Pages exposing fewer than four matching links are skipped with a
        warning instead of aborting the generator with ``IndexError``.

        Raises:
            Exception: any error raised while fetching or parsing is logged
                and re-raised unchanged.
        """
        # The same selector is applied to the listing page and to each
        # page it links to.
        link_xpath = "//a[@class='js-navigation-open link-gray-dark']/@href"
        # Position of the archive link among the matches on a linked page.
        zip_link_index = 3

        try:
            logger.info(f'ZooScraper : download_files : {response}')
            listing = html_xml.fromstring(response.text)

            for href in listing.xpath(link_xpath):
                git_url = self.base_url + href
                # Use a separate name so the ``response`` argument is not
                # shadowed by the per-link fetch.
                page_response = requests.get(git_url)
                page = html_xml.fromstring(page_response.text)
                zip_links = page.xpath(link_xpath)

                if len(zip_links) <= zip_link_index:
                    logger.warning(
                        f'ZooScraper : download_files : '
                        f'no archive link found at {git_url}')
                    continue

                zip_url = self.base_url + zip_links[zip_link_index]
                loader = ItemLoader(item=MaliciousFileCrawlerItem())
                loader.add_value('file_urls', zip_url)
                yield loader.load_item()

        except Exception as err:
            logger.error(f'ZooScraper : download_files : {err}')
            # Bare ``raise`` re-raises the active exception as-is.
            raise
Ejemplo n.º 3
0
 def test_load_item_using_default_loader(self):
     """A plain dict passed as ``item`` is filled in and returned as-is."""
     seed = {'summary': 'lala'}
     loader = ItemLoader(item=seed)
     loader.add_value('name', 'marta')
     loaded = loader.load_item()

     # Identity, not just equality: the very same dict must come back.
     assert loaded is seed
     assert loaded['summary'] == ['lala']
     assert loaded['name'] == ['marta']
 def test_get_output_value_list(self):
     """Reading the output value must leave the item's value intact."""
     seed = self.item_class(name=['foo', 'bar'])
     loader = ItemLoader(item=seed)

     output = loader.get_output_value('name')
     self.assertEqual(output, ['foo', 'bar'])

     # Loading afterwards must still produce the full original list.
     result = loader.load_item()
     self.assertIsInstance(result, self.item_class)
     self.assertEqual(result, {'name': ['foo', 'bar']})
 def test_add_value_list_singlevalue(self):
     """A value added after construction is appended to the initial list."""
     seed = self.item_class(name=['foo', 'bar'])
     loader = ItemLoader(item=seed)
     loader.add_value('name', 'qwerty')

     result = loader.load_item()
     self.assertIsInstance(result, self.item_class)
     expected = {'name': ['foo', 'bar', 'qwerty']}
     self.assertEqual(dict(result), expected)
Ejemplo n.º 6
0
 def get_payments(self, payments) -> PaymentMethod:
     """Yield one loaded ``PaymentMethod`` item per entry in ``payments``.

     This is a generator: the annotation names the type of each yielded
     item. A falsy ``payments`` (``None``, empty sequence) yields nothing.
     """
     for entry in payments or ():
         item_loader = ItemLoader(item=PaymentMethod())
         item_loader.add_value('method', entry.get('method'))
         installments = self.get_installments(entry)
         item_loader.add_value('installments', installments)
         yield item_loader.load_item()
Ejemplo n.º 7
0
    def test_nested_replace(self):
        """``replace_xpath`` on a nested loader changes the root's output."""
        root = ItemLoader(selector=self.selector)
        footer = root.nested_xpath('//footer')
        anchor = footer.nested_xpath('a')
        homepage = ['http://www.scrapy.org']

        root.add_xpath('url', '//footer/a/@href')
        self.assertEqual(root.get_output_value('url'), homepage)

        # Replacing through the first nested loader is visible on the root.
        footer.replace_xpath('url', 'img/@src')
        self.assertEqual(root.get_output_value('url'), ['/images/logo.png'])

        # Replacing through the second-level loader is visible as well.
        anchor.replace_xpath('url', '@href')
        self.assertEqual(root.get_output_value('url'), homepage)
Ejemplo n.º 8
0
    def test_nested_css(self):
        """Values added through a CSS-nested loader show up on the root."""
        root = ItemLoader(selector=self.selector)
        header = root.nested_css("header")

        header.add_xpath('name', 'div/text()')
        header.add_css('name_div', '#id')
        div_text = header.selector.xpath('div[@id = "id"]/text()').getall()
        header.add_value('name_value', div_text)

        self.assertEqual(root.get_output_value('name'), ['marta'])
        self.assertEqual(root.get_output_value('name_div'),
                         ['<div id="id">marta</div>'])
        self.assertEqual(root.get_output_value('name_value'), ['marta'])

        # Root and nested loader must report the same output for each field.
        for field in ('name', 'name_div', 'name_value'):
            self.assertEqual(root.get_output_value(field),
                             header.get_output_value(field))
Ejemplo n.º 9
0
    def test_nested_ordering(self):
        """Values keep insertion order across root and nested loaders."""
        root = ItemLoader(selector=self.selector)
        footer = root.nested_xpath('//footer')
        anchor = footer.nested_xpath('a')

        # Interleave additions through all three loaders on purpose.
        footer.add_xpath('url', 'img/@src')
        root.add_xpath('url', '//footer/a/@href')
        anchor.add_xpath('url', 'text()')
        root.add_xpath('url', '//footer/a/@href')

        expected_in_call_order = [
            '/images/logo.png',
            'http://www.scrapy.org',
            'homepage',
            'http://www.scrapy.org',
        ]
        self.assertEqual(root.get_output_value('url'), expected_in_call_order)
Ejemplo n.º 10
0
    def test_nested_load_item(self):
        """Root and nested loaders populate and share one item object."""
        root = ItemLoader(selector=self.selector)
        footer = root.nested_xpath('//footer')
        image = footer.nested_xpath('img')

        root.add_xpath('name', '//header/div/text()')
        footer.add_xpath('url', 'a/@href')
        image.add_xpath('image', '@src')

        loaded = root.load_item()

        # Every loader in the chain must expose the very same item.
        for member in (root, footer, image):
            assert loaded is member.item

        self.assertEqual(loaded['name'], ['marta'])
        self.assertEqual(loaded['url'], ['http://www.scrapy.org'])
        self.assertEqual(loaded['image'], ['/images/logo.png'])
Ejemplo n.º 11
0
    def __callback(self, response: HtmlResponse):
        """Load a ``LeroymerlinItem`` from ``response`` and yield it.

        Most fields are read from the page via XPath. The link comes from
        ``response.url``, the category from this object's private category
        attribute, and ``image_paths`` is given an empty list.
        """
        # 2) Write a universal handler for product characteristics that
        # builds the data regardless of their type and number.
        # The union below selects both <dt> and <dd> text under the list.
        details_xpath = (
            "//dl[@class='def-list']//dt/text() | //dl[@class='def-list']//dd/text()"
        )
        article_xpath = "//span[@slot='article']/text()"
        images_xpath = "//uc-pdp-media-carousel//img/@src"

        il = ItemLoader(item=LeroymerlinItem(), selector=response)
        il.add_xpath(Fields.name, "//h1/text()")
        il.add_value(Fields.link, response.url)
        il.add_xpath(Fields.article_number, article_xpath)
        il.add_xpath(Fields.price, "//uc-pdp-price-view/span/text()")
        il.add_xpath(Fields.image_links, images_xpath)
        il.add_value(Fields.image_paths, [])
        il.add_value(Fields.category, self.__category)
        il.add_xpath(Fields.details, details_xpath)

        yield il.load_item()
    def parse(self, response):
        """Yield a ``ZillowItem`` per listing, then request the next page.

        The response body is parsed as JSON. Listings are read from
        ``cat1.searchResults.listResults`` and the page count from
        ``cat1.searchList.totalPages``. The current page number is taken
        from ``response.meta['currentPage']``.

        A follow-up request is issued only while the current page is
        strictly below ``totalPages``, so no request is made past the last
        page. NOTE(review): this assumes page numbers are 1-based — confirm
        against the code that seeds ``currentPage``.

        Raises:
            KeyError: if ``currentPage`` is missing from the request meta or
                the JSON lacks the expected keys.
        """
        current_page = response.meta['currentPage']
        json_resp = json.loads(response.text)
        houses = json_resp['cat1']['searchResults']['listResults']
        total_pages = json_resp['cat1']['searchList']['totalPages']

        for house in houses:
            # ``latLong`` may be absent or null; fall back to an empty
            # mapping so the coordinate lookups return None instead of
            # raising AttributeError.
            lat_long = house.get('latLong') or {}

            loader = ItemLoader(item=ZillowItem())
            loader.add_value('id', house.get('id'))
            loader.add_value('image_urls', house.get('imgSrc'))
            loader.add_value('detail_url', house.get('detailUrl'))
            loader.add_value('status_type', house.get('statusType'))
            loader.add_value('status_text', house.get('statusText'))
            loader.add_value('price', house.get('price'))
            loader.add_value('address', house.get('address'))
            loader.add_value('beds', house.get('beds'))
            loader.add_value('baths', house.get('baths'))
            loader.add_value('area_sqft', house.get('area'))
            loader.add_value('latitude', lat_long.get('latitude'))
            loader.add_value('longitude', lat_long.get('longitude'))
            loader.add_value('broker_name', house.get('brokerName'))
            loader.add_value('broker_phone', house.get('brokerPhone'))
            yield loader.load_item()

        # Progress summary for the page just processed.
        print({
            "houses": len(houses),
            "current_page": current_page,
            "total_pages": total_pages
        })

        # Strict comparison: at the last page there is nothing more to fetch.
        if current_page < total_pages:
            current_page += 1
            yield scrapy.Request(
                url=parse_new_url(URL, page_number=current_page),
                callback=self.parse,
                cookies=get_cookie(),
                meta={
                    'currentPage': current_page
                }
            )
Ejemplo n.º 13
0
 def test_add_none(self):
     """Adding ``None`` must leave the field without collected values."""
     loader = ItemLoader()
     loader.add_value('name', None)
     collected = loader.get_collected_values('name')
     assert collected == []
 def test_values_list(self):
     """The initial item's list value must be present in ``loader._values``."""
     seed = self.item_class(name=['foo', 'bar'])
     loader = ItemLoader(item=seed)
     stored = loader._values.get('name')
     self.assertEqual(stored, ['foo', 'bar'])
Ejemplo n.º 15
0
 def parse_application(self, response):
     """Load a ``TraidingSsgeItem`` from ``response`` and yield it.

     ``statement_date`` comes from ``response.request.meta['appdate']`` and
     ``current_app_url`` from ``response.url``. Every other field is read
     from the page with the XPath expressions below.

     Raises:
         KeyError: if ``appdate`` is missing from the request meta.
     """
     loader = ItemLoader(item=TraidingSsgeItem(), selector=response)
     # Use the public ``add_value`` API rather than the private
     # ``_add_value`` helper, consistent with the other value fields.
     loader.add_value('statement_date', response.request.meta['appdate'])
     loader.add_xpath('new_or_used',
                      "normalize-space(//div[@class='condition']/text())")
     loader.add_xpath('location',
                      "//div[@class='location-time']/div[2]/p/span/text()")
     loader.add_xpath(
         'last_updated',
         'normalize-space(//div[@class="location-time"]/div[2]/descendant::span[2]/text())'
     )
     loader.add_xpath('product',
                      "normalize-space(//h2[@class='main-title']/text())")
     # The class value below ends with a space; it is matched literally.
     loader.add_xpath(
         'price',
         "normalize-space(//div[@class='market-item-price ']/text())")
     loader.add_xpath(
         'currency_symbol',
         "normalize-space(//div[@class='market-item-price ']/span/text())")
     loader.add_xpath(
         'applicant', "normalize-space(//div[@class='author_type']/text())")
     loader.add_value('current_app_url', response.url)
     loader.add_xpath(
         'all_apps_url',
         "//div[@class='author_type']/descendant::span/a/@href")
     loader.add_xpath(
         'agent_or_person',
         "normalize-space((//div[@class='author_type'])[1]/span/a/text())")
     loader.add_xpath(
         'number_of_apps',
         "normalize-space(//div[@class='author_type']/descendant::span[2]/text())"
     )
     loader.add_xpath(
         'product_description',
         "normalize-space(//span[@class='details_text']/text())")
     loader.add_xpath(
         'product_specification',
         "normalize-space(//div[@class='jobs_details']/span/text())")
     loader.add_xpath(
         'product_condition_description',
         "normalize-space(//div[@class='jobs_details'][2]/span[2]/text())")
     loader.add_xpath(
         'seen',
         "normalize-space(//div[@class='article_views']/span/text())")
     loader.add_xpath(
         'app_id',
         "normalize-space(//div[@class='market-item-id']/span/text())")
     loader.add_xpath(
         'phone', "normalize-space(//div[@class='numbers-wrap']/a/@href)")
     # NOTE(review): debug output; this prints the loader's item before
     # ``load_item()`` runs, so it is presumably still unpopulated.
     print(loader.item)
     yield loader.load_item()
Ejemplo n.º 16
0
 def test_add_zero(self):
     """A falsy but real value (``0``) must still be collected."""
     loader = ItemLoader()
     loader.add_value('name', 0)
     collected = loader.get_collected_values('name')
     assert collected == [0]
Ejemplo n.º 17
0
 def parse(self, response):
     """Follow every extracted link, then yield a ``WikiPageItem``.

     The item gets the heading text, the response URL, image sources and a
     list of ``{'url', 'text'}`` records for the links found by
     ``self.link_extractor``. Requests for those links are yielded before
     the item itself.
     """
     page_loader = ItemLoader(WikiPageItem(), response, response=response)
     page_loader.add_css('name', 'h1.firstHeading::text')
     page_loader.add_value('url', response.url)
     page_loader.add_css('img', 'td a.image img::attr(src)')

     found_links = self.link_extractor.extract_links(response)
     link_records = []
     for found in found_links:
         link_records.append({'url': found.url, 'text': found.text})
     page_loader.add_value('links', link_records)

     target_urls = [found.url for found in found_links]
     yield from response.follow_all(target_urls)
     yield page_loader.load_item()
Ejemplo n.º 18
0
    def test_get_value(self):
        """Check ``get_value`` with processors and ``re``, then add/replace."""
        name_re = 'name:(.*)$'
        loader = ItemLoader()

        # Processors are applied in order: take the first, then upper-case.
        upper_first = loader.get_value(['foo', 'bar'], TakeFirst(), str.upper)
        self.assertEqual('FOO', upper_first)

        # The regex alone extracts the captured group from every entry.
        every_name = loader.get_value(['name:foo', 'name:bar'], re=name_re)
        self.assertEqual(['foo', 'bar'], every_name)

        # Regex combined with a processor.
        first_name = loader.get_value(['name:foo', 'name:bar'],
                                      TakeFirst(),
                                      re=name_re)
        self.assertEqual('foo', first_name)

        # No regex match, or no input at all, gives None.
        unmatched = loader.get_value(['foo', 'bar'], TakeFirst(), re=name_re)
        self.assertEqual(None, unmatched)
        self.assertEqual(None, loader.get_value(None, TakeFirst()))

        # add_value accepts the same processor/regex arguments.
        loader.add_value('name', ['name:foo', 'name:bar'],
                         TakeFirst(),
                         re=name_re)
        self.assertEqual(['foo'], loader.get_collected_values('name'))

        # replace_value discards what was collected before.
        loader.replace_value('name', 'name:bar', re=name_re)
        self.assertEqual(['bar'], loader.get_collected_values('name'))
Ejemplo n.º 19
0
 def _test_item(self, item):
     """Assert that ``item`` added as a value is loaded as a one-element list."""
     loader = ItemLoader()
     loader.add_value('item_list', item)
     loaded = loader.load_item()
     self.assertEqual(loaded, {'item_list': [item]})
Ejemplo n.º 20
0
 def test_get_unset_value(self):
     """Querying an unset field returns ``[]`` and adds nothing to the item."""
     il = ItemLoader()
     self.assertEqual(il.load_item(), {})

     unset_output = il.get_output_value('foo')
     self.assertEqual(unset_output, [])

     # The lookup above must not have created the field.
     self.assertEqual(il.load_item(), {})
Ejemplo n.º 21
0
    def parse_single_hausnummer_page(self, response):
        """Load an ``items.Street`` from the request meta and page scripts.

        ``stadtteil``, ``name`` and ``link`` are copied from
        ``response.meta`` (keys ``stadtteil``, ``adresse``, ``link``).
        ``demographics`` is the text matched by ``self.regex_demographics``
        in the first JavaScript ``<script>`` element that mentions
        ``demographicInfo`` and matches the pattern. The field is left
        unset when no script qualifies.

        Raises:
            KeyError: if one of the expected meta keys is missing.
        """
        meta = response.meta
        loader = ItemLoader(items.Street(), response)
        # Apply TakeFirst as this loader's default output processor.
        loader.default_output_processor = TakeFirst()
        loader.add_value('stadtteil', meta['stadtteil'])
        loader.add_value('name', meta['adresse'])
        loader.add_value('link', meta['link'])

        for script in response.css("script[type='text/javascript']").getall():
            if "demographicInfo" not in script:
                continue
            match = self.regex_demographics.search(script)
            if match is None:
                # The keyword is present but the pattern does not match;
                # keep looking instead of failing on ``None.group()``.
                continue
            loader.add_value('demographics', match.group())
            break
        yield loader.load_item()