Beispiel #1
0
    def parse_details(self, response, item):
        """Fill ``item`` with the details of a flat taken from its page.

        Each field comes either from the JavaScript object embedded in the
        page (looked up with ``get_nested_item``) or from an XPath query on
        ``response``. ``seller_phone`` and ``views_count`` are set to ``None``.

        Returns the populated ``item``, or ``None`` (after logging an error)
        when the embedded script object cannot be parsed.
        """
        try:
            script_obj = chompjs.parse_js_object(response.css(SCRIPT_OBJ_CSS).get())
        except Exception as e:
            self.logger.error(f'{e} when script object parsing')
            return

        def from_script(key):
            return get_nested_item(script_obj, key)

        def first_match(xpath):
            return response.xpath(xpath).get()

        def all_matches(xpath):
            return response.xpath(xpath).getall()

        def unset(_):
            return None

        # (item field, extractor, key or XPath), listed in assignment order.
        field_sources = (
            ('krisha_id', from_script, FLAT_KRISHA_ID_KEY),
            ('title', first_match, FLAT_TITLE_XPATH),
            ('seller_phone', unset, None),
            ('views_count', unset, None),
            ('price', from_script, FLAT_PRICE_KEY),
            ('rooms_count', from_script, FLAT_ROOMS_COUNT_KEY),
            ('total_area', from_script, FLAT_TOTAL_AREA_KEY),
            ('ceiling_height', first_match, FLAT_CEILING_HEIGHT_XPATH),
            ('region', from_script, FLAT_REGION_KEY),
            ('city', from_script, FLAT_CITY_KEY),
            ('address', from_script, FLAT_ADDRESS_KEY),
            ('flat_floor', first_match, FLAT_FLOOR_XPATH),
            ('longitude', from_script, FLAT_LONGITUDE_KEY),
            ('attitude', from_script, FLAT_ATTITUDE_KEY),
            ('construction_year', first_match, BUILDING_CONSTRUCTION_YEAR_XPATH),
            ('floors_count', first_match, BUILDING_FLOORS_COUNT_XPATH),
            ('wall_type', first_match, BUILDING_WALL_TYPE_XPATH),
            ('seller_user_type', from_script, FLAT_SELLER_USER_TYPE_KEY),
            ('description', all_matches, FLAT_DESCRIPTION_XPATH),
        )
        for field, extract, source in field_sources:
            item[field] = extract(source)

        return item
Beispiel #2
0
def scrape_beer_bazaar():
    """Scrape the Beer Bazaar bundle page and return one row per beer.

    Each beer's handle and image URL are printed as they are read, and a
    completion message is printed before returning.

    Returns:
        A list of ``[name, price, url, image_url, supplier, brewery]`` lists,
        where price, url, supplier and brewery are the same for every row.

    Raises:
        ValueError: if the bundle data ``<script>`` tag is not on the page.
    """
    base_url = 'https://beerbazaar.co.il/apps/bundles/bundle/17591'
    price = '8 for 99, 12 for 129'
    brewery = 'Beer Bazaar'
    supplier = brewery

    # NOTE(review): the request goes out with urllib's default headers; a
    # browser User-Agent dict used to be built here but was never sent.
    # Close the HTTP response as soon as the body has been read.
    with urlopen(base_url) as page:
        html = page.read()

    soup = BeautifulSoup(html, features="html.parser")
    script = soup.find('script', id='bundle-builder-app--bundle--data')
    if script is None:
        # Without this check str(None) would be sliced and handed to the
        # parser, hiding the real cause of the failure.
        raise ValueError(
            "bundle data <script> tag not found on " + base_url)

    # NOTE(review): the fixed 506-character offset presumably skips the tag
    # opening and leading data before the beer list - fragile; confirm.
    beers = parse_js_object(str(script)[506:])

    results = []
    for beer in beers:
        name = beer['handle']
        print(name)

        img = beer['image']['src']
        print(img)

        print('\n')

        results.append([name, price, base_url, img, supplier, brewery])

    # Report completion before returning; placed after the return statement
    # this message could never be printed.
    print(f"Finished scraping: {supplier}!")
    return results
def parse_product_details_neptun(response):
    """Build a product dict from the ``productModel`` script of a page.

    Returns an empty dict when the page has no script containing
    ``productModel``.
    """
    utf8_response = response.replace(encoding='utf-8')
    raw_script = utf8_response.css(
        'script:contains(productModel)::text').get()
    if not raw_script:
        return {}

    # Resolve the escape sequences, then reinterpret the result as UTF-8.
    unescaped = codecs.decode(raw_script, 'unicode_escape')
    model = chompjs.parse_js_object(unescaped.encode('latin1').decode('utf8'))

    market_name = 'neptun'
    product = {
        "id": f'{market_name}-{model["Id"]}',
        'brand': model['Manufacturer']['Name'].title(),
        'product_name': model['Title'].replace("Лаптоп ", ""),
        'categories_path': [node['Name'] for node in model['NavigationPath']],
        'price': model['RegularPrice'],
        'discount_price': model['DiscountPrice'],
    }

    # The description is HTML; keep the non-blank text of its list items.
    description_html = scrapy.selector.Selector(text=model['Description'])
    stripped_lines = (
        text.strip()
        for text in description_html.xpath('//ul/li/text()').getall()
    )
    product['description'] = '\n'.join(
        line for line in stripped_lines if line)
    product['url'] = response.request.url
    product['market_name'] = market_name

    return product
Beispiel #4
0
 def test_nested_dict(self):
     """A dict nested inside another dict is parsed into nested dicts."""
     source = "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}"
     expected = {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #5
0
 def parse_infos(self, response):
     """Yield one ``ImmobiliareItem`` built from the ``js-hydration`` script."""
     hydration_js = response.xpath(
         '//script[@id="js-hydration"]/text()').get()
     page_data = chompjs.parse_js_object(hydration_js,
                                         json_params={'strict': False})
     loader = ItemLoader(ImmobiliareItem(), response)

     # The 'medium' values are looked up before 'multimedia' is deleted.
     loader.add_value('image_urls', nested_lookup('medium', page_data))
     page_data = nested_delete(page_data, 'multimedia')

     # Take the first 'listing' match and drop its 'type' and 'title' keys.
     listing = nested_lookup('listing', page_data)[0]
     for unwanted_key in ('type', 'title'):
         listing = nested_delete(listing, unwanted_key)

     loader.add_value('listing', listing)
     loader.add_value('trovokasa', nested_lookup('trovakasa', page_data))
     yield loader.load_item()
Beispiel #6
0
    def parse(self, response):
        """Yield a ``Gazette`` for every element under "data" in the response."""
        payload = chompjs.parse_js_object(response.text)

        for entry in payload.get("data"):
            yield Gazette(
                date=self.extract_date(entry),
                file_urls=[self.extract_url(entry)],
                edition_number=self.extract_edition_number(entry),
                is_extra_edition=self.extract_is_extra_edition(entry),
                power="executive_legislative",
            )
Beispiel #7
0
 def test_stack(self):
     """Deeply nested input parses correctly with ``initial_stack_size=1``."""
     source = "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}"
     expected = {
         'a': [
             {'b': 1},
             {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]},
         ]
     }
     parsed = parse_js_object(source, initial_stack_size=1)
     self.assertEqual(parsed, expected)
Beispiel #8
0
    def parse(self, response):
        """Discover the dates available in the home page calendar and request them."""
        calendar_js = re.search(
            r"var eventData = \[.+?\];", response.text, re.DOTALL
        ).group()

        for date_metadata in parse_js_object(calendar_js):
            gazette_date = self._parse_date(date_metadata["date"])

            # Skip dates that fall outside the configured interval.
            if self.start_date > gazette_date or gazette_date > self.end_date:
                continue

            if date_metadata["classname"] == "ambos":
                callback = self.parse_multi_publication_date
            else:
                callback = self.parse_single_publication_date

            day_url = f"{self.BASE_URL}doDia.php?dataEdicao={date_metadata['date']}"
            yield scrapy.Request(
                url=day_url,
                callback=callback,
                cb_kwargs={"date_metadata": date_metadata},
            )
    def parse_details(self, response):
        """Yield the product record found in the page's inline scripts.

        Every inline ``<script>`` is tried in turn; the first one that parses
        to a dict with a ``'product'`` key supplies the data, taken from
        ``['product']['preFetchedData']``.

        Yields:
            A dict with the keys ``shop``, ``url``, ``title``, ``price``,
            ``images`` and ``specs``. Nothing is yielded (and a warning is
            logged) when no script carries product data.
        """
        product = None
        for script in response.css('script::text').getall():
            try:
                parsed = chompjs.parse_js_object(script)
            except ValueError:
                # Scripts that do not parse as an object are simply skipped.
                continue
            # Only a dict can hold the 'product' entry; a parsed list must not
            # be indexed with a string key.
            if isinstance(parsed, dict) and 'product' in parsed:
                product = parsed['product']['preFetchedData']
                break

        if product is None:
            # Stop here instead of reading fields from an unrelated object
            # (or from an unbound name when no script parsed at all).
            self.logger.warning(
                'No product data found in scripts of %s', response.request.url)
            return

        yield {
            'shop': 'mechta.kz',
            'url': response.request.url,
            'title': clean_title(product['NAME'], 'Телефон сотовый'),
            'price': product['PRICE']['PRICE'],
            'images': product['PHOTO'],
            'specs': self.get_specs(product)
        }
Beispiel #10
0
    def parse1(self, response):
        """Yield a ``TescoItem`` per product on the page, then follow pagination.

        The products are read from the page's ``data-redux-state`` attribute
        and walked with ``self.traversen``. When a next-page link is present,
        a request for it is yielded with this method as its callback.
        """
        page_state = chompjs.parse_js_object(
            response.xpath("//@data-redux-state").extract_first())

        for product in self.traversen(page_state):
            # NOTE(review): the product record is passed as ``selector``;
            # only add_value() is used below, so it may be unnecessary - confirm.
            loader = ItemLoader(item=TescoItem(), selector=product)
            image_url = product['defaultImageUrl']
            image_name = self.cleanup(product['title'])
            loader.add_value('image_urls', image_url)
            loader.add_value('image_name', image_name)
            yield loader.load_item()

        # Evaluate the pagination XPath once and reuse the result.
        next_url = response.xpath("//nav[@class='pagination--page-selector-wrapper']/ul/li/a[@class='pagination--button prev-next'][@aria-label='Go to results page']/@href").get()
        if next_url:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse1,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
                },
                dont_filter=True,
            )
    
    # def parse1(self, response):
    #     for product in response.xpath("//ul[@class='product-list grid']/li[contains(@class,'product-list--list-item')]"):
    #         loader=ItemLoader(item=TescoItem(), selector=product)
    #         tesco_img_url = product.xpath(".//div/div/div/div/a/div/img/@srcset").get()
    #         #tesco_img_url = tesco[tesco.rfind("http"):tesco.rfind(" ")]
    #         tesco_prod_name = product.xpath(".//div[@class='product-details--wrapper']/div/h3/a/text()").get()
    #         print("********")
    #         print(tesco_img_url)
    #         print(tesco_prod_name)
    #         loader.add_value('image_urls', tesco_img_url)    
    #         loader.add_value('image_name',tesco_prod_name)
    #         yield loader.load_item()  
    #     next_url= response.xpath("//nav[@class='pagination--page-selector-wrapper']/ul/li/a[@class='pagination--button prev-next'][@aria-label='Go to results page']/@href").get()
    #     if next_url:
    #         next_lnk= response.urljoin(response.xpath("//nav[@class='pagination--page-selector-wrapper']/ul/li/a[@class='pagination--button prev-next'][@aria-label='Go to results page']/@href").get())
    #         yield scrapy.Request(url=next_lnk, callback=self.parse1, headers={
    #             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    #         },dont_filter=True)
    def parse_category_products(self, response):
        """Yield a ``UrlItem`` per product of a category page, then page requests.

        Product data is read from the script containing ``shopCategoryModel``.
        When the product count needs more than one page of ``items_show``
        entries, requests for pages 2 onward are yielded with this method as
        callback. Nothing is yielded when the script is absent.
        """
        items_show = 100
        base_url = response.request.url.split('?')[0]

        utf8_response = response.replace(encoding='utf-8')
        raw_script = utf8_response.css(
            'script:contains(shopCategoryModel)::text').get()
        if not raw_script:
            return

        # Resolve the escape sequences, then reinterpret the result as UTF-8.
        unescaped = codecs.decode(raw_script, 'unicode_escape')
        category = chompjs.parse_js_object(
            unescaped.encode('latin1').decode('utf8'))

        # Round the page count up when the last page is only partly filled.
        page_count, leftover = divmod(category["NumberOfProducts"], items_show)
        if leftover:
            page_count += 1

        product_links = [
            f"{base_url}/{product['Url']}" for product in category["Products"]
        ]
        for link in product_links:
            url_item = UrlItem()
            url_item["url"] = link
            yield url_item

        if page_count <= 1:
            return

        for page in range(2, page_count + 1):
            query = urlencode({'items': f'{items_show}', 'page': page})
            yield scrapy.Request(f"{base_url}?{query}",
                                 callback=self.parse_category_products)
Beispiel #12
0
 def parse_text(
     self,
     response,
     company_link,
     title,
     title_link,
     date_posted,
     location,
     company,
 ):
     """Yield a job record whose text is the cleaned JobPosting description.

     The description is read from the first script that contains
     ``JobPosting`` and passed through ``self.cleanhtml``; the remaining
     fields are copied from the arguments, with a fresh UUID as ``job_id``.
     """
     job_posting_js = response.xpath(
         "//script[contains(., 'JobPosting')]/text()").extract()[0]
     job_posting = chompjs.parse_js_object(job_posting_js)
     description = self.cleanhtml(job_posting["description"])

     record = {
         "company_link": company_link,
         "job_title": title,
         "link": title_link,
         "activated_at": date_posted,
         "job_id": uuid.uuid4(),
         "location": location,
         "company_name": company,
         "text": description
     }
     yield record
Beispiel #13
0
 def test_depth(self):
     """A deeply nested list keeps every level of nesting."""
     source = "[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]"
     expected = [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]
     self.assertEqual(parse_js_object(source), expected)
Beispiel #14
0
 def test_multiple_identifiers(self):
     """Several unquoted keys in one object are all recognised."""
     source = "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}"
     expected = dict.fromkeys('abcdefghij', 1)
     self.assertEqual(parse_js_object(source), expected)
Beispiel #15
0
 def test_one_field_dict(self):
     """An object with a single string field parses to a one-key dict."""
     source = "{'hello': 'world'}"
     expected = {'hello': 'world'}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #16
0
 def test_escaped_text(self):
     """An escaped quote and an escaped newline inside a string are decoded."""
     source = "{'a': '123\\'456\\n'}"
     expected = {'a': "123'456\n"}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #17
0
 def test_special_fields(self):
     """``true``, ``false`` and ``null`` map to True, False and None."""
     source = "{'a': true, 'b': false, 'c': null}"
     expected = {'a': True, 'b': False, 'c': None}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #18
0
 def test_nested_lists(self):
     """Empty lists nested three levels deep are preserved."""
     source = "[[[]]]"
     expected = [[[]]]
     self.assertEqual(parse_js_object(source), expected)
Beispiel #19
0
 def test_dict_with_multiple_element_list(self):
     """A dict value that is a list of several numbers is parsed intact."""
     source = "{'hello': [1, 2, 3, 4]}"
     expected = {'hello': [1, 2, 3, 4]}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #20
0
 def test_multiple_list_items_string(self):
     """A list of single-quoted strings parses to a list of str."""
     source = "['h', 'e', 'l', 'l', 'o']"
     expected = list('hello')
     self.assertEqual(parse_js_object(source), expected)
Beispiel #21
0
 def test_dict_with_lists(self):
     """Dict values may be an empty list or a one-element list."""
     source = "{'hello': [], 'world': [0]}"
     expected = {'hello': [], 'world': [0]}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #22
0
 def test_empty_dict(self):
     """An empty object parses to an empty dict."""
     parsed = parse_js_object("{}")
     self.assertEqual(parsed, dict())
Beispiel #23
0
 def test_single_list_item(self):
     """A list holding one number parses to a one-element list."""
     source = "[1]"
     expected = [1]
     self.assertEqual(parse_js_object(source), expected)
Beispiel #24
0
 def test_nested_lists_with_value(self):
     """A value wrapped in three nested lists keeps its nesting."""
     source = "[[[1]]]"
     expected = [[[1]]]
     self.assertEqual(parse_js_object(source), expected)
Beispiel #25
0
 def test_unicode_values(self):
     """A non-ASCII character inside a string value is returned unchanged."""
     source = "['\u00E9']"
     expected = ['é']
     self.assertEqual(parse_js_object(source), expected)
Beispiel #26
0
 def test_list_of_dicts(self):
     """A list whose items are objects parses to a list of dicts."""
     source = "[{'a':12}, {'b':33}]"
     expected = [{'a': 12}, {'b': 33}]
     self.assertEqual(parse_js_object(source), expected)
Beispiel #27
0
 def test_unicode_keys(self):
     """A nested key spelled with unicode escapes in the literal is '/test/'."""
     source = '{"cache":{"\u002Ftest\u002F": 0}}'
     expected = {'cache': {'/test/': 0}}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #28
0
 def test_non_quoted_identifier(self):
     """A long unquoted key is returned as a string key."""
     source = "{abcdefghijklmnopqrstuvwxyz: 12}"
     expected = {"abcdefghijklmnopqrstuvwxyz": 12}
     self.assertEqual(parse_js_object(source), expected)
Beispiel #29
0
 def test_multiple_list_items(self):
     """A list of several numbers parses to the same numbers in order."""
     source = "[1, 2, 3, 4]"
     expected = [1, 2, 3, 4]
     self.assertEqual(parse_js_object(source), expected)
Beispiel #30
0
 def test_empty_list(self):
     """An empty array parses to an empty list."""
     parsed = parse_js_object("[]")
     self.assertEqual(parsed, list())