def parse_details(self, response, item):
    """Fill *item* with flat-ad fields scraped from the detail page.

    Values come from two sources: a JS object embedded in the page
    (parsed with chompjs) and plain XPath selectors on the HTML.
    Returns the populated item, or None when the JS object cannot
    be parsed.
    """
    # script_obj = json.loads(response.xpath(SCRIPT_OBJ_XPATH).get())
    try:
        js_data = chompjs.parse_js_object(response.css(SCRIPT_OBJ_CSS).get())
    except Exception as e:
        self.logger.error(f'{e} when script object parsing')
        return

    item['krisha_id'] = get_nested_item(js_data, FLAT_KRISHA_ID_KEY)
    item['title'] = response.xpath(FLAT_TITLE_XPATH).get()
    # Not present on the page itself; left for other pipelines to fill.
    item['seller_phone'] = None
    item['views_count'] = None
    item['price'] = get_nested_item(js_data, FLAT_PRICE_KEY)
    item['rooms_count'] = get_nested_item(js_data, FLAT_ROOMS_COUNT_KEY)
    item['total_area'] = get_nested_item(js_data, FLAT_TOTAL_AREA_KEY)
    item['ceiling_height'] = response.xpath(FLAT_CEILING_HEIGHT_XPATH).get()
    item['region'] = get_nested_item(js_data, FLAT_REGION_KEY)
    item['city'] = get_nested_item(js_data, FLAT_CITY_KEY)
    item['address'] = get_nested_item(js_data, FLAT_ADDRESS_KEY)
    item['flat_floor'] = response.xpath(FLAT_FLOOR_XPATH).get()
    item['longitude'] = get_nested_item(js_data, FLAT_LONGITUDE_KEY)
    item['attitude'] = get_nested_item(js_data, FLAT_ATTITUDE_KEY)
    # Building-level attributes are only available in the rendered HTML.
    item['construction_year'] = response.xpath(BUILDING_CONSTRUCTION_YEAR_XPATH).get()
    item['floors_count'] = response.xpath(BUILDING_FLOORS_COUNT_XPATH).get()
    item['wall_type'] = response.xpath(BUILDING_WALL_TYPE_XPATH).get()
    item['seller_user_type'] = get_nested_item(js_data, FLAT_SELLER_USER_TYPE_KEY)
    item['description'] = response.xpath(FLAT_DESCRIPTION_XPATH).getall()
    return item
def scrape_beer_bazaar():
    """Scrape the Beer Bazaar bundle page and return a list of beers.

    Each entry is ``[name, price, url, image_url, supplier, brewery]``.

    Fixes: the "Finished scraping" print was placed AFTER the return
    statement and could never execute; it now runs before returning.
    """
    base_url = 'https://beerbazaar.co.il/apps/bundles/bundle/17591'
    # NOTE(review): headers is defined but never sent with the request —
    # presumably it was meant for a urllib Request; confirm before removing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)'
    }
    price = '8 for 99, 12 for 129'
    brewery = 'Beer Bazaar'
    supplier = brewery
    results = []

    html = urlopen(base_url).read()
    soup = BeautifulSoup(html, features="html.parser")
    script = soup.find('script', id='bundle-builder-app--bundle--data')
    # Skip the leading boilerplate before the embedded JS object.
    script = str(script)[506:]
    beers = parse_js_object(script)
    for beer in beers:
        # Name
        name = beer['handle']
        print(name)
        # Image
        img = beer['image']['src']
        print(img)
        print('\n')
        new_beer = [name, price, base_url, img, supplier, brewery]
        results.append(new_beer)
    # BUG FIX: previously unreachable (was placed after `return results`).
    print(f"Finished scraping: {supplier}!")
    return results
def parse_product_details_neptun(response):
    """Extract a laptop product record from a Neptun product page.

    The page embeds a JS ``productModel`` object; it is unescaped,
    re-decoded as UTF-8 and parsed with chompjs. Returns a dict with
    id, brand, name, category path, prices, description and url
    (empty dict when the script is absent).
    """
    market_name = 'neptun'
    product = {}
    utf8_response = response.replace(encoding='utf-8')
    raw_script = utf8_response.css('script:contains(productModel)::text').get()
    if raw_script:
        # Payload is doubly escaped: undo \uXXXX escapes, then
        # reinterpret the resulting bytes as UTF-8.
        fixed = codecs.decode(
            raw_script, 'unicode_escape').encode('latin1').decode('utf8')
        data = chompjs.parse_js_object(fixed)
        product["id"] = f'{market_name}-{data["Id"]}'
        product['brand'] = data['Manufacturer']['Name'].title()
        product['product_name'] = data['Title'].replace("Лаптоп ", "")
        product['categories_path'] = [
            node['Name'] for node in data['NavigationPath']
        ]
        product['price'] = data['RegularPrice']
        product['discount_price'] = data['DiscountPrice']
        # Description arrives as an HTML fragment; keep only the bullets.
        desc_sel = scrapy.selector.Selector(text=data['Description'])
        bullets = desc_sel.xpath('//ul/li/text()').getall()
        if bullets:
            bullets = [line.strip() for line in bullets if line.strip()]
            product['description'] = '\n'.join(bullets)
        product['url'] = response.request.url
        product['market_name'] = market_name
    return product
def test_nested_dict(self):
    """Nested object literals become nested Python dicts."""
    parsed = parse_js_object(
        "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}")
    expected = {
        'hello': 'world',
        'my': {'master': 'of Orion'},
        'test': 'xx',
    }
    self.assertEqual(parsed, expected)
def parse_infos(self, response):
    """Parse a listing page: load the hydration JSON and yield one item.

    Image URLs are collected from every ``medium`` entry, then the bulky
    ``multimedia`` subtree is dropped before storing the listing payload.

    Fixes: removed the dead ``data`` binding in
    ``data_string = data = response.xpath(...)`` — ``data`` was
    immediately overwritten by the chompjs result; also removed
    commented-out debug code.
    """
    data_string = response.xpath('//script[@id="js-hydration"]/text()').get()
    data = chompjs.parse_js_object(data_string, json_params={'strict': False})

    loader = ItemLoader(ImmobiliareItem(), response)
    loader.add_value('image_urls', nested_lookup('medium', data))

    data = nested_delete(data, 'multimedia')
    # Strip presentation-only fields from the listing payload.
    listing = nested_delete(nested_lookup('listing', data)[0], 'type')
    listing = nested_delete(listing, 'title')
    loader.add_value('listing', listing)
    loader.add_value('trovokasa', nested_lookup('trovakasa', data))
    yield loader.load_item()
def parse(self, response):
    """Yield one Gazette item per entry of the JSON-ish "data" payload.

    Fixes: ``response_js.get("data")`` returns None when the key is
    missing, which previously crashed the for-loop with
    ``TypeError: 'NoneType' object is not iterable``; guarded with
    ``or []`` so an empty/missing payload yields nothing.
    """
    response_js = chompjs.parse_js_object(response.text)
    for element in response_js.get("data") or []:
        date = self.extract_date(element)
        url = self.extract_url(element)
        edition_number = self.extract_edition_number(element)
        is_extra_edition = self.extract_is_extra_edition(element)
        yield Gazette(
            date=date,
            file_urls=[url],
            edition_number=edition_number,
            is_extra_edition=is_extra_edition,
            power="executive_legislative",
        )
def test_stack(self):
    """Deeply nested input parses correctly even with a tiny initial stack."""
    parsed = parse_js_object(
        "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}",
        initial_stack_size=1,
    )
    expected = {
        'a': [
            {'b': 1},
            {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]},
        ],
    }
    self.assertEqual(parsed, expected)
def parse(self, response):
    """Discover the dates available in the home-page calendar and request them."""
    # Grab the inline "var eventData = [...]" declaration and parse it.
    calendar_js = re.search(
        r"var eventData = \[.+?\];", response.text, re.DOTALL).group()
    for date_metadata in parse_js_object(calendar_js):
        gazette_date = self._parse_date(date_metadata["date"])
        if not (self.start_date <= gazette_date <= self.end_date):
            continue
        # "ambos" means both publication types exist for that date.
        if date_metadata["classname"] == "ambos":
            callback = self.parse_multi_publication_date
        else:
            callback = self.parse_single_publication_date
        yield scrapy.Request(
            url=f"{self.BASE_URL}doDia.php?dataEdicao={date_metadata['date']}",
            callback=callback,
            cb_kwargs={"date_metadata": date_metadata},
        )
def parse_details(self, response):
    """Extract product data from the page's inline scripts and yield it.

    Scans every <script> until one parses into an object containing the
    pre-fetched product data.

    Fixes: previously, a page where no script parsed at all left ``obj``
    undefined (NameError), and a page where no parsed object had a
    'product' key used the LAST parsed object and crashed on KeyError.
    Now only a genuine product payload is used; otherwise nothing is
    yielded. Also dropped the unused ``as e`` binding.
    """
    obj = None
    for script in response.css('script::text').getall():
        try:
            candidate = chompjs.parse_js_object(script)
        except ValueError:
            continue
        if isinstance(candidate, dict) and 'product' in candidate:
            obj = candidate['product']['preFetchedData']
            break
    if obj is None:
        # No product payload on this page; nothing to yield.
        return
    yield {
        'shop': 'mechta.kz',
        'url': response.request.url,
        'title': clean_title(obj['NAME'], 'Телефон сотовый'),
        'price': obj['PRICE']['PRICE'],
        'images': obj['PHOTO'],
        'specs': self.get_specs(obj),
    }
def parse1(self, response):
    """Parse a Tesco product-grid page rendered from its redux state.

    Yields one image item per product, then follows the "next page"
    pagination link with the same callback.

    Fixes: the pagination XPath was evaluated twice (once for the check,
    once for the urljoin) — it is now evaluated once; removed the dead
    ``" "`` placeholder assignments and a large commented-out duplicate
    of this method.
    """
    next_page_xpath = (
        "//nav[@class='pagination--page-selector-wrapper']/ul/li/"
        "a[@class='pagination--button prev-next']"
        "[@aria-label='Go to results page']/@href"
    )
    state = chompjs.parse_js_object(
        response.xpath("//@data-redux-state").extract_first())
    for product in self.traversen(state):
        loader = ItemLoader(item=TescoItem(), selector=product)
        image_url = product['defaultImageUrl']
        name = self.cleanup(product['title'])
        loader.add_value('image_urls', image_url)
        loader.add_value('image_name', name)
        yield loader.load_item()
    next_url = response.xpath(next_page_xpath).get()
    if next_url:
        yield scrapy.Request(
            url=response.urljoin(next_url),
            callback=self.parse1,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
            },
            dont_filter=True,
        )
def parse_category_products(self, response):
    """Yield a UrlItem per product in the category, following pagination.

    Reads the embedded ``shopCategoryModel`` JS object to obtain the
    product list and the total product count, then requests every
    remaining page of the category.
    """
    page_size = 100
    current_url = response.request.url
    base_url = current_url.split('?')[0]
    query = {'items': f'{page_size}'}

    utf8_response = response.replace(encoding='utf-8')
    raw_script = utf8_response.css(
        'script:contains(shopCategoryModel)::text').get()
    if not raw_script:
        return

    # Payload is doubly escaped: undo \uXXXX escapes, then re-read as UTF-8.
    decoded = codecs.decode(
        raw_script, 'unicode_escape').encode('latin1').decode('utf8')
    model = chompjs.parse_js_object(decoded)

    total = model["NumberOfProducts"]
    # Ceiling division: one extra page for any remainder.
    page_count = total // page_size + (1 if total % page_size else 0)

    for product in model["Products"]:
        url_item = UrlItem()
        url_item["url"] = f"{base_url}/{product['Url']}"
        yield url_item

    # range() is empty when page_count <= 1, so no explicit guard needed.
    for page in range(2, page_count + 1):
        query["page"] = page
        yield scrapy.Request(f"{base_url}?{urlencode(query)}",
                             callback=self.parse_category_products)
def parse_text(
    self,
    response,
    company_link,
    title,
    title_link,
    date_posted,
    location,
    company,
):
    """Yield one job record, pulling the description from the JobPosting JSON-LD."""
    posting_script = response.xpath(
        "//script[contains(., 'JobPosting')]/text()").extract()[0]
    posting = chompjs.parse_js_object(posting_script)
    description = self.cleanhtml(posting["description"])
    yield {
        "company_link": company_link,
        "job_title": title,
        "link": title_link,
        "activated_at": date_posted,
        "job_id": uuid.uuid4(),
        "location": location,
        "company_name": company,
        "text": description,
    }
def test_depth(self):
    """Fifteen levels of list nesting round-trip unchanged."""
    expected = 1
    for _ in range(15):
        expected = [expected]
    self.assertEqual(parse_js_object("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]"), expected)
def test_multiple_identifiers(self):
    """Unquoted keys are accepted for every entry of a flat object."""
    parsed = parse_js_object("{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}")
    self.assertEqual(parsed, dict.fromkeys('abcdefghij', 1))
def test_one_field_dict(self):
    """A single key/value object parses into a one-entry dict."""
    parsed = parse_js_object("{'hello': 'world'}")
    expected = {'hello': 'world'}
    self.assertEqual(parsed, expected)
def test_escaped_text(self):
    """Escaped quotes and newlines inside string values are unescaped."""
    parsed = parse_js_object("{'a': '123\\'456\\n'}")
    expected = {'a': "123'456\n"}
    self.assertEqual(parsed, expected)
def test_special_fields(self):
    """JS literals true/false/null map to True/False/None."""
    parsed = parse_js_object("{'a': true, 'b': false, 'c': null}")
    expected = {'a': True, 'b': False, 'c': None}
    self.assertEqual(parsed, expected)
def test_nested_lists(self):
    """Empty lists nest without any values."""
    parsed = parse_js_object("[[[]]]")
    expected = [[[]]]
    self.assertEqual(parsed, expected)
def test_dict_with_multiple_element_list(self):
    """A list value keeps all of its elements in order."""
    parsed = parse_js_object("{'hello': [1, 2, 3, 4]}")
    self.assertEqual(parsed, {'hello': list(range(1, 5))})
def test_multiple_list_items_string(self):
    """A list of one-character strings parses element by element."""
    parsed = parse_js_object("['h', 'e', 'l', 'l', 'o']")
    self.assertEqual(parsed, list('hello'))
def test_dict_with_lists(self):
    """Both empty and populated list values are supported."""
    parsed = parse_js_object("{'hello': [], 'world': [0]}")
    expected = {'hello': [], 'world': [0]}
    self.assertEqual(parsed, expected)
def test_empty_dict(self):
    """An empty object literal becomes an empty dict."""
    self.assertEqual(parse_js_object("{}"), dict())
def test_single_list_item(self):
    """A one-element array parses to a one-element Python list."""
    parsed = parse_js_object("[1]")
    self.assertEqual(parsed, [1])
def test_nested_lists_with_value(self):
    """A value survives several levels of list nesting."""
    expected = [[[1]]]
    self.assertEqual(parse_js_object("[[[1]]]"), expected)
def test_unicode_values(self):
    """Unicode escapes in values decode to the actual characters."""
    parsed = parse_js_object("['\u00E9']")
    self.assertEqual(parsed, ['é'])
def test_list_of_dicts(self):
    """A list may contain several independent objects."""
    parsed = parse_js_object("[{'a':12}, {'b':33}]")
    expected = [{'a': 12}, {'b': 33}]
    self.assertEqual(parsed, expected)
def test_unicode_keys(self):
    """Unicode escapes inside object keys are decoded."""
    parsed = parse_js_object('{"cache":{"\u002Ftest\u002F": 0}}')
    expected = {'cache': {'/test/': 0}}
    self.assertEqual(parsed, expected)
def test_non_quoted_identifier(self):
    """Bare identifiers are valid object keys."""
    parsed = parse_js_object("{abcdefghijklmnopqrstuvwxyz: 12}")
    self.assertEqual(parsed, {'abcdefghijklmnopqrstuvwxyz': 12})
def test_multiple_list_items(self):
    """Integer list elements parse in order."""
    self.assertEqual(parse_js_object("[1, 2, 3, 4]"), list(range(1, 5)))
def test_empty_list(self):
    """An empty array literal becomes an empty list."""
    parsed = parse_js_object("[]")
    self.assertEqual(parsed, [])