def parse_box(self, response):
    """Build one ``ScrapingboxesItem`` from a Tarra-pack product page.

    The item is filled in three passes: from the page title, from the first
    paragraph of the product description, and from the tier-price list.

    Args:
        response: Scrapy response for the product page.

    Yields:
        The populated ``ScrapingboxesItem``.
    """
    description_xpath = (
        '//*[@class="product attribute description"]//p[1]/text()')
    tier_xpath = '//li[@class="item"]'

    box = ScrapingboxesItem()
    updater = ItemUpdaterTarra(item=box, measured_in='mm')

    # Pass 1: fields parsed out of the <h1> title text.
    updater.update_item(
        'description',
        'all_inner_dimensions',
        'tags',
        'standard_size',
        'product_type',
        'color',
        text_element=response.xpath('//h1/span/text()'),
    )

    # Pass 2: the description paragraph is handed over both as selectors
    # (to the table handler) and as plain strings (to the updater).
    spec_handler = TableHandlerTarra(
        text_elements=response.xpath(description_xpath))
    updater.analyse_table_rows(
        string_list=response.xpath(description_xpath).getall(),
        table_handler=spec_handler,
    )

    # Pass 3: prices. The return values are discarded here, so the handler
    # presumably writes its results onto the item it was given -- confirm
    # against PriceHandler2.
    pricing = PriceHandler2(item=box)
    pricing.create_price_table(
        tier_elements=response.xpath(tier_xpath),
        price_elements=response.xpath(
            tier_xpath + '//span[@data-label="Excl. BTW"]'),
    )
    pricing.get_base_price_from_price_table()

    # The url field is not populated by this callback; the assignment is
    # kept disabled as in the original.
    # box['url'] = response.request.url
    box['company'] = 'Tarra-pack'
    yield box
def parse_box(self, response):
    """Build one ``ScrapingboxesItem`` from a Verpakkingsindustrie Veenendaal
    product page.

    Populates stock status, minimum purchase, base price, tier-price table,
    url, title-derived fields, spec-table fields and the product image url.

    Args:
        response: Scrapy response for the product page.

    Yields:
        The populated ``ScrapingboxesItem``.
    """
    box = ScrapingboxesItem()

    # Stock status is read from the alt text of an image in the add-to-cart
    # form. ``.get()`` returns None when the image is missing, so guard the
    # ``.lower()`` call instead of crashing the whole callback.
    stock_alt = response.xpath(
        '//*[@id="product_addtocart_form"]/div[3]/img/@alt').get()
    if stock_alt is None:
        box['in_stock'] = None
    else:
        box['in_stock'] = find_in_stock(stock_alt.lower())

    # retrieve prices and bundle size
    box_data = ItemUpdaterViv(item=box, measured_in="mm")
    box_data.update_item(
        'minimum_purchase',
        text_element=response.xpath('//*[@class="product-shop"]/text()[4]'))

    price_handler = PriceHandlerViv(box)
    page_price = price_handler.create_base_price_manually(
        response.xpath(
            '//div[@class="product-shop"]//*[@class="per-one"]//*[@class="price-excluding-tax"]/*[@class="price"]'
        ))
    # Scale the scraped price by 100/121 -- presumably to strip 21% VAT
    # (BTW) from a VAT-inclusive amount; confirm against the site.
    box['price_ex_BTW'] = round((page_price / 121) * 100, 2)

    box['price_table'] = price_handler.create_price_table(
        tier_elements=response.xpath(
            "//*[@class='tier-prices product-pricing']/li"),
        price_elements=response.xpath(
            "//*[@class='tier-prices product-pricing']/li/span[1]"))
    box['url'] = response.request.url

    # update item with product description (the <h1> inside #product-name)
    box_data.update_item(
        "tags",
        "description",
        "wall_thickness",
        "standard_size",
        'product_type',
        'bottles',
        description_element=response.xpath('//*[@id="product-name"]/h1'))

    # analyse specs table
    indices_object = TableHandlerViv(
        header_elements=response.xpath('//tbody/tr/th'))
    box_data.analyse_table_rows(
        table_handler=indices_object,
        row_elements=response.xpath('//tbody/tr/td'))

    # indices_dict for testing purposes: a 3-tuple of handler internals
    box['indices_dict'] = (
        indices_object.indices_dict,
        indices_object.multiple_inner_dimensions_words,
        indices_object.column_names,
    )
    box['company'] = 'Verpakkingsindustrie Veenendaal'

    # Product image: only store a url that was actually found, so the list
    # never contains None.
    image_src = response.xpath(
        '//p[@class="product-image"]/a/img/@src').get()
    box['image_urls'] = [image_src] if image_src else []
    yield box
def parse_box(self, response):
    """Build one ``ScrapingboxesItem`` from a PacoVerpakkingen product page.

    Args:
        response: Scrapy response for the product page.

    Yields:
        The populated ``ScrapingboxesItem``.
    """
    features_table = '//table[@class="featurestable"]'
    discounts_rows = '//table[@class="table-product-discounts"]//tr[position() >1]'

    box = ScrapingboxesItem()

    # First column of the features table is given to the handler as headers,
    # the second column is analysed as the matching row values.
    spec_handler = TableHandlerPaco(
        header_elements=response.xpath(features_table + '//td[1]'))
    updater = ItemUpdaterPaco(item=box, measured_in="cm")

    updater.update_item(
        'description',
        'tags',
        'standard_size',
        'product_type',
        description_element=response.xpath('//h1[@itemprop="name"]'),
    )
    updater.analyse_table_rows(
        row_elements=response.xpath(features_table + '//td[2]'),
        table_handler=spec_handler,
    )

    # 'minimum_purchase' is read from the item here, so it must have been set
    # by one of the updater calls above -- presumably the features-table
    # analysis; confirm.
    pricing = PriceHandlerPaco(
        item=box, price_multiplier=box['minimum_purchase'])

    base_price_element = response.xpath(
        '//div[@class="product-prices"]//*[@itemprop="price"]')
    box['price_ex_BTW'] = pricing.create_base_price_manually(
        price_element=base_price_element)

    # Discount table: column 1 holds the tiers, column 2 the prices; the
    # first table row is skipped by the position() filter.
    box['price_table'] = pricing.create_price_table(
        tier_elements=response.xpath(discounts_rows + '/td[1]'),
        price_elements=response.xpath(discounts_rows + '/td[2]'),
    )

    box['url'] = response.request.url
    box['company'] = 'PacoVerpakkingen'
    yield box
def parse(self, response):
    """Yield one ``ScrapingboxesItem`` per product link on the page.

    Only the ``description`` field is set, taken from the text of each
    element with class ``product-item-link``.

    Args:
        response: Scrapy response for a listing page.

    Yields:
        One item per matched product link.
    """
    for link in response.xpath('//*[@class="product-item-link"]'):
        item = ScrapingboxesItem()
        item['description'] = link.xpath('./text()').get()
        yield item
def parse_category(self, response):
    """Yield one ``ScrapingboxesItem`` per row of a Dozen.nl category table.

    Pages without a ``from-price`` element produce no items.

    Args:
        response: Scrapy response for a category page.

    Yields:
        One populated item per product row.
    """
    # Skip pages that have no products.
    if not response.xpath("//*[@class='from-price']").get():
        return

    product_rows = response.xpath(
        "//*[@class='table products-view']/tbody/tr")
    for row_number, row in enumerate(product_rows):
        # Row cap that only applies while TESTING is truthy.
        if row_number > TestSettings.MAX_ROWS and TESTING:
            break

        box = ScrapingboxesItem()
        updater = ItemUpdaterDozen(item=box, measured_in="mm")
        header_handler = TableHandlerDozen(
            header_elements=response.xpath("//thead/tr/th"))

        # Fill the item from this row's cells.
        updater.analyse_table_rows(
            table_handler=header_handler,
            row_elements=row.xpath(".//td"),
        )

        # Tier prices come from the data-content attribute of #tierprices;
        # the base price is the value stored under the table's first key.
        tier_prices = create_price_table_dozenNL(
            string=row.xpath(".//*[@id='tierprices']/@data-content").get())
        box['price_table'] = tier_prices
        first_tier = list(box['price_table'])[0]
        box['price_ex_BTW'] = box['price_table'][first_tier]

        # Fields taken from the category page title.
        updater.update_item(
            'description',
            'tags',
            'standard_size',
            'product_type',
            description_element=response.xpath(
                "//*[@class='page-title category-title']/h1"),
        )

        # Extra info found in the image alt attribute of the first cell.
        updater.update_item(
            "color", "tags", text_element=row.xpath('.//td[1]//@alt'))

        # Build the product url from the listing url and the dimensions.
        # example: https://www.dozen.nl/gekleurde-dozen/gekleurde-vouwdozen/breedte/155/hoogte/80/lengte/210.html
        if 'inner_dim3' in box:
            box['url'] = response.request.url.replace(
                "/show/all.html",
                f"/breedte/{int(box['inner_dim2'])}/hoogte/{int(box['inner_dim3'])}/lengte/{int(box['inner_dim1'])}.html",
            )
        elif 'inner_variable_dimension_MIN' in box:
            # No fixed third dimension: the url carries width and length only.
            box['url'] = response.request.url.replace(
                "/show/all.html",
                f"/breedte/{int(box['inner_dim2'])}/lengte/{int(box['inner_dim1'])}.html",
            )
        else:
            box['url'] = "error"

        box['company'] = "Dozen.nl"
        yield box
def parse_box(self, response):
    """Build one ``ScrapingboxesItem`` from a Europresto product page.

    Args:
        response: Scrapy response for the product page.

    Yields:
        The populated ``ScrapingboxesItem``.
    """
    box = ScrapingboxesItem()
    updater = ItemUpdaterEuropresto(item=box, measured_in="mm")

    # Specifications list: the <li> elements act as headers, their <span>
    # children as the row values.
    spec_handler = TableHandlerEuropresto(
        header_elements=response.xpath("//*[@class='specifics']/li"))
    updater.analyse_table_rows(
        row_elements=response.xpath("//*[@class='specifics']/li/span"),
        table_handler=spec_handler,
    )

    # Fields parsed from the main title.
    updater.update_item(
        'description',
        'tags',
        'all_inner_dimensions',
        'standard_size',
        'wall_thickness',
        'product_type',
        'color',
        description_element=response.xpath(
            "//*[@class='product-description']/*/h1"),
    )

    # Bulk prices: fall back to an empty table unless both the price and
    # the tier elements are present.
    pricing = PriceHandler()
    bulk_prices = response.xpath("//*[@class='bulk']/li/*[@class='price']")
    bulk_tiers = response.xpath("//*[@class='bulk']/li/*[@class='from']")
    if bulk_prices and bulk_tiers:
        box['price_table'] = pricing.create_price_table(
            price_elements=bulk_prices,
            tier_elements=bulk_tiers,
        )
    else:
        box['price_table'] = {}

    box['price_ex_BTW'] = pricing.create_base_price_manually(
        price_element=response.xpath(
            "//*[@class='product-price']//*[@class='euro']"))

    # Remaining item attributes.
    box["url"] = response.request.url
    box["company"] = "Europresto"

    # For testing: a 2-tuple of handler internals.
    box["indices_dict"] = (
        spec_handler.indices_dict,
        spec_handler.column_names,
    )
    yield box
def parse_box(self, response):
    """Yield one ``ScrapingboxesItem`` per body row of every table on a
    Tupak page.

    Args:
        response: Scrapy response for a page containing product tables.

    Yields:
        One populated item per ``tbody/tr`` row.
    """
    for product_table in response.xpath('//table'):
        for row in product_table.xpath('tbody/tr'):
            box = ScrapingboxesItem()

            # Column headers come from the second header row with class
            # "rij-2" of the table this row belongs to.
            header_handler = TableHandlerTupak(
                header_elements=product_table.xpath(
                    'thead/tr[@class="rij-2"][2]/th'))
            updater = ItemUpdater2(item=box, measured_in="mm")

            # Fields parsed from the page's <h1>.
            updater.update_item(
                "tags",
                "color",
                "wall_thickness",
                "description",
                'product_type',
                description_element=response.xpath('//h1'),
            )

            # Fields parsed from this row's cells.
            updater.analyse_table_rows(
                table_handler=header_handler,
                row_elements=row.xpath('td'),
            )

            # Prefer the row's own product link; otherwise use the page url.
            row_href = row.xpath('./td/a/@href').get()
            box["url"] = (
                'https://www.tupak.com' + row_href
                if row_href
                else response.request.url
            )

            # Tier headers are shared per table, prices are per row.
            pricing = PriceHandler2(item=box)
            box["price_table"] = pricing.create_price_table(
                tier_elements=product_table.xpath(
                    './/tr[@class="rij-2"][2]/th[@class="staffel"]'),
                price_elements=row.xpath('./td[contains(@class, "prijs")]'),
            )
            box['price_ex_BTW'] = pricing.get_base_price_from_price_table()

            # Fields set by hand.
            box["company"] = 'Tupak'
            box["in_stock"] = None

            # For testing.
            box["indices_dict"] = header_handler.indices_dict
            yield box
def parse(self, response):
    """Follow every category link, passing along a pre-seeded item.

    For each anchor inside ``Shop01catOuterWrapper`` a new
    ``ScrapingboxesItem`` is created, its ``product_type`` is derived from
    the anchor's ``title`` attribute, and the category page is requested
    with ``?page=1&perPage=300`` appended to the link.

    Args:
        response: Scrapy response for the category overview page.

    Yields:
        One request per category link, handled by ``self.parse_category``
        with the item available under ``meta['item']``.
    """
    category_anchors = response.xpath(
        '//*[@class="Shop01catOuterWrapper"]//a')
    for anchor in category_anchors:
        link = anchor.xpath('@href').get()
        if link is None:
            # An anchor without href cannot be followed. Previously the
            # string concatenation below raised TypeError here, which ended
            # the generator and dropped every remaining category.
            continue

        box = ScrapingboxesItem()
        box_data = ItemUpdater2(box, measured_in='cm')
        box_data.update_item(
            'product_type', text_element=anchor.xpath('@title'))

        link += "?page=1&perPage=300"
        yield response.follow(link, self.parse_category, meta={'item': box})
def parse_box(self, response):
    """Scaffold callback for trying out selectors on a new site.

    Yields plain dicts with a ``test_field`` and the page ``url`` instead of
    a filled item: one per string matched by the header XPath, followed by a
    final dict whose ``test_field`` is None.

    Args:
        response: Scrapy response for the page under test.

    Yields:
        Dicts of the form ``{'test_field': ..., 'url': ...}``.
    """
    box = ScrapingboxesItem()
    # table_handler = TableHandlerTest(header_elements=None)
    box_data = ItemUpdaterTest(item=box, measured_in="mm")
    price_handler = PriceHandler(price_multiplier=None)

    ## product description element test
    # product_description_element = None
    # yield {'test_field': product_description_element, 'url': response.request.url}

    # Table header element test: emit every matched text node.
    for header_text in response.xpath('/text()').getall():
        yield {'test_field': header_text, 'url': response.request.url}

    # Placeholder slots for the selectors still to be written.
    table_row_elements = None
    price_element = None
    price_tier_elements = None
    price_tierprice_elements = None

    yield {'test_field': None, 'url': response.request.url}
def parse_box(self, response):
    """Build one ``ScrapingboxesItem`` from a Vermeij product page.

    Args:
        response: Scrapy response for the product page.

    Yields:
        The populated ``ScrapingboxesItem``.

    Raises:
        ValueError: If the page does not say whether prices are per box
            ("doos") or per piece ("stuk"), so no price handler can be
            chosen.
    """
    box = ScrapingboxesItem()
    # table_handler = TableHandlerTest(header_elements=None)
    box_data = ItemUpdaterVermeij(item=box, measured_in="cm")

    # Fields parsed from the product title.
    product_description_element = response.xpath(
        '//div[@class="mobile-title-nr"]/h1[@itemprop="name"]/text()')
    box_data.update_item(
        'description',
        'all_inner_dimensions',
        'tags',
        'box_type',
        'standard_size',
        text_element=product_description_element,
    )

    # Fields parsed from the extra-specs table.
    table_handler = TableHandlerVermeij(
        header_elements=response.xpath('//*[@class="extraspecs-row"]//td[1]')
    )
    box_data.analyse_table_rows(
        row_elements=response.xpath('//*[@class="extraspecs-row"]//tr'),
        table_handler=table_handler,
    )

    # Decide whether prices are per box or per piece. The price block is
    # consulted first; the tier-table header is only used when the price
    # block yields nothing.
    box_or_piece_text = response.xpath(
        '//*[@class="Shop01DetailPrijs"]/span[1]/text()').get()
    other_box_or_piece_text = response.xpath(
        '//table[@class="staffelkortingen"]//tr[1]/th[4]').get()
    unit_text = box_or_piece_text or other_box_or_piece_text

    if unit_text and 'doos' in unit_text:
        # Per-box price: the handler gets the item's minimum_purchase as
        # its price multiplier.
        price_handler = PriceHandler2(
            box, price_multiplier=box['minimum_purchase'])
    elif unit_text and 'stuk' in unit_text:
        price_handler = PriceHandler2(box)
    else:
        # Also covers the case where neither text was found, which used to
        # surface later as an unbound ``price_handler``.
        raise ValueError(
            "No pricehandler, there is something wrong with the box_or_piece_text")

    # Create the price table when tier rows exist; otherwise read the single
    # price shown in the price block.
    tier_elements = response.xpath(
        '//table[@class="staffelkortingen"]//tr[position() >1]/th')
    if tier_elements:
        price_handler.create_price_table(
            tier_elements=tier_elements,
            price_elements=response.xpath(
                '//table[@class="staffelkortingen"]//tr[position() >1]/td[3]'),
        )
        price_handler.get_base_price_from_price_table()
    else:
        price_handler.create_base_price_manually(
            price_element=response.xpath(
                '//*[@class="Shop01DetailPrijs"]/span[2]')
        )

    box['url'] = response.request.url
    box['company'] = 'Vermeij'
    yield box
def parse_box_table(self, response): # iterate over box rows boxes_rows = response.xpath('//*[@id="tbody_1"]/tr') for idx, row in enumerate(boxes_rows): # initialize Item and data object box = ScrapingboxesItem() table_handler = TableHandlerRajapack( header_elements=response.xpath('//thead[@id="thead_1"]/tr/th') ) # crate box data updater box_data = ItemUpdater2(item=box, measured_in=table_handler.get_measurement_unit()) # create data from product description box_data.update_item( "tags", "wall_thickness", # todo driedubbelgolf wordt niet gepakt, palletdozen "description", "color", 'product_type', description_element=response.xpath('//*[@test-ihm="ProductName"]') ) # iterate over row indices and update box box_data.analyse_table_rows( table_handler=table_handler, row_elements=row.xpath("./td") ) # create product url base_url = response.request.url.split("_")[0] product_code = row.xpath('.//*[@class="tooltip-img"]/text()').get() try: product_url = base_url + "_sku" + product_code + ".html" box["url"] = product_url except TypeError: raise TypeError(base_url, product_code, box) # use PriceHandler price_handler = PriceHandlerRajapack(item=box, # price_multiplier=box['minimum_purchase'] ) box["price_table"] = price_handler.create_price_table( tier_elements=response.xpath('//*[@id="thead_1"]/tr[2]/th'), price_elements=row.xpath('./td[contains(@class, "nobdr")]') ) # HANDLE 'Prijs per doos/pak' per_text = response.xpath('//th[contains(@class, "promo")]/b[1]/text()').get() multiplier = box.get('minimum_purchase', 1) if 'pak' in per_text: new_price_table = {} for key, value in box['price_table'].items(): new_price_table[ key * multiplier ] = round(value / multiplier, 2) box['price_table'] = new_price_table price_handler.price_table = new_price_table box['price_ex_BTW'] = price_handler.get_base_price_from_price_table() # add item fields manually box["company"] = "Rajapack" box["in_stock"] = None box['indices_dict'] = table_handler.indices_dict