def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        # The first gallery thumbnail links to the full-size image.
        web_image_url = self.driver.find_elements_by_css_selector(
            'div.col-sm-8.pull-left ul.thumbnails li a.thumbnail')[0].get_attribute('href')
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        product_details['product_price'] = self.driver.find_elements_by_css_selector(
            'ul.list-unstyled li h2')[0].text.split(':')[1]
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            'div.col-sm-4.pull-right h1')[0].text
        product_details['taxonomy'] = taxonomy.replace("->", "/")
        product_details['source'] = 'Jubbas'
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
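# All nine parsers delegate persistence to write_into_json, which is not
# shown in this snippet, and assume module-level `import os` and
# `import urllib.request` plus a Selenium 3 driver
# (find_elements_by_css_selector was removed in Selenium 4).
# A minimal sketch of what write_into_json might look like: the data.json
# filename and the one-JSON-record-per-line format are assumptions, not
# confirmed by the source.
import json
import os


def write_into_json(json_path, record):
    """Append one product record as a JSON line under json_path (assumed)."""
    os.makedirs(json_path, exist_ok=True)
    out_file = os.path.join(json_path, 'data.json')
    with open(out_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')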
def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        # Flipkart uses hashed, build-specific class names; the commented
        # selector below is an older variant kept for reference.
        web_image_url = self.driver.find_elements_by_css_selector(
            'img._1Nyybr.Yun65Y._30XEf0')[0]
        # web_image_url = self.driver.find_elements_by_css_selector('img._3togXc._3wp706')[0]
        web_image_url = web_image_url.get_attribute("src")
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        product_details['product_price'] = self.driver.find_elements_by_css_selector(
            'div._1uv9Cb div._1vC4OE._3qQ9m1')[0].text
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            'span._35KyD6')[0].text
        product_details['taxonomy'] = taxonomy.replace('->', '/')
        product_details['source'] = 'Flipkart'
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
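# Flipkart's hashed class names (e.g. _1Nyybr, _3togXc) rotate between site
# deployments, which is why an older selector is kept commented out above.
# A hypothetical fallback helper, not part of the original crawler, that
# tries selectors in order so stale ones degrade gracefully:
def first_element(driver, *selectors):
    """Return the first element matched by any of the given CSS selectors."""
    for selector in selectors:
        elements = driver.find_elements_by_css_selector(selector)
        if elements:
            return elements[0]
    raise LookupError('No selector matched: ' + ', '.join(selectors))

# Usage (selectors taken from the current and commented-out variants above):
# image = first_element(self.driver, 'img._1Nyybr.Yun65Y._30XEf0',
#                       'img._3togXc._3wp706')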
def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        web_image_url = self.driver.find_elements_by_css_selector(
            'div.product-image-gallery img#image-main')[0].get_attribute('src')
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        product_details['product_price'] = self.driver.find_elements_by_css_selector(
            'div.price-info div.price-box p.special-price span.price')[0].text
        product_details['source'] = 'Parivar'
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            'div.product-name span.h1')[0].text
        product_details['taxonomy'] = taxonomy.replace("->", "/")
        product_details['product_info'] = self.driver.find_elements_by_css_selector(
            'div.tabs-data dl#description')[0].text
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        web_image_url = self.driver.find_elements_by_css_selector(
            'div.iwsprddetails_mainimage img#ContentPlaceHolder1_ImgPrdt')[0].get_attribute('src')
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        # The site shows a bare amount, so the currency prefix is added here.
        product_details['product_price'] = 'Rs.' + self.driver.find_elements_by_css_selector(
            'span.detail-price')[0].text
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            'h1#ContentPlaceHolder1_ltrProductName')[0].text
        product_details['taxonomy'] = taxonomy
        product_details['source'] = 'Indian Wedding'
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
def parse_product(self, each_url, taxonomy):
    try:
        dict_of_items = {}
        self.driver.get(each_url)
        # The main image URL is embedded in an inline background-image
        # style, so it is cut out of the style string.
        web_image_url = self.driver.find_elements_by_css_selector(
            ".image-grid-imageContainer div.image-grid-image")[0]
        web_image_url = web_image_url.get_attribute("style")
        web_image_url = web_image_url.split('url("')[1]
        web_image_url = web_image_url.split('"')[0]
        dict_of_items['product_image_url'] = web_image_url
        file_path = taxonomy.replace("->", "/") + "/" + self.source_urls_col + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('-')[1]
        dict_of_items['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        dict_of_items['product_page_url'] = each_url
        dict_of_items['product_price'] = self.driver.find_elements_by_css_selector(
            ".pdp-discount-container strong.pdp-price")[0].text
        # Brand (pdp-title) and product name (pdp-name) are concatenated.
        dict_of_items['product_title'] = self.driver.find_elements_by_css_selector(
            ".pdp-price-info h1.pdp-title")[0].text + self.driver.find_elements_by_css_selector(
            ".pdp-price-info h1.pdp-name")[0].text
        dict_of_items['taxonomy'] = taxonomy
        dict_of_items['product_description'] = self.driver.find_elements_by_css_selector(
            ".pdp-product-description-content")[0].text
        json_path = taxonomy.replace("->", "/") + "/" + self.source_urls_col + '/'
        write_into_json(json_path, dict_of_items)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
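# The parser above cuts the image URL out of an inline background-image
# style with two string splits, which fails if the site ever emits
# url(...) unquoted or single-quoted. A hypothetical, more tolerant
# extractor (not part of the original crawler):
import re

_BG_URL = re.compile(r'url\((["\']?)(.*?)\1\)')


def extract_background_url(style):
    """Pull the URL out of a CSS background-image declaration, if any."""
    match = _BG_URL.search(style or '')
    return match.group(2) if match else None

# extract_background_url('background-image: url("https://example.com/a-1.jpg");')
# -> 'https://example.com/a-1.jpg'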
def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        web_image_url = self.driver.find_elements_by_css_selector(
            'ul.a-unordered-list.a-nostyle.a-horizontal.list.maintain-height img#landingImage')[0]
        web_image_url = web_image_url.get_attribute("src")
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        # file_path = taxonomy.replace("->", "/") + "/" + self.source_urls_col + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        # Price ranges ("x - y") are truncated to the lower bound.
        product_details['product_price'] = "Rs." + self.driver.find_elements_by_css_selector(
            'div#price td.a-span12 span#priceblock_ourprice')[0].text.split("-")[0]
        product_details['source'] = 'Amazon'
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            'div#titleBlock span#productTitle')[0].text
        product_details['taxonomy'] = taxonomy
        # Collect the feature bullets into one dot-separated string.
        temp_product_info = self.driver.find_elements_by_css_selector(
            'div#feature-bullets ul.a-unordered-list.a-vertical.a-spacing-none li')
        product_details['product_info'] = '.'.join(info.text for info in temp_product_info)
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        web_image_url = self.driver.find_elements_by_css_selector(
            ".product-essential div.product-image-gallery img#image-0")[0]
        web_image_url = web_image_url.get_attribute("src")
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        # Image download is disabled here; file_path records where the
        # image would be stored.
        # urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        product_details['product_price'] = self.driver.find_elements_by_css_selector(
            ".price-info div.price-box p.special-price span.price")[0].text
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            ".product-shop div.product-name span.h1")[0].text
        product_details['taxonomy'] = taxonomy
        product_details['source'] = 'My Batua'
        temp_product_info = self.driver.find_elements_by_css_selector(".std ul li")
        product_details['product_info'] = '.'.join(info.text for info in temp_product_info)
        # Use the same sanitized taxonomy as the images directory so the
        # JSON and the images land under one folder.
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/"
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        web_image_url = self.driver.find_elements_by_css_selector(
            'div#th1big img')[0].get_attribute('src')
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        product_details['product_price'] = 'Rs.' + self.driver.find_elements_by_css_selector(
            'span#dPrice')[0].text
        product_details['source'] = 'Jaypore'
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            'h1.productName')[0].text
        product_details['taxonomy'] = taxonomy.replace("->", "/")
        product_details['product_info'] = self.driver.find_elements_by_css_selector(
            'span#prodDesc')[0].text
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
def parse_product(self, each_url, taxonomy):
    try:
        product_details = {}
        self.driver.get(each_url)
        # Take the first image from the product carousel.
        web_image_url = self.driver.find_elements_by_css_selector(
            'div#rondellCarousel img')[0].get_attribute('src')
        product_details['product_image_url'] = web_image_url
        temp_taxonomy = taxonomy.replace(" ", "_")
        file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        image_name = file_path + web_image_url.split('/')[-1]
        product_details['file_path'] = image_name
        urllib.request.urlretrieve(web_image_url, image_name)
        product_details['product_page_url'] = each_url
        product_details['product_price'] = self.driver.find_elements_by_css_selector(
            'span.price.on-sale')[0].text
        product_details['product_title'] = self.driver.find_elements_by_css_selector(
            'header.product-title span.lang1')[0].text
        product_details['taxonomy'] = taxonomy
        product_details['source'] = 'East Essence'
        json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
        write_into_json(json_path, product_details)
        print("Written into json", json_path)
    except Exception as e:
        print("Unable to crawl for " + each_url)
        print("Reason: " + str(e))
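# A hypothetical driving loop, since the crawler classes and their setup
# are not shown in this snippet; the class name, driver wiring, and
# taxonomy string below are illustrative assumptions.
from selenium import webdriver

if __name__ == '__main__':
    driver = webdriver.Chrome()  # Selenium 3-style driver
    crawler = EastEssenceCrawler()  # assumed wrapper class for parse_product
    crawler.driver = driver
    for url in ['https://www.eastessence.com/some-product.html']:
        crawler.parse_product(url, 'Women->Abayas')
    driver.quit()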