Example No. 1
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)
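            # The first gallery thumbnail's href points at the full-size product image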
            web_image_url = self.driver.find_elements_by_css_selector('div.col-sm-8.pull-left ul.thumbnails li a.thumbnail')[0].get_attribute('href')

            product_details['product_image_url'] = web_image_url

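            # Map the "A->B" taxonomy string to an atlas_dataset/A-B/images/ directory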
            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"

            if not os.path.exists(file_path):
                os.makedirs(file_path)
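            # Name the local file after the last URL segment, then download the image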
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

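            # Scrape the remaining product fields and persist everything as JSON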
            product_details['product_page_url'] = each_url
            product_details['product_price'] = self.driver.find_elements_by_css_selector('ul.list-unstyled li h2')[0].text.split(':')[1]

            product_details['product_title'] = self.driver.find_elements_by_css_selector('div.col-sm-4.pull-right h1')[0].text
            product_details['taxonomy'] = taxonomy.replace("->", "/")
            product_details['source'] = 'Jubbas'
            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
            write_into_json(json_path, product_details)
            print("Written into json", json_path)

        except Exception as e:
            print("Unable to crawl for "+each_url)
            print("Reason: "+str(e))
Example No. 2
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)
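            # Flipkart's hashed class names are build artifacts and change between
            # deployments; if this selector stops matching, try the alternative below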
            web_image_url = self.driver.find_elements_by_css_selector('img._1Nyybr.Yun65Y._30XEf0')[0]
            # web_image_url = self.driver.find_elements_by_css_selector('img._3togXc._3wp706')[0]
            web_image_url = web_image_url.get_attribute("src")
            product_details['product_image_url'] = web_image_url

            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"

            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

            product_details['product_page_url'] = each_url
            product_details['product_price'] = self.driver.find_elements_by_css_selector('div._1uv9Cb div._1vC4OE._3qQ9m1')[0].text

            product_details['product_title'] = self.driver.find_elements_by_css_selector('span._35KyD6')[0].text
            product_details['taxonomy'] = taxonomy.replace('->', '/')
            product_details['source'] = 'Flipkart'

            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
            write_into_json(json_path, product_details)
            print("Written into json", json_path)

        except Exception as e:
            print("Unable to crawl for "+each_url)
            print("Reason: "+str(e))
Example No. 3
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)
            web_image_url = self.driver.find_elements_by_css_selector('div.product-image-gallery img#image-main')[0].get_attribute('src')

            product_details['product_image_url'] = web_image_url

            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"

            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

            product_details['product_page_url'] = each_url
            product_details['product_price'] = self.driver.find_elements_by_css_selector('div.price-info div.price-box p.special-price span.price')[0].text
            product_details['source'] = 'Parivar'
            product_details['product_title'] = self.driver.find_elements_by_css_selector('div.product-name span.h1')[0].text
            product_details['taxonomy'] = taxonomy.replace("->", "/")
            product_details['product_info'] = self.driver.find_elements_by_css_selector('div.tabs-data dl#description')[0].text
            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
            write_into_json(json_path, product_details)
            print("Written into json", json_path)

        except Exception as e:
            print("Unable to crawl for "+each_url)
            print("Reason: "+str(e))
Example No. 4
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)
            web_image_url = self.driver.find_elements_by_css_selector('div.iwsprddetails_mainimage img#ContentPlaceHolder1_ImgPrdt')[0].get_attribute('src')

            product_details['product_image_url'] = web_image_url

            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"

            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

            product_details['product_page_url'] = each_url
            product_details['product_price'] = 'Rs.' + self.driver.find_elements_by_css_selector('span.detail-price')[0].text

            product_details['product_title'] = self.driver.find_elements_by_css_selector('h1#ContentPlaceHolder1_ltrProductName')[0].text
            product_details['taxonomy'] = taxonomy
            product_details['source'] = 'Indian Wedding'
            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
            write_into_json(json_path, product_details)
            print("Written into json", json_path)

        except Exception as e:
            print("Unable to crawl for "+each_url)
            print("Reason: "+str(e))
Example No. 5
    def parse_product(self, each_url, taxonomy):
        try:

            dict_of_items = {}
            self.driver.get(each_url)

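            # The product image is rendered as a CSS background, so the URL has to
            # be cut out of the inline style: background-image: url("...")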
            web_image_url = self.driver.find_elements_by_css_selector(
                ".image-grid-imageContainer div.image-grid-image")[0]
            web_image_url = web_image_url.get_attribute("style")
            web_image_url = web_image_url.split('url("')[1]
            web_image_url = web_image_url.split('"')[0]
            dict_of_items['product_image_url'] = web_image_url

            file_path = taxonomy.replace(
                "->", "/") + "/" + self.source_urls_col + "/images/"
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('-')[1]
            dict_of_items['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

            dict_of_items['product_page_url'] = each_url
            dict_of_items[
                'product_price'] = self.driver.find_elements_by_css_selector(
                    ".pdp-discount-container strong.pdp-price")[0].text
            dict_of_items[
                'product_title'] = self.driver.find_elements_by_css_selector(
                    ".pdp-price-info h1.pdp-title"
                )[0].text + self.driver.find_elements_by_css_selector(
                    ".pdp-price-info h1.pdp-name")[0].text
            dict_of_items['taxonomy'] = taxonomy
            dict_of_items[
                'product_description'] = self.driver.find_elements_by_css_selector(
                    ".pdp-product-description-content")[0].text
            json_path = taxonomy.replace(
                "->", "/") + "/" + self.source_urls_col + '/'
            write_into_json(json_path, dict_of_items)

        except Exception as e:
            print("Unable to crawl for " + each_url)
            print("Reason: " + str(e))
Example No. 6
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)

            web_image_url = self.driver.find_elements_by_css_selector('ul.a-unordered-list.a-nostyle.a-horizontal.list.maintain-height img#landingImage')[0]
            web_image_url = web_image_url.get_attribute("src")
            product_details['product_image_url'] = web_image_url

            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"

            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

            product_details['product_page_url'] = each_url
            product_details['product_price'] = "Rs." + self.driver.find_elements_by_css_selector('div#price td.a-span12 span#priceblock_ourprice ')[0].text.split("-")[0]
            product_details['source'] = 'Amazon'
            product_details['product_title'] = self.driver.find_elements_by_css_selector('div#titleBlock span#productTitle')[0].text
            product_details['taxonomy'] = taxonomy
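            # Collect Amazon's feature bullets and join them into a single string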
            temp_product_info = self.driver.find_elements_by_css_selector('div#feature-bullets ul.a-unordered-list.a-vertical.a-spacing-none li')

            temp = []
            for info in temp_product_info:
                temp.append(info.text)

            product_details['product_info'] = '.'.join(temp)
            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
            write_into_json(json_path, product_details)
            print("Written into json", json_path)

        except Exception as e:
            print("Unable to crawl for "+each_url)
            print("Reason: "+str(e))
Example No. 7
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)

            web_image_url = self.driver.find_elements_by_css_selector(".product-essential div.product-image-gallery img#image-0")[0]
            web_image_url = web_image_url.get_attribute("src")
            product_details['product_image_url'] = web_image_url

            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/images/"

            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
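            # NOTE: the image download is commented out in this example; only the
            # intended local path is recorded in product_details['file_path']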
            # urllib.request.urlretrieve(web_image_url, image_name)

            product_details['product_page_url'] = each_url
            product_details['product_price'] = self.driver.find_elements_by_css_selector(".price-info div.price-box p.special-price span.price")[0].text
            product_details['product_title'] = self.driver.find_elements_by_css_selector(".product-shop div.product-name span.h1")[0].text
            product_details['taxonomy'] = taxonomy
            product_details['source'] = 'My Batua'
            temp_product_info = self.driver.find_elements_by_css_selector(".std ul li")

            temp = []
            for info in temp_product_info:
                temp.append(info.text)

            product_details['product_info'] = '.'.join(temp)
            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-") + "/"
            write_into_json(json_path, product_details)
            print("Written into json", json_path)

        except Exception as e:
            print("Unable to crawl for "+each_url)
            print("Reason: "+str(e))
Example No. 8
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)
            web_image_url = self.driver.find_elements_by_css_selector(
                'div#th1big img')[0].get_attribute('src')

            product_details['product_image_url'] = web_image_url

            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace(
                "->", "-") + "/images/"
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

            product_details['product_page_url'] = each_url
            product_details[
                'product_price'] = 'Rs.' + self.driver.find_elements_by_css_selector(
                    'span#dPrice')[0].text
            product_details['source'] = 'Jaypore'
            product_details[
                'product_title'] = self.driver.find_elements_by_css_selector(
                    'h1.productName')[0].text
            product_details['taxonomy'] = taxonomy.replace("->", "/")
            product_details[
                'product_info'] = self.driver.find_elements_by_css_selector(
                    'span#prodDesc')[0].text
            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
            write_into_json(json_path, product_details)
            print("Written into json", json_path)

        except Exception as e:
            print("Unable to crawl for " + each_url)
            print("Reason: " + str(e))
Example No. 9
    def parse_product(self, each_url, taxonomy):
        try:

            product_details = {}
            self.driver.get(each_url)
            web_image_url = self.driver.find_elements_by_css_selector(
                'div#rondellCarousel img')[0].get_attribute('src')

            product_details['product_image_url'] = web_image_url

            temp_taxonomy = taxonomy.replace(" ", "_")
            file_path = 'atlas_dataset/' + temp_taxonomy.replace(
                "->", "-") + "/images/"

            if not os.path.exists(file_path):
                os.makedirs(file_path)
            image_name = file_path + web_image_url.split('/')[-1]
            product_details['file_path'] = image_name
            urllib.request.urlretrieve(web_image_url, image_name)

            product_details['product_page_url'] = each_url
            product_details[
                'product_price'] = self.driver.find_elements_by_css_selector(
                    'span.price.on-sale')[0].text

            product_details[
                'product_title'] = self.driver.find_elements_by_css_selector(
                    'header.product-title span.lang1')[0].text
            product_details['taxonomy'] = taxonomy
            product_details['source'] = 'East Essence'
            json_path = 'atlas_dataset/' + temp_taxonomy.replace("->", "-")
            write_into_json(json_path, product_details)
            print("Wtitten into json", json_path)

        except Exception as e:
            print("Unable to crawl for " + each_url)
            print("Reason: " + str(e))