def __init__(self, driver=None, base_url=None):
    """
    Store the driver and the URL base on the instance.

    When `driver` is ``None`` a default driver is created by importing
    ``dryscrape.driver.webkit.Driver`` and instantiating it without
    arguments. An ``ImportError`` raised while doing so is reported to the
    caller as a ``ValueError``.
    """
    chosen_driver = driver
    if chosen_driver is None:
        # Import lazily so the webkit driver is only required when the
        # caller did not supply a driver of their own.
        try:
            from dryscrape.driver.webkit import Driver as DefaultDriver
            chosen_driver = DefaultDriver()
        except ImportError:
            raise ValueError('No driver instance can be created.')

    self.driver = chosen_driver
    self.base_url = base_url
def _extract_image_urls(soup):
    """Collect image URLs from the product thumbnail strip of a parsed page."""
    image_urls = []
    for thumb in soup.findAll(
            "div", {"class": "product__image-thumbnails--item"}):
        img_tags = thumb.findAll("img")
        if not img_tags:
            continue
        src = img_tags[0].get("src")
        if not src:
            # An <img> without a src would make .replace() below fail.
            continue
        # Only the first "thumbnail" occurrence is swapped for "full";
        # presumably this selects the site's full-size variant of the image.
        image_urls.append(src.replace("thumbnail", "full", 1))
    return image_urls


def _extract_category_labels(soup):
    """Collect breadcrumb labels (one per breadcrumb block) from a parsed page."""
    labels = []
    for block in soup.findAll("div", {"class": "breadcrumb__block"}):
        links = block.findAll("a")
        spans = links[0].findAll("span") if links else []
        if not spans:
            # Breadcrumb block without the expected <a><span> structure.
            continue
        label = spans[0].text
        if not isinstance(label, str):
            # Python 2: `text` is `unicode`; keep the UTF-8 byte-string form.
            # On Python 3 `text` is already `str`, and encoding it would make
            # the "->".join() in the caller raise TypeError.
            label = label.encode("utf-8")
        labels.append(label)
    return labels


def get_prdouct_category_and_image(product_urls,
                                   output_name="image_data_set700_800"):
    """
    Crawl each product page and record its image URLs and category path.

    For every URL a dryscrape session is opened, the rendered page body is
    parsed with BeautifulSoup, and one record is built with two keys:
    ``"image_urls"`` (comma-joined image URLs) and ``"cat_label"``
    (breadcrumb labels joined with ``"->"``). Each record is passed to
    ``write_to_csv`` as soon as it is built and also collected in the
    returned list.

    :param product_urls: iterable of product page URLs to visit.
    :param output_name: name handed to ``write_to_csv`` for every record;
        defaults to the previously hard-coded ``"image_data_set700_800"``.
    :return: list of the record dicts, in the order the URLs were visited.
    """
    img_cat_data = []
    for count, url in enumerate(product_urls):
        print("crawling in progress -> {} ".format(count))
        print(url)

        session = dryscrape.Session(driver=Driver())
        # The timeout has to be configured before the page is requested;
        # setting it after visit()/body() cannot bound the page load.
        session.set_timeout(30)
        session.visit(url)
        response = session.body()
        session.reset()

        soup = BeautifulSoup(response, 'lxml')
        print("data fetched")

        image_category = {
            "image_urls": ",".join(_extract_image_urls(soup)),
            "cat_label": "->".join(_extract_category_labels(soup)),
        }
        img_cat_data.append(image_category)
        write_to_csv(output_name, image_category)

    return img_cat_data
def __init__(self, driver=None, base_url=None):
    """
    Initialise the instance with a driver and an optional URL base.

    A caller-supplied `driver` is used as-is. Otherwise
    ``dryscrape.driver.webkit.Driver`` is imported and instantiated; if
    that raises ``ImportError``, a ``ValueError`` is raised instead.
    """
    resolved = driver
    if resolved is None:
        try:
            # Deferred import: only needed when no driver was passed in.
            from dryscrape.driver.webkit import Driver as DefaultDriver
            resolved = DefaultDriver()
        except ImportError:
            raise ValueError('No driver instance can be created.')

    self.driver = resolved
    self.base_url = base_url
class Session(object):
    """
    A web scraping session based on a driver instance. Implements the proxy
    pattern to pass unresolved attribute lookups to the underlying driver.

    If no `driver` is specified, the instance imports and instantiates
    ``dryscrape.driver.webkit.Driver`` as its driver. A ``ValueError`` is
    raised when that import fails.

    If `base_url` is set, URLs passed to `visit` are joined onto it with
    ``urlparse.urljoin``. If it is not set, URLs are handed to the driver
    unchanged.
    """

    def __init__(self, driver=None, base_url=None):
        if driver is not None:
            self.driver = driver
        else:
            try:
                # Imported lazily so the webkit driver is only required
                # when the caller does not bring a driver of their own.
                from dryscrape.driver.webkit import Driver as DefaultDriver
                self.driver = DefaultDriver()
            except ImportError:
                raise ValueError('No driver instance can be created.')

        self.base_url = base_url

    # implement proxy pattern
    def __getattr__(self, attr):
        """ Pass unresolved attribute lookups to the underlying driver. """
        # __getattr__ only runs after normal lookup has failed. If `driver`
        # itself is missing (instance built via __new__, copy.copy, or
        # unpickling before the state is restored), reading self.driver
        # below would re-enter this method until RecursionError.
        if attr == 'driver':
            raise AttributeError(attr)
        return getattr(self.driver, attr)

    def visit(self, url):
        """ Passes through the URL to the driver after completing it using the
        instance's URL base. Returns whatever the driver's `visit` returns. """
        return self.driver.visit(self.complete_url(url))

    def complete_url(self, url):
        """ Completes a given URL with this instance's URL base. Returns `url`
        unchanged when no (truthy) URL base is set. """
        if self.base_url:
            return urlparse.urljoin(self.base_url, url)
        else:
            return url

    def interact(self, **local):
        """ Drops the user into an interactive Python session with the ``sess``
        variable set to the current session instance. If keyword arguments are
        supplied, these names will also be available within the session. """
        import code
        code.interact(local=dict(sess=self, **local))
class Session(object):
    """
    A web scraping session wrapping a driver instance. Attribute lookups
    that the session itself cannot resolve are forwarded to the driver
    (proxy pattern).

    When `driver` is omitted, ``dryscrape.driver.webkit.Driver`` is imported
    and instantiated to serve as the driver; an ``ImportError`` during that
    step is re-raised as ``ValueError``.

    When `base_url` is given, URLs passed to `visit` are resolved against it
    via ``urlparse.urljoin``. Without it, URLs reach the driver as given.
    """

    def __init__(self, driver=None, base_url=None):
        if driver is not None:
            self.driver = driver
        else:
            try:
                # Deferred import: the webkit driver is only needed as the
                # fallback when no driver was supplied.
                from dryscrape.driver.webkit import Driver as DefaultDriver
                self.driver = DefaultDriver()
            except ImportError:
                raise ValueError('No driver instance can be created.')

        self.base_url = base_url

    # implement proxy pattern
    def __getattr__(self, attr):
        """ Forward attribute lookups that failed on the session to the
        underlying driver. """
        # Guard against unbounded recursion: when `driver` has not been set
        # yet (e.g. object created with __new__, or being copied/unpickled),
        # evaluating self.driver would call __getattr__('driver') again.
        if attr == 'driver':
            raise AttributeError(attr)
        return getattr(self.driver, attr)

    def visit(self, url):
        """ Complete `url` with the instance's URL base and pass it to the
        driver's `visit`, returning the driver's result. """
        return self.driver.visit(self.complete_url(url))

    def complete_url(self, url):
        """ Join `url` onto this instance's URL base. If no (truthy) URL base
        is set, `url` is returned unchanged. """
        if self.base_url:
            return urlparse.urljoin(self.base_url, url)
        else:
            return url

    def interact(self, **local):
        """ Start an interactive Python console in which ``sess`` refers to
        this session. Any keyword arguments are made available in the console
        under their own names as well. """
        import code
        code.interact(local=dict(sess=self, **local))