def download(self, url):
    '''
    Fetch a page and store its body on disk for later use.

    The file name is the url itself, encoded with urllib.parse.quote_plus.
    The fetched body is decoded as UTF-8 and written in text mode ('w').

    Keyword arguments:
    url -- address of the page to fetch
    '''
    # Only the body is needed; the response object is discarded.
    _, body = fetch(url)
    file_name = urllib.parse.quote_plus(url)
    text = body.decode('utf-8')
    write_to_file(file_name, 'w', text)
def get_content_as_json(url):
    '''
    Fetch a url and return its body parsed as JSON.

    Keyword arguments:
    url -- address to fetch

    Returns the object produced by json.loads on the UTF-8 decoded body.

    Raises AssertionError when the response status is not 200.
    '''
    resp, content = fetch(url)
    # Raised explicitly (rather than via an `assert` statement) so the check
    # still runs under `python -O`; the exception type is kept as
    # AssertionError for callers that already catch it.
    if resp.status != 200:
        raise AssertionError(
            "unexpected HTTP status {0} for {1}".format(resp.status, url))
    # The body is decoded as UTF-8 before parsing.
    return json.loads(content.decode("utf-8"))
def download_image(img_url, dir_name):
    '''
    Fetch an image and save it inside dir_name.

    The request counts as failed when the response's 'content-location'
    differs from img_url (i.e. the request was redirected). In that case a
    note is written to 'image_not_found.txt' in dir_name instead of the image.

    Keyword arguments:
    img_url  -- address of the image
    dir_name -- directory the image (or the failure note) is written to

    Returns True if the image was saved, False otherwise.
    '''
    headers, payload = fetch(img_url, 'GET')

    if headers['content-location'] == img_url:
        # Save the raw bytes under the last path component of the url.
        target = os.path.join(dir_name, os.path.basename(img_url))
        write_to_file(target, 'wb', payload)
        return True

    # Leave a marker file describing the failure.
    marker = os.path.join(dir_name, 'image_not_found.txt')
    note = "image at {0} was not found".format(img_url)
    write_to_file(marker, 'w', note)
    return False
def download_image(img_url, dir_name):
    """
    Download an image into dir_name; a redirected request counts as failure.

    A request is considered redirected when the response's
    "content-location" is not equal to img_url. On failure a text note is
    written to "image_not_found.txt" in dir_name; on success the image bytes
    are written under the basename of img_url.

    Returns True when the image was written, False when the note was.
    """
    meta, data = fetch(img_url, "GET")
    redirected = meta["content-location"] != img_url

    if redirected:
        placeholder = os.path.join(dir_name, "image_not_found.txt")
        write_to_file(placeholder, "w", "image at {0} was not found".format(img_url))
    else:
        file_name = os.path.basename(img_url)
        write_to_file(os.path.join(dir_name, file_name), "wb", data)

    return not redirected
def grab_real_url(self, search_key):
    '''
    Search the website for an item and return the item's "real" url.

    The search address is built by formatting self._search_url with
    search_key. The fetched page is parsed with `bs` and the 'href' of the
    element returned by find('a', 'overhand') is used as the result.
    NOTE(review): `bs` is presumably BeautifulSoup, in which case 'overhand'
    selects by CSS class -- confirm against the file's imports.

    Keyword arguments:
    search_key -- key that uniquely identifies the item (e.g. image url)
    '''
    query_url = self._search_url.format(search_key)
    _, page = fetch(query_url)
    anchor = bs(page).find('a', 'overhand')
    return anchor['href']
def get_item_info(self, url, image_url=None): ''' Given a item url and image url, returns a json representation of the item's information. image url is usefl in case actual url is not - it can be used for finding the item's "real" url ''' path = re.split(self._URL_REGEX, url)[2] if (path.strip() == ''): if (image_url == None): return None # the url posted is just the homepage url = self.grab_real_url(image_url) # get content for item and parse it resp, content = fetch(url) item = self.scrape(content) item.url = url return item
def get_item_info(self, url, image_url=None): ''' Given a item url and image url, returns a json representation of the item's information. image url is usefl in case actual url is not - it can be used for finding the item's "real" url ''' path = re.split(self._URL_REGEX, url)[2] if (path.strip() == ''): if (image_url==None): return None # the url posted is just the homepage url = self.grab_real_url(image_url) # get content for item and parse it resp, content = fetch(url) item = self.scrape(content) item.url = url return item