def find_biggest_image(self, response, meta=None):
    """
    Finds all images on the page and returns the biggest one by download size
    (Content-Length header).
    NOTE: This method should ONLY be used as a last resort, when the source is
    not an image and there is no downloader for the domain, because it is slow
    and inaccurate.
    """
    sel = parsel.Selector(text=response.text)
    urls = sel.xpath("//*[self::div or self::span]/img/@src").extract()
    # also consider links that wrap an image and point to a file
    for image in sel.xpath("//*[self::div or self::span]/a[img]/@href").extract():
        if '.' in os.path.basename(image):
            urls.append(image)
    urls = [utils.fix_url_http(u, parent=response.url) for u in urls]
    images = []
    for i, url in enumerate(urls):
        logging.debug('downloading {}/{}: {}'.format(i, len(urls), url))
        try:
            images.append(requests.get(url))
        except Exception as e:
            logging.debug('Failed to download image: {}\n{}'.format(url, e))
    images = [r for r in images if 'image' in r.headers.get('Content-Type', '')]
    images = sorted(images,
                    key=lambda r: int(r.headers.get('Content-Length', 0)),
                    reverse=True)
    if not images:
        return {}
    return make_content(images[0], meta=meta)
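# Illustrative, standalone sketch (not part of the original class) of the
# selection step used by find_biggest_image above: keep only responses whose
# Content-Type looks like an image and pick the one with the largest
# Content-Length header. The function name is hypothetical.
def _pick_largest_by_content_length(responses):
    """Return the image-like response with the largest Content-Length, or None."""
    image_like = [r for r in responses
                  if 'image' in r.headers.get('Content-Type', '')]
    if not image_like:
        return None
    return max(image_like, key=lambda r: int(r.headers.get('Content-Length', 0)))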
def find_images(self, response, meta=None):
    """Finds the main post image on the page and returns its content."""
    sel = parsel.Selector(text=response.text)
    url = sel.xpath("//div[@class='post-image']/a/img/@src").extract_first()
    if url:
        url = utils.fix_url_http(url)
        file_resp = requests.get(url)
        return make_content(file_resp, meta)
    return {}
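# Hypothetical sketch of the contract assumed of utils.fix_url_http in the
# methods above: the real helper is not shown in this fragment, so this only
# illustrates one plausible implementation that resolves relative and
# scheme-relative URLs into absolute http(s) URLs. The name and behaviour here
# are assumptions, not the project's actual code.
from urllib.parse import urljoin as _urljoin

def _fix_url_http_sketch(url, parent=None):
    if url.startswith('//'):
        return 'http:' + url
    if parent:
        return _urljoin(parent, url)
    return url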
def download(self, **kwargs):
    """
    :param position: 1-based position of the image; 0 (the default) picks a random one
    :param category: archive category, see get_categories for the list
    :return: dict{'content': <image_content>, <some meta data>...}
    """
    category = kwargs.get('category') or self.default_cat
    category = category.lower()
    position = kwargs.get('position', 0)
    rand = position == 0
    if position > 0:
        position -= 1  # positions are 1-based; 0 is reserved for random

    url = self.url_tpl(category=category)
    response = requests.get(url)
    sel = Selector(text=response.text)
    # total number of items in the category and items listed on the first page
    total_items = int(sel.xpath("//p[@class='count']").re(r'\d+')[0])
    items = sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
    items_per_page = len(items)
    # find the right image by position
    if rand:
        position = random.randrange(0, total_items)
    if position < items_per_page:
        image = items[position]
    else:
        # map the 0-based position to a page number and an index on that page
        page = position // items_per_page + 1
        position %= items_per_page
        url = "{}?page={}".format(url, page)
        response = requests.get(url)
        pos_sel = Selector(text=response.text)
        items = pos_sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
        image = items[position]
    # retrieve the image page and extract the image URL and its metadata
    response = requests.get(urljoin(url, image))
    sel = Selector(text=response.text)
    image_url = sel.xpath("//div[@class='primary_photo']/a/img/@src").extract_first()
    image_url = utils.fix_url_http(image_url)
    meta = {
        'url': image_url,
        'title': sel.xpath("//div[@class='primary_photo']/a/img/@alt").extract_first(),
        'desc_title': sel.xpath("//div[@id='caption']/h2/text()").extract_first(),
        'desc': sel.xpath("//div[@id='caption']/p[not(@class)]/text()").extract_first(),
        'author': sel.xpath("//div[@id='caption']/p[@class='credit']/a/text()").extract_first(),
        'publication_date': sel.xpath(
            "//div[@id='caption']/p[@class='publication_time']/text()").extract_first(),
    }
    image = Image(image_url, meta)
    return self.process_url(image, kwargs)
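# Standalone sketch of the position-to-page arithmetic used in download()
# above, assuming 0-based item positions and 1-based page numbers with a fixed
# number of items per page. The function name is illustrative only.
def _position_to_page(position, items_per_page):
    """Map a 0-based item position to (page_number, index_within_page)."""
    return position // items_per_page + 1, position % items_per_page

# For example, with 20 items per page: position 0 -> (1, 0), position 19 -> (1, 19),
# position 20 -> (2, 0), position 45 -> (3, 5).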
def url(self, value):
    self._url = utils.fix_url_http(value)
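# Minimal sketch of how a setter like url() above is typically wired up with
# @property. The class name is hypothetical and the getter is assumed; it
# relies on the same utils.fix_url_http helper used throughout this module.
class _UrlHolderSketch:
    def __init__(self, url):
        self.url = url

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, value):
        self._url = utils.fix_url_http(value)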