Example #1
    def find_biggest_image(self, response, meta=None):
        """
        Finds all images on the website and returns the biggest one by resolution
        NOTE:
        This method should ONLY be used as last resort when source
        is not an image and there's no downloader for the domain
        because it's slow and inaccurate
        """
        sel = parsel.Selector(text=response.text)
        urls = sel.xpath("//*[self::div or self::span]/img/@src").extract()
        # Also consider links that wrap an image and point at a file-like path
        for image in sel.xpath("//*[self::div or self::span]/a[img]/@href").extract():
            if '.' in os.path.basename(image):
                urls.append(image)
        urls = [utils.fix_url_http(u, parent=response.url) for u in urls]

        images = []
        for i, url in enumerate(urls, start=1):
            self.logging.debug('downloading {}/{}: {}'.format(i, len(urls), url))
            try:
                images.append(requests.get(url))
            except Exception as e:
                self.logging.debug('Failed to download image: {}\n{}'.format(url, e))
        # Keep only real image responses; headers may be missing, so use .get()
        images = [i for i in images if 'image' in i.headers.get('Content-Type', '')]
        images = sorted(images, key=lambda v: int(v.headers.get('Content-Length', 0)), reverse=True)
        if not images:
            return {}
        return make_content(images[0], meta=meta)
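
The helper utils.fix_url_http is referenced throughout these examples but defined elsewhere in the project. A minimal sketch of what such a helper plausibly does, assuming it resolves relative URLs against an optional parent page and defaults scheme-less URLs to http (a guess, not the project's actual code):

    from urllib.parse import urljoin

    def fix_url_http(url, parent=None):
        # Hypothetical reimplementation -- the real helper may differ.
        if parent:
            url = urljoin(parent, url)  # resolve relative paths like "img/a.jpg"
        if url.startswith('//'):
            return 'http:' + url  # protocol-relative URL
        if not url.startswith(('http://', 'https://')):
            return 'http://' + url  # no scheme given, default to http
        return url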
Example #2
 def find_images(self, response, meta=None):
     """Extract the post image from the page and download it."""
     sel = parsel.Selector(text=response.text)
     url = sel.xpath("//div[@class='post-image']/a/img/@src").extract_first()
     if url:
         # Resolve relative src values against the page URL before downloading
         url = utils.fix_url_http(url, parent=response.url)
         file_resp = requests.get(url)
         return make_content(file_resp, meta)
     return {}
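
make_content is another project helper not shown on this page. Judging by the docstring of Example #3 below ("dict{'content': <image_content>, <some meta data>...}"), it packs the downloaded response and its metadata into one dict; a rough sketch under that assumption, treating meta as a plain dict:

    def make_content(file_resp, meta=None):
        # Hypothetical sketch: combine the raw image bytes with any metadata.
        content = {'content': file_resp.content}
        content.update(meta or {})
        return content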
Example #3
 def download(self, **kwargs):
     """
     :param position - position of image or defaults to random
     :param category - archive category, see get_categories for the list
     :return: dict{'content': <image_content>, <some meta data>...}
     """
     category = kwargs.get('category', None)
     position = kwargs.get('position', 0)
     rand = False
     if position == 0:
         rand = True
     else:
         position -= 1  # positions are 1-based (0 means random); convert to 0-based index
     if not category:
         category = self.default_cat
     category = category.lower()
     url = self.url_tpl(category=category)
     response = requests.get(url)
     sel = Selector(text=response.text)
     # total number of archive items, parsed from the "count" paragraph
     total_items = int(sel.xpath("//p[@class='count']").re(r'\d+')[0])
     items = sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
     items_per_page = len(items)
     # find the right image by position
     if rand:
         position = random.randrange(0, total_items)
     if position < items_per_page:
         image = items[position]
     else:
         # pages are 1-based; floor division handles exact page boundaries
         page = position // items_per_page + 1
         position %= items_per_page
         url = "{}?page={}".format(url, page)
         response = requests.get(url)
         pos_sel = Selector(text=response.text)
         items = pos_sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
         image = items[position]
     # retrieve image
     response = requests.get(urljoin(url, image))
     sel = Selector(text=response.text)
     image_url = sel.xpath("//div[@class='primary_photo']/a/img/@src").extract_first()
     image_url = utils.fix_url_http(image_url)
     meta = {
         'url': image_url,
         'title': sel.xpath("//div[@class='primary_photo']/a/img/@alt").extract_first(),
         'desc_title': sel.xpath("//div[@id='caption']/h2/text()").extract_first(),
         'desc': sel.xpath("//div[@id='caption']/p[not(@class)]/text()").extract_first(),
         'author': sel.xpath("//div[@id='caption']/p[@class='credit']/a/text()").extract_first(),
         'publication_date': sel.xpath("//div[@id='caption']/p[@class='publication_time']"
                                       "/text()").extract_first(),
     }
     image = Image(image_url, meta)
     return self.process_url(image, kwargs)
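
The position arithmetic above is easy to get wrong at page boundaries: a ceil-based formula maps 0-based position 24 (with 24 items per page) to page 1 with offset 24, which indexes past the end of the page. A standalone check of the floor-division mapping used here (numbers are illustrative):

    ITEMS_PER_PAGE = 24

    def locate(position):
        # 0-based position -> (1-based page, 0-based offset within that page)
        page = position // ITEMS_PER_PAGE + 1
        offset = position % ITEMS_PER_PAGE
        return page, offset

    assert locate(0) == (1, 0)    # first item of page 1
    assert locate(23) == (1, 23)  # last item of page 1
    assert locate(24) == (2, 0)   # exact boundary: first item of page 2
    assert locate(49) == (3, 1)   # second item of page 3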
Example #4
 def url(self, value):
     # Normalize the URL (e.g. ensure a scheme) before storing it
     self._url = utils.fix_url_http(value)
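
A setter like this only makes sense next to a matching getter. A minimal sketch of the property pair it presumably belongs to; the Image class name comes from Example #3, but the getter and constructor here are assumptions:

    class Image:
        def __init__(self, url, meta=None):
            self.url = url  # runs through the setter below
            self.meta = meta or {}

        @property
        def url(self):
            return self._url

        @url.setter
        def url(self, value):
            # Normalize the URL before storing it
            self._url = utils.fix_url_http(value)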