Ejemplo n.º 1
0
 def parse_item(self, response):
     """Save the full-size image linked from a gallery item page.

     The image is stored under ``self.PATH`` as ``<index>.<ext>``, where
     ``index`` is read from ``response.meta``. The download is skipped
     when ``exists(path)`` is already true.

     Raises:
         IndexError: the full-size link is not found on the page.
         ValueError: the image file name contains no dot.
     """
     sel = Selector(response)
     image_url = sel.xpath('//span[@class="full-size-link"]/a/@href').extract()[0]
     no = response.meta.get('index')
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # Split on the last dot only: a name such as "page.v2.png" would
     # otherwise yield three parts and break the two-name unpacking.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}'.format(no, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         download(image_url, path)
Ejemplo n.º 2
0
 def parse_item(self, response):
     """Download the image behind the page's "full-size" link.

     The target file is ``self.PATH/<index>.<ext>``; ``index`` is taken
     from ``response.meta`` and ``ext`` from the image URL. An existing
     target is left untouched.

     Raises:
         IndexError: no ``full-size-link`` anchor is present.
         ValueError: the image file name has no extension separator.
     """
     sel = Selector(response)
     image_url = sel.xpath(
         '//span[@class="full-size-link"]/a/@href').extract()[0]
     no = response.meta.get('index')
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # rsplit with maxsplit=1 keeps extra dots inside the base name
     # instead of failing the unpacking with ValueError.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}'.format(no, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         download(image_url, path)
Ejemplo n.º 3
0
    def parse_start_url(self, response):
        """Fetch the characters image and schedule every gallery item.

        The first link inside the entry content is saved as
        ``_characters.<ext>`` under ``self.PATH`` unless that path already
        exists. Each gallery link is then requested with ``parse_item``
        as its callback.

        Yields:
            Request: one per gallery link, with the link's 1-based
            position stored in ``meta['index']``.

        Raises:
            IndexError: the entry content holds no matching link.
            ValueError: the image file name contains no dot.
        """
        sel = Selector(response)
        image_url = sel.xpath('//div[@class="entry-content"]/p/a/@href').extract()[0]
        base_url = get_base_url(response)
        image_url = urljoin(base_url, image_url)
        # Only the last dot separates the extension; a plain split('.')
        # raised ValueError for file names containing several dots.
        _, ext = image_url.split('/')[-1].rsplit('.', 1)
        image_name = u'_characters.{}'.format(ext)
        path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
        if not exists(path):
            download(image_url, path)

        images = sel.xpath('//figure[@class="gallery-item"]//a/@href').extract()
        for index, link in enumerate(images, 1):
            yield Request(link, meta={'index': index}, callback=self.parse_item)
Ejemplo n.º 4
0
Archivo: cm.py Proyecto: ben-en/comics
    def parse_item(self, response):
        """Download the comic image of an article page.

        The file is named ``<date>.<title>.<ext>``: the date comes from the
        ``datePublished`` meta tag, the title from the ``<h1>`` heading
        (three lookups are tried in turn) and the extension from the image
        URL. Returns the result of ``download`` or ``None`` when the
        target path already exists.
        """
        sel = Selector(response)
        base_url = get_base_url(response)
        published = sel.xpath(
            '//meta[@itemprop="datePublished"]/@content').extract()[0]
        date = parser.parse(published).strftime(u'%Y.%m.%d')

        # Title lookup, most specific markup first.
        named = sel.xpath('//h1/div[@class="name"]/text()').extract()
        if named:
            image_title = named[0].strip()
        else:
            heading_text = sel.xpath('//h1/text()').extract()
            if heading_text:
                image_title = heading_text[0].replace(':', '').strip()
            else:
                # Last resort: concatenate the text of every <h1> child
                # that is not a link.
                fragments = [
                    node.xpath('string(.)').extract()[0]
                    for node in sel.xpath('//h1/*[not(self::a)]')
                ]
                image_title = u''.join(fragments).strip()

        # Prefer the second image of the article body; fall back to the
        # first one when there is no second image.
        sources = sel.xpath(
            '(//*[@itemprop="articleBody"]//img)[2]/@src').extract()
        if not sources:
            sources = sel.xpath(
                '//*[@itemprop="articleBody"]//img/@src').extract()
        image_url = urljoin(base_url, sources[0])

        file_name = image_url.rsplit('/', 1)[-1]
        _, ext = file_name.rsplit('.', 1)
        image_name = u'{}.{}.{}'.format(date, image_title, ext)
        path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
        if exists(path):
            return None
        return download(image_url, path)
Ejemplo n.º 5
0
Archivo: cm.py Proyecto: mktums/comics
    def parse_item(self, response):
        """Save an article's comic image as ``<date>.<title>.<ext>``.

        ``date`` is the formatted ``datePublished`` meta value, ``title``
        is taken from the page's ``<h1>`` and ``ext`` from the image URL.
        The download is skipped (``None`` is returned) when the target
        path already exists.
        """
        sel = Selector(response)
        base_url = get_base_url(response)
        timestamp = sel.xpath('//meta[@itemprop="datePublished"]/@content').extract()[0]
        date = parser.parse(timestamp).strftime(u'%Y.%m.%d')

        # Try each title source in order and stop at the first hit.
        image_title = None
        primary = sel.xpath('//h1/div[@class="name"]/text()').extract()
        if primary:
            image_title = primary[0].strip()
        if image_title is None:
            secondary = sel.xpath('//h1/text()').extract()
            if secondary:
                image_title = secondary[0].replace(':', '').strip()
        if image_title is None:
            everything = sel.xpath('//h1/*[not(self::a)]')
            texts = []
            for element in everything:
                texts.append(element.xpath('string(.)').extract()[0])
            image_title = u''.join(texts).strip()

        # The second article image is preferred over the first.
        candidates = sel.xpath('(//*[@itemprop="articleBody"]//img)[2]/@src').extract()
        if candidates:
            image_url = candidates[0]
        else:
            image_url = sel.xpath('//*[@itemprop="articleBody"]//img/@src').extract()[0]

        image_url = urljoin(base_url, image_url)
        last_segment = image_url.split('/')[-1]
        _, ext = last_segment.rsplit('.', 1)
        path = self.PATH + '/' + remove_disallowed_filename_chars(
            u'{}.{}.{}'.format(date, image_title, ext))
        if exists(path):
            return None
        return download(image_url, path)
Ejemplo n.º 6
0
    def parse_start_url(self, response):
        """Save the characters sheet and request every gallery page.

        The first entry-content link is downloaded to
        ``self.PATH/_characters.<ext>`` if that path does not exist yet.
        Afterwards one request per gallery link is issued; its 1-based
        position is passed on through ``meta['index']``.

        Yields:
            Request: handled by ``self.parse_item``.

        Raises:
            IndexError: no link was found in the entry content.
            ValueError: the image file name has no dot.
        """
        sel = Selector(response)
        image_url = sel.xpath(
            '//div[@class="entry-content"]/p/a/@href').extract()[0]
        base_url = get_base_url(response)
        image_url = urljoin(base_url, image_url)
        # Take the extension after the LAST dot so multi-dot file names
        # no longer fail the unpacking.
        _, ext = image_url.split('/')[-1].rsplit('.', 1)
        image_name = u'_characters.{}'.format(ext)
        path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
        if not exists(path):
            download(image_url, path)

        images = sel.xpath(
            '//figure[@class="gallery-item"]//a/@href').extract()
        for position, link in enumerate(images, 1):
            yield Request(
                link, meta={'index': position}, callback=self.parse_item)
Ejemplo n.º 7
0
 def parse_item(self, response):
     """Download the strip image found in the ``style5`` table body.

     The file is saved under ``self.PATH`` as ``<name>.<alt>.<ext>``:
     ``name`` and ``ext`` come from the image file name, ``alt`` from the
     image's ``alt`` attribute. Returns the result of ``download``, or
     ``None`` when the target path already exists.

     Raises:
         IndexError: the image or its ``src``/``alt`` attribute is absent.
         ValueError: the image file name contains no dot.
     """
     base_url = get_base_url(response)
     sel = Selector(response)
     img_data = sel.xpath('//tbody[@class="style5"]/tr[2]/td/img')
     image_url = urljoin(base_url, img_data.xpath('./@src').extract()[0])
     image_title = img_data.xpath('./@alt').extract()[0]
     # Only the last dot separates the extension; earlier dots stay in
     # ``name`` instead of breaking the unpacking.
     name, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(name, image_title, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 8
0
 def parse_item(self, response):
     """Download the ``cm_strip`` image of a comic page.

     The file is named ``<no>.<ext>``, where ``no`` is the last
     ``/``-separated segment of the page URL and ``ext`` is the image's
     extension. Returns the result of ``download``, or ``None`` when the
     target path already exists.
     """
     page = Selector(response)
     root_url = get_base_url(response)
     # The trailing segment of the page URL identifies the strip.
     strip_no = response.url.split('/')[-1]
     src = page.xpath('//img[@id="cm_strip"]/@src').extract()[0]
     image_url = urljoin(root_url, src)
     file_name = image_url.rsplit('/', 1)[-1]
     _, ext = file_name.rsplit('.', 1)
     target = self.PATH + '/' + remove_disallowed_filename_chars(
         u'{}.{}'.format(strip_no, ext))
     if exists(target):
         return None
     return download(image_url, target)
Ejemplo n.º 9
0
 def parse_item(self, response):
     """Fetch the image with id ``cm_strip`` and store it by page number.

     The page number is the final path segment of ``response.url``; the
     extension is whatever follows the last dot of the image file name.
     An already existing target path results in ``None`` and no download.
     """
     sel = Selector(response)
     base_url = get_base_url(response)
     url_segments = response.url.split('/')
     no = url_segments[-1]
     relative_src = sel.xpath('//img[@id="cm_strip"]/@src').extract()[0]
     image_url = urljoin(base_url, relative_src)
     image_file = image_url.split('/')[-1]
     _, ext = image_file.rsplit('.', 1)
     safe_name = remove_disallowed_filename_chars(u'{}.{}'.format(no, ext))
     path = self.PATH + '/' + safe_name
     already_saved = exists(path)
     if already_saved:
         return None
     return download(image_url, path)
Ejemplo n.º 10
0
 def parse_item(self, response):
     """Save the comic image from the second row of the ``style5`` table.

     The target is ``self.PATH/<name>.<alt>.<ext>``, combining the image
     file's base name, its ``alt`` text and its extension. An existing
     target is not downloaded again.

     Returns:
         The value returned by ``download``, or ``None`` if skipped.

     Raises:
         IndexError: the expected ``<img>`` or one of its attributes is
             missing.
         ValueError: the image file name has no dot at all.
     """
     base_url = get_base_url(response)
     sel = Selector(response)
     img_data = sel.xpath('//tbody[@class="style5"]/tr[2]/td/img')
     image_url = urljoin(base_url, img_data.xpath('./@src').extract()[0])
     image_title = img_data.xpath('./@alt').extract()[0]
     # rsplit(..., 1) tolerates dots inside the base name, which made
     # the former split('.') raise ValueError.
     name, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(name, image_title, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 11
0
 def parse_item(self, response):
     """Download the comic image and name it ``<date>.<title>.<ext>``.

     ``response.meta['date_text']`` is expected to look like
     ``"<date> - <title>"``; the date part is re-formatted as
     ``YYYY-MM-DD``. Returns the result of ``download``, or ``None`` when
     the target path already exists.

     Raises:
         IndexError: no image inside ``cc-comicbody``.
         ValueError: ``date_text`` has no ``" - "`` separator, or the
             image file name has no dot.
     """
     sel = Selector(response)
     image_url = sel.xpath('//div[@id="cc-comicbody"]//img/@src').extract()[0]
     # maxsplit=1: a title that itself contains " - " stays intact
     # instead of breaking the two-name unpacking.
     date, image_name = response.meta.get('date_text').split(' - ', 1)
     date = parser.parse(date).strftime(u'%Y-%m-%d')
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # Same idea for the extension: split on the last dot only.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(date, image_name, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 12
0
 def parse_item(self, response):
     """Download the ``strip`` image, named after the ``s`` query value.

     The comic id is the first ``s`` parameter of the page URL's query
     string; the file is stored as ``self.PATH/<id>.<ext>``. Returns the
     result of ``download``, or ``None`` when the path already exists.

     Raises:
         IndexError: no ``<img id="strip">`` with a ``src`` on the page.
         ValueError: the image file name contains no dot.
     """
     comic_id = parse_qs(urlparse(response.url).query).get('s')[0]
     sel = Selector(response)
     img_data = sel.xpath('//img[@id="strip"]')
     image_url = img_data.xpath('./@src').extract()[0]
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # The base name is not needed; split on the last dot so names with
     # several dots no longer raise ValueError.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}'.format(comic_id, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 13
0
 def parse_item(self, response):
     """Store the page's ``strip`` image as ``<comic_id>.<ext>``.

     ``comic_id`` is read from the ``s`` parameter of ``response.url``.
     The download only happens when the target path under ``self.PATH``
     does not exist yet.

     Returns:
         The value returned by ``download``, or ``None`` if skipped.

     Raises:
         IndexError: the strip image is missing from the page.
         ValueError: the image file name has no extension separator.
     """
     comic_id = parse_qs(urlparse(response.url).query).get('s')[0]
     sel = Selector(response)
     img_data = sel.xpath('//img[@id="strip"]')
     image_url = img_data.xpath('./@src').extract()[0]
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # Extension = text after the last dot; the unused base name is
     # discarded and may now contain dots itself.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}'.format(comic_id, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 14
0
 def parse_item(self, response):
     """Download the ``comicmid`` image as ``<date>.<title>.<ext>``.

     The title is the first ``<h1>`` text node and the date is parsed
     from the ``stand_high`` block, then formatted ``YYYY-MM-DD``.
     Returns the result of ``download``, or ``None`` when the target path
     already exists.

     Raises:
         IndexError: one of the three XPath lookups matched nothing.
         ValueError: the image file name contains no dot.
     """
     sel = Selector(response)
     orig_name = sel.xpath('//h1/text()').extract()[0]
     date_text = sel.xpath('//div[@class="stand_high"]/small/text()').extract()[0]
     date = parser.parse(date_text).strftime(u'%Y-%m-%d')
     image_url = sel.xpath('//div[@class="comicmid"]//img/@src').extract()[0]
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # Split on the last dot only so multi-dot file names are accepted.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(date, orig_name, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 15
0
 def parse_item(self, response):
     """Save the ``cc-comicbody`` image under a dated, titled file name.

     The request's ``meta['date_text']`` carries ``"<date> - <title>"``.
     The resulting file is ``self.PATH/<YYYY-MM-DD>.<title>.<ext>`` and
     is only downloaded when that path does not exist yet.

     Returns:
         The value returned by ``download``, or ``None`` if skipped.

     Raises:
         IndexError: the comic body contains no image.
         ValueError: ``date_text`` lacks the ``" - "`` separator, or the
             image file name lacks a dot.
     """
     sel = Selector(response)
     image_url = sel.xpath(
         '//div[@id="cc-comicbody"]//img/@src').extract()[0]
     # Split once: everything after the first " - " is the title, even
     # if the title contains the separator again.
     date, image_name = response.meta.get('date_text').split(' - ', 1)
     date = parser.parse(date).strftime(u'%Y-%m-%d')
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # The extension follows the last dot of the file name.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(date, image_name, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 16
0
 def parse_item(self, response):
     """Download the ``strip`` image, keeping the remote file's own name.

     The base name and extension are both taken from the image URL. The
     page URL and ``self.cookie`` are forwarded to ``download`` as extra
     arguments. Returns ``None`` without downloading when the target
     path already exists.
     """
     page = Selector(response)
     src = page.xpath('//img[@id="strip"]/@src').extract()[0]
     image_url = urljoin(get_base_url(response), src)
     remote_name = image_url.rsplit('/', 1)[-1]
     no, ext = remote_name.rsplit('.', 1)
     # An earlier naming scheme built from response.meta['link_text']
     # is disabled; the name now derives from the image URL alone.
     image_name = u'{}.{}'.format(no, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if exists(path):
         return None
     return download(image_url, path, response.url, self.cookie)
Ejemplo n.º 17
0
 def parse_item(self, response):
     """Download the comic image as ``<number>.<title>.<ext>``.

     The post title is expected to look like ``"<number>: <title>"``;
     the number is normalised through ``int`` and a single trailing full
     stop is removed from the title. Returns the result of ``download``,
     or ``None`` when the target path already exists.

     Raises:
         IndexError: the comic image or the post title is missing.
         ValueError: the title has no space, its first word is not a
             number, or the image file name has no dot.
     """
     sel = Selector(response)
     image_data = sel.xpath('//div[@id="comic"]//img')
     image_url = image_data.xpath('./@src').extract()[0]
     full_title = sel.xpath('//h2[@class="post-title"]/text()').extract()[0]
     no, image_title = full_title.split(' ', 1)
     no = no.replace(':', '')
     # Drop one trailing full stop but keep "..". endswith() is safe for
     # empty and one-character titles, where indexing [-1] / [-2]
     # raised IndexError.
     if image_title.endswith('.') and not image_title.endswith('..'):
         image_title = image_title[:-1]
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(int(no), image_title.strip(), ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 18
0
 def parse_item(self, response):
     """Download the image in the ``comic_ruutu`` cell, named by its title.

     A leading ``#<n>`` or ``*<n>`` in the title is rewritten to
     ``<n>. `` and a single trailing full stop is removed. The file is
     stored as ``self.PATH/<title>.<ext>``. Returns the result of
     ``download``, or ``None`` when the target path already exists.

     Raises:
         IndexError: the title text or the image is missing.
         ValueError: the image file name contains no dot.
     """
     sel = Selector(response)
     img_data = sel.xpath('//td[@id="comic_ruutu"]')
     image_title = img_data.xpath('./center/div/text()').extract()[0]
     # Raw replacement string: "\g" in a normal literal is an invalid
     # escape sequence and triggers a warning on current Python.
     image_title = re.sub(r'^[#*](\d+)\s+', r'\g<1>. ', image_title)
     # endswith() also copes with empty / one-character titles, which
     # made the former [-1] / [-2] indexing raise IndexError.
     if image_title.endswith('.') and not image_title.endswith('..'):
         image_title = image_title[:-1]
     base_url = get_base_url(response)
     image_url = urljoin(base_url, img_data.xpath('./center/img/@src').extract()[0])
     # Split on the last dot only so multi-dot file names are accepted.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}'.format(image_title, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 19
0
 def parse_item(self, response):
     """Save the comic image under ``<number>.<title>.<ext>``.

     ``<number>`` is the first word of the post title with any colon
     removed and passed through ``int``; ``<title>`` is the rest of the
     post title without a lone trailing full stop.

     Returns:
         The value returned by ``download``, or ``None`` when the target
         path already exists.

     Raises:
         IndexError: no comic image or post title on the page.
         ValueError: the post title cannot be split into number and
             title, or the image file name has no dot.
     """
     sel = Selector(response)
     image_data = sel.xpath('//div[@id="comic"]//img')
     image_url = image_data.xpath('./@src').extract()[0]
     full_title = sel.xpath('//h2[@class="post-title"]/text()').extract()[0]
     no, image_title = full_title.split(' ', 1)
     no = no.replace(':', '')
     # Strip exactly one final "." unless the title ends in "..".
     # Using endswith() avoids the IndexError that [-1] / [-2] raised on
     # titles shorter than two characters.
     if image_title.endswith('.') and not image_title.endswith('..'):
         image_title = image_title[:-1]
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(int(no), image_title.strip(), ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 20
0
    def parse_item(self, response):
        """Download an article's image as ``<yyyy-mm-dd>.<title>.<ext>``.

        The date is read from the ``/en/YYYY/MM/DD/`` part of the page
        URL and the title from the ``entry-title`` heading. An image URL
        without a scheme inherits the scheme of the page URL. Returns
        ``None`` without downloading when the target path already exists.
        """
        sel = Selector(response)
        images = sel.xpath('//div[@id="content"]//article/div[@class="entry-content"]//img')
        image_url = images.xpath('./@src').extract()[0]
        parsed_image = urlparse(image_url)
        if not parsed_image.scheme:
            # Scheme-less source: borrow the scheme of the page itself.
            page_scheme = urlparse(response.url).scheme
            image_url = parsed_image._replace(scheme=page_scheme).geturl()

        image_title = sel.xpath('//h1[@class="entry-title"]/text()').extract()[0]
        file_name = image_url.rsplit('/', 1)[-1]
        _, ext = file_name.rsplit('.', 1)
        date_match = re.search(r'/en/(\d{4})/(\d{2})/(\d{2})/', response.url)
        date = '-'.join(date_match.groups())
        path = self.PATH + '/' + remove_disallowed_filename_chars(
            u'{}.{}.{}'.format(date, image_title, ext))
        if exists(path):
            return None
        return download(image_url, path)
Ejemplo n.º 21
0
 def parse_item(self, response):
     """Download every image found in the entry content.

     A page with one image is saved as ``<title>.<ext>``; a page with
     several images uses ``<title>-<n>.<ext>`` with ``n`` starting at 1.
     Images whose target path already exists are skipped.

     The method is a generator so that ALL missing images are fetched.
     The previous ``return`` inside the loop stopped after the first
     missing image and never reached the rest in that call.

     Yields:
         Each non-``None`` value returned by ``download``.

     Raises:
         IndexError: the page has no ``<h1>`` text.
         ValueError: an image file name contains no dot.
     """
     sel = Selector(response)
     base_url = get_base_url(response)
     image_title = sel.xpath('//h1/text()').extract()[0]
     image_urls = sel.xpath('//div[@class="entry-content"]//img/@src').extract()
     numbered = len(image_urls) > 1
     for idx, _image_url in enumerate(image_urls, 1):
         image_url = urljoin(base_url, _image_url)
         _, ext = image_url.split('/')[-1].rsplit('.', 1)
         if numbered:
             image_name = u'{}-{}.{}'.format(image_title, idx, ext)
         else:
             image_name = u'{}.{}'.format(image_title, ext)
         # NOTE(review): unlike sibling spiders, the title is not passed
         # through a file-name sanitiser here - confirm that titles can
         # never contain path separators.
         path = self.PATH + '/' + image_name
         if exists(path):
             continue
         result = download(image_url, path)
         if result is not None:
             yield result
Ejemplo n.º 22
0
 def parse_item(self, response):
     """Save the ``comicmid`` image under ``<date>.<title>.<ext>``.

     ``<title>`` is the page's first ``<h1>`` text and ``<date>`` is the
     ``stand_high`` small-print text re-formatted as ``YYYY-MM-DD``.

     Returns:
         The value returned by ``download``, or ``None`` when the target
         path already exists.

     Raises:
         IndexError: heading, date or image is missing from the page.
         ValueError: the image file name has no dot.
     """
     sel = Selector(response)
     orig_name = sel.xpath('//h1/text()').extract()[0]
     raw_date = sel.xpath(
         '//div[@class="stand_high"]/small/text()').extract()[0]
     date = parser.parse(raw_date).strftime(u'%Y-%m-%d')
     image_url = sel.xpath(
         '//div[@class="comicmid"]//img/@src').extract()[0]
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # rsplit(..., 1): only the last dot separates the extension, so
     # file names with extra dots no longer raise ValueError.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(date, orig_name, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 23
0
 def parse_item(self, response):
     """Download the centred container image as ``<date>.<title>.<ext>``.

     The date is the first non-blank ``<small>`` text inside the
     ``col-md-9`` column, formatted ``YYYY-MM-DD``; the title is the
     first ``<h1>`` text. Returns the result of ``download``, or ``None``
     when the target path already exists.

     Raises:
         IndexError: title, image or a non-blank date text is missing.
         ValueError: the image file name contains no dot.
     """
     sel = Selector(response)
     image_title = sel.xpath('//h1/text()').extract()[0]
     image_url = sel.xpath(
         '//div[@class="container"]/center//img/@src').extract()[0]
     date_texts = sel.xpath(
         '//div[@class="row"]/div[@class="col-md-9"]/small/text()').extract()
     # Whitespace-only text nodes are ignored when picking the date.
     non_blank = [text for text in date_texts if text.strip()]
     date = parser.parse(non_blank[0]).strftime(u'%Y-%m-%d')
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # Split on the last dot only so multi-dot file names are accepted.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(date, image_title, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)
Ejemplo n.º 24
0
    def parse_item(self, response):
        """Store the article image, named by URL date and entry title.

        File name layout: ``<yyyy>-<mm>-<dd>.<title>.<ext>``. The three
        date parts are captured from the ``/en/<yyyy>/<mm>/<dd>/`` section
        of ``response.url``. A scheme-less image URL is completed with the
        scheme of ``response.url``. When the target path already exists,
        nothing is downloaded and ``None`` is returned.
        """
        sel = Selector(response)
        content_images = sel.xpath(
            '//div[@id="content"]//article/div[@class="entry-content"]//img')
        image_url = content_images.xpath('./@src').extract()[0]
        url_parts = urlparse(image_url)
        if not url_parts.scheme:
            image_url = url_parts._replace(
                scheme=urlparse(response.url).scheme).geturl()

        title_texts = sel.xpath('//h1[@class="entry-title"]/text()').extract()
        image_title = title_texts[0]
        _, ext = image_url.split('/')[-1].rsplit('.', 1)
        url_date = re.search(r'/en/(\d{4})/(\d{2})/(\d{2})/', response.url)
        date = '-'.join(url_date.groups())
        image_name = u'{}.{}.{}'.format(date, image_title, ext)
        safe_name = remove_disallowed_filename_chars(image_name)
        path = self.PATH + '/' + safe_name
        if exists(path):
            return None
        return download(image_url, path)
Ejemplo n.º 25
0
 def parse_item(self, response):
     """Download the comic image as ``<date>.<title>.<ext>``.

     Pages without an image inside ``<div id="comic">`` are skipped
     silently. Title and date come from the ``post-text`` block; the
     date is formatted ``YYYY-MM-DD``.

     Returns:
         The value returned by ``download``, or ``None`` when the page
         has no comic image or the target path already exists.

     Raises:
         IndexError: the post title or post date is missing.
         ValueError: the image file name contains no dot.
     """
     sel = Selector(response)
     try:
         image_url = sel.xpath('//div[@id="comic"]//img/@src').extract()[0]
     except IndexError:
         # No comic image on this page. Only the empty-result case is
         # swallowed; a bare ``except`` would also hide unrelated errors
         # (and KeyboardInterrupt).
         return
     post_info = sel.xpath('//div[@class="post-text"]')
     orig_name = post_info.xpath('./h2/a/text()').extract()[0]
     date = parser.parse(
         post_info.xpath('./span[@class="post-date"]/text()').extract()[0]
     ).strftime(u'%Y-%m-%d')
     base_url = get_base_url(response)
     image_url = urljoin(base_url, image_url)
     # Split on the last dot only so multi-dot file names are accepted.
     _, ext = image_url.split('/')[-1].rsplit('.', 1)
     image_name = u'{}.{}.{}'.format(date, orig_name, ext)
     path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
     if not exists(path):
         return download(image_url, path)