def parse_item(self, response):
    """Save the strip image as '<YYYY.MM.DD>.<title>.<ext>'.

    Falls back through several header layouts to find a title, and
    prefers the second article-body image when one exists.
    """
    sel = Selector(response)
    base_url = get_base_url(response)
    published = sel.xpath(
        '//meta[@itemprop="datePublished"]/@content').extract()[0]
    date = parser.parse(published).strftime(u'%Y.%m.%d')
    # Title lookup order: dedicated <div class="name">, then the bare
    # <h1> text, then the concatenated non-anchor children of <h1>.
    try:
        title = sel.xpath(
            '//h1/div[@class="name"]/text()').extract()[0].strip()
    except IndexError:
        try:
            title = sel.xpath('//h1/text()').extract()[0].replace(
                ':', '').strip()
        except IndexError:
            children = sel.xpath('//h1/*[not(self::a)]')
            title = u''.join(
                child.xpath('string(.)').extract()[0] for child in children
            ).strip()
    # Prefer the second article-body image; fall back to the first.
    try:
        image_url = sel.xpath(
            '(//*[@itemprop="articleBody"]//img)[2]/@src').extract()[0]
    except IndexError:
        image_url = sel.xpath(
            '//*[@itemprop="articleBody"]//img/@src').extract()[0]
    image_url = urljoin(base_url, image_url)
    _, ext = image_url.split('/')[-1].rsplit('.', 1)
    image_name = u'{}.{}.{}'.format(date, title, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Download one strip, naming the file '<date>.<title>.<ext>'."""
    sel = Selector(response)
    base_url = get_base_url(response)
    stamp = sel.xpath(
        '//meta[@itemprop="datePublished"]/@content').extract()[0]
    date = parser.parse(stamp).strftime(u'%Y.%m.%d')
    try:
        # Usual layout: title sits in <h1><div class="name">.
        image_title = sel.xpath(
            '//h1/div[@class="name"]/text()').extract()[0].strip()
    except IndexError:
        try:
            # Older layout: plain <h1> text (colons stripped).
            image_title = sel.xpath('//h1/text()').extract()[0].replace(
                ':', '').strip()
        except IndexError:
            # Last resort: join the text of every non-anchor <h1> child.
            nodes = sel.xpath('//h1/*[not(self::a)]')
            pieces = [n.xpath('string(.)').extract()[0] for n in nodes]
            image_title = u''.join(pieces).strip()
    try:
        # Second body image is the strip when present.
        image_url = sel.xpath(
            '(//*[@itemprop="articleBody"]//img)[2]/@src').extract()[0]
    except IndexError:
        image_url = sel.xpath(
            '//*[@itemprop="articleBody"]//img/@src').extract()[0]
    image_url = urljoin(base_url, image_url)
    _, ext = image_url.split('/')[-1].rsplit('.', 1)
    image_name = u'{}.{}.{}'.format(date, image_title, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the strip image, named by the last segment of the page URL."""
    sel = Selector(response)
    base_url = get_base_url(response)
    strip_no = response.url.rsplit('/')[-1]
    src = sel.xpath('//img[@id="cm_strip"]/@src').extract()[0]
    image_url = urljoin(base_url, src)
    _, ext = image_url.split('/')[-1].rsplit('.', 1)
    image_name = u'{}.{}'.format(strip_no, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Download a full-size gallery image, named by its crawl index.

    Fix: the extension is taken with rsplit('.', 1) — the previous
    two-way unpack of split('.') raised ValueError whenever the file's
    base name contained an extra dot.
    """
    sel = Selector(response)
    image_url = sel.xpath(
        '//span[@class="full-size-link"]/a/@href').extract()[0]
    # Index assigned by the page that queued this request.
    no = response.meta.get('index')
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    # Only the final suffix is the extension.
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}'.format(no, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        download(image_url, path)
def parse_item(self, response):
    """Save the strip image as '<basename>.<alt-text title>.<ext>'.

    Fix: the name/extension split now uses rsplit('.', 1); plain
    split('.') raised ValueError for base names containing a dot.
    """
    base_url = get_base_url(response)
    sel = Selector(response)
    img_data = sel.xpath('//tbody[@class="style5"]/tr[2]/td/img')
    image_url = urljoin(base_url, img_data.xpath('./@src').extract()[0])
    image_title = img_data.xpath('./@alt').extract()[0]
    # Split off only the final suffix as the extension.
    name, ext = image_url.split('/')[-1].rsplit('.', 1)
    image_name = u'{}.{}.{}'.format(name, image_title, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the comic as '<YYYY-MM-DD>.<name>.<ext>'.

    Fix: the extension is taken with rsplit('.', 1); plain split('.')
    raised ValueError whenever the base name contained a dot.
    """
    sel = Selector(response)
    image_url = sel.xpath('//div[@id="cc-comicbody"]//img/@src').extract()[0]
    # meta['date_text'] carries '<date> - <name>' from the crawl rule
    # that queued this request (presumably the archive link text).
    date, image_name = response.meta.get('date_text').split(' - ')
    date = parser.parse(date).strftime(u'%Y-%m-%d')
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}.{}'.format(date, image_name, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the strip, named by the 's' query parameter of the page URL.

    Fixes: the extension is taken with rsplit('.', 1) — the previous
    two-way unpack of split('.') raised ValueError on dotted base names
    (and its name half was never used anyway).
    """
    comic_id = parse_qs(urlparse(response.url).query).get('s')[0]
    sel = Selector(response)
    image_url = sel.xpath('//img[@id="strip"]/@src').extract()[0]
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    # Only the final suffix is the extension.
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}'.format(comic_id, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Download a full-size gallery image, named by its crawl index.

    Fix: extension extraction uses rsplit('.', 1) so base names that
    contain a dot no longer raise ValueError on unpacking.
    """
    sel = Selector(response)
    image_url = sel.xpath(
        '//span[@class="full-size-link"]/a/@href').extract()[0]
    # Index assigned by the page that queued this request.
    no = response.meta.get('index')
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}'.format(no, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        download(image_url, path)
def parse_item(self, response):
    """Save the strip as '<YYYY-MM-DD>.<original title>.<ext>'.

    Fix: the extension is taken with rsplit('.', 1); plain split('.')
    raised ValueError when the base name contained a dot.
    """
    sel = Selector(response)
    orig_name = sel.xpath('//h1/text()').extract()[0]
    date = parser.parse(
        sel.xpath(
            '//div[@class="stand_high"]/small/text()').extract()[0]
    ).strftime(u'%Y-%m-%d')
    image_url = sel.xpath('//div[@class="comicmid"]//img/@src').extract()[0]
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}.{}'.format(date, orig_name, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the comic as '<YYYY-MM-DD>.<name>.<ext>'.

    Fix: the extension is taken with rsplit('.', 1); plain split('.')
    raised ValueError whenever the base name contained a dot.
    """
    sel = Selector(response)
    image_url = sel.xpath(
        '//div[@id="cc-comicbody"]//img/@src').extract()[0]
    # meta['date_text'] carries '<date> - <name>' from the crawl rule
    # that queued this request (presumably the archive link text).
    date, image_name = response.meta.get('date_text').split(' - ')
    date = parser.parse(date).strftime(u'%Y-%m-%d')
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}.{}'.format(date, image_name, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the strip, named by the image file's own base name.

    Fix: removed a stale block of commented-out code (an abandoned
    naming scheme based on link text).
    """
    sel = Selector(response)
    image_url = sel.xpath('//img[@id="strip"]/@src').extract()[0]
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    no, ext = image_url.split('/')[-1].rsplit('.', 1)
    image_name = u'{}.{}'.format(no, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        # This spider also passes the page URL and its cookie through
        # to download() — presumably required by the site; confirm
        # against download()'s signature.
        return download(image_url, path, response.url, self.cookie)
def parse_item(self, response):
    """Save the comic image as '<number>.<title>.<ext>'."""
    sel = Selector(response)
    comic_img = sel.xpath('//div[@id="comic"]//img')
    image_url = comic_img.xpath('./@src').extract()[0]
    heading = sel.xpath('//h2[@class="post-title"]/text()').extract()[0]
    # Heading looks like '<number>: <title>' — split once on the space.
    number, title = heading.split(' ', 1)
    number = number.replace(':', '')
    # Drop a single trailing period, but leave a double dot (e.g. an
    # ellipsis) untouched.
    if title[-1] == '.' and not title[-2] == '.':
        title = title[:-1]
    stem, ext = image_url.split('/')[-1].rsplit('.', 1)
    image_name = u'{}.{}.{}'.format(int(number), title.strip(), ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the strip as '<renumbered title>.<ext>'.

    Fixes: the regex replacement is now a raw string (its group
    backreference was an invalid string escape and drew a warning on
    modern Pythons), and the extension is taken with rsplit('.', 1)
    so dotted base names no longer break the unpack.
    """
    sel = Selector(response)
    img_data = sel.xpath('//td[@id="comic_ruutu"]')
    image_title = img_data.xpath('./center/div/text()').extract()[0]
    # Normalise a leading '#123 ' or '*123 ' prefix to '123. '.
    image_title = re.sub(r'^[#*](\d+)\s+', r'\g<1>. ', image_title)
    # Trim one trailing period, but leave a double dot alone.
    if image_title[-1] == '.' and not image_title[-2] == '.':
        image_title = image_title[:-1]
    base_url = get_base_url(response)
    image_url = urljoin(base_url,
                        img_data.xpath('./center/img/@src').extract()[0])
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}'.format(image_title, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the entry image as '<YYYY-MM-DD>.<entry title>.<ext>'."""
    sel = Selector(response)
    entry_img = sel.xpath(
        '//div[@id="content"]//article/div[@class="entry-content"]//img')
    image_url = entry_img.xpath('./@src').extract()[0]
    # Scheme-relative src: borrow the scheme from the page's own URL.
    if not urlparse(image_url).scheme:
        page_scheme = urlparse(response.url).scheme
        image_url = urlparse(image_url)._replace(
            scheme=page_scheme).geturl()
    image_title = sel.xpath(
        '//h1[@class="entry-title"]/text()').extract()[0]
    _, ext = image_url.split('/')[-1].rsplit('.', 1)
    # The date is embedded in the page URL as /en/YYYY/MM/DD/.
    date = '-'.join(
        re.search(r'/en/(\d{4})/(\d{2})/(\d{2})/', response.url).groups())
    image_name = u'{}.{}.{}'.format(date, image_title, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_start_url(self, response):
    """Download the character sheet, then queue every gallery image.

    Fix: the extension is taken with rsplit('.', 1) — the previous
    two-way unpack of split('.') raised ValueError when the linked
    file's base name contained a dot.
    """
    sel = Selector(response)
    image_url = sel.xpath(
        '//div[@class="entry-content"]/p/a/@href').extract()[0]
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'_characters.{}'.format(ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        download(image_url, path)
    images = sel.xpath('//figure[@class="gallery-item"]//a/@href').extract()
    # 1-based index becomes the saved file's name in parse_item.
    for i, link in enumerate(images, start=1):
        yield Request(link, meta={'index': i}, callback=self.parse_item)
def parse_item(self, response):
    """Save the strip as '<YYYY-MM-DD>.<original title>.<ext>'.

    Fix: the extension is taken with rsplit('.', 1); plain split('.')
    raised ValueError when the base name contained a dot.
    """
    sel = Selector(response)
    orig_name = sel.xpath('//h1/text()').extract()[0]
    date = parser.parse(
        sel.xpath('//div[@class="stand_high"]/small/text()').extract()
        [0]).strftime(u'%Y-%m-%d')
    image_url = sel.xpath(
        '//div[@class="comicmid"]//img/@src').extract()[0]
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}.{}'.format(date, orig_name, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_start_url(self, response):
    """Download the character sheet, then queue every gallery image.

    Fix: extension extraction uses rsplit('.', 1) so a dotted base
    name no longer raises ValueError on unpacking.
    """
    sel = Selector(response)
    image_url = sel.xpath(
        '//div[@class="entry-content"]/p/a/@href').extract()[0]
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'_characters.{}'.format(ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        download(image_url, path)
    images = sel.xpath(
        '//figure[@class="gallery-item"]//a/@href').extract()
    # 1-based index becomes the saved file's name in parse_item.
    for i, link in enumerate(images, start=1):
        yield Request(link, meta={'index': i}, callback=self.parse_item)
def parse_item(self, response):
    """Save the strip as '<YYYY-MM-DD>.<title>.<ext>'.

    Fix: the extension is taken with rsplit('.', 1); plain split('.')
    raised ValueError when the base name contained a dot.
    """
    sel = Selector(response)
    image_title = sel.xpath('//h1/text()').extract()[0]
    image_url = sel.xpath(
        '//div[@class="container"]/center//img/@src').extract()[0]
    date_texts = sel.xpath(
        '//div[@class="row"]/div[@class="col-md-9"]/small/text()').extract()
    # The date node may sit among whitespace-only text siblings; use
    # the first non-blank one.
    date = parser.parse(
        [x for x in date_texts if x.strip()][0]).strftime(u'%Y-%m-%d')
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}.{}'.format(date, image_title, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the entry image as '<YYYY-MM-DD>.<entry title>.<ext>'."""
    sel = Selector(response)
    img_nodes = sel.xpath(
        '//div[@id="content"]//article/div[@class="entry-content"]//img')
    image_url = img_nodes.xpath('./@src').extract()[0]
    if not urlparse(image_url).scheme:
        # Scheme-relative src: reuse the scheme of the page URL.
        image_url = urlparse(image_url)._replace(
            scheme=urlparse(response.url).scheme).geturl()
    image_title = sel.xpath(
        '//h1[@class="entry-title"]/text()').extract()[0]
    _, ext = image_url.split('/')[-1].rsplit('.', 1)
    # The publication date is encoded in the URL path: /en/YYYY/MM/DD/.
    ymd = re.search(r'/en/(\d{4})/(\d{2})/(\d{2})/', response.url).groups()
    date = '-'.join(ymd)
    image_name = u'{}.{}.{}'.format(date, image_title, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)
def parse_item(self, response):
    """Save the comic as '<YYYY-MM-DD>.<post title>.<ext>'.

    Fixes: the bare `except:` around the image lookup now catches only
    IndexError (the exception raised by the [0] on an empty result),
    so unrelated errors are no longer silently swallowed; and the
    extension is taken with rsplit('.', 1) so dotted base names don't
    break the unpack.
    """
    sel = Selector(response)
    try:
        image_url = sel.xpath('//div[@id="comic"]//img/@src').extract()[0]
    except IndexError:
        # Page has no comic image — nothing to download.
        return
    post_info = sel.xpath('//div[@class="post-text"]')
    orig_name = post_info.xpath('./h2/a/text()').extract()[0]
    date = parser.parse(
        post_info.xpath('./span[@class="post-date"]/text()').extract()[0]
    ).strftime(u'%Y-%m-%d')
    base_url = get_base_url(response)
    image_url = urljoin(base_url, image_url)
    ext = image_url.split('/')[-1].rsplit('.', 1)[1]
    image_name = u'{}.{}.{}'.format(date, orig_name, ext)
    path = self.PATH + '/' + remove_disallowed_filename_chars(image_name)
    if not exists(path):
        return download(image_url, path)