def parse(self, rsp):
    """
    Parse CG items from pixiv illust page

    @url
    @returns items 1 200
    @scrapes crawled_from site_pk large_file_url file_url source
    """
    wrapper = rsp.xpath('//div[@id="wrapper"]')
    wrapper_text = wrapper.extract()[0]
    pixiv_data = extract_pixiv_path(wrapper_text)
    pixiv_data.pop('order')
    order = 0
    while True:
        large_file_url = (
            'https://i.pximg.net/img-original/img/'
            '{year}/{month}/{day}/{hour}/{minute}/{second}/{pk}_p{order}.jpg'
            .format(order=order, **pixiv_data)
        )
        # Pixiv does not expose a page count, so probe each page index
        # with a HEAD request; a 404 means we ran past the last image.
        if requests.head(
            large_file_url,
            headers={'Referer': 'https://www.pixiv.net/'},
        ).status_code == 404:
            break
        yield CG(
            crawled_from='pixiv.net',
            site_pk=pixiv_data['pk'],
            large_file_url=large_file_url,
            file_url=large_file_url,
            source=rsp.url,
        )
        order += 1
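# extract_pixiv_path is a project helper not shown in this section. A
# minimal sketch of what it plausibly does, assuming the input text
# contains an i.pximg.net URL whose path encodes the upload timestamp,
# illust id and page index; the regex and key names are assumptions,
# not the project's actual implementation:
import re


def extract_pixiv_path(text):
    """Pull year/month/day/hour/minute/second/pk/order out of a pximg URL."""
    match = re.search(
        r'i\.pximg\.net/[^"\']*?'
        r'/(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})'
        r'/(?P<hour>\d{2})/(?P<minute>\d{2})/(?P<second>\d{2})'
        r'/(?P<pk>\d+)_p(?P<order>\d+)',
        text,
    )
    return match.groupdict() if match else None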
def parse_json_result(self, art):
    # `art` is a single search-result HTML fragment; fragments without a
    # full-size image carry no data-super-full-img attribute and are skipped.
    try:
        file_url = large_file_url = re.findall(
            r'data-super-full-img="(.*?)"', art)[0]
    except IndexError:
        return None
    url = re.findall(r'href="(.*?)"', art)[0]
    # The deviation id is the last dash-separated token of the URL slug.
    site_pk = url.split('/')[4].split('-')[-1]
    return CG(
        crawled_from='deviantart.com',
        site_pk=site_pk,
        large_file_url=large_file_url,
        file_url=file_url,
        source=url,
    )
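# Illustrative call only. `spider` stands in for a spider instance, and
# the fragment below is a made-up example of the expected input shape,
# not real DeviantArt markup:
sample_art = (
    '<a href="https://example.deviantart.com/art/Some-Title-12345" '
    'data-super-full-img="https://img.example/full/some-title.png">'
)
item = spider.parse_json_result(sample_art)
# item['site_pk'] == '12345' and item['source'] is the deviation URL.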
def parse_image(self, rsp):
    """
    Parse CG items from image page

    @url
    @returns items 1
    @scrapes crawled_from site site_pk large_file_url file_url source
    """
    url = rsp.url
    site_pk = url.split('/')[4].split('-')[-1]
    # The trailing space in the class value matches DeviantArt's markup
    # verbatim; XPath @class comparison is exact, not token-based.
    file_url = large_file_url = rsp.xpath(
        '//img[@class="dev-content-full "]/@src'
    ).extract_first()
    return CG(
        crawled_from='deviantart.com',
        site_pk=site_pk,
        large_file_url=large_file_url,
        file_url=file_url,
        source=url,
    )
def parse(self, rsp):
    """
    Parse CG items from xml

    @url http://danbooru.donmai.us/explore/posts/popular.xml
    @returns items 1 100
    @scrapes crawled_from site_pk large_file_url file_url source md5 pixiv_id donmai_uploader_id rating fav_count score character_tags general_tags copyright_tags artist_tags
    """
    for p in rsp.xpath('//posts/post'):
        cg = extract_donmai_rss(p)
        if not cg:
            continue
        yield CG(**cg)
    # Keep crawling: schedule the next page of results with this same
    # callback.
    next_url = self.get_next_url(rsp)
    yield Request(next_url, callback=self.parse)
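# get_next_url is defined elsewhere on the spider. A plausible sketch,
# assuming danbooru-style pagination via a `page` query parameter (the
# parameter name and default are assumptions):
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse


def get_next_url(self, rsp):
    """Return the current URL with its `page` parameter bumped by one."""
    parts = urlparse(rsp.url)
    query = parse_qs(parts.query)
    page = int(query.get('page', ['1'])[0]) + 1
    query['page'] = [str(page)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))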
def parse_illust(self, rsp):
    """
    Parse CG items from gallery page

    @url
    @returns items 1 200
    @scrapes crawled_from site_pk large_file_url file_url source
    """
    wrapper = rsp.xpath('//div[@id="wrapper"]')
    wrapper_text = wrapper.extract()[0]
    pixiv_data = extract_pixiv_path(wrapper_text)
    pixiv_data.pop('order')
    order = 0
    # Same HEAD-probe loop as parse() above: walk the page index until
    # pixiv answers 404.
    while True:
        large_file_url = (
            'https://i.pximg.net/img-original/img/'
            '{year}/{month}/{day}/{hour}/{minute}/{second}/{pk}_p{order}.jpg'
            .format(order=order, **pixiv_data)
        )
        if requests.head(
            large_file_url,
            headers={'Referer': 'https://www.pixiv.net/'},
        ).status_code == 404:
            break
        yield CG(
            crawled_from='pixiv.net',
            site_pk=pixiv_data['pk'],
            large_file_url=large_file_url,
            file_url=large_file_url,
            source=rsp.url,
        )
        order += 1
    # Follow the next entry in the artist's "works" strip, if any.
    works = rsp.xpath('//div[@id="wrapper"]//section[@class="works"]')[0]
    next_works = works.xpath(
        'ul/li[contains(@class, "selected_works")]'
        '/following-sibling::li/a/@href'
    )
    if next_works:
        next_ = next_works.extract_first().rsplit('=')[-1]
        next_url = (
            'https://www.pixiv.net/member_illust.php'
            '?mode=medium&illust_id=%s' % next_
        )
        yield Request(next_url, callback=self.parse_illust)
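# Both pixiv callbacks assume the original is a .jpg. Pixiv also stores
# originals as .png, so a .jpg-only HEAD probe 404s immediately on those
# works. A possible variant (not in the original code) that tries both
# extensions before giving up:
import requests


def probe_original_url(base_url_without_ext):
    """Return the first original-image URL that exists, or None.

    Hypothetical helper: base_url_without_ext is the pximg original URL
    up to, but not including, the file extension.
    """
    for ext in ('.jpg', '.png'):
        url = base_url_without_ext + ext
        rsp = requests.head(
            url, headers={'Referer': 'https://www.pixiv.net/'})
        if rsp.status_code != 404:
            return url
    return None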
def parse(self, rsp):
    """
    Parse CG items from gallery page

    @url
    @returns items 1 200
    @scrapes crawled_from site_pk large_file_url file_url source
    """
    preview_images = rsp.xpath(
        '//div[@class="newindex"]//ul[contains(@class, "ui-brick")]'
        '/li//img/@src'
    ).extract()
    for image in preview_images:
        # Derive the original-size URL from each preview thumbnail's path.
        pixiv_data = extract_pixiv_path(image)
        large_file_url = (
            'https://i.pximg.net/img-original/img/'
            '{year}/{month}/{day}/{hour}/{minute}/{second}/{pk}_p{order}.jpg'
            .format(**pixiv_data)
        )
        yield CG(
            crawled_from='pixiv.net',
            site_pk=pixiv_data['pk'],
            large_file_url=large_file_url,
            file_url=large_file_url,
            source=(
                'https://www.pixiv.net/member_illust.php'
                '?mode=medium&illust_id={pk}'.format(**pixiv_data)
            ),
        )