def parse_list(self, response):
    """Parse a listing response into a ``Page`` of ``Art`` entries.

    ``query`` and ``uri`` are read from ``response.meta``. When ``empty``
    reports the body as empty, a ``Page`` with ``total=0`` and no arts is
    returned. Otherwise the rows of the ``FixFrame`` table are converted
    into ``Art`` items, at most ``MAX_ARTS`` of them.

    Returns:
        A ``Page`` on success, or ``Result(ok=False, query=query)`` when
        the markup cannot be parsed.
    """
    query = response.meta['query']
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % query, level=log.INFO)

    if empty(response.body_as_unicode()):
        log.msg('empty result', level=log.INFO)
        return Page(query=query, uri=uri, total=0, arts=[])

    def gen(trs):
        # Every second row from index 2 up to (not including) the last row;
        # presumably the skipped rows are headers/spacers -- TODO confirm
        # against the page markup.
        for tr in trs[2:-1:2][:MAX_ARTS]:
            yield Art(
                title=tr.xpath('td[@class="c1"]/a/text()').extract()[0],
                author=tr.xpath('td[@class="c2"]/a/text()').extract()[0],
                company=tr.xpath('td[@class="c3"]/a/text()').extract()[0],
                uri=urljoin(
                    BASE_URL,
                    tr.xpath('td[@class="c1"]/a/@href').extract()[0]),
                # Membership test against the extracted text nodes: one of
                # them has to equal the marker exactly.
                status='reserve'
                if u'予' in tr.xpath('td[@class="c7"]/text()').extract()
                else 'other',
            )

    sel = Selector(response)
    try:
        trs = list(sel.xpath('//table[@class="FixFrame"]//tr'))
        return Page(query=query, uri=uri, total=total(sel),
                    arts=list(gen(trs)))
    except Exception as e:
        # Only ordinary errors are treated as a parse failure; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        # %r keeps the log call safe for non-ASCII exception messages.
        log.msg('parse failed: %r' % (e,), level=log.ERROR)
        return Result(ok=False, query=query)
def parse_art(self, response):
    """Parse a single art detail page.

    An ``Art`` is built from the detail markup. When ``response.meta`` has
    no ``page`` key the art is wrapped in a one-item ``Page``. Otherwise
    the art is stored into the shared page at the index given by
    ``response.meta['ranks'][uri]`` and that page is returned only once
    ``page_complete`` reports it as complete.

    Returns:
        A ``Page``; ``Result(ok=False, query=uri)`` when the markup cannot
        be parsed; or ``None`` while the shared page is still incomplete.
    """
    uri = response.meta['uri']
    sel = Selector(response)
    try:
        art = Art(
            title=sel.xpath(
                '//td[@class="td_title_bar_r1c2"]/text()').extract()[0],
            # NOTE(review): unlike title/company, the whole extract()
            # result is kept here rather than its first element --
            # presumably to allow several authors; confirm with consumers.
            author=sel.xpath(
                '//td[@class="DetailData_L"]/a[contains(@href, "author")]/text()'
            ).extract(),
            company=sel.xpath(
                '//td[@class="CircleName"]/a[1]/text()').extract()[0],
            uri=uri,
            status=status_in_art(sel),
        )
    except Exception as e:
        # Only ordinary errors are treated as a parse failure; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        log.msg('parse failed: %r' % (e,), level=log.ERROR)
        return Result(ok=False, query=uri)

    if 'page' not in response.meta:
        # Stand-alone request: the art is the entire result.
        return Page(query=uri, uri=uri, total=1, arts=[art])

    # Request issued on behalf of a list page: fill this art's slot.
    page = response.meta['page']
    ranks = response.meta['ranks']
    page['arts'][ranks[uri]] = art
    if page_complete(page):
        return page
    return None
def parse_user_illustrations_uri(self, response):
    """Turn a user's illustration listing into a ``Page``.

    The author name comes from the ``h1.user`` heading and each ``a.work``
    link becomes one ``Art``; at most ``self.max_arts`` links are used.
    ``total`` is the first run of digits found in the counter span.

    Returns:
        A ``Page`` on success, otherwise
        ``Result(ok=False, query=query, message=...)`` carrying the text
        of the exception.
    """
    query = response.meta['query']
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % uri)

    selector = Selector(response)
    try:
        # The total is extracted before the arts, so the first failure
        # reported is the same one the counter lookup would raise.
        counter_text = selector.xpath(
            '//*[@id="wrapper"]/div[1]/div[1]/div/span/text()')
        total = counter_text.re(r'\d+')[0]

        author = selector.xpath('//h1[@class="user"]/text()').extract()[0]
        works = selector.xpath('//a[@class="work"]')[:self.max_arts]
        arts = [
            Art(
                title=work.xpath('h1/@title').extract()[0],
                author=author,
                uri=urljoin(BASE_URL, work.xpath('@href').extract()[0]),
                thumbnail_uri=work.xpath('img/@src').extract()[0],
            )
            for work in works
        ]
        return Page(query=query, uri=uri, total=total, arts=arts)
    except Exception as err:
        log.msg('parse failed', level=log.ERROR)
        return Result(ok=False, query=query, message=str(err))
def parse_complex_list(self, response):
    """Parse a list page whose arts each need a follow-up detail request.

    This is a generator. For an empty body (as judged by ``empty``) it
    yields a single ``Page`` with ``total=0``. Otherwise it collects up to
    ``MAX_ARTS`` detail links, creates a ``Page`` whose ``arts`` list is
    pre-filled with ``None`` placeholders, and yields one request per link
    (built by ``self.make_art_request``). Each request carries the shared
    page and a uri-to-index mapping in its ``meta``.

    Yields:
        A ``Page`` (empty result), detail requests, or
        ``Result(ok=False, query=query)`` when parsing fails.
    """
    query = response.meta['query']
    uri = response.meta['uri']
    log.msg(u'got response of query %s' % query, level=log.INFO)

    if empty(response.body_as_unicode()):
        log.msg('empty result', level=log.INFO)
        yield Page(query=query, uri=uri, total=0, arts=[])
        return

    sel = Selector(response)
    try:
        uris = [
            urljoin(BASE_URL, href)
            for href in sel.xpath(
                '//tr[@class="TBLdtil"]/td[@class="noi_c2"]/a/@href'
            ).extract()[:MAX_ARTS]
        ]
        log.msg('got %d arts' % len(uris))
        page = Page(query=query, uri=uri, total=total_complex(sel),
                    arts=[None] * len(uris))
        # Position of every art uri inside page['arts'].
        ranks = {art_uri: i for i, art_uri in enumerate(uris)}
        # A distinct loop name keeps the page ``uri`` above from being
        # rebound while iterating.
        for art_uri in uris:
            req = self.make_art_request(art_uri)
            req.meta['page'] = page
            req.meta['ranks'] = ranks
            yield req
    except Exception as e:
        # Must not be a bare ``except``: closing this generator raises
        # GeneratorExit at the ``yield`` above, and catching it and then
        # yielding again makes ``close()`` fail with RuntimeError.
        log.msg('parse failed: %r' % (e,), level=log.ERROR)
        yield Result(ok=False, query=query)
def parse_bangumi(self, response):
    """Decode a JSON response body into a ``Bangumi`` item.

    Returns:
        ``Bangumi(query=..., content=<decoded JSON>)`` on success, or
        ``Result(ok=False, query=query)`` when the body cannot be decoded
        or the item cannot be built.
    """
    query = response.meta['query']
    try:
        content = json.loads(response.body_as_unicode())
        return Bangumi(query=query, content=content)
    except Exception as e:
        # Only ordinary errors are treated as a parse failure; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        log.msg('parse failed: %r' % (e,), level=log.ERROR)
        return Result(ok=False, query=query)
def parse_rss(self, response):
    """Build an ``RSS`` item from every ``<item>`` node of the response.

    Each node is converted with ``make_art``.

    Returns:
        ``RSS(query=..., arts=[...])`` on success, or
        ``Result(ok=False, query=query)`` after logging the error.
    """
    query = response.meta['query']
    try:
        item_nodes = Selector(response).xpath('//item')
        arts = [make_art(node) for node in item_nodes]
        return RSS(query=query, arts=arts)
    except Exception as err:
        log.msg('parse failed: %s' % str(err), level=log.ERROR)
        return Result(ok=False, query=query)
def parse_user(self, response):
    """Build a ``User`` item from the entries of the ``main_list`` block.

    Each ``<li>`` under ``div.main_list > ul`` is converted with
    ``make_post``.

    Returns:
        ``User(query=..., posts=[...])`` on success, or
        ``Result(ok=False, query=query)`` after logging the error.
    """
    query = response.meta['query']
    try:
        entries = Selector(response).xpath(
            '//div[@class="main_list"]/ul/li')
        posts = [make_post(entry) for entry in entries]
        return User(query=query, posts=posts)
    except Exception as err:
        log.msg('parse failed: %s' % str(err), level=log.ERROR)
        return Result(ok=False, query=query)
def parse_ranking_uri(self, response):
    """Store one ranking chunk and emit the merged ``Page`` once none is missing.

    The decoded JSON body is written into ``response.meta['pages']`` at
    index ``response.meta['page']``: an empty list when the payload has an
    ``error`` key, its ``contents`` otherwise. While any slot still holds
    ``None`` nothing is returned.

    Returns:
        ``None`` while slots are missing; a ``Page`` concatenating all
        slots once none is ``None``; or
        ``Result(ok=False, query=query, message=...)`` on any error.
    """
    query = response.meta['query']
    try:
        slots = response.meta['pages']
        payload = json.loads(response.body_as_unicode())

        if 'error' in payload:
            contents = []
        else:
            contents = payload['contents']
        slots[response.meta['page']] = contents

        if None in slots:
            # At least one chunk has not been stored yet.
            return None

        arts = list(chain.from_iterable(slots))
        return Page(
            query=query,
            uri=make_ranking_uri(query),
            total=len(arts),
            arts=arts,
        )
    except Exception as err:
        log.msg('parse failed, content: %s' % response.body_as_unicode(),
                level=log.ERROR)
        return Result(ok=False, query=query, message=str(err))
def failed(query, message):
    """Log a parse failure and return the matching failed ``Result``.

    Args:
        query: Value passed through as the ``query`` of the ``Result``.
        message: Failure description; logged and stored on the ``Result``.

    Returns:
        ``Result(ok=False, query=query, message=message)``.
    """
    log_line = 'parse failed: %s' % message
    log.msg(log_line, level=log.ERROR)
    failure = Result(ok=False, query=query, message=message)
    return failure