def get_score_details(response: TypeResponse):
    """Collect per-score vote counts from the rating chart.

    Returns a dict mapping 'total' and each score label to its raw
    count string (surrounding parentheses stripped).
    """
    details = {
        'total': response.xpath(
            '//*[@id="ChartWarpper"]/div/small/span/text()').extract_first(),
    }
    rows = response.xpath(
        '//*[@id="ChartWarpper"]/ul[@class="horizontalChart"]/li')
    for row in rows:
        label = row.xpath('.//span[@class="label"]/text()').extract_first()
        raw = row.xpath('.//span[@class="count"]/text()').extract_first()
        # count text looks like "(123)" -- drop the wrapping parentheses
        details[label] = raw[1:-1]
    return details
def parse(self, response: TypeResponse):
    """Follow every distinct /subject/ link from the wiki-entry and
    latest-entry tab lists, scheduling parse_page for each."""
    hrefs = set(response.xpath(
        '//*[@id="wikiEntryMainTab"]//li/a/@href').extract())
    hrefs.update(response.xpath(
        '//*[@id="latestEntryMainTab"]//li/a/@href').extract())
    for href in hrefs:
        if '/subject/' not in href:
            continue
        yield Request(response.urljoin(href),
                      callback=self.parse_page,
                      meta={'dont_cache': True})
def get_image(response: TypeResponse):
    """Return the subject cover image URL (large size, scheme-less).

    Falls back to the generic placeholder icon when the cover element
    is absent.
    """
    cover = response.xpath('//*[@id="bangumiInfo"]/div/div/a/img/@src')
    if not cover:
        return 'lain.bgm.tv/img/no_icon_subject.png'
    # swap the common-size cover path for the large one; the
    # protocol-relative '//' prefix is dropped, matching the placeholder
    return cover.extract_first().replace(
        '//lain.bgm.tv/pic/cover/c/', 'lain.bgm.tv/pic/cover/g/')
def get_info(response: TypeResponse):
    """Parse the infobox into a plain dict.

    Each <li> contributes one entry: the <span> label text (with ': '
    removed) maps to the anchor texts when present, otherwise to the
    bare text nodes.
    """
    info = {}
    for row in response.xpath('//*[@id="infobox"]/li'):
        key = row.xpath('span/text()').extract_first().replace(': ', '')
        values = row.xpath('a/text()').extract()
        if not values:
            values = row.xpath('text()').extract()
        info[key] = values
    return info
def get_tag_from_response(response: TypeResponse, subject_id):
    """Yield a TagItem for every tag shown on the subject page.

    Anchors without a <span> text node are skipped.
    """
    anchors = response.xpath(
        '//*[@id="subject_detail"]//div[@class="subject_tag_section"]/div[@class="inner"]/a'
    )
    for anchor in anchors:
        name = anchor.xpath('span/text()').extract_first()
        if not name:
            continue
        count = int(anchor.xpath('small/text()').extract_first())
        yield TagItem(subject_id=subject_id, text=name, count=count)
def parse_topic(self, response: TypeResponse):
    """Parse a group topic page into reply items plus a TopicItem.

    Yields every reply item first, then the topic itself.
    Raises KeyError when the opening post has no author link.
    """
    topic = TopicItem()
    # NOTE(review): this value is recomputed from the reply items and
    # overwritten at the bottom of the function; this initial max() over
    # all re_info timestamps looks redundant -- confirm before removing.
    topic['last_reply'] = max(
        parse_datetime(x.xpath('./text()').extract_first())
        for x in response.xpath('//*[contains(@class,"re_info")]/small'))
    e = response.xpath('//*[contains(@class, "topic_content")]')
    post_topic = response.xpath('//*[contains(@class, "postTopic")]')
    # topic id is the trailing path segment of the page URL
    topic['id'] = response.url.split('/')[-1]
    topic['content'] = parse_content(e)
    # group slug is taken from the first breadcrumb link in the header
    topic['group'] = response.xpath(
        '//*[@id="pageHeader"]/h1/span/a[1]/@href').extract_first().split(
            '/')[-1]
    topic['title'] = response.xpath(
        '//*[@id="pageHeader"]/h1/text()').extract_first()
    topic['author'] = post_topic.xpath(
        './div[contains(@class, "inner")]//a/@href').extract_first()
    if not topic['author']:
        raise KeyError('no author')
    else:
        # keep only the username segment of the profile URL
        topic['author'] = topic['author'].split('/')[-1]
    create_time = post_topic.xpath(
        './div[contains(@class, "re_info")]/small/text()').extract_first()
    topic['create_time'] = parse_datetime(create_time)
    comments = response.xpath('//*[@id="comment_list"]')
    # track the newest reply timestamp while yielding each reply item
    last_reply = topic['create_time']
    for row in comments.xpath('./div[contains(@class, "row_reply")]'):
        for item in parse_row_reply(response, row, topic['id']):
            if item['create_time'] > last_reply:
                last_reply = item['create_time']
            yield item
    topic['last_reply'] = last_reply
    yield topic
def parse(self, response: TypeResponse):
    """Schedule a parse_topic request for every topic in the listing.

    Topic links are rewritten from their rakuen form
    (/rakuen/topic/group/...) to the canonical /group/topic/ form
    before being followed.

    Fixed: a hard-coded debug Request followed by a bare ``return``
    made the real listing loop below unreachable; the debug leftover
    has been removed.
    """
    for item in response.xpath('//*[@id="eden_tpc_list"]/ul/li'):
        url = item.xpath('./a/@href').extract_first().replace(
            '/rakuen/topic/group/', '/group/topic/')
        yield Request(
            response.urljoin(url),
            callback=self.parse_topic,
        )
def parse(self, response: TypeResponse):
    """Parse a subject page, yielding its tags, relation edges and a
    SubjectItem.

    Pages containing '出错了' (error pages) yield nothing.
    """
    # subject id is the trailing path segment of the URL
    subject_id = int(response.url.split('/')[-1])
    if '出错了' not in response.text:
        subject_item = SubjectItem()
        if '条目已锁定' in response.text:
            # locked subjects are still parsed in full; the flag is recorded
            subject_item['id'] = subject_id
            subject_item['locked'] = True
        subject_type = response.xpath(
            '//*[@id="panelInterestWrapper"]//div[contains(@class, '
            '"global_score")]'
            '/div/small[contains(@class, "grey")]/text()').extract_first()
        # the second whitespace-separated token of the grey small-tag text
        # is taken as the subject type -- TODO confirm against a live page
        subject_item['subject_type'] = subject_type.split()[1]
        # NOTE(review): duplicates subject_id computed above
        subject_item['id'] = int(response.url.split('/')[-1])
        subject_item['info'] = get_info(response)
        # NOTE(review): placeholder literal; tags are emitted separately
        # below via get_tag_from_response -- confirm this field is unused
        subject_item['tags'] = 'tags'
        yield from get_tag_from_response(response, subject_id)
        subject_item['image'] = get_image(response)
        subject_item['score'] = get_score(response)
        subject_item['score_details'] = get_score_details(response)
        title = response.xpath('//*[@id="headerSubject"]/h1/a')[0]
        # title attribute holds the Chinese name; the text node the original
        subject_item['name_cn'] = title.attrib['title']
        subject_item['name'] = title.xpath('text()').extract_first()
        # this will set 'wishes', 'done', 'doings', 'on_hold', 'dropped'
        subject_item.update(get_collector_count(response))
        for edge in get_relation(response, source=subject_item['id']):
            relation_item = RelationItem(**edge, )
            yield relation_item
            # yield Request(url_from_id(relation_item['target']))
        yield subject_item
def get_collector_count(response: TypeResponse):
    """Return collection counts keyed by collector state.

    Each count is parsed from the "N人..." text of the matching link in
    the collect panel; states with no matching link default to 0.
    """
    counts = {}
    for state, href_suffix in collector.items():
        text = response.xpath(
            '//*[@id="subjectPanelCollect"]/span[@class="tip_i"]'
            '/a[re:test(@href, "{}$")]/text()'.format(href_suffix),
            namespaces={'re': regexpNS},
        ).extract_first()
        counts[state] = int(text.split('人')[0]) if text else 0
    return counts
def get_relation(response: TypeResponse, source):
    """Extract related-subject edges from the "关联条目" section.

    The section's <li> entries are grouped by separator items
    (class containing "sep") whose <span> text names the relation;
    each group's anchors point at the related subjects.

    Returns a list of dicts with 'source', 'target' and 'relation' keys.

    Fixed: an <li> appearing before the first separator used to raise
    IndexError on ``chunk_list[-1]``; such orphan items are now skipped.
    """
    section = response.xpath(
        '//div[@class="subject_section"][//h2[@class="subtitle" and contains('
        'text(), "关联条目")]]'
        '/div[@class="content_inner"]/ul/li')
    chunk_list = []  # type: List[TypeSelectorList]
    for li in section:
        if 'sep' in li.attrib.get('class', ''):
            # a separator starts a new relation group (and may itself
            # contain the group's first subject link)
            chunk_list.append([li])
        elif chunk_list:
            chunk_list[-1].append(li)
        # else: item before any separator has no relation label; skip it
    relation = []
    for li_list in chunk_list:
        rel = li_list[0].xpath('span/text()').extract_first()
        for li in li_list:
            target = li.xpath('a/@href').extract_first()
            relation.append({
                'source': source,
                'target': int(target.split('/')[-1]),
                'relation': rel,
            })
    return relation
def get_score(response: TypeResponse):
    """Return the global score text from the subject's interest panel."""
    score_xpath = (
        '//*[@id="panelInterestWrapper"]//div[@class="global_score"]'
        '/span[1]/text()'
    )
    return response.xpath(score_xpath).extract_first()