def spider_error(self, failure, response, spider):
    message = 'Error: {0}'.format(failure.getErrorMessage())
    ei_item = ErrorInfoItem()
    ei_item['time'] = now_string()
    ei_item['url'] = response.url
    ei_item['error_level'] = "E"
    ei_item['error_type'] = "E1000"
    ei_item['description'] = message
    spider.logger.error(message)
    return ei_item
def spider_closed(self, spider):
    now = now_string()
    message = 'bdbk spider end at: {0}'.format(now)
    ei_item = ErrorInfoItem()
    ei_item['time'] = now
    ei_item['url'] = spider.start_page
    ei_item['error_level'] = "I"
    ei_item['error_type'] = "I2"
    ei_item['description'] = message
    spider.logger.info(message)
    return ei_item
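# Hedged sketch (not part of the original source): the two handlers above match
# the signatures of Scrapy's built-in spider_error / spider_closed signals, so
# they are presumably connected via a from_crawler hook and enabled through the
# EXTENSIONS setting. The class name below is a hypothetical placeholder.
from scrapy import signals

class ErrorInfoExtension(object):
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # connect each handler to the matching built-in signal
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext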
def parse_person(self, response):
    url = response.url.split('?')[0]
    if self.check_visited(url):
        return
    kwlist = response.xpath('//meta[@name="keywords"]/@content').extract()
    if len(kwlist) == 0:
        self.redis_client_person.set(url, 1)
        return
    keywords = kwlist[0].encode('utf-8', 'ignore')
    '''
    # the 'keywords' meta must contain '人物' (person)
    if keywords.find('人物') == -1:
        self.redis_client_person.set(url, 1)
        return
    '''
    description = response.xpath('//meta[@name="description"]/@content').extract()[0].encode('utf-8', 'ignore')
    page_title = response.xpath('//h1/text()').extract()[0].encode('utf-8', 'ignore')

    # get person tags (人物标签)
    person_tags = list()
    categories = dict()
    is_person = False
    for sel in response.xpath('//span[@class="taglist"]'):
        tag = sel.xpath('text()').extract()[0]
        tag = re.sub(r'[\r\n]*', '', tag).encode('utf-8', 'ignore')
        if len(tag) == 0:
            continue
        if tag in self.ignore_tags:
            message = 'In ignore list. name: {0}, tag: {1}'.format(page_title, tag)
            ei_item = ErrorInfoItem()
            ei_item['time'] = now_string()
            ei_item['url'] = url
            ei_item['error_level'] = "W"
            ei_item['error_type'] = 'W1'
            ei_item['description'] = message
            yield ei_item
            self.logger.warning(message)
            self.redis_client_person.set(url, 1)
            return
        if tag.find('人物') != -1:
            is_person = True
        person_tags.append(tag)
        # save the per-tag count to redis
        category_cnt = self.redis_client.get(tag)
        if category_cnt is None:
            category_cnt = 1
        else:
            category_cnt = int(category_cnt) + 1
        self.redis_client.set(tag, category_cnt)
        categories[tag] = category_cnt

    # if the tags do not contain '人物' (person), do not scrape; just follow links
    if not is_person and self.follow_link:
        self.redis_client_person.set(url, 1)
        # follow links whose href contains 'view' (view/subview); skip visited
        # ones with continue (a bare return here would drop the remaining links)
        for sel in response.xpath('//a[contains(@href, "view")]'):
            link = response.urljoin(sel.xpath('@href').extract()[0].split('?')[0])
            if self.check_visited(link):
                continue
            request = scrapy.Request(link, callback=self.parse_person)
            yield request
        return

    person_item = PersonItem()
    person_item['name'] = page_title
    person_item['url'] = url
    person_item['description'] = description
    person_item['tags'] = person_tags
    person_item['keywords'] = keywords
    summary_pic = response.xpath('//div[@class="summary-pic"]/a/img/@src').extract()
    if len(summary_pic) > 0:
        summary_pic = summary_pic[0].split('/')[-1].split('.')[0]
    else:
        summary_pic = ''
    person_item['summary_pic'] = summary_pic
    # for the data pipeline
    yield person_item
    yield categories

    # crawl the image gallery (图册): album list
    album_list = response.xpath('//script/text()').re(r'AlbumList\({.*[\n\t]*.*[\n\t]*.*[\n\t]*.*')
    albums = list()
    if len(album_list) > 0:
        album_list = album_list[0]
        album_list = re.sub(r'[\r\n\t]*', '', album_list)
        album_lemma_id = re.findall(r'lemmaId:"([\d]+)"', album_list)[0]
        album_sublemma_id = re.findall(r'subLemmaId:"([\d]+)"', album_list)[0]
        album_data_json = re.sub(r'AlbumList.*data:', '', album_list)
        try:
            album_data_dict = json.loads(album_data_json)
            i = 0
            for d in album_data_dict:
                # the decoded payload may be a list or a dict keyed by album id
                if isinstance(album_data_dict, list):
                    cover_pic = d["coverpic"]
                    album_desc = d["desc"]
                    album_total = d["total"]
                    album_url = '/picture/{0}/{1}/{2}/{3}'.format(album_lemma_id, album_sublemma_id, i, cover_pic)
                    i += 1
                else:
                    cover_pic = album_data_dict[d]["coverpic"]
                    album_desc = album_data_dict[d]["desc"]
                    album_total = album_data_dict[d]["total"]
                    album_url = '/picture/{0}/{1}/{2}/{3}'.format(album_lemma_id, album_sublemma_id, d, cover_pic)
                album_url = response.urljoin(album_url)
                # build album_item
                album_item = AlbumItem()
                album_item['url'] = album_url
                album_item['description'] = album_desc.encode('utf8', 'ignore')
                album_item['total'] = album_total
                album_item['cover_pic'] = cover_pic
                album_item['person_name'] = person_item['name']
                album_item['person_url'] = person_item['url']
                albums.append(album_item)
        except Exception as e:
            self.logger.error('json parse album list info error. url: %s, err: %r', response.url, e)
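# Hedged sketch (assumption, not shown in this excerpt): parse_person marks
# handled URLs with self.redis_client_person.set(url, 1), so check_visited most
# likely probes the same Redis keyspace. A minimal version could be:
def check_visited(self, url):
    # any existing key means the page was already scraped or deliberately skipped
    return self.redis_client_person.get(url) is not None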
# NOTE: person_info is presumably carried in via response.meta (its definition
# is not shown in this excerpt).
album_info_str = None  # initialize so a no-match pass does not raise NameError
try:
    r = re.compile('albums:.*,[\r\n\s]*lemmaId:')
    for s in response.xpath('//script/text()').extract():
        match = re.search(r, s)
        if match:
            album_info_str = match.group()
            album_info_str = re.sub(r',[\r\n\s]*lemmaId:', '', album_info_str)
            album_info_str = "{%s}" % album_info_str.replace('albums', '"albums"')
            break
except Exception as e:
    self.logger.error('get album info json error. url: %s, err: %r', response.url, e)
    return

if album_info_str is None:
    message = 'Album not found. person_name: {0}, person_url: {1}'.format(person_info['name'], person_info['url'])
    ei_item = ErrorInfoItem()
    ei_item['time'] = now_string()
    ei_item['url'] = response.url
    ei_item['error_level'] = "E"
    ei_item['error_type'] = "E1"
    ei_item['description'] = message
    yield ei_item
    self.logger.warning('%s. url: %s', message, response.url)
    return

album_info_dic = None
try:
    album_info_dic = json.loads(album_info_str)
    album_info_dic = album_info_dic['albums']
except Exception as e:
    message = 'json.loads album info error. url: {0}, json: {1}, err: {2}'.format(response.url, album_info_str, e)
    ei_item = ErrorInfoItem()
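# Hedged illustration (invented sample payload, not from the source): the regex
# pipeline above expects an inline <script> fragment shaped roughly as below,
# and rewrites its `albums: [...]` portion into parseable JSON.
def _demo_album_info_extraction():
    sample = 'var g = { albums: [{"desc": "gallery", "total": 3}],\n  lemmaId: "123" };'
    match = re.search(r'albums:.*,[\r\n\s]*lemmaId:', sample)
    s = re.sub(r',[\r\n\s]*lemmaId:', '', match.group())
    s = "{%s}" % s.replace('albums', '"albums"')
    return json.loads(s)['albums']  # -> [{"desc": "gallery", "total": 3}]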