예제 #1
0
 def spider_error(self, failure, response, spider):
     """Scrapy ``spider_error`` signal handler.

     Logs the failure on the spider's logger and returns an
     ErrorInfoItem (level "E", type "E1000") describing it, so the
     pipeline can record the error.
     """
     err_text = 'Error: {0}'.format(failure.getErrorMessage())
     spider.logger.error(err_text)
     item = ErrorInfoItem()
     item['time'] = now_string()
     item['url'] = response.url
     item['error_level'] = "E"
     item['error_type'] = "E1000"
     item['description'] = err_text
     return item
예제 #2
0
 def spider_closed(self, spider):
     """Scrapy ``spider_closed`` signal handler.

     Logs an end-of-crawl message and returns an informational
     ErrorInfoItem (level "I", type "I2") for the pipeline.
     """
     stamp = now_string()
     note = 'bdbk spider end at: {0}'.format(stamp)
     spider.logger.info(note)
     info = ErrorInfoItem()
     info['time'] = stamp
     info['url'] = spider.start_page
     info['error_level'] = "I"
     info['error_type'] = "I2"
     info['description'] = note
     return info
예제 #3
0
    def parse_person(self, response):
        url = response.url.split('?')[0]
        if self.check_visited(url):
          return

        kwlist = response.xpath('//meta[@name="keywords"]/@content').extract()
        if len(kwlist) == 0:
          self.redis_client_person.set(url, 1)
          return

        keywords = kwlist[0].encode('utf-8', 'ignore')
        '''
        # the 'keywords' meta must contains '人物'
        if keywords.find('人物') == -1:
            self.redis_client_person.set(url, 1)
            return
        '''

        description = response.xpath('//meta[@name="description"]/@content').extract()[0].encode('utf-8', 'ignore')
        page_title = response.xpath('//h1/text()').extract()[0].encode('utf-8', 'ignore')

        # get person tags (人物标签)
        person_tags = list()
        categories = dict()
        is_person = False
        for sel in response.xpath('//span[@class="taglist"]'):
            tag = sel.xpath('text()').extract()[0]
            tag = re.sub(r'[\r\n]*', '', tag).encode('utf-8', 'ignore')
            if len(tag) == 0:
                continue
            if tag in self.ignore_tags:
                message = 'In ignore list. name: {0}, tag: {1}'.format(page_title, tag)
                ei_item = ErrorInfoItem()
                ei_item['time'] = now_string()
                ei_item['url'] = url
                ei_item['error_level'] = "W"
                ei_item['error_type'] = 'W1'
                ei_item['description'] = message
                yield ei_item
                self.logger.warning(message)
                self.redis_client_person.set(url, 1)
                return
            if tag.find('人物') != -1:
                is_person = True
            person_tags.append(tag)
            # save to redis
            category_cnt = self.redis_client.get(tag)
            if  str(category_cnt) == 'None':
                category_cnt = 1
            else:
                category_cnt = int(category_cnt) + 1
            self.redis_client.set(tag, category_cnt)

            categories[tag] = category_cnt

        # if tags do not contains |人物|, just follow link
        if is_person == False and self.follow_link == True:
            self.redis_client_person.set(url, 1)
            # follow link that which url contains |view|(view/subview)
            for sel in response.xpath('//a[contains(@href, "view")]'):
                url = response.urljoin(sel.xpath('@href').extract()[0].split('?')[0])
                if self.check_visited(url):
                    return
                request = scrapy.Request(url, callback = self.parse_person)
                yield request
            return

        person_item = PersonItem()
        person_item['name'] = page_title
        person_item['url'] = url
        person_item['description'] = description
        person_item['tags'] = person_tags
        person_item['keywords'] = keywords 

        summary_pic = response.xpath('//div[@class="summary-pic"]/a/img/@src').extract()
        if len(summary_pic) > 0:
            summary_pic = summary_pic[0].split('/')[-1].split('.')[0]
        else:
            summary_pic = ''
        person_item['summary_pic'] = summary_pic

        # for the data pipeline
        yield person_item
        yield categories

        # crawling image gallery (图册)
        # album list
        album_list = response.xpath('//script/text()').re(r'AlbumList\({.*[\n\t]*.*[\n\t]*.*[\n\t]*.*')
        albums = list()
        if len(album_list) > 0:
            album_list = album_list[0]
            album_list = re.sub(r'[\r\n\t]*', '', album_list)
            album_lemma_id = re.findall(r'lemmaId:"([\d]+)"', album_list)[0]
            album_sublemma_id = re.findall(r'subLemmaId:"([\d]+)"', album_list)[0]
            album_data_json = re.sub(r'AlbumList.*data:', '', album_list)
            try:
                album_data_dict = json.loads(album_data_json)
                i = 0
                for d in album_data_dict:
                    if isinstance(album_data_dict, list):
                        cover_pic = d["coverpic"]
                        album_desc= d["desc"]
                        album_total= d["total"]
                        album_url = '/picture/{0}/{1}/{2}/{3}'.format(album_lemma_id, album_sublemma_id, i, cover_pic)
                        i += 1
                    else:
                        cover_pic = album_data_dict[d]["coverpic"]
                        album_desc= album_data_dict[d]["desc"]
                        album_total= album_data_dict[d]["total"]
                        album_url = '/picture/{0}/{1}/{2}/{3}'.format(album_lemma_id, album_sublemma_id, d, cover_pic)
                    album_url = response.urljoin(album_url)

                    # build album_item
                    album_item = AlbumItem()
                    album_item['url'] = album_url
                    album_item['description'] = album_desc.encode('utf8', 'ignore')
                    album_item['total'] = album_total
                    album_item['cover_pic'] = cover_pic
                    album_item['person_name'] = person_item['name']
                    album_item['person_url'] = person_item['url']
                    albums.append(album_item)
            except Exception, e:
                self.logger.error('json parse album list info error. url: %s, err: %r', response.url, e)
예제 #4
0
        try:
            r = re.compile('albums:.*,[\r\n\s]*lemmaId:')
            for s in response.xpath('//script/text()').extract():
                match = re.search(r, s)
                if match:
                    album_info_str =  match.group()
                    album_info_str = re.sub(r',[\r\n\s]*lemmaId:', '', album_info_str)
                    album_info_str = "{%s}" % album_info_str.replace('albums', '"albums"')
                    break
        except Exception, e:
            self.logger.error('get album info json error. url: %s, err: %r', response.url, e)
            return
        if album_info_str == None:
            message = 'Album not found. person_name: {0}, person_url: {1}'.format(person_info['name'], person_info['url'])
            ei_item = ErrorInfoItem()
            ei_item['time'] = now_string()
            ei_item['url'] = response.url
            ei_item['error_level'] = "E"
            ei_item['error_type'] = "E1"
            ei_item['description'] = message
            yield ei_item
            self.logger.warning('{%s}. url: %s', message, response.url)
            return

        album_info_dic = None
        try:
            album_info_dic = json.loads(album_info_str)
            album_info_dic = album_info_dic['albums']
        except Exception, e:
            message = 'json.loads album info error. url: {0}, json: {1}, err: {2}'.format(response.url, album_info_str, e)
            ei_item = ErrorInfoItem()