def parse_about(self, response):
    """Parse a user's "about" page into an item, then pull one queued request.

    The populated ``ZhihuspiderItem`` is yielded first. Afterwards, while the
    list stored under ``self.queue_key`` holds more than 2000 entries, entries
    are popped from its right end until one whose URL contains ``'about'`` is
    found; that request is yielded. Entries that do not match are pushed back
    onto the left end of the list. At most one request is yielded per call.

    ``self.server`` is assumed to expose Redis-style ``llen``/``rpop``/``lpush``
    list commands -- TODO confirm against the spider's setup.
    """
    selector = Selector(response)
    item = ZhihuspiderItem()

    # Example URL: http://www.zhihu.com/people/jiang-xiao-fan-92-14/about
    # The second-to-last path segment (jiang-xiao-fan-92-14) is the user id.
    item['userid'] = response.url.split('/')[-2]
    item['name'] = selector.css('div>.name').xpath('text()').extract()
    item['location'] = selector.css('.location>a').xpath('text()').extract()
    item['business'] = selector.css('.business>a').xpath('text()').extract()
    item['gender'] = selector.css('.gender>i').xpath('@class').extract()
    item['employment'] = selector.css('.employment>a').xpath('text()').extract()
    item['position'] = selector.css('.position>a').xpath('text()').extract()
    item['education'] = selector.css('.education>a').xpath('text()').extract()
    item['major'] = selector.css('.education-extra>a').xpath('text()').extract()

    # extract() returns a list: keep its first entry, or '' when nothing
    # matched. Iterate over a snapshot because the item is updated in place.
    for key, val in list(item.items()):
        if isinstance(val, list):
            item[key] = val[0] if val else ''

    # 'female' must be tested first because 'male' is a substring of it.
    gender_class = item['gender']
    if 'female' in gender_class:
        item['gender'] = u'female'
    elif 'male' in gender_class:
        item['gender'] = u'male'
    else:
        item['gender'] = u'unknown'

    yield item

    # Look at each queued entry at most once. Non-matching entries are pushed
    # straight back, so the queue length does not shrink; without this bound
    # the loop would never end when no queued request is an 'about' request.
    scans_left = self.server.llen(self.queue_key)
    while scans_left > 0 and self.server.llen(self.queue_key) > 2000:
        scans_left -= 1
        encoded_request = self.server.rpop(self.queue_key)
        if not encoded_request:
            continue
        # NOTE(review): pickle.loads executes whatever is stored under
        # queue_key; this is only safe if the queue contents are trusted.
        request = self.request_from_dict(pickle.loads(encoded_request))
        if 'about' not in request._get_url():
            # Not an 'about' request: rotate it to the other end of the list.
            self.server.lpush(self.queue_key, encoded_request)
            continue
        yield request
        break
def parse_about(self, response):
    """Parse a user's "about" page and schedule the follow-list requests.

    Yields a ``FormRequest`` for the followees list and one for the followers
    list (each only when its link is present on the page), followed by the
    populated ``ZhihuspiderItem``. A missing link is logged and skipped so
    that the parsed item is still yielded.
    """
    import logging

    selector = Selector(response)
    item = ZhihuspiderItem()

    # Example URL: http://www.zhihu.com/people/jiang-xiao-fan-92-14/about
    # The second-to-last path segment (jiang-xiao-fan-92-14) is the user id.
    item['userid'] = response.url.split('/')[-2]
    item['name'] = selector.css('div>.name').xpath('text()').extract()
    item['location'] = selector.css('.location>a').xpath('text()').extract()
    item['business'] = selector.css('.business>a').xpath('text()').extract()
    item['gender'] = selector.css('.gender>i').xpath('@class').extract()
    item['employment'] = selector.css('.employment>a').xpath('text()').extract()
    item['position'] = selector.css('.position>a').xpath('text()').extract()
    item['education'] = selector.css('.education>a').xpath('text()').extract()
    item['major'] = selector.css('.education-extra>a').xpath('text()').extract()

    # extract() returns a list: keep its first entry, or '' when nothing
    # matched. Iterate over a snapshot because the item is updated in place.
    for key, val in list(item.items()):
        if isinstance(val, list):
            item[key] = val[0] if val else ''

    # 'female' must be tested first because 'male' is a substring of it.
    gender_class = item['gender']
    if 'female' in gender_class:
        item['gender'] = u'female'
    elif 'male' in gender_class:
        item['gender'] = u'male'
    else:
        item['gender'] = u'unknown'

    # After parsing, locate the links to the followees list and the followers
    # list and generate a new request for each of them.
    follow_links = (
        ('/html/body/div[3]/div[2]/div[1]/a[1]/@href',
         'followeesCookie', self.parse_followees),
        ('/html/body/div[3]/div[2]/div[1]/a[2]/@href',
         'followersCookie', self.parse_followers),
    )
    for href_xpath, cookiejar, callback in follow_links:
        hrefs = selector.xpath(href_xpath).extract()
        if not hrefs:
            # The absolute XPath did not match this page; skip the request
            # instead of raising IndexError and dropping the parsed item.
            logging.getLogger(__name__).warning(
                'No link found at %s on %s', href_xpath, response.url)
            continue
        yield FormRequest(self.host + hrefs[0],
                          meta={'cookiejar': cookiejar},
                          headers=self.headers,
                          cookies=self.cookies,
                          callback=callback)

    yield item