    def parse_about(self, response):
        selector = Selector(response)
        item = ZhihuspiderItem()
        # http://www.zhihu.com/people/jiang-xiao-fan-92-14/about
        # Take jiang-xiao-fan-92-14 as the userID
        item['userid'] = response.url.split('/')[-2]
        item['name'] = selector.css('div>.name').xpath('text()').extract()
        item['location'] = selector.css('.location>a').xpath('text()').extract()
        item['business'] = selector.css('.business>a').xpath('text()').extract()
        item['gender'] = selector.css('.gender>i').xpath('@class').extract()
        item['employment'] = selector.css('.employment>a').xpath('text()').extract()
        item['position'] = selector.css('.position>a').xpath('text()').extract()
        item['education'] = selector.css('.education>a').xpath('text()').extract()
        item['major'] = selector.css('.education-extra>a').xpath('text()').extract()

        # extract() returns a list; keep the first match, or '' when nothing matched
        for key, val in item.items():
            if isinstance(val, list):
                item[key] = val[0] if val else ''

        # The icon's class attribute contains 'female' or 'male'; check 'female'
        # first because 'male' is a substring of 'female'.
        if item['gender'].find('female') != -1:
            item['gender'] = u'female'
        elif item['gender'].find('male') != -1:
            item['gender'] = u'male'
        else:
            item['gender'] = u'unknown'

        yield item

        # If the Redis request queue holds more than 2000 entries, pop requests
        # from the tail until an 'about' page is found and yield it directly;
        # non-'about' requests are pushed back onto the head of the queue.
        while self.server.llen(self.queue_key) > 2000:
            encoded_request = self.server.rpop(self.queue_key)
            if not encoded_request:
                break
            request = self.request_from_dict(pickle.loads(encoded_request))
            if 'about' not in request.url:
                self.server.lpush(self.queue_key, encoded_request)
                continue
            yield request
            break
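
The example above fills a ZhihuspiderItem whose definition is not shown on this page. A minimal sketch of an items.py consistent with the fields used (only the field names are taken from the code; everything else is an assumption):

# Hypothetical items.py sketch: field names mirror those set in parse_about.
import scrapy


class ZhihuspiderItem(scrapy.Item):
    userid = scrapy.Field()
    name = scrapy.Field()
    location = scrapy.Field()
    business = scrapy.Field()
    gender = scrapy.Field()
    employment = scrapy.Field()
    position = scrapy.Field()
    education = scrapy.Field()
    major = scrapy.Field()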
Example #2
    def parse_about(self, response):
        selector = Selector(response)

        item = ZhihuspiderItem()

        # http://www.zhihu.com/people/jiang-xiao-fan-92-14/about
        # Take jiang-xiao-fan-92-14 as the userID
        item['userid'] = response.url.split('/')[-2]
        item['name'] = selector.css('div>.name').xpath('text()').extract()
        item['location'] = selector.css('.location>a').xpath('text()').extract()
        item['business'] = selector.css('.business>a').xpath('text()').extract()
        item['gender'] = selector.css('.gender>i').xpath('@class').extract()
        item['employment'] = selector.css('.employment>a').xpath('text()').extract()
        item['position'] = selector.css('.position>a').xpath('text()').extract()
        item['education'] = selector.css('.education>a').xpath('text()').extract()
        item['major'] = selector.css('.education-extra>a').xpath('text()').extract()

        # extract() returns a list; keep the first match, or '' when nothing matched
        for key, val in item.items():
            if isinstance(val, list):
                item[key] = val[0] if val else ''

        # The icon's class attribute contains 'female' or 'male'; check 'female'
        # first because 'male' is a substring of 'female'.
        if item['gender'].find('female') != -1:
            item['gender'] = u'female'
        elif item['gender'].find('male') != -1:
            item['gender'] = u'male'
        else:
            item['gender'] = u'unknown'

        # After parsing, grab the links to the followees and followers lists and issue new requests
        followeesHref = selector.xpath('/html/body/div[3]/div[2]/div[1]/a[1]/@href').extract()[0]
        followersHref = selector.xpath('/html/body/div[3]/div[2]/div[1]/a[2]/@href').extract()[0]
        yield FormRequest(self.host + followeesHref, meta={'cookiejar': 'followeesCookie'}, headers=self.headers,
                          cookies=self.cookies, callback=self.parse_followees)
        yield FormRequest(self.host + followersHref, meta={'cookiejar': 'followersCookie'}, headers=self.headers,
                          cookies=self.cookies, callback=self.parse_followers)

        yield item
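
Both methods also depend on spider attributes defined elsewhere: self.host, self.headers and self.cookies for building requests, plus self.server, self.queue_key and self.request_from_dict (a Redis client, a list name and a request decoder) in example #1. A hedged skeleton of how such a spider might be wired up; the host, headers, cookie dict, queue name and seed profile below are placeholders, not the original project's values:

# Hypothetical spider skeleton around the parse_about methods above.
# All concrete values (headers, cookies, queue name, seed profile) are placeholders.
import pickle                         # used inside parse_about (example #1)

import redis
import scrapy
from scrapy.http import FormRequest
from scrapy.selector import Selector  # used inside parse_about


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    host = 'http://www.zhihu.com'
    headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder request headers
    cookies = {}                              # logged-in session cookies would go here

    def __init__(self, *args, **kwargs):
        super(ZhihuSpider, self).__init__(*args, **kwargs)
        # Redis client and queue name used by the queue-trimming loop in example #1.
        self.server = redis.StrictRedis(host='localhost', port=6379)
        self.queue_key = 'zhihu:requests'     # placeholder queue name

    def start_requests(self):
        # Seed with a single profile's /about page; parse_about yields the rest.
        url = self.host + '/people/jiang-xiao-fan-92-14/about'
        yield FormRequest(url, headers=self.headers, cookies=self.cookies,
                          callback=self.parse_about)

Note that FormRequest is given no formdata in either example, so it behaves like a plain GET scrapy.Request; the subclass is only kept here to match the original code.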