# These callbacks assume that json, scrapy, datetime (from datetime),
# Selector (from scrapy.selector) and the userItem/postItem item classes are
# imported elsewhere in the spider module.

def parse_mini_card(self, response):
    # The mini vCard is gone: emit whatever user/post data we already have
    # and stop, since the XPaths below would fail on a 404 page.
    if response.status == 404:
        yield response.meta['uItem']
        yield response.meta['item']
        return

    pItem = response.meta['item']
    user = response.xpath('//a[@target="_blank"]/@href').extract()[0]
    uid = user.split('/')[-1]
    pItem['uid'] = uid

    # Skip users whose full profile has already been scheduled.
    if uid in self.users:
        yield pItem
        return
    self.users.add(uid)

    item = userItem()
    item['disPage'] = pItem['disPage']
    item['uid'] = uid
    name = response.xpath(
        '//h5[@class="userInfo__content-name"]/text()').extract()[0]
    item['firstName'], item['lastName'] = self.getName(name)
    item['source'] = response.urljoin(user)

    # Crawl the full profile; carry both items along in the request meta.
    request = scrapy.Request(item['source'],
                             callback=self.parse_profile,
                             dont_filter=True)
    request.meta['item'] = item
    request.meta['pItem'] = pItem
    yield request
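# `getName` is called by several callbacks in this section but defined
# elsewhere in the spider. The sketch below is only an assumption about its
# behaviour (split a display name into first and last name on the first
# whitespace); the real helper may handle more cases.
def getName(self, name):
    parts = name.strip().split(None, 1)
    if len(parts) == 2:
        return parts[0], parts[1]
    # Single-token or empty names: best effort, leave the missing part blank.
    return (parts[0], '') if parts else ('', '')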
def parse_author(self, response, disPage):
    item = userItem()
    item['disPage'] = disPage
    user = response.xpath(
        '//div[@class="author-details"]//a/@href').extract()[0]
    item['uid'] = user.split('/')[-1]
    item['source'] = response.urljoin(user)

    firstName = response.xpath(
        '//span[@class="given-name"]/text()').extract()
    lastName = response.xpath(
        '//span[@class="family-name"]/text()').extract()
    if firstName:
        item['firstName'] = firstName[0]
    if lastName:
        item['lastName'] = lastName[0]

    occupation = response.xpath(
        '//div[@class="agent-details-col"]/div/text()').extract()
    if occupation:
        item['occupation'] = occupation[0].strip()

    # The Rainmaker badge carries the account type and point total.
    rainmaker = response.xpath('//div[@class="agent-mast-img"]/div')
    if rainmaker:
        item['account'], item['points'] = self.getRainmaker(rainmaker)

    # Derive state and city from the trailing path segments of the
    # location links.
    location = response.xpath(
        '//div[@id="find_agents"]/p/a/@href').extract()
    if location:
        item['state'], item['city'] = [i.split('/')[-1] for i in location]
    return item
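# `getRainmaker` is also defined outside this section. Its call sites pass a
# badge <div> selector and unpack an (account, points) pair, so a rough,
# assumption-laden sketch could look like the following; the attribute/text
# layout of the badge is guessed, not taken from the real page markup.
def getRainmaker(self, rainmaker):
    # Assume the badge's class attribute names the account type.
    account = rainmaker.xpath('@class').extract_first(default='').strip()
    # Assume the point total is the numeric part of the badge text.
    text = ''.join(rainmaker.xpath('.//text()').extract())
    digits = ''.join(ch for ch in text if ch.isdigit())
    points = int(digits) if digits else 0
    return account, points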
def parse_user(self, c, disPage):
    item = userItem()
    item['disPage'] = disPage
    user = c.xpath('div[@class="comment-left-section"]/a/@href').extract()
    # Fall back to the display name when the commenter has no profile link.
    if not user:
        user = c.xpath('.//div[@class="comment-author"]/text()').extract()
    item['uid'] = user[0].split('/')[-1]
    item['source'] = 'http://activerain.com' + user[0]

    name = c.xpath('.//div[@class="comment-author"]/text()').extract()[0]
    item['firstName'], item['lastName'] = self.getName(name)

    occupation = c.xpath('.//div[@class="tagline"]/text()').extract()
    if occupation:
        item['occupation'] = occupation[0].strip()

    rainmaker = c.xpath('.//div[@class="comment-header"]/div')
    if rainmaker:
        item['account'], item['points'] = self.getRainmaker(rainmaker)

    # The company line ends in a "<city>, <state>" segment.
    location = c.xpath('.//div[@class="company"]/text()').extract()
    if location:
        loc = location[0].split('-')[-1].strip()
        if ',' in loc:
            item['city'], item['state'] = loc.split(',')
    return item
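# parse_user takes a per-comment Selector rather than a full response, so a
# listing-page callback would typically iterate over the comment containers
# and delegate to it. Hypothetical illustration only; the container XPath and
# the callback name are assumptions, not part of the spider shown here.
def parse_comment_list(self, response, disPage):
    for c in response.xpath('//div[@class="comment"]'):
        yield self.parse_user(c, disPage)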
def parse_comments(self, response):
    blog = response.meta['blog']
    count = blog['count']
    # The comments endpoint returns JSON mapping comment ids to HTML blocks.
    d = json.loads(response.text)
    for cid in d:
        for block in d[cid]:
            count += 1
            c = Selector(text=block)
            item = postItem()
            uItem = userItem()
            item['URL'] = blog['URL']
            item['title'] = blog['title']
            item['disPage'] = blog['blogPage']
            item['pid'] = int(c.xpath(
                '//div[@class="blog-comment-comment "]/@data-id').extract()[0])
            item['replyid'] = count
            t = c.xpath(
                './/meta[@itemprop="datePublished"]/@content').extract()[0]
            item['postTime'] = datetime.strptime(t, '%Y-%m-%dT%H:%M:%S')
            item['replyTo'] = blog['replyid'].get(cid)
            # Ignore deleted posts.
            if not item['replyTo']:
                continue

            body = c.xpath(
                './/div[@class="blog-comment-comment-body"]//text()'
            ).extract()
            item['body'] = ''.join(body).strip()

            uid = c.xpath(
                '//div[@class="blog-comment-comment-details"]/div/@data-id'
            ).extract()[0]
            item['uid'] = uid
            uItem['uid'] = uid
            name = c.xpath(
                '//div[contains(@class, "agent-tag")]/text()').extract()[0]
            uItem['firstName'], uItem['lastName'] = self.getName(name)

            # Fetch the commenter's mini vCard; let 404s through to the
            # callback so partially filled items are still emitted.
            url = 'http://activerain.com/profile/{0}/mini_vcard'.format(uid)
            request = scrapy.Request(url,
                                     callback=self.parse_mini_card,
                                     dont_filter=True)
            request.meta['item'] = item
            request.meta['uItem'] = uItem
            request.meta['handle_httpstatus_list'] = [404]
            yield request
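# parse_mini_card relies on a `self.users` set for de-duplicating profile
# requests, and parse_comments relies on Scrapy's standard per-request
# 'handle_httpstatus_list' meta key so 404 vCard responses still reach the
# callback instead of being dropped by HttpErrorMiddleware. A sketch of the
# assumed spider-level initialisation (not shown in this section):
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # uids whose full profiles have already been scheduled for crawling.
    self.users = set()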