def _start_requests_vk(self):
    """Yield the initial VK API requests: one for the group wall plus one
    per discussion board listed in ``self.boards``.

    The wall request is handled by :meth:`parse_wall`.  Each board request
    asks for a single comment (``count=1``) just to learn the topic's total
    comment count, is handled by :meth:`parse_board`, and carries its
    ``topic_id`` in ``request.meta`` so the callback knows which board the
    response belongs to.
    """
    wall_url = utils.build_url(
        utils.API_URL_WALL,
        count=self.count,
        owner_id=self.owner_id,
        offset=self.offset,
        version=utils.API_VERSION,
        format='json',
        access_token=self.access_token)
    yield scrapy.Request(wall_url, dont_filter=True,
                         callback=self.parse_wall)

    # Pair every board URL with its topic_id directly instead of
    # recovering it from a loop index (the previous ``self.boards[i-1]``
    # arithmetic depended on the wall entry occupying slot 0 and also
    # shadowed the builtin ``type``).
    for topic_id in self.boards:
        board_url = utils.build_url(
            utils.API_URL_BOARD,
            count=1,
            offset=0,
            topic_id=topic_id,
            group_id=abs(self.owner_id),
            version=utils.API_VERSION,
            format='json',
            access_token=self.access_token)
        yield scrapy.Request(board_url, dont_filter=True,
                             callback=self.parse_board,
                             meta={'topic_id': topic_id})
def _parse_vk_board(self, response):
    """Deals with board comments' json data received from VK API.

    The initial board response is only used to read the topic's total
    comment count; a follow-up request is then issued for the last (at
    most) 100 comments, which the nested ``_process_comments`` callback
    converts into ``PostItem`` instances.

    :param response: response for the board-topic count request;
        ``response.meta['topic_id']`` identifies the board topic.
    :raises exc.SpiderException: on a non-200 HTTP status or when the VK
        API payload contains an ``error`` object.
    """
    if response.status != 200:
        LOG.info("200 OK expected, got %s" % response.status)
        raise exc.SpiderException("Response code not supported: %s"
                                  % response.status)
    data = json.loads(response.body)
    # FIXME code duplication
    if "error" in data:
        raise exc.SpiderException(
            "%(name)s spider failed: %(reason)s"
            % {"reason": data["error"]["error_msg"], "name": self.name})
    # VK returns the total comment count as the first element of the
    # "comments" array; actual comment objects follow it.
    count = data["response"]["comments"][0]
    topic_id = response.meta['topic_id']

    def _process_comments(response):
        """Turn each comment of the follow-up response into a PostItem."""
        data = json.loads(response.body)
        posts_data = data["response"]["comments"][1:]
        for post in posts_data:
            item = postscraper.items.PostItem()
            item['date'] = utils.convert_date_to_str(
                datetime.fromtimestamp(post['date']))
            item['text'] = post['text']
            item['title'] = ("Board post from %s" % item['date'])
            item['link'] = ("http://vk.com/public%(group)s?w=wall-%(id)s"
                            % {'group': abs(self.owner_id),
                               'id': "%s_%s" % (abs(self.owner_id),
                                                post['id'])})
            # Positive from_id means a user profile; negative means a
            # community, linked via its club page.
            item['author'] = ("http://vk.com/" +
                              ('id%s' % post['from_id']
                               if post['from_id'] > 0
                               else 'club%s' % abs(post['from_id'])))
            yield item

    # FIXME last 100 comments per request is VK API limitation
    # BUG FIX: the keyword here was ``api_version``, inconsistent with
    # every other build_url() call in this spider, which passes the API
    # version as ``version`` — the parameter would have been sent under
    # the wrong name.
    fetch_last_100 = utils.build_url(
        utils.API_URL_BOARD,
        count=100,
        offset=max(count - 100, 0),
        topic_id=topic_id,
        group_id=abs(self.owner_id),
        version=utils.API_VERSION,
        format='json',
        access_token=self.access_token)
    yield scrapy.Request(fetch_last_100, callback=_process_comments)