Example #1
def _start_requests_vk(self):
    """Builds the initial VK API requests: one for the group wall and
    one per discussion board topic."""
    scrape_wall_url = utils.build_url(utils.API_URL_WALL,
                                      count=self.count,
                                      owner_id=self.owner_id,
                                      offset=self.offset,
                                      version=utils.API_VERSION,
                                      format='json',
                                      access_token=self.access_token)
    yield scrapy.Request(scrape_wall_url, dont_filter=True,
                         callback=self.parse_wall)
    for topic_id in self.boards:
        scrape_board_url = utils.build_url(utils.API_URL_BOARD,
                                           count=1,
                                           offset=0,
                                           topic_id=topic_id,
                                           group_id=abs(self.owner_id),
                                           version=utils.API_VERSION,
                                           format='json',
                                           access_token=self.access_token)
        # carry the topic id in meta so parse_board knows which board
        # topic this response belongs to
        yield scrapy.Request(scrape_board_url, dont_filter=True,
                             callback=self.parse_board,
                             meta={'topic_id': topic_id})
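
Both examples call a utils.build_url helper that is not shown on this page. A minimal sketch of what it might look like, assuming it simply appends its keyword arguments to the endpoint as URL-encoded query parameters; the endpoint constants and version string below are placeholders, not values taken from the original project:

from urllib.parse import urlencode

# Hypothetical stand-ins for the constants the examples read from utils;
# the real module presumably points at VK's wall.get and board.getComments.
API_URL_WALL = 'https://api.vk.com/method/wall.get?'
API_URL_BOARD = 'https://api.vk.com/method/board.getComments?'
API_VERSION = '5.131'  # assumed version string


def build_url(base_url, **params):
    """Append keyword arguments to base_url as URL-encoded query parameters."""
    return base_url + urlencode(params)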
Example #2
def _parse_vk_board(self, response):
    """Deals with board comments' json data received from VK API"""
    if response.status != 200:
        LOG.error("200 OK expected, got %s", response.status)
        raise exc.SpiderException("Response code not supported: %s" %
                                  response.status)
    data = json.loads(response.body)
    # FIXME code duplication
    if "error" in data:
        raise exc.SpiderException("%(name)s spider failed: %(reason)s" %
                                  {"reason": data["error"]["error_msg"],
                                   "name": self.name})
    # in the old-style VK API response the first list element is the
    # total number of comments in the topic
    count = data["response"]["comments"][0]
    topic_id = response.meta['topic_id']

    def _process_comments(response):
        data = json.loads(response.body)
        # skip the leading count element; the rest are comment dicts
        posts_data = data["response"]["comments"][1:]
        for post in posts_data:
            item = postscraper.items.PostItem()
            item['date'] = utils.convert_date_to_str(
                datetime.fromtimestamp(post['date']))
            item['text'] = post['text']
            item['title'] = ("Board post from %s" % item['date'])
            # note: this reuses the wall-post URL format; VK board topics
            # normally live at vk.com/topic-<group_id>_<topic_id>
            item['link'] = ("http://vk.com/public%(group)s?w=wall-%(id)s" %
                            {'group': abs(self.owner_id),
                             'id': "%s_%s" % (abs(self.owner_id), post['id'])})
            # a positive from_id is a user, a negative one is a community
            item['author'] = ("http://vk.com/" +
                              ('id%s' % post['from_id']
                               if post['from_id'] > 0
                               else 'club%s' % abs(post['from_id'])))
            yield item

    # FIXME no pagination: VK caps board.getComments at 100 comments per
    # request, so only the newest 100 are fetched
    fetch_last_100 = utils.build_url(utils.API_URL_BOARD,
                                     count=100,
                                     offset=max(count - 100, 0),
                                     topic_id=topic_id,
                                     group_id=abs(self.owner_id),
                                     version=utils.API_VERSION,
                                     format='json',
                                     access_token=self.access_token)
    yield scrapy.Request(fetch_last_100, callback=_process_comments)
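
Note the two-step pattern: _start_requests_vk asks for a single comment (count=1) just to learn each topic's total comment count, and _parse_vk_board then schedules a second request with offset=max(count - 100, 0) so that only the newest comments are fetched within the 100-per-request cap. The parser also depends on utils.convert_date_to_str, which is not shown; a plausible sketch, assuming the project just wants a human-readable timestamp (the exact format string is an assumption):

from datetime import datetime


def convert_date_to_str(date):
    """Format a datetime as a readable string; the real project's format
    is unknown, so this format string is a placeholder."""
    return date.strftime('%a, %d %b %Y %H:%M:%S')

For example, convert_date_to_str(datetime.fromtimestamp(post['date'])) turns a VK Unix timestamp into something like 'Sat, 01 Jan 2022 12:00:00', depending on the local timezone.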