Beispiel #1
0
def _set_last_ts(self, date):
    """Persist the "last seen" timestamp to ``self.last_seen_filename``.

    ``date`` may be a ``datetime`` instance, in which case it is converted
    with ``utils.convert_date_to_str``, or an already formatted string,
    which is stored as given. Falsy values (``None``, empty string) are
    ignored and leave any existing file untouched. Otherwise the file is
    overwritten with the new value.
    """
    date_str = (utils.convert_date_to_str(date)
                if isinstance(date, datetime) else date)
    if not date_str:
        return
    # The file is opened in binary mode, so text has to be encoded first:
    # writing a text string to a 'wb' handle raises TypeError on Python 3.
    # Byte strings (including Python 2 ``str``) are written unchanged.
    if not isinstance(date_str, bytes):
        date_str = date_str.encode('utf-8')
    with open(self.last_seen_filename, 'wb') as f:
        f.write(date_str)
Beispiel #2
0
 def test_last_ts_prop(self):
     # The last_ts property must start out unset, accept either a datetime
     # or its string form, and always read back as a datetime.
     spider = self.spider
     # a freshly created spider has no timestamp yet
     self.assertIsNone(spider.last_ts)
     now = datetime.datetime.utcnow()
     accepted_values = (now, utils.convert_date_to_str(now))
     # whichever form is assigned, reading the property yields a datetime
     for value in accepted_values:
         spider.last_ts = value
         self.assertEqual(datetime.datetime, type(spider.last_ts))
     # once assigned, the value is readable and the backing file exists
     self.assertIsNotNone(spider.last_ts)
     self.assertTrue(os.path.exists(spider.last_seen_filename))
Beispiel #3
0
 def _process_comments(response):
     """Yield one PostItem per comment found in a JSON API response.

     The response body is expected to hold the comments under
     ``["response"]["comments"]``. The first element of that list is
     skipped (presumably a count/header entry -- confirm against the API).
     """
     payload = json.loads(response.body)
     comments = payload["response"]["comments"][1:]
     for comment in comments:
         entry = postscraper.items.PostItem()
         posted_at = datetime.fromtimestamp(comment['date'])
         entry['date'] = utils.convert_date_to_str(posted_at)
         entry['text'] = comment['text']
         entry['title'] = "Board post from %s" % entry['date']
         # the link addresses the post as "<owner>_<post id>"
         group_id = abs(self.owner_id)
         wall_ref = "%s_%s" % (abs(self.owner_id), comment['id'])
         entry['link'] = ("http://vk.com/public%(group)s?w=wall-%(id)s" %
                          {'group': group_id, 'id': wall_ref})
         # positive ids get an "id" profile path, other ids a "club" path
         if comment['from_id'] > 0:
             profile = 'id%s' % comment['from_id']
         else:
             profile = 'club%s' % abs(comment['from_id'])
         entry['author'] = "http://vk.com/" + profile
         yield entry
Beispiel #4
0
 def close_spider(self, spider):
     """Prepare the HTML email text for the items matching the query.

     The resulting text is stored on ``spider.email``; when no items pass
     the query filter, ``spider.email`` is set to ``None`` instead.
     """
     matched = self._filter_by_query(spider)
     # nothing to report: do not build an email at all
     if len(matched) == 0:
         spider.email = None
         return
     jinja_env = Environment(loader=FileSystemLoader(settings.TEMPLATES_DIR))
     rendered = jinja_env.get_template('mail_items.html').render(
         items=matched, query=settings.QUERY)
     # describe the period covered by this report
     if spider.last_ts:
         since = utils.convert_date_to_str(spider.last_ts)
     else:
         since = "the very beginning"
     layout = ("<h1>%(count)s new items from %(link)s since %(date)s</h1>\n"
               "%(body)s")
     spider.email = layout % {'count': len(matched), 'link': spider.name,
                              'date': since, 'body': rendered}
Beispiel #5
0
def _parse_vk_wall(self, response):
    """Yield a PostItem for each wall post in a VK API JSON response.

    Raises ``exc.SpiderException`` when the HTTP status is not 200 or when
    the decoded payload carries an ``"error"`` entry. Posts with an empty
    text are treated as reposts and built from their link attachment;
    reposts lacking the expected attachment fields are skipped.
    """
    if response.status != 200:
        LOG.info("200 OK expected, got %s" % response.status)
        message = "Response code not supported: %s" % response.status
        raise exc.SpiderException(message)
    payload = json.loads(response.body)
    # FIXME code duplication
    if "error" in payload:
        details = {"reason": payload["error"]["error_msg"],
                   "name": self.name}
        raise exc.SpiderException(
            "%(name)s spider failed: %(reason)s" % details)
    # the first element of the list is not a post and is skipped
    for post in payload["response"][1:]:
        entry = postscraper.items.PostItem()
        if post['text'] != '':
            # a native post: link straight to it on the wall
            entry['text'] = post['text']
            wall_ref = "%s_%s" % (abs(self.owner_id), post['id'])
            entry['link'] = ("http://vk.com/public%(group)s?w=wall-%(id)s" %
                             {'group': abs(self.owner_id), 'id': wall_ref})
        else:
            # a repost of some kind: describe it through the attached link
            try:
                shared = post['attachment']['link']
                entry['text'] = ("%(title)s\n%(description)s" % {
                    'description': shared['description'],
                    'title': shared['title']})
                entry['link'] = shared['url']
            except (KeyError, ValueError):
                continue
        posted_at = datetime.fromtimestamp(post['date'])
        entry['date'] = utils.convert_date_to_str(posted_at)
        entry['title'] = "Wall post from %s" % entry['date']
        # positive ids get an "id" profile path, other ids a "club" path
        if post['from_id'] > 0:
            profile = 'id%s' % post['from_id']
        else:
            profile = 'club%s' % abs(post['from_id'])
        entry['author'] = "http://vk.com/" + profile
        yield entry