Exemple #1
0
def scheduler():
    """Enqueue the periodic crawler work, then re-schedule itself.

    A single run performs, in order:

    * enqueue ``process_feed`` for each feed returned by
      ``FeedController.list_fetchable(conf.crawler.batch_size)``;
    * if some feeds have the ``FeedStatus.to_delete`` status and the
      deletion lock can be acquired, enqueue ``feed_cleaner`` for a single
      one of them;
    * enqueue ``clusterizer`` for every effectively active user having
      pending articles, unless that user's clusterizer key is still set;
    * re-enqueue ``scheduler`` with ``conf.crawler.idle_delay`` as
      countdown, record the run duration and enqueue
      ``update_slow_metrics``.
    """
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched", feed)
        process_feed.apply_async(args=[feed.id])
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    # SET with NX + EX acquires the lock and arms its expiry atomically.
    # With a separate SETNX then EXPIRE, a crash between the two calls
    # would leave a key that never expires and block deletions for good.
    if feeds_to_delete and REDIS_CONN.set(JARR_FEED_DEL_KEY, 'true',
                                          nx=True, ex=LOCK_EXPIRE):
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        feed = feeds_to_delete[0]  # only one at a time
        logger.debug("%r: scheduling to be delete", feed)
        feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if not UserController().get(id=user_id).effectivly_active:
            continue
        # same atomic acquire-with-expiry as above, one key per user
        if REDIS_CONN.set(JARR_CLUSTERIZER_KEY % user_id, 'true',
                          nx=True, ex=conf.crawler.clusterizer_delay):
            clusterizer.apply_async(args=[user_id])
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    WORKER.labels(method='scheduler').observe(
        (datetime.now() - start).total_seconds())
    update_slow_metrics.apply_async()
Exemple #2
0
    def test_fetchable(self):
        """Exercise late / fetchable feed listing around feed expiry.

        Steps checked, in order:

        * every feed is initially listed as late, with ``last_retrieved``
          and ``expires`` both at the Unix epoch;
        * feeds returned by ``list_fetchable`` carry a ``last_retrieved``
          within the last second and are no longer reported late;
        * updating ``expires`` to the epoch through the controller yields
          an ``expires`` about ``conf.feed.min_expires`` seconds ahead;
        * feeds whose ``expires`` is one second in the past are all late,
          feeds expiring one second in the future are not.
        """
        controller = FeedController()
        feed_total = controller.read().count()
        epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)

        late_seen = 0
        for late_seen, feed in enumerate(controller.list_late(), start=1):
            self.assertEqual(epoch, feed.last_retrieved)
            self.assertEqual(epoch, feed.expires)
        self.assertEqual(feed_total, late_seen)

        listed = controller.list_fetchable()
        after_listing = utc_now()
        one_second = timedelta(seconds=1)
        for feed in listed:
            self.assert_in_range(after_listing - one_second,
                                 feed.last_retrieved, after_listing)
            self.assertEqual(epoch, feed.expires)
        self.assert_late_count(
            0, "no late feed to report because all just fetched")

        controller.update({}, {'expires': epoch})
        after_update = utc_now()
        min_expires = conf.feed.min_expires
        earliest = after_update + timedelta(seconds=min_expires - 1)
        latest = after_update + timedelta(seconds=min_expires + 1)
        for feed in controller.read():  # expires should be corrected
            self.assert_in_range(earliest, feed.expires, latest)

        # last_retrieved is pushed far enough back that only `expires`
        # decides whether a feed is late
        stale_retrieval = timedelta(seconds=conf.feed.min_expires + 10)
        just_expired = utc_now() - timedelta(seconds=1)
        retrieved_long_ago = utc_now() - stale_retrieval
        self.update_all_no_ctrl(expires=just_expired,
                                last_retrieved=retrieved_long_ago)
        self.assert_late_count(feed_total, "all feed just expired")

        about_to_expire = utc_now() + timedelta(seconds=1)
        self.update_all_no_ctrl(expires=about_to_expire)
        self.assert_late_count(
            0, "all feed will expire in a second, none are expired")
Exemple #3
0
    def test_scheduler(self):
        """Check which tasks two consecutive ``scheduler`` runs enqueue.

        The first run must enqueue one ``process_feed`` per feed and
        neither clusterizer nor feed cleaner. Two feeds are then marked
        ``to_delete`` and a third is reset to the epoch and listed as
        fetchable; the second run must add no ``process_feed`` call, still
        no clusterizer call, and exactly one feed cleaner call.
        """
        scheduler()
        UserController().update({}, {'last_connection': utc_now()})
        fctrl = FeedController()
        epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)

        fetch_task = self.process_feed_patch.apply_async
        cluster_task = self.clusteriser_patch.apply_async
        cleaner_task = self.feed_cleaner_patch.apply_async

        # state after the first run
        self.assertEqual(fctrl.read().count(), fetch_task.call_count)
        self.assertEqual(0, cluster_task.call_count)
        self.assertEqual(0, cleaner_task.call_count)

        first, second, third = FeedController().read().limit(3)
        doomed_ids = [first.id, third.id]
        FeedController().update({'id__in': doomed_ids},
                                {'status': 'to_delete'})
        reset_to_epoch = dict.fromkeys(('last_retrieved', 'expires'), epoch)
        FeedController().update({'id': second.id}, reset_to_epoch)
        self.assertEqual(1, len(list(fctrl.list_fetchable())))

        # state after the second run
        scheduler()
        self.assertEqual(fctrl.read().count(), fetch_task.call_count)
        self.assertEqual(0, cluster_task.call_count)
        self.assertEqual(1, cleaner_task.call_count)
Exemple #4
0
def scheduler():
    """Enqueue the periodic crawler work, then re-schedule itself.

    A single run performs, in order:

    * enqueue ``process_feed`` for each feed returned by
      ``FeedController.list_fetchable(conf.crawler.batch_size)``, on the
      crawling queue when ``conf.crawler.use_queues`` is set, otherwise on
      the default queue;
    * if some feeds have the ``FeedStatus.to_delete`` status and the
      deletion lock can be acquired, enqueue ``feed_cleaner`` for a single
      one of them;
    * enqueue ``clusterizer`` for every user having pending articles,
      unless that user's clusterizer key is still set, on the clustering
      queue when ``conf.crawler.use_queues`` is set;
    * re-enqueue ``scheduler`` with ``conf.crawler.idle_delay`` as
      countdown, enqueue the metrics tasks and record the run result.
    """
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    queue = Queues.CRAWLING if conf.crawler.use_queues else Queues.DEFAULT
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched on queue:%r",
                     feed, queue.value)
        process_feed.apply_async(args=[feed.id], queue=queue.value)
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    # SET with NX + EX acquires the lock and arms its expiry atomically.
    # With a separate SETNX then EXPIRE, a crash between the two calls
    # would leave a key that never expires and block deletions for good.
    if feeds_to_delete and REDIS_CONN.set(JARR_FEED_DEL_KEY, 'true',
                                          nx=True, ex=LOCK_EXPIRE):
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        # Only one deletion per acquired lock, as the log line announces;
        # scheduling every pending feed here would defeat the single lock.
        feed = feeds_to_delete[0]
        logger.debug("%r: scheduling to be delete", feed)
        feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    queue = Queues.CLUSTERING if conf.crawler.use_queues else Queues.DEFAULT
    for user_id in ArticleController.get_user_id_with_pending_articles():
        # same atomic acquire-with-expiry as above, one key per user
        if REDIS_CONN.set(JARR_CLUSTERIZER_KEY % user_id, 'true',
                          nx=True, ex=conf.crawler.clusterizer_delay):
            logger.debug('Scheduling clusterizer for User(%d) on queue:%r',
                         user_id, queue.value)
            clusterizer.apply_async(args=[user_id], queue=queue.value)
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    metrics_users_any.apply_async()
    metrics_users_active.apply_async()
    metrics_users_long_term.apply_async()
    metrics_articles_unclustered.apply_async()
    observe_worker_result_since(start, 'scheduler', 'ok')
Exemple #5
0
 def assert_late_count(self, count, msg):
     """Assert that both late and fetchable listings hold ``count`` feeds.

     ``msg`` is forwarded to each assertion as its failure message.
     """
     controller = FeedController()
     late_feeds = list(controller.list_late())
     self.assertEqual(count, len(late_feeds), msg)
     fetchable_feeds = controller.list_fetchable()
     self.assertEqual(count, len(fetchable_feeds), msg)