Example #1
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched", feed)
        process_feed.apply_async(args=[feed.id])
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be delete", feed)
            feed_cleaner.apply_async(args=[feed.id])
            break  # only one at a time
    # applying clusterizer
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if not UserController().get(id=user_id).effectivly_active:
            continue
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            clusterizer.apply_async(args=[user_id])
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    WORKER.labels(method='scheduler').observe(
        (datetime.now() - start).total_seconds())
    update_slow_metrics.apply_async()
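The scheduler above serializes feed deletion with a Redis SETNX/EXPIRE lock that feed_cleaner (Example #4) releases when it is done. The sketch below isolates that pattern; it assumes redis-py, and the key name, expiry value and run_once helper are illustrative, not JARR's own definitions.

import redis

REDIS_CONN = redis.Redis()
JARR_FEED_DEL_KEY = 'jarr.lock.feed-deletion'  # hypothetical key name
LOCK_EXPIRE = 60 * 10  # assumed 10-minute safety TTL


def run_once(task):
    """Run task() only if no other worker currently holds the lock."""
    if REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):  # atomic "set if absent"
        # give the lock a TTL so a crashed worker cannot hold it forever
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        try:
            task()
        finally:
            REDIS_CONN.delete(JARR_FEED_DEL_KEY)  # release, as feed_cleaner does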
Example #2
 def clusterize_pending_articles(self):
     results = []
     actrl = ArticleController(self.user_id)
     articles = list(actrl.read(cluster_id=None))
     logger.info('got %d articles to clusterize', len(articles))
     WORKER_BATCH.labels(worker_type='clusterizer').observe(len(articles))
     for article in actrl.read(cluster_id=None):
         filter_result = process_filters(article.feed.filters, {
             'tags': article.tags,
             'title': article.title,
             'link': article.link
         })
         result = self.clusterize(article, filter_result).id
         results.append(result)
     return results
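The WORKER_BATCH.labels(worker_type=...).observe(len(articles)) calls suggest a prometheus_client Histogram (or Summary) keyed by a worker_type label. A minimal sketch of such a metric follows; the metric name, help text and buckets are assumptions, not JARR's actual definition.

from prometheus_client import Histogram

WORKER_BATCH = Histogram(
    'worker_batch_size',  # assumed metric name
    'Number of items handled per worker batch',
    ['worker_type'],  # the label set via .labels(worker_type=...)
    buckets=(1, 5, 10, 50, 100, 500),
)

# usage mirroring the examples above
WORKER_BATCH.labels(worker_type='clusterizer').observe(42)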
Example #3
 def clusterize_pending_articles(self):
     results = []
     actrl = ArticleController(self.user_id)
     art_count = actrl.read(cluster_id=None).count()
     logger.info('User(%s) got %d articles to clusterize', self.user_id,
                 art_count)
     WORKER_BATCH.labels(worker_type='clusterizer').observe(art_count)
     clusterizer = Clusterizer(self.user_id)
     for article in actrl.read(cluster_id=None):
         filter_result = process_filters(article.feed.filters, {
             'tags': article.tags,
             'title': article.title,
             'link': article.link
         })
         result = clusterizer.main(article, filter_result).id
         results.append(result)
     return results
Example #4
def feed_cleaner(feed_id):
    logger.warning("Feed cleaner - start => %s", feed_id)
    WORKER_BATCH.labels(worker_type='delete').observe(1)
    fctrl = FeedController()
    result = fctrl.update({'id': feed_id, 'status': FeedStatus.to_delete},
                          {'status': FeedStatus.deleting})
    if not result:
        logger.error('feed %r seems locked, not doing anything', feed_id)
        return
    try:
        logger.warning("Deleting feed %r", feed_id)
        fctrl.delete(feed_id)
    except Exception:
        logger.exception('something went wrong when deleting feed %r',
                         feed_id)
        fctrl.update({'id': feed_id}, {'status': FeedStatus.to_delete})
        raise
    finally:
        REDIS_CONN.delete(JARR_FEED_DEL_KEY)
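The update() call at the top of feed_cleaner doubles as a compare-and-set: it only matches a feed whose status is still to_delete, so a second worker gets an empty result and backs off. A sketch of that guard, assuming SQLAlchemy with a hypothetical Feed model standing in for FeedController:

import enum

from sqlalchemy import Column, Enum, Integer
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class FeedStatus(enum.Enum):
    active = 'active'
    to_delete = 'to_delete'
    deleting = 'deleting'


class Feed(Base):
    __tablename__ = 'feed'
    id = Column(Integer, primary_key=True)
    status = Column(Enum(FeedStatus), default=FeedStatus.active)


def try_lock_for_deletion(session: Session, feed_id: int) -> bool:
    # UPDATE feed SET status='deleting' WHERE id=:id AND status='to_delete';
    # the matched-row count tells us whether we won the race, just like the
    # falsy result checked by feed_cleaner above.
    updated = (session.query(Feed)
               .filter_by(id=feed_id, status=FeedStatus.to_delete)
               .update({'status': FeedStatus.deleting}))
    session.commit()
    return bool(updated)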
Example #5
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    queue = Queues.CRAWLING if conf.crawler.use_queues else Queues.DEFAULT
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched on queue:%r",
                     feed, queue.value)
        process_feed.apply_async(args=[feed.id], queue=queue.value)
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be delete", feed)
            feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    queue = Queues.CLUSTERING if conf.crawler.use_queues else Queues.DEFAULT
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            logger.debug('Scheduling clusterizer for User(%d) on queue:%r',
                         user_id, queue.value)
            clusterizer.apply_async(args=[user_id], queue=queue.value)
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    metrics_users_any.apply_async()
    metrics_users_active.apply_async()
    metrics_users_long_term.apply_async()
    metrics_articles_unclustered.apply_async()
    observe_worker_result_since(start, 'scheduler', 'ok')
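Example #5 differs from Example #1 mainly in routing tasks to dedicated Celery queues when conf.crawler.use_queues is set. A minimal sketch of that routing, assuming Celery; the Queues values and broker URL are illustrative:

from enum import Enum

from celery import Celery

app = Celery('jarr_sketch', broker='redis://localhost:6379/0')  # assumed broker


class Queues(Enum):
    DEFAULT = 'celery'
    CRAWLING = 'jarr-crawling'
    CLUSTERING = 'jarr-clustering'


@app.task
def process_feed(feed_id):
    print('fetching feed', feed_id)


use_queues = True  # stands in for conf.crawler.use_queues
queue = Queues.CRAWLING if use_queues else Queues.DEFAULT
process_feed.apply_async(args=[123], queue=queue.value)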
Example #6
    def _get_cluster_by_similarity(self, article):
        neighbors = list(self.get_neighbors(article))

        min_sample_size = get_tfidf_pref(article.feed, 'min_sample_size')
        if len(neighbors) < min_sample_size:
            logger.info('only %d docs against %d required, no TFIDF for %r',
                        len(neighbors), min_sample_size, article)
            cluster_event(context='tfidf', result='sample size forbidden')
            return None
        logger.info('%r TFIDF is gonna work with a corpus of %d documents',
                    article.feed, len(neighbors))
        WORKER_BATCH.labels(worker_type='tfidf_batch').observe(len(neighbors))

        best_match, score = get_best_match_and_score(article, neighbors)
        TFIDF_SCORE.labels(
            feed_type=article.feed.feed_type.value).observe(score)
        if score > get_tfidf_pref(article.feed, 'min_score'):
            article.cluster_reason = ClusterReason.tf_idf
            article.cluster_score = int(score * 1000)
            article.cluster_tfidf_neighbor_size = len(neighbors)
            article.cluster_tfidf_with = best_match.id
            cluster_event(context='tfidf', result='match', level=logging.INFO)
            return best_match.cluster
        cluster_event(context='tfidf', result='miss')
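get_best_match_and_score is not shown here; a rough, simplified stand-in using scikit-learn's TfidfVectorizer and cosine similarity is sketched below. It works on raw text and returns an index rather than an article object, and JARR's own implementation may tokenize and score differently.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def get_best_match_and_score(article_text, neighbor_texts):
    """Return (index of the closest neighbor, cosine similarity in [0, 1])."""
    corpus = [article_text] + list(neighbor_texts)
    matrix = TfidfVectorizer().fit_transform(corpus)
    # compare the article (row 0) against every neighbor (rows 1..n)
    scores = cosine_similarity(matrix[0], matrix[1:])[0]
    best = scores.argmax()
    return int(best), float(scores[best])


best_idx, score = get_best_match_and_score(
    'python feed reader release',
    ['new release of a feed reader in python', 'a cooking recipe'])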