def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched", feed)
        process_feed.apply_async(args=[feed.id])
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be deleted", feed)
            feed_cleaner.apply_async(args=[feed.id])
            break  # only one at a time
    # applying clusterizer
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if not UserController().get(id=user_id).effectivly_active:
            continue
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            clusterizer.apply_async(args=[user_id])
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    WORKER.labels(method='scheduler').observe(
        (datetime.now() - start).total_seconds())
    update_slow_metrics.apply_async()
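# The deletion and clusterizer branches above both rely on a SETNX + EXPIRE
# lock so that only one job per key is enqueued at a time. Below is a
# minimal, self-contained sketch of that pattern with redis-py; the key name,
# expiry value and the local Redis instance are assumptions for illustration,
# not JARR's actual REDIS_CONN / JARR_FEED_DEL_KEY objects.
import redis

LOCK_KEY = 'feed-deletion-lock'  # stands in for JARR_FEED_DEL_KEY
LOCK_EXPIRE = 600                # assumed value, in seconds

conn = redis.Redis()

def try_acquire_deletion_lock():
    # SETNX succeeds only for the first caller; EXPIRE makes the lock
    # self-releasing, so a crashed worker cannot hold it forever.
    if conn.setnx(LOCK_KEY, 'true'):
        conn.expire(LOCK_KEY, LOCK_EXPIRE)
        return True
    return False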
def clusterize_pending_articles(self):
    results = []
    actrl = ArticleController(self.user_id)
    articles = list(actrl.read(cluster_id=None))
    logger.info('got %d articles to clusterize', len(articles))
    WORKER_BATCH.labels(worker_type='clusterizer').observe(len(articles))
    for article in actrl.read(cluster_id=None):
        filter_result = process_filters(article.feed.filters,
                                        {'tags': article.tags,
                                         'title': article.title,
                                         'link': article.link})
        result = self.clusterize(article, filter_result).id
        results.append(result)
    return results
def clusterize_pending_articles(self):
    results = []
    actrl = ArticleController(self.user_id)
    art_count = actrl.read(cluster_id=None).count()
    logger.info('User(%s) got %d articles to clusterize',
                self.user_id, art_count)
    WORKER_BATCH.labels(worker_type='clusterizer').observe(art_count)
    clusterizer = Clusterizer(self.user_id)
    for article in actrl.read(cluster_id=None):
        filter_result = process_filters(article.feed.filters,
                                        {'tags': article.tags,
                                         'title': article.title,
                                         'link': article.link})
        result = clusterizer.main(article, filter_result).id
        results.append(result)
    return results
def feed_cleaner(feed_id):
    logger.warning("Feed cleaner - start => %s", feed_id)
    WORKER_BATCH.labels(worker_type='delete').observe(1)
    fctrl = FeedController()
    result = fctrl.update({'id': feed_id, 'status': FeedStatus.to_delete},
                          {'status': FeedStatus.deleting})
    if not result:
        logger.error('feed %r seems locked, not doing anything', feed_id)
        return
    try:
        logger.warning("Deleting feed %r", feed_id)
        fctrl.delete(feed_id)
    except Exception:
        logger.exception('something went wrong when deleting feed %r',
                         feed_id)
        fctrl.update({'id': feed_id}, {'status': FeedStatus.to_delete})
        raise
    finally:
        REDIS_CONN.delete(JARR_FEED_DEL_KEY)
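# feed_cleaner uses the update's result as an optimistic lock: the status can
# only move from to_delete to deleting once, so a falsy result means another
# worker already claimed the feed. A minimal sketch of that "status transition
# as a lock" pattern in plain SQL follows; the table and column names are
# assumptions for illustration, not JARR's schema.
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE feed (id INTEGER PRIMARY KEY, status TEXT)")
conn.execute("INSERT INTO feed VALUES (1, 'to_delete')")

cur = conn.execute(
    "UPDATE feed SET status='deleting' WHERE id=? AND status='to_delete'",
    (1,))
if cur.rowcount:
    print('lock acquired, safe to delete feed 1')
else:
    print('feed 1 seems locked, not doing anything')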
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    queue = Queues.CRAWLING if conf.crawler.use_queues else Queues.DEFAULT
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched on queue:%r",
                     feed, queue.value)
        process_feed.apply_async(args=[feed.id], queue=queue.value)
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be deleted", feed)
            feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    queue = Queues.CLUSTERING if conf.crawler.use_queues else Queues.DEFAULT
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            logger.debug('Scheduling clusterizer for User(%d) on queue:%r',
                         user_id, queue.value)
            clusterizer.apply_async(args=[user_id], queue=queue.value)
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    metrics_users_any.apply_async()
    metrics_users_active.apply_async()
    metrics_users_long_term.apply_async()
    metrics_articles_unclustered.apply_async()
    observe_worker_result_since(start, 'scheduler', 'ok')
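# This scheduler variant routes work to named Celery queues and re-enqueues
# itself with a countdown. A minimal, self-contained sketch of those two
# apply_async idioms follows; the app name, broker URL, queue name, delay and
# task body are assumptions for illustration, not JARR's configuration.
from celery import Celery

app = Celery('sketch', broker='redis://localhost:6379/0')

@app.task
def process_feed(feed_id):
    print('fetching', feed_id)

# route the task to a dedicated queue, as in queue=queue.value above
process_feed.apply_async(args=[42], queue='crawling')
# enqueue again after an idle delay, as scheduler does for itself
process_feed.apply_async(args=[42], countdown=60)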
def _get_cluster_by_similarity(self, article):
    neighbors = list(self.get_neighbors(article))
    min_sample_size = get_tfidf_pref(article.feed, 'min_sample_size')
    if len(neighbors) < min_sample_size:
        logger.info('only %d docs against %d required, no TFIDF for %r',
                    len(neighbors), min_sample_size, article)
        cluster_event(context='tfidf', result='sample size forbid')
        return None
    logger.info('%r TFIDF is gonna work with a corpus of %d documents',
                article.feed, len(neighbors))
    WORKER_BATCH.labels(worker_type='tfidf_batch').observe(len(neighbors))
    best_match, score = get_best_match_and_score(article, neighbors)
    TFIDF_SCORE.labels(
        feed_type=article.feed.feed_type.value).observe(score)
    if score > get_tfidf_pref(article.feed, 'min_score'):
        article.cluster_reason = ClusterReason.tf_idf
        article.cluster_score = int(score * 1000)
        article.cluster_tfidf_neighbor_size = len(neighbors)
        article.cluster_tfidf_with = best_match.id
        cluster_event(context='tfidf', result='match', level=logging.INFO)
        return best_match.cluster
    cluster_event(context='tfidf', result='miss')
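# get_best_match_and_score is not shown in this section. As an illustrative
# sketch only (not necessarily what JARR does internally), one common way to
# get a best match and similarity score is TF-IDF plus cosine similarity with
# scikit-learn; function and variable names here are hypothetical.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def best_match_and_score(article_text, neighbor_texts):
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform([article_text] + neighbor_texts)
    # similarity of the article (row 0) against every neighbor document
    scores = cosine_similarity(matrix[0:1], matrix[1:]).ravel()
    best_index = scores.argmax()
    return best_index, float(scores[best_index])

index, score = best_match_and_score(
    'python celery scheduler', ['celery task queue', 'gardening tips'])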