Example #1
0
 def _verify_workers_limits(self, url, avg_links_per_page=10):
     """Ask the limiter model whether ``url`` may be worked on right now.

     The currently active domains are loaded from ``self.db`` and handed,
     together with ``url`` and ``avg_links_per_page``, to
     ``LimiterModel.has_limit_to_work``; whatever that call returns is
     returned unchanged.
     """
     return LimiterModel.has_limit_to_work(
         self.db,
         Domain.get_active_domains(self.db),
         url,
         avg_links_per_page
     )
Example #2
0
    def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
        """Select the next page to review and acquire its job lock.

        For every active domain the best-scored pages are fetched (at most
        one per registered worker). The per-domain lists are then interleaved
        round-robin, so that no single domain monopolises the candidates, and
        the first candidate that passes the limiter check *and* for which a
        lock can be obtained is returned.

        :param db: database session used for all queries.
        :param expiration: accepted for interface compatibility; not used by
            this method.
        :param cache: object providing ``has_next_job_lock(url, expiration)``;
            a ``None`` result is treated as "lock not acquired".
        :param lock_expiration: expiration forwarded to ``has_next_job_lock``.
        :param avg_links_per_page: forwarded to ``Limiter.has_limit_to_work``.
        :returns: ``None`` when there is no candidate page or none could be
            locked, otherwise a dict with the keys ``page`` (uuid as string),
            ``url``, ``score`` and ``lock``.
        """
        from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

        settings = Settings.instance(db)
        number_of_workers = len(db.query(Worker).all())

        active_domains = Domain.get_active_domains(db)
        active_domains_ids = [item.id for item in active_domains]

        # One list of top-scored pages per domain, kept in domain order.
        # An ordered list (rather than indexing ``dict.keys()``) keeps the
        # interleaving deterministic and works on Python 3, where dict views
        # cannot be subscripted.
        pages_by_domain = []
        for domain_id in active_domains_ids:
            pages = db \
                .query(
                    Page.uuid,
                    Page.url,
                    Page.score,
                    Page.last_review_date
                ) \
                .filter(Page.domain_id == domain_id) \
                .order_by(Page.score.desc())[:number_of_workers]
            if pages:
                pages_by_domain.append(pages)

        if not pages_by_domain:
            return None

        # Round-robin interleave: first the best page of every domain, then
        # the second best of every domain that still has one, and so on.
        # Walking by rank avoids skipping a domain when another one runs out
        # of pages in the middle of a round.
        deepest = max(len(pages) for pages in pages_by_domain)
        pages_in_need_of_review = [
            pages[rank]
            for rank in range(deepest)
            for pages in pages_by_domain
            if rank < len(pages)
        ]

        if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score:
            cls.update_pages_score_by(settings, settings.lambda_score, db)

        for candidate in pages_in_need_of_review:
            if not Limiter.has_limit_to_work(db, active_domains, candidate.url, avg_links_per_page):
                continue

            lock = cache.has_next_job_lock(candidate.url, lock_expiration)
            if lock is None:
                # Lock was not obtained for this URL; try the next candidate.
                continue

            return {
                'page': str(candidate.uuid),
                'url': candidate.url,
                'score': candidate.score,
                'lock': lock
            }

        return None