Example #1
    def test_can_get_active_domains(self):
        self.db.query(Domain).delete()

        domain = DomainFactory(is_active=True)
        DomainFactory(is_active=False)

        domains = Domain.get_active_domains(self.db)

        expect(domains).to_length(1)
        expect(domains[0].id).to_equal(domain.id)
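The `get_active_domains` classmethod exercised by this test is not shown on this page. A minimal sketch consistent with the test, assuming a SQLAlchemy declarative `Domain` model with a boolean `is_active` column (the real model may differ):

    # Hypothetical sketch inferred from the test above; not the
    # project's actual Domain model.
    import sqlalchemy as sa
    from sqlalchemy.ext.declarative import declarative_base

    Base = declarative_base()

    class Domain(Base):
        __tablename__ = 'domains'

        id = sa.Column(sa.Integer, primary_key=True)
        is_active = sa.Column(sa.Boolean, default=True, nullable=False)

        @classmethod
        def get_active_domains(cls, db):
            # db is a SQLAlchemy session; only rows flagged as active
            # are returned, matching what the test expects.
            return db.query(cls).filter(cls.is_active == True).all()  # noqa: E712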
Example #2
    def get_next_jobs_count(cls, db, config):
        from holmes.models import Domain

        active_domains = Domain.get_active_domains(db)
        active_domains_ids = [item.id for item in active_domains]

        return db \
                .query(
                    sa.func.count(Page.id)
                ) \
                .filter(Page.domain_id.in_(active_domains_ids)) \
                .scalar()
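`scalar()` returns the single value of the one-row, one-column result, so callers get a plain integer; note that `config` is accepted but unused in the body shown. A hypothetical call site, assuming this is a classmethod on `Page`:

    # Hypothetical usage; db is a SQLAlchemy session and config is the
    # application config object (unused by the body shown above).
    pending = Page.get_next_jobs_count(db, config)
    logging.info('%d pages in active domains.' % pending)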
Example #3
    def get_next_job_list(cls, db, expiration, current_page=1, page_size=200):
        from holmes.models import Domain

        lower_bound = (current_page - 1) * page_size
        upper_bound = lower_bound + page_size

        active_domains = Domain.get_active_domains(db)
        active_domains_ids = [item.id for item in active_domains]

        pages_query = db \
            .query(
                Page.uuid,
                Page.url,
                Page.score,
                Page.last_review_date
            ) \
            .filter(Page.domain_id.in_(active_domains_ids)) \
            .order_by(Page.score.desc())

        return pages_query[lower_bound:upper_bound]
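Slicing the `Query` on the last line runs it with `LIMIT`/`OFFSET` applied at the SQL level, so only one page of rows is fetched. An equivalent explicit spelling:

    # Equivalent to pages_query[lower_bound:upper_bound]: SQLAlchemy
    # translates Query slicing into OFFSET/LIMIT before executing.
    return pages_query.offset(lower_bound).limit(page_size).all()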
Example #4
    def fill_job_bucket(self,
                        expiration,
                        look_ahead_pages=1000,
                        avg_links_per_page=10.0):
        try:
            with Lock('next-job-fill-bucket-lock', redis=self.redis):
                logging.info('Refilling job bucket. Lock acquired...')
                expired_time = datetime.utcnow() - timedelta(
                    seconds=expiration)

                active_domains = Domain.get_active_domains(self.db)

                if not active_domains:
                    return

                active_domains_ids = [item.id for item in active_domains]

                limiter_buckets = self.get_limiter_buckets(
                    active_domains, avg_links_per_page)

                all_domains_pages_in_need_of_review = []

                for domain_id in active_domains_ids:
                    pages = self.db \
                        .query(
                            Page.uuid,
                            Page.url,
                            Page.score,
                            Page.last_review_date
                        ) \
                        .filter(Page.domain_id == domain_id) \
                        .filter(or_(
                            Page.last_review_date == None,
                            Page.last_review_date <= expired_time
                        ))[:look_ahead_pages]

                    if pages:
                        all_domains_pages_in_need_of_review.append(pages)

                logging.debug(
                    'Total of %d pages found to add to redis.' % (sum([
                        len(item)
                        for item in all_domains_pages_in_need_of_review
                    ])))

                item_count = int(self.redis.zcard('next-job-bucket'))
                current_domain = 0
                while item_count < look_ahead_pages and len(
                        all_domains_pages_in_need_of_review) > 0:
                    if current_domain >= len(
                            all_domains_pages_in_need_of_review):
                        current_domain = 0

                    item = all_domains_pages_in_need_of_review[
                        current_domain].pop(0)

                    has_limit = True
                    logging.debug('Available Limit Buckets: %s' %
                                  limiter_buckets)
                    for index, (limit,
                                available) in enumerate(limiter_buckets):
                        if limit.matches(item.url):
                            if available <= 0:
                                has_limit = False
                                break
                            limiter_buckets[index] = (limit, available - 1)

                    if has_limit:
                        self.add_next_job_bucket(item.uuid, item.url)
                        item_count += 1

                    # if there are no more pages for this domain, remove it from the list
                    if not all_domains_pages_in_need_of_review[current_domain]:
                        del all_domains_pages_in_need_of_review[current_domain]

                    current_domain += 1

                logging.debug('ADDED A TOTAL of %d ITEMS TO REDIS...' %
                              item_count)

        except LockTimeout:
            logging.info("Can't acquire lock. Moving on...")
Example #5
    def _verify_workers_limits(self, url, avg_links_per_page=10):
        active_domains = Domain.get_active_domains(self.db)
        return LimiterModel.has_limit_to_work(self.db, active_domains, url, avg_links_per_page)
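A hypothetical call site for this helper; `job` and the surrounding worker code are illustrative, not taken from the project:

    # Hypothetical usage; job is an illustrative dict, not project code.
    if not self._verify_workers_limits(job['url']):
        logging.debug('Workers limit reached for %s; skipping.' % job['url'])
        return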
Example #6
    def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
        from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

        page = None
        lock = None
        settings = Settings.instance(db)
        workers = db.query(Worker).all()
        number_of_workers = len(workers)

        active_domains = Domain.get_active_domains(db)
        active_domains_ids = [item.id for item in active_domains]

        all_domains_pages_in_need_of_review = {}

        for domain_id in active_domains_ids:
            pages = db \
                .query(
                    Page.uuid,
                    Page.url,
                    Page.score,
                    Page.last_review_date
                ) \
                .filter(Page.domain_id == domain_id) \
                .order_by(Page.score.desc())[:number_of_workers]
            if pages:
                all_domains_pages_in_need_of_review[domain_id] = pages

        pages_in_need_of_review = []
        current_domain = 0
        while all_domains_pages_in_need_of_review:
            # list() so the keys can be indexed below (dict views are
            # not indexable in Python 3)
            domains = list(all_domains_pages_in_need_of_review.keys())
            if current_domain >= len(domains):
                current_domain = 0

            domain_id = domains[current_domain]

            item = all_domains_pages_in_need_of_review[domain_id].pop(0)
            pages_in_need_of_review.append(item)

            if not all_domains_pages_in_need_of_review[domain_id]:
                del all_domains_pages_in_need_of_review[domain_id]

            current_domain += 1

        if not pages_in_need_of_review:
            return None

        if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score:
            cls.update_pages_score_by(settings, settings.lambda_score, db)

        for i in range(len(pages_in_need_of_review)):
            if not Limiter.has_limit_to_work(db, active_domains, pages_in_need_of_review[i].url, avg_links_per_page):
                continue

            lock = cache.has_next_job_lock(
                pages_in_need_of_review[i].url,
                lock_expiration
            )

            if lock is not None:
                page = pages_in_need_of_review[i]
                break

        if page is None:
            return None

        return {
            'page': str(page.uuid),
            'url': page.url,
            'score': page.score,
            'lock': lock
        }
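The method returns `None` when no reviewable page can be locked, otherwise a dict with `page`, `url`, `score`, and `lock`. A hedged sketch of a consumer, assuming the method lives on `Page` and that `db`, `cache`, and the expiration values come from worker configuration (none of which is shown in these examples):

    # Hypothetical consumer of get_next_job; db, cache and the timing
    # values are assumed from worker configuration, not shown here.
    job = Page.get_next_job(
        db, expiration=6 * 60 * 60, cache=cache, lock_expiration=60)
    if job is None:
        logging.debug('No job available; retrying later.')
    else:
        logging.info('Got job for page %s (%s)' % (job['page'], job['url']))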