def get_next_job(cls, db, expiration, cache, lock_expiration):
    """Pick a random page awaiting review and try to lock it for processing.

    The candidate list comes from ``cls.get_next_job_list(db, expiration)``.
    Page scores are bumped through ``cls.update_pages_score_by`` (and no job
    is handed out) when either:

    * there are no candidates and ``settings.lambda_score`` is positive, or
    * ``settings.lambda_score`` is greater than the first candidate's score.

    Otherwise up to 10 lock attempts are made via
    ``cache.has_next_job_lock(url, lock_expiration)``, drawing a new random
    candidate after every failed attempt. A ``None`` result from the cache
    call is treated as "lock not obtained".

    Returns:
        ``None`` when no job can be handed out, otherwise a dict with the
        keys ``'page'`` (uuid as a string), ``'url'``, ``'score'`` and
        ``'lock'``.
    """
    from holmes.models import Settings

    settings = Settings.instance(db)
    candidates = cls.get_next_job_list(db, expiration)

    # Nothing to review: only bump the scores when lambda is positive.
    if len(candidates) == 0:
        if settings.lambda_score > 0:
            cls.update_pages_score_by(settings, settings.lambda_score, db)
        return None

    # Lambda outranks the first candidate: bump scores instead of working.
    if settings.lambda_score > candidates[0].score:
        cls.update_pages_score_by(settings, settings.lambda_score, db)
        return None

    candidate = choice(candidates)
    for _attempt in range(10):
        lock = cache.has_next_job_lock(candidate.url, lock_expiration)
        if lock is not None:
            break
        # Lock not obtained: draw another random candidate and retry.
        candidate = choice(candidates)
    else:
        # Every attempt failed to obtain a lock.
        return None

    return {
        'page': str(candidate.uuid),
        'url': candidate.url,
        'score': candidate.score,
        'lock': lock
    }
def test_increases_page_score_when_lambda_is_top_page(self):
    # Fixture: one worker and two pages created through the factories.
    WorkerFactory.create()
    first_page = PageFactory.create()
    second_page = PageFactory.create()

    # Raise lambda to 10000 before asking for the next job.
    app_settings = Settings.instance(self.db)
    app_settings.lambda_score = 10000

    Page.get_next_job(
        self.db,
        expiration=100,
        cache=self.sync_cache,
        lock_expiration=1
    )

    # Reload both pages from the database, then check their scores.
    created_pages = (first_page, second_page)
    for created in created_pages:
        self.db.refresh(created)

    for created in created_pages:
        expect(created.score).to_equal(5000)
def test_increases_page_score_when_all_pages_have_been_reviewed(self):
    # Fixture: two pages that share the same last review date.
    reviewed_at = datetime(2014, 10, 10, 10, 10, 10)
    first_page = PageFactory.create(last_review_date=reviewed_at)
    second_page = PageFactory.create(last_review_date=reviewed_at)

    # Raise lambda to 10000 before asking for the next job.
    app_settings = Settings.instance(self.db)
    app_settings.lambda_score = 10000

    job = Page.get_next_job(
        self.db,
        expiration=100,
        cache=self.sync_cache,
        lock_expiration=1
    )

    # No job is expected to be handed out in this scenario.
    expect(job).to_be_null()

    # Reload both pages from the database, then check their scores.
    created_pages = (first_page, second_page)
    for created in created_pages:
        self.db.refresh(created)

    for created in created_pages:
        expect(created.score).to_equal(5000)
def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
    """Select the next page to review, interleaving candidates across domains.

    Steps performed:

    1. For every active domain (``Domain.get_active_domains(db)``), fetch its
       highest-scored pages, limited to the number of ``Worker`` rows.
    2. Interleave those per-domain lists into a single candidate list, taking
       one page from a domain at a time.
    3. If ``settings.lambda_score`` is positive and greater than the first
       candidate's score, bump page scores via ``cls.update_pages_score_by``
       (processing then continues with the already-fetched candidates).
    4. Walk the candidates in order and return the first one for which
       ``Limiter.has_limit_to_work`` is truthy and
       ``cache.has_next_job_lock`` returns something other than ``None``.

    Args:
        db: Database session used for all queries.
        expiration: Not used by this implementation; kept so existing
            callers keep working.
        cache: Object providing ``has_next_job_lock(url, lock_expiration)``.
        lock_expiration: Forwarded to ``cache.has_next_job_lock``.
        avg_links_per_page: Forwarded to ``Limiter.has_limit_to_work``.

    Returns:
        ``None`` when there is no candidate or none could be locked,
        otherwise a dict with the keys ``'page'`` (uuid as a string),
        ``'url'``, ``'score'`` and ``'lock'``.
    """
    from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

    settings = Settings.instance(db)
    number_of_workers = len(db.query(Worker).all())

    active_domains = Domain.get_active_domains(db)

    # Top pages per domain, keyed by domain id; domains without pages are
    # left out so the interleaving loop below terminates.
    pages_by_domain = {}
    for domain in active_domains:
        pages = db \
            .query(
                Page.uuid,
                Page.url,
                Page.score,
                Page.last_review_date
            ) \
            .filter(Page.domain_id == domain.id) \
            .order_by(Page.score.desc())[:number_of_workers]

        if pages:
            pages_by_domain[domain.id] = pages

    # Interleave the per-domain lists, one page per domain at a time.
    # NOTE(review): the index still advances after a drained domain is
    # removed, so the domain that slides into that slot is skipped for the
    # current pass. The ordering is intentionally left unchanged here.
    pages_in_need_of_review = []
    current_domain = 0
    while pages_by_domain:
        # list(...) instead of indexing .keys(): dict views are not
        # subscriptable on Python 3, while list(dict) works on 2 and 3.
        domain_ids = list(pages_by_domain)
        if current_domain >= len(domain_ids):
            current_domain = 0

        domain_id = domain_ids[current_domain]
        domain_pages = pages_by_domain[domain_id]
        pages_in_need_of_review.append(domain_pages.pop(0))

        if not domain_pages:
            del pages_by_domain[domain_id]

        current_domain += 1

    if not pages_in_need_of_review:
        return None

    if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score:
        cls.update_pages_score_by(settings, settings.lambda_score, db)

    for candidate in pages_in_need_of_review:
        if not Limiter.has_limit_to_work(db, active_domains, candidate.url, avg_links_per_page):
            continue

        lock = cache.has_next_job_lock(candidate.url, lock_expiration)
        if lock is not None:
            return {
                'page': str(candidate.uuid),
                'url': candidate.url,
                'score': candidate.score,
                'lock': lock
            }

    return None
def test_can_get_instance(self):
    # The Settings instance obtained for this session is expected to
    # report a lambda_score of 0.
    lambda_score = Settings.instance(self.db).lambda_score
    expect(lambda_score).to_equal(0)