Beispiel #1
0
    def test_can_get_next_job_when_domain_limited(self):
        """Check job selection when one of two domains has a limiter.

        Ten pages are created for each domain (the limited domain's pages
        carry ten times the score) together with twenty workers. The first
        job must be the highest-scored page of the limited domain; once a
        worker is recorded as working on that URL, the following job must be
        the highest-scored page of the other domain.
        """
        # Remove every existing Domain and Page row before building fixtures.
        self.db.query(Domain).delete()
        self.db.query(Page).delete()

        limited_domain = DomainFactory.create()
        open_domain = DomainFactory.create()

        LimiterFactory.create(url=limited_domain.url, value=2)

        limited_pages = []
        open_pages = []
        workers = []
        for index in range(10):
            # Two workers per iteration, created before that iteration's pages.
            workers.extend(WorkerFactory.create() for _ in range(2))

            limited_url = "%s/%d.html" % (limited_domain.url, index)
            limited_pages.append(
                PageFactory.create(domain=limited_domain, url=limited_url, score=index * 10)
            )

            open_url = "%s/%d.html" % (open_domain.url, index)
            open_pages.append(
                PageFactory.create(domain=open_domain, url=open_url, score=index)
            )

        # No worker has a current URL yet, so the first job is not limited.
        first_job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
            avg_links_per_page=10,
        )

        expect(first_job).not_to_be_null()
        expect(first_job['page']).to_equal(str(limited_pages[-1].uuid))

        # Record one worker as working on the returned URL and flush the session.
        busy_worker = workers[0]
        busy_worker.current_url = first_job['url']
        self.db.flush()

        # The limited domain should now be at its limit (original note:
        # 2 / 10 = 0.2, rounded up = 1 job at a time), so the next job is
        # expected to come from the other domain.
        second_job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
        )

        expect(second_job).not_to_be_null()
        expect(second_job['page']).to_equal(str(open_pages[-1].uuid))
Beispiel #2
0
    def test_increases_page_score_when_lambda_is_top_page(self):
        """Check page scores after a job request with lambda_score set to 10000.

        With one worker and two default pages, requesting a job must leave
        each page with a score of 5000.
        """
        WorkerFactory.create()
        pages = [PageFactory.create() for _ in range(2)]

        current_settings = Settings.instance(self.db)
        current_settings.lambda_score = 10000

        # The returned job is irrelevant here; only the score side effect is checked.
        Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
        )

        # Reload both pages from the database before inspecting their scores.
        for page in pages:
            self.db.refresh(page)

        for page in pages:
            expect(page.score).to_equal(5000)
Beispiel #3
0
    def test_can_get_next_job(self):
        """Check that the only existing page is returned as the next job."""
        only_page = PageFactory.create()

        job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
        )

        expect(job).not_to_be_null()
        # The job refers to its page by the string form of the page's uuid.
        expect(job['page']).to_equal(str(only_page.uuid))
Beispiel #4
0
    def test_can_get_next_job_when_expired(self):
        """Check that a page last reviewed in 2010 is returned as the next job."""
        # A review date far in the past relative to the 100 expiration used below.
        old_review_date = datetime(2010, 10, 10, 10, 10, 10)
        stale_page = PageFactory.create(last_review_date=old_review_date)

        job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
        )

        expect(job).not_to_be_null()
        expect(job['page']).to_equal(str(stale_page.uuid))
Beispiel #5
0
    def test_get_next_job_does_not_get_from_inactive_domains(self):
        """Check that no job is returned when the only page is in an inactive domain."""
        inactive_domain = DomainFactory.create(is_active=False)
        PageFactory.create(domain=inactive_domain)

        job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
        )

        expect(job).to_be_null()
Beispiel #6
0
    def test_can_get_next_job(self):
        """Check that twenty jobs come back in descending page-score order.

        Twenty pages with scores 0.0 through 19.0 are created in one domain
        (one worker per page); twenty consecutive job requests must return
        them from the highest score down to the lowest.
        """
        domain = DomainFactory.create()
        pages = []
        for score in range(20):
            WorkerFactory.create()
            pages.append(PageFactory.create(domain=domain, score=float(score)))

        # Pages were appended in ascending score order, so walk them backwards.
        for expected_page in reversed(pages):
            job = Page.get_next_job(
                self.db,
                expiration=100,
                cache=self.sync_cache,
                lock_expiration=100,
            )

            expect(job).not_to_be_null()
            expect(job['page']).to_equal(str(expected_page.uuid))
Beispiel #7
0
    def test_increases_page_score_when_all_pages_have_been_reviewed(self):
        """Check scores when both pages already carry a review date.

        With lambda_score set to 10000 and two pages whose last review date
        is set, a job request must return nothing and leave each page with a
        score of 5000.
        """
        reviewed_at = datetime(2014, 10, 10, 10, 10, 10)
        pages = [
            PageFactory.create(last_review_date=reviewed_at),
            PageFactory.create(last_review_date=reviewed_at),
        ]

        current_settings = Settings.instance(self.db)
        current_settings.lambda_score = 10000

        job = Page.get_next_job(
            self.db,
            expiration=100,
            cache=self.sync_cache,
            lock_expiration=1,
        )

        expect(job).to_be_null()

        # Reload both pages from the database before inspecting their scores.
        for page in pages:
            self.db.refresh(page)

        for page in pages:
            expect(page.score).to_equal(5000)
Beispiel #8
0
 def _load_next_job(self):
     """Return the result of Page.get_next_job for this object's db, cache and config.

     The review expiration and the next-job URL lock expiration are read
     from ``self.config``.
     """
     db = self.db
     expiration = self.config.REVIEW_EXPIRATION_IN_SECONDS
     cache = self.cache
     lock_expiration = self.config.NEXT_JOB_URL_LOCK_EXPIRATION_IN_SECONDS
     return Page.get_next_job(db, expiration, cache, lock_expiration)