def test_update_pages_score(self):
    """update_pages_score should add cached score increments to each page's stored score."""
    config = Config()
    config.MAX_PAGE_SCORE = 15000000
    # Start from a clean slate: no pages in the DB, no pending increments in redis.
    self.db.query(Page).delete()
    self.sync_cache.redis.delete('pages-score')
    page1 = PageFactory.create(score=3)
    page2 = PageFactory.create(score=0)
    # Bump page1 three times and page2 once — cache only, DB untouched.
    for i in range(3):
        self.sync_cache.increment_page_score(page1.id)
    self.sync_cache.increment_page_score(page2.id)
    # Stored scores are unchanged until update_pages_score runs.
    expect(page1.score).to_equal(3)
    expect(page2.score).to_equal(0)
    Page.update_pages_score(self.db, self.sync_cache, config)
    self.db.flush()
    self.db.refresh(page1)
    self.db.refresh(page2)
    # 3 + 3 increments and 0 + 1 increment respectively.
    expect(page1.score).to_equal(6)
    expect(page2.score).to_equal(1)
def test_can_get_page_by_uuid(self):
    """Page.by_uuid returns the matching page and None for an unknown UUID."""
    target = PageFactory.create()
    PageFactory.create()  # decoy page that must not be returned

    found = Page.by_uuid(target.uuid, self.db)
    expect(found.id).to_equal(target.id)

    missing = Page.by_uuid(uuid4(), self.db)
    expect(missing).to_be_null()
def test_can_get_page_by_url_hash(self):
    """Page.by_url_hash returns the matching page and None for an unknown hash."""
    page = PageFactory.create()
    PageFactory.create()  # decoy page that must not be returned
    loaded_page = Page.by_url_hash(page.url_hash, self.db)
    expect(loaded_page.id).to_equal(page.id)
    # BUG FIX: the miss-path assertion previously called Page.by_uuid (a
    # copy-paste from the by_uuid test), so by_url_hash's miss path was
    # never exercised.
    invalid_page = Page.by_url_hash('123', self.db)
    expect(invalid_page).to_be_null()
def _update_pages_score(self):
    """Flush cached page-score increments to the DB, guarded by a distributed lock.

    Only proceeds when the update-pages lock is acquired, so at most one
    process performs the flush at a time.
    """
    expiration = self.config.UPDATE_PAGES_SCORE_EXPIRATION
    lock = self.cache.has_update_pages_lock(expiration)
    if lock is not None:
        self.debug('Updating pages score...')
        Page.update_pages_score(self.db, self.cache, self.config)
        self.cache.release_update_pages_lock(lock)
        # Record when the last successful flush happened.
        self.last_update_pages_score = datetime.utcnow()
def enqueue(self, urls):
    """Register a batch of (url, score) pairs as pages; no-op on an empty batch."""
    if not urls:
        return
    for url, score in urls:
        Page.add_page(
            self.db, self.cache, url, score, self.async_get_func,
            self.publish, self.config, self.girl,
            self.violation_definitions, self.handle_page_added)
    # Block until every async add_page request has completed.
    self.wait_for_async_requests()
def test_can_get_next_jobs_count(self):
    """get_next_jobs_count tracks the number of pages eligible for review."""
    config = Config()
    config.REVIEW_EXPIRATION_IN_SECONDS = 100
    for x in range(3):
        PageFactory.create()
    next_job_list = Page.get_next_jobs_count(self.db, config)
    expect(next_job_list).to_equal(3)
    # Adding more pages grows the count accordingly.
    for x in range(2):
        PageFactory.create()
    next_job_list = Page.get_next_jobs_count(self.db, config)
    expect(next_job_list).to_equal(5)
def post(self):
    """Create a page from the posted JSON body {'url', optional 'score'}.

    Responds 400 with a JSON reason payload for invalid or redirecting
    URLs; otherwise writes the add_page result (the new page's UUID).
    """
    post_data = loads(self.request.body)
    url = post_data["url"]
    # Fall back to the configured default score when the client omits one.
    score = float(post_data.get("score", self.application.config.DEFAULT_PAGE_SCORE))
    result = yield Page.add_page(
        self.db,
        self.application.cache,
        url,
        score,
        self.application.http_client.fetch,
        self.application.event_bus.publish,
        self.application.config,
    )
    created, url, result = result
    if not created and result["reason"] == "invalid_url":
        self.set_status(400, "Invalid url [%s]" % url)
        self.write_json(
            {"reason": "invalid_url", "url": url, "status": result["status"], "details": result["details"]}
        )
        return
    if not created and result["reason"] == "redirect":
        self.set_status(400, "Redirect URL [%s]" % url)
        self.write_json({"reason": "redirect", "url": url, "effectiveUrl": result["effectiveUrl"]})
        return
    self.write(str(result))
    self.finish()
def get_next_jobs_count(self, callback=None):
    """Fetch the cached 'next-jobs' count, recomputing it from the DB on a miss."""
    expiration_seconds = int(self.config.NEXT_JOBS_COUNT_EXPIRATION_IN_SECONDS)
    compute_count = lambda: Page.get_next_jobs_count(self.db, self.config)
    self.get_data('next-jobs', expiration_seconds, compute_count, callback=callback)
def get(self):
    """List the paginated next-job queue as {'reviewCount', 'pages': [{'uuid', 'url'}]}."""
    current_page = int(self.get_argument('current_page', 1))
    page_size = int(self.get_argument('page_size', 10))
    get_next_job_list = Page.get_next_job_list(
        self.db,
        self.application.config.REVIEW_EXPIRATION_IN_SECONDS,
        current_page=current_page,
        page_size=page_size
    )
    #review_count = self.girl.get('next_jobs_count')
    # NOTE(review): the cached count lookup above is disabled, so the
    # response always reports reviewCount=0 — confirm this is intentional.
    review_count = 0
    result = {'reviewCount': review_count}
    pages = []
    for item in get_next_job_list:
        pages.append({
            'uuid': item.uuid,
            'url': item.url,
        })
    result['pages'] = pages
    self.write_json(result)
def get(self, uuid="", limit=10):
    """List up to *limit* completed reviews for the page, newest first; 404 if unknown."""
    uuid = UUID(uuid)
    page = Page.by_uuid(uuid, self.db)
    if not page:
        self.set_status(404, self._("Page UUID [%s] not found") % uuid)
        return
    # Only completed reviews, most recent first, capped at *limit*.
    reviews = (
        self.db.query(Review)
        .filter(Review.page == page)
        .filter(Review.is_complete == True)
        .order_by(Review.completed_date.desc())[:limit]
    )
    result = []
    for review in reviews:
        result.append(
            {
                "uuid": str(review.uuid),
                "completedAt": review.completed_date,
                "violationCount": review.violation_count,
            }
        )
    self.write_json(result)
def test_can_save(self):
    """POSTing a valid URL creates the page and expires the related cached stats."""
    self.mock_request(status_code=200, effective_url="http://www.globo.com")
    # Mock the cache facade so its expire() calls can be asserted on.
    self.server.application.girl = Mock()
    response = yield self.authenticated_fetch(
        '/page',
        method='POST',
        body=dumps({
            'url': 'http://www.globo.com'
        })
    )
    expect(response.code).to_equal(200)
    # The response body is the new page's UUID.
    page_uuid = UUID(response.body)
    page = Page.by_uuid(page_uuid, self.db)
    expect(page).not_to_be_null()
    expect(str(page_uuid)).to_equal(page.uuid)
    # Creating a page must invalidate exactly these four cached aggregates.
    expect(self.server.application.girl.expire.call_count).to_equal(4)
    self.server.application.girl.assert_has_calls([
        call.expire('domains_details'),
        call.expire('failed_responses_count'),
        call.expire('violation_count_for_domains'),
        call.expire('top_violations_in_category_for_domains'),
    ])
def get(self, page_uuid, review_uuid):
    """Render a review as JSON.

    If the review UUID is invalid/unknown but the page exists, redirect to
    the page's last review; 404 when the page itself is unknown.
    """
    review = None
    page = None
    if self._parse_uuid(review_uuid):
        review = Review.by_uuid(review_uuid, self.db)
    if self._parse_uuid(page_uuid):
        page = Page.by_uuid(page_uuid, self.db)
    # Unknown review but known page: redirect to the most recent review.
    if not review and page:
        self.redirect('/page/%s/review/%s/' % (page_uuid, page.last_review_uuid))
        return
    if not page:
        self.set_status(404, self._('Page UUID [%s] not found') % page_uuid)
        return
    result = review.to_dict(self.application.fact_definitions, self.application.violation_definitions, self._)
    result.update({
        'violationPoints': review.get_violation_points(),
        'violationCount': review.violation_count,
    })
    self.write_json(result)
def increment_next_jobs_count(self, increment=1, callback=None):
    """Bump the cached 'next-jobs' counter, seeding it from the DB when absent."""
    seed = lambda: Page.get_next_jobs_count(self.db, self.config)
    self.increment_data('next-jobs', seed, increment, callback)
def get_count(self, key, domain_name, expiration, get_count_method):
    """Return a cached per-domain count, computing and re-caching it on a miss.

    Falls back to the global page count when *domain_name* does not resolve
    to a known Domain.
    """
    cache_key = '%s-%s' % (self.get_domain_name(domain_name), key)
    count = self.redis.get(cache_key)
    if count is not None:
        return int(count)
    # Cache miss: resolve the domain (accepts either a name or a Domain).
    domain = domain_name
    if domain and not isinstance(domain, Domain):
        domain = Domain.get_domain_by_name(domain_name, self.db)
    if domain is None:
        count = Page.get_page_count(self.db)
    else:
        count = get_count_method(domain)
    # Re-key on the resolved domain object before caching with a TTL.
    cache_key = '%s-%s' % (self.get_domain_name(domain), key)
    self.redis.setex(
        cache_key,
        expiration,
        value=int(count)
    )
    return int(count)
def enqueue(self, urls):
    """Register a batch of (url, score) pairs as pages; no-op on an empty batch."""
    if not urls:
        return
    for url, score in urls:
        Page.add_page(
            self.db, self.cache, url, score,
            self.async_get_func, self.publish, self.config,
            self.handle_page_added
        )
    # Block until every async add_page request has completed.
    self.wait_for_async_requests()
def test_can_get_next_job_when_domain_limited(self):
    """A limited domain stops handing out jobs once its concurrent cap is reached."""
    self.db.query(Domain).delete()
    self.db.query(Page).delete()
    domain_a = DomainFactory.create()
    domain_b = DomainFactory.create()
    # Cap domain_a at 2 concurrent requests.
    LimiterFactory.create(url=domain_a.url, value=2)
    pages_a = []
    pages_b = []
    workers = []
    # domain_a pages score 10x higher, so they win job selection first.
    for i in range(10):
        for j in range(2):
            workers.append(WorkerFactory.create())
        pages_a.append(PageFactory.create(domain=domain_a, url="%s/%d.html" % (domain_a.url, i), score=i * 10))
        pages_b.append(PageFactory.create(domain=domain_b, url="%s/%d.html" % (domain_b.url, i), score=i))
    # first one should not be limited
    next_job = Page.get_next_job(
        self.db, expiration=100, cache=self.sync_cache, lock_expiration=1, avg_links_per_page=10
    )
    expect(next_job).not_to_be_null()
    expect(next_job['page']).to_equal(str(pages_a[-1].uuid))
    # Mark a worker as actively processing the domain_a URL.
    workers[0].current_url = next_job['url']
    self.db.flush()
    # second one should be limited (2 / 10 = 0.2, rounded up = 1 job at a time)
    next_job = Page.get_next_job(
        self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
    )
    expect(next_job).not_to_be_null()
    expect(next_job['page']).to_equal(str(pages_b[-1].uuid))
def test_increases_page_score_when_lambda_is_top_page(self):
    """When the lambda score tops every page, get_next_job spreads it across pages."""
    WorkerFactory.create()
    page = PageFactory.create()
    page2 = PageFactory.create()
    settings = Settings.instance(self.db)
    settings.lambda_score = 10000
    Page.get_next_job(
        self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
    )
    self.db.refresh(page)
    self.db.refresh(page2)
    # The 10000 lambda points are split evenly between the two pages.
    expect(page.score).to_equal(5000)
    expect(page2.score).to_equal(5000)
def get(self, uuid=''):
    """Render the page identified by *uuid* as {'uuid', 'url'}; 404 when unknown."""
    uuid = UUID(uuid)
    page = Page.by_uuid(uuid, self.db)
    if not page:
        self.set_status(404, self._('Page UUID [%s] not found') % uuid)
        return
    page_json = {"uuid": str(page.uuid), "url": page.url}
    self.write(page_json)
def test_can_get_next_job_when_expired(self):
    """A page whose last review is far past the expiration window is eligible again."""
    stale_page = PageFactory.create(last_review_date=datetime(2010, 10, 10, 10, 10, 10))
    job = Page.get_next_job(
        self.db,
        expiration=100,
        cache=self.sync_cache,
        lock_expiration=1,
    )
    expect(job).not_to_be_null()
    expect(job['page']).to_equal(str(stale_page.uuid))
def test_get_next_job_does_not_get_from_inactive_domains(self):
    """Pages belonging to an inactive domain are never handed out as jobs."""
    inactive_domain = DomainFactory.create(is_active=False)
    PageFactory.create(domain=inactive_domain)
    job = Page.get_next_job(
        self.db,
        expiration=100,
        cache=self.sync_cache,
        lock_expiration=1,
    )
    expect(job).to_be_null()
def test_can_get_next_job(self):
    """With a single eligible page, get_next_job returns that page's UUID."""
    only_page = PageFactory.create()
    job = Page.get_next_job(
        self.db,
        expiration=100,
        cache=self.sync_cache,
        lock_expiration=1,
    )
    expect(job).not_to_be_null()
    expect(job['page']).to_equal(str(only_page.uuid))
def get(self, uuid):
    """Write the page's violations-per-day breakdown as JSON; 404 for an unknown UUID."""
    page = Page.by_uuid(uuid, self.db)
    if page is None:
        self.set_status(404, self._('Page UUID [%s] not found') % uuid)
        return
    self.write_json({"violations": page.get_violations_per_day(self.db)})
def get(self, uuid=""):
    """Render the page identified by *uuid* as {'uuid', 'url'}; 404 when unknown."""
    uuid = UUID(uuid)
    page = Page.by_uuid(uuid, self.db)
    if not page:
        self.set_status(404, self._("Page UUID [%s] not found") % uuid)
        return
    page_json = {"uuid": str(page.uuid), "url": page.url}
    self.write(page_json)
def get(self, uuid):
    """Write the page's violations-per-day breakdown as JSON; 404 for an unknown UUID."""
    page = Page.by_uuid(uuid, self.db)
    if not page:
        self.set_status(404, self._("Page UUID [%s] not found") % uuid)
        return
    violations_per_day = page.get_violations_per_day(self.db)
    page_json = {"violations": violations_per_day}
    self.write_json(page_json)
def test_get_next_job_list(self):
    """get_next_job_list returns every eligible page with its url and uuid."""
    known_page = PageFactory.create()
    PageFactory.create()

    job_list = Page.get_next_job_list(self.db, expiration=100)
    expect(job_list).to_length(2)

    listed = [{'url': job.url, 'uuid': str(job.uuid)} for job in job_list]
    expect(listed).to_include({
        'url': known_page.url,
        'uuid': str(known_page.uuid)
    })
def handle(has_key):
    """Redis EXISTS callback: increment *key* if present, else seed it.

    The seed value is base + increment - 1 — presumably because the base
    count already includes the item being counted; confirm against the
    synchronous increment_count implementation.
    """
    # Resolve the enclosing-scope domain_name (name or Domain instance).
    domain = domain_name
    if domain and not isinstance(domain, Domain):
        domain = Domain.get_domain_by_name(domain_name, self.db)
    if has_key:
        self.redis.incrby(key, increment, callback=callback)
    else:
        if domain is None:
            value = Page.get_page_count(self.db) + increment - 1
        else:
            value = get_default_method(domain) + increment - 1
        self.redis.set(key, value, callback=callback)
def increment_count(self, key, domain_name, get_default_method, increment=1):
    """Increment a per-domain counter in redis, seeding it on first use.

    The seed value is base + increment - 1 — presumably because the base
    count already includes the item being counted; verify against callers.
    """
    key = '%s-%s' % (self.get_domain_name(domain_name), key)
    has_key = self.has_key(key)
    # Resolve the domain (accepts either a name or a Domain instance).
    domain = domain_name
    if domain and not isinstance(domain, Domain):
        domain = Domain.get_domain_by_name(domain_name, self.db)
    if has_key:
        self.redis.incrby(key, increment)
    else:
        if domain is None:
            value = Page.get_page_count(self.db) + increment - 1
        else:
            value = get_default_method(domain) + increment - 1
        self.redis.set(key, value)
def handle(count):
    """Redis GET callback: return the cached count, or compute, cache and return it."""
    if count is not None:
        callback(int(count))
        return
    # Cache miss: compute from the domain, or globally when unresolved.
    domain = self.get_domain(domain_name)
    if domain is None:
        count = Page.get_page_count(self.db)
    else:
        count = get_count_method(domain)
    # Re-key on the resolved domain and store with a TTL before replying.
    cache_key = '%s-%s' % (self.get_domain_name(domain), key)
    self.redis.setex(key=cache_key, value=int(count), seconds=expiration, callback=self.handle_set_count(count, callback))
def test_can_save_known_domain(self):
    """POSTing a URL whose domain already exists still creates the page."""
    DomainFactory.create(url='http://www.globo.com', name='globo.com')
    self.mock_request(status_code=200, effective_url="http://www.globo.com")
    response = self.fetch(
        '/page',
        method='POST',
        body=dumps({
            'url': 'http://www.globo.com'
        })
    )
    expect(response.code).to_equal(200)
    # The response body is the new page's UUID.
    page_uuid = UUID(response.body)
    page = Page.by_uuid(page_uuid, self.db)
    expect(page).not_to_be_null()
    expect(str(page_uuid)).to_equal(page.uuid)
def test_can_save_known_domain(self):
    """POSTing (authenticated) a URL whose domain already exists still creates the page."""
    DomainFactory.create(url='http://www.globo.com', name='globo.com')
    self.mock_request(status_code=200, effective_url="http://www.globo.com")
    response = yield self.authenticated_fetch(
        '/page',
        method='POST',
        body=dumps({
            'url': 'http://www.globo.com'
        })
    )
    expect(response.code).to_equal(200)
    # The response body is the new page's UUID.
    page_uuid = UUID(response.body)
    page = Page.by_uuid(page_uuid, self.db)
    expect(page).not_to_be_null()
    expect(str(page_uuid)).to_equal(page.uuid)
def test_can_get_next_job(self):
    """get_next_job hands out pages in descending score order."""
    domain = DomainFactory.create()
    pages = []
    for i in range(20):
        WorkerFactory.create()
        pages.append(PageFactory.create(
            domain=domain,
            score=float(i)
        ))
    # Highest-scored page first, then the rest in decreasing score.
    for i in range(20):
        next_job = Page.get_next_job(
            self.db, expiration=100, cache=self.sync_cache, lock_expiration=100
        )
        expect(next_job).not_to_be_null()
        expect(next_job['page']).to_equal(str(pages[19 - i].uuid))
def handle(count):
    """Redis GET callback: return the cached count, or compute, cache and return it."""
    if count is not None:
        callback(int(count))
        return
    # Cache miss: compute from the domain, or globally when unresolved.
    domain = self.get_domain(domain_name)
    if domain is None:
        count = Page.get_page_count(self.db)
    else:
        count = get_count_method(domain)
    # Re-key on the resolved domain and store with a TTL before replying.
    cache_key = '%s-%s' % (self.get_domain_name(domain), key)
    self.redis.setex(
        key=cache_key,
        value=int(count),
        seconds=expiration,
        callback=self.handle_set_count(count, callback)
    )
def test_increases_page_score_when_all_pages_have_been_reviewed(self):
    """With no eligible pages, get_next_job returns None but still spreads lambda score."""
    page = PageFactory.create(last_review_date=datetime(2014, 10, 10, 10, 10, 10))
    page2 = PageFactory.create(last_review_date=datetime(2014, 10, 10, 10, 10, 10))
    settings = Settings.instance(self.db)
    settings.lambda_score = 10000
    next_job = Page.get_next_job(
        self.db, expiration=100, cache=self.sync_cache, lock_expiration=1
    )
    # All pages recently reviewed: nothing to hand out.
    expect(next_job).to_be_null()
    self.db.refresh(page)
    self.db.refresh(page2)
    # The 10000 lambda points are split evenly between the two pages.
    expect(page.score).to_equal(5000)
    expect(page2.score).to_equal(5000)
def get(self):
    """List the paginated next-job queue as {'reviewCount', 'pages': [{'uuid', 'url'}]}."""
    current_page = int(self.get_argument("current_page", 1))
    page_size = int(self.get_argument("page_size", 10))
    get_next_job_list = Page.get_next_job_list(
        self.db,
        self.application.config.REVIEW_EXPIRATION_IN_SECONDS,
        current_page=current_page,
        page_size=page_size,
    )
    # Total count comes from the cache facade rather than a fresh DB query.
    review_count = self.girl.get("next_jobs_count")
    result = {"reviewCount": review_count}
    pages = []
    for item in get_next_job_list:
        pages.append({"uuid": item.uuid, "url": item.url})
    result["pages"] = pages
    self.write_json(result)
def post(self):
    """Create a page from the posted JSON body {'url', optional 'score'}.

    Responds 400 with a JSON reason payload for invalid or redirecting
    URLs; on success the page is added to the next-job bucket and the
    add_page result (the new page's UUID) is written back.
    """
    post_data = loads(self.request.body)
    url = post_data['url']
    # Fall back to the configured default score when the client omits one.
    score = float(
        post_data.get('score', self.application.config.DEFAULT_PAGE_SCORE))
    result = yield Page.add_page(
        self.db,
        self.application.cache,
        url,
        score,
        self.application.http_client.fetch,
        self.application.event_bus.publish,
        self.application.config,
        self.application.girl,
        self.application.default_violations_values,
        self.application.violation_definitions)
    created, url, result = result
    if not created and result['reason'] == 'invalid_url':
        self.set_status(400, self._('Invalid url [%s]') % url)
        self.write_json({
            'reason': 'invalid_url',
            'url': url,
            'status': result['status'],
            'details': result['details']
        })
        return
    if not created and result['reason'] == 'redirect':
        self.set_status(400, self._('Supplied URL is a redirect [%s]') % url)
        self.write_json({
            'reason': 'redirect',
            'url': url,
            'effectiveUrl': result['effectiveUrl']
        })
        return
    # Make the freshly-created page immediately eligible for review.
    yield self.application.cache.add_next_job_bucket(result, url)
    self.write(str(result))
    self.finish()
def test_can_save(self):
    """POSTing a valid URL to /page creates the page and returns its UUID.

    FIX: removed the inner `side_effect` helper, which was defined but
    never referenced anywhere in the test (dead code).
    """
    self.mock_request(status_code=200, effective_url="http://www.globo.com")
    response = yield self.http_client.fetch(
        self.get_url('/page'),
        method='POST',
        body=dumps({
            'url': 'http://www.globo.com'
        })
    )
    expect(response.code).to_equal(200)
    # The response body is the new page's UUID.
    page_uuid = UUID(response.body)
    page = Page.by_uuid(page_uuid, self.db)
    expect(page).not_to_be_null()
    expect(str(page_uuid)).to_equal(page.uuid)
def get_count(self, key, domain_name, expiration, get_count_method):
    """Return a cached per-domain count, computing and re-caching it on a miss.

    Falls back to the global page count when *domain_name* does not resolve
    to a known Domain.
    """
    cache_key = '%s-%s' % (self.get_domain_name(domain_name), key)
    count = self.redis.get(cache_key)
    if count is not None:
        return int(count)
    # Cache miss: resolve the domain (accepts either a name or a Domain).
    domain = domain_name
    if domain and not isinstance(domain, Domain):
        domain = Domain.get_domain_by_name(domain_name, self.db)
    if domain is None:
        count = Page.get_page_count(self.db)
    else:
        count = get_count_method(domain)
    # Re-key on the resolved domain object before caching with a TTL.
    cache_key = '%s-%s' % (self.get_domain_name(domain), key)
    self.redis.setex(cache_key, expiration, value=int(count))
    return int(count)
def get(self, uuid='', limit=10):
    """List up to *limit* completed reviews for the page, newest first; 404 if unknown."""
    uuid = UUID(uuid)
    page = Page.by_uuid(uuid, self.db)
    if not page:
        self.set_status(404, self._('Page UUID [%s] not found') % uuid)
        return
    # Only completed reviews, most recent first, capped at *limit*.
    reviews = self.db.query(Review) \
        .filter(Review.page == page) \
        .filter(Review.is_complete == True) \
        .order_by(Review.completed_date.desc())[:limit]
    result = []
    for review in reviews:
        result.append({
            'uuid': str(review.uuid),
            'completedAt': review.completed_date,
            'violationCount': review.violation_count
        })
    self.write_json(result)