def add_domain(cls, url, db, publish_method, config):
    """Return the Domain that owns ``url``, creating it when none exists.

    An existing domain is matched by name, with or without a trailing
    slash. When a domain is created it is flushed and committed, a
    ``new-domain`` message is sent through ``publish_method``, and a
    limiter is registered for it with
    ``config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS``.
    """
    from holmes.models import Domain, Limiter

    domain_name, domain_url = get_domain_from_url(url)

    name_matches = or_(
        Domain.name == domain_name,
        Domain.name == domain_name.rstrip('/'),
        Domain.name == "%s/" % domain_name
    )
    found = db.query(Domain).filter(name_matches).all()
    domain = found[0] if found else None

    if domain:
        return domain

    domain = Domain(
        url=domain_url,
        url_hash=hashlib.sha512(domain_url).hexdigest(),
        name=domain_name
    )
    db.add(domain)
    db.flush()
    db.commit()

    message = dumps({
        'type': 'new-domain',
        'domainUrl': str(domain_url)
    })
    publish_method(message)

    Limiter.add_or_update_limiter(
        db, domain_url, config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
    )

    return domain
def test_can_get_limiter_by_id(self):
    # A stored limiter is found by its id; an unknown id yields null.
    created = LimiterFactory.create()

    found = Limiter.by_id(created.id, self.db)
    expect(found.id).to_equal(created.id)

    missing = Limiter.by_id(-1, self.db)
    expect(missing).to_be_null()
def test_can_get_limiter_by_url(self):
    # Lookup by exact url succeeds; a different url yields null.
    self.db.query(Limiter).delete()

    created = LimiterFactory.create(url='http://test.com/')

    found = Limiter.by_url('http://test.com/', self.db)
    expect(found.id).to_equal(created.id)

    missing = Limiter.by_url('http://test.com/1', self.db)
    expect(missing).to_be_null()
def test_can_delete_one_limiter(self):
    # Deleting an existing limiter reports one affected row and removes it.
    created = LimiterFactory.create()

    before = Limiter.by_id(created.id, self.db)
    expect(before.id).to_equal(created.id)

    rows_affected = Limiter.delete(created.id, self.db)
    expect(rows_affected).to_equal(1)

    after = Limiter.by_id(created.id, self.db)
    expect(after).to_be_null()
def test_can_get_limiter_by_url_hash(self):
    # Lookup by the sha512 hex digest of the url succeeds; a bogus hash
    # yields null.
    self.db.query(Limiter).delete()

    created = LimiterFactory.create(url='http://test.com/')

    digest = hashlib.sha512('http://test.com/').hexdigest()
    found = Limiter.by_url_hash(digest, self.db)
    expect(found.id).to_equal(created.id)

    missing = Limiter.by_url_hash('00000000', self.db)
    expect(missing).to_be_null()
def test_can_save_limiters_with_access_token(self):
    # POST /limiters with an X-AUTH-HOLMES header while the outgoing auth
    # request is stubbed with a 200 response, then check the limiter exists.
    for model in (Limiter, User):
        self.db.query(model).delete()

    dt = datetime(2014, 2, 14, 15, 0, 30)

    UserFactory(email='*****@*****.**')

    user_data = dumps({
        'is_superuser': True,
        'fullname': u'Marcelo Jorge Vieira',
        'last_login': dt,
        'email': u'*****@*****.**'
    })
    self.mock_request(code=200, body=user_data)

    payload = dumps({
        'url': 'http://globo.com/',
        'maxValue': 10
    })
    yield self.http_client.fetch(
        self.get_url('/limiters'),
        method='POST',
        body=payload,
        headers={'X-AUTH-HOLMES': '111'}
    )

    stored = Limiter.by_url('http://globo.com/', self.db)
    expect(stored).not_to_be_null()
def post(self):
    """Create or update a limiter for an authenticated caller.

    Reads the ``X-AUTH-HOLMES`` header and authenticates it through
    ``User.authenticate``. Responds 403 when the token is missing or when
    authentication yields no user, and 400 when the body has no ``url`` or
    an explicit null ``value``. Otherwise the limiter is stored, the
    domain-limiters cache key is removed and the result of
    ``Limiter.add_or_update_limiter`` is written as JSON.
    """
    access_token = self.request.headers.get('X-AUTH-HOLMES', None)
    if access_token is None:
        self.set_status(403)
        self.write_json({'reason': 'Empty access token'})
        return

    auth = yield User.authenticate(
        access_token,
        self.application.http_client.fetch,
        self.db,
        self.application.config
    )

    # Fail closed: an empty/None authentication result must be rejected too,
    # not only a result that lacks a user.
    if not auth or auth.get('user', None) is None:
        self.set_status(403)
        self.write_json({'reason': 'Not authorized user.'})
        return

    post_data = loads(self.request.body)
    url = post_data.get('url', None)
    connections = self.application.config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
    value = post_data.get('value', connections)

    # Either field missing is an error. `value` falls back to the configured
    # default, so it is only None when the client explicitly sent null.
    if not url or value is None:
        self.set_status(400)
        self.write_json({'reason': 'Not url or value'})
        return

    result = Limiter.add_or_update_limiter(self.db, url, value)

    yield self.cache.remove_domain_limiters_key()

    self.write_json(result)
def add_domain(cls, url, db, publish_method, config, girl,
               default_violations_values, violation_definitions, cache):
    """Return the Domain that owns ``url``, creating it when none exists.

    An existing domain is matched by name, with or without a trailing
    slash. Creating a domain also expires materials, sends a
    ``new-domain`` message through ``publish_method``, inserts the default
    violation preferences for the domain and registers a limiter with
    ``config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS``.
    """
    from holmes.models import Domain, DomainsViolationsPrefs, Limiter
    from holmes.material import expire_materials

    domain_name, domain_url = get_domain_from_url(url)

    candidate_names = (
        domain_name,
        domain_name.rstrip("/"),
        "%s/" % domain_name,
    )
    name_filter = or_(*[Domain.name == name for name in candidate_names])
    matches = db.query(Domain).filter(name_filter).all()

    domain = matches[0] if matches else None
    if domain:
        return domain

    url_hash = hashlib.sha512(domain_url).hexdigest()
    domain = Domain(url=domain_url, url_hash=url_hash, name=domain_name)
    db.add(domain)
    db.flush()

    expire_materials(girl)

    publish_method(dumps({"type": "new-domain", "domainUrl": str(domain_url)}))

    DomainsViolationsPrefs.insert_default_violations_values_for_domain(
        db,
        domain,
        default_violations_values.keys(),
        violation_definitions,
        cache,
    )

    default_connections = config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
    Limiter.add_or_update_limiter(db, domain_url, default_connections)

    return domain
def test_can_get_all_limiters(self):
    # get_all returns every limiter, and only the matching one when a
    # domain_filter is given.
    self.db.query(Limiter).delete()

    known = LimiterFactory.create(url='http://test.com/')
    for _ in range(2):
        LimiterFactory.create()

    everything = Limiter.get_all(self.db)
    expect(everything).not_to_be_null()
    expect(everything).to_length(3)
    expect(everything).to_include(known)

    DomainFactory.create(name='test.com', url='http://test.com')

    filtered = Limiter.get_all(self.db, domain_filter='test.com')
    expect(filtered).not_to_be_null()
    expect(filtered).to_length(1)
def test_can_delete_limiter_as_superuser(self):
    # A superuser DELETE on /limiters/<id> answers 204 with an empty body
    # and removes the limiter.
    self.db.query(Limiter).delete()
    limiter = LimiterFactory.create()

    self.db.query(User).delete()
    superuser = UserFactory(email='*****@*****.**', is_superuser=True)

    before = Limiter.by_id(limiter.id, self.db)
    expect(before).not_to_be_null()

    limiter_path = '/limiters/%d' % limiter.id
    response = yield self.authenticated_fetch(
        limiter_path,
        user_email=superuser.email,
        method='DELETE'
    )
    expect(response.code).to_equal(204)
    expect(response.body).to_length(0)

    after = Limiter.by_id(limiter.id, self.db)
    expect(after).to_be_null()
def get_limiter_buckets(self, active_domains, avg_links_per_page=10.0):
    """Return ``(limiter, remaining_capacity)`` pairs for the active domains.

    Limiters are taken from ``Limiter.get_limiters_for_domains`` and listed
    in descending url order. The capacity is the limiter value minus its
    current usage, as a float. ``avg_links_per_page`` is not used here.
    """
    limiters = Limiter.get_limiters_for_domains(self.db, active_domains)
    ascending_by_url = sorted(limiters, key=lambda item: item.url)

    return [
        (limiter, float(limiter.value - self.get_limit_usage(limiter.url)))
        for limiter in reversed(ascending_by_url)
    ]
def add_domain(cls, url, db, publish_method, config, girl,
               default_violations_values, violation_definitions, cache):
    """Look up the Domain for ``url`` by name, creating it if necessary.

    The name is compared as given, without a trailing slash and with one
    appended. A newly created domain is flushed, materials are expired, a
    ``new-domain`` message is published, default violation preferences are
    inserted and a limiter with the configured default number of concurrent
    connections is registered. The domain (found or created) is returned.
    """
    from holmes.models import Domain, DomainsViolationsPrefs, Limiter
    from holmes.material import expire_materials

    domain_name, domain_url = get_domain_from_url(url)

    query = db.query(Domain).filter(
        or_(
            Domain.name == domain_name,
            Domain.name == domain_name.rstrip('/'),
            Domain.name == "%s/" % domain_name
        )
    )
    existing = query.all()

    domain = None
    if existing:
        domain = existing[0]

    if not domain:
        domain = Domain(
            url=domain_url,
            url_hash=hashlib.sha512(domain_url).hexdigest(),
            name=domain_name
        )
        db.add(domain)
        db.flush()

        expire_materials(girl)

        announcement = dumps({
            'type': 'new-domain',
            'domainUrl': str(domain_url)
        })
        publish_method(announcement)

        violation_keys = default_violations_values.keys()
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, violation_keys, violation_definitions, cache)

        Limiter.add_or_update_limiter(
            db, domain_url, config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS)

    return domain
def test_get_limiters_for_domains(self):
    # Only the limiter whose url belongs to an active domain is returned.
    for model in (Limiter, Domain):
        self.db.query(model).delete()

    for _ in range(2):
        DomainFactory.create()

    limiter_url = 'http://globo.com/'
    Limiter.add_or_update_limiter(self.db, limiter_url, 2)

    DomainFactory.create(url=limiter_url, name='globo.com')

    all_domains = self.db.query(Domain).all()
    matched = Limiter.get_limiters_for_domains(self.db, all_domains)

    expect(matched).to_length(1)
    expect(str(matched[0])).to_equal(limiter_url)
def test_can_delete_limiter_as_superuser(self):
    # Superusers may delete a limiter: expect 204, no body, and the row gone.
    self.db.query(Limiter).delete()
    target = LimiterFactory.create()

    self.db.query(User).delete()
    admin = UserFactory(email='*****@*****.**', is_superuser=True)

    expect(Limiter.by_id(target.id, self.db)).not_to_be_null()

    response = yield self.authenticated_fetch(
        '/limiters/%d' % target.id,
        method='DELETE',
        user_email=admin.email
    )

    expect(response.code).to_equal(204)
    expect(response.body).to_length(0)

    expect(Limiter.by_id(target.id, self.db)).to_be_null()
def get_limiter_buckets(self, active_domains, avg_links_per_page=10.0):
    """Pair each limiter of the active domains with its free capacity.

    The result is a list of ``(limiter, capacity)`` tuples ordered by
    limiter url, descending; capacity is ``float(value - current usage)``.
    ``avg_links_per_page`` is accepted but not used in this method.
    """
    matching = Limiter.get_limiters_for_domains(self.db, active_domains)
    ordered = sorted(matching, key=lambda item: item.url)[::-1]

    buckets = []
    for limiter in ordered:
        used = self.get_limit_usage(limiter.url)
        buckets.append((limiter, float(limiter.value - used)))

    return buckets
def get_limiters_for_domains(cls, db, active_domains):
    """Return the limiters that match at least one of ``active_domains``.

    Every stored limiter is checked with ``limiter.matches(domain.url)``
    against each active domain. A limiter appears at most once in the
    result, even when it matches several domains, so callers do not see
    duplicated entries for the same limiter.
    """
    from holmes.models import Limiter  # Avoid circular dependency

    return [
        limiter
        for limiter in Limiter.get_all(db)
        # any() stops at the first matching domain, which also prevents the
        # same limiter from being listed once per matching domain.
        if any(limiter.matches(domain.url) for domain in active_domains)
    ]
def delete(self, limiter_id=None):
    """Delete the limiter identified by ``limiter_id``.

    Stops without further action when ``validate_superuser`` fails.
    Responds 400 for a missing id, 404 when no such limiter exists, and
    204 after the limiter is deleted and the domain-limiters cache key is
    removed.
    """
    if not self.validate_superuser():
        return

    if not limiter_id:
        self.set_status(400)
        invalid = {
            'reason': 'Invalid data',
            'description': self._('Invalid data')
        }
        self.write_json(invalid)
        return

    limiter = Limiter.by_id(limiter_id, self.db)

    if not (limiter and limiter.id):
        self.set_status(404)
        not_found = {
            'reason': 'Not Found',
            'description': self._('Not Found')
        }
        self.write_json(not_found)
        return

    Limiter.delete(limiter.id, self.db)

    yield self.cache.remove_domain_limiters_key()

    self.set_status(204)
def get_domain_limiters(self):
    """Return the domain limiters as a list of ``{url: value}`` dicts.

    The ``domain-limiters`` redis entry is used when present. Otherwise the
    list is built from ``Limiter.get_all`` and stored through
    ``set_domain_limiters`` with ``config.LIMITER_VALUES_CACHE_EXPIRATION``.
    When there are no limiters either, the (falsy) redis value is returned
    unchanged.
    """
    cached = self.redis.get('domain-limiters')
    if cached:
        return loads(cached)

    limiters = Limiter.get_all(self.db)
    if not limiters:
        return cached

    domains = [{limiter.url: limiter.value} for limiter in limiters]
    self.set_domain_limiters(
        domains, self.config.LIMITER_VALUES_CACHE_EXPIRATION)

    return domains
def get_domain_limiters(self):
    """Fetch the ``{url: value}`` limiter list, preferring the redis copy.

    A truthy ``domain-limiters`` redis value is decoded with ``loads`` and
    returned. If it is absent, the list is rebuilt from the database and
    written back via ``set_domain_limiters`` using
    ``config.LIMITER_VALUES_CACHE_EXPIRATION``; with no limiters in the
    database the falsy redis value is returned as is.
    """
    raw = self.redis.get('domain-limiters')

    if raw:
        return loads(raw)

    result = raw
    stored_limiters = Limiter.get_all(self.db)

    if stored_limiters:
        result = [{item.url: item.value} for item in stored_limiters]
        expiration = self.config.LIMITER_VALUES_CACHE_EXPIRATION
        self.set_domain_limiters(result, expiration)

    return result
def test_can_save_limiters_as_superuser(self):
    # A superuser POST to /limiters answers 200 and stores the limiter.
    for model in (Limiter, User):
        self.db.query(model).delete()

    superuser = UserFactory(email='*****@*****.**', is_superuser=True)

    payload = dumps({
        'url': 'http://globo.com/',
        'maxValue': 10
    })
    response = yield self.authenticated_fetch(
        '/limiters',
        user_email=superuser.email,
        method='POST',
        body=payload
    )

    expect(response).not_to_be_null()
    expect(response.code).to_equal(200)

    stored = Limiter.by_url('http://globo.com/', self.db)
    expect(stored).not_to_be_null()
def test_cant_delete_limiter_as_normal_user(self):
    # A non-superuser DELETE on /limiters/<id> must raise a 401 HTTPError
    # and must leave the limiter in place.
    self.db.query(Limiter).delete()
    limiter = LimiterFactory.create()

    self.db.query(User).delete()
    user = UserFactory(email='*****@*****.**', is_superuser=False)

    loaded_limiter = Limiter.by_id(limiter.id, self.db)
    expect(loaded_limiter).not_to_be_null()

    try:
        yield self.authenticated_fetch(
            '/limiters/%d' % limiter.id,
            user_email=user.email,
            method='DELETE'
        )
    except HTTPError as e:
        expect(e).not_to_be_null()
        expect(e.code).to_equal(401)
        expect(e.response.reason).to_be_like('Unauthorized')
    else:
        # Without this branch the test would pass silently when the
        # request unexpectedly succeeds.
        assert False, 'Should not have got this far'

    loaded_limiter = Limiter.by_id(limiter.id, self.db)
    expect(loaded_limiter).not_to_be_null()
def test_cant_delete_limiter_as_normal_user(self):
    # Deleting a limiter as a regular user must be refused with 401
    # Unauthorized, and the limiter must still exist afterwards.
    self.db.query(Limiter).delete()
    limiter = LimiterFactory.create()

    self.db.query(User).delete()
    user = UserFactory(email='*****@*****.**', is_superuser=False)

    loaded_limiter = Limiter.by_id(limiter.id, self.db)
    expect(loaded_limiter).not_to_be_null()

    try:
        yield self.authenticated_fetch(
            '/limiters/%d' % limiter.id,
            user_email=user.email,
            method='DELETE'
        )
    except HTTPError as e:
        expect(e).not_to_be_null()
        expect(e.code).to_equal(401)
        expect(e.response.reason).to_be_like('Unauthorized')
    else:
        # A successful response means the permission check is broken; fail
        # explicitly instead of letting the test pass.
        assert False, 'Should not have got this far'

    loaded_limiter = Limiter.by_id(limiter.id, self.db)
    expect(loaded_limiter).not_to_be_null()
def test_can_save_limiters_as_superuser(self):
    # Saving a limiter through POST /limiters as a superuser returns 200
    # and the limiter can then be loaded by url.
    self.db.query(Limiter).delete()
    self.db.query(User).delete()

    admin = UserFactory(email='*****@*****.**', is_superuser=True)

    request_body = dumps({
        'url': 'http://globo.com/',
        'maxValue': 10
    })

    response = yield self.authenticated_fetch(
        '/limiters',
        user_email=admin.email,
        method='POST',
        body=request_body
    )

    expect(response).not_to_be_null()
    expect(response.code).to_equal(200)

    expect(Limiter.by_url('http://globo.com/', self.db)).not_to_be_null()
def get(self):
    """Write a JSON list describing every limiter and its current usage.

    Each entry carries the limiter ``id`` and ``url``, the ``currentValue``
    obtained from the cache, ``maxValue`` and
    ``concurrentRequestsPercentage`` (current / max, or 0 when the limiter
    value is not positive).
    """
    limiters = Limiter.get_all(self.db)

    result = []
    for limit in limiters:
        # The parentheses matter: `yield x or 0` parses as `yield (x or 0)`,
        # which would apply the default to the yielded object instead of to
        # the value sent back into the coroutine.
        current_value = (yield self.cache.get_limit_usage(limit.url)) or 0

        percentage = 0
        if limit.value > 0:
            percentage = float(current_value) / limit.value

        result.append({
            'id': limit.id,
            'url': limit.url,
            'currentValue': current_value,
            'maxValue': limit.value or 0,
            'concurrentRequestsPercentage': percentage
        })

    self.write_json(result)
def post(self):
    """Create or update a limiter on behalf of a superuser.

    Stops without further action when ``validate_superuser`` fails.
    Responds 400 when the JSON body has no ``url`` or an explicit null
    ``value``; ``value`` defaults to
    ``config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS`` when omitted.
    On success the limiter is stored, the domain-limiters cache key is
    removed and the result of ``Limiter.add_or_update_limiter`` is written
    as JSON.
    """
    if not self.validate_superuser():
        return

    post_data = loads(self.request.body)
    url = post_data.get('url', None)
    connections = self.config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
    value = post_data.get('value', connections)

    # Reject when either field is unusable. Requiring both to be missing
    # let a request without a url through, which stored nothing and still
    # answered 200 with a null body.
    if not url or value is None:
        self.set_status(400)
        self.write_json({
            'reason': 'Not url or value',
            'description': self._('Not url or value')
        })
        return

    result = Limiter.add_or_update_limiter(self.db, url, value)

    yield self.cache.remove_domain_limiters_key()

    self.write_json(result)
def test_can_add_or_update_limiter(self):
    # The first call for a url stores its value; a second call for the same
    # url replaces the value.
    self.db.query(Limiter).delete()
    expect(Limiter.get_all(self.db)).to_equal([])

    url = 'http://globo.com/'

    # Add
    Limiter.add_or_update_limiter(self.db, url, 2)
    added = Limiter.by_url(url, self.db)
    expect(added.value).to_equal(2)

    # Update
    Limiter.add_or_update_limiter(self.db, url, 3)
    updated = Limiter.by_url(url, self.db)
    expect(updated.value).to_equal(3)
def add_or_update_limiter(cls, db, url, value):
    """Set the limiter value for ``url``, creating the limiter if needed.

    Returns None without touching the database when ``url`` is empty.
    The limiter is looked up by the sha512 hex digest of the utf-8 encoded
    url. An existing limiter is updated, flushed and committed; otherwise a
    new one is added to the session. The limiter url is returned.
    """
    if not url:
        return

    url = url.encode('utf-8')
    url_hash = hashlib.sha512(url).hexdigest()

    existing = Limiter.by_url_hash(url_hash, db)

    if not existing:
        created = Limiter(url=url, url_hash=url_hash, value=value)
        db.add(created)
        return created.url

    same_row = db.query(Limiter).filter(Limiter.id == existing.id)
    same_row.update({'value': value})

    db.flush()
    db.commit()

    return existing.url
def get(self):
    """Write a JSON list of limiters, optionally filtered by domain.

    The ``domain_filter`` request argument (default None) is passed to
    ``Limiter.get_all``. Each entry carries the limiter ``id`` and ``url``,
    the ``currentValue`` obtained from the cache, ``maxValue`` and
    ``concurrentRequestsPercentage`` (current / max, or 0 when the limiter
    value is not positive).
    """
    limiters = Limiter.get_all(
        self.db,
        domain_filter=self.get_argument('domain_filter', None)
    )

    result = []
    for limit in limiters:
        # Parenthesised on purpose: `yield x or 0` is `yield (x or 0)`, so
        # without them the `or 0` default never applies to the usage value
        # delivered back into the coroutine.
        current_value = (yield self.cache.get_limit_usage(limit.url)) or 0

        percentage = 0
        if limit.value > 0:
            percentage = float(current_value) / limit.value

        result.append({
            'id': limit.id,
            'url': limit.url,
            'currentValue': current_value,
            'maxValue': limit.value or 0,
            'concurrentRequestsPercentage': percentage
        })

    self.write_json(result)
loaded_limiter = Limiter.by_id(limiter.id, self.db) expect(loaded_limiter).not_to_be_null() try: yield self.authenticated_fetch('/limiters/%d' % limiter.id, user_email=user.email, method='DELETE') except HTTPError, e: expect(e).not_to_be_null() expect(e.code).to_equal(401) expect(e.response.reason).to_be_like('Unauthorized') else: assert False, 'Should not have got this far' loaded_limiter = Limiter.by_id(limiter.id, self.db) expect(loaded_limiter).not_to_be_null() @gen_test def test_can_delete_limiter_as_superuser(self): self.db.query(Limiter).delete() limiter = LimiterFactory.create() self.db.query(User).delete() user = UserFactory(email='*****@*****.**', is_superuser=True) loaded_limiter = Limiter.by_id(limiter.id, self.db) expect(loaded_limiter).not_to_be_null() response = yield self.authenticated_fetch('/limiters/%d' % limiter.id, user_email=user.email, method='DELETE')
def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10): from holmes.models import Settings, Worker, Domain, Limiter # Avoid circular dependency page = None lock = None settings = Settings.instance(db) workers = db.query(Worker).all() number_of_workers = len(workers) active_domains = Domain.get_active_domains(db) active_domains_ids = [item.id for item in active_domains] all_domains_pages_in_need_of_review = {} for domain_id in active_domains_ids: pages = db \ .query( Page.uuid, Page.url, Page.score, Page.last_review_date ) \ .filter(Page.domain_id == domain_id) \ .order_by(Page.score.desc())[:number_of_workers] if pages: all_domains_pages_in_need_of_review[domain_id] = pages pages_in_need_of_review = [] current_domain = 0 while all_domains_pages_in_need_of_review: domains = all_domains_pages_in_need_of_review.keys() if current_domain >= len(domains): current_domain = 0 domain_id = domains[current_domain] item = all_domains_pages_in_need_of_review[domain_id].pop(0) pages_in_need_of_review.append(item) if not all_domains_pages_in_need_of_review[domain_id]: del all_domains_pages_in_need_of_review[domain_id] current_domain += 1 if not pages_in_need_of_review: return None if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score: cls.update_pages_score_by(settings, settings.lambda_score, db) for i in range(len(pages_in_need_of_review)): if not Limiter.has_limit_to_work(db, active_domains, pages_in_need_of_review[i].url, avg_links_per_page): continue lock = cache.has_next_job_lock( pages_in_need_of_review[i].url, lock_expiration ) if lock is not None: page = pages_in_need_of_review[i] break if page is None: return None return { 'page': str(page.uuid), 'url': page.url, 'score': page.score, 'lock': lock }
loaded_limiter = Limiter.by_id(limiter.id, self.db) expect(loaded_limiter).not_to_be_null() try: yield self.authenticated_fetch( '/limiters/%d' % limiter.id, user_email=user.email, method='DELETE' ) except HTTPError, e: expect(e).not_to_be_null() expect(e.code).to_equal(401) expect(e.response.reason).to_be_like('Unauthorized') else: assert False, 'Should not have got this far' loaded_limiter = Limiter.by_id(limiter.id, self.db) expect(loaded_limiter).not_to_be_null() @gen_test def test_can_delete_limiter_as_superuser(self): self.db.query(Limiter).delete() limiter = LimiterFactory.create() self.db.query(User).delete() user = UserFactory(email='*****@*****.**', is_superuser=True) loaded_limiter = Limiter.by_id(limiter.id, self.db) expect(loaded_limiter).not_to_be_null() response = yield self.authenticated_fetch( '/limiters/%d' % limiter.id, user_email=user.email, method='DELETE'
def _verify_workers_limits(self, url, avg_links_per_page=10):
    """Return ``LimiterModel.has_limit_to_work`` for ``url``.

    The check is made against the currently active domains.
    """
    domains = Domain.get_active_domains(self.db)

    return LimiterModel.has_limit_to_work(
        self.db,
        domains,
        url,
        avg_links_per_page
    )