def test_can_get_domains_details(self):
    # Wipe all domains so the aggregate query starts from a clean slate.
    self.db.query(Domain).delete()

    details = Domain.get_domains_details(self.db)
    expect(details).to_length(0)

    domain = DomainFactory.create(name='domain-1.com', url='http://domain-1.com/')
    domain2 = DomainFactory.create(name='domain-2.com', url='http://domain-2.com/')
    # Third domain with no pages/reviews/requests attached.
    DomainFactory.create()

    page = PageFactory.create(domain=domain)
    page2 = PageFactory.create(domain=domain)
    page3 = PageFactory.create(domain=domain2)

    ReviewFactory.create(domain=domain, page=page, is_active=True, number_of_violations=20)
    ReviewFactory.create(domain=domain, page=page2, is_active=True, number_of_violations=10)
    ReviewFactory.create(domain=domain2, page=page3, is_active=True, number_of_violations=30)

    # For domain-1: two requests with status < 400 and three with error
    # statuses (400/403/404) -> errorPercentage of 60.0 below.
    RequestFactory.create(status_code=200, domain_name=domain.name, response_time=0.25)
    RequestFactory.create(status_code=304, domain_name=domain.name, response_time=0.35)
    RequestFactory.create(status_code=400, domain_name=domain.name, response_time=0.25)
    RequestFactory.create(status_code=403, domain_name=domain.name, response_time=0.35)
    RequestFactory.create(status_code=404, domain_name=domain.name, response_time=0.25)

    details = Domain.get_domains_details(self.db)

    expect(details).to_length(3)
    expect(details[0]).to_length(10)
    expect(details[0]['url']).to_equal('http://domain-1.com/')
    expect(details[0]['name']).to_equal('domain-1.com')
    expect(details[0]['violationCount']).to_equal(30)
    expect(details[0]['pageCount']).to_equal(2)
    expect(details[0]['reviewCount']).to_equal(2)
    expect(details[0]['reviewPercentage']).to_equal(100.0)
    expect(details[0]['errorPercentage']).to_equal(60.0)
    expect(details[0]['is_active']).to_be_true()
    # NOTE(review): 0.3 equals the mean of the two non-error requests
    # (0.25, 0.35); presumably averageResponseTime ignores error
    # responses -- confirm against get_domains_details.
    expect(details[0]['averageResponseTime']).to_equal(0.3)
def save_request(self, url, response):
    """Persist a Request row for *url* and broadcast a 'new-request' event.

    Silently does nothing when there is no response, or when the URL's
    domain is not one of the domains known to the database.
    """
    if not response:
        return

    domain_name, domain_url = get_domain_from_url(url)
    known_domains = Domain.get_domain_names(self.db)
    if domain_name not in known_domains:
        return

    record = Request(
        domain_name=domain_name,
        url=url,
        effective_url=response.effective_url,
        status_code=int(response.status_code),
        response_time=response.request_time,
        completed_date=datetime.now().date(),
        review_url=self.page_url
    )
    self.db.add(record)

    self.cache.increment_requests_count()

    self.publish(dumps({
        'type': 'new-request',
        'url': str(url)
    }))
def get(self, domain_name):
    """Render per-category violation counts for *domain_name* as JSON.

    Responds 404 when the domain does not exist.
    """
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, 'Domain %s not found' % domain_name)
        return

    violation_defs = self.application.violation_definitions

    # The material may not be loaded yet and return None; fall back to an
    # empty mapping (as the localized sibling handler does) so the
    # .get() below cannot raise AttributeError.
    grouped_violations = self.girl.get(
        'violation_count_by_category_for_domains') or {}

    total = 0
    violations = []
    for item in grouped_violations.get(domain.id, []):
        key_name, key_category_id, count = \
            item['key_name'], item['category_id'], item['violation_count']
        violations.append({
            'categoryId': key_category_id,
            'categoryName': violation_defs[key_name]['category'],
            'count': count
        })
        total += count

    result = {
        "domainId": domain.id,
        'domainName': domain.name,
        'domainURL': domain.url,
        'total': total,
        'violations': violations
    }

    self.write_json(result)
def get(self, domain_name):
    """Paginated JSON of a domain's active reviews (search provider)."""
    term = self.get_argument('term', None)
    current_page = int(self.get_argument('current_page', 1))
    page_size = int(self.get_argument('page_size', 10))

    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    reviews = yield self.application.search_provider.get_domain_active_reviews(
        domain=domain,
        current_page=current_page,
        page_size=page_size,
        page_filter=term,
    )

    if 'error' in reviews:
        error = reviews['error']
        self.set_status(error['status_code'], error['reason'])
        self.finish()
        return

    if 'reviewsCount' not in reviews:
        if term:
            # Filtered results: total count is unknown.
            reviews['reviewsCount'] = None
        else:
            reviews['reviewsCount'] = \
                yield self.cache.get_active_review_count(domain)

    self.write_json(reviews)
    self.finish()
def save_request(self, url, response):
    """Store a Request row for *url* under this instance's domain and
    publish a 'new-request' event."""
    if not response:
        return

    if self.domain_name not in Domain.get_domain_names(self.db):
        return

    record = Request(
        domain_name=self.domain_name,
        url=url,
        effective_url=response.effective_url,
        status_code=int(response.status_code),
        response_time=response.request_time,
        completed_date=datetime.now().date(),
        review_url=self.page_url
    )
    self.db.add(record)

    url = url.encode('utf-8')
    self.publish(dumps({
        'type': 'new-request',
        'url': str(url)
    }))
def get(self, domain_name):
    """Return the violation preferences configured for a domain as JSON."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    prefs = DomainsViolationsPrefs.get_domains_violations_prefs_by_domain(
        self.db, domain.name
    )
    violation_defs = self.application.violation_definitions

    result = []
    for pref in prefs:
        definition = violation_defs.get(pref.get('key'))
        if definition is None:
            # Preference refers to a violation key that is no longer defined.
            continue
        result.append({
            'key': pref.get('key'),
            'title': definition.get('default_value_description', None),
            'category': definition.get('category', None),
            'value': pref.get('value'),
            'default_value': definition.get('default_value', None),
            'unit': definition.get('unit', None)
        })

    self.write_json(result)
def get_count(self, key, domain_name, expiration, get_count_method):
    """Return an integer count for (domain, key), reading it from redis
    or computing and caching it with a TTL of *expiration* seconds."""
    cache_key = '%s-%s' % (self.get_domain_name(domain_name), key)

    cached = self.redis.get(cache_key)
    if cached is not None:
        return int(cached)

    domain = domain_name
    if domain and not isinstance(domain, Domain):
        domain = Domain.get_domain_by_name(domain_name, self.db)

    if domain is None:
        # No specific domain: fall back to the global page count.
        count = Page.get_page_count(self.db)
    else:
        count = get_count_method(domain)

    cache_key = '%s-%s' % (self.get_domain_name(domain), key)
    self.redis.setex(
        cache_key,
        expiration,
        value=int(count)
    )

    return int(count)
def test_can_insert_default_violations_values_for_all_domains(self):
    # One domain ('globo.com') already has a custom preference stored.
    DomainsViolationsPrefsFactory.create(
        domain=Domain(name='globo.com'),
        key=Key(name='some.random.fact'),
        value='whatever')

    # Three additional domains with no preferences at all.
    for x in range(3):
        DomainFactory.create(name='g%d.com' % x)

    domains_violations_prefs = \
        DomainsViolationsPrefs.get_domains_violations_prefs(self.db)
    expect(domains_violations_prefs).to_length(1)

    default_violations_values = {
        'page.title.size': 100,
        'total.requests.img': 5,
    }

    page_title_size = KeyFactory.create(name='page.title.size')
    total_requests_img = KeyFactory.create(name='total.requests.img')

    violation_definitions = {
        'page.title.size': {
            'key': page_title_size,
            'default_value': 100
        },
        'total.requests.img': {
            'key': total_requests_img,
            'default_value': 5
        }
    }

    DomainsViolationsPrefs.insert_default_violations_values_for_all_domains(
        self.db, default_violations_values, violation_definitions, self.cache)

    # After seeding: every domain has the two defaults, and the
    # pre-existing custom preference on globo.com is preserved.
    domains_violations_prefs = \
        DomainsViolationsPrefs.get_domains_violations_prefs(self.db)
    expect(domains_violations_prefs).to_length(4)
    expect(domains_violations_prefs).to_be_like({
        'globo.com': {
            'some.random.fact': 'whatever',
            'total.requests.img': 5,
            'page.title.size': 100
        },
        'g0.com': {'page.title.size': 100, 'total.requests.img': 5},
        'g1.com': {'page.title.size': 100, 'total.requests.img': 5},
        'g2.com': {'page.title.size': 100, 'total.requests.img': 5},
    })
def get(self, domain_name, key_category_id):
    """JSON with the top violations of one category for a domain."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, 'Domain %s not found' % domain_name)
        return

    violation_defs = self.application.violation_definitions
    limit = self.application.config.get('TOP_CATEGORY_VIOLATIONS_LIMIT')
    top_violations = yield self.cache.get_top_in_category_for_domain(
        domain, key_category_id, limit
    )

    violations = [
        {'title': violation_defs[key_name]['title'], 'count': count}
        for key_name, count in top_violations
    ]

    self.write_json({
        "domainId": domain.id,
        'domainName': domain.name,
        'domainURL': domain.url,
        'categoryId': key_category_id,
        'violations': violations
    })
def post(self, domain_name):
    """Toggle the is_active flag of the given domain (404 if unknown)."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, 'Domain %s not found' % domain_name)
        return

    domain.is_active = not domain.is_active
def test_domains_violations_prefs_str(self):
    """str() of a pref renders as 'key (domain): value'."""
    pref = DomainsViolationsPrefsFactory.create(
        domain=Domain(name='globo.com'),
        key=Key(name='some.random.fact'),
        value='whatever')

    loaded = self.db.query(DomainsViolationsPrefs).get(pref.id)

    expect(str(loaded)).to_be_like('some.random.fact (globo.com): whatever')
def test_can_get_active_domains(self):
    """Only domains with is_active=True are returned."""
    self.db.query(Domain).delete()

    active = DomainFactory(is_active=True)
    DomainFactory(is_active=False)  # must be excluded

    result = Domain.get_active_domains(self.db)

    expect(result).to_length(1)
    expect(result[0].id).to_equal(active.id)
def test_get_domain_names(self):
    """get_domain_names returns the names of all stored domains."""
    self.db.query(Domain).delete()

    for name in ("g1.globo.com", "globoesporte.globo.com"):
        DomainFactory.create(name=name)

    names = Domain.get_domain_names(self.db)

    expect(names).to_be_like(
        ['g1.globo.com', 'globoesporte.globo.com'])
def test_can_set_domain_to_inactive(self):
    """POSTing change-status flips an active domain to inactive."""
    domain = DomainFactory.create(
        url="http://www.domain.com", name="domain.com", is_active=True)

    response = yield self.authenticated_fetch(
        '/domains/%s/change-status/' % domain.name, method='POST', body='')
    expect(response.code).to_equal(200)

    reloaded = Domain.get_domain_by_name(domain.name, self.db)
    expect(reloaded.is_active).to_be_false()
def post(self, domain_name):
    """Flip a domain's active flag; purge limiter usage on deactivation."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    domain.is_active = not domain.is_active

    if not domain.is_active:
        # A deactivated domain must stop consuming limiter quota.
        yield self.cache.delete_limit_usage_by_domain(domain.url)
def test_can_set_domain_to_active(self):
    """POSTing change-status flips an inactive domain to active."""
    domain = DomainFactory.create(
        url="http://www.domain.com", name="domain.com", is_active=False)

    response = yield self.http_client.fetch(
        self.get_url(r'/domains/%s/change-status/' % domain.name),
        method='POST',
        body=''
    )
    expect(response.code).to_equal(200)

    reloaded = Domain.get_domain_by_name(domain.name, self.db)
    expect(reloaded.is_active).to_be_true()
def test_can_convert_to_dict(self):
    """to_dict exposes domain name, key name and value."""
    pref = DomainsViolationsPrefsFactory.create(
        domain=Domain(name='globo.com'),
        key=Key(name='some.random.fact'),
        value='whatever')

    expect(pref.to_dict()).to_be_like({
        'domain': 'globo.com',
        'key': 'some.random.fact',
        'value': 'whatever',
    })
def test_can_create_domains_violations_prefs(self):
    """A created pref round-trips through the database intact."""
    created = DomainsViolationsPrefsFactory.create(
        domain=Domain(name='globo.com'),
        key=Key(name='some.random.fact'),
        value='whatever')

    loaded = self.db.query(DomainsViolationsPrefs).get(created.id)

    expect(loaded.domain.name).to_equal('globo.com')
    expect(loaded.key.name).to_equal('some.random.fact')
    expect(loaded.value).to_equal('whatever')
def test_can_set_domain_to_inactive(self):
    """change-status deactivates an active domain."""
    active_domain = DomainFactory.create(
        url="http://www.domain.com", name="domain.com", is_active=True)

    response = yield self.authenticated_fetch(
        '/domains/%s/change-status/' % active_domain.name,
        method='POST',
        body=''
    )
    expect(response.code).to_equal(200)

    refreshed = Domain.get_domain_by_name(active_domain.name, self.db)
    expect(refreshed.is_active).to_be_false()
def test_get_domain_names(self):
    """All stored domain names come back from get_domain_names."""
    self.db.query(Domain).delete()

    DomainFactory.create(name="g1.globo.com")
    DomainFactory.create(name="globoesporte.globo.com")

    expect(Domain.get_domain_names(self.db)).to_be_like([
        'g1.globo.com',
        'globoesporte.globo.com'
    ])
def test_can_get_domains_violations_prefs(self):
    """Prefs come back keyed by domain name, then violation key."""
    DomainsViolationsPrefsFactory.create(
        domain=Domain(name='globo.com'),
        key=Key(name='some.random.fact'),
        value='whatever')

    prefs = DomainsViolationsPrefs.get_domains_violations_prefs(self.db)

    expect(prefs).to_be_like({
        'globo.com': {'some.random.fact': 'whatever'}
    })
def get_next_jobs_count(cls, db, config):
    """Count pages belonging to active domains (candidate review jobs)."""
    from holmes.models import Domain

    active_ids = [domain.id for domain in Domain.get_active_domains(db)]

    return db.query(sa.func.count(Page.id)) \
        .filter(Page.domain_id.in_(active_ids)) \
        .scalar()
def handle(has_key):
    # Closure invoked once we know whether the redis counter key exists.
    # Free variables (domain_name, key, increment, callback,
    # get_default_method, self) come from the enclosing scope.
    domain = domain_name
    if domain and not isinstance(domain, Domain):
        domain = Domain.get_domain_by_name(domain_name, self.db)
    if has_key:
        # Counter already seeded: bump it atomically.
        self.redis.incrby(key, increment, callback=callback)
    else:
        # First use: seed the counter from the database baseline.
        # NOTE(review): the `- 1` presumably compensates for the row
        # being counted twice (db value already includes this event) --
        # confirm against the callers of this cache.
        if domain is None:
            value = Page.get_page_count(self.db) + increment - 1
        else:
            value = get_default_method(domain) + increment - 1
        self.redis.set(key, value, callback=callback)
def test_can_get_pages_per_domain(self):
    """Page totals are grouped by domain id; empty domains are absent."""
    domain = DomainFactory.create()
    domain2 = DomainFactory.create()
    DomainFactory.create()  # no pages -> must not appear in the result

    for _ in range(2):
        PageFactory.create(domain=domain)
    for _ in range(3):
        PageFactory.create(domain=domain2)

    pages_per_domain = Domain.get_pages_per_domain(self.db)

    expect(pages_per_domain).to_be_like({domain.id: 2, domain2.id: 3})
def get(self, key_name):
    """Paginated JSON of reviews containing violation *key_name*,
    optionally filtered by domain and page URL."""
    current_page = int(self.get_argument('current_page', 1))
    page_size = int(self.get_argument('page_size', 10))
    domain_filter = self.get_argument('domain_filter', None)
    page_filter = self.get_argument('page_filter', None)

    domain = None
    if domain_filter is not None:
        domain = Domain.get_domain_by_name(domain_filter, self.db)
        if not domain:
            self.set_status(
                404, self._('Domain %s not found') % domain_filter)
            self.finish()
            return

    violations = self.application.violation_definitions
    if key_name not in violations:
        self.set_status(404, self._('Invalid violation key %s') % key_name)
        self.finish()
        return

    definition = violations[key_name]
    violation_title = definition['title']
    key_id = definition['key'].id

    violation = yield self.application.search_provider.get_by_violation_key_name(
        key_id=key_id,
        current_page=current_page,
        page_size=page_size,
        domain=domain,
        page_filter=page_filter,
    )

    if 'error' in violation:
        error = violation['error']
        self.set_status(error['status_code'], error['reason'])
        self.finish()
        return

    if 'reviewsCount' not in violation:
        if domain or page_filter:
            # Filtered searches cannot reuse the global count.
            violation['reviewsCount'] = None
        else:
            violation['reviewsCount'] = Review.count_by_violation_key_name(
                self.db, key_id)

    violation['title'] = violation_title
    self.write_json(violation)
    self.finish()
def get(self, domain_name):
    """Full JSON details for one domain, read from the cached
    'domains_details' material plus homepage info from the database."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    all_details = self.girl.get('domains_details') or []
    data = None
    for entry in all_details:
        if entry['name'] == domain_name:
            data = entry
            break

    if not data:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    domain_json = {
        "id": domain.id,
        "name": domain.name,
        "url": domain.url,
        "pageCount": data.get('pageCount', 0),
        "reviewCount": data.get('reviewCount', 0),
        "violationCount": data.get('violationCount', 0),
        "reviewPercentage": data.get('reviewPercentage', 0),
        "is_active": domain.is_active,
        "errorPercentage": data.get('errorPercentage', 0),
        "averageResponseTime": data.get('averageResponseTime', 0),
        "homepageId": "",
        "homepageReviewId": "",
    }

    homepage = domain.get_homepage(self.db)
    if homepage:
        if homepage.uuid:
            domain_json["homepageId"] = str(homepage.uuid)
        if homepage.last_review_uuid:
            domain_json["homepageReviewId"] = str(homepage.last_review_uuid)

    self.write_json(domain_json)
def get(self, domain_name):
    """Daily violation counts for a domain as JSON (localized 404)."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    self.write_json({
        "id": domain.id,
        "name": domain.name,
        "url": domain.url,
        "violations": domain.get_violations_per_day(self.db)
    })
def test_can_get_pages_per_domain(self):
    """get_pages_per_domain maps domain id -> number of pages."""
    first = DomainFactory.create()
    second = DomainFactory.create()
    DomainFactory.create()

    PageFactory.create(domain=first)
    PageFactory.create(domain=first)
    PageFactory.create(domain=second)
    PageFactory.create(domain=second)
    PageFactory.create(domain=second)

    expect(Domain.get_pages_per_domain(self.db)).to_be_like({
        first.id: 2,
        second.id: 3
    })
def get(self, domain_name):
    """Daily violation counts for a domain as JSON."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, 'Domain %s not found' % domain_name)
        return

    per_day = domain.get_violations_per_day(self.db)

    payload = {
        "id": domain.id,
        "name": domain.name,
        "url": domain.url,
        "violations": per_day
    }
    self.write_json(payload)
def increment_count(self, key, domain_name, get_default_method, increment=1):
    """Increment the per-domain redis counter *key*, seeding it from the
    database baseline on first use."""
    key = '%s-%s' % (self.get_domain_name(domain_name), key)
    has_key = self.has_key(key)

    domain = domain_name
    if domain and not isinstance(domain, Domain):
        domain = Domain.get_domain_by_name(domain_name, self.db)

    if has_key:
        self.redis.incrby(key, increment)
        return

    # First touch: compute the baseline and fold this increment in
    # (the -1 matches the seeding used by the async counter variant).
    if domain is None:
        base = Page.get_page_count(self.db)
    else:
        base = get_default_method(domain)
    self.redis.set(key, base + increment - 1)
def add_domain(cls, url, db, publish_method, config, girl,
               default_violations_values, violation_definitions, cache):
    """Return the Domain for *url*, creating it if it does not exist.

    On creation: persists the domain, expires cached materials,
    publishes a 'new-domain' event, seeds default violation prefs and
    registers a connection limiter for the domain URL.
    """
    from holmes.models import Domain, DomainsViolationsPrefs
    from holmes.material import expire_materials

    domain_name, domain_url = get_domain_from_url(url)

    # Tolerate trailing-slash variations of the stored domain name.
    domains = db.query(Domain).filter(
        or_(
            Domain.name == domain_name,
            Domain.name == domain_name.rstrip('/'),
            Domain.name == "%s/" % domain_name
        )
    ).all()

    if not domains:
        domain = None
    else:
        domain = domains[0]

    if not domain:
        url_hash = hashlib.sha512(domain_url).hexdigest()
        domain = Domain(url=domain_url, url_hash=url_hash, name=domain_name)
        db.add(domain)
        db.flush()  # ensure the row exists before dependent inserts below

        # A new domain invalidates the cached materials.
        expire_materials(girl)

        publish_method(dumps({
            'type': 'new-domain',
            'domainUrl': str(domain_url)
        }))

        keys = default_violations_values.keys()
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, keys, violation_definitions, cache)

        # Imported here (like the models above) to avoid a circular import.
        from holmes.models import Limiter
        connections = config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
        Limiter.add_or_update_limiter(db, domain_url, connections)

    return domain
def test_can_get_violations_per_domain(self):
    """Violation totals are summed per domain (40+20 and 15+25+50)."""
    domain = DomainFactory.create()
    domain2 = DomainFactory.create()
    DomainFactory.create()  # no reviews -> absent from result

    page = PageFactory.create(domain=domain)
    page2 = PageFactory.create(domain=domain)
    page3 = PageFactory.create(domain=domain2)
    page4 = PageFactory.create(domain=domain2)
    page5 = PageFactory.create(domain=domain2)

    review_specs = [
        (domain, page, 40),
        (domain, page2, 20),
        (domain2, page3, 15),
        (domain2, page4, 25),
        (domain2, page5, 50),
    ]
    for owner, owned_page, violations in review_specs:
        ReviewFactory.create(
            domain=owner, page=owned_page,
            number_of_violations=violations,
            is_active=True, is_complete=True)

    violations_per_domain = Domain.get_violations_per_domain(self.db)

    expect(violations_per_domain).to_be_like({
        domain.id: 60,
        domain2.id: 90
    })
def get(self, domain_name):
    """JSON summary (counts, percentages, response time) for a domain."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, 'Domain %s not found' % domain_name)
        return

    page_count = yield self.cache.get_page_count(domain)
    review_count = yield self.cache.get_active_review_count(domain)
    violation_count = yield self.cache.get_violation_count(domain)
    bad_request_count = yield self.cache.get_bad_request_count(domain)
    good_request_count = yield self.cache.get_good_request_count(domain)

    total_request_count = good_request_count + bad_request_count
    error_percentage = 0
    if total_request_count > 0:
        error_percentage = round(
            float(bad_request_count) / total_request_count * 100, 2)

    response_time_avg = yield self.cache.get_response_time_avg(domain)
    status_code_info = Request.get_status_code_info(domain_name, self.db)

    review_percentage = 0
    if page_count > 0:
        review_percentage = round(float(review_count) / page_count * 100, 2)

    domain_json = {
        "id": domain.id,
        "name": domain.name,
        "url": domain.url,
        "pageCount": page_count,
        "reviewCount": review_count,
        "violationCount": violation_count,
        "reviewPercentage": review_percentage,
        "is_active": domain.is_active,
        "statusCodeInfo": status_code_info,
        "errorPercentage": error_percentage,
        "averageResponseTime": response_time_avg,
    }

    self.write_json(domain_json)
def get(self, key_name):
    """Paginated JSON of reviews containing violation *key_name*."""
    current_page = int(self.get_argument("current_page", 1))
    page_size = int(self.get_argument("page_size", 10))
    domain_filter = self.get_argument("domain_filter", None)
    page_filter = self.get_argument("page_filter", None)

    domain = None
    if domain_filter is not None:
        domain = Domain.get_domain_by_name(domain_filter, self.db)
        if not domain:
            self.set_status(
                404, self._("Domain %s not found") % domain_filter)
            self.finish()
            return

    definitions = self.application.violation_definitions
    if key_name not in definitions:
        self.set_status(404, self._("Invalid violation key %s") % key_name)
        self.finish()
        return

    violation_title = definitions[key_name]["title"]
    key_id = definitions[key_name]["key"].id

    violation = yield self.application.search_provider.get_by_violation_key_name(
        key_id=key_id,
        current_page=current_page,
        page_size=page_size,
        domain=domain,
        page_filter=page_filter
    )

    if "error" in violation:
        self.set_status(
            violation["error"]["status_code"],
            violation["error"]["reason"])
        self.finish()
        return

    if "reviewsCount" not in violation:
        violation["reviewsCount"] = None
        if not domain and not page_filter:
            violation["reviewsCount"] = Review.count_by_violation_key_name(
                self.db, key_id)

    violation["title"] = violation_title
    self.write_json(violation)
    self.finish()
def get_next_job_list(cls, db, expiration, current_page=1, page_size=200):
    """Return one page of (uuid, url, score, last_review_date) rows for
    pages of active domains, ordered by descending score.

    Note: *expiration* is kept for interface compatibility and not used
    in the query itself.
    """
    from holmes.models import Domain

    start = (current_page - 1) * page_size
    end = start + page_size

    active_ids = [d.id for d in Domain.get_active_domains(db)]

    pages_query = db.query(
        Page.uuid,
        Page.url,
        Page.score,
        Page.last_review_date
    ).filter(Page.domain_id.in_(active_ids)).order_by(Page.score.desc())

    return pages_query[start:end]
def post(self, domain_name):
    """Persist violation-preference overrides for a domain (superuser only)."""
    if not self.validate_superuser():
        return

    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    post_data = loads(self.request.body)
    DomainsViolationsPrefs.update_by_domain(
        self.db, self.cache, domain, post_data
    )

    saved_message = self._('Preferences successfully saved!')
    self.write_json({
        'reason': saved_message,
        'description': saved_message
    })
def get(self, domain_name, key_category_id):
    """Top violations of one key category for a domain, read from the
    'top_violations_in_category_for_domains' material."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    key_category = KeysCategory.get_by_id(self.db, key_category_id)
    if not key_category:
        self.set_status(
            404, self._('Key category %s not found') % key_category_id)
        return

    violation_defs = self.application.violation_definitions
    material = self.girl.get('top_violations_in_category_for_domains') or {}
    category_violations = material.get(domain_name, {}).get(key_category.id, [])

    violations = []
    for entry in category_violations:
        key_name = entry['key_name']
        violations.append({
            'title': self._(violation_defs[key_name]['title']),
            'count': entry['count'],
            'key': key_name,
        })

    self.write_json({
        "domainId": domain.id,
        'domainName': domain.name,
        'domainURL': domain.url,
        'categoryId': key_category_id,
        'violations': violations
    })
def insert_default_violations_values_for_all_domains(
        cls, db, default_violations_values, violation_definitions, cache):
    """Seed each domain's missing default violation preferences."""
    from holmes.models import Domain

    current_prefs = DomainsViolationsPrefs.get_domains_violations_prefs(db)

    for domain in Domain.get_all_domains(db):
        existing = current_prefs.get(domain.name, None)
        if existing:
            # Only fill in the defaults the domain does not have yet.
            missing_keys = set(default_violations_values.keys()) - \
                set(existing.keys())
        else:
            missing_keys = default_violations_values.keys()
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, missing_keys, violation_definitions, cache)
def get_count(self, key, domain_name, expiration, get_count_method):
    """Fetch-or-compute an integer count cached in redis under
    '<domain>-<key>' with a TTL of *expiration* seconds."""
    cache_key = '%s-%s' % (self.get_domain_name(domain_name), key)
    cached_value = self.redis.get(cache_key)

    if cached_value is None:
        domain = domain_name
        if domain and not isinstance(domain, Domain):
            domain = Domain.get_domain_by_name(domain_name, self.db)

        if domain is None:
            cached_value = Page.get_page_count(self.db)
        else:
            cached_value = get_count_method(domain)

        cache_key = '%s-%s' % (self.get_domain_name(domain), key)
        self.redis.setex(cache_key, expiration, value=int(cached_value))

    return int(cached_value)
def get(self, domain_name):
    """Active reviews for a domain straight from the database, with an
    optional URL-prefix filter (*term*)."""
    term = self.get_argument('term', None)
    current_page = int(self.get_argument('current_page', 1))
    page_size = int(self.get_argument('page_size', 10))

    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, 'Domain %s not found' % domain_name)
        return

    reviews = domain.get_active_reviews(
        self.db,
        url_starts_with=term,
        current_page=current_page,
        page_size=page_size
    )

    if term:
        review_count = len(reviews)
    else:
        review_count = yield self.cache.get_active_review_count(domain)

    pages = []
    for page in reviews:
        pages.append({
            "url": page.url,
            "uuid": str(page.uuid),
            "violationCount": page.violations_count,
            "completedAt": page.last_review_date,
            "reviewId": str(page.last_review_uuid)
        })

    self.write_json({
        'reviewCount': review_count,
        'pages': pages,
    })
def get(self, domain_name):
    """Localized per-category violation counts for a domain as JSON."""
    domain = Domain.get_domain_by_name(domain_name, self.db)
    if not domain:
        self.set_status(404, self._('Domain %s not found') % domain_name)
        return

    violation_defs = self.application.violation_definitions
    grouped = self.girl.get('violation_count_by_category_for_domains') or {}

    total = 0
    violations = []
    for item in grouped.get(domain.id, []):
        count = item['violation_count']
        violations.append({
            'categoryId': item['category_id'],
            'categoryName': self._(
                violation_defs[item['key_name']]['category']),
            'count': count
        })
        total += count

    self.write_json({
        "domainId": domain.id,
        'domainName': domain.name,
        'domainURL': domain.url,
        'total': total,
        'violations': violations
    })
def insert_default_violations_values_for_all_domains(
        cls, db, default_violations_values, violation_definitions, cache):
    """Ensure every domain has a pref entry for each default violation key."""
    from holmes.models import Domain

    prefs_by_domain = DomainsViolationsPrefs.get_domains_violations_prefs(db)
    all_defaults = default_violations_values.keys()

    for domain in Domain.get_all_domains(db):
        known = prefs_by_domain.get(domain.name, None)
        if known:
            keys = set(all_defaults) - set(known.keys())
        else:
            keys = all_defaults
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, keys, violation_definitions, cache
        )
def _verify_workers_limits(self, url, avg_links_per_page=10):
    """True when the limiter still allows work on *url* for active domains."""
    domains = Domain.get_active_domains(self.db)
    return LimiterModel.has_limit_to_work(
        self.db, domains, url, avg_links_per_page)
def load_all_domains_violations_prefs(self):
    """Warm the violations-prefs cache for every known domain."""
    from holmes.models import Domain

    domains = Domain.get_all_domains(self.db)
    for domain in domains:
        self.cache.get_domain_violations_prefs(domain.name)
def get_domain(self, domain_name):
    """Resolve *domain_name* to a Domain (pass through if already one)."""
    if domain_name and not isinstance(domain_name, Domain):
        return Domain.get_domain_by_name(domain_name, self.db)
    return domain_name
def test_can_get_domain_by_name(self):
    """A stored domain can be looked up by its name."""
    created = DomainFactory.create()

    loaded = Domain.get_domain_by_name(created.name, self.db)

    expect(loaded.id).to_equal(created.id)
def test_invalid_domain_returns_None(self):
    """Looking up an unknown domain name yields None."""
    missing = Domain.get_domain_by_name('domain-details.com', self.db)
    expect(missing).to_be_null()
def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10): from holmes.models import Settings, Worker, Domain, Limiter # Avoid circular dependency page = None lock = None settings = Settings.instance(db) workers = db.query(Worker).all() number_of_workers = len(workers) active_domains = Domain.get_active_domains(db) active_domains_ids = [item.id for item in active_domains] all_domains_pages_in_need_of_review = {} for domain_id in active_domains_ids: pages = db \ .query( Page.uuid, Page.url, Page.score, Page.last_review_date ) \ .filter(Page.domain_id == domain_id) \ .order_by(Page.score.desc())[:number_of_workers] if pages: all_domains_pages_in_need_of_review[domain_id] = pages pages_in_need_of_review = [] current_domain = 0 while all_domains_pages_in_need_of_review: domains = all_domains_pages_in_need_of_review.keys() if current_domain >= len(domains): current_domain = 0 domain_id = domains[current_domain] item = all_domains_pages_in_need_of_review[domain_id].pop(0) pages_in_need_of_review.append(item) if not all_domains_pages_in_need_of_review[domain_id]: del all_domains_pages_in_need_of_review[domain_id] current_domain += 1 if not pages_in_need_of_review: return None if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score: cls.update_pages_score_by(settings, settings.lambda_score, db) for i in range(len(pages_in_need_of_review)): if not Limiter.has_limit_to_work(db, active_domains, pages_in_need_of_review[i].url, avg_links_per_page): continue lock = cache.has_next_job_lock( pages_in_need_of_review[i].url, lock_expiration ) if lock is not None: page = pages_in_need_of_review[i] break if page is None: return None return { 'page': str(page.uuid), 'url': page.url, 'score': page.score, 'lock': lock }
def fill_job_bucket(self, expiration, look_ahead_pages=1000, avg_links_per_page=10.0):
    """Refill the 'next-job-bucket' in redis with pages whose review has
    expired, interleaving pages across active domains round-robin and
    honouring each limiter's remaining budget.

    Guarded by a redis lock so only one process refills at a time; a
    concurrent run that cannot get the lock simply skips its turn.
    """
    try:
        with Lock('next-job-fill-bucket-lock', redis=self.redis):
            logging.info('Refilling job bucket. Lock acquired...')

            # Pages last reviewed before this instant are due again.
            expired_time = datetime.utcnow() - timedelta(seconds=expiration)

            active_domains = Domain.get_active_domains(self.db)
            if not active_domains:
                return

            active_domains_ids = [item.id for item in active_domains]

            # (limiter, remaining-slots) pairs used to throttle below.
            limiter_buckets = self.get_limiter_buckets(
                active_domains, avg_links_per_page)

            # One candidate list per domain, consumed round-robin.
            all_domains_pages_in_need_of_review = []
            for domain_id in active_domains_ids:
                pages = self.db \
                    .query(
                        Page.uuid,
                        Page.url,
                        Page.score,
                        Page.last_review_date
                    ) \
                    .filter(Page.domain_id == domain_id) \
                    .filter(or_(
                        Page.last_review_date == None,
                        Page.last_review_date <= expired_time
                    ))[:look_ahead_pages]

                if pages:
                    all_domains_pages_in_need_of_review.append(pages)

            logging.debug(
                'Total of %d pages found to add to redis.' % (sum([
                    len(item)
                    for item in all_domains_pages_in_need_of_review
                ])))

            item_count = int(self.redis.zcard('next-job-bucket'))
            current_domain = 0
            # Keep adding until the bucket holds look_ahead_pages items
            # or every domain's candidate list is exhausted.
            while item_count < look_ahead_pages and len(
                    all_domains_pages_in_need_of_review) > 0:
                if current_domain >= len(
                        all_domains_pages_in_need_of_review):
                    current_domain = 0

                item = all_domains_pages_in_need_of_review[
                    current_domain].pop(0)

                # Check (and decrement) the matching limiter's budget.
                has_limit = True
                logging.debug('Available Limit Buckets: %s' % limiter_buckets)
                for index, (limit, available) in enumerate(limiter_buckets):
                    if limit.matches(item.url):
                        if available <= 0:
                            has_limit = False
                            break
                        limiter_buckets[index] = (limit, available - 1)

                if has_limit:
                    self.add_next_job_bucket(item.uuid, item.url)
                    item_count += 1

                # if there are not any more pages in this domain remove it from dictionary
                if not all_domains_pages_in_need_of_review[current_domain]:
                    del all_domains_pages_in_need_of_review[current_domain]

                current_domain += 1

            logging.debug('ADDED A TOTAL of %d ITEMS TO REDIS...' % item_count)
    except LockTimeout:
        logging.info("Can't acquire lock. Moving on...")