def test_validate_page_with_invalid_url(self):
    """No canonical violation should be reported for an unparseable page URL."""
    config = Config()
    # '[' makes this URL malformed on purpose
    page = PageFactory.create(url='http://[globo.com/1?item=test')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    markup = '<html><head></head></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = LinkWithRelCanonicalValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.head': [{}]}
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_validate_image_requests_on_globo_html(self):
    """Image validator flags too many requests and over-sized images."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    markup = self.get_file('globo.html')
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)
    reviewer.violation_definitions = {
        'single.size.img': {'default_value': 6},
        'total.requests.img': {'default_value': 50},
        'total.size.img': {'default_value': 100},
    }

    validator = ImageRequestsValidator(reviewer)
    validator.add_violation = Mock()
    # 60 copies of the same small image response
    validator.review.data = {
        'page.images': [
            (
                'some_image.jpg',
                Mock(status_code=200, text=self.get_file('2x2.png'))
            )
            for _ in range(60)
        ],
        'total.size.img': 106,
    }
    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='total.requests.img',
            value={'total': 60, 'limit': 10},
            points=50
        ))
    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='single.size.img',
            value={
                'limit': 6,
                'over_max_size': set([('some_image.jpg', 6.57421875)])
            },
            points=0.57421875
        ))
def test_can_validate_page_with_metatag_description_too_long(self):
    """Description over the limit violates; exactly at the limit does not."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    reviewer.violation_definitions = {
        'page.metatags.description_too_big': {'default_value': 300},
    }

    validator = MetaTagsValidator(reviewer)

    # One character over the 300-char limit -> violation
    validator.add_violation = Mock()
    validator.review.data['meta.tags'] = [
        {'content': 'X' * 301, 'property': 'name', 'key': 'description'},
    ]
    validator.validate()
    validator.add_violation.assert_called_once_with(
        key='page.metatags.description_too_big',
        value={'max_size': 300},
        points=20
    )

    # Exactly at the limit -> no violation
    validator.add_violation = Mock()
    validator.review.data['meta.tags'] = [
        {'content': 'X' * 300, 'property': 'name', 'key': 'description'},
    ]
    validator.validate()
    expect(validator.add_violation.called).to_be_false()
def test_handle_sitemap_url_loaded(self):
    """Loading a sitemap records its size, URL set and derived totals."""
    page = PageFactory.create(url="http://g1.globo.com/")
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )
    reviewer.enqueue = Mock()

    sitemap_xml = self.get_file('url_sitemap.xml')
    sitemap_response = Mock(status_code=200, text=sitemap_xml)

    facter = SitemapFacter(reviewer)
    facter.async_get = Mock()
    facter.get_facts()
    facter.handle_sitemap_loaded(
        "http://g1.globo.com/sitemap.xml", sitemap_response)

    sitemap_key = "http://g1.globo.com/sitemap.xml"
    expect(
        facter.review.data['sitemap.files.size'][sitemap_key]
    ).to_equal(0.296875)
    expect(facter.review.data['sitemap.urls'][sitemap_key]).to_equal(
        set(['http://domain.com/1.html', 'http://domain.com/2.html']))
    expect(facter.review.facts['total.size.sitemap']['value']).to_equal(0.296875)
    expect(
        facter.review.facts['total.size.sitemap.gzipped']['value']
    ).to_equal(0.1494140625)
    expect(facter.review.data['total.size.sitemap']).to_equal(0.296875)
    expect(facter.review.data['total.size.sitemap.gzipped']).to_equal(0.1494140625)
    expect(facter.review.data['sitemap.files.urls'][sitemap_key]).to_equal(2)
    expect(facter.review.facts['total.sitemap.urls']['value']).to_equal(2)
def test_query_string_without_params(self):
    """With FORCE_CANONICAL off and no query params, nothing is reported."""
    config = Config()
    config.FORCE_CANONICAL = False
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    markup = '<html><head></head></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = LinkWithRelCanonicalValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.head': [{}]}
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_validate(self):
    """Pages missing all OpenGraph tags get the absent.metatags.open_graph violation."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url="http://localhost:2368",
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
    )
    markup = '<html><meta charset="UTF-8"></html>'
    mocked_response = {
        "url": page.url,
        "status": 200,
        "content": markup,
        "html": lxml.html.fromstring(markup),
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = OpenGraphValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {
        "meta.tags": [{"key": "meta", "content": "utf-8", "property": ""}]
    }
    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(
            points=200,
            key="absent.metatags.open_graph",
            value=["og:title", "og:type", "og:image", "og:url"],
        )
    )
def test_no_title_tag(self):
    """The title facter adds no facts when the page has no <title>."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )
    markup = '<html></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    reviewer.content_loaded(
        page.url, Mock(status_code=200, text=markup, headers={}))

    facter = TitleFacter(reviewer)
    facter.add_fact = Mock()
    facter.get_facts()

    expect(facter.add_fact.called).to_be_false()
    expect(facter.review.data).to_be_like({})
def test_can_get_fact_definitions(self):
    """TitleFacter exposes exactly one fact definition: page.title."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )
    markup = self.get_file('globo.html')
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    reviewer.content_loaded(
        page.url, Mock(status_code=200, text=markup, headers={}))

    facter = TitleFacter(reviewer)
    definitions = facter.get_fact_definitions()

    expect(definitions).to_length(1)
    expect('page.title' in definitions).to_be_true()
def test_can_validate_no_title_tag(self):
    """A page without a <title> yields the page.title.not_found violation."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )
    markup = '<html></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = TitleValidator(reviewer)
    validator.add_violation = Mock()
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='page.title.not_found',
        value=page.url,
        points=50)
def test_validator(self):
    """Links carrying rel="nofollow" are reported as invalid.links.nofollow."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url="http://localhost:2368",
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[],
    )
    nofollow_url = "http://my-site.com/test.html"
    markup = '<html><a href="%s" rel="nofollow">Test</a></html>' % nofollow_url
    mocked_response = {
        "url": page.url,
        "status": 200,
        "content": markup,
        "html": lxml.html.fromstring(markup),
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = LinkWithRelNofollowValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {
        "page.all_links": [{"href": nofollow_url, "rel": "nofollow"}]
    }
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key="invalid.links.nofollow",
        value=["http://my-site.com/test.html"],
        points=10
    )
def test_can_validate_no_title_tag(self):
    """Missing <title> produces exactly one page.title.not_found violation."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )
    markup = '<html></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = TitleValidator(reviewer)
    validator.add_violation = Mock()
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='page.title.not_found',
        value=page.url,
        points=50)
# NOTE(review): this test is corrupted in the source dump — a redaction pass
# replaced part of the line with 'http://*****:*****@', swallowing the
# Reviewer(...) keyword arguments and most of the javascript `content`
# string literal (an OAS ad-serving document.write snippet). As written it
# is not valid Python. Restore the original from version control; the
# remaining assertions suggest it verified that javascript pseudo-links
# produce no page.links entries (empty set, zero total.number.links).
# Code left byte-identical below.
def test_javascript_link(self): page = PageFactory.create() reviewer = Reviewer( api_url="http://*****:*****@'+OAS_listpos+'!'+pos+'?'+OAS_query+'\" TARGET='+OAS_target+'>');</script>" result = {"url": page.url, "status": 200, "content": content, "html": lxml.html.fromstring(content)} reviewer.responses[page.url] = result reviewer._wait_for_async_requests = Mock() reviewer.save_review = Mock() response = Mock(status_code=200, text=content, headers={}) reviewer.content_loaded(page.url, response) facter = LinkFacter(reviewer) facter.add_fact = Mock() facter.async_get = Mock() facter.get_facts() expect(facter.add_fact.call_args_list).to_include(call(key="page.links", value=set([]))) expect(facter.add_fact.call_args_list).to_include(call(key="total.number.links", value=0))
def _start_reviewer(self, job):
    """Build a Reviewer for the given job and run the review.

    Jobs whose URL exceeds the configured maximum path depth are skipped
    with an info log; falsy jobs are ignored silently.
    """
    if not job:
        return

    if count_url_levels(job['url']) > self.config.MAX_URL_LEVELS:
        self.info('Max URL levels! Details: %s' % job['url'])
        return

    self.debug('Starting Review for [%s]' % job['url'])
    reviewer = Reviewer(
        api_url=self.config.HOLMES_API_URL,
        page_uuid=job['page'],
        page_url=job['url'],
        page_score=0,
        config=self.config,
        validators=self.validators,
        facters=self.facters,
        search_provider=self.search_provider,
        async_get=self.async_get,
        wait=self.otto.wait,
        wait_timeout=0,  # max time to wait for all requests to finish
        db=self.db,
        cache=self.cache,
        publish=self.publish,
        girl=self.girl,
        fact_definitions=self.fact_definitions,
        violation_definitions=self.violation_definitions)
    reviewer.review()
def _start_reviewer(self, job):
    """Build a Reviewer for the given job (carrying its score) and review it.

    Jobs whose URL exceeds the configured maximum path depth are skipped
    with an info log; falsy jobs are ignored silently.
    """
    if not job:
        return

    if count_url_levels(job['url']) > self.config.MAX_URL_LEVELS:
        self.info('Max URL levels! Details: %s' % job['url'])
        return

    self.debug('Starting Review for [%s]' % job['url'])
    reviewer = Reviewer(
        api_url=self.config.HOLMES_API_URL,
        page_uuid=job['page'],
        page_url=job['url'],
        page_score=job['score'],
        increase_lambda_tax_method=self._increase_lambda_tax,
        config=self.config,
        validators=self.validators,
        facters=self.facters,
        async_get=self.async_get,
        wait=self.otto.wait,
        wait_timeout=0,  # max time to wait for all requests to finish
        db=self.db,
        cache=self.cache,
        publish=self.publish,
        fact_definitions=self.fact_definitions,
        violation_definitions=self.violation_definitions
    )
    reviewer.review()
def test_can_validate_css_requests_on_globo_html(self):
    """CSS validator flags excess stylesheet requests and total size."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    reviewer.violation_definitions = {
        'total.requests.css': {'default_value': 1},
        'total.size.css': {'default_value': 0.0},
    }
    markup = self.get_file('globo.html')
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = CSSRequestsValidator(reviewer)
    stylesheet_response = {
        'url': 'some_style.css',
        'status': 200,
        'content': '#id{display:none}',
        'html': None
    }
    validator.get_response = Mock(return_value=stylesheet_response)
    validator.add_violation = Mock()
    validator.review.data = {
        'total.requests.css': 7,
        'total.size.css.gzipped': 0.05
    }
    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='total.requests.css',
            value={'over_limit': 6, 'total_css_files': 7},
            points=30
        ))
    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='total.size.css',
            value=0.05,
            points=0
        ))
def test_can_validate_without_meta_tags(self):
    """OpenGraph validator stays silent when review data has no meta tags."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    markup = '<html></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = OpenGraphValidator(reviewer)
    validator.add_violation = Mock()
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_no_body_tag(self):
    """Schema.org validator reports nothing when the page lacks a <body>."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    markup = '<html></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = SchemaOrgItemTypeValidator(reviewer)
    validator.add_violation = Mock()
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_validate(self):
    """A query-string URL without a canonical link yields absent.meta.canonical."""
    config = Config()
    page = PageFactory.create(url='http://globo.com/1?item=test')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    markup = '<html><head></head></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = LinkWithRelCanonicalValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.head': [{}]}
    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(key='absent.meta.canonical', value=None, points=30))
def test_force_canonical(self):
    """When the violation pref forces canonical, its absence is reported."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url="http://localhost:2368",
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache,
    )
    reviewer.violation_definitions = {
        "absent.meta.canonical": {"default_value": True}
    }
    markup = "<html><head></head></html>"
    mocked_response = {
        "url": page.url,
        "status": 200,
        "content": markup,
        "html": lxml.html.fromstring(markup),
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = LinkWithRelCanonicalValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {"page.head": [{}]}
    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(key="absent.meta.canonical", value=None, points=30)
    )
def test_query_string_without_params(self):
    """FORCE_CANONICAL disabled and a clean URL means no violation at all."""
    config = Config()
    config.FORCE_CANONICAL = False
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url="http://localhost:2368",
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[],
    )
    markup = "<html><head></head></html>"
    mocked_response = {
        "url": page.url,
        "status": 200,
        "content": markup,
        "html": lxml.html.fromstring(markup),
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = LinkWithRelCanonicalValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {"page.head": [{}]}
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_get_violation_definitions(self):
    """JS validator declares the total size and total requests violations."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )
    reviewer.violation_definitions = {
        'total.size.js': {'default_value': 0.03},
        'total.requests.js': {'default_value': 1},
    }
    markup = self.get_file('globo.html')
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = JSRequestsValidator(reviewer)
    definitions = validator.get_violation_definitions()

    expect(definitions).to_length(2)
    expect('total.size.js' in definitions).to_be_true()
    expect('total.requests.js' in definitions).to_be_true()
def test_can_validate_page_without_meta_tags(self):
    """A page with no metatags at all triggers the absent.metatags violation."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    markup = '<html></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = MetaTagsValidator(reviewer)
    validator.add_fact = Mock()
    validator.add_violation = Mock()
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='absent.metatags',
        value='No metatags.',
        points=100
    )
def test_handle_url_loaded(self):
    """handle_url_loaded stores the (url, response) pair under page.links."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url="http://localhost:2368",
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[],
    )
    markup = self.get_file("globo.html")
    mocked_response = {
        "url": page.url,
        "status": 200,
        "content": markup,
        "html": lxml.html.fromstring(markup),
    }
    reviewer.responses[page.url] = mocked_response
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    response = Mock(status_code=200, text=markup, headers={})
    reviewer.content_loaded(page.url, response)

    facter = LinkFacter(reviewer)
    facter.async_get = Mock()
    facter.get_facts()
    facter.handle_url_loaded(page.url, response)

    expect(facter.review.data).to_include("page.links")
    expected_links = set([(page.url, response)])
    expect(facter.review.data["page.links"]).to_equal(expected_links)
def test_validate(self):
    """A body without itemscope/itemtype yields both schema.org violations."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    markup = '<html><body></body></html>'
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = SchemaOrgItemTypeValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.body': [{}]}
    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(key='absent.schema.itemscope', value=None, points=10))
    expect(validator.add_violation.call_args_list).to_include(
        call(key='absent.schema.itemtype', value=None, points=10))
def test_can_validate_css_requests_empty_html(self):
    """CSS validator is a no-op when the page response has no HTML."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    reviewer.violation_definitions = {
        'total.requests.css': {'default_value': 1},
        'total.size.css': {'default_value': 0.0},
    }
    empty_response = {
        'url': page.url,
        'status': 200,
        'content': None,
        'html': None
    }
    reviewer.responses[page.url] = empty_response
    reviewer.get_response = Mock(return_value=empty_response)

    validator = CSSRequestsValidator(reviewer)
    validator.add_violation = Mock()
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_validate_without_meta_tags(self):
    """No meta tags in the review data means the validator reports nothing."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url="http://localhost:2368",
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[],
    )
    markup = "<html></html>"
    mocked_response = {
        "url": page.url,
        "status": 200,
        "content": markup,
        "html": lxml.html.fromstring(markup),
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = OpenGraphValidator(reviewer)
    validator.add_violation = Mock()
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_validate_js_requests_empty_html(self):
    """JS validator is a no-op when the page response has no HTML."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    empty_response = {
        'url': page.url,
        'status': 200,
        'content': None,
        'html': None
    }
    reviewer.responses[page.url] = empty_response
    reviewer.get_response = Mock(return_value=empty_response)

    validator = JSRequestsValidator(reviewer)
    validator.add_violation = Mock()
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_link_looks_like_image(self):
    """Anchors pointing at image files are not followed as page links."""
    page = PageFactory.create(url="http://globo.com/")
    reviewer = Reviewer(
        api_url="http://localhost:2368",
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[],
    )
    markup = '<html><a href="http://globo.com/metal.png">Metal</a></html>'
    mocked_response = {
        "url": page.url,
        "status": 200,
        "content": markup,
        "html": lxml.html.fromstring(markup),
    }
    reviewer.responses[page.url] = mocked_response
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    reviewer.content_loaded(
        page.url, Mock(status_code=200, text=markup, headers={}))

    facter = LinkFacter(reviewer)
    facter.add_fact = Mock()
    facter.async_get = Mock()
    facter.get_facts()

    expect(facter.add_fact.call_args_list).to_include(
        call(key="page.links", value=set([])))
    expect(facter.add_fact.call_args_list).to_include(
        call(key="total.number.links", value=0))
    # No request should be scheduled for the image-looking link
    expect(facter.async_get.called).to_be_false()
def test_can_validate_title_size_with_domain(self):
    """A per-domain preference (10) overrides the default max title size (70)."""
    self.db.query(Key).delete()
    self.db.query(KeysCategory).delete()

    config = Config()
    config.MAX_TITLE_SIZE = 70
    key = Key(name='page.title.size')
    domain = DomainFactory.create(name='globo.com', url='http://globo.com')
    page = PageFactory.create(domain=domain, url='http://globo.com/a.html')
    self.sync_cache.redis.delete('violations-prefs-%s' % domain.name)
    DomainsViolationsPrefsFactory.create(
        domain=domain, key=key, value='10'
    )

    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[],
        cache=self.sync_cache
    )
    page_title = 'a' * 80
    markup = '<html><title>%s</title></html>' % page_title
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)
    reviewer.violation_definitions = {
        'page.title.size': {'default_value': 70, 'key': key},
    }

    validator = TitleValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {
        'page.title_count': 1,
        'page.title': page_title
    }
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='page.title.size',
        value={'max_size': 10, 'page_url': page.url},
        points=10
    )
def test_validator(self):
    """Links answered with 302/307 produce the matching redirect violations."""
    config = Config()
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )
    first_url = 'http://globo.com/b.html'
    second_url = 'http://globo.com/a.html'
    markup = '<html><a href="%s">Test</a><a href="%s">Test</a></html>' % (
        first_url, second_url)
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = LinkWithRedirectValidator(reviewer)
    validator.add_violation = Mock()
    temporary_redirect = Mock(status_code=307, text='Temporary Redirect')
    found_redirect = Mock(status_code=302, text='Found')
    validator.review.data = {
        'page.links': [
            (first_url, temporary_redirect),
            (second_url, found_redirect)
        ]
    }
    validator.validate()

    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='link.redirect.307',
            value=307,
            points=10
        ))
    expect(validator.add_violation.call_args_list).to_include(
        call(
            key='link.redirect.302',
            value=302,
            points=10
        ))
def test_can_validate_title_size_with_domain(self):
    """Domain violation pref of 10 chars wins over MAX_TITLE_SIZE = 70."""
    self.db.query(Key).delete()
    self.db.query(KeysCategory).delete()

    config = Config()
    config.MAX_TITLE_SIZE = 70
    key = Key(name='page.title.size')
    domain = DomainFactory.create(name='globo.com', url='http://globo.com')
    page = PageFactory.create(domain=domain, url='http://globo.com/a.html')
    self.sync_cache.redis.delete('violations-prefs-%s' % domain.name)
    DomainsViolationsPrefsFactory.create(domain=domain, key=key, value='10')

    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[],
        cache=self.sync_cache
    )
    page_title = 'a' * 80
    markup = '<html><title>%s</title></html>' % page_title
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)
    reviewer.violation_definitions = {
        'page.title.size': {
            'default_value': 70,
            'key': key
        },
    }

    validator = TitleValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.title_count': 1, 'page.title': page_title}
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='page.title.size',
        value={
            'max_size': 10,
            'page_url': page.url
        },
        points=10)
def test_can_get_facts(self):
    """MetaTagsFacter extracts every <meta> tag from globo.html as one
    `meta.tags` fact and mirrors the same list into the review data.

    NOTE(review): the expected `values` list is a verbatim fixture dump;
    the second 'description' entry contains an embedded newline inside the
    string literal (it was split across lines in the original source) —
    confirm the exact whitespace against the globo.html fixture.
    """
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )
    content = self.get_file('globo.html')
    result = {
        'url': page.url,
        'status': 200,
        'content': content,
        'html': lxml.html.fromstring(content)
    }
    reviewer.responses[page.url] = result
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    response = Mock(status_code=200, text=content, headers={})
    reviewer.content_loaded(page.url, response)

    facter = MetaTagsFacter(reviewer)
    facter.add_fact = Mock()
    facter.get_facts()

    # Expected metatags exactly as parsed from the globo.html fixture.
    values = [
        {'content': 'utf-8', 'property': None, 'key': 'charset'},
        {'content': 'text/html;charset=UTF-8', 'property': 'http-equiv', 'key': 'Content-Type'},
        {'content': 'BKmmuVQac1JM6sKlj3IoXQvffyIRJvJfbicMouA2a88', 'property': 'name', 'key': 'google-site-verification'},
        {'content': 'width=device-width, initial-scale=1.0, maximum-scale=1.0', 'property': 'name', 'key': 'viewport'},
        {'content': u'globo.com - Absolutamente tudo sobre not\xedcias, esportes e entretenimento', 'property': 'property', 'key': 'og:title'},
        {'content': 'website', 'property': 'property', 'key': 'og:type'},
        {'content': 'http://www.globo.com/', 'property': 'property', 'key': 'og:url'},
        {'content': 'http://s.glbimg.com/en/ho/static/globocom2012/img/gcom_marca_og.jpg', 'property': 'property', 'key': 'og:image'},
        {'content': 'globo.com', 'property': 'property', 'key': 'og:site_name'},
        {'content': u'S\xf3 na globo.com voc\xea encontra tudo sobre o conte\xfado e marcas das Organiza\xe7\xf5es Globo. O melhor acervo de v\xeddeos online sobre entretenimento, esportes e jornalismo do Brasil.', 'property': 'property', 'key': 'og:description'},
        {'content': '224969370851736', 'property': 'property', 'key': 'fb:page_id'},
        # Embedded '\n' preserved from the fixture's multiline description.
        {'content': u'S\xf3 na globo.com voc\xea encontra tudo sobre o conte\xfado e marcas das Organiza\xe7\xf5es Globo. \nO melhor acervo de v\xeddeos online sobre entretenimento, esportes e jornalismo do Brasil.', 'property': 'name', 'key': 'description'},
        {'content': u'Not\xedcias, Entretenimento, Esporte, Tecnologia, Portal, Conte\xfado, Rede Globo, TV Globo, V\xeddeos, Televis\xe3o', 'property': 'name', 'key': 'keywords'},
        {'content': 'Globo.com', 'property': 'name', 'key': 'application-name'},
        {'content': '#0669DE', 'property': 'name', 'key': 'msapplication-TileColor'},
        {'content': 'http://s.glbimg.com/en/ho/static/globocom2012/img/globo-win-tile.png', 'property': 'name', 'key': 'msapplication-TileImage'}
    ]
    expect(facter.add_fact.call_args_list).to_include(
        call(
            key='meta.tags',
            value=values,
        ))
    expect(facter.review.data).to_length(1)
    expect(facter.review.data).to_include('meta.tags')
    expect(facter.review.data).to_be_like({'meta.tags': values})
def test_can_validate_heading_hierarchy(self):
    """Short headings pass; over-long headings trigger the size violation."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache
    )
    reviewer.violation_definitions = {
        'page.heading_hierarchy.size': {'default_value': 150},
    }
    markup = self.get_file('globo.html')
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = HeadingHierarchyValidator(reviewer)

    # A single short heading must not be reported
    validator.add_violation = Mock()
    validator.review.data = {
        'page.heading_hierarchy': [
            ('h1', 'Loren ipsum dolor sit amet'),
        ]
    }
    validator.validate()
    expect(validator.add_violation.called).to_be_false()

    # Headings over the 150-char limit trigger page.heading_hierarchy.size
    validator.add_violation = Mock()
    oversized_headings = [
        ('h1', 'Loren ipsum dolor sit amet' * 10),
        ('h1', 'Loren ipsum dolor sit amet' * 10),
    ]
    validator.review.data = {'page.heading_hierarchy': oversized_headings}
    validator.validate()
    expect(validator.add_violation.called).to_be_true()
    validator.add_violation.assert_called_once_with(
        key='page.heading_hierarchy.size',
        value={
            'max_size': 150,
            'hh_list': oversized_headings,
        },
        points=40
    )
def test_can_validate_og_title(self):
    """When all required OpenGraph tags are present nothing is reported."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )
    markup = (
        '<html>'
        '<meta property="og:title" content="Metal" />'
        '<meta property="og:type" content="video.movie" />'
        '<meta property="og:url" content="http://a.com" />'
        '<meta property="og:image" content="http://a.com/rock.png" />'
        '</html>'
    )
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = OpenGraphValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {
        'meta.tags': [
            {
                'key': 'og:title',
                'content': 'Metal',
                'property': 'property'
            },
            {
                'key': 'og:type',
                'content': 'video.movie',
                'property': 'property'
            },
            {
                'key': 'og:url',
                'content': 'http://a.com',
                'property': 'property'
            },
            {
                'key': 'og:image',
                'content': 'http://a.com/rock.png',
                'property': 'property'
            }
        ],
    }
    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_can_validate_heading_hierarchy(self):
    """Heading size validator: silent for short headings, fires past 150 chars."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[],
        cache=self.sync_cache)
    reviewer.violation_definitions = {
        'page.heading_hierarchy.size': {
            'default_value': 150
        },
    }
    markup = self.get_file('globo.html')
    mocked_response = {
        'url': page.url,
        'status': 200,
        'content': markup,
        'html': lxml.html.fromstring(markup)
    }
    reviewer.responses[page.url] = mocked_response
    reviewer.get_response = Mock(return_value=mocked_response)

    validator = HeadingHierarchyValidator(reviewer)

    # Case 1: single short heading -> no violation expected
    validator.add_violation = Mock()
    validator.review.data = {
        'page.heading_hierarchy': [
            ('h1', 'Loren ipsum dolor sit amet'),
        ]
    }
    validator.validate()
    expect(validator.add_violation.called).to_be_false()

    # Case 2: repeated long headings -> size violation expected
    validator.add_violation = Mock()
    oversized_headings = [
        ('h1', 'Loren ipsum dolor sit amet' * 10),
        ('h1', 'Loren ipsum dolor sit amet' * 10),
    ]
    validator.review.data = {'page.heading_hierarchy': oversized_headings}
    validator.validate()
    expect(validator.add_violation.called).to_be_true()
    validator.add_violation.assert_called_once_with(
        key='page.heading_hierarchy.size',
        value={
            'max_size': 150,
            'hh_list': oversized_headings,
        },
        points=40)
def test_invalid_link(self):
    """LinkFacter must count a syntactically broken href as an invalid
    link and keep the regular link facts empty."""
    broken_url = 'http://]http://www.globo.com/malhacao'
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )

    body = '<html><a href="http://]http://www.globo.com/malhacao">blah</a></html>'
    fetched = {
        'url': page.url,
        'status': 200,
        'content': body,
        'html': lxml.html.fromstring(body)
    }
    reviewer.responses[page.url] = fetched
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    reviewer.content_loaded(
        page.url, Mock(status_code=200, text=body, headers={}))

    facter = LinkFacter(reviewer)
    facter.add_fact = Mock()
    facter.async_get = Mock()
    facter.get_facts()

    recorded_facts = facter.add_fact.call_args_list
    expect(recorded_facts).to_include(
        call(key='page.links', value=set([])))
    expect(recorded_facts).to_include(
        call(key='total.number.links', value=0))
    expect(recorded_facts).to_include(
        call(key='total.number.invalid_links', value=1))
    expect(recorded_facts).to_include(
        call(key='page.invalid_links', value=set([broken_url])))
def test_no_get_url_that_exceed_max_url_level(self):
    """The 21-level-deep link must be excluded from the link facts:
    only the two shallower links are counted, and none is invalid."""
    page = PageFactory.create(url='http://m.com/')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        facters=[]
    )

    body = (
        '<html>'
        '<a href="http://m.com/test/">test</a>'
        '<a href="http://m.com/1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19/20/">m</a>'
        '<a href="http://m.com/1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19/20/21/">m</a>'
        '</html>'
    )
    fetched = {
        'url': page.url,
        'status': 200,
        'content': body,
        'html': lxml.html.fromstring(body)
    }
    reviewer.responses[page.url] = fetched
    reviewer._wait_for_async_requests = Mock()
    reviewer.save_review = Mock()
    reviewer.content_loaded(
        page.url, Mock(status_code=200, text=body, headers={}))

    facter = LinkFacter(reviewer)
    facter.add_fact = Mock()
    facter.get_facts()

    # Exact fact order matters here, hence to_equal over to_include.
    expect(facter.add_fact.call_args_list).to_equal([
        call(key='page.links', value=set([])),
        call(key='total.number.links', value=2),
        call(key='total.number.invalid_links', value=0),
        call(key='page.invalid_links', value=set([])),
    ])
def test_can_get_violation_definitions(self):
    """JSRequestsValidator exposes two violation definitions and renders
    the expected human-readable messages for each of them."""
    config = Config()
    config.MAX_JS_REQUESTS_PER_PAGE = 1
    config.MAX_JS_KB_PER_PAGE_AFTER_GZIP = 0.03

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    html_body = self.get_file('globo.html')
    page_result = {
        'url': page.url,
        'status': 200,
        'content': html_body,
        'html': lxml.html.fromstring(html_body)
    }
    reviewer.responses[page.url] = page_result
    reviewer.get_response = Mock(return_value=page_result)

    validator = JSRequestsValidator(reviewer)
    definitions = validator.get_violation_definitions()

    expect(definitions).to_length(2)
    expect('total.size.js' in definitions).to_be_true()
    expect('total.requests.js' in definitions).to_be_true()

    # The asserted strings below are the validator's exact output,
    # including its grammar, and must not be "corrected" here.
    expect(validator.get_total_size_message(0.03)).to_equal(
        'There\'s 0.03kb of JavaScript in this page and that adds '
        'up to more download time slowing down the page rendering '
        'to the user.'
    )
    expect(validator.get_requests_js_message(
        {'total_js_files': 7, 'over_limit': 6}
    )).to_equal(
        'This page has 7 JavaScript request (6 over limit). Having too '
        'many requests impose a tax in the browser due to handshakes.'
    )
def test_can_validate_og_title(self):
    """OpenGraphValidator reports nothing when og:title, og:type,
    og:url and og:image are all present.

    NOTE(review): a test with this exact name appears earlier in the
    file; if both live in the same TestCase class the later definition
    shadows the earlier one -- confirm they belong to different classes.
    """
    page_under_test = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page_under_test.uuid,
        page_url=page_under_test.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    og_markup = (
        '<html>'
        '<meta property="og:title" content="Metal" />'
        '<meta property="og:type" content="video.movie" />'
        '<meta property="og:url" content="http://a.com" />'
        '<meta property="og:image" content="http://a.com/rock.png" />'
        '</html>'
    )
    response_data = {
        'url': page_under_test.url,
        'status': 200,
        'content': og_markup,
        'html': lxml.html.fromstring(og_markup)
    }
    reviewer.responses[page_under_test.url] = response_data
    reviewer.get_response = Mock(return_value=response_data)

    validator = OpenGraphValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {
        'meta.tags': [
            {'key': 'og:title', 'content': 'Metal',
             'property': 'property'},
            {'key': 'og:type', 'content': 'video.movie',
             'property': 'property'},
            {'key': 'og:url', 'content': 'http://a.com',
             'property': 'property'},
            {'key': 'og:image', 'content': 'http://a.com/rock.png',
             'property': 'property'},
        ],
    }

    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_will_call_reviewer_add_fact(self):
    """Validator.add_fact must delegate straight to the reviewer."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )
    reviewer.add_fact = Mock()

    Validator(reviewer).add_fact('random.fact', 'value')

    reviewer.add_fact.assert_called_once_with('random.fact', 'value')
def test_can_get_default_violations_values(self):
    """Default violation values must carry the configured maximum
    heading-hierarchy size and its description.

    NOTE(review): 'HIEARARCHY' is misspelled, but the test sets and
    reads the same attribute name, so it mirrors the project constant;
    renaming it here alone would break the lookup -- confirm upstream.
    """
    config = Config()
    config.MAX_HEADING_HIEARARCHY_SIZE = 150

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    defaults = HeadingHierarchyValidator(reviewer) \
        .get_default_violations_values(config)

    expect(defaults).to_include('page.heading_hierarchy.size')
    expect(defaults['page.heading_hierarchy.size']).to_length(2)
    expect(defaults['page.heading_hierarchy.size']).to_equal({
        'value': config.MAX_HEADING_HIEARARCHY_SIZE,
        'description': config.get_description('MAX_HEADING_HIEARARCHY_SIZE')
    })
def test_add_violation_when_sitemap_is_too_large(self):
    """A sitemap above the size limit must yield exactly one
    `total.size.sitemap` violation carrying its size in KB."""
    sitemap_url = 'http://g1.globo.com/sitemap.xml'

    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    # 10241 bytes -> 10.0009765625 KB, just over the 10KB limit.
    validator.review.data['sitemap.files.size'] = {sitemap_url: 10241}
    validator.review.data['sitemap.data'] = {
        sitemap_url: Mock(status_code=200, text='data')
    }
    validator.review.data['sitemap.files.urls'] = {sitemap_url: 10}
    validator.review.data['sitemap.urls'] = {sitemap_url: []}

    validator.add_violation = Mock()
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='total.size.sitemap',
        value={'url': sitemap_url, 'size': 10.0009765625},
        points=10
    )
def test_add_violation_when_404(self):
    """A sitemap answering 404 must produce the `sitemap.not_found`
    violation worth the full 100 points."""
    sitemap_url = 'http://g1.globo.com/sitemap.xml'

    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    validator.review.data['sitemap.files.size'] = {sitemap_url: 10}
    validator.review.data['sitemap.data'] = {
        sitemap_url: Mock(status_code=404, text=None)
    }

    validator.add_violation = Mock()
    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='sitemap.not_found',
        value=sitemap_url,
        points=100
    )
def test_add_violation_when_sitemap_with_good_link(self):
    """A sitemap whose URLs are already properly percent-encoded must
    not raise violations, and its links are flushed for crawling."""
    sitemap_url = 'http://g1.globo.com/sitemap.xml'
    encoded_link = 'http://g1.globo.com/%C3%BCmlat.php&q=name'

    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    validator.review.data['sitemap.files.size'] = {sitemap_url: 10}
    validator.review.data['sitemap.data'] = {
        sitemap_url: Mock(status_code=200, text='data', url=encoded_link)
    }
    validator.review.data['sitemap.files.urls'] = {sitemap_url: 20}
    validator.review.data['sitemap.urls'] = {sitemap_url: [encoded_link]}

    validator.add_violation = Mock()
    validator.flush = Mock()
    validator.validate()

    expect(validator.add_violation.call_count).to_equal(0)
    expect(validator.flush.call_count).to_equal(1)
def test_add_violation_when_sitemap_has_links_that_not_need_to_be_encoded(
        self):
    """Plain-ASCII sitemap links need no encoding, so the validator
    must not report any violation for them."""
    sitemap_url = 'http://g1.globo.com/sitemap.xml'

    page = PageFactory.create(url='http://globo.com')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = SitemapValidator(reviewer)
    validator.review.data['sitemap.files.size'] = {sitemap_url: 10}
    validator.review.data['sitemap.data'] = {
        sitemap_url: Mock(status_code=200, text='data')
    }
    validator.review.data['sitemap.files.urls'] = {sitemap_url: 20}
    validator.review.data['sitemap.urls'] = {
        sitemap_url: ['http://g1.globo.com/1.html']
    }

    validator.add_violation = Mock()
    validator.validate()

    expect(validator.add_violation.call_count).to_equal(0)
def test_can_validate_with_headers(self):
    """No violation is raised when the page has a Last-Modified value."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = LastModifiedValidator(reviewer)
    validator.add_violation = Mock()

    last_modified_at = datetime.datetime(2014, 1, 13, 1, 16, 10)
    validator.review.data = {'page.last_modified': last_modified_at}
    validator.review.facts = {'page.last_modified': last_modified_at}

    validator.validate()

    expect(validator.add_violation.called).to_be_false()
def test_validate(self):
    """A robots.txt without Sitemap or Disallow entries must yield
    both not-found violations, each worth 100 points."""
    page = PageFactory.create(url='http://globo.com/')
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = RobotsValidator(reviewer)
    validator.review.data['robots.response'] = Mock(
        status_code=200, text='key:value')
    validator.add_violation = Mock()

    validator.validate()

    raised = validator.add_violation.call_args_list
    expect(raised).to_include(
        call(key='robots.sitemap.not_found', value=None, points=100))
    expect(raised).to_include(
        call(key='robots.disallow.not_found', value=None, points=100))
def test_can_get_default_violations_values(self):
    """Default violation values must expose the required-meta-tags list
    together with its configuration description."""
    config = Config()
    config.REQUIRED_META_TAGS = ['description']

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    defaults = RequiredMetaTagsValidator(reviewer) \
        .get_default_violations_values(config)

    expect(defaults).to_include('absent.meta.tags')
    expect(defaults['absent.meta.tags']).to_length(2)
    expect(defaults['absent.meta.tags']).to_be_like({
        'value': config.REQUIRED_META_TAGS,
        'description': config.get_description('REQUIRED_META_TAGS')
    })
def test_can_get_default_violations_values(self):
    """Default violation values must expose the allowed schema.org
    itemtypes together with their configuration description."""
    config = Config()
    config.SCHEMA_ORG_ITEMTYPE = [
        'http://schema.org/WebPage',
        'http://schema.org/AboutPage',
    ]

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    defaults = SchemaOrgItemTypeValidator(reviewer) \
        .get_default_violations_values(config)

    expect(defaults).to_include('invalid.schema.itemtype')
    expect(defaults['invalid.schema.itemtype']).to_length(2)
    expect(defaults['invalid.schema.itemtype']).to_equal({
        'value': config.SCHEMA_ORG_ITEMTYPE,
        'description': config.get_description('SCHEMA_ORG_ITEMTYPE')
    })
def test_can_get_default_violations_values(self):
    """Default violation values must expose the maximum title size
    together with its configuration description."""
    config = Config()
    config.MAX_TITLE_SIZE = 70

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    defaults = TitleValidator(reviewer) \
        .get_default_violations_values(config)

    expect(defaults).to_include('page.title.size')
    expect(defaults['page.title.size']).to_length(2)
    expect(defaults['page.title.size']).to_be_like({
        'value': config.MAX_TITLE_SIZE,
        'description': config.get_description('MAX_TITLE_SIZE')
    })
def test_can_get_default_violations_values(self):
    """Default violation values must expose the FORCE_CANONICAL flag
    together with its configuration description."""
    config = Config()
    config.FORCE_CANONICAL = False

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    defaults = LinkWithRelCanonicalValidator(reviewer) \
        .get_default_violations_values(config)

    expect(defaults).to_include('absent.meta.canonical')
    expect(defaults['absent.meta.canonical']).to_length(2)
    expect(defaults['absent.meta.canonical']).to_be_like({
        'value': config.FORCE_CANONICAL,
        'description': config.get_description('FORCE_CANONICAL')
    })
def test_can_get_default_violations_values(self):
    """Default violation values must expose the maximum meta-description
    size together with its configuration description."""
    config = Config()
    config.METATAG_DESCRIPTION_MAX_SIZE = 300

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    defaults = MetaTagsValidator(reviewer) \
        .get_default_violations_values(config)

    key = 'page.metatags.description_too_big'
    expect(defaults).to_include(key)
    expect(defaults[key]).to_length(2)
    expect(defaults[key]).to_be_like({
        'value': config.METATAG_DESCRIPTION_MAX_SIZE,
        'description': config.get_description('METATAG_DESCRIPTION_MAX_SIZE')
    })
def test_can_get_default_violations_values(self):
    """Default violation values must expose the blacklisted domains
    together with their configuration description."""
    config = Config()
    config.BLACKLIST_DOMAIN = ['a.com']

    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=config,
        validators=[]
    )

    defaults = BlackListValidator(reviewer) \
        .get_default_violations_values(config)

    expect(defaults).to_include('blacklist.domains')
    expect(defaults['blacklist.domains']).to_length(2)
    expect(defaults['blacklist.domains']).to_be_like({
        'value': config.BLACKLIST_DOMAIN,
        'description': config.get_description('BLACKLIST_DOMAIN')
    })
def get_reviewer(self, api_url=None, page_uuid=None,
                 page_url='http://page.url', page_score=0.0,
                 config=None):
    """Build a fully-wired Reviewer for tests.

    Falls back to the test server's URL, a fresh UUID and the suite's
    default config when the corresponding argument is None.

    :param api_url: holmes API base URL (default: this test server).
    :param page_uuid: page identifier; a new UUID when omitted.
    :param page_url: URL of the page under review.
    :param page_score: initial score for the page under review.
    :param config: Config instance; the suite default when omitted.
    :returns: a Reviewer wired to the suite's collaborators.
    """
    if api_url is None:
        api_url = self.get_url('/')

    if page_uuid is None:
        page_uuid = str(uuid4())

    if config is None:
        config = self.config

    return Reviewer(
        api_url=api_url,
        page_uuid=str(page_uuid),
        page_url=page_url,
        # BUG FIX: was hard-coded to 0.0, silently ignoring the
        # page_score argument passed by callers.
        page_score=page_score,
        config=config,
        validators=self.validators,
        facters=self.facters,
        search_provider=self.search_provider,
        wait=self.otto.wait,
        wait_timeout=0,  # max time to wait for all requests to finish
        db=self.db,
        cache=self.cache,
        publish=self.publish,
        async_get=self.async_get,
        fact_definitions=self.fact_definitions,
        violation_definitions=self.violation_definitions,
    )
def test_get_robots_from_root_domain(self):
    """RobotsFacter must request /robots.txt from the domain root and,
    once the response arrives, store it and record the robots.url fact."""
    page = PageFactory.create(url="http://www.globo.com")
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    facter = RobotsFacter(reviewer)
    facter.async_get = Mock()
    facter.add_fact = Mock()
    facter.get_facts()

    robots_url = 'http://www.globo.com/robots.txt'

    # Before the async fetch completes only a placeholder is stored.
    expect(facter.review.data).to_length(1)
    expect(facter.review.data['robots.response']).to_equal(None)
    facter.async_get.assert_called_once_with(
        robots_url, facter.handle_robots_loaded)

    robots_response = Mock(status_code=200, text='', headers={})
    facter.handle_robots_loaded(robots_url, robots_response)

    expect(facter.review.data['robots.response']).to_equal(robots_response)
    expect(facter.add_fact.call_args_list).to_include(
        call(key='robots.url', value=robots_url))
def test_can_validate_last_modified(self):
    """A page without a Last-Modified value must raise exactly one
    `page.last_modified.not_found` violation worth 50 points."""
    page = PageFactory.create()
    reviewer = Reviewer(
        api_url='http://localhost:2368',
        page_uuid=page.uuid,
        page_url=page.url,
        page_score=0.0,
        config=Config(),
        validators=[]
    )

    validator = LastModifiedValidator(reviewer)
    validator.add_violation = Mock()
    validator.review.data = {'page.last_modified': None}
    validator.review.facts = {'page.last_modified': None}

    validator.validate()

    validator.add_violation.assert_called_once_with(
        key='page.last_modified.not_found',
        value=page.url,
        points=50
    )