def test_null_url(self):
    domain, url = get_domain_from_url(None)
    expect(domain).to_equal('')
    expect(url).to_equal('')

    domain, url = get_domain_from_url('')
    expect(domain).to_equal('')
    expect(url).to_equal('')
def validate(self):
    links = self.get_links()

    # Withhold a configurable tax from the page score, then split the
    # remainder evenly among the links found on the page.
    total_score = float(self.reviewer.page_score)
    tax = total_score * float(self.reviewer.config.PAGE_SCORE_TAX_RATE)
    available_score = total_score - tax

    number_of_links = float(len(links)) or 1.0  # avoid division by zero
    link_score = available_score / number_of_links

    for url, response in links:
        domain, domain_url = get_domain_from_url(url)
        if domain in self.page_url:
            # Link stays on the page's own domain: pass part of the
            # page score along to it.
            self.send_url(response.effective_url, link_score, response)
        else:
            if response.status_code in [302, 307]:
                self.moved_link_violation(url, response)
            elif response.status_code > 399:
                self.broken_link_violation(url, response)

    if self.broken_links:
        self.add_violation(
            key='link.broken',
            value=self.broken_links,
            points=100 * len(self.broken_links)
        )

    if self.moved_links:
        self.add_violation(
            key='link.moved.temporarily',
            value=self.moved_links,
            points=100
        )

    self.flush()
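For concreteness, here is the scoring arithmetic above with illustrative numbers (none of these values come from the project):

page_score = 200.0  # hypothetical reviewer.page_score
tax_rate = 0.1      # hypothetical PAGE_SCORE_TAX_RATE
link_count = 20     # links found on the page

tax = page_score * tax_rate                # 20.0 withheld from the page
available_score = page_score - tax         # 180.0 left to distribute
link_score = available_score / link_count  # 9.0 sent with each same-domain link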
def get_blacklist_domain_count(cls, db):
    ungrouped = defaultdict(int)

    for urls, count in Violation.get_group_by_value_for_key(
            db, 'blacklist.domains'):
        for url in urls:
            domain, _ = get_domain_from_url(url)
            ungrouped[domain] += count

    # Sort domains by descending violation count.
    blacklist = sorted(ungrouped.items(), key=lambda item: -item[1])

    return [dict(zip(('domain', 'count'), item)) for item in blacklist]
def __init__(
        self, api_url, page_uuid, page_url, page_score, config=None,
        validators=[], facters=[], search_provider=None, async_get=None,
        wait=None, wait_timeout=None, db=None, cache=None, publish=None,
        fact_definitions=None, violation_definitions=None, girl=None):
    self.db = db
    self.cache = cache
    self.girl = girl
    self.publish = publish
    self.api_url = api_url
    self.page_uuid = page_uuid
    self.page_url = page_url
    self.page_score = page_score
    self.domain_name, self.domain_url = get_domain_from_url(self.page_url)
    self.ping_method = None
    self.review_dao = ReviewDAO(self.page_uuid, self.page_url)

    assert isinstance(config, Config), \
        'config argument must be an instance of holmes.config.Config'
    self.config = config

    for facter in facters:
        message = 'All facters must subclass holmes.facters.Facter ' \
                  '(Error: %s)' % facter.__class__.__name__
        assert inspect.isclass(facter), message
        assert issubclass(facter, Facter), message

    for validator in validators:
        message = 'All validators must subclass ' \
                  'holmes.validators.base.Validator ' \
                  '(Error: %s)' % validator.__class__.__name__
        assert inspect.isclass(validator), message
        assert issubclass(validator, Validator), message

    self.validators = validators
    self.facters = facters
    self.search_provider = search_provider

    self.responses = {}
    self.raw_responses = {}
    self.status_codes = {}
    self.async_get_func = async_get
    self._wait_for_async_requests = wait
    self._wait_timeout = wait_timeout

    self.fact_definitions = fact_definitions
    self.violation_definitions = violation_definitions
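The assertions above mean facters and validators must be passed as classes, not instances, subclassing the holmes base classes. A minimal sketch of a call that satisfies them, using placeholder values and a hypothetical validator (module paths assumed from the holmes project layout):

from holmes.config import Config
from holmes.reviewer import Reviewer
from holmes.validators.base import Validator

class MyValidator(Validator):  # hypothetical, for illustration only
    def validate(self):
        pass

reviewer = Reviewer(
    api_url='http://localhost:2368/',  # placeholder
    page_uuid='some-page-uuid',        # placeholder
    page_url='http://globo.com/',
    page_score=0.0,
    config=Config(),
    validators=[MyValidator],          # the class itself, not MyValidator()
)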
def validate(self):
    links = self.get_links()

    page_domain, domain_url = get_domain_from_url(self.review.page_url)

    rel_nofollow = []
    for link in links:
        href = link.get('href')

        if not is_valid(href):
            continue

        link_domain, link_domain_url = get_domain_from_url(href)
        # Only flag rel="nofollow" on links pointing at the page's own
        # domain; nofollow on external links is legitimate.
        if link.get('rel') == 'nofollow' and page_domain == link_domain:
            rel_nofollow.append(href)

    if rel_nofollow:
        self.add_violation(
            key='invalid.links.nofollow',
            value=rel_nofollow,
            points=10 * len(rel_nofollow)
        )
def _start_job(self, job):
    try:
        lock = Lock(job['url'], redis=self.redis, timeout=1)
        lock.acquire()

        self.working_url = job['url']

        if self.working_url:
            self.domain_name, domain_url = get_domain_from_url(
                self.working_url)

        self._ping_api()
        job['lock'] = lock
        return True
    except LockTimeout:
        # Another worker already holds the lock for this URL; skip it.
        job['lock'] = None
        return False
def add_domain(cls, url, db, publish_method, config, girl,
               default_violations_values, violation_definitions, cache):
    from holmes.models import Domain, DomainsViolationsPrefs
    from holmes.material import expire_materials

    domain_name, domain_url = get_domain_from_url(url)

    # Match the stored domain regardless of a trailing slash.
    domains = db.query(Domain).filter(or_(
        Domain.name == domain_name,
        Domain.name == domain_name.rstrip('/'),
        Domain.name == '%s/' % domain_name
    )).all()

    domain = domains[0] if domains else None

    if not domain:
        url_hash = hashlib.sha512(domain_url).hexdigest()
        domain = Domain(url=domain_url, url_hash=url_hash, name=domain_name)
        db.add(domain)
        db.flush()

        expire_materials(girl)

        publish_method(dumps({
            'type': 'new-domain',
            'domainUrl': str(domain_url)
        }))

        keys = default_violations_values.keys()
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, keys, violation_definitions, cache)

        from holmes.models import Limiter
        connections = config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
        Limiter.add_or_update_limiter(db, domain_url, connections)

    return domain
def validate(self):
    blacklist_domains = self.get_violation_pref('blacklist.domains')

    domains = []
    links = self.get_links()

    for link in links:
        href = link.get('href')

        if not is_valid(href):
            continue

        link_domain, link_domain_url = get_domain_from_url(href)
        if link_domain in blacklist_domains:
            domains.append(href)

    if domains:
        self.add_violation(
            key='blacklist.domains',
            value=domains,
            points=100 * len(domains)
        )
def get(self):
    term = self.get_argument('term')

    page = self.db.query(Page) \
        .filter(or_(
            Page.url == term,
            Page.url == term.rstrip('/')
        )) \
        .filter(Page.last_review != None) \
        .first()  # noqa: SQLAlchemy requires '!= None' here, not 'is not None'

    if page is None:
        self.write_json(None)
        return

    domain_name, domain_url = get_domain_from_url(page.url)

    self.write_json({
        'uuid': str(page.uuid),
        'url': page.url,
        'reviewId': str(page.last_review.uuid),
        'domain': domain_name
    })
def add_page(cls, db, cache, url, score, fetch_method, publish_method,
             config, girl, default_violations_values,
             violation_definitions, callback):
    domain_name, domain_url = get_domain_from_url(url)

    if not url or not domain_name:
        callback((False, url, {
            'reason': 'invalid_url',
            'url': url,
            'status': None,
            'details': 'Domain name could not be determined.'
        }))
        return

    logging.debug('Obtaining "%s"...' % url)

    fetch_method(
        url,
        cls.handle_request(
            cls.handle_add_page(
                db, cache, url, score, publish_method, config, girl,
                default_violations_values, violation_definitions, callback
            )
        ),
        proxy_host=config.HTTP_PROXY_HOST,
        proxy_port=config.HTTP_PROXY_PORT
    )
def test_single_url_using_custom_scheme(self):
    domain, url = get_domain_from_url('globo.com', default_scheme='https')
    expect(domain).to_equal('globo.com')
    expect(url).to_equal('https://globo.com')

def test_localhost(self):
    domain, url = get_domain_from_url('http://localhost/Python.html')
    expect(domain).to_equal('localhost')
    expect(url).to_equal('http://localhost')

def test_page_invalid_url(self):
    domain, url = get_domain_from_url('help/Python.html')
    expect(domain).to_equal('')
    expect(url).to_equal('')

def test_page_with_www(self):
    domain, url = get_domain_from_url(
        'http://www.globo.com:80/%7Eguido/Python.html')
    expect(domain).to_equal('globo.com')
    expect(url).to_equal('http://www.globo.com')

def test_page_invalid_protocol(self):
    domain, url = get_domain_from_url('ttp://globo.com')
    expect(domain).to_equal('globo.com')
    expect(url).to_equal('http://globo.com')

def test_page_without_protocol_with_port(self):
    domain, url = get_domain_from_url('globo.com:80/%7Eguido/Python.html')
    expect(domain).to_equal('globo.com')
    expect(url).to_equal('http://globo.com')

def test_single_page_url_with_port(self):
    domain, url = get_domain_from_url('http://globo.com:80/index.html')
    expect(domain).to_equal('globo.com')
    expect(url).to_equal('http://globo.com')

def test_single_page_url(self):
    domain, url = get_domain_from_url('http://globo.com/index.html')
    expect(domain).to_equal('globo.com')
    expect(url).to_equal('http://globo.com')

def test_single_https_url(self):
    domain, url = get_domain_from_url('https://globo.com')
    expect(domain).to_equal('globo.com')
    expect(url).to_equal('https://globo.com')
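Taken together, these tests pin down the contract of get_domain_from_url: empty or relative inputs yield ('', ''), an explicit port is dropped, a leading 'www.' is stripped from the domain but kept in the returned base URL, and missing or unrecognized schemes fall back to default_scheme. The following is a minimal sketch consistent with those tests, written for illustration; it is an assumption, not the project's actual holmes.utils implementation:

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2

def get_domain_from_url(url, default_scheme='http'):
    # Hypothetical reimplementation inferred from the tests above.
    if not url:
        return '', ''

    parsed = urlparse(url)
    host = parsed.netloc

    if not host:
        # No netloc was found: either a bare domain ('globo.com:80/x')
        # or a relative path ('help/Python.html'); reparse with a scheme.
        parsed = urlparse('%s://%s' % (default_scheme, url))
        host = parsed.netloc

    scheme = parsed.scheme
    if scheme not in ('http', 'https'):
        scheme = default_scheme  # covers misspellings like 'ttp://'

    host = host.rsplit(':', 1)[0]  # drop an explicit port
    domain = host[4:] if host.startswith('www.') else host

    if '.' not in domain and domain != 'localhost':
        return '', ''  # dotless hosts other than localhost are rejected

    return domain, '%s://%s' % (scheme, host)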