Beispiel #1
0
    def test_null_url(self):
        """A missing URL (``None`` or empty string) gives an empty domain and URL."""
        for blank_url in (None, ''):
            domain, url = get_domain_from_url(blank_url)
            expect(domain).to_equal('')
            expect(url).to_equal('')
Beispiel #2
0
    def validate(self):
        """Distribute the page score over its links and report bad links.

        The reviewer's page score, minus the configured tax, is divided
        evenly between the links returned by ``get_links()``. A link whose
        domain occurs in ``self.page_url`` is passed to ``send_url`` with its
        share of the score. Any other link is reported as moved when it
        answered 302/307, or as broken when its status code is above 399.
        Collected broken/moved links are registered as violations and
        ``flush()`` is called at the end.
        """
        links = self.get_links()

        page_score = float(self.reviewer.page_score)
        tax_rate = float(self.reviewer.config.PAGE_SCORE_TAX_RATE)
        score_to_share = page_score - page_score * tax_rate

        # With no links the count falls back to 1.0 to avoid dividing by zero.
        score_per_link = score_to_share / (float(len(links)) or 1.0)

        for url, response in links:
            link_domain, _ = get_domain_from_url(url)

            if link_domain in self.page_url:
                self.send_url(response.effective_url, score_per_link, response)
                continue

            if response.status_code in (302, 307):
                self.moved_link_violation(url, response)
            elif response.status_code > 399:
                self.broken_link_violation(url, response)

        if self.broken_links:
            # Broken links are penalised per link.
            self.add_violation(key='link.broken',
                               value=self.broken_links,
                               points=100 * len(self.broken_links))

        if self.moved_links:
            # Moved links carry a flat penalty regardless of how many exist.
            self.add_violation(key='link.moved.temporarily',
                               value=self.moved_links,
                               points=100)

        self.flush()
Beispiel #3
0
 def get_blacklist_domain_count(cls, db):
     """Return blacklisted domains with their summed counts, largest first.

     Each ``(urls, count)`` pair obtained from
     ``Violation.get_group_by_value_for_key`` for the ``'blacklist.domains'``
     key adds ``count`` to the domain of every URL in ``urls``. The result is
     a list of ``{'domain': ..., 'count': ...}`` dicts ordered by
     descending count.
     """
     totals = defaultdict(int)
     grouped = Violation.get_group_by_value_for_key(db, 'blacklist.domains')
     for urls, count in grouped:
         for url in urls:
             domain, _ = get_domain_from_url(url)
             totals[domain] += count

     # Negating the count sorts from the highest total to the lowest.
     ranked = sorted(totals.items(), key=lambda pair: -pair[1])
     return [{'domain': name, 'count': total} for name, total in ranked]
Beispiel #4
0
    def __init__(
            self, api_url, page_uuid, page_url, page_score,
            config=None, validators=None, facters=None, search_provider=None, async_get=None,
            wait=None, wait_timeout=None, db=None, cache=None, publish=None,
            fact_definitions=None, violation_definitions=None, girl=None):
        """Set up the review state for a single page.

        Args:
            api_url, page_uuid, page_url, page_score: stored on the instance;
                ``page_url`` is also used to derive ``domain_name`` and
                ``domain_url`` and to build the ``ReviewDAO``.
            config: must be an instance of ``holmes.config.Config``.
            validators: iterable of ``Validator`` subclasses (classes, not
                instances). Defaults to a new empty list.
            facters: iterable of ``Facter`` subclasses (classes, not
                instances). Defaults to a new empty list.
            search_provider, async_get, wait, wait_timeout, db, cache,
            publish, fact_definitions, violation_definitions, girl:
                stored on the instance unchanged.

        Raises:
            AssertionError: if ``config`` is not a ``Config`` instance, or if
                any facter/validator is not a class of the expected type.
        """
        # Use None sentinels instead of mutable ``[]`` defaults: the lists are
        # stored on the instance, so a shared default list would leak
        # mutations from one instance to every other one.
        if validators is None:
            validators = []
        if facters is None:
            facters = []

        self.db = db
        self.cache = cache
        self.girl = girl
        self.publish = publish

        self.api_url = api_url

        self.page_uuid = page_uuid
        self.page_url = page_url
        self.page_score = page_score

        self.domain_name, self.domain_url = get_domain_from_url(self.page_url)

        self.ping_method = None

        self.review_dao = ReviewDAO(self.page_uuid, self.page_url)

        assert isinstance(config, Config), 'config argument must be an instance of holmes.config.Config'
        self.config = config

        for facter in facters:
            message = 'All facters must subclass holmes.facters.Facter (Error: %s)' % facter.__class__.__name__
            # isclass is checked first because issubclass raises TypeError
            # when given something that is not a class.
            assert inspect.isclass(facter), message
            assert issubclass(facter, Facter), message

        for validator in validators:
            message = 'All validators must subclass holmes.validators.base.Validator (Error: %s)' % validator.__class__.__name__
            assert inspect.isclass(validator), message
            assert issubclass(validator, Validator), message

        self.validators = validators
        self.facters = facters

        self.search_provider = search_provider

        # Per-review bookkeeping, initially empty.
        self.responses = {}
        self.raw_responses = {}
        self.status_codes = {}

        self.async_get_func = async_get
        self._wait_for_async_requests = wait
        self._wait_timeout = wait_timeout

        self.fact_definitions = fact_definitions
        self.violation_definitions = violation_definitions
    def validate(self):
        """Report links to the page's own domain that carry rel="nofollow".

        Links whose ``href`` fails ``is_valid`` are ignored. Every remaining
        link with ``rel == 'nofollow'`` and the same domain as the reviewed
        page is collected; if any were found, an
        ``'invalid.links.nofollow'`` violation worth 10 points per link is
        added.
        """
        links = self.get_links()
        page_domain, _ = get_domain_from_url(self.review.page_url)

        nofollow_hrefs = []
        for link in links:
            href = link.get('href')
            if not is_valid(href):
                continue

            link_domain, _ = get_domain_from_url(href)
            is_nofollow = link.get('rel') == 'nofollow'
            if is_nofollow and page_domain == link_domain:
                nofollow_hrefs.append(href)

        if nofollow_hrefs:
            self.add_violation(
                key='invalid.links.nofollow',
                value=nofollow_hrefs,
                points=10 * len(nofollow_hrefs)
            )
Beispiel #6
0
    def _start_job(self, job):
        """Lock the job's URL and record it as the URL being worked on.

        Returns ``True`` with the acquired lock stored in ``job['lock']``.
        If ``LockTimeout`` is raised, ``job['lock']`` is set to ``None`` and
        ``False`` is returned.
        """
        try:
            url = job['url']
            url_lock = Lock(url, redis=self.redis, timeout=1)
            url_lock.acquire()

            self.working_url = url
            if url:
                # Only the domain name is kept; the domain URL is not needed.
                self.domain_name, _ = get_domain_from_url(url)

            self._ping_api()
            job['lock'] = url_lock
        except LockTimeout:
            job['lock'] = None
            return False

        return True
Beispiel #7
0
    def add_domain(cls, url, db, publish_method, config, girl,
                   default_violations_values, violation_definitions, cache):
        """Return the ``Domain`` matching ``url``, creating it when missing.

        The lookup matches the domain name as-is, without trailing slashes,
        and with one trailing slash appended. When no domain is found a new
        one is added and flushed, materials are expired, a ``'new-domain'``
        message is published, default violation preferences are inserted and
        a limiter is added or updated for the domain URL.
        """
        from holmes.models import Domain, DomainsViolationsPrefs
        from holmes.material import expire_materials

        domain_name, domain_url = get_domain_from_url(url)

        query = db.query(Domain)
        name_matches = or_(
            Domain.name == domain_name,
            Domain.name == domain_name.rstrip('/'),
            Domain.name == "%s/" % domain_name)
        found = query.filter(name_matches).all()

        domain = found[0] if found else None
        if domain:
            return domain

        # No existing domain: create it and run the one-time setup steps.
        domain = Domain(url=domain_url,
                        url_hash=hashlib.sha512(domain_url).hexdigest(),
                        name=domain_name)
        db.add(domain)
        db.flush()

        expire_materials(girl)

        announcement = dumps({
            'type': 'new-domain',
            'domainUrl': str(domain_url)
        })
        publish_method(announcement)

        violation_keys = default_violations_values.keys()
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, violation_keys, violation_definitions, cache)

        from holmes.models import Limiter
        Limiter.add_or_update_limiter(
            db, domain_url, config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS)

        return domain
Beispiel #8
0
    def validate(self):
        """Flag links whose domain is in the 'blacklist.domains' preference.

        Links with an ``href`` rejected by ``is_valid`` are skipped. When at
        least one blacklisted link is found, a ``'blacklist.domains'``
        violation worth 100 points per link is added with the offending
        hrefs as its value.
        """
        blacklist_domains = self.get_violation_pref('blacklist.domains')
        links = self.get_links()

        # The inner generator is lazy, so each href is fetched, validated and
        # checked one link at a time.
        blacklisted_hrefs = [
            href
            for href in (link.get('href') for link in links)
            if is_valid(href)
            and get_domain_from_url(href)[0] in blacklist_domains
        ]

        if blacklisted_hrefs:
            self.add_violation(
                key='blacklist.domains',
                value=blacklisted_hrefs,
                points=100 * len(blacklisted_hrefs)
            )
Beispiel #9
0
    def get(self):
        """Write a JSON summary of the reviewed page whose URL equals ``term``.

        The ``term`` argument is matched against ``Page.url`` both as given
        and with trailing slashes removed, and only pages that have a last
        review are considered. ``None`` is written when nothing matches.
        """
        term = self.get_argument('term')

        query = self.db.query(Page)
        url_matches = or_(
            Page.url == term,
            Page.url == term.rstrip('/')
        )
        # NOTE(review): `!= None` is kept deliberately; this appears to be a
        # SQLAlchemy column comparison, where `is not None` would not build
        # a SQL clause -- confirm before "fixing" it.
        reviewed = query.filter(url_matches).filter(Page.last_review != None)
        page = reviewed.first()

        if page is None:
            self.write_json(None)
            return

        domain_name, _ = get_domain_from_url(page.url)

        payload = {
            "uuid": str(page.uuid),
            "url": page.url,
            "reviewId": str(page.last_review.uuid),
            "domain": domain_name
        }
        self.write_json(payload)
Beispiel #10
0
    def add_page(cls, db, cache, url, score, fetch_method, publish_method,
                 config, girl, default_violations_values,
                 violation_definitions, callback):
        """Fetch ``url`` and hand the response to the add-page handler.

        When ``url`` is empty or no domain name can be derived from it,
        ``callback`` receives a ``(False, url, details)`` tuple with reason
        ``'invalid_url'`` and nothing is fetched. Otherwise ``fetch_method``
        is called with the URL, the wrapped add-page handler and the proxy
        settings from ``config``.
        """
        domain_name, _ = get_domain_from_url(url)

        if not (url and domain_name):
            failure = {
                'reason': 'invalid_url',
                'url': url,
                'status': None,
                'details': 'Domain name could not be determined.'
            }
            callback((False, url, failure))
            return

        logging.debug('Obtaining "%s"...' % url)

        on_page_added = cls.handle_add_page(
            db, cache, url, score, publish_method, config, girl,
            default_violations_values, violation_definitions, callback)
        on_response = cls.handle_request(on_page_added)

        fetch_method(url,
                     on_response,
                     proxy_host=config.HTTP_PROXY_HOST,
                     proxy_port=config.HTTP_PROXY_PORT)
Beispiel #11
0
 def test_single_url_using_custom_scheme(self):
     """A bare host name gets the scheme given as ``default_scheme``."""
     result = get_domain_from_url('globo.com', default_scheme='https')
     domain_name, domain_url = result
     expect(domain_name).to_equal('globo.com')
     expect(domain_url).to_equal('https://globo.com')
Beispiel #12
0
 def test_localhost(self):
     """A localhost page URL resolves to the localhost domain and base URL."""
     result = get_domain_from_url('http://localhost/Python.html')
     domain_name, domain_url = result
     expect(domain_name).to_equal('localhost')
     expect(domain_url).to_equal('http://localhost')
Beispiel #13
0
 def test_page_invalid_url(self):
     """A relative path without a host gives an empty domain and URL."""
     result = get_domain_from_url('help/Python.html')
     domain_name, domain_url = result
     expect(domain_name).to_equal('')
     expect(domain_url).to_equal('')
Beispiel #14
0
 def test_page_with_www(self):
     """The domain name drops ``www.``; the domain URL keeps it, minus the port."""
     page_url = 'http://www.globo.com:80/%7Eguido/Python.html'
     domain_name, domain_url = get_domain_from_url(page_url)
     expect(domain_name).to_equal('globo.com')
     expect(domain_url).to_equal('http://www.globo.com')
Beispiel #15
0
 def test_page_invalid_protocol(self):
     """A URL with the unknown ``ttp`` scheme comes back with ``http``."""
     result = get_domain_from_url('ttp://globo.com')
     domain_name, domain_url = result
     expect(domain_name).to_equal('globo.com')
     expect(domain_url).to_equal('http://globo.com')
Beispiel #16
0
 def test_page_without_protocol_with_port(self):
     """A scheme-less URL with a port yields an ``http`` domain URL without the port."""
     page_url = 'globo.com:80/%7Eguido/Python.html'
     domain_name, domain_url = get_domain_from_url(page_url)
     expect(domain_name).to_equal('globo.com')
     expect(domain_url).to_equal('http://globo.com')
Beispiel #17
0
 def test_single_page_url_with_port(self):
     """The port and path are dropped from the returned domain URL."""
     page_url = "http://globo.com:80/index.html"
     domain_name, domain_url = get_domain_from_url(page_url)
     expect(domain_name).to_equal('globo.com')
     expect(domain_url).to_equal('http://globo.com')
Beispiel #18
0
 def test_single_page_url(self):
     """The path is dropped from the returned domain URL."""
     page_url = 'http://globo.com/index.html'
     domain_name, domain_url = get_domain_from_url(page_url)
     expect(domain_name).to_equal('globo.com')
     expect(domain_url).to_equal('http://globo.com')
Beispiel #19
0
 def test_single_https_url(self):
     """An https URL keeps its scheme in the returned domain URL."""
     result = get_domain_from_url('https://globo.com')
     domain_name, domain_url = result
     expect(domain_name).to_equal('globo.com')
     expect(domain_url).to_equal('https://globo.com')