Exemple #1
0
    def add_domain(cls, url, db, publish_method, config):
        """Return the Domain matching ``url``, creating it when none exists.

        The lookup accepts the domain name as extracted, with trailing slashes
        stripped, or with one slash appended. A newly created domain is
        flushed and committed, announced through ``publish_method`` as a
        ``new-domain`` JSON message, and given a limiter using
        ``config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS``.
        """
        from holmes.models import Domain

        domain_name, domain_url = get_domain_from_url(url)

        name_variants = (
            domain_name,
            domain_name.rstrip('/'),
            "%s/" % domain_name,
        )
        matches = db.query(Domain).filter(
            or_(*[Domain.name == variant for variant in name_variants])
        ).all()

        domain = matches[0] if matches else None
        if domain:
            return domain

        # No stored domain: create one keyed by the SHA-512 of its URL.
        digest = hashlib.sha512(domain_url).hexdigest()
        domain = Domain(url=domain_url, url_hash=digest, name=domain_name)
        db.add(domain)
        db.flush()
        db.commit()

        announcement = {
            'type': 'new-domain',
            'domainUrl': str(domain_url)
        }
        publish_method(dumps(announcement))

        from holmes.models import Limiter
        Limiter.add_or_update_limiter(
            db, domain_url, config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
        )

        return domain
    def test_can_get_limiter_by_id(self):
        # A stored limiter can be fetched back through its id.
        created = LimiterFactory.create()
        found = Limiter.by_id(created.id, self.db)
        expect(found.id).to_equal(created.id)

        # An id of -1 yields a null result.
        missing = Limiter.by_id(-1, self.db)
        expect(missing).to_be_null()
    def test_can_get_limiter_by_url(self):
        # Start from an empty limiter table.
        self.db.query(Limiter).delete()

        target_url = 'http://test.com/'
        created = LimiterFactory.create(url=target_url)

        # The exact URL finds the limiter that was just created.
        found = Limiter.by_url(target_url, self.db)
        expect(found.id).to_equal(created.id)

        # A longer URL under the same host yields a null result.
        missing = Limiter.by_url('http://test.com/1', self.db)
        expect(missing).to_be_null()
    def test_can_delete_one_limiter(self):
        created = LimiterFactory.create()

        # The limiter is retrievable before the delete.
        before = Limiter.by_id(created.id, self.db)
        expect(before.id).to_equal(created.id)

        # Limiter.delete reports one affected row.
        removed_count = Limiter.delete(created.id, self.db)
        expect(removed_count).to_equal(1)

        # Looking the same id up again yields a null result.
        after = Limiter.by_id(created.id, self.db)
        expect(after).to_be_null()
    def test_can_get_limiter_by_url_hash(self):
        # Start from an empty limiter table.
        self.db.query(Limiter).delete()

        created = LimiterFactory.create(url='http://test.com/')

        # Look the limiter up by the SHA-512 hex digest of its URL.
        digest = hashlib.sha512('http://test.com/').hexdigest()
        found = Limiter.by_url_hash(digest, self.db)
        expect(found.id).to_equal(created.id)

        # An unrelated hash value yields a null result.
        missing = Limiter.by_url_hash('00000000', self.db)
        expect(missing).to_be_null()
    def test_can_save_limiters_with_access_token(self):
        # Start from empty limiter and user tables.
        self.db.query(Limiter).delete()
        self.db.query(User).delete()

        last_login = datetime(2014, 2, 14, 15, 0, 30)

        UserFactory(email='*****@*****.**')

        # Canned body for the mocked request made during authentication.
        auth_payload = {
            'is_superuser': True,
            'fullname': u'Marcelo Jorge Vieira',
            'last_login': last_login,
            'email': u'*****@*****.**'
        }
        self.mock_request(code=200, body=dumps(auth_payload))

        # POST a limiter while presenting an access-token header.
        endpoint = self.get_url('/limiters')
        limiter_payload = dumps({
            'url': 'http://globo.com/',
            'maxValue': 10
        })
        yield self.http_client.fetch(
            endpoint,
            method='POST',
            body=limiter_payload,
            headers={'X-AUTH-HOLMES': '111'}
        )

        stored = Limiter.by_url('http://globo.com/', self.db)
        expect(stored).not_to_be_null()
Exemple #7
0
    def post(self):
        """Create or update a limiter on behalf of an authenticated user.

        Responds 403 when the ``X-AUTH-HOLMES`` header is absent or when
        ``User.authenticate`` does not yield a user, and 400 when the JSON
        body carries no ``url`` or a null ``value``. Otherwise it stores the
        limiter, clears the cached domain limiters and writes the result of
        ``Limiter.add_or_update_limiter`` as JSON. ``value`` defaults to
        ``DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS`` from the app config.
        """
        access_token = self.request.headers.get('X-AUTH-HOLMES', None)

        if access_token is None:
            self.set_status(403)
            self.write_json({'reason': 'Empty access token'})
            return

        result = yield User.authenticate(
            access_token,
            self.application.http_client.fetch,
            self.db,
            self.application.config
        )

        # A falsy result must be rejected too: the previous
        # ``result and ...`` test let it fall through as authorized.
        if not result or result.get('user', None) is None:
            self.set_status(403)
            self.write_json({'reason': 'Not authorized user.'})
            return

        post_data = loads(self.request.body)
        url = post_data.get('url', None)
        connections = self.application.config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
        value = post_data.get('value', connections)

        # Reject when either piece is missing. ``value`` is compared to None
        # rather than tested for truthiness so an explicit 0 stays acceptable.
        if not url or value is None:
            self.set_status(400)
            self.write_json({'reason': 'Not url or value'})
            return

        result = Limiter.add_or_update_limiter(self.db, url, value)

        yield self.cache.remove_domain_limiters_key()

        self.write_json(result)
Exemple #8
0
    def add_domain(cls, url, db, publish_method, config, girl, default_violations_values, violation_definitions, cache):
        """Return the Domain matching ``url``, creating it when none exists.

        The lookup accepts the domain name as extracted, with trailing slashes
        stripped, or with one slash appended. Creating a domain flushes it to
        the session (no commit here), expires materials through ``girl``,
        publishes a ``new-domain`` JSON message, inserts the default violation
        preferences for the domain and registers a limiter using
        ``config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS``.
        """
        from holmes.models import Domain, DomainsViolationsPrefs
        from holmes.material import expire_materials

        domain_name, domain_url = get_domain_from_url(url)

        name_variants = [
            domain_name,
            domain_name.rstrip("/"),
            "%s/" % domain_name,
        ]
        name_clauses = [Domain.name == variant for variant in name_variants]
        matches = db.query(Domain).filter(or_(*name_clauses)).all()

        domain = matches[0] if matches else None
        if domain:
            return domain

        # No stored domain: create one keyed by the SHA-512 of its URL.
        digest = hashlib.sha512(domain_url).hexdigest()
        domain = Domain(url=domain_url, url_hash=digest, name=domain_name)
        db.add(domain)
        db.flush()

        expire_materials(girl)

        announcement = {"type": "new-domain", "domainUrl": str(domain_url)}
        publish_method(dumps(announcement))

        violation_keys = default_violations_values.keys()
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, violation_keys, violation_definitions, cache
        )

        from holmes.models import Limiter

        Limiter.add_or_update_limiter(
            db, domain_url, config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
        )

        return domain
    def test_can_get_all_limiters(self):
        # Start from an empty limiter table so the counts below are exact.
        self.db.query(Limiter).delete()

        known = LimiterFactory.create(url='http://test.com/')
        for _ in range(2):
            LimiterFactory.create()

        # Without a filter, all three limiters are listed.
        everything = Limiter.get_all(self.db)
        expect(everything).not_to_be_null()
        expect(everything).to_length(3)
        expect(everything).to_include(known)

        # With a domain filter, only one limiter is listed.
        DomainFactory.create(name='test.com', url='http://test.com')
        filtered = Limiter.get_all(self.db, domain_filter='test.com')
        expect(filtered).not_to_be_null()
        expect(filtered).to_length(1)
Exemple #10
0
    def test_can_delete_limiter_as_superuser(self):
        # One fresh limiter and one superuser, after clearing each table.
        self.db.query(Limiter).delete()
        target = LimiterFactory.create()
        self.db.query(User).delete()
        admin = UserFactory(email='*****@*****.**', is_superuser=True)

        before = Limiter.by_id(target.id, self.db)
        expect(before).not_to_be_null()

        # DELETE as the superuser: expects 204 with an empty body.
        path = '/limiters/%d' % target.id
        response = yield self.authenticated_fetch(
            path, user_email=admin.email, method='DELETE'
        )
        expect(response.code).to_equal(204)
        expect(response.body).to_length(0)

        # The limiter can no longer be loaded.
        after = Limiter.by_id(target.id, self.db)
        expect(after).to_be_null()
Exemple #11
0
    def get_limiter_buckets(self, active_domains, avg_links_per_page=10.0):
        """Pair each limiter matching ``active_domains`` with its free capacity.

        Limiters are visited in descending ``url`` order. Capacity is the
        limiter's ``value`` minus ``self.get_limit_usage(limiter.url)``,
        converted to float. ``avg_links_per_page`` is accepted but not used
        in this method.
        """
        matching = Limiter.get_limiters_for_domains(self.db, active_domains)
        ascending = sorted(matching, key=lambda item: item.url)

        return [
            (limiter, float(limiter.value - self.get_limit_usage(limiter.url)))
            for limiter in reversed(ascending)
        ]
Exemple #12
0
    def add_domain(cls, url, db, publish_method, config, girl,
                   default_violations_values, violation_definitions, cache):
        """Look up the Domain for ``url`` and create it if it is missing.

        A stored domain matches when its name equals the extracted name, that
        name with trailing slashes stripped, or that name plus one slash. When
        a domain is created it is flushed (not committed here), materials are
        expired through ``girl``, a ``new-domain`` JSON message is published,
        default violation preferences are inserted, and a limiter is
        registered with ``config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS``.
        """
        from holmes.models import Domain, DomainsViolationsPrefs
        from holmes.material import expire_materials

        domain_name, domain_url = get_domain_from_url(url)

        same_name = Domain.name == domain_name
        without_slash = Domain.name == domain_name.rstrip('/')
        with_slash = Domain.name == "%s/" % domain_name
        query = db.query(Domain).filter(
            or_(same_name, without_slash, with_slash)
        )
        found = query.all()

        domain = found[0] if found else None
        if domain:
            return domain

        # Nothing stored yet: build the record keyed by its URL's SHA-512.
        url_hash = hashlib.sha512(domain_url).hexdigest()
        domain = Domain(url=domain_url, url_hash=url_hash, name=domain_name)
        db.add(domain)
        db.flush()

        expire_materials(girl)

        message = dumps({
            'type': 'new-domain',
            'domainUrl': str(domain_url)
        })
        publish_method(message)

        violation_keys = default_violations_values.keys()
        DomainsViolationsPrefs.insert_default_violations_values_for_domain(
            db, domain, violation_keys, violation_definitions, cache
        )

        from holmes.models import Limiter
        default_connections = config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
        Limiter.add_or_update_limiter(db, domain_url, default_connections)

        return domain
    def test_get_limiters_for_domains(self):
        # Start from empty limiter and domain tables.
        self.db.query(Limiter).delete()
        self.db.query(Domain).delete()

        # Two factory-default domains, created before the limiter.
        for _ in range(2):
            DomainFactory.create()

        limiter_url = 'http://globo.com/'
        Limiter.add_or_update_limiter(self.db, limiter_url, 2)

        # One domain whose URL is the limiter's URL.
        DomainFactory.create(url=limiter_url, name='globo.com')

        every_domain = self.db.query(Domain).all()
        matched = Limiter.get_limiters_for_domains(self.db, every_domain)

        expect(matched).to_length(1)
        expect(str(matched[0])).to_equal(limiter_url)
    def test_can_delete_limiter_as_superuser(self):
        # Reset limiters and users, leaving one limiter and one superuser.
        self.db.query(Limiter).delete()
        doomed = LimiterFactory.create()
        self.db.query(User).delete()
        superuser = UserFactory(email='*****@*****.**', is_superuser=True)

        expect(Limiter.by_id(doomed.id, self.db)).not_to_be_null()

        # The authenticated DELETE answers 204 with an empty body.
        response = yield self.authenticated_fetch(
            '/limiters/%d' % doomed.id,
            user_email=superuser.email,
            method='DELETE'
        )
        expect(response.code).to_equal(204)
        expect(response.body).to_length(0)

        # Loading the same id afterwards gives a null result.
        expect(Limiter.by_id(doomed.id, self.db)).to_be_null()
Exemple #15
0
    def get_limiter_buckets(self, active_domains, avg_links_per_page=10.0):
        """Return ``(limiter, capacity)`` tuples for the active domains.

        The limiters come from ``Limiter.get_limiters_for_domains`` and are
        ordered by ``url``, highest first. ``capacity`` is the float
        difference between the limiter's ``value`` and its current usage
        reported by ``self.get_limit_usage``. ``avg_links_per_page`` is not
        referenced in this method.
        """
        ordered = sorted(
            Limiter.get_limiters_for_domains(self.db, active_domains),
            key=lambda item: item.url
        )
        # In-place reversal yields the same order as reversed(sorted(...)).
        ordered.reverse()

        buckets = []
        for limiter in ordered:
            in_use = self.get_limit_usage(limiter.url)
            buckets.append((limiter, float(limiter.value - in_use)))
        return buckets
Exemple #16
0
    def get_limiters_for_domains(cls, db, active_domains):
        """Collect the limiters whose ``matches`` accepts an active domain URL.

        Every limiter from ``Limiter.get_all(db)`` is checked against every
        domain, so a limiter appears once per matching domain in the result.
        """
        from holmes.models import Limiter  # Avoid circular dependency

        return [
            limiter
            for limiter in Limiter.get_all(db)
            for domain in active_domains
            if limiter.matches(domain.url)
        ]
Exemple #17
0
    def delete(self, limiter_id=None):
        """Delete the limiter identified by ``limiter_id``.

        Returns immediately when ``validate_superuser()`` is falsy. Answers
        400 when no id is given and 404 when the id does not resolve to a
        limiter. On success it removes the limiter, clears the cached domain
        limiters and sets status 204.
        """
        if not self.validate_superuser():
            return

        if not limiter_id:
            self.set_status(400)
            invalid = {
                'reason': 'Invalid data',
                'description': self._('Invalid data')
            }
            self.write_json(invalid)
            return

        target = Limiter.by_id(limiter_id, self.db)

        if not target or not target.id:
            self.set_status(404)
            not_found = {
                'reason': 'Not Found',
                'description': self._('Not Found')
            }
            self.write_json(not_found)
            return

        Limiter.delete(target.id, self.db)

        yield self.cache.remove_domain_limiters_key()

        self.set_status(204)
Exemple #18
0
    def get_limiters_for_domains(cls, db, active_domains):
        """Return limiters that match the URL of any domain in ``active_domains``.

        Limiters are read with ``Limiter.get_all(db)``. Each limiter is added
        once for every domain it matches, so the result may repeat a limiter.
        """
        from holmes.models import Limiter  # Avoid circular dependency

        matched = []
        for candidate in Limiter.get_all(db):
            matched.extend(
                candidate
                for domain in active_domains
                if candidate.matches(domain.url)
            )
        return matched
Exemple #19
0
    def get_domain_limiters(self):
        """Return the domain limiters, reading through the redis cache.

        A truthy ``domain-limiters`` entry is decoded with ``loads`` and
        returned. Otherwise the limiters are read from the database, turned
        into a list of ``{url: value}`` dicts, stored with the
        ``LIMITER_VALUES_CACHE_EXPIRATION`` setting and returned. When the
        database has no limiters either, the falsy cache value is returned
        unchanged.
        """
        cached = self.redis.get('domain-limiters')
        if cached:
            return loads(cached)

        limiters = Limiter.get_all(self.db)
        if not limiters:
            return cached

        domains = [{d.url: d.value} for d in limiters]
        expiration = self.config.LIMITER_VALUES_CACHE_EXPIRATION
        self.set_domain_limiters(domains, expiration)
        return domains
Exemple #20
0
    def get_domain_limiters(self):
        """Fetch the per-domain limiter values, preferring the redis copy.

        If redis holds a truthy ``domain-limiters`` value it is decoded with
        ``loads``. If not, the limiters are loaded from the database; when
        there are any, they are converted to ``{url: value}`` dicts and
        written back through ``set_domain_limiters`` using
        ``LIMITER_VALUES_CACHE_EXPIRATION``. With nothing cached and nothing
        stored, the falsy redis value is what gets returned.
        """
        raw = self.redis.get('domain-limiters')
        if raw:
            return loads(raw)

        result = raw
        stored = Limiter.get_all(self.db)
        if stored:
            result = []
            for limiter in stored:
                result.append({limiter.url: limiter.value})
            self.set_domain_limiters(
                result, self.config.LIMITER_VALUES_CACHE_EXPIRATION
            )

        return result
    def test_can_save_limiters_as_superuser(self):
        # Start from empty limiter and user tables, then add a superuser.
        self.db.query(Limiter).delete()
        self.db.query(User).delete()
        admin = UserFactory(email='*****@*****.**', is_superuser=True)

        payload = dumps({
            'url': 'http://globo.com/',
            'maxValue': 10
        })
        response = yield self.authenticated_fetch(
            '/limiters',
            user_email=admin.email,
            method='POST',
            body=payload
        )
        expect(response).not_to_be_null()
        expect(response.code).to_equal(200)

        # The posted URL now resolves to a stored limiter.
        stored = Limiter.by_url('http://globo.com/', self.db)
        expect(stored).not_to_be_null()
Exemple #22
0
    def test_cant_delete_limiter_as_normal_user(self):
        # Reset limiters and users, leaving one limiter and one non-superuser.
        self.db.query(Limiter).delete()
        limiter = LimiterFactory.create()
        self.db.query(User).delete()
        user = UserFactory(email='*****@*****.**', is_superuser=False)

        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

        try:
            yield self.authenticated_fetch('/limiters/%d' % limiter.id,
                                           user_email=user.email,
                                           method='DELETE')
        except HTTPError as e:
            expect(e).not_to_be_null()
            expect(e.code).to_equal(401)
            expect(e.response.reason).to_be_like('Unauthorized')
        else:
            # Without this branch the test passed when the DELETE succeeded.
            assert False, 'Should not have got this far'

        # The rejected request must leave the limiter in place.
        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()
    def test_cant_delete_limiter_as_normal_user(self):
        # Reset limiters and users, leaving one limiter and one non-superuser.
        self.db.query(Limiter).delete()
        limiter = LimiterFactory.create()
        self.db.query(User).delete()
        user = UserFactory(email='*****@*****.**', is_superuser=False)

        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

        try:
            yield self.authenticated_fetch(
                '/limiters/%d' % limiter.id, user_email=user.email,
                method='DELETE'
            )
        except HTTPError as e:
            expect(e).not_to_be_null()
            expect(e.code).to_equal(401)
            expect(e.response.reason).to_be_like('Unauthorized')
        else:
            # Without this branch the test passed when the DELETE succeeded.
            assert False, 'Should not have got this far'

        # The rejected request must leave the limiter in place.
        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()
Exemple #24
0
    def test_can_save_limiters_as_superuser(self):
        # Clear limiters and users, then create the superuser who will post.
        self.db.query(Limiter).delete()
        self.db.query(User).delete()
        superuser = UserFactory(email='*****@*****.**', is_superuser=True)

        request_body = dumps({'url': 'http://globo.com/', 'maxValue': 10})
        response = yield self.authenticated_fetch(
            '/limiters',
            user_email=superuser.email,
            method='POST',
            body=request_body
        )
        expect(response).not_to_be_null()
        expect(response.code).to_equal(200)

        # A limiter for the posted URL can be loaded afterwards.
        saved = Limiter.by_url('http://globo.com/', self.db)
        expect(saved).not_to_be_null()
Exemple #25
0
    def get(self):
        """Write every limiter with its current usage as a JSON list.

        Each entry carries the limiter's id and url, the usage read from the
        cache (``currentValue``), the configured ``maxValue`` and the ratio
        of the two (``concurrentRequestsPercentage``). The ratio is 0 when
        the limiter's value is not positive.
        """
        limiters = Limiter.get_all(self.db)

        result = []

        for limit in limiters:
            # Parenthesised so ``or 0`` applies to the resolved usage.
            # ``yield x or 0`` parses as ``yield (x or 0)``, which left
            # current_value as None whenever the cache had no entry.
            current_value = (yield self.cache.get_limit_usage(limit.url)) or 0

            percentage = 0
            if limit.value > 0:
                percentage = float(current_value) / limit.value

            result.append({
                'id': limit.id,
                'url': limit.url,
                'currentValue': current_value,
                'maxValue': limit.value or 0,
                'concurrentRequestsPercentage': percentage
            })

        self.write_json(result)
Exemple #26
0
    def post(self):
        """Create or update a limiter from the JSON request body.

        Returns immediately when ``validate_superuser()`` is falsy. Answers
        400 when the body has no ``url`` or a null ``value``. Otherwise it
        stores the limiter, clears the cached domain limiters and writes the
        result of ``Limiter.add_or_update_limiter`` as JSON. ``value``
        defaults to ``config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS``.
        """
        if not self.validate_superuser():
            return

        post_data = loads(self.request.body)
        url = post_data.get('url', None)
        connections = self.config.DEFAULT_NUMBER_OF_CONCURRENT_CONNECTIONS
        value = post_data.get('value', connections)

        # Reject when either piece is missing. The previous ``and`` test let
        # a missing url through, because ``value`` falls back to the config
        # default. ``value`` is compared to None so an explicit 0 is kept.
        if not url or value is None:
            self.set_status(400)
            self.write_json({
                'reason': 'Not url or value',
                'description': self._('Not url or value')
            })
            return

        result = Limiter.add_or_update_limiter(self.db, url, value)

        yield self.cache.remove_domain_limiters_key()

        self.write_json(result)
    def test_can_add_or_update_limiter(self):
        # Start from an empty limiter table and confirm it is empty.
        self.db.query(Limiter).delete()
        expect(Limiter.get_all(self.db)).to_equal([])

        target_url = 'http://globo.com/'

        # First call for the URL: the limiter is stored with value 2.
        Limiter.add_or_update_limiter(self.db, target_url, 2)
        added = Limiter.by_url(target_url, self.db)
        expect(added.value).to_equal(2)

        # Second call for the same URL: the value becomes 3.
        Limiter.add_or_update_limiter(self.db, target_url, 3)
        updated = Limiter.by_url(target_url, self.db)
        expect(updated.value).to_equal(3)
Exemple #28
0
    def add_or_update_limiter(cls, db, url, value):
        """Store ``value`` as the limit for ``url`` and return the limiter url.

        Returns None without touching the database when ``url`` is falsy.
        The limiter is looked up by the SHA-512 hex digest of the UTF-8
        encoded url. An existing limiter has its value updated, flushed and
        committed; a new one is only added to the session.
        """
        if not url:
            return

        url = url.encode('utf-8')
        url_hash = hashlib.sha512(url).hexdigest()
        existing = Limiter.by_url_hash(url_hash, db)

        if not existing:
            # NOTE(review): unlike the update path below, this path neither
            # flushes nor commits - confirm the caller is expected to.
            created = Limiter(url=url, url_hash=url_hash, value=value)
            db.add(created)
            return created.url

        same_row = db.query(Limiter).filter(Limiter.id == existing.id)
        same_row.update({'value': value})

        db.flush()
        db.commit()

        return existing.url
Exemple #29
0
    def get(self):
        """Write the limiters, optionally filtered by domain, as a JSON list.

        The optional ``domain_filter`` request argument is passed on to
        ``Limiter.get_all``. Each entry carries the limiter's id and url, the
        usage read from the cache (``currentValue``), the configured
        ``maxValue`` and the ratio of the two
        (``concurrentRequestsPercentage``). The ratio is 0 when the limiter's
        value is not positive.
        """
        limiters = Limiter.get_all(
            self.db, domain_filter=self.get_argument('domain_filter', None)
        )

        result = []

        for limit in limiters:
            # Parenthesised so ``or 0`` applies to the resolved usage.
            # ``yield x or 0`` parses as ``yield (x or 0)``, which left
            # current_value as None whenever the cache had no entry.
            current_value = (yield self.cache.get_limit_usage(limit.url)) or 0

            percentage = 0
            if limit.value > 0:
                percentage = float(current_value) / limit.value

            result.append({
                'id': limit.id,
                'url': limit.url,
                'currentValue': current_value,
                'maxValue': limit.value or 0,
                'concurrentRequestsPercentage': percentage
            })

        self.write_json(result)
Exemple #30
0
        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

        try:
            yield self.authenticated_fetch('/limiters/%d' % limiter.id,
                                           user_email=user.email,
                                           method='DELETE')
        except HTTPError, e:
            expect(e).not_to_be_null()
            expect(e.code).to_equal(401)
            expect(e.response.reason).to_be_like('Unauthorized')
        else:
            assert False, 'Should not have got this far'

        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

    @gen_test
    def test_can_delete_limiter_as_superuser(self):
        # Reset limiters and users, leaving one limiter and one superuser.
        self.db.query(Limiter).delete()
        limiter = LimiterFactory.create()
        self.db.query(User).delete()
        user = UserFactory(email='*****@*****.**', is_superuser=True)

        # The limiter exists before the request is made.
        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

        # Issue the DELETE as the superuser.
        # NOTE(review): `response` is never inspected in the visible lines;
        # this snippet looks truncated - confirm against the full test.
        response = yield self.authenticated_fetch('/limiters/%d' % limiter.id,
                                                  user_email=user.email,
                                                  method='DELETE')
Exemple #31
0
    def get_next_job(cls, db, expiration, cache, lock_expiration, avg_links_per_page=10):
        """Pick the next page to review, or return None when there is none.

        For every active domain, up to one page per worker is taken in
        descending score order. The per-domain lists are then interleaved one
        page at a time, and the first candidate that passes
        ``Limiter.has_limit_to_work`` and for which
        ``cache.has_next_job_lock`` returns a non-None lock is chosen.

        Returns a dict with ``page`` (the uuid as a string), ``url``,
        ``score`` and ``lock``, or None when no page qualifies.
        ``expiration`` is not referenced in this method.
        """
        from holmes.models import Settings, Worker, Domain, Limiter  # Avoid circular dependency

        page = None
        lock = None
        settings = Settings.instance(db)
        workers = db.query(Worker).all()
        number_of_workers = len(workers)

        active_domains = Domain.get_active_domains(db)
        active_domains_ids = [item.id for item in active_domains]

        # domain id -> that domain's top-scored pages (at most one per worker)
        all_domains_pages_in_need_of_review = {}

        for domain_id in active_domains_ids:
            pages = db \
                .query(
                    Page.uuid,
                    Page.url,
                    Page.score,
                    Page.last_review_date
                ) \
                .filter(Page.domain_id == domain_id) \
                .order_by(Page.score.desc())[:number_of_workers]
            if pages:
                all_domains_pages_in_need_of_review[domain_id] = pages

        # Interleave the per-domain lists: take one page from a domain, move
        # on to the next, and drop a domain once its list is empty.
        # NOTE(review): indexing the result of keys() relies on Python 2
        # returning a list; the visiting order follows the dict's key order.
        pages_in_need_of_review = []
        current_domain = 0
        while all_domains_pages_in_need_of_review:
            domains = all_domains_pages_in_need_of_review.keys()
            if current_domain >= len(domains):
                current_domain = 0

            domain_id = domains[current_domain]

            item = all_domains_pages_in_need_of_review[domain_id].pop(0)
            pages_in_need_of_review.append(item)

            if not all_domains_pages_in_need_of_review[domain_id]:
                del all_domains_pages_in_need_of_review[domain_id]

            current_domain += 1

        if not pages_in_need_of_review:
            return None

        # Compares against the first interleaved candidate only.
        # NOTE(review): that is not necessarily the highest score overall.
        if settings.lambda_score > 0 and settings.lambda_score > pages_in_need_of_review[0].score:
            cls.update_pages_score_by(settings, settings.lambda_score, db)

        for i in range(len(pages_in_need_of_review)):
            # Skip candidates the limiter check rejects.
            if not Limiter.has_limit_to_work(db, active_domains, pages_in_need_of_review[i].url, avg_links_per_page):
                continue

            lock = cache.has_next_job_lock(
                pages_in_need_of_review[i].url,
                lock_expiration
            )

            # A non-None lock selects this page and ends the search.
            if lock is not None:
                page = pages_in_need_of_review[i]
                break

        if page is None:
            return None

        return {
            'page': str(page.uuid),
            'url': page.url,
            'score': page.score,
            'lock': lock
        }
        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

        try:
            yield self.authenticated_fetch(
                '/limiters/%d' % limiter.id, user_email=user.email,
                method='DELETE'
            )
        except HTTPError, e:
            expect(e).not_to_be_null()
            expect(e.code).to_equal(401)
            expect(e.response.reason).to_be_like('Unauthorized')
        else:
            assert False, 'Should not have got this far'

        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

    @gen_test
    def test_can_delete_limiter_as_superuser(self):
        self.db.query(Limiter).delete()
        limiter = LimiterFactory.create()
        self.db.query(User).delete()
        user = UserFactory(email='*****@*****.**', is_superuser=True)

        loaded_limiter = Limiter.by_id(limiter.id, self.db)
        expect(loaded_limiter).not_to_be_null()

        response = yield self.authenticated_fetch(
            '/limiters/%d' % limiter.id, user_email=user.email,
            method='DELETE'
Exemple #33
0
 def _verify_workers_limits(self, url, avg_links_per_page=10):
     """Check ``url`` against the limiters of the currently active domains.

     Delegates to ``LimiterModel.has_limit_to_work`` and returns its result.
     """
     domains = Domain.get_active_domains(self.db)
     has_room = LimiterModel.has_limit_to_work(
         self.db, domains, url, avg_links_per_page
     )
     return has_room