Example #1
 def test_trailing_whitespace(self, client):
     # accept, because browsers accept this
     url = client + '/?status=200 '
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
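Every example in this listing drives link_checker the same way: both arguments are JSON-encoded strings (an empty context plus a data dict with 'url' and, optionally, 'url_timeout'), the return value is a JSON-encoded dict of the response headers, and failures raise LinkCheckerError or one of its subclasses. A minimal sketch of that calling convention, using a hypothetical head_link helper and an assumed import path:

import json

# Assumed import path; the download() examples below do
# "from ckanext.archiver import tasks" and call tasks.link_checker.
from ckanext.archiver.tasks import link_checker, LinkCheckerError


def head_link(url, timeout=30):
    # Both arguments are JSON-encoded strings, exactly as in the examples.
    context = json.dumps({})
    data = json.dumps({'url': url, 'url_timeout': timeout})
    try:
        # On success, link_checker returns a JSON-encoded dict of the HTTP
        # response headers of the checked resource.
        return json.loads(link_checker(context, data))
    except LinkCheckerError as e:
        # Unreachable URLs, error status codes, timeouts etc. surface here.
        return {'error': str(e)}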
Example #2
def download(context, resource, url_timeout=URL_TIMEOUT,
             max_content_length=MAX_CONTENT_LENGTH,
             data_formats=DATA_FORMATS):

    from ckanext.archiver import tasks

    res = None
    resource_changed = False

    link_context = "{}"
    link_data = json.dumps({
        'url': resource['url'],
        'url_timeout': url_timeout
    })

    user_agent_string = config.get('ckanext.archiver.user_agent_string', None)

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    except tasks.LinkCheckerError as e:
        if any(x in str(e).lower() for x in ('method not allowed', 'internal server error', )):
            # If the HEAD method is not supported or if a 500
            # error is returned we'll handle the download manually
            request_headers = {}
            if user_agent_string is not None:
                request_headers['User-Agent'] = user_agent_string
            res = requests.get(resource['url'], timeout=url_timeout,
                               headers=request_headers)
            headers = res.headers
        else:
            raise
Example #3
    def _check_link(self, url):
        """
        Synchronously check the given link, and return dict representing results.
        Does not handle 30x redirects.
        """

        # If a user enters "www.example.com" then we assume they meant "http://www.example.com"
        scheme, path = splittype(url)
        if not scheme:
            url = 'http://' + path

        context = {}
        data = {'url_timeout': 10, 'url': url}
        result = {
            'errors': [],
            'url_errors': [],
            'format': '',
            'mimetype': '',
            'size': '',
            'last_modified': '',
        }

        try:
            headers = json.loads(
                link_checker(json.dumps(context), json.dumps(data)))
            result['format'] = self._extract_file_format(url, headers)
            result['mimetype'] = self._extract_mimetype(headers)
            result['size'] = headers.get('content-length', '')
            result['last_modified'] = self._parse_and_format_date(
                headers.get('last-modified', ''))
        except LinkCheckerError as e:
            result['url_errors'].append(six.text_type(e))
        return result
Example #4
 def test_colon_in_query_string(self, url):
     # accept, because browsers accept this
     # see discussion: http://trac.ckan.org/ticket/318
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #5
    def _check_link(self, url):
        """
        Synchronously check the given link, and return dict representing results.
        Does not handle 30x redirects.
        """

        # If a user enters "www.example.com" then we assume they meant "http://www.example.com"
        scheme, path = urllib.splittype(url)
        if not scheme:
            url = 'http://' + path

        context = {}
        data = {
            'url_timeout': 10,
            'url': url
        }
        result = {
            'errors': [],
            'url_errors': [],
            'format': '',
            'mimetype': '',
            'size': '',
            'last_modified': '',
        }

        try:
            headers = json.loads(link_checker(json.dumps(context), json.dumps(data)))
            result['format'] = self._extract_file_format(url, headers)
            result['mimetype'] = self._extract_mimetype(headers)
            result['size'] = headers.get('content-length', '')
            result['last_modified'] = self._parse_and_format_date(headers.get('last-modified', ''))
        except LinkCheckerError as e:
            result['url_errors'].append(str(e))
Example #6
 def test_colon_in_query_string(self, url):
     # accept, because browsers accept this
     # see discussion: http://trac.ckan.org/ticket/318
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #7
 def test_non_escaped_url(self, client):
     url = client + '/+/http://www.homeoffice.gov.uk/publications/science-research-statistics/research-statistics/' \
           + 'drugs-alcohol-research/hosb1310/hosb1310-ann2tabs?view=Binary'
     context = json.dumps({})
     data = json.dumps({'url': url})
     res = link_checker(context, data)
     assert res
Example #8
def download(context, resource, url_timeout=URL_TIMEOUT,
             max_content_length=MAX_CONTENT_LENGTH,
             data_formats=DATA_FORMATS):

    from ckanext.archiver import tasks

    res = None
    resource_changed = False

    link_context = "{}"
    link_data = json.dumps({
        'url': resource['url'],
        'url_timeout': url_timeout
    })

    user_agent_string = config.get('ckanext.archiver.user_agent_string', None)

    def _download_resource(resource_url, timeout):
        request_headers = {}
        if user_agent_string is not None:
            request_headers['User-Agent'] = user_agent_string
        res = requests.get(resource_url, timeout=timeout,
                           headers=request_headers, verify=False)
        return res

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    except tasks.LinkHeadMethodNotSupported as e:
        res = _download_resource(resource_url=resource['url'],
                                 timeout=url_timeout)
        headers = res.headers
Example #9
def download(context,
             resource,
             url_timeout=URL_TIMEOUT,
             max_content_length=MAX_CONTENT_LENGTH,
             data_formats=DATA_FORMATS):

    from ckanext.archiver import tasks

    res = None
    resource_changed = False

    link_context = "{}"
    link_data = json.dumps({
        'url': resource['url'],
        'url_timeout': url_timeout
    })

    user_agent_string = config.get('ckanext.archiver.user_agent_string', None)

    def _download_resource(resource_url, timeout):
        request_headers = {}
        if user_agent_string is not None:
            request_headers['User-Agent'] = user_agent_string
        res = requests.get(resource_url,
                           timeout=timeout,
                           headers=request_headers,
                           verify=False)
        return res

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    except tasks.LinkHeadMethodNotSupported as e:
        res = _download_resource(resource_url=resource['url'],
                                 timeout=url_timeout)
        headers = res.headers
Example #10
 def test_url_with_30x_follows_redirect(self, url):
     redirect_url = url + u'?status=200&content=test&content-type=text/csv'
     url += u'?status=301&location=%s' % quote_plus(redirect_url)
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #11
 def test_url_with_30x_follows_redirect(self, url):
     redirect_url = url + u'?status=200&content=test&content-type=text/csv'
     url += u'?status=301&location=%s' % quote_plus(redirect_url)
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #12
    def test_colon_in_query_string(self, client):
        # accept, because browsers accept this
        # see discussion: http://trac.ckan.org/ticket/318

        url = client + '/?time=09:30&status=200'
        context = json.dumps({})
        data = json.dumps({'url': url})
        result = json.loads(link_checker(context, data))
        assert result
Example #13
    def _check_link(self, url):
        """
        Synchronously check the given link, and return dict representing results.
        Does not handle 30x redirects.
        """

        # If a user enters "www.example.com" then we assume they meant "http://www.example.com"
        scheme, path = urllib.splittype(url)
        if not scheme:
            url = "http://" + path

        context = {}
        data = {"url_timeout": 10, "url": url}
        result = {"errors": [], "url_errors": [], "format": "", "mimetype": "", "size": "", "last_modified": ""}

        try:
            headers = json.loads(link_checker(json.dumps(context), json.dumps(data)))
            result["format"] = self._extract_file_format(url, headers)
            result["mimetype"] = self._extract_mimetype(headers)
            result["size"] = headers.get("content-length", "")
            result["last_modified"] = self._parse_and_format_date(headers.get("last-modified", ""))
        except LinkCheckerError as e:
            result["url_errors"].append(str(e))
Example #14
def download(context,
             resource,
             url_timeout=URL_TIMEOUT,
             max_content_length=MAX_CONTENT_LENGTH,
             data_formats=DATA_FORMATS):

    from ckanext.archiver import tasks

    res = None
    resource_changed = False

    link_context = "{}"
    link_data = json.dumps({
        'url': resource['url'],
        'url_timeout': url_timeout
    })

    user_agent_string = config.get('ckanext.archiver.user_agent_string', None)

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    except tasks.LinkCheckerError as e:
        if any(x in str(e).lower() for x in (
                'method not allowed',
                'internal server error',
        )):
            # If the HEAD method is not supported or if a 500
            # error is returned we'll handle the download manually
            request_headers = {}
            if user_agent_string is not None:
                request_headers['User-Agent'] = user_agent_string
            res = requests.get(resource['url'],
                               timeout=url_timeout,
                               headers=request_headers)
            headers = res.headers
        else:
            raise
Example #15
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee\'s five stars of openness
    based on mime-type.

    returns a dict with keys:

        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times that
                                        this resource has returned a score of 0
    """
    log = update.get_logger()

    score = 0
    score_reason = ""
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context["site_url"], "api/action")
    response = requests.post(
        api_url + "/task_status_show",
        json.dumps({"entity_id": data["id"], "task_type": "qa", "key": "openness_score_failure_count"}),
        headers={"Authorization": context["apikey"], "Content-Type": "application/json"},
    )
    if json.loads(response.content)["success"]:
        score_failure_count = int(json.loads(response.content)["result"].get("value", "0"))

    # no score for resources that don't have an open license
    if not data.get("is_open"):
        score_reason = "License not open"
    else:
        try:
            headers = json.loads(link_checker("{}", json.dumps(data)))
            ct = headers.get("content-type")

            # ignore charset if exists (just take everything before the ';')
            if ct and ";" in ct:
                ct = ct.split(";")[0]

            # also get format from resource and by guessing from file extension
            format = data.get("format", "").lower()
            file_type = mimetypes.guess_type(data["url"])[0]

            # file type takes priority for scoring
            if file_type:
                score = MIME_TYPE_SCORE.get(file_type, -1)
            elif ct:
                score = MIME_TYPE_SCORE.get(ct, -1)
            elif format:
                score = MIME_TYPE_SCORE.get(format, -1)

            score_reason = OPENNESS_SCORE_REASON[score]

            # negative scores are only useful for getting the reason message,
            # set it back to 0 if it's still <0 at this point
            if score < 0:
                score = 0

            # check for mismatches between content-type, file_type and format
            # ideally they should all agree
            if not ct:
                # TODO: use the todo extension to flag this issue
                pass

        except LinkCheckerError as e:
            score_reason = str(e)
        except Exception as e:
            log.error("Unexpected error while calculating openness score %s: %s", e.__class__.__name__, unicode(e))
            score_reason = "Unknown error: %s" % str(e)
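resource_score looks the score up in MIME_TYPE_SCORE and the explanation in OPENNESS_SCORE_REASON, neither of which is shown in these excerpts. A rough sketch of the shapes those tables appear to have; the keys and values below are invented for illustration, except that -1 must exist in OPENNESS_SCORE_REASON because the lookup above can produce it:

# Illustration only: not the real ckanext-qa tables.
MIME_TYPE_SCORE = {
    'text/html': 1,                  # available on the web
    'application/vnd.ms-excel': 2,   # structured but proprietary
    'text/csv': 3,                   # structured, non-proprietary
}

OPENNESS_SCORE_REASON = {
    -1: 'unrecognised content type',  # reset to 0 by the caller afterwards
    0: 'not obtainable',
    1: 'obtainable via web page',
    2: 'machine readable format',
    3: 'open and standardised format',
}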
Example #16
 def test_non_escaped_url(self, url):
     context = json.dumps({})
     data = json.dumps({'url': url})
     res = link_checker(context, data)
     assert res
Example #17
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee's five stars of openness based on mime-type.

    returns a dict with keys:

        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times this resource has returned a score of 0

    Raises the following exceptions:

    """
    score = 0
    score_reason = ""
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context['site_url'], 'api/action')
    response = requests.post(
        api_url + '/task_status_show', 
        json.dumps({'entity_id': data['id'], 'task_type': 'qa', 
                    'key': 'openness_score_failure_count'}),
        headers = {'Authorization': context['apikey'],
                   'Content-Type': 'application/json'}
    )
    if json.loads(response.content)['success']:
        score_failure_count = int(json.loads(response.content)['result'].get('value', '0'))

    try:
        headers = json.loads(link_checker("{}", json.dumps(data)))

        cl = headers.get('content-length')
        ct = headers.get('content-type')

        # ignore charset if exists (just take everything before the ';')
        if ct and ';' in ct:
            ct = ct.split(';')[0]

        # also get format from resource and by guessing from file extension
        format = data.get('format', '').lower()
        file_type = mimetypes.guess_type(data['url'])[0] 

        # file type takes priority for scoring
        if file_type:
            score = MIME_TYPE_SCORE.get(file_type, -1)
        elif ct:
            score = MIME_TYPE_SCORE.get(ct, -1)
        elif format:
            score = MIME_TYPE_SCORE.get(format, -1)
        
        score_reason = OPENNESS_SCORE_REASON[score]

        # negative scores are only useful for getting the reason message, set it back
        # to 0 if it's still <0 at this point
        if score < 0:
            score = 0

        # check for mismatches between content-type, file_type and format
        # ideally they should all agree
        if not ct:
            # TODO: use the todo extension to flag this issue
            pass
        else:
            allowed_formats = [ct.lower().split('/')[-1], ct.lower().split('/')]
            allowed_formats.append(ct.lower())
            if format not in allowed_formats:
                # TODO: use the todo extension to flag this issue
                pass
            if file_type != ct:
                # TODO: use the todo extension to flag this issue
                pass

    except LinkCheckerError as e:
        score_reason = str(e)
Example #18
 def test_url_with_405(self, client):  # 405: method (HEAD) not allowed
     url = client + '/?status=405'
     context = json.dumps({})
     data = json.dumps({'url': url})
     with pytest.raises(LinkCheckerError):
         link_checker(context, data)
Example #19
 def test_url_with_404(self, client):
     url = client + '/?status=404'
     context = json.dumps({})
     data = json.dumps({'url': url})
     with pytest.raises(LinkCheckerError):
         link_checker(context, data)
Example #20
 def test_url_with_503(self, client):
     url = client + '/?status=503'
     context = json.dumps({})
     data = json.dumps({'url': url})
     with pytest.raises(LinkCheckerError):
         link_checker(context, data)
Example #21
 def test_empty_url(self):
     url = u''
     context = json.dumps({})
     data = json.dumps({'url': url})
     with pytest.raises(LinkCheckerError):
         link_checker(context, data)
Example #22
 def test_bad_url(self):
     url = u'http:www.buckshealthcare.nhs.uk/freedom-of-information.htm'
     context = json.dumps({})
     data = json.dumps({'url': url})
     with pytest.raises(LinkInvalidError):
         link_checker(context, data)
Example #23
 def test_file_url(self):
     url = u'file:///home/root/test.txt'  # schema not allowed
     context = json.dumps({})
     data = json.dumps({'url': url})
     with pytest.raises(LinkInvalidError):
         link_checker(context, data)
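The tests above expect different exception types: LinkCheckerError for unreachable URLs or error status codes, and LinkInvalidError for malformed URLs and disallowed schemes, while the download() examples additionally catch LinkHeadMethodNotSupported before the generic LinkCheckerError handler. That usage suggests, but does not prove, a hierarchy along these lines; the real definitions live in ckanext.archiver.tasks:

# Assumed hierarchy, inferred from how the examples catch these errors.
class LinkCheckerError(Exception):
    """Base error: the link could not be checked successfully."""

class LinkInvalidError(LinkCheckerError):
    """The URL is malformed or uses a disallowed scheme (e.g. file://)."""

class LinkHeadMethodNotSupported(LinkCheckerError):
    """The server rejected the HEAD request, e.g. with 405 Method Not Allowed."""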
Example #24
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee\'s five stars of openness
    based on mime-type.

    returns a dict with keys:

        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times that
                                        this resource has returned a score of 0
    """
    log = update.get_logger()

    score = 0
    score_reason = ''
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context['site_url'], 'api/action')
    response = requests.post(api_url + '/task_status_show',
                             json.dumps({
                                 'entity_id': data['id'],
                                 'task_type': 'qa',
                                 'key': 'openness_score_failure_count'
                             }),
                             headers={
                                 'Authorization': context['apikey'],
                                 'Content-Type': 'application/json'
                             })
    if json.loads(response.content)['success']:
        score_failure_count = int(
            json.loads(response.content)['result'].get('value', '0'))

    # no score for resources that don't have an open license
    if not data.get('is_open'):
        score_reason = 'License not open'
    else:
        try:
            headers = json.loads(link_checker("{}", json.dumps(data)))
            ct = headers.get('content-type')

            # ignore charset if exists (just take everything before the ';')
            if ct and ';' in ct:
                ct = ct.split(';')[0]

            # also get format from resource and by guessing from file extension
            format = data.get('format', '').lower()
            file_type = mimetypes.guess_type(data['url'])[0]

            # file type takes priority for scoring
            if file_type:
                score = MIME_TYPE_SCORE.get(file_type, -1)
            elif ct:
                score = MIME_TYPE_SCORE.get(ct, -1)
            elif format:
                score = MIME_TYPE_SCORE.get(format, -1)

            score_reason = OPENNESS_SCORE_REASON[score]

            # negative scores are only useful for getting the reason message,
            # set it back to 0 if it's still <0 at this point
            if score < 0:
                score = 0

            # check for mismatches between content-type, file_type and format
            # ideally they should all agree
            if not ct:
                # TODO: use the todo extension to flag this issue
                pass

        except LinkCheckerError as e:
            score_reason = str(e)
        except Exception as e:
            log.error(
                'Unexpected error while calculating openness score %s: %s',
                e.__class__.__name__, unicode(e))
            score_reason = "Unknown error: %s" % str(e)
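The docstring promises a dict with 'openness_score', 'openness_score_reason' and 'openness_score_failure_count', but the excerpts stop before the return statement. A hedged sketch of how a caller might read that result, assuming the return shape matches the docstring:

# Hypothetical caller; context and data follow the shapes used in the examples.
result = resource_score(context, data)
print(result['openness_score'])                # integer star rating
print(result['openness_score_reason'])         # human-readable explanation
print(result['openness_score_failure_count'])  # consecutive zero scores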
Example #25
 def test_good_url(self, url):
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #26
 def test_non_escaped_url(self, url):
     context = json.dumps({})
     data = json.dumps({'url': url})
     res = link_checker(context, data)
     assert res
Example #27
 def test_good_url(self, url):
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #28
 def test_trailing_whitespace(self, url):
     # accept, because browsers accept this
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #29
 def test_trailing_whitespace(self, url):
     # accept, because browsers accept this
     context = json.dumps({})
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #30
 def test_good_url(self, client):
     context = json.dumps({})
     url = client + "/?status=200"
     data = json.dumps({'url': url})
     result = json.loads(link_checker(context, data))
     assert result
Example #31
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee\'s five stars of openness
    based on mime-type.

    returns a dict with keys:

        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times that
                                        this resource has returned a score of 0
    """
    log = update.get_logger()

    score = 0
    score_reason = ''
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context['site_url'], 'api/action')
    response = requests.post(
        api_url + '/task_status_show',
        json.dumps({'entity_id': data['id'], 'task_type': 'qa',
                    'key': 'openness_score_failure_count'}),
        headers={'Authorization': context['apikey'],
                 'Content-Type': 'application/json'}
    )
    if json.loads(response.content)['success']:
        score_failure_count = int(json.loads(response.content)['result'].get('value', '0'))

    # no score for resources that don't have an open license
    if not data.get('is_open'):
        score_reason = 'License not open'
    else:
        try:
            headers = json.loads(link_checker("{}", json.dumps(data)))
            ct = headers.get('content-type')

            # ignore charset if exists (just take everything before the ';')
            if ct and ';' in ct:
                ct = ct.split(';')[0]

            # also get format from resource and by guessing from file extension
            format = data.get('format', '').lower()
            file_type = mimetypes.guess_type(data['url'])[0]

            # file type takes priority for scoring
            score = -1
            if file_type:
                score = MIME_TYPE_SCORE.get(file_type, -1)
            if ct:
                score = max(score, MIME_TYPE_SCORE.get(ct, -1))
            if format:
                # we want to make sure the linked resource is what format
                # claims it is
                type_list = ALLOWED_MIME_TYPES.get(format, [])
                if len(type_list) == 0 or file_type in type_list:
                    score = max(score, MIME_TYPE_SCORE.get(format, -1))

            score_reason = OPENNESS_SCORE_REASON.get(score, "Unable to calculate openness score")

            # negative scores are only useful for getting the reason message,
            # set it back to 0 if it's still <0 at this point
            if score < 0:
                score = 0

            # check for mismatches between content-type, file_type and format
            # ideally they should all agree
            if not ct:
                # TODO: use the todo extension to flag this issue
                pass

        except LinkCheckerError as e:
            score_reason = str(e)
        except Exception as e:
            log.error('Unexpected error while calculating openness score %s: %s', e.__class__.__name__,  unicode(e))
            score_reason = "Unknown error: %s" % str(e)
Example #32
        # hence setting default user agent to Mozilla/5.0.
        try:
            response = requests.get(resource['url'],
                                    timeout=url_timeout,
                                    headers=_request_headers,
                                    verify=True)
        except Exception as e:
            request_headers['User-Agent'] = 'curl/7.35.0'
            response = requests.get(resource['url'],
                                    timeout=url_timeout,
                                    headers=_request_headers,
                                    verify=False)
        return response

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    except tasks.LinkHeadMethodNotSupported as e:
        res = _download_resource(resource_url=resource['url'],
                                 timeout=url_timeout)
        headers = res.headers
    except tasks.LinkCheckerError as e:
        if any(x in str(e).lower() for x in (
                'internal server error',
                '403',
        )):
            # If the HEAD method is not supported or if a 500
            # error is returned we'll handle the download manually
            res = _download_resource(resource_url=resource['url'],
                                     timeout=url_timeout)
            headers = res.headers
        else: