def test_trailing_whitespace(self, client):
    # accept, because browsers accept this
    target = client + '/?status=200 '
    payload = json.dumps({'url': target})
    outcome = json.loads(link_checker(json.dumps({}), payload))
    assert outcome
def download(context, resource, url_timeout=URL_TIMEOUT,
             max_content_length=MAX_CONTENT_LENGTH,
             data_formats=DATA_FORMATS):
    """Obtain the response headers for a resource's URL.

    First tries the archiver's link checker (a HEAD-style check); if the
    server rejects HEAD ('method not allowed') or errors out ('internal
    server error'), falls back to fetching the URL with GET and using the
    response headers from that. Any other link-checker failure is re-raised.

    :param context: task context (unused in the visible portion)
    :param resource: dict with at least a 'url' key
    :param url_timeout: request timeout in seconds
    :raises tasks.LinkCheckerError: when the check fails for a reason other
        than the two handled fallback cases
    """
    from ckanext.archiver import tasks
    res = None
    resource_changed = False

    link_context = "{}"
    link_data = json.dumps({
        'url': resource['url'],
        'url_timeout': url_timeout
    })

    user_agent_string = config.get('ckanext.archiver.user_agent_string', None)

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+ and 3.x
    except tasks.LinkCheckerError as e:
        if any(x in str(e).lower() for x in ('method not allowed',
                                             'internal server error', )):
            # If the HEAD method is not supported or if a 500
            # error is returned we'll handle the download manually
            request_headers = {}
            if user_agent_string is not None:
                request_headers['User-Agent'] = user_agent_string
            res = requests.get(resource['url'], timeout=url_timeout,
                               headers=request_headers)
            headers = res.headers
        else:
            raise
def _check_link(self, url):
    """ Synchronously check the given link, and return dict representing results.
    Does not handle 30x redirects. """

    # If a user enters "www.example.com" then we assume they meant "http://www.example.com"
    scheme, path = splittype(url)
    if not scheme:
        url = 'http://' + path

    outcome = {
        'errors': [],
        'url_errors': [],
        'format': '',
        'mimetype': '',
        'size': '',
        'last_modified': '',
    }

    try:
        raw = link_checker(json.dumps({}),
                           json.dumps({'url_timeout': 10, 'url': url}))
        headers = json.loads(raw)
        outcome['format'] = self._extract_file_format(url, headers)
        outcome['mimetype'] = self._extract_mimetype(headers)
        outcome['size'] = headers.get('content-length', '')
        outcome['last_modified'] = self._parse_and_format_date(
            headers.get('last-modified', ''))
    except LinkCheckerError as e:
        outcome['url_errors'].append(six.text_type(e))
    return outcome
def test_colon_in_query_string(self, url):
    # accept, because browsers accept this
    # see discussion: http://trac.ckan.org/ticket/318
    payload = json.dumps({'url': url})
    checked = json.loads(link_checker(json.dumps({}), payload))
    assert checked
def _check_link(self, url):
    """ Synchronously check the given link, and return dict representing results.
    Does not handle 30x redirects. """

    # If a user enters "www.example.com" then we assume they meant "http://www.example.com"
    scheme, path = urllib.splittype(url)
    if not scheme:
        url = 'http://' + path

    context = {}
    data = {
        'url_timeout': 10,
        'url': url
    }
    result = {
        'errors': [],
        'url_errors': [],
        'format': '',
        'mimetype': '',
        'size': '',
        'last_modified': '',
    }
    try:
        headers = json.loads(link_checker(json.dumps(context), json.dumps(data)))
        result['format'] = self._extract_file_format(url, headers)
        result['mimetype'] = self._extract_mimetype(headers)
        result['size'] = headers.get('content-length', '')
        result['last_modified'] = self._parse_and_format_date(headers.get('last-modified', ''))
    # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+ and 3.x
    except LinkCheckerError as e:
        result['url_errors'].append(str(e))
    # fix: the computed result dict was never returned, so callers always
    # received None; the sibling implementation of this method returns it
    return result
def test_non_escaped_url(self, client):
    # the checker should cope with an un-escaped URL embedded in the path
    target = (client
              + '/+/http://www.homeoffice.gov.uk/publications/science-research-statistics/research-statistics/'
              + 'drugs-alcohol-research/hosb1310/hosb1310-ann2tabs?view=Binary')
    outcome = link_checker(json.dumps({}), json.dumps({'url': target}))
    assert outcome
def download(context, resource, url_timeout=URL_TIMEOUT,
             max_content_length=MAX_CONTENT_LENGTH,
             data_formats=DATA_FORMATS):
    """Obtain the response headers for a resource's URL.

    Tries the archiver's link checker first; if the server does not support
    the HEAD method, falls back to downloading the URL with GET and using
    that response's headers.

    :param resource: dict with at least a 'url' key
    :param url_timeout: request timeout in seconds
    """
    from ckanext.archiver import tasks
    res = None
    resource_changed = False

    link_context = "{}"
    link_data = json.dumps({
        'url': resource['url'],
        'url_timeout': url_timeout
    })

    user_agent_string = config.get('ckanext.archiver.user_agent_string', None)

    def _download_resource(resource_url, timeout):
        # GET fallback used when HEAD is not supported by the server.
        request_headers = {}
        if user_agent_string is not None:
            request_headers['User-Agent'] = user_agent_string
        # fix: use this function's own parameters; the original silently
        # ignored resource_url/timeout and read the enclosing scope instead
        # NOTE(review): verify=False disables TLS certificate validation —
        # kept for behavioural compatibility, but worth revisiting
        return requests.get(resource_url, timeout=timeout,
                            headers=request_headers, verify=False)

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+ and 3.x
    except tasks.LinkHeadMethodNotSupported as e:
        res = _download_resource(resource_url=resource['url'],
                                 timeout=url_timeout)
        headers = res.headers
def test_url_with_30x_follows_redirect(self, url):
    # a 301 pointing at a good URL must be followed and reported as OK
    redirect_url = url + u'?status=200&content=test&content-type=text/csv'
    source = url + u'?status=301&location=%s' % quote_plus(redirect_url)
    outcome = json.loads(link_checker(json.dumps({}),
                                      json.dumps({'url': source})))
    assert outcome
def test_colon_in_query_string(self, client):
    # accept, because browsers accept this
    # see discussion: http://trac.ckan.org/ticket/318
    target = client + '/?time=09:30&status=200'
    outcome = json.loads(link_checker(json.dumps({}),
                                      json.dumps({'url': target})))
    assert outcome
def _check_link(self, url):
    """ Synchronously check the given link, and return dict representing results.
    Does not handle 30x redirects. """

    # If a user enters "www.example.com" then we assume they meant "http://www.example.com"
    scheme, path = urllib.splittype(url)
    if not scheme:
        url = "http://" + path

    context = {}
    data = {"url_timeout": 10, "url": url}
    result = {"errors": [], "url_errors": [], "format": "", "mimetype": "",
              "size": "", "last_modified": ""}
    try:
        headers = json.loads(link_checker(json.dumps(context), json.dumps(data)))
        result["format"] = self._extract_file_format(url, headers)
        result["mimetype"] = self._extract_mimetype(headers)
        result["size"] = headers.get("content-length", "")
        result["last_modified"] = self._parse_and_format_date(headers.get("last-modified", ""))
    # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+ and 3.x
    except LinkCheckerError as e:
        result["url_errors"].append(str(e))
    # fix: the computed result dict was never returned, so callers always
    # received None; the sibling implementation of this method returns it
    return result
def download(context, resource, url_timeout=URL_TIMEOUT,
             max_content_length=MAX_CONTENT_LENGTH,
             data_formats=DATA_FORMATS):
    """Obtain the response headers for a resource's URL.

    Uses the archiver's link checker; when the server rejects HEAD
    ('method not allowed') or returns 'internal server error', falls back
    to a plain GET and uses that response's headers. Any other
    link-checker failure is re-raised.
    """
    from ckanext.archiver import tasks
    res = None
    resource_changed = False

    link_context = "{}"
    link_data = json.dumps({
        'url': resource['url'],
        'url_timeout': url_timeout
    })

    user_agent_string = config.get('ckanext.archiver.user_agent_string', None)

    try:
        headers = json.loads(tasks.link_checker(link_context, link_data))
    # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+ and 3.x
    except tasks.LinkCheckerError as e:
        if any(x in str(e).lower() for x in (
                'method not allowed',
                'internal server error', )):
            # If the HEAD method is not supported or if a 500
            # error is returned we'll handle the download manually
            request_headers = {}
            if user_agent_string is not None:
                request_headers['User-Agent'] = user_agent_string
            res = requests.get(resource['url'], timeout=url_timeout,
                               headers=request_headers)
            headers = res.headers
        else:
            raise
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee's five stars of openness
    based on mime-type.

    returns a dict with keys:
        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times
            that this resource has returned a score of 0
    """
    log = update.get_logger()
    score = 0
    score_reason = ""
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context["site_url"], "api/action")
    response = requests.post(
        api_url + "/task_status_show",
        json.dumps({"entity_id": data["id"], "task_type": "qa",
                    "key": "openness_score_failure_count"}),
        headers={"Authorization": context["apikey"],
                 "Content-Type": "application/json"},
    )
    if json.loads(response.content)["success"]:
        score_failure_count = int(
            json.loads(response.content)["result"].get("value", "0"))

    # no score for resources that don't have an open license
    if not data.get("is_open"):
        score_reason = "License not open"
    else:
        try:
            headers = json.loads(link_checker("{}", json.dumps(data)))
            ct = headers.get("content-type")
            # ignore charset if exists (just take everything before the ';')
            if ct and ";" in ct:
                ct = ct.split(";")[0]
            # also get format from resource and by guessing from file extension
            format = data.get("format", "").lower()
            file_type = mimetypes.guess_type(data["url"])[0]
            # file type takes priority for scoring
            if file_type:
                score = MIME_TYPE_SCORE.get(file_type, -1)
            elif ct:
                score = MIME_TYPE_SCORE.get(ct, -1)
            elif format:
                score = MIME_TYPE_SCORE.get(format, -1)
            score_reason = OPENNESS_SCORE_REASON[score]
            # negative scores are only useful for getting the reason message,
            # set it back to 0 if it's still <0 at this point
            if score < 0:
                score = 0
            # check for mismatches between content-type, file_type and format
            # ideally they should all agree
            if not ct:
                # TODO: use the todo extension to flag this issue
                pass
        # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+/3.x
        except LinkCheckerError as e:
            score_reason = str(e)
        except Exception as e:
            log.error("Unexpected error while calculating openness score %s: %s",
                      e.__class__.__name__, unicode(e))
            score_reason = "Unknown error: %s" % str(e)
def test_non_escaped_url(self, url):
    # un-escaped characters in the URL must not break the checker
    payload = json.dumps({'url': url})
    outcome = link_checker(json.dumps({}), payload)
    assert outcome
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee's five stars of openness
    based on mime-type.

    returns a dict with keys:
        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times
            this resource has returned a score of 0
    """
    score = 0
    score_reason = ""
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context['site_url'], 'api/action')
    response = requests.post(
        api_url + '/task_status_show',
        json.dumps({'entity_id': data['id'], 'task_type': 'qa',
                    'key': 'openness_score_failure_count'}),
        headers={'Authorization': context['apikey'],
                 'Content-Type': 'application/json'}
    )
    if json.loads(response.content)['success']:
        score_failure_count = int(
            json.loads(response.content)['result'].get('value', '0'))

    try:
        headers = json.loads(link_checker("{}", json.dumps(data)))
        # (removed the unused 'cl = headers.get("content-length")' local)
        ct = headers.get('content-type')
        # ignore charset if exists (just take everything before the ';')
        if ct and ';' in ct:
            ct = ct.split(';')[0]

        # also get format from resource and by guessing from file extension
        format = data.get('format', '').lower()
        file_type = mimetypes.guess_type(data['url'])[0]

        # file type takes priority for scoring
        if file_type:
            score = MIME_TYPE_SCORE.get(file_type, -1)
        elif ct:
            score = MIME_TYPE_SCORE.get(ct, -1)
        elif format:
            score = MIME_TYPE_SCORE.get(format, -1)

        score_reason = OPENNESS_SCORE_REASON[score]

        # negative scores are only useful for getting the reason message,
        # set it back to 0 if it's still <0 at this point
        if score < 0:
            score = 0

        # check for mismatches between content-type, file_type and format
        # ideally they should all agree
        if not ct:
            # TODO: use the todo extension to flag this issue
            pass
        else:
            # NOTE(review): the second element here is a *list*, so the
            # 'format not in allowed_formats' test can never match it —
            # preserved as-is for behavioural compatibility
            allowed_formats = [ct.lower().split('/')[-1], ct.lower().split('/')]
            allowed_formats.append(ct.lower())
            if format not in allowed_formats:
                # TODO: use the todo extension to flag this issue
                pass
            if file_type != ct:
                # TODO: use the todo extension to flag this issue
                pass
    # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+ and 3.x
    except LinkCheckerError as e:
        score_reason = str(e)
def test_url_with_405(self, client):
    # 405: method (HEAD) not allowed
    target = client + '/?status=405'
    payload = json.dumps({'url': target})
    with pytest.raises(LinkCheckerError):
        link_checker(json.dumps({}), payload)
def test_url_with_404(self, client):
    """A URL that returns 404 must raise LinkCheckerError."""
    # fix: the original concatenated the client base with a hard-coded
    # absolute 'http://localhost:9091/...' URL, producing a malformed
    # address; siblings build the URL as client + '/?status=NNN'
    url = client + '/?status=404'
    context = json.dumps({})
    data = json.dumps({'url': url})
    with pytest.raises(LinkCheckerError):
        link_checker(context, data)
def test_url_with_503(self, client):
    # 503: service unavailable should be reported as a broken link
    target = client + '/?status=503'
    payload = json.dumps({'url': target})
    with pytest.raises(LinkCheckerError):
        link_checker(json.dumps({}), payload)
def test_empty_url(self):
    # an empty URL is invalid and must raise
    payload = json.dumps({'url': u''})
    with pytest.raises(LinkCheckerError):
        link_checker(json.dumps({}), payload)
def test_bad_url(self):
    # malformed scheme (missing '//') must be rejected as invalid
    target = u'http:www.buckshealthcare.nhs.uk/freedom-of-information.htm'
    payload = json.dumps({'url': target})
    with pytest.raises(LinkInvalidError):
        link_checker(json.dumps({}), payload)
def test_file_url(self):
    # schema not allowed
    target = u'file:///home/root/test.txt'
    payload = json.dumps({'url': target})
    with pytest.raises(LinkInvalidError):
        link_checker(json.dumps({}), payload)
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee's five stars of openness
    based on mime-type.

    returns a dict with keys:
        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times
            that this resource has returned a score of 0
    """
    log = update.get_logger()
    score = 0
    score_reason = ''
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context['site_url'], 'api/action')
    response = requests.post(api_url + '/task_status_show',
                             json.dumps({
                                 'entity_id': data['id'],
                                 'task_type': 'qa',
                                 'key': 'openness_score_failure_count'
                             }),
                             headers={
                                 'Authorization': context['apikey'],
                                 'Content-Type': 'application/json'
                             })
    if json.loads(response.content)['success']:
        score_failure_count = int(
            json.loads(response.content)['result'].get('value', '0'))

    # no score for resources that don't have an open license
    if not data.get('is_open'):
        score_reason = 'License not open'
    else:
        try:
            headers = json.loads(link_checker("{}", json.dumps(data)))
            ct = headers.get('content-type')
            # ignore charset if exists (just take everything before the ';')
            if ct and ';' in ct:
                ct = ct.split(';')[0]
            # also get format from resource and by guessing from file extension
            format = data.get('format', '').lower()
            file_type = mimetypes.guess_type(data['url'])[0]
            # file type takes priority for scoring
            if file_type:
                score = MIME_TYPE_SCORE.get(file_type, -1)
            elif ct:
                score = MIME_TYPE_SCORE.get(ct, -1)
            elif format:
                score = MIME_TYPE_SCORE.get(format, -1)
            score_reason = OPENNESS_SCORE_REASON[score]
            # negative scores are only useful for getting the reason message,
            # set it back to 0 if it's still <0 at this point
            if score < 0:
                score = 0
            # check for mismatches between content-type, file_type and format
            # ideally they should all agree
            if not ct:
                # TODO: use the todo extension to flag this issue
                pass
        # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+/3.x
        except LinkCheckerError as e:
            score_reason = str(e)
        except Exception as e:
            log.error(
                'Unexpected error while calculating openness score %s: %s',
                e.__class__.__name__, unicode(e))
            score_reason = "Unknown error: %s" % str(e)
def test_good_url(self, url):
    # a healthy URL must produce a non-empty checker result
    payload = json.dumps({'url': url})
    outcome = json.loads(link_checker(json.dumps({}), payload))
    assert outcome
def test_trailing_whitespace(self, url):
    # accept, because browsers accept this
    payload = json.dumps({'url': url})
    outcome = json.loads(link_checker(json.dumps({}), payload))
    assert outcome
def test_good_url(self, client):
    # a plain 200 response must produce a non-empty checker result
    target = client + "/?status=200"
    outcome = json.loads(link_checker(json.dumps({}),
                                      json.dumps({'url': target})))
    assert outcome
def resource_score(context, data):
    """
    Score resources on Sir Tim Berners-Lee's five stars of openness
    based on mime-type.

    returns a dict with keys:
        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'openness_score_failure_count': the number of consecutive times
            that this resource has returned a score of 0
    """
    log = update.get_logger()
    score = 0
    score_reason = ''
    score_failure_count = 0

    # get openness score failure count for task status table if exists
    api_url = urlparse.urljoin(context['site_url'], 'api/action')
    response = requests.post(
        api_url + '/task_status_show',
        json.dumps({'entity_id': data['id'], 'task_type': 'qa',
                    'key': 'openness_score_failure_count'}),
        headers={'Authorization': context['apikey'],
                 'Content-Type': 'application/json'}
    )
    if json.loads(response.content)['success']:
        score_failure_count = int(
            json.loads(response.content)['result'].get('value', '0'))

    # fix: removed a duplicate 'log = update.get_logger()' call that
    # redundantly re-fetched the logger obtained at the top of the function

    # no score for resources that don't have an open license
    if not data.get('is_open'):
        score_reason = 'License not open'
    else:
        try:
            headers = json.loads(link_checker("{}", json.dumps(data)))
            ct = headers.get('content-type')
            # ignore charset if exists (just take everything before the ';')
            if ct and ';' in ct:
                ct = ct.split(';')[0]
            # also get format from resource and by guessing from file extension
            format = data.get('format', '').lower()
            file_type = mimetypes.guess_type(data['url'])[0]
            # file type takes priority for scoring
            score = -1
            if file_type:
                score = MIME_TYPE_SCORE.get(file_type, -1)
            if ct:
                score = max(score, MIME_TYPE_SCORE.get(ct, -1))
            if format:
                # we want to make sure the linked resource is what format
                # claims it is
                type_list = ALLOWED_MIME_TYPES.get(format, [])
                if len(type_list) == 0 or file_type in type_list:
                    score = max(score, MIME_TYPE_SCORE.get(format, -1))
            score_reason = OPENNESS_SCORE_REASON.get(
                score, "Unable to calculate openness score")
            # negative scores are only useful for getting the reason message,
            # set it back to 0 if it's still <0 at this point
            if score < 0:
                score = 0
            # check for mismatches between content-type, file_type and format
            # ideally they should all agree
            if not ct:
                # TODO: use the todo extension to flag this issue
                pass
        # fix: 'except X, e' is Python 2-only syntax; 'as' works on 2.6+/3.x
        except LinkCheckerError as e:
            score_reason = str(e)
        except Exception as e:
            log.error('Unexpected error while calculating openness score %s: %s',
                      e.__class__.__name__, unicode(e))
            score_reason = "Unknown error: %s" % str(e)
# hence setting default user agent to Mozilla/5.0. try: response = requests.get(resource['url'], timeout=url_timeout, headers=_request_headers, verify=True) except Exception, e: request_headers['User-Agent'] = 'curl/7.35.0' response = requests.get(resource['url'], timeout=url_timeout, headers=_request_headers, verify=False) return response try: headers = json.loads(tasks.link_checker(link_context, link_data)) except tasks.LinkHeadMethodNotSupported, e: res = _download_resource(resource_url=resource['url'], timeout=url_timeout) headers = res.headers except tasks.LinkCheckerError, e: if any(x in str(e).lower() for x in ( 'internal server error', '403', )): # If the HEAD method is not supported or if a 500 # error is returned we'll handle the download manually res = _download_resource(resource_url=resource['url'], timeout=url_timeout) headers = res.headers else: