def test_download_file(self, url):
    resource = self._test_resource(url)
    result = download(self.fake_context, resource)
    assert result['saved_file']
    assert os.path.exists(result['saved_file'])
    _remove_archived_file(result.get('saved_file'))

    # Modify the resource and check that the resource size gets updated
    resource['url'] = url.replace('content=test', 'content=test2')
    result = download(self.fake_context, resource)
    assert_equal(result['size'], len('test2'))
    _remove_archived_file(result.get('saved_file'))
def test_download_file(self, client):
    url = client + '/?status=200&content=test&content-type=csv'
    resource = self._test_resource(url)
    result = download(self.fake_context, resource)
    assert result['saved_file']
    assert os.path.exists(result['saved_file'])
    _remove_archived_file(result.get('saved_file'))

    # Modify the resource and check that the resource size gets updated
    resource['url'] = url.replace('content=test', 'content=test2')
    result = download(self.fake_context, resource)
    assert result['size'] == len('test2')
    _remove_archived_file(result.get('saved_file'))
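# The `url`/`client` fixtures above point at a mock HTTP server that echoes
# back whatever the query string asks for (status, body, Content-Type). A
# minimal WSGI sketch of such a fixture, using only the stdlib; the app name
# and parameter handling are hypothetical, not the test suite's actual code:
import urlparse

def mock_server_app(environ, start_response):
    params = dict(urlparse.parse_qsl(environ.get('QUERY_STRING', '')))
    content = params.get('content', '')
    start_response('%s OK' % params.get('status', '200'),
                   [('Content-Type', params.get('content-type', 'text/plain')),
                    ('Content-Length', str(len(content)))])
    return [content]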
def test_download_file(self, url):
    context = json.dumps(self.fake_context)
    resource = self.fake_resource
    resource['url'] = url
    result = download(self.fake_context, resource)
    assert result['saved_file']
    assert os.path.exists(result['saved_file'])
    self._remove_archived_file(result.get('saved_file'))

    # Modify the resource and check that the resource size gets updated
    resource['url'] = url.replace('content=test', 'content=test2')
    result = download(self.fake_context, resource)
    assert resource['size'] == unicode(len('test2')), resource['size']
    self._remove_archived_file(result.get('saved_file'))
def test_head_unsupported(self, url):
    # This test was more relevant when we did HEAD requests. Now servers
    # which respond badly to HEAD requests are not an issue.
    resource = self._test_resource(url)

    # HEAD request will return a 405 error, but it will persevere
    # and do a GET request which will work.
    result = download(self.fake_context, resource)
    assert result['saved_file']
def test_head_unsupported(self, client):
    url = client + '/?status=200&method=get&content=test&content-type=csv'
    # This test was more relevant when we did HEAD requests. Now servers
    # which respond badly to HEAD requests are not an issue.
    resource = self._test_resource(url)

    # HEAD request will return a 405 error, but it will persevere
    # and do a GET request which will work.
    result = download(self.fake_context, resource)
    assert result['saved_file']
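# The tests above rely on a `_test_resource` helper on the test class. A
# plausible sketch, assuming the download task only needs a resource dict
# with a url and a format (the id value here is hypothetical):
def _test_resource(self, url, format='csv'):
    return {'id': 'fake-resource-id',
            'url': url,
            'format': format}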
def _datastorer_upload(context, resource, logger):
    from ckanext.archiver.tasks import ChooseNotToDownload
    from time import sleep

    max_retries = 5
    for i in range(max_retries):
        try:
            result = download(context, resource, data_formats=DATA_FORMATS)
        except ChooseNotToDownload:
            raise
        except Exception as e:
            # Retry with a linear back-off; re-raise on the final attempt.
            if i < max_retries - 1:
                logger.error("Error while performing download: %r. Retrying...", e)
                sleep(5 * i)
                continue
            else:
                raise
        else:
            break
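# A self-contained sketch of the same retry shape used above: linear
# back-off (5 * i seconds) between attempts, re-raising on the last one.
# `retry_call` is a hypothetical helper, not part of the module:
from time import sleep

def retry_call(func, max_retries=5, logger=None):
    for i in range(max_retries):
        try:
            return func()
        except Exception as e:
            if i < max_retries - 1:
                if logger is not None:
                    logger.error("Error: %r. Retrying...", e)
                sleep(5 * i)
            else:
                raise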
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = any_tableset(f, mimetype=content_type,
                              extension=resource['format'].lower())

    # Only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(row_set.sample, [
        messytables.types.StringType,
        messytables.types.IntegerType,
        messytables.types.FloatType,
        messytables.types.DecimalType,
        messytables.types.DateUtilType
    ], strict=True)
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {
            'resource_id': resource['id'],
            'fields': [dict(id=name, type=typename)
                       for name, typename in zip(headers, guessed_type_names)],
            'force': True,
            'records': data
        }
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']},
        )
        check_response_and_retry(response, datastore_create_request_url, logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create'
    # will append to the existing datastore. And if the fields have
    # significantly changed, it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): {0}.'.format(
            resource['id']))
        response = requests.post(
            '%s/api/action/datastore_delete' % (ckan_url),
            data=json.dumps({'resource_id': resource['id'], 'force': True}),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']})
        if not response.status_code or response.status_code not in (200, 404):
            # Skips 200 (OK) and 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # Generates chunks of data that can be loaded into ckan.
    # n is the maximum size of a chunk.
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)
    logger.info('Downloaded resource %r' % (resource))

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters
    extension = resource['format'].lower()

    fp = open(result['saved_file'], 'rb')
    if zipfile.is_zipfile(result['saved_file']):
        fp, zf = open_zipped_tableset(fp, extension=extension)
        logger.info('Opened entry %s from ZIP archive %s',
                    zf, result['saved_file'])
    else:
        logger.info('Opened file %s' % (result['saved_file']))

    table_sets = any_tableset(fp, extension=extension)
    if 'sample_size' in context:
        table_sets.window = max(1000, int(context['sample_size']))
        logger.info('Using a sample window of %d', table_sets.window)

    # Only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename in zip(headers, guessed_type_names)],
                   'force': True,
                   'records': data}
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']},
        )
        check_response_and_retry(response, datastore_create_request_url, logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create'
    # will append to the existing datastore. And if the fields have
    # significantly changed, it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): {0}.'.format(
            resource['id']))
        response = requests.post(
            '%s/api/action/datastore_delete' % (ckan_url),
            data=json.dumps({'resource_id': resource['id'], 'force': True}),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']})
        if not response.status_code or response.status_code not in (200, 404):
            # Skips 200 (OK) and 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # Generates chunks of data that can be loaded into ckan.
    # n is the maximum size of a chunk.
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))
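# chunky() above batches an iterator of rows into lists of at most n dicts,
# so each datastore_create request stays small. A quick standalone check of
# that behaviour (Python 2, matching the itertools.imap usage above):
import itertools

def chunky(iterable, n):
    it = iter(iterable)
    while True:
        chunk = list(itertools.imap(dict, itertools.islice(it, n)))
        if not chunk:
            return
        yield chunk

rows = ({'id': i} for i in range(250))
assert [len(c) for c in chunky(rows, 100)] == [100, 100, 50]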
def _datastorer_upload(context, resource):
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # Only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    ckan_url = context['site_url'].rstrip('/')
    webstore_request_url = '%s/api/data/%s/' % (ckan_url, resource['id'])

    def send_request(data):
        return requests.post(
            webstore_request_url + '_bulk',
            data="%s%s" % ("\n".join(data), "\n"),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']},
        )

    data = []
    for count, dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count + 1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response, webstore_request_url + '_mapping')
            data[:] = []
    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url + '_mapping')

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
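# The `_bulk` endpoint used above expects newline-delimited JSON in the
# Elasticsearch bulk format: an action line, then a document line, per
# record, with a trailing newline. A standalone sketch of the payload that
# send_request() receives (the row values here are made up):
import json

data = []
for count, dict_ in enumerate([{'name': 'a'}, {'name': 'b'}]):
    data.append(json.dumps({"index": {"_id": count + 1}}))
    data.append(json.dumps(dict_))
payload = "%s%s" % ("\n".join(data), "\n")
# payload is now:
#   {"index": {"_id": 1}}
#   {"name": "a"}
#   {"index": {"_id": 2}}
#   {"name": "b"}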
def webstorer_upload(context, data):
    context = json.loads(context)
    resource = json.loads(data)

    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # Only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    for row in row_set.dicts():
        rows.append(row)

    webstore_url = context.get('webstore_url').rstrip('/')
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id'])

    # Check if the resource is already there.
    webstore_response = requests.get(webstore_request_url + '.json')
    check_response_and_retry(webstore_response, webstore_request_url + '.json')

    # Should be an empty list, as no tables should be there yet.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(
        webstore_request_url + '/data',
        data=json.dumps(rows),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )
    check_response_and_retry(response, webstore_request_url + '.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). Response was %s' %
                             (response.status_code, response.content))

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url + '/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
def _datastorer_upload(context, resource, logger):
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    # Only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']},
        )
        check_response_and_retry(response, datastore_create_request_url, logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # Generates chunks of data that can be loaded into ckan.
    # n is the maximum size of a chunk.
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))
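# TYPE_MAPPING is defined elsewhere in the module and is indexed by
# messytables type class above (TYPE_MAPPING[type(gt)]). A plausible sketch,
# assuming CKAN DataStore column type names; the exact mapping here is an
# assumption, not the module's actual table:
import messytables

TYPE_MAPPING = {
    messytables.types.StringType: 'text',
    messytables.types.IntegerType: 'numeric',
    messytables.types.FloatType: 'numeric',
    messytables.types.DecimalType: 'numeric',
    messytables.types.DateUtilType: 'timestamp',
}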