def test_umlaut_and_extra_comma(self):
    """Fall back to messytables when COPY fails on a malformed CSV.

    This csv has an extra comma which causes the COPY to throw a
    psycopg2.DataError and the umlaut can cause problems for logging the
    error. We need to check that it correctly reverts to using
    messytables to load it.
    """
    self.register_urls(filename="umlaut_and_extra_comma.csv")
    data = {
        "api_key": self.api_key,
        "job_type": "xloader_to_datastore",
        "result_url": self.callback_url,
        "metadata": {
            "ckan_url": "http://%s/" % self.host,
            "resource_id": self.resource_id,
        },
    }
    job_id = "test{}".format(random.randint(0, 1e5))
    with mock.patch("ckanext.xloader.jobs.set_resource_metadata"):
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch(
            "ckanext.xloader.jobs.get_current_job",
            return_value=mock.Mock(id=job_id),
        ):
            result = jobs.xloader_data_into_datastore(data)
    assert result is None, jobs_db.get_job(job_id)["error"]["message"]

    # Check it said it was successful
    assert (
        responses.calls[-1].request.url
        == "http://www.ckan.org/api/3/action/xloader_hook"
    )
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict["status"] == u"complete", job_dict
    assert job_dict == {
        u"metadata": {
            u"datastore_contains_all_records_of_source_file": True,
            u"datastore_active": True,
            u"ckan_url": u"http://www.ckan.org/",
            u"resource_id": u"foo-bar-42",
        },
        u"status": u"complete",
    }

    logs = self.get_load_logs(job_id)
    logs.assert_no_errors()

    job = jobs_db.get_job(job_id)
    assert job["status"] == u"complete"
    # `is None`, not `== None` (PEP 8 / E711)
    assert job["error"] is None
def test_umlaut_and_extra_comma(self):
    """Fall back to messytables when COPY fails on a malformed CSV.

    This csv has an extra comma which causes the COPY to throw a
    psycopg2.DataError and the umlaut can cause problems for logging the
    error. We need to check that it correctly reverts to using
    messytables to load it.
    """
    self.register_urls(filename='umlaut_and_extra_comma.csv')
    data = {
        'api_key': self.api_key,
        'job_type': 'xloader_to_datastore',
        'result_url': self.callback_url,
        'metadata': {
            'ckan_url': 'http://%s/' % self.host,
            'resource_id': self.resource_id
        }
    }
    job_id = 'test{}'.format(random.randint(0, 1e5))
    # The patched helper is never inspected here, so no `as` alias is kept.
    with mock.patch('ckanext.xloader.jobs.set_resource_metadata'):
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch('ckanext.xloader.jobs.get_current_job',
                        return_value=mock.Mock(id=job_id)):
            result = jobs.xloader_data_into_datastore(data)
    assert result is None, jobs_db.get_job(job_id)['error']['message']

    # Check it said it was successful
    eq_(responses.calls[-1].request.url,
        'http://www.ckan.org/api/3/action/xloader_hook')
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict['status'] == u'complete', job_dict
    eq_(
        job_dict,
        {
            u'metadata': {
                u'datastore_contains_all_records_of_source_file': True,
                u'datastore_active': True,
                u'ckan_url': u'http://www.ckan.org/',
                u'resource_id': u'foo-bar-42'
            },
            u'status': u'complete'
        })

    logs = self.get_load_logs(job_id)
    logs.assert_no_errors()

    job = jobs_db.get_job(job_id)
    eq_(job['status'], u'complete')
    eq_(job['error'], None)
def test_too_large_xls(self):
    """An oversized XLS must fail the load and report the error via the hook."""
    # Test not only the load and xloader_hook is called at the end
    self.register_urls(filename='simple-large.xls')
    data = {
        'api_key': self.api_key,
        'job_type': 'xloader_to_datastore',
        'result_url': self.callback_url,
        'metadata': {
            'ckan_url': 'http://%s/' % self.host,
            'resource_id': self.resource_id
        }
    }
    job_id = 'test{}'.format(random.randint(0, 1e5))
    # The patched helper is never inspected here, so no `as` alias is kept.
    with mock.patch('ckanext.xloader.jobs.set_resource_metadata'):
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch('ckanext.xloader.jobs.get_current_job',
                        return_value=mock.Mock(id=job_id)):
            result = jobs.xloader_data_into_datastore(data)
    assert result is not None, jobs_db.get_job(job_id)['error']['message']

    # Check it reported the failure back via the hook
    # (the old comment claimed "successful", but this test asserts an error)
    eq_(responses.calls[-1].request.url,
        'http://www.ckan.org/api/3/action/xloader_hook')
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict['status'] == u'error', job_dict
    eq_(
        job_dict,
        {
            u'status': u'error',
            u'metadata': {
                u'ckan_url': u'http://www.ckan.org/',
                u'datastore_contains_all_records_of_source_file': False,
                u'resource_id': u'foo-bar-42'
            },
            u'error': u'Loading file raised an error: array index out of range'
        })

    job = jobs_db.get_job(job_id)
    eq_(job['status'], u'error')
    eq_(job['error'], {
        u'message': u'Loading file raised an error: array index out of range'
    })
def test_too_large_xls(self):
    """An oversized XLS must fail the load and report the error via the hook."""
    # Test not only the load and xloader_hook is called at the end
    self.register_urls(filename="simple-large.xls")
    data = {
        "api_key": self.api_key,
        "job_type": "xloader_to_datastore",
        "result_url": self.callback_url,
        "metadata": {
            "ckan_url": "http://%s/" % self.host,
            "resource_id": self.resource_id,
        },
    }
    job_id = "test{}".format(random.randint(0, 1e5))
    with mock.patch("ckanext.xloader.jobs.set_resource_metadata"):
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch(
            "ckanext.xloader.jobs.get_current_job",
            return_value=mock.Mock(id=job_id),
        ):
            result = jobs.xloader_data_into_datastore(data)
    assert result is not None, jobs_db.get_job(job_id)["error"]["message"]

    # Check it reported the failure back via the hook
    # (the old comment claimed "successful", but this test asserts an error)
    assert (
        responses.calls[-1].request.url
        == "http://www.ckan.org/api/3/action/xloader_hook"
    )
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict["status"] == u"error", job_dict
    assert job_dict == {
        u"status": u"error",
        u"metadata": {
            u"ckan_url": u"http://www.ckan.org/",
            u"datastore_contains_all_records_of_source_file": False,
            u"resource_id": u"foo-bar-42",
        },
        u"error": u"Loading file raised an error: array index out of range",
    }

    job = jobs_db.get_job(job_id)
    assert job["status"] == u"error"
    assert job["error"] == {
        u"message": u"Loading file raised an error: array index out of range"
    }
def test_invalid_byte_sequence(self):
    """Fall back to messytables when COPY fails on a NUL byte in an xlsx.

    This xlsx throws an Postgres error on INSERT because of
    'invalid byte sequence for encoding "UTF8": 0x00' which causes
    the COPY to throw a psycopg2.DataError and umlauts in the file can
    cause problems for logging the error. We need to check that
    it correctly reverts to using messytables to load it.
    """
    self.register_urls(filename='go-realtime.xlsx')
    data = {
        'api_key': self.api_key,
        'job_type': 'xloader_to_datastore',
        'result_url': self.callback_url,
        'metadata': {
            'ckan_url': 'http://%s/' % self.host,
            'resource_id': self.resource_id
        }
    }
    job_id = "test{}".format(random.randint(0, 1e5))
    # NOTE(review): sibling tests patch set_resource_metadata — confirm this
    # jobs version really exposes set_datastore_active_flag.
    with mock.patch('ckanext.xloader.jobs.set_datastore_active_flag'):
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch(
            "ckanext.xloader.jobs.get_current_job",
            return_value=mock.Mock(id=job_id),
        ):
            result = jobs.xloader_data_into_datastore(data)
    assert result is None, jobs_db.get_job(job_id)["error"]["message"]

    # Check it said it was successful
    assert responses.calls[-1].request.url == \
        'http://www.ckan.org/api/3/action/xloader_hook'
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict['status'] == u'complete', job_dict
    assert job_dict == {
        u'metadata': {
            u'ckan_url': u'http://www.ckan.org/',
            u'resource_id': u'foo-bar-42'
        },
        u'status': u'complete'
    }

    logs = self.get_load_logs(job_id)
    logs.assert_no_errors()

    job = jobs_db.get_job(job_id)
    assert job['status'] == u'complete'
    assert job['error'] is None
def test_first_request_is_202_pending_response(self):
    """A 202 'still processing' first response must be retried, then loaded."""
    # when you first get the CSV it returns this 202 response, which is
    # what this server does: https://data-cdfw.opendata.arcgis.com/datasets
    responses.add(
        responses.GET, SOURCE_URL,
        status=202,
        body='{"processingTime":"8.716 seconds","status":"Processing","generating":{}}',
        content_type='application/json')
    # subsequent GETs of the CSV work fine
    self.register_urls()
    data = {
        'api_key': self.api_key,
        'job_type': 'xloader_to_datastore',
        'result_url': self.callback_url,
        'metadata': {
            'ckan_url': 'http://%s/' % self.host,
            'resource_id': self.resource_id
        }
    }
    job_id = 'test{}'.format(random.randint(0, 1e5))
    with mock.patch('ckanext.xloader.jobs.set_resource_metadata') \
            as mocked_set_resource_metadata:
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch('ckanext.xloader.jobs.get_current_job',
                        return_value=mock.Mock(id=job_id)):
            result = jobs.xloader_data_into_datastore(data)
    assert result is None, jobs_db.get_job(job_id)['error']['message']

    # Check it said it was successful
    eq_(responses.calls[-1].request.url,
        'http://www.ckan.org/api/3/action/xloader_hook')
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict['status'] == u'complete', job_dict
    eq_(
        job_dict,
        {
            u'metadata': {
                u'ckan_url': u'http://www.ckan.org/',
                u'datastore_contains_all_records_of_source_file': True,
                u'datastore_active': True,
                u'resource_id': u'foo-bar-42'
            },
            u'status': u'complete'
        })

    # Check the load
    data = self.get_datastore_table()
    eq_(data['headers'],
        ['_id', '_full_text', 'date', 'temperature', 'place'])
    eq_(data['header_dict']['date'], 'TEXT')
    # 'TIMESTAMP WITHOUT TIME ZONE')
    eq_(data['header_dict']['temperature'], 'TEXT')  # 'NUMERIC')
    eq_(data['header_dict']['place'], 'TEXT')  # 'TEXT')
    eq_(data['num_rows'], 6)
    eq_(data['rows'][0][2:],
        (u'2011-01-01', u'1', u'Galway'))
    # (datetime.datetime(2011, 1, 1), 1, 'Galway'))

    # Check it wanted to set the datastore_active=True
    mocked_set_resource_metadata.assert_called_once()
    eq_(
        mocked_set_resource_metadata.call_args[1]['update_dict'],
        {
            'datastore_contains_all_records_of_source_file': True,
            'datastore_active': True,
            'ckan_url': 'http://www.ckan.org/',
            'resource_id': 'foo-bar-42'
        })

    logs = self.get_load_logs(job_id)
    logs.assert_no_errors()

    job = jobs_db.get_job(job_id)
    eq_(job['status'], u'complete')
    eq_(job['error'], None)
def test_too_large_csv(self):
    """A too-large CSV loads partially and reports incomplete records."""
    # Test not only the load and xloader_hook is called at the end
    self.register_urls(filename='simple-large.csv')
    data = {
        'api_key': self.api_key,
        'job_type': 'xloader_to_datastore',
        'result_url': self.callback_url,
        'metadata': {
            'ckan_url': 'http://%s/' % self.host,
            'resource_id': self.resource_id
        }
    }
    job_id = 'test{}'.format(random.randint(0, 1e5))
    with mock.patch('ckanext.xloader.jobs.set_resource_metadata') \
            as mocked_set_resource_metadata:
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch('ckanext.xloader.jobs.get_current_job',
                        return_value=mock.Mock(id=job_id)):
            result = jobs.xloader_data_into_datastore(data)
    assert result is None, jobs_db.get_job(job_id)['error']['message']

    # Check it said it was successful
    eq_(responses.calls[-1].request.url,
        'http://www.ckan.org/api/3/action/xloader_hook')
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict['status'] == u'complete', job_dict
    eq_(
        job_dict,
        {
            u'metadata': {
                u'datastore_contains_all_records_of_source_file': False,
                u'datastore_active': True,
                u'ckan_url': u'http://www.ckan.org/',
                u'resource_id': u'foo-bar-42'
            },
            u'status': u'complete'
        })

    # Check the load
    data = self.get_datastore_table()
    eq_(data['headers'], ['_id', '_full_text', 'id', 'text'])
    eq_(data['header_dict']['id'], 'TEXT')
    # 'TIMESTAMP WITHOUT TIME ZONE')
    eq_(data['header_dict']['text'], 'TEXT')
    assert data['num_rows'] <= 100
    assert data['num_rows'] > 0
    eq_(data['rows'][0][2:], (u'1', u'a'))

    # Check it wanted to set the datastore_active=True
    mocked_set_resource_metadata.assert_called_once()
    eq_(
        mocked_set_resource_metadata.call_args[1]['update_dict'],
        {
            'datastore_contains_all_records_of_source_file': False,
            'datastore_active': True,
            'ckan_url': 'http://www.ckan.org/',
            'resource_id': 'foo-bar-42'
        })

    logs = self.get_load_logs(job_id)
    logs.assert_no_errors()

    job = jobs_db.get_job(job_id)
    eq_(job['status'], u'complete')
    eq_(job['error'], None)

    # Check ANALYZE was run
    last_analyze = self.get_time_of_last_analyze()
    assert last_analyze
def test_first_request_is_202_pending_response(self):
    """A 202 'still processing' first response must be retried, then loaded."""
    # when you first get the CSV it returns this 202 response, which is
    # what this server does: https://data-cdfw.opendata.arcgis.com/datasets
    responses.add(
        responses.GET,
        SOURCE_URL,
        status=202,
        body='{"processingTime":"8.716 seconds","status":"Processing","generating":{}}',
        content_type="application/json",
    )
    # subsequent GETs of the CSV work fine
    self.register_urls()
    data = {
        "api_key": self.api_key,
        "job_type": "xloader_to_datastore",
        "result_url": self.callback_url,
        "metadata": {
            "ckan_url": "http://%s/" % self.host,
            "resource_id": self.resource_id,
        },
    }
    job_id = "test{}".format(random.randint(0, 1e5))
    with mock.patch(
        "ckanext.xloader.jobs.set_resource_metadata"
    ) as mocked_set_resource_metadata:
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch(
            "ckanext.xloader.jobs.get_current_job",
            return_value=mock.Mock(id=job_id),
        ):
            result = jobs.xloader_data_into_datastore(data)
    assert result is None, jobs_db.get_job(job_id)["error"]["message"]

    # Check it said it was successful
    assert (
        responses.calls[-1].request.url
        == "http://www.ckan.org/api/3/action/xloader_hook"
    )
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict["status"] == u"complete", job_dict
    assert job_dict == {
        u"metadata": {
            u"ckan_url": u"http://www.ckan.org/",
            u"datastore_contains_all_records_of_source_file": True,
            u"datastore_active": True,
            u"resource_id": u"foo-bar-42",
        },
        u"status": u"complete",
    }

    # Check the load
    data = self.get_datastore_table()
    assert data["headers"] == [
        "_id",
        "_full_text",
        "date",
        "temperature",
        "place",
    ]
    assert data["header_dict"]["date"] == "TEXT"
    # 'TIMESTAMP WITHOUT TIME ZONE')
    assert data["header_dict"]["temperature"] == "TEXT"  # 'NUMERIC')
    assert data["header_dict"]["place"] == "TEXT"  # 'TEXT')
    assert data["num_rows"] == 6
    assert data["rows"][0][2:] == (u"2011-01-01", u"1", u"Galway")
    # (datetime.datetime(2011, 1, 1), 1, 'Galway'))

    # Check it wanted to set the datastore_active=True
    mocked_set_resource_metadata.assert_called_once()
    assert mocked_set_resource_metadata.call_args[1]["update_dict"] == {
        "datastore_contains_all_records_of_source_file": True,
        "datastore_active": True,
        "ckan_url": "http://www.ckan.org/",
        "resource_id": "foo-bar-42",
    }

    logs = self.get_load_logs(job_id)
    logs.assert_no_errors()

    job = jobs_db.get_job(job_id)
    assert job["status"] == u"complete"
    assert job["error"] is None
def test_too_large_csv(self):
    """A too-large CSV loads partially and reports incomplete records."""
    # Test not only the load and xloader_hook is called at the end
    self.register_urls(filename="simple-large.csv")
    data = {
        "api_key": self.api_key,
        "job_type": "xloader_to_datastore",
        "result_url": self.callback_url,
        "metadata": {
            "ckan_url": "http://%s/" % self.host,
            "resource_id": self.resource_id,
        },
    }
    job_id = "test{}".format(random.randint(0, 1e5))
    with mock.patch(
        "ckanext.xloader.jobs.set_resource_metadata"
    ) as mocked_set_resource_metadata:
        # in tests we call jobs directly, rather than use rq, so mock
        # get_current_job()
        with mock.patch(
            "ckanext.xloader.jobs.get_current_job",
            return_value=mock.Mock(id=job_id),
        ):
            result = jobs.xloader_data_into_datastore(data)
    assert result is None, jobs_db.get_job(job_id)["error"]["message"]

    # Check it said it was successful
    assert (
        responses.calls[-1].request.url
        == "http://www.ckan.org/api/3/action/xloader_hook"
    )
    job_dict = json.loads(responses.calls[-1].request.body)
    assert job_dict["status"] == u"complete", job_dict
    assert job_dict == {
        u"metadata": {
            u"datastore_contains_all_records_of_source_file": False,
            u"datastore_active": True,
            u"ckan_url": u"http://www.ckan.org/",
            u"resource_id": u"foo-bar-42",
        },
        u"status": u"complete",
    }

    # Check the load
    data = self.get_datastore_table()
    assert data["headers"] == ["_id", "_full_text", "id", "text"]
    assert data["header_dict"]["id"] == "TEXT"
    # 'TIMESTAMP WITHOUT TIME ZONE')
    assert data["header_dict"]["text"] == "TEXT"
    assert data["num_rows"] <= 100
    assert data["num_rows"] > 0
    assert data["rows"][0][2:] == (u"1", u"a")

    # Check it wanted to set the datastore_active=True
    mocked_set_resource_metadata.assert_called_once()
    assert mocked_set_resource_metadata.call_args[1]["update_dict"] == {
        "datastore_contains_all_records_of_source_file": False,
        "datastore_active": True,
        "ckan_url": "http://www.ckan.org/",
        "resource_id": "foo-bar-42",
    }

    logs = self.get_load_logs(job_id)
    logs.assert_no_errors()

    job = jobs_db.get_job(job_id)
    assert job["status"] == u"complete"
    assert job["error"] is None

    # Check ANALYZE was run
    last_analyze = self.get_time_of_last_analyze()
    assert last_analyze