def test_too_large_file(self):
    """It should raise JobError if the data file is too large.

    If the data file is larger than MAX_CONTENT_LENGTH then the async
    background job push_to_datastore should raise JobError
    (ckanserviceprovider will catch this exception and return an error
    to the client).
    """
    self.register_urls()
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    # Override the source_url (already mocked by self.register_urls()
    # above) with another mock response, this one mocks a response body
    # that's bigger than MAX_CONTENT_LENGTH.
    source_url = 'http://www.source.org/static/file'
    size = jobs.MAX_CONTENT_LENGTH + 1
    httpretty.register_uri(
        httpretty.GET,
        source_url,
        body='a' * size,
        content_type='application/json',
        forcing_headers={'content-length': None},
    )
    jobs.push_to_datastore('fake_id', data, True)
def test_too_large_file(self):
    """It should raise JobError if the data file is too large.

    If the data file is larger than MAX_CONTENT_LENGTH then the async
    background job push_to_datastore should raise JobError
    (ckanserviceprovider will catch this exception and return an error
    to the client).
    """
    self.register_urls()
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    # Override the source_url (already mocked by self.register_urls()
    # above) with another mock response, this one mocks a response body
    # that's bigger than MAX_CONTENT_LENGTH.
    source_url = "http://www.source.org/static/file"
    size = jobs.MAX_CONTENT_LENGTH + 1
    httpretty.register_uri(
        httpretty.GET,
        source_url,
        body="a" * size,
        content_length=size,
        content_type="application/json",
    )
    jobs.push_to_datastore("fake_id", data, True)
def test_simple_csv(self):
    """Push a simple CSV resource to the datastore (no dry run)."""
    self.register_urls()
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    jobs.push_to_datastore("fake_id", data)
def test_simple_csv(self):
    """Push a simple CSV resource to the datastore (no dry run)."""
    self.register_urls()
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    jobs.push_to_datastore('fake_id', data)
def test_simple_xls(self):
    """Test successfully fetching and parsing a simple XLS file.

    When given dry_run=True and a resource with a simple XLS (Excel)
    file the push_to_datastore job should fetch and parse the file and
    return the right headers and data rows from the file.
    """
    self.register_urls('simple.xls', 'xls', 'application/vnd.ms-excel')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    expected_headers = [
        {'type': 'timestamp', 'id': u'date'},
        {'type': 'numeric', 'id': u'temperature'},
        {'type': 'text', 'id': u'place'},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 6)
    expected_first_row = {
        u'date': datetime.datetime(2011, 1, 1, 0, 0),
        u'place': u'Galway',
        u'temperature': 1,
    }
    assert_equal(results[0], expected_first_row)
def test_simple_xls(self):
    """Test successfully fetching and parsing a simple XLS file.

    When given dry_run=True and a resource with a simple XLS (Excel)
    file the push_to_datastore job should fetch and parse the file and
    return the right headers and data rows from the file.
    """
    self.register_urls("simple.xls", "xls", "application/vnd.ms-excel")
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    headers, results = jobs.push_to_datastore("fake_id", data, True)
    results = list(results)
    expected_headers = [
        {"type": "timestamp", "id": u"date"},
        {"type": "numeric", "id": u"temperature"},
        {"type": "text", "id": u"place"},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 6)
    expected_first_row = {
        u"date": datetime.datetime(2011, 1, 1, 0, 0),
        u"place": u"Galway",
        u"temperature": 1,
    }
    assert_equal(results[0], expected_first_row)
def test_real_csv(self):
    """Fetch and parse a realistic CSV file in dry-run mode."""
    self.register_urls('october_2011.csv', 'csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    expected_headers = [
        {'type': 'text', 'id': u'Directorate'},
        {'type': 'text', 'id': u'Service Area'},
        {'type': 'text', 'id': u'Expenditure Category'},
        {'type': 'timestamp', 'id': u'Payment Date'},
        {'type': 'text', 'id': u'Supplier Name'},
        {'type': 'numeric', 'id': u'Internal Ref'},
        {'type': 'text', 'id': u'Capital/ Revenue'},
        {'type': 'text', 'id': u'Cost Centre'},
        {'type': 'text', 'id': u'Cost Centre Description'},
        {'type': 'numeric', 'id': u'Grand Total'},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 230)
    expected_first_row = {
        u'Directorate': u'Adult and Culture',
        u'Service Area': u'Ad Serv-Welfare Rights- ',
        u'Expenditure Category': u'Supplies & Services',
        u'Cost Centre Description': u'WELFARE RIGHTS WORKERS M',
        u'Capital/ Revenue': u'Revenue',
        u'Grand Total': 828.0,
        u'Payment Date': datetime.datetime(2011, 10, 24, 0, 0),
        u'Internal Ref': 5277184,
        u'Cost Centre': u'1MR48',
        u'Supplier Name': u'ALBANY OFFICE FURNITURE SOLUTIONS',
    }
    assert_equal(results[0], expected_first_row)
def test_do_not_push_when_same_hash(self):
    """A file whose hash matches the stored one should not be pushed."""
    source_url, res_url = self.register_urls()
    # Mock resource_update so the resource reports the same hash as the
    # fetched file; the job should bail out early.
    resource_body = json.dumps(dict(
        success=True,
        result=dict(
            id='32h4345k34h5l345',
            name='short name',
            url=source_url,
            format='csv',
            hash='0ccb75d277ec2da41faae58642e3fb11',
        ),
    ))
    httpretty.register_uri(httpretty.POST, res_url,
                           body=resource_body,
                           content_type='application/json')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    res = jobs.push_to_datastore('fake_id', data, True)
    # res should be None because we didn't get to the part that
    # returns something
    assert not res, res
def test_bad_scheme(self):
    """It should raise HTTPError(JobError) if the resource.url is an
    invalid scheme.

    (ckanserviceprovider will catch this exception and return an error
    to the client).
    """
    self.register_urls(source_url='invalid://example.com')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    jobs.push_to_datastore('fake_id', data, True)
def test_too_large_file(self):
    """A data file larger than MAX_CONTENT_LENGTH should be rejected."""
    self.register_urls()
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    # Re-mock the source URL with a body one byte over the limit.
    source_url = 'http://www.source.org/static/file'
    size = jobs.MAX_CONTENT_LENGTH + 1
    httpretty.register_uri(
        httpretty.GET,
        source_url,
        body='a' * size,
        content_length=size,
        content_type='application/json',
    )
    jobs.push_to_datastore('fake_id', data, True)
def test_content_length_string(self):
    """If the Content-Length header value is a string, just ignore it."""
    self.register_urls()
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    # Force a non-numeric Content-Length header on the mocked response.
    source_url = 'http://www.source.org/static/file'
    httpretty.register_uri(
        httpretty.GET,
        source_url,
        body='aaaaa',
        content_type='application/json',
        forcing_headers={'Content-Length': 'some string'},
    )
    jobs.push_to_datastore('fake_id', data, True)
def test_mostly_numbers(self):
    """Parse a CSV file that contains mostly numbers (dry run)."""
    self.register_urls('mixedGLB.csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    assert_equal(len(headers), 19)
    assert_equal(len(results), 133)
def test_long_file(self):
    """Parse a long CSV file (dry run) and count headers/rows."""
    self.register_urls('long.csv', 'csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    assert_equal(len(headers), 1)
    assert_equal(len(results), 4000)
def test_real_csv(self):
    """Test fetching and parsing a more realistic CSV file.

    When given dry_run=True and a resource with a CSV file, the
    push_to_datastore job should return the right headers and rows from
    the CSV file.
    """
    self.register_urls("october_2011.csv", "csv")
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    headers, results = jobs.push_to_datastore("fake_id", data, True)
    results = list(results)
    expected_headers = [
        {"type": "text", "id": u"Directorate"},
        {"type": "text", "id": u"Service Area"},
        {"type": "text", "id": u"Expenditure Category"},
        {"type": "timestamp", "id": u"Payment Date"},
        {"type": "text", "id": u"Supplier Name"},
        {"type": "numeric", "id": u"Internal Ref"},
        {"type": "text", "id": u"Capital/ Revenue"},
        {"type": "text", "id": u"Cost Centre"},
        {"type": "text", "id": u"Cost Centre Description"},
        {"type": "numeric", "id": u"Grand Total"},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 230)
    expected_first_row = {
        u"Directorate": u"Adult and Culture",
        u"Service Area": u"Ad Serv-Welfare Rights- ",
        u"Expenditure Category": u"Supplies & Services",
        u"Cost Centre Description": u"WELFARE RIGHTS WORKERS M",
        u"Capital/ Revenue": u"Revenue",
        u"Grand Total": 828.0,
        u"Payment Date": datetime.datetime(2011, 10, 24, 0, 0),
        u"Internal Ref": 5277184,
        u"Cost Centre": u"1MR48",
        u"Supplier Name": u"ALBANY OFFICE FURNITURE SOLUTIONS",
    }
    assert_equal(results[0], expected_first_row)
def test_weird_header(self):
    """Parse a CSV file with oddly padded headers (dry run)."""
    self.register_urls('weird_head_padding.csv', 'csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    assert_equal(len(headers), 11)
    assert_equal(len(results), 82)
    assert_equal(headers[1]['id'].strip(), u'1985')
    assert_equal(results[1]['column_0'].strip(), u'Gefäßchirurgie')
def test_delete_404(self):
    """The job should tolerate a 404 from datastore_delete."""
    self.register_urls()
    # Re-mock datastore_delete to return a 404 failure response.
    datastore_del_url = 'http://www.ckan.org/api/3/action/datastore_delete'
    httpretty.register_uri(httpretty.POST, datastore_del_url,
                           status=404,
                           body=json.dumps(dict(success=False)),
                           content_type='application/json')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
def test_mostly_numbers(self):
    """Test fetching and parsing a CSV file that contains mostly numbers.

    When given dry_run=True and a resource with a CSV file that
    contains mostly numbers the push_to_datastore job should return the
    right headers and rows from the CSV file.
    """
    self.register_urls("mixedGLB.csv")
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    headers, results = jobs.push_to_datastore("fake_id", data, True)
    results = list(results)
    assert_equal(len(headers), 19)
    assert_equal(len(results), 133)
def test_long_file(self):
    """Test fetching and parsing a long CSV file.

    When given dry_run=True and a resource with a long CSV file the
    push_to_datastore job should return the right number of headers and
    rows from the CSV file.
    """
    self.register_urls("long.csv", "csv")
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    headers, results = jobs.push_to_datastore("fake_id", data, True)
    results = list(results)
    assert_equal(len(headers), 1)
    assert_equal(len(results), 4000)
def test_simple_csv(self):
    """Parse a simple CSV file (dry run) and verify headers and rows."""
    self.register_urls()
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    expected_headers = [
        {'type': 'timestamp', 'id': u'date'},
        {'type': 'numeric', 'id': u'temperature'},
        {'type': 'text', 'id': u'place'},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 6)
    expected_first_row = {
        u'date': datetime.datetime(2011, 1, 1, 0, 0),
        u'place': u'Galway',
        u'temperature': 1,
    }
    assert_equal(results[0], expected_first_row)
def test_do_not_push_when_same_hash(self):
    """A file should not be pushed if it hasn't changed.

    If a resource's file has already been added to the datastore and
    then the datapusher's push_to_datastore job fetchess and parses it
    again and the file has not changed, then push_to_datastore should
    return None rather than parsing the file and returning headers and
    rows.

    FIXME: This relies on a return statement early in the
    push_to_datastore function, this doesn't seem like a great way to
    test that the file was not pushed.
    """
    source_url, res_url = self.register_urls()
    resource_body = json.dumps(dict(
        success=True,
        result=dict(
            id="32h4345k34h5l345",
            name="short name",
            url=source_url,
            format="csv",
            hash="0ccb75d277ec2da41faae58642e3fb11",
        ),
    ))
    httpretty.register_uri(httpretty.POST, res_url,
                           body=resource_body,
                           content_type="application/json")
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    res = jobs.push_to_datastore("fake_id", data, True)
    # res should be None because we didn't get to the part that
    # returns something
    assert not res, res
def test_do_not_push_when_same_hash(self):
    """A file should not be pushed if it hasn't changed.

    If a resource's file has already been added to the datastore and
    then the datapusher's push_to_datastore job fetchess and parses it
    again and the file has not changed, then push_to_datastore should
    return None rather than parsing the file and returning headers and
    rows.

    FIXME: This relies on a return statement early in the
    push_to_datastore function, this doesn't seem like a great way to
    test that the file was not pushed.
    """
    source_url, res_url = self.register_urls()
    # The mocked resource carries the same hash as the fetched file, so
    # the job should return early without pushing.
    resource_body = json.dumps(dict(
        success=True,
        result=dict(
            id='32h4345k34h5l345',
            name='short name',
            url=source_url,
            format='csv',
            hash='0ccb75d277ec2da41faae58642e3fb11',
        ),
    ))
    httpretty.register_uri(httpretty.POST, res_url,
                           body=resource_body,
                           content_type='application/json')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    res = jobs.push_to_datastore('fake_id', data, True)
    # res should be None because we didn't get to the part that
    # returns something
    assert not res, res
def test_weird_header(self):
    """Test fetching and parsing a CSV file with "weird" header padding.

    When given dry_run=True and a resource with a CSV file with weird
    header padding the push_to_datastore job should return the right
    headers and rows from the CSV file.
    """
    self.register_urls("weird_head_padding.csv", "csv")
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    headers, results = jobs.push_to_datastore("fake_id", data, True)
    results = list(results)
    assert_equal(len(headers), 9)
    assert_equal(len(results), 82)
    assert_equal(headers[0]["id"].strip(), u"1985")
    assert_equal(results[1]["1993"].strip(), u"379")
def test_delete_404(self):
    """The job should tolerate a 404 from datastore_delete."""
    self.register_urls()
    # Override the mocked datastore_delete URL with another mock
    # response that returns a 404.
    datastore_del_url = "http://www.ckan.org/api/3/action/datastore_delete"
    httpretty.register_uri(
        httpretty.POST,
        datastore_del_url,
        status=404,
        body=json.dumps(dict(success=False)),
        content_type="application/json",
    )
    metadata = dict(ckan_url="http://%s/" % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type="push_to_datastore",
                metadata=metadata)
    headers, results = jobs.push_to_datastore("fake_id", data, True)
def test_mostly_numbers(self):
    """Test fetching and parsing a CSV file that contains mostly numbers.

    When given dry_run=True and a resource with a CSV file that
    contains mostly numbers the push_to_datastore job should return the
    right headers and rows from the CSV file.
    """
    self.register_urls('mixedGLB.csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    assert_equal(len(headers), 19)
    assert_equal(len(results), 133)
def test_long_file(self):
    """Test fetching and parsing a long CSV file.

    When given dry_run=True and a resource with a long CSV file the
    push_to_datastore job should return the right number of headers and
    rows from the CSV file.
    """
    self.register_urls('long.csv', 'csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    assert_equal(len(headers), 1)
    assert_equal(len(results), 4000)
def test_simple_ssv(self):
    """Test successfully fetching and parsing a simple SSV file.

    When given dry_run=True and a resource with a simple SSV
    (semicolon-separated values) file the push_to_datastore job should
    fetch and parse the file and return the right headers and data rows
    from the file.
    """
    self.register_urls('simple.ssv', 'csv', 'application/csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    expected_headers = [
        {'type': 'timestamp', 'id': u'date'},
        {'type': 'numeric', 'id': u'temperature'},
        {'type': 'text', 'id': u'place'},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 6)
    expected_first_row = {
        u'date': datetime.datetime(2011, 1, 1, 0, 0),
        u'place': u'Galway',
        u'temperature': 1,
    }
    assert_equal(results[0], expected_first_row)
def test_weird_header(self):
    """Test fetching and parsing a CSV file with "weird" header padding.

    When given dry_run=True and a resource with a CSV file with weird
    header padding the push_to_datastore job should return the right
    headers and rows from the CSV file.
    """
    self.register_urls('weird_head_padding.csv', 'csv')
    data = dict(
        api_key=self.api_key,
        job_type='push_to_datastore',
        metadata=dict(ckan_url='http://%s/' % self.host,
                      resource_id=self.resource_id),
    )
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    assert_equal(len(headers), 9)
    assert_equal(len(results), 82)
    assert_equal(headers[0]['id'].strip(), u'1985')
    assert_equal(results[1]['1993'].strip(), u'379')
def test_csv_with_html_content_type(self):
    """Test successfully fetching and parsing a simple CSV file when the
    content type specified is text/html not csv.
    """
    self.register_urls(content_type="text/html")
    metadata = dict(ckan_url='http://%s/' % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type='push_to_datastore',
                metadata=metadata)
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    expected_headers = [
        {'type': 'timestamp', 'id': 'date'},
        {'type': 'numeric', 'id': 'temperature'},
        {'type': 'text', 'id': 'place'},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 6)
    expected_first_row = {
        'date': datetime.datetime(2011, 1, 1, 0, 0),
        'place': 'Galway',
        'temperature': 1,
    }
    assert_equal(results[0], expected_first_row)
def test_real_csv(self):
    """Test fetching and parsing a more realistic CSV file.

    When given dry_run=True and a resource with a CSV file, the
    push_to_datastore job should return the right headers and rows from
    the CSV file.
    """
    self.register_urls('october_2011.csv', 'csv')
    metadata = dict(ckan_url='http://%s/' % self.host,
                    resource_id=self.resource_id)
    data = dict(api_key=self.api_key,
                job_type='push_to_datastore',
                metadata=metadata)
    headers, results = jobs.push_to_datastore('fake_id', data, True)
    results = list(results)
    expected_headers = [
        {'type': 'text', 'id': u'Directorate'},
        {'type': 'text', 'id': u'Service Area'},
        {'type': 'text', 'id': u'Expenditure Category'},
        {'type': 'timestamp', 'id': u'Payment Date'},
        {'type': 'text', 'id': u'Supplier Name'},
        {'type': 'numeric', 'id': u'Internal Ref'},
        {'type': 'text', 'id': u'Capital/ Revenue'},
        {'type': 'text', 'id': u'Cost Centre'},
        {'type': 'text', 'id': u'Cost Centre Description'},
        {'type': 'numeric', 'id': u'Grand Total'},
    ]
    assert_equal(headers, expected_headers)
    assert_equal(len(results), 230)
    expected_first_row = {
        u'Directorate': u'Adult and Culture',
        u'Service Area': u'Ad Serv-Welfare Rights- ',
        u'Expenditure Category': u'Supplies & Services',
        u'Cost Centre Description': u'WELFARE RIGHTS WORKERS M',
        u'Capital/ Revenue': u'Revenue',
        u'Grand Total': 828.0,
        u'Payment Date': datetime.datetime(2011, 10, 24, 0, 0),
        u'Internal Ref': 5277184,
        u'Cost Centre': u'1MR48',
        u'Supplier Name': u'ALBANY OFFICE FURNITURE SOLUTIONS',
    }
    assert_equal(results[0], expected_first_row)