def test_unicode(self):
    """Records containing non-ASCII unicode characters upload and persist intact."""
    rows = [
        [u'Column A', u'Column B'],
        [u'Some char: \u1234', u'The euro char: \u20ac']
    ]
    xlsx_path = helpers.rows_to_xlsx_file(rows)
    client = self.custodian_1_client
    # Dataset starts empty.
    self.assertEqual(0, self.ds.record_queryset.count())
    with open(xlsx_path, 'rb') as f:
        payload = {
            'file': f,
            'strict': False
        }
        response = client.post(self.url, data=payload, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    # Records are persisted in row order.
    queryset = self.ds.record_queryset.order_by('pk')
    self.assertEqual(len(rows) - 1, queryset.count())
    first_record = queryset[0]
    expected = {
        'Column A': u'Some char: \u1234',
        'Column B': u'The euro char: \u20ac',
    }
    self.assertEqual(expected, first_record.data)
def test_headers_are_trimmed_xlsx(self):
    """Surrounding whitespace in xlsx header cells is trimmed on upload."""
    fields = ['What', 'When', 'Who']
    dataset = self._create_dataset_from_rows([fields])
    self.assertEqual(dataset.schema.headers, fields)
    # Upload records whose header row carries extra whitespace.
    rows = [
        ['What ', ' When', ' Who '],
        ['Something', '2018-02-01', 'me'],
    ]
    xlsx_path = helpers.rows_to_xlsx_file(rows)
    client = self.custodian_1_client
    url = reverse('api:dataset-upload', kwargs={'pk': dataset.pk})
    with open(xlsx_path, 'rb') as f:
        payload = {
            'file': f,
            'strict': True  # upload in strict mode
        }
        response = client.post(url, payload, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    # Stored data must be keyed by the trimmed header names...
    record = dataset.record_queryset.first()
    self.assertEqual(record.data.get('What'), 'Something')
    self.assertEqual(record.data.get('When'), '2018-02-01')
    self.assertEqual(record.data.get('Who'), 'me')
    # ...and the untrimmed (whitespace-padded) names must not appear at all.
    for padded_header in rows[0]:
        self.assertIsNone(record.data.get(padded_header))
def test_mix_types_infer_most_plausible(self):
    """
    Scenario: a column holding more integers than strings.
    Given a column contains 2 strings and 5 integers
    Then the inferred column type should be 'integer'
    """
    header = ['How Many']
    table = [header, [1], ['1 or 2'], ['3 or 4'], [2], [3], [4], [5]]
    client = self.data_engineer_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    received = response.json()
    # The inferred data package must be present and usable.
    self.assertIn('data_package', received)
    self.verify_inferred_data(received)
    # The majority type (integer) should win over the minority strings.
    descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
    schema = utils_data_package.GenericSchema(descriptor)
    self.assertEqual(schema.get_field_by_name('How Many').type, 'integer')
def test_generic_string_and_number_simple_xls(self):
    """Inference detects string, integer and number column types from an xlsx."""
    header = ['Name', 'Age', 'Weight', 'Comments']
    table = [
        header,
        ['Frederic', 56, 80.5, 'a comment'],
        ['Hilda', 24, 56, '']
    ]
    client = self.data_engineer_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
        # Response must be json.
        self.assertEqual(response.get('content-type'), 'application/json')
        received = response.json()
        # 'name' is derived from the uploaded file name (without extension).
        self.assertIn('name', received)
        expected_name = path.splitext(path.basename(f.name))[0]
        self.assertEqual(expected_name, received.get('name'))
    # With no special columns the dataset type falls back to 'generic'.
    self.assertIn('type', received)
    self.assertEqual('generic', received.get('type'))
    # data_package verification
    self.assertIn('data_package', received)
    self.verify_inferred_data(received)
    # Schema: one field per column, same order.
    descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
    schema = utils_data_package.GenericSchema(descriptor)
    self.assertEqual(len(schema.fields), len(header))
    self.assertEqual(schema.field_names, header)
    # Every column is optional with the default format; only the type varies.
    expected_types = [
        ('Name', 'string'),
        ('Age', 'integer'),
        ('Weight', 'number'),
        ('Comments', 'string'),
    ]
    for column, expected_type in expected_types:
        field = schema.get_field_by_name(column)
        self.assertEqual(field.type, expected_type)
        self.assertFalse(field.required)
        self.assertEqual(field.format, 'default')
def test_observation_with_genus_and_species_only_xls(self):
    """
    Scenario: a file with Latitude, Longitude, Genus and Species columns
    should be inferred as a species observation dataset.
    Given columns named Latitude, Longitude, Genus and Species exist
    Then the dataset type should be speciesObservation
    And 'Genus' should be a required string tagged with biosys type genus
    And 'Species' should be a required string tagged with biosys type species
    """
    header = [
        'What', 'When', 'Latitude', 'Longitude', 'Genus', 'Species', 'Comments'
    ]
    table = [
        header,
        ['I saw a dog', '2018-02-02', -32, 117.75, 'Canis', 'lupus', None],
        [
            'I saw a Chubby bat', '2017-01-02', -32, 116.7, 'Chubby', 'bat',
            'Amazing!'
        ],
        ['I saw nothing', '2018-01-02', -32.34, 116.7, None, None, None],
    ]
    client = self.custodian_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    received = response.json()
    # Must be inferred as a species observation dataset.
    self.assertEqual(Dataset.TYPE_SPECIES_OBSERVATION, received.get('type'))
    self.assertIn('data_package', received)
    descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
    schema = utils_data_package.GenericSchema(descriptor)
    # Both taxonomy columns share the same constraints apart from the biosys tag.
    taxonomy_expectations = [
        ('Genus', BiosysSchema.GENUS_TYPE_NAME),
        ('Species', BiosysSchema.SPECIES_TYPE_NAME),
    ]
    for column, expected_tag in taxonomy_expectations:
        field = schema.get_field_by_name(column)
        self.assertIsNotNone(field)
        self.assertEqual(field.type, 'string')
        self.assertTrue(field.required)
        biosys = field.get('biosys')
        self.assertIsNotNone(biosys)
        self.assertEqual(biosys.get('type'), expected_tag)
    # test that we can create a dataset with the returned data
    self.verify_inferred_data(received)
def test_upload_xlsx_happy_path(self):
    """A strict-mode xlsx upload stores records in row order with source info."""
    rows = [
        ['Column A', 'Column B'],
        ['A1', 'B1'],
        ['A2', 'B2']
    ]
    xlsx_path = helpers.rows_to_xlsx_file(rows)
    client = self.custodian_1_client
    # Dataset starts empty.
    self.assertEqual(0, self.ds.record_queryset.count())
    file_name = path.basename(xlsx_path)
    with open(xlsx_path, 'rb') as f:
        payload = {
            'file': f,
            'strict': True  # upload in strict mode
        }
        response = client.post(self.url, data=payload, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    # Records should be stored in the same order as the rows.
    queryset = self.ds.record_queryset.order_by('pk')
    self.assertEqual(len(rows) - 1, queryset.count())
    expected_records = [
        {'Column A': 'A1', 'Column B': 'B1'},
        {'Column A': 'A2', 'Column B': 'B2'},
    ]
    for index, expected_data in enumerate(expected_records):
        record = queryset[index]
        self.assertEqual(expected_data, record.data)
        # source_info reports the file name and the spreadsheet row number
        # (+2: one for the header row, one because rows are 1-based).
        source_info = record.source_info
        self.assertIsNotNone(source_info)
        self.assertEqual(source_info, {
            'file_name': file_name,
            'row': index + 2
        })
    self.assertEqual(self.project_1.record_count, len(rows) - 1)
    self.assertEqual(self.ds.record_count, len(rows) - 1)
def test_observation_with_lat_long_datum_xls(self):
    """
    Scenario: a file with Latitude, Longitude and Datum columns.
    Given columns named Latitude, Longitude and Datum exist
    Then the dataset type should be inferred as Observation
    And latitude should be a required 'number' tagged with biosys type latitude
    And longitude should be a required 'number' tagged with biosys type longitude
    And datum should be a non-required 'string' tagged with biosys type datum
    """
    header = ['What', 'Latitude', 'Longitude', 'Datum']
    table = [
        header,
        ['Observation1', -32, 117.75, 'WGS84'],
        ['Observation with lat/long as string', '-32', '115.75', None]
    ]
    client = self.custodian_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    received = response.json()
    # Presence of lat/long columns drives the type to observation.
    self.assertEqual(Dataset.TYPE_OBSERVATION, received.get('type'))
    descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
    schema = utils_data_package.GenericSchema(descriptor)
    # (column, expected type, required?, biosys tag)
    expectations = [
        ('Latitude', 'number', True, BiosysSchema.LATITUDE_TYPE_NAME),
        ('Longitude', 'number', True, BiosysSchema.LONGITUDE_TYPE_NAME),
        ('Datum', 'string', False, BiosysSchema.DATUM_TYPE_NAME),
    ]
    for column, expected_type, is_required, expected_tag in expectations:
        field = schema.get_field_by_name(column)
        self.assertEqual(field.type, expected_type)
        if is_required:
            self.assertTrue(field.required)
        else:
            self.assertFalse(field.required)
        self.assertEqual(field.get('biosys').get('type'), expected_tag)
    # test that we can save the dataset back.
    self.verify_inferred_data(received)
def test_observation_with_lat_long_xls(self):
    """
    Scenario: a file with Latitude and Longitude columns.
    Given columns named Latitude and Longitude exist
    Then both should be of type 'number'
    And both should be set as required
    And both should carry the matching biosys tag
    And the dataset type should be observation
    """
    header = ['What', 'Latitude', 'Longitude']
    table = [
        header,
        ['Observation1', -32, 117.75],
        ['Observation with lat/long as string', '-32', '115.75']
    ]
    client = self.custodian_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    received = response.json()
    # data_package verification
    self.assertIn('data_package', received)
    descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
    schema = utils_data_package.GenericSchema(descriptor)
    latitude = schema.get_field_by_name('Latitude')
    longitude = schema.get_field_by_name('Longitude')
    # Both coordinate columns are required numbers.
    for coordinate_field in (latitude, longitude):
        self.assertEqual(coordinate_field.type, 'number')
        self.assertTrue(coordinate_field.required)
    # Each carries the matching biosys tag.
    self.assertTrue(
        BiosysSchema(latitude.get(BiosysSchema.BIOSYS_KEY_NAME)).is_latitude())
    self.assertTrue(
        BiosysSchema(longitude.get(BiosysSchema.BIOSYS_KEY_NAME)).is_longitude())
    self.assertEqual(Dataset.TYPE_OBSERVATION, received.get('type'))
    # test biosys validity
    self.verify_inferred_data(received)
def test_site_no_date(self):
    """A record with a site code but no date field stores a null datetime
    and inherits its geometry from the site."""
    table = [['What', 'Site'], ['No Date', self.site.code]]
    xlsx_path = helpers.rows_to_xlsx_file(table)
    client = self.custodian_1_client
    with open(xlsx_path, 'rb') as f:
        payload = {
            'file': f,
            'strict': True  # upload in strict mode
        }
        response = client.post(self.url, data=payload, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    all_records = self.dataset.record_queryset.all()
    self.assertEqual(len(all_records), 1)
    record = all_records[0]
    # Site resolved from its code; no datetime; geometry taken from the site.
    self.assertEqual(record.site, self.site)
    self.assertIsNone(record.datetime)
    self.assertEqual(record.geometry, self.site.geometry)
def test_upload_xlsx_happy_path(self):
    """Uploading a well-formed sites xlsx creates one Site per data row."""
    rows = [
        [
            'Site Code', 'Site Name', 'Description', 'Latitude', 'Longitude',
            'Datum', 'Attribute1', 'Attribute2'
        ],
        ['C1', 'Site 1', 'Description1', -32, 116, '', 'attr11', 'attr12'],
        ['C2', 'Site 2', 'Description2', -31, 117, '', 'attr21', 'attr22'],
    ]
    workbook_path = helpers.rows_to_xlsx_file(rows)
    project = self.project_1
    client = self.custodian_1_client
    url = reverse('api:upload-sites', kwargs={'pk': project.pk})
    # Project starts with no sites.
    self.assertEqual(0, Site.objects.filter(project=project).count())
    with open(workbook_path, 'rb') as f:
        response = client.post(url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    sites = Site.objects.filter(project=project)
    self.assertEqual(len(rows) - 1, sites.count())
    self.assertEqual(['C1', 'C2'], [s.code for s in sites.order_by('code')])
    self.assertEqual(
        ['Site 1', 'Site 2'], [s.name for s in sites.order_by('name')])
    self.assertEqual(
        ['Description1', 'Description2'],
        [s.description for s in sites.order_by('description')])
    # Geometry and attributes of the first site.
    first_site = sites.filter(code='C1').first()
    self.assertEqual((116, -32), (first_site.geometry.x, first_site.geometry.y))
    expected_attributes = {
        'Latitude': '-32',
        'Longitude': '116',
        'Datum': '',
        'Attribute1': 'attr11',
        'Attribute2': 'attr12'
    }
    self.assertEqual(expected_attributes, first_site.attributes)
    self.assertEqual(project.site_count, len(rows) - 1)
def test_generic_date_iso_xls(self):
    """
    Scenario: a date column populated with ISO 'yyyy-mm-dd' strings.
    Given a column holds strings of the form 'yyyy-mm-dd'
    Then the column type should be 'date'
    And the format should be 'any'
    """
    header = ['What', 'When']
    table = [
        header,
        ['Something', '2018-01-19'],
        ['Another thing', dt.date(2017, 12, 29).isoformat()],
        ['Another thing', '2017-08-01']
    ]
    client = self.data_engineer_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    received = response.json()
    # data_package verification
    self.assertIn('data_package', received)
    self.verify_inferred_data(received)
    descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
    schema = utils_data_package.GenericSchema(descriptor)
    # 'What' stays a plain optional string with the default format.
    what_field = schema.get_field_by_name('What')
    self.assertEqual(what_field.type, 'string')
    self.assertFalse(what_field.required)
    self.assertEqual(what_field.format, 'default')
    # 'When' is detected as a date with the lenient 'any' format.
    when_field = schema.get_field_by_name('When')
    self.assertEqual(when_field.type, 'date')
    self.assertFalse(when_field.required)
    self.assertEqual(when_field.format, 'any')
def test_infer_dataset_param(self):
    """
    When infer_dataset_type=False is posted the returned type is generic,
    even though the columns would otherwise qualify as an observation dataset.
    """
    header = ['What', 'Latitude', 'Longitude']
    table = [
        header,
        ['Observation1', -32.0, 117.75],
        ['Observation with lat/long as string', '-32.0', '115.75']
    ]
    client = self.custodian_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        # Without the param the type should be inferred as observation.
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
        self.assertEqual(Dataset.TYPE_OBSERVATION, response.json().get('type'))
        # With the param the type should fall back to generic.
        f.seek(0)
        response = client.post(
            self.url,
            data={'file': f, 'infer_dataset_type': False},
            format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
        received = response.json()
        self.assertEqual(Dataset.TYPE_GENERIC, received.get('type'))
        descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
        schema = utils_data_package.GenericSchema(descriptor)
        # Without type inference the lat/long columns carry no required constraint.
        self.assertFalse(schema.get_field_by_name('Latitude').required)
        self.assertFalse(schema.get_field_by_name('Longitude').required)
def test_easting_northing_geometry_extraction(self):
    """Site geometry is derived from Easting/Northing with Datum and Zone columns."""
    rows = [
        [
            'Code', 'Name', 'Description', 'Easting', 'Northing', 'Datum',
            'Zone', 'Attribute1', 'Attribute2'
        ],
        [
            'C1', 'Site 1', 'Description1', '405542.537', '6459127.469',
            'GDA94', '50', 'attr11', 'attr12'
        ],
    ]
    workbook_path = helpers.rows_to_xlsx_file(rows)
    project = self.project_1
    client = self.custodian_1_client
    url = reverse('api:upload-sites', kwargs={'pk': project.pk})
    # Project starts with no sites.
    self.assertEqual(0, Site.objects.filter(project=project).count())
    with open(workbook_path, 'rb') as f:
        response = client.post(url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    sites = Site.objects.filter(project=project)
    self.assertEqual(sites.count(), 1)
    site = sites.first()
    self.assertEqual(site.code, 'C1')
    self.assertEqual(site.name, 'Site 1')
    self.assertEqual(site.description, 'Description1')
    # The UTM coordinates above project to approximately lon 116 / lat -32.
    self.assertAlmostEqual(site.geometry.x, 116, places=4)
    self.assertAlmostEqual(site.geometry.y, -32, places=4)
    # All non-identity source columns are kept verbatim as string attributes.
    expected_attributes = {
        'Easting': '405542.537',
        'Northing': '6459127.469',
        'Datum': 'GDA94',
        'Zone': '50',
        'Attribute1': 'attr11',
        'Attribute2': 'attr12'
    }
    self.assertEqual(expected_attributes, site.attributes)
def test_observation_with_easting_northing_zone_xls(self):
    """
    Scenario: a file with Easting, Northing and Zone columns.
    Given columns named Easting, Northing and Zone exist
    Then the dataset type should be inferred as Observation
    And Easting and Northing should be required 'number' fields
    And they should carry the matching biosys tags
    And Zone should be a required 'integer' field
    """
    header = ['What', 'Easting', 'Northing', 'Zone', 'Comments']
    table = [
        header,
        ['Something', 12563.233, 568932.345, 50, 'A dog'],
        [
            'Observation with easting/northing as string', '12563.233',
            '568932.345', 50, 'A dog'
        ]
    ]
    client = self.custodian_1_client
    xlsx_path = helpers.rows_to_xlsx_file(table)
    with open(xlsx_path, 'rb') as f:
        response = client.post(self.url, data={'file': f}, format='multipart')
        self.assertEqual(status.HTTP_200_OK, response.status_code)
    received = response.json()
    # Easting/northing columns drive the type to observation.
    self.assertEqual(Dataset.TYPE_OBSERVATION, received.get('type'))
    # data_package verification
    self.assertIn('data_package', received)
    descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
    schema = utils_data_package.GenericSchema(descriptor)
    # (column, expected type, biosys tag) — all three columns are required.
    expectations = [
        ('Easting', 'number', BiosysSchema.EASTING_TYPE_NAME),
        ('Northing', 'number', BiosysSchema.NORTHING_TYPE_NAME),
        ('Zone', 'integer', BiosysSchema.ZONE_TYPE_NAME),
    ]
    for column, expected_type, expected_tag in expectations:
        field = schema.get_field_by_name(column)
        self.assertIsNotNone(field)
        self.assertEqual(field.type, expected_type)
        self.assertTrue(field.required)
        biosys = field.get('biosys')
        self.assertIsNotNone(biosys)
        self.assertEqual(biosys.get('type'), expected_tag)
    # test that we can save the dataset as returned
    self.verify_inferred_data(received)