    def test_unicode(self):
        """
        Test that unicode characters work.
        """
        csv_data = [
            [u'Column A', u'Column B'],
            [u'Some char: \u1234', u'The euro char: \u20ac']
        ]
        file_ = helpers.rows_to_xlsx_file(csv_data)
        client = self.custodian_1_client
        self.assertEqual(0, self.ds.record_queryset.count())
        with open(file_, 'rb') as fp:
            data = {
                'file': fp,
                'strict': False
            }
            resp = client.post(self.url, data=data, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            # The records should be saved in row order
            qs = self.ds.record_queryset.order_by('pk')
            self.assertEqual(len(csv_data) - 1, qs.count())

            index = 0
            record = qs[index]
            expected_data = {
                'Column A': u'Some char: \u1234',
                'Column B': u'The euro char: \u20ac',
            }
            self.assertEqual(expected_data, record.data)
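
    # All of these tests build their fixture files with
    # helpers.rows_to_xlsx_file. A minimal sketch of such a helper, assuming
    # openpyxl is available (the real helper's implementation may differ):
    #
    #   import os
    #   import tempfile
    #   from openpyxl import Workbook
    #
    #   def rows_to_xlsx_file(rows):
    #       """Write rows into a temporary .xlsx file and return its path."""
    #       wb = Workbook()
    #       ws = wb.active
    #       for row in rows:
    #           ws.append(row)  # one worksheet row per input row
    #       fd, path_ = tempfile.mkstemp(suffix='.xlsx')
    #       os.close(fd)
    #       wb.save(path_)
    #       return path_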

    def test_headers_are_trimmed_xlsx(self):
        """
        Same as above but with an xlsx file.
        """
        fields = ['What', 'When', 'Who']
        dataset = self._create_dataset_from_rows([
            fields
        ])
        schema = dataset.schema
        self.assertEqual(schema.headers, fields)

        # upload record
        csv_data = [
            ['What ', ' When', ' Who  '],
            ['Something', '2018-02-01', 'me'],
        ]
        file_ = helpers.rows_to_xlsx_file(csv_data)
        client = self.custodian_1_client
        url = reverse('api:dataset-upload', kwargs={'pk': dataset.pk})
        with open(file_, 'rb') as fp:
            data = {
                'file': fp,
                'strict': True  # upload in strict mode
            }
            resp = client.post(url, data, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)

            # verify stored data
            record = dataset.record_queryset.first()
            self.assertEqual(record.data.get('What'), 'Something')
            self.assertEqual(record.data.get('When'), '2018-02-01')
            self.assertEqual(record.data.get('Who'), 'me')
            # verify that the untrimmed field names (with spaces) don't exist
            for f in csv_data[0]:
                self.assertIsNone(record.data.get(f))
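            # The trimming presumably happens at ingestion time, something
            # like [h.strip() for h in first_row], so 'What ', ' When' and
            # ' Who  ' all resolve to the schema fields 'What', 'When', 'Who'.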

    def test_mix_types_infer_most_plausible(self):
        """
        Scenario: a column with more integers than strings should be inferred as type='integer'.
        Given that a column contains 2 strings and 5 integers
        Then the column type should be 'integer'
        """
        columns = ['How Many']
        rows = [columns, [1], ['1 or 2'], ['3 or 4'], [2], [3], [4], [5]]
        client = self.data_engineer_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            # data_package verification
            self.assertIn('data_package', received)
            self.verify_inferred_data(received)
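            # verify_inferred_data is a helper on the test class (not shown in
            # this listing); per the comments in the tests further down, it
            # checks that a dataset can be created from the returned
            # data_package.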

            # verify schema
            schema_descriptor = Package(
                received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            field = schema.get_field_by_name('How Many')
            self.assertEqual(field.type, 'integer')
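
    # A rough sketch of the majority-vote idea this test exercises; the actual
    # inference is done by the data-package tooling, not by code like this:
    #
    #   from collections import Counter
    #
    #   def most_plausible_type(values):
    #       def cast(value):
    #           try:
    #               int(value)
    #               return 'integer'
    #           except (TypeError, ValueError):
    #               return 'string'
    #       # e.g. 5 integers vs 2 strings -> 'integer'
    #       return Counter(cast(v) for v in values).most_common(1)[0][0]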

    def test_generic_string_and_number_simple_xls(self):
        """
        Test that the inference detects number and integer types.
        """
        columns = ['Name', 'Age', 'Weight', 'Comments']
        rows = [
            columns, ['Frederic', 56, 80.5, 'a comment'],
            ['Hilda', 24, 56, '']
        ]
        client = self.data_engineer_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            # should be json
            self.assertEqual(resp.get('content-type'), 'application/json')
            received = resp.json()

            # name should be set with the file name
            self.assertIn('name', received)
            file_name = path.splitext(path.basename(fp.name))[0]
            self.assertEqual(file_name, received.get('name'))
            # type should be 'generic'
            self.assertIn('type', received)
            self.assertEqual('generic', received.get('type'))

            # data_package verification
            self.assertIn('data_package', received)
            self.verify_inferred_data(received)

            # verify schema
            schema_descriptor = Package(
                received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            self.assertEqual(len(schema.fields), len(columns))
            self.assertEqual(schema.field_names, columns)

            field = schema.get_field_by_name('Name')
            self.assertEqual(field.type, 'string')
            self.assertFalse(field.required)
            self.assertEqual(field.format, 'default')

            field = schema.get_field_by_name('Age')
            self.assertEqual(field.type, 'integer')
            self.assertFalse(field.required)
            self.assertEqual(field.format, 'default')

            field = schema.get_field_by_name('Weight')
            self.assertEqual(field.type, 'number')
            self.assertFalse(field.required)
            self.assertEqual(field.format, 'default')

            field = schema.get_field_by_name('Comments')
            self.assertEqual(field.type, 'string')
            self.assertFalse(field.required)
            self.assertEqual(field.format, 'default')

    def test_observation_with_genus_and_species_only_xls(self):
        """
        Scenario: a file with columns Latitude, Longitude, Genus and Species should be inferred as a species observation.
         Given that columns named Latitude, Longitude, Genus and Species exist
         Then the dataset type should be speciesObservation
         And the column 'Genus' should be of type string, set as required and tagged with the biosys type genus
         And the column 'Species' should be of type string, set as required and tagged with the biosys type species
        """
        columns = [
            'What', 'When', 'Latitude', 'Longitude', 'Genus', 'Species',
            'Comments'
        ]
        rows = [
            columns,
            ['I saw a dog', '2018-02-02', -32, 117.75, 'Canis', 'lupus', None],
            [
                'I saw a Chubby bat', '2017-01-02', -32, 116.7, 'Chubby',
                'bat', 'Amazing!'
            ],
            ['I saw nothing', '2018-01-02', -32.34, 116.7, None, None, None],
        ]
        client = self.custodian_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            # should be a species observation
            self.assertEqual(Dataset.TYPE_SPECIES_OBSERVATION,
                             received.get('type'))
            self.assertIn('data_package', received)
            schema_descriptor = Package(
                received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            # field attributes
            # genus
            genus = schema.get_field_by_name('Genus')
            self.assertIsNotNone(genus)
            self.assertEqual(genus.type, 'string')
            self.assertTrue(genus.required)
            biosys = genus.get('biosys')
            self.assertIsNotNone(biosys)
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.GENUS_TYPE_NAME)

            species = schema.get_field_by_name('Species')
            self.assertIsNotNone(species)
            self.assertEqual(species.type, 'string')
            self.assertTrue(species.required)
            biosys = species.get('biosys')
            self.assertIsNotNone(biosys)
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.SPECIES_TYPE_NAME)

            # test that we can create a dataset with the returned data
            self.verify_inferred_data(received)

    def test_upload_xlsx_happy_path(self):
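        """
        Upload a valid xlsx file in strict mode and check that the records are
        saved in row order with the expected data, and that each record's
        source_info carries the file name and the 1-based spreadsheet row
        number (header row included, hence index + 2).
        """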
        csv_data = [
            ['Column A', 'Column B'],
            ['A1', 'B1'],
            ['A2', 'B2']
        ]
        file_ = helpers.rows_to_xlsx_file(csv_data)
        client = self.custodian_1_client
        self.assertEqual(0, self.ds.record_queryset.count())
        file_name = path.basename(file_)
        with open(file_, 'rb') as fp:
            data = {
                'file': fp,
                'strict': True  # upload in strict mode
            }
            resp = client.post(self.url, data=data, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            # The records should be saved in row order
            qs = self.ds.record_queryset.order_by('pk')
            self.assertEqual(len(csv_data) - 1, qs.count())

            index = 0
            record = qs[index]
            expected_data = {
                'Column A': 'A1',
                'Column B': 'B1',
            }
            self.assertEqual(expected_data, record.data)
            # test that source_info contains the file name and the row number
            source_info = record.source_info
            self.assertIsNotNone(source_info)
            expected_info = {
                'file_name': file_name,
                'row': index + 2
            }
            self.assertEqual(source_info, expected_info)

            index = 1
            record = qs[index]
            expected_data = {
                'Column A': 'A2',
                'Column B': 'B2',
            }
            self.assertEqual(expected_data, record.data)
            # test that source_info contains the file name and the row number
            source_info = record.source_info
            self.assertIsNotNone(source_info)
            expected_info = {
                'file_name': file_name,
                'row': index + 2
            }
            self.assertEqual(source_info, expected_info)

            self.assertEqual(self.project_1.record_count, len(csv_data) - 1)
            self.assertEqual(self.ds.record_count, len(csv_data) - 1)

    def test_observation_with_lat_long_datum_xls(self):
        """
        Scenario: a file with columns Latitude, Longitude and Datum.
         Given that columns named Latitude, Longitude and Datum exist
         Then the dataset type should be inferred as Observation
         And latitude should be of type 'number', set as required and tagged with the biosys type latitude
         And longitude should be of type 'number', set as required and tagged with the biosys type longitude
         And datum should be of type 'string', set as not required and tagged with the biosys type datum
        """
        columns = ['What', 'Latitude', 'Longitude', 'Datum']
        rows = [
            columns, ['Observation1', -32, 117.75, 'WGS84'],
            ['Observation with lat/long as string', '-32', '115.75', None]
        ]
        client = self.custodian_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            # type observation
            self.assertEqual(Dataset.TYPE_OBSERVATION, received.get('type'))

            # verify fields attributes
            schema_descriptor = Package(
                received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            lat_field = schema.get_field_by_name('Latitude')
            self.assertEqual(lat_field.type, 'number')
            self.assertTrue(lat_field.required)
            biosys = lat_field.get('biosys')
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.LATITUDE_TYPE_NAME)

            lon_field = schema.get_field_by_name('Longitude')
            self.assertEqual(lon_field.type, 'number')
            self.assertTrue(lon_field.required)
            biosys = lon_field.get('biosys')
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.LONGITUDE_TYPE_NAME)

            # datum
            datum_field = schema.get_field_by_name('Datum')
            self.assertEqual(datum_field.type, 'string')
            self.assertFalse(datum_field.required)
            biosys = datum_field.get('biosys')
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.DATUM_TYPE_NAME)

            # test that we can save the dataset back.
            self.verify_inferred_data(received)

    def test_observation_with_lat_long_xls(self):
        """
        Scenario: a file with columns Latitude and Longitude.
         Given that columns named Latitude and Longitude exist
         Then they should be of type 'number'
         And they should be set as required
         And they should be tagged with the appropriate biosys tags
         And the dataset type should be observation
        """
        columns = ['What', 'Latitude', 'Longitude']
        rows = [
            columns, ['Observation1', -32, 117.75],
            ['Observation with lat/long as string', '-32', '115.75']
        ]
        client = self.custodian_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            # data_package verification
            self.assertIn('data_package', received)

            # verify fields attributes
            schema_descriptor = Package(
                received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            lat_field = schema.get_field_by_name('Latitude')
            lon_field = schema.get_field_by_name('Longitude')
            self.assertEqual(lat_field.type, 'number')
            self.assertEqual(lon_field.type, 'number')
            self.assertTrue(lat_field.required)
            self.assertTrue(lon_field.required)
            # biosys types
            self.assertTrue(BiosysSchema(
                lat_field.get(BiosysSchema.BIOSYS_KEY_NAME)).is_latitude())
            self.assertTrue(BiosysSchema(
                lon_field.get(BiosysSchema.BIOSYS_KEY_NAME)).is_longitude())

            self.assertEqual(Dataset.TYPE_OBSERVATION, received.get('type'))
            # test biosys validity
            self.verify_inferred_data(received)
    def test_site_no_date(self):
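        """
        A record with a Site column but no date column should still be saved:
        it gets the site and the site's geometry, and its datetime is None.
        """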
        csv_data = [['What', 'Site'], ['No Date', self.site.code]]
        file_ = helpers.rows_to_xlsx_file(csv_data)
        client = self.custodian_1_client
        with open(file_, 'rb') as fp:
            data = {
                'file': fp,
                'strict': True  # upload in strict mode
            }
            resp = client.post(self.url, data=data, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            records = self.dataset.record_queryset.all()
            self.assertEqual(len(records), 1)
            record = records[0]
            self.assertEqual(record.site, self.site)
            self.assertIsNone(record.datetime)
            self.assertEqual(record.geometry, self.site.geometry)
    def test_upload_xlsx_happy_path(self):
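        """
        Upload a valid xlsx of sites and verify that the sites are created
        with the expected codes, names, descriptions, geometry and attributes.
        """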
        csv_data = [
            [
                'Site Code', 'Site Name', 'Description', 'Latitude',
                'Longitude', 'Datum', 'Attribute1', 'Attribute2'
            ],
            ['C1', 'Site 1', 'Description1', -32, 116, '', 'attr11', 'attr12'],
            ['C2', 'Site 2', 'Description2', -31, 117, '', 'attr21', 'attr22'],
        ]
        xlsx_file = helpers.rows_to_xlsx_file(csv_data)
        project = self.project_1
        client = self.custodian_1_client
        url = reverse('api:upload-sites', kwargs={'pk': project.pk})
        self.assertEqual(0, Site.objects.filter(project=project).count())
        with open(xlsx_file, 'rb') as fp:
            data = {'file': fp}
            resp = client.post(url, data=data, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            qs = Site.objects.filter(project=project)
            self.assertEqual(len(csv_data) - 1, qs.count())
            self.assertEqual(['C1', 'C2'],
                             [s.code for s in qs.order_by('code')])
            self.assertEqual(['Site 1', 'Site 2'],
                             [s.name for s in qs.order_by('name')])
            self.assertEqual(
                ['Description1', 'Description2'],
                [s.description for s in qs.order_by('description')])

            # test geom and attr
            s = qs.filter(code='C1').first()
            self.assertEqual((116, -32), (s.geometry.x, s.geometry.y))
            expected_attributes = {
                'Latitude': '-32',
                'Longitude': '116',
                'Datum': '',
                'Attribute1': 'attr11',
                'Attribute2': 'attr12'
            }

            self.assertEqual(expected_attributes, s.attributes)

            self.assertEqual(project.site_count, len(csv_data) - 1)

    def test_generic_date_iso_xls(self):
        """
        Scenario: a date column with ISO strings 'yyyy-mm-dd'.
        Given that a column is provided with strings of the form 'yyyy-mm-dd'
        Then the column type should be 'date'
        And the format should be 'any'
        """
        columns = ['What', 'When']
        rows = [
            columns, ['Something', '2018-01-19'],
            ['Another thing',
             dt.date(2017, 12, 29).isoformat()],
            ['Another thing', '2017-08-01']
        ]
        client = self.data_engineer_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            # data_package verification
            self.assertIn('data_package', received)
            self.verify_inferred_data(received)

            # verify schema
            schema_descriptor = Package(
                received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            field = schema.get_field_by_name('What')
            self.assertEqual(field.type, 'string')
            self.assertFalse(field.required)
            self.assertEqual(field.format, 'default')

            field = schema.get_field_by_name('When')
            self.assertEqual(field.type, 'date')
            self.assertFalse(field.required)
            self.assertEqual(field.format, 'any')
    def test_infer_dataset_param(self):
        """
        Test that when the param infer_dataset_type is set to False the
        returned type is generic, even if the data would otherwise be inferred
        as a valid observation type.
        """
        columns = ['What', 'Latitude', 'Longitude']
        rows = [
            columns,
            ['Observation1', -32.0, 117.75],
            ['Observation with lat/long as string', '-32.0', '115.75']
        ]
        client = self.custodian_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            # no param: should infer the type
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            self.assertEqual(Dataset.TYPE_OBSERVATION, received.get('type'))

            # with param: return generic.
            fp.seek(0)
            payload = {
                'file': fp,
                'infer_dataset_type': False
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            self.assertEqual(Dataset.TYPE_GENERIC, received.get('type'))
            schema_descriptor = Package(received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            lat_field = schema.get_field_by_name('Latitude')
            lon_field = schema.get_field_by_name('Longitude')
            # no required constraints
            self.assertFalse(lat_field.required)
            self.assertFalse(lon_field.required)
    def test_easting_northing_geometry_extraction(self):
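        """
        Sites uploaded with Easting/Northing/Zone columns should get their
        geometry computed from the projected coordinates: GDA94 zone 50
        (405542.537, 6459127.469) maps back to roughly lon 116, lat -32.
        """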
        csv_data = [
            [
                'Code', 'Name', 'Description', 'Easting', 'Northing', 'Datum',
                'Zone', 'Attribute1', 'Attribute2'
            ],
            [
                'C1', 'Site 1', 'Description1', '405542.537', '6459127.469',
                'GDA94', '50', 'attr11', 'attr12'
            ],
        ]
        xlsx_file = helpers.rows_to_xlsx_file(csv_data)
        project = self.project_1
        client = self.custodian_1_client
        url = reverse('api:upload-sites', kwargs={'pk': project.pk})
        self.assertEqual(0, Site.objects.filter(project=project).count())
        with open(xlsx_file, 'rb') as fp:
            data = {'file': fp}
            resp = client.post(url, data=data, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            qs = Site.objects.filter(project=project)
            self.assertEqual(qs.count(), 1)
            site = qs.first()
            self.assertEqual(site.code, 'C1')
            self.assertEqual(site.name, 'Site 1')
            self.assertEqual(site.description, 'Description1')

            # test geom and attr
            self.assertAlmostEqual(site.geometry.x, 116, places=4)
            self.assertAlmostEqual(site.geometry.y, -32, places=4)
            expected_attributes = {
                'Easting': '405542.537',
                'Northing': '6459127.469',
                'Datum': 'GDA94',
                'Zone': '50',
                'Attribute1': 'attr11',
                'Attribute2': 'attr12'
            }

            self.assertEqual(expected_attributes, site.attributes)
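
    # A hedged sketch of the conversion the test above exercises, assuming
    # pyproj is available (GDA94 / MGA zone 50 is EPSG:28350; the application
    # itself may well use GDAL/GeoDjango instead):
    #
    #   from pyproj import Transformer
    #
    #   to_wgs84 = Transformer.from_crs('EPSG:28350', 'EPSG:4326',
    #                                   always_xy=True)
    #   lon, lat = to_wgs84.transform(405542.537, 6459127.469)
    #   # lon ~ 116.0, lat ~ -32.0, matching the assertAlmostEqual above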

    def test_observation_with_easting_northing_zone_xls(self):
        """
        Scenario: a file with columns Easting, Northing and Zone.
         Given that columns named Easting, Northing and Zone exist
         Then the dataset type should be inferred as Observation
         And the type of Easting and Northing should be 'number'
         And Easting and Northing should be set as required
         And they should be tagged with the appropriate biosys tags
         And Zone should be of type integer and required.
        """
        columns = ['What', 'Easting', 'Northing', 'Zone', 'Comments']
        rows = [
            columns, ['Something', 12563.233, 568932.345, 50, 'A dog'],
            [
                'Observation with easting/northing as string', '12563.233',
                '568932.345', 50, 'A dog'
            ]
        ]
        client = self.custodian_1_client
        file_ = helpers.rows_to_xlsx_file(rows)
        with open(file_, 'rb') as fp:
            payload = {
                'file': fp,
            }
            resp = client.post(self.url, data=payload, format='multipart')
            self.assertEqual(status.HTTP_200_OK, resp.status_code)
            received = resp.json()
            # should be an observation
            self.assertEqual(Dataset.TYPE_OBSERVATION, received.get('type'))
            # data_package verification
            self.assertIn('data_package', received)

            # verify fields attributes
            schema_descriptor = Package(
                received.get('data_package')).resources[0].descriptor['schema']
            schema = utils_data_package.GenericSchema(schema_descriptor)
            east_field = schema.get_field_by_name('Easting')
            self.assertIsNotNone(east_field)
            self.assertEqual(east_field.type, 'number')
            self.assertTrue(east_field.required)
            biosys = east_field.get('biosys')
            self.assertIsNotNone(biosys)
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.EASTING_TYPE_NAME)

            north_field = schema.get_field_by_name('Northing')
            self.assertIsNotNone(north_field)
            self.assertEqual(north_field.type, 'number')
            self.assertTrue(north_field.required)
            biosys = north_field.get('biosys')
            self.assertIsNotNone(biosys)
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.NORTHING_TYPE_NAME)

            zone_field = schema.get_field_by_name('Zone')
            self.assertIsNotNone(zone_field)
            self.assertEqual(zone_field.type, 'integer')
            self.assertTrue(zone_field.required)
            biosys = zone_field.get('biosys')
            self.assertIsNotNone(biosys)
            biosys_type = biosys.get('type')
            self.assertEqual(biosys_type, BiosysSchema.ZONE_TYPE_NAME)

            # test that we can save the dataset as returned
            self.verify_inferred_data(received)