def test_flattened_dataframe_format_with_no_payload(self):
    """Check flattened_dataframe column splits when payload_keys hit or miss."""
    json_path = os.path.join(test_root_path, "data", "json/simple.json")

    def load(payload_key):
        # Load the same file with flattened_dataframe and the given payload key.
        return Data(
            json_path,
            options={
                "data_format": "flattened_dataframe",
                "payload_keys": payload_key,
            },
        )

    # Payload key present: columns split between data and metadata.
    with_payload = load("data")
    self.assertEqual(3, len(with_payload.data_and_metadata.columns))
    self.assertEqual(2, len(with_payload.data.columns))
    self.assertEqual(1, len(with_payload.metadata.columns))

    # Payload key absent: everything lands in data, metadata is None.
    without_payload = load("no_data_key_test")
    self.assertEqual(3, len(without_payload.data_and_metadata.columns))
    self.assertEqual(3, len(without_payload.data.columns))
    with self.assertWarnsRegex(UserWarning, "No metadata was detected."):
        self.assertIsNone(without_payload.metadata)
def test_json_from_string(self):
    """
    Determine if the json file can be loaded with manual data_type setting
    """
    # Strings that should parse cleanly as structured JSON.
    valid_cases = (
        "[]",
        "{}",
        "[1, 2, 3]",
        '[{"a": 1}, {"a": 2}, {"a": 3}, {"a": 1, "b":2}]',
    )
    for raw in valid_cases:
        # in memory data must specify the data_type category
        loaded = Data(data=raw, data_type="json")
        self.assertEqual(loaded.data_type, "json")

    structured_error = (
        "Only JSON which represents structured data is "
        "supported for this data type (i.e. list-dicts)."
    )
    invalid_cases = [
        ("[1,[1]]", structured_error),
        ('[{"a": 1}, 2, [3]]', structured_error),
        ("{", "No JSON data could be read from these data."),
    ]
    for raw, expected_error in invalid_cases:
        # in memory data must specify the data_type category
        with self.assertRaises(ValueError) as assert_raised:
            Data(data=raw, data_type="json").data
        self.assertEqual(str(assert_raised.exception), expected_error)
def test_json_from_string(self):
    """
    Determine if the json file can be loaded with manual data_type setting
    """
    structured_msg = ('Only JSON which represents structured data is '
                      'supported for this data type (i.e. list-dicts).')
    for good_json in ('[]', '{}', '[1, 2, 3]',
                      '[{"a": 1}, {"a": 2}, {"a": 3}, {"a": 1, "b":2}]'):
        # in memory data must specify the data_type category
        self.assertEqual(
            Data(data=good_json, data_type='json').data_type, 'json')

    bad_json = {
        '[1,[1]]': structured_msg,
        '[{"a": 1}, 2, [3]]': structured_msg,
        '{': 'No JSON data could be read from these data.',
    }
    for value, message in bad_json.items():
        # in memory data must specify the data_type category
        with self.assertRaises(ValueError) as caught:
            Data(data=value, data_type='json').data
        self.assertEqual(str(caught.exception), message)
def test_read_url_header_overflow(self, mock_request_get):
    """A Content-length header past 1GB must raise; a small one must not."""
    chunk_size = 8192  # assumed chunk size
    one_gb = 1024 ** 3  # 1GB

    # Alias for the mocked response object that requests.get(...) yields.
    mocked_response = mock_request_get.return_value.__enter__.return_value

    # Valid: header reports a small download, so no overflow error raises.
    header_length = 5000
    mocked_response.headers = {'Content-length': header_length}
    mocked_response.iter_content.return_value = (
        [b'test'] * (int(header_length) // chunk_size))
    try:
        # stub URL; the mock above replaces the content requests.get will see
        Data('https://test.com')
    except ValueError:
        self.fail("URL string unexpected overflow error.")

    # Invalid: header reports one byte past the 1GB cap.
    mocked_response.headers = {'Content-length': one_gb + 1}
    with self.assertRaisesRegex(
            ValueError,
            'The downloaded file from the url may not be larger than 1GB'):
        # stub URL; mock_request_get replaces the content requests.get sees
        Data('https://test.com')
def test_read_url_content_overflow(self, mock_request_get):
    """Streamed content crossing 1GB must raise; just under must not."""
    chunk_size = 8192  # assumed chunk size
    one_gb = 1024 ** 3  # 1GB

    # Alias for the mocked response object that requests.get(...) yields.
    mocked_response = mock_request_get.return_value.__enter__.return_value

    # Just under the cap: no overflow error expected.
    chunks_under_cap = int(one_gb) // chunk_size
    mocked_response.iter_content.return_value = [b"test"] * chunks_under_cap
    try:
        # stub URL; the mock above replaces the content requests.get will see
        Data("https://test.com")
    except ValueError:
        self.fail("URL string unexpected overflow error.")

    # One extra chunk pushes the streamed size past the cap.
    mocked_response.iter_content.return_value = (
        [b"test"] * (chunks_under_cap + 1))
    with self.assertRaisesRegex(
        ValueError, "The downloaded file from the url may not be larger than 1GB"
    ):
        # stub URL; mock_request_get replaces the content requests.get sees
        Data("https://test.com")
def test_reload_data(self):
    """
    Determine if the parquet file can be reloaded
    """
    for entry in self.input_file_names:
        parquet_obj = Data(entry['path'])
        # Reloading the same path must keep the detected type.
        parquet_obj.reload(entry['path'])
        self.assertEqual(parquet_obj.data_type, 'parquet')
def test_reload_data(self):
    """
    Determine if the json file can be reloaded
    """
    for entry in self.input_file_names:
        json_obj = Data(entry["path"])
        # Reloading the same path must keep the detected type.
        json_obj.reload(entry["path"])
        self.assertEqual(json_obj.data_type, 'json')
def test_reload_data(self):
    """
    Determine if the csv file can be reloaded
    """
    for entry in self.input_file_names:
        csv_obj = Data(entry['path'])
        # Reloading must keep both the detected type and delimiter.
        csv_obj.reload(entry['path'])
        self.assertEqual(csv_obj.data_type, 'csv')
        self.assertEqual(csv_obj.delimiter, entry['delimiter'])
def test_reload_data(self):
    """
    Determine if the avro file can be reloaded
    """
    for entry in self.file_or_buf_list:
        avro_obj = Data(entry["path"])
        # Reloading must keep the detected type and the tracked path.
        avro_obj.reload(entry["path"])
        self.assertEqual(avro_obj.data_type, "avro")
        self.assertEqual(entry["path"], avro_obj.input_file_path)
def test_reload_data(self):
    """
    Determine if the text file can be reloaded
    """
    for entry in self.file_or_buf_list:
        text_obj = Data(entry['path'])
        # Reloading must keep the detected type and the tracked path.
        text_obj.reload(entry['path'])
        self.assertEqual(text_obj.data_type, 'text', entry['path'])
        self.assertEqual(entry['path'], text_obj.input_file_path)
def test_list_of_dictionaries_in_flattened_dataframe_format(self):
    """Iris JSON (a list of dicts) flattens to 6 columns by 150 rows."""
    iris_path = os.path.join(test_root_path, 'data', 'json/iris-utf-8.json')
    iris = Data(iris_path)
    self.assertEqual(6, len(iris.data.columns))
    self.assertEqual(150, len(iris.data))
def test_allowed_data_formats(self):
    """
    Determine if the csv file data_formats can be used
    """
    for entry in self.file_or_buf_list:
        data_obj = Data(entry['path'])
        # Cycle through every registered format and spot-check the payload.
        for fmt in list(data_obj._data_formats.keys()):
            data_obj.data_format = fmt
            self.assertEqual(fmt, data_obj.data_format)
            payload = data_obj.data
            if fmt == "dataframe":
                import pandas as pd

                self.assertIsInstance(payload, pd.DataFrame)
            elif fmt in ["records", "json"]:
                self.assertIsInstance(payload, list)
                self.assertIsInstance(payload[0], str)
def test_specifying_data_type(self):
    """
    Determine if the avro file can be loaded with manual data_type setting
    """
    for entry in self.input_file_names:
        avro_obj = Data(entry['path'], data_type='avro')
        self.assertEqual('avro', avro_obj.data_type)
def test_avro_file_identification(self):
    """
    Determine if the avro file can be automatically identified
    """
    for entry in self.input_file_names:
        # No data_type supplied: the factory must detect avro on its own.
        self.assertEqual(Data(entry['path']).data_type, 'avro')
def test_auto_file_identification(self):
    """
    Determine if the parquet file can be automatically identified
    """
    for entry in self.file_or_buf_list:
        # No data_type supplied: the factory must detect parquet on its own.
        self.assertEqual(Data(entry["path"]).data_type, "parquet")
def test_specifying_data_type(self):
    """
    Determine if the parquet file can be loaded with manual data_type setting
    """
    for entry in self.file_or_buf_list:
        parquet_obj = Data(entry["path"], data_type="parquet")
        self.assertEqual("parquet", parquet_obj.data_type)
def test_json_file_identification(self):
    """
    Determine if the json file can be automatically identified
    """
    for entry in self.input_file_names:
        # No data_type supplied: the factory must detect json on its own.
        self.assertEqual(Data(entry["path"]).data_type, 'json')
def test_factory_load(self):
    """
    Determine whether factory class Data identifies file correctly
    """
    for entry in self.file_or_buf_list:
        # The factory must hand back a GraphData instance for these inputs.
        self.assertIsInstance(Data(entry["path"]), GraphData)
def test_data_formats(self):
    """
    Test the data format options.
    """
    for entry in self.input_file_names:
        data_obj = Data(entry['path'])
        # Default format yields a DataFrame; "records" yields a list.
        self.assertIsInstance(data_obj.data, pd.DataFrame)
        data_obj.data_format = "records"
        self.assertIsInstance(data_obj.data, list)
        # An unknown format must be rejected with a descriptive error.
        with self.assertRaises(ValueError) as exc:
            data_obj.data_format = "NON_EXISTENT"
        self.assertEqual(
            "The data format must be one of the following: "
            "['dataframe', 'records']",
            str(exc.exception),
        )
def test_auto_file_identification(self):
    """
    Determine if the text file can be automatically identified
    """
    for entry in self.file_or_buf_list:
        text_obj = Data(entry['path'])
        # Include the path in the failure message to pinpoint the bad file.
        self.assertEqual('text', text_obj.data_type, entry['path'])
def test_specifying_data_type(self):
    """
    Determine if the csv file can be loaded with manual data_type setting
    """
    for entry in self.input_file_names:
        csv_obj = Data(entry["path"], data_type='csv')
        self.assertEqual(csv_obj.data_type, 'csv')
        self.assertEqual(csv_obj.delimiter, entry['delimiter'])
def test_read_url_verify_ssl(self, mock_request_get):
    """An SSL failure from requests must surface as a RuntimeError that
    tells the user about the 'verify_url' escape hatch.

    Fix: the original passed the expected text via ``msg=`` to
    ``assertRaises`` — that keyword only customizes the failure report and
    never checks the raised exception's message, so the message assertion
    was silently a no-op. Capture the exception and assert on its text.
    """
    mock_request_get.side_effect = requests.exceptions.SSLError()
    with self.assertRaises(RuntimeError) as assert_raised:
        Data('https://test.com')
    # Check stable fragments rather than the full sentence so minor
    # wording tweaks upstream don't break the test.
    error_text = str(assert_raised.exception)
    self.assertIn("untrusted", error_text)
    self.assertIn("verify_url", error_text)
def test_flattened_dataframe_format_with_dual_payload(self):
    """When two candidate payloads exist, the larger one must be flattened."""
    json_path = os.path.join(test_root_path, 'data', 'json/dual_payloads.json')
    dual = Data(json_path, options={"data_format": "flattened_dataframe"})
    # Make sure the larger payload is selected
    self.assertIn("payload.bigger_list_of_things.id", dual.data.columns)
    self.assertEqual(2, len(dual.data.columns))
def test_len_data(self):
    """
    Validate that length called on JSONData is appropriately
    determining the length value.
    """
    for entry in self.file_or_buf_list:
        json_obj = Data(entry["path"])
        # len() and the .length property must agree with the fixture count.
        self.assertEqual(entry["count"], len(json_obj), msg=entry["path"])
        self.assertEqual(entry["count"], json_obj.length, msg=entry["path"])
def test_auto_file_identification(self):
    """
    Determine if the csv file can be automatically identified
    """
    for entry in self.input_file_names:
        csv_obj = Data(entry['path'])
        # Type, delimiter, and column count must all match the fixture.
        self.assertEqual('csv', csv_obj.data_type)
        self.assertEqual(entry['delimiter'], csv_obj.delimiter)
        self.assertEqual(entry['num_columns'], len(csv_obj.data.columns))
def test_key_separator_in_flattened_dataframe_format(self):
    """A custom key_separator must be used when building flattened names."""
    json_path = os.path.join(test_root_path, 'data', 'json/simple.json')
    simple = Data(json_path, options={"key_separator": "~~~"})
    self.assertListEqual(
        [
            "data~~~list_of_things~~~id",
            "data~~~list_of_things~~~tags",
        ],
        list(simple.data.columns),
    )
def test_flattened_dataframe_format(self):
    """Spot-check data and metadata cells of a flattened JSON payload."""
    math_path = os.path.join(test_root_path, 'data', 'json/math.json')
    math = Data(math_path)
    largest_col = "meta.view.columns.cachedContents.largest"
    # Metadata column is present and holds the expected value at row 9.
    self.assertIn(largest_col, math.data_and_metadata.columns)
    self.assertEqual("102188", math.metadata[largest_col][9])
    # Data column is present and holds the expected value at row 167.
    self.assertIn("data.22", math.data.columns)
    self.assertEqual("77.9", math.data["data.22"][167])
def test_auto_file_identification(self):
    """
    Determine if the csv file can be automatically identified
    """
    for entry in self.file_or_buf_list:
        csv_obj = Data(entry['path'])
        try:
            self.assertEqual(csv_obj.delimiter, entry['delimiter'],
                             entry['path'])
            self.assertEqual(len(csv_obj.data.columns),
                             entry['num_columns'], entry['path'])
        except AttributeError as e:
            # Re-raise with the offending file name for easier debugging.
            raise AttributeError(
                repr(e) + ': ' + entry['path'].split("/")[-1])
def test_complex_nested_json_in_flattened_dataframe_format(self):
    """Deeply nested JSON flattens into the expected data/metadata columns.

    Fixes: the local variable was named ``complex``, shadowing the builtin
    (renamed to ``nested``); an exact-duplicate assertion on
    ``meta.creator`` was removed.
    """
    test_dir = os.path.join(test_root_path, 'data')
    input_file_name = os.path.join(test_dir, 'json/complex_nested.json')
    nested = Data(input_file_name)
    self.assertEqual(8, len(nested.data.columns))
    self.assertEqual("Depression",
                     nested.data["payload.Lion.medical_condition"][0])
    self.assertEqual(11, len(nested.data_and_metadata.columns))
    self.assertEqual("Frodo", nested.data_and_metadata["meta.creator"][0])
    self.assertEqual(3, len(nested.metadata.columns))
def test_len_data(self):
    """
    Validate that length called on ParquetData is appropriately
    determining the length value.
    """
    for entry in self.input_file_names:
        parquet_obj = Data(entry["path"])
        # len() and the .length property must agree with the fixture count.
        self.assertEqual(entry['count'], len(parquet_obj), msg=entry['path'])
        self.assertEqual(entry['count'], parquet_obj.length,
                         msg=entry['path'])