Esempio n. 1
0
    def test_flattened_dataframe_format_with_no_payload(self):
        """Flattened-dataframe format with and without a matching payload key."""
        data_path = os.path.join(
            os.path.join(test_root_path, "data"), "json/simple.json")

        # Payload key present: columns split between data and metadata.
        loaded = Data(
            data_path,
            options={"data_format": "flattened_dataframe",
                     "payload_keys": "data"},
        )
        self.assertEqual(3, len(loaded.data_and_metadata.columns))
        self.assertEqual(2, len(loaded.data.columns))
        self.assertEqual(1, len(loaded.metadata.columns))

        # Payload key absent: everything lands in data; metadata is None
        # and a warning is emitted on access.
        loaded = Data(
            data_path,
            options={"data_format": "flattened_dataframe",
                     "payload_keys": "no_data_key_test"},
        )
        self.assertEqual(3, len(loaded.data_and_metadata.columns))
        self.assertEqual(3, len(loaded.data.columns))
        with self.assertWarnsRegex(UserWarning, "No metadata was detected."):
            self.assertIsNone(loaded.metadata)
Esempio n. 2
0
    def test_json_from_string(self):
        """
        Determine if the json file can be loaded with manual data_type setting
        """
        valid_payloads = (
            "[]",
            "{}",
            "[1, 2, 3]",
            '[{"a": 1}, {"a": 2}, {"a": 3}, {"a": 1, "b":2}]',
        )
        structured_error = ("Only JSON which represents structured data is "
                            "supported for this data type (i.e. list-dicts).")
        invalid_payloads = (
            ("[1,[1]]", structured_error),
            ('[{"a": 1}, 2, [3]]', structured_error),
            ("{", "No JSON data could be read from these data."),
        )

        # in memory data must specify the data_type category
        for payload in valid_payloads:
            self.assertEqual(Data(data=payload, data_type="json").data_type,
                             "json")

        for payload, expected_error in invalid_payloads:
            # in memory data must specify the data_type category
            with self.assertRaises(ValueError) as assert_raised:
                Data(data=payload, data_type="json").data
            self.assertEqual(str(assert_raised.exception), expected_error)
Esempio n. 3
0
    def test_json_from_string(self):
        """
        Determine if the json file can be loaded with manual data_type setting
        """
        loadable = [
            '[]',
            '{}',
            '[1, 2, 3]',
            '[{"a": 1}, {"a": 2}, {"a": 3}, {"a": 1, "b":2}]',
        ]
        not_loadable = [
            {'value': '[1,[1]]',
             'error': 'Only JSON which represents structured data is '
                      'supported for this data type (i.e. list-dicts).'},
            {'value': '[{"a": 1}, 2, [3]]',
             'error': 'Only JSON which represents structured data is '
                      'supported for this data type (i.e. list-dicts).'},
            {'value': '{',
             'error': 'No JSON data could be read from these data.'},
        ]

        # in memory data must specify the data_type category
        for raw in loadable:
            self.assertEqual(Data(data=raw, data_type='json').data_type,
                             'json')

        for case in not_loadable:
            # in memory data must specify the data_type category
            with self.assertRaises(ValueError) as assert_raised:
                Data(data=case['value'], data_type='json').data
            self.assertEqual(str(assert_raised.exception), case['error'])
Esempio n. 4
0
    def test_read_url_header_overflow(self, mock_request_get):
        """Content-length header validation: small sizes load, > 1GB raises.

        ``mock_request_get`` presumably replaces ``requests.get`` via a
        ``@mock.patch`` decorator outside this view -- confirm.  The mocked
        response is used as a context manager, hence the ``__enter__`` chain.
        """
        # assumed chunk size
        c_size = 8192
        max_allows_file_size = 1024 ** 3 # 1GB

        # set valid content length size
        content_length = 5000
        mock_request_get.return_value.__enter__\
            .return_value.headers = {'Content-length':content_length}

        try:
            # mock the iter_content to return just under 1GB so no error raises
            # (5000 // 8192 == 0, so zero chunks are streamed; only the
            # header value is exercised in this first pass)
            mock_request_get.return_value.__enter__.return_value.iter_content.\
                return_value = [b'test'] * (int(content_length) // c_size)

            # stub URL, the line above replaces the content requests.get will see
            data_obj = Data('https://test.com')

        except ValueError:
            self.fail("URL string unexpected overflow error.")

        # make content length an invalid size
        content_length = max_allows_file_size + 1
        mock_request_get.return_value.__enter__\
            .return_value.headers = {'Content-length':content_length}

        with self.assertRaisesRegex(ValueError, \
            'The downloaded file from the url may not be larger than 1GB'):
            # stub URL, mock_request_get replaces the content requests.get will see
            data_obj = Data('https://test.com')
Esempio n. 5
0
    def test_read_url_content_overflow(self, mock_request_get):
        """Streamed-content size validation: just under 1GB loads, over raises.

        ``mock_request_get`` presumably replaces ``requests.get`` via a
        ``@mock.patch`` decorator outside this view -- confirm.  Unlike the
        header-overflow test, here the chunk count itself crosses the limit.
        """
        # assumed chunk size
        c_size = 8192
        max_allows_file_size = 1024**3  # 1GB

        try:
            # mock the iter_content to return just under 1GB so no error raises
            mock_request_get.return_value.__enter__.return_value.iter_content.return_value = [
                b"test"
            ] * (
                int(max_allows_file_size) // c_size
            )

            # stub URL, the line above replaces the content requests.get will see
            data_obj = Data("https://test.com")

        except ValueError:
            self.fail("URL string unexpected overflow error.")

        # mock the iter_content to return up to 1GB + so error raises
        # (one extra chunk pushes the accumulated size past the limit)
        mock_request_get.return_value.__enter__.return_value.iter_content.return_value = [
            b"test"
        ] * (
            int(max_allows_file_size) // c_size + 1
        )

        with self.assertRaisesRegex(
            ValueError, "The downloaded file from the url may not be larger than 1GB"
        ):

            # stub URL, mock_request_get  replaces the content requests.get will see
            data_obj = Data("https://test.com")
 def test_reload_data(self):
     """
     Determine if the parquet file can be reloaded
     """
     for fixture in self.input_file_names:
         reloaded = Data(fixture["path"])
         reloaded.reload(fixture["path"])
         self.assertEqual(reloaded.data_type, "parquet")
Esempio n. 7
0
 def test_reload_data(self):
     """
     Determine if the json file can be reloaded
     """
     for fixture in self.input_file_names:
         reloaded = Data(fixture['path'])
         reloaded.reload(fixture['path'])
         self.assertEqual(reloaded.data_type, 'json')
Esempio n. 8
0
 def test_reload_data(self):
     """
     Determine if the csv file can be reloaded
     """
     for fixture in self.input_file_names:
         reloaded = Data(fixture["path"])
         reloaded.reload(fixture["path"])
         self.assertEqual(reloaded.data_type, "csv")
         self.assertEqual(reloaded.delimiter, fixture["delimiter"])
Esempio n. 9
0
 def test_reload_data(self):
     """
     Determine if the avro file can be reloaded
     """
     for fixture in self.file_or_buf_list:
         reloaded = Data(fixture['path'])
         reloaded.reload(fixture['path'])
         self.assertEqual(reloaded.data_type, 'avro')
         self.assertEqual(fixture['path'], reloaded.input_file_path)
Esempio n. 10
0
 def test_reload_data(self):
     """
     Determine if the text file can be reloaded
     """
     for fixture in self.file_or_buf_list:
         reloaded = Data(fixture["path"])
         reloaded.reload(fixture["path"])
         self.assertEqual(reloaded.data_type, "text", fixture["path"])
         self.assertEqual(fixture["path"], reloaded.input_file_path)
Esempio n. 11
0
    def test_list_of_dictionaries_in_flattened_dataframe_format(self):
        """A list-of-dicts JSON file flattens into the expected shape."""
        iris_path = os.path.join(test_root_path, 'data', 'json/iris-utf-8.json')

        flattened = Data(iris_path)
        self.assertEqual(6, len(flattened.data.columns))
        self.assertEqual(150, len(flattened.data))
Esempio n. 12
0
 def test_allowed_data_formats(self):
     """
     Determine if the csv file data_formats can be used
     """
     for fixture in self.file_or_buf_list:
         loaded = Data(fixture["path"])
         for fmt in list(loaded._data_formats.keys()):
             loaded.data_format = fmt
             self.assertEqual(loaded.data_format, fmt)
             payload = loaded.data
             if fmt == "dataframe":
                 import pandas as pd
                 self.assertIsInstance(payload, pd.DataFrame)
             elif fmt in ("records", "json"):
                 self.assertIsInstance(payload, list)
                 self.assertIsInstance(payload[0], str)
Esempio n. 13
0
 def test_specifying_data_type(self):
     """
     Determine if the avro file can be loaded with manual data_type setting
     """
     for fixture in self.input_file_names:
         loaded = Data(fixture["path"], data_type="avro")
         self.assertEqual(loaded.data_type, "avro")
Esempio n. 14
0
 def test_avro_file_identification(self):
     """
     Determine if the avro file can be automatically identified
     """
     for fixture in self.input_file_names:
         self.assertEqual(Data(fixture['path']).data_type, 'avro')
Esempio n. 15
0
 def test_auto_file_identification(self):
     """
     Determine if the parquet file can be automatically identified
     """
     for fixture in self.file_or_buf_list:
         self.assertEqual(Data(fixture["path"]).data_type, "parquet")
Esempio n. 16
0
 def test_specifying_data_type(self):
     """
     Determine if the parquet file can be loaded with manual data_type setting
     """
     for fixture in self.file_or_buf_list:
         loaded = Data(fixture["path"], data_type="parquet")
         self.assertEqual(loaded.data_type, "parquet")
Esempio n. 17
0
 def test_json_file_identification(self):
     """
     Determine if the json file can be automatically identified
     """
     for fixture in self.input_file_names:
         self.assertEqual(Data(fixture["path"]).data_type, 'json')
 def test_factory_load(self):
     """
     Determine whether factory class Data identifies file correctly
     """
     for fixture in self.file_or_buf_list:
         self.assertIsInstance(Data(fixture["path"]), GraphData)
Esempio n. 19
0
    def test_data_formats(self):
        """
        Test the data format options.
        """
        for fixture in self.input_file_names:
            loaded = Data(fixture["path"])
            # default format yields a DataFrame
            self.assertIsInstance(loaded.data, pd.DataFrame)

            # switching to records yields a list
            loaded.data_format = "records"
            self.assertIsInstance(loaded.data, list)

            # an unknown format is rejected with a descriptive error
            with self.assertRaises(ValueError) as exc:
                loaded.data_format = "NON_EXISTENT"
            self.assertEqual(
                str(exc.exception),
                "The data format must be one of the following: "
                "['dataframe', 'records']")
Esempio n. 20
0
 def test_auto_file_identification(self):
     """
     Determine if the text file can be automatically identified
     """
     for fixture in self.file_or_buf_list:
         loaded = Data(fixture["path"])
         self.assertEqual("text", loaded.data_type, fixture["path"])
Esempio n. 21
0
 def test_specifying_data_type(self):
     """
     Determine if the csv file can be loaded with manual data_type setting
     """
     for fixture in self.input_file_names:
         loaded = Data(fixture["path"], data_type='csv')
         self.assertEqual(loaded.data_type, 'csv')
         self.assertEqual(loaded.delimiter, fixture['delimiter'])
Esempio n. 22
0
    def test_read_url_verify_ssl(self, mock_request_get):
        """An SSL failure on download must surface as a RuntimeError.

        Bug fix: the original used ``assertRaises(..., msg=...)``, but
        ``msg`` is only the message displayed when the assertion FAILS --
        the exception's own text was never verified.  ``assertRaisesRegex``
        actually checks it, matching the style of the other URL tests.
        ``mock_request_get`` presumably patches ``requests.get`` via a
        decorator outside this view -- confirm.
        """
        # Simulate an untrusted certificate on the GET request.
        mock_request_get.side_effect = requests.exceptions.SSLError()

        # Plain substring (no regex metacharacters) of the advisory message.
        with self.assertRaisesRegex(
                RuntimeError,
                "The URL given has an untrusted SSL certificate"):
            data_obj = Data('https://test.com')
Esempio n. 23
0
    def test_flattened_dataframe_format_with_dual_payload(self):
        """With two candidate payloads, the larger one is selected."""
        fixture_path = os.path.join(
            test_root_path, 'data', 'json/dual_payloads.json')

        dual_payload = Data(
            fixture_path, options={"data_format": "flattened_dataframe"})
        # Make sure the larger payload is selected
        self.assertIn("payload.bigger_list_of_things.id",
                      dual_payload.data.columns)
        self.assertEqual(2, len(dual_payload.data.columns))
Esempio n. 24
0
    def test_len_data(self):
        """
        Validate that length called on JSONData is appropriately determining the
        length value.
        """
        for fixture in self.file_or_buf_list:
            loaded = Data(fixture["path"])
            expected = fixture["count"]
            self.assertEqual(expected, len(loaded), msg=fixture["path"])
            self.assertEqual(expected, loaded.length, msg=fixture["path"])
Esempio n. 25
0
 def test_auto_file_identification(self):
     """
     Determine if the csv file can be automatically identified
     """
     for fixture in self.input_file_names:
         loaded = Data(fixture["path"])
         self.assertEqual(loaded.data_type, "csv")
         self.assertEqual(loaded.delimiter, fixture["delimiter"])
         self.assertEqual(
             len(loaded.data.columns), fixture["num_columns"])
Esempio n. 26
0
    def test_key_separator_in_flattened_dataframe_format(self):
        """A custom key_separator is honored when flattening nested keys."""
        fixture_path = os.path.join(test_root_path, 'data', 'json/simple.json')

        flattened = Data(fixture_path, options={"key_separator": "~~~"})
        self.assertListEqual(
            ["data~~~list_of_things~~~id", "data~~~list_of_things~~~tags"],
            list(flattened.data.columns))
Esempio n. 27
0
    def test_flattened_dataframe_format(self):
        """Spot-check flattened columns and values from the math fixture."""
        fixture_path = os.path.join(test_root_path, 'data', 'json/math.json')

        loaded = Data(fixture_path)
        self.assertIn("meta.view.columns.cachedContents.largest",
                      loaded.data_and_metadata.columns)
        self.assertEqual(
            loaded.metadata["meta.view.columns.cachedContents.largest"][9],
            "102188")
        self.assertIn("data.22", loaded.data.columns)
        self.assertEqual(loaded.data["data.22"][167], "77.9")
Esempio n. 28
0
 def test_auto_file_identification(self):
     """
     Determine if the csv file can be automatically identified
     """
     for fixture in self.file_or_buf_list:
         loaded = Data(fixture['path'])
         try:
             self.assertEqual(loaded.delimiter,
                              fixture['delimiter'], fixture['path'])
             self.assertEqual(len(loaded.data.columns),
                              fixture['num_columns'], fixture['path'])
         except AttributeError as err:
             # Re-raise with the failing file's basename for easier triage.
             raise AttributeError(
                 f"{err!r}: " + fixture['path'].split("/")[-1])
Esempio n. 29
0
    def test_complex_nested_json_in_flattened_dataframe_format(self):
        """Flattened-dataframe handling of a deeply nested JSON fixture.

        Fix: the loaded object was named ``complex``, shadowing the builtin
        ``complex`` type for the rest of the method; renamed to ``nested``.
        """
        test_dir = os.path.join(test_root_path, 'data')
        input_file_name = os.path.join(test_dir, 'json/complex_nested.json')

        nested = Data(input_file_name)
        self.assertEqual(8, len(nested.data.columns))
        self.assertEqual("Depression",
                         nested.data["payload.Lion.medical_condition"][0])

        self.assertEqual(11, len(nested.data_and_metadata.columns))
        self.assertEqual("Frodo", nested.data_and_metadata["meta.creator"][0])

        self.assertEqual(3, len(nested.metadata.columns))
        # NOTE(review): this repeats the data_and_metadata assertion above;
        # it may have been intended as nested.metadata["meta.creator"][0] --
        # confirm before tightening.
        self.assertEqual("Frodo", nested.data_and_metadata["meta.creator"][0])
Esempio n. 30
0
    def test_len_data(self):
        """
        Validate that length called on ParquetData is appropriately determining
        the length value.
        """
        for fixture in self.input_file_names:
            loaded = Data(fixture["path"])
            expected = fixture["count"]
            self.assertEqual(expected, len(loaded), msg=fixture["path"])
            self.assertEqual(expected, loaded.length, msg=fixture["path"])