def test_build_numpy_value(self):
    """Check ``ListDataType.build_numpy_value`` for flat and nested lists.

    The flat cases are compared against NumPy structured values whose
    fields are named after the element positions ('0', '1', ...). The
    nested case is checked field by field.
    """
    # One float element.
    float_only = ListDataType(element_data_types=[FloatDataType()])
    built = float_only.build_numpy_value([1])
    wanted = np.array((1, ), [('0', '<f8')])
    self.assertTrue((built == wanted).all())

    # A float followed by a string.
    float_and_string = ListDataType(
        element_data_types=[FloatDataType(), StringDataType()])
    built = float_and_string.build_numpy_value([1, "tra"])
    wanted = np.array((1, "tra"), [('0', '<f8'), ('1', '<U128')])
    self.assertTrue((built == wanted).all())

    # A list whose third element is itself a list (array + string).
    nested = ListDataType(element_data_types=[
        FloatDataType(),
        StringDataType(),
        ListDataType(element_data_types=[
            ArrayDataType(element_data_type=FloatDataType()),
            StringDataType(),
        ]),
    ])
    source = [12.3, "first_string", [[1, 2, 3, 4], "second_string"]]
    built = nested.build_numpy_value(source)

    # Scalar fields of the outer record.
    for field, position in (('0', 0), ('1', 1)):
        self.assertEqual(built[0][field], source[position])

    # Fields of the inner record: the array needs an element-wise check.
    inner_source = source[2]
    self.assertTrue((built[0]['2'][0]['0'] == inner_source[0]).all())
    self.assertEqual(built[0]['2'][0]['1'], inner_source[1])
def test_build_python_value(self):
    """Check ``ListDataType.build_python_value`` for flat and nested lists."""
    # One float element: the integer input is expected back as a float.
    float_only = ListDataType(element_data_types=[FloatDataType()])
    built = float_only.build_python_value([1])
    self.assertTrue(built == [float(1)])

    # A float followed by a string.
    float_and_string = ListDataType(
        element_data_types=[FloatDataType(), StringDataType()])
    built = float_and_string.build_python_value([1, "tra"])
    self.assertTrue(built == [float(1), "tra"])

    # A list whose third element is itself a list (array + string).
    nested = ListDataType(element_data_types=[
        FloatDataType(),
        StringDataType(),
        ListDataType(element_data_types=[
            ArrayDataType(element_data_type=FloatDataType()),
            StringDataType(),
        ]),
    ])
    source = [12.3, "first_string", [[1, 2, 3, 4], "second_string"]]
    built = nested.build_python_value(source)

    # Top-level scalar elements.
    for position in (0, 1):
        self.assertEqual(built[position], source[position])

    # Elements of the inner list.
    inner_source = source[2]
    self.assertTrue((built[2][0] == inner_source[0]))
    self.assertEqual(built[2][1], inner_source[1])
def test_is_nullable(self):
    """``ListDataType.is_nullable`` must reflect the ``nullable`` argument."""
    not_nullable = ListDataType(
        element_data_types=[StringDataType()], nullable=False)
    self.assertFalse(not_nullable.is_nullable())

    nullable = ListDataType(
        element_data_types=[StringDataType()], nullable=True)
    self.assertTrue(nullable.is_nullable())
def sample_dict_for_test_schema_v1():
    """Return a sample nested dict and the ``TreeSchema`` expected for it.

    Returns:
        A ``(input_dict, expected_output)`` tuple.
    """
    input_dict = {
        'a': 23,
        'b': {
            'c': "sa",
            'd': [{"s": 1}, 12.3],
            'e': ["a", "b", "c"],
        },
    }

    # Nodes are created one by one, leaves first, in the same order a
    # single nested expression would evaluate them.
    leaf_a = ChildNode(name="a", data_type=FloatDataType())
    leaf_c = ChildNode(name="c", data_type=StringDataType())

    # 'd' is a heterogeneous list: a small tree followed by a float.
    d_0_fork = ForkNode(
        name="d_0",
        children=[ChildNode(name="s", data_type=FloatDataType())],
        level=4)
    d_elements = [TreeDataType(base_fork=d_0_fork), FloatDataType()]
    leaf_d = ChildNode(
        name="d",
        data_type=ListDataType(element_data_types=d_elements, level=3))

    leaf_e = ChildNode(
        name="e",
        data_type=ArrayDataType(element_data_type=StringDataType()))

    fork_b = ForkNode(name="b", children=[leaf_c, leaf_d, leaf_e], level=2)
    base_fork = ForkNode(name="base", children=[leaf_a, fork_b], level=1)
    expected_output = TreeSchema(base_fork_node=base_fork)

    return input_dict, expected_output
def test_transform_tree(self):
    """Check ``TreeRow.transform_tree`` on one valid and two invalid trees.

    Case 1 transforms a two-level dict with the 'numpy' method and
    compares the result with the expected converted values, including
    the 'nan' placeholder for the leaf that is absent from the input.
    Cases 2 and 3 feed data whose shape disagrees with the fork (a dict
    where a leaf is expected, and a scalar where a fork is expected) and
    expect ``RuntimeError``.
    """
    input_data_1 = {
        "l1-f": "120.9",
        "l1-s": 34,
        "l1-d": "2018-01-04",
        "f": {
            "l2-f": "-120.9",
            "l2-s": 'YES',
            "l2-a": ["2018-01-04"]
        }
    }
    output_data_1_exp = {
        "l1-f": 120.9,
        "l1-s": "34",
        "l1-d": np.datetime64("2018-01-04"),
        "f": {
            "l2-f": -120.9,
            "l2-s": 'YES',
            "l2-a": [np.datetime64("2018-01-04")],
            'l2-missing': 'nan'
        }
    }
    fork_1 = ForkNode('base', [
        ChildNode('l1-f', FloatDataType()),
        ChildNode('l1-s', StringDataType()),
        ChildNode('l1-d',
                  DateDataType(resolution='D', format_string="%Y-%m-%d")),
        ForkNode('f', [
            ChildNode('l2-f', FloatDataType()),
            ChildNode('l2-s', StringDataType()),
            ChildNode(
                'l2-a',
                ArrayDataType(
                    DateDataType(resolution='D',
                                 format_string="%Y-%m-%d"))),
            ChildNode('l2-missing', StringDataType())
        ])
    ])
    tr = TreeRow(input_data_1)
    self.assertEqual(tr.transform_tree(input_data_1, fork_1, 'numpy'),
                     output_data_1_exp)

    # Case 2: the input has a dict where the schema expects a leaf.
    input_data_2 = {'f': {'float': 20}}
    fork_2 = ForkNode('base', [ChildNode('f', FloatDataType())])
    # The row is built outside the assertRaises block so that only
    # transform_tree itself can satisfy the expectation.
    tr = TreeRow(input_data_2)
    with self.assertRaises(RuntimeError):
        tr.transform_tree(input_data_2, fork_2, 'numpy')

    # Case 3: the input has a scalar where the schema expects a fork.
    input_data_3 = {'f': 20}
    fork_3 = ForkNode(
        'base', [ForkNode('f', [ChildNode('float', FloatDataType())])])
    tr = TreeRow(input_data_3)
    with self.assertRaises(RuntimeError):
        tr.transform_tree(input_data_3, fork_3, 'numpy')
def test_eq(self):
    """``ArrayDataType`` equality must depend on the element data type."""
    # Same element type on both sides: equal.
    left = ArrayDataType(element_data_type=FloatDataType())
    right = ArrayDataType(element_data_type=FloatDataType())
    self.assertEqual(left, right)

    # Different element types: not equal.
    string_array = ArrayDataType(element_data_type=StringDataType())
    float_array = ArrayDataType(element_data_type=FloatDataType())
    self.assertNotEqual(string_array, float_array)
def test_get_numpy_type(self):
    """``ListDataType.get_numpy_type`` is ``np.ndarray`` for each element type tried."""
    for element_cls in (FloatDataType, StringDataType, DateDataType):
        dtp = ListDataType(element_data_types=[element_cls()])
        self.assertEqual(dtp.get_numpy_type(), np.ndarray)
def test_get_python_type(self):
    """``ListDataType.get_python_type`` is ``list`` for each element type tried."""
    for element_cls in (FloatDataType, StringDataType, DateDataType):
        dtp = ListDataType(element_data_types=[element_cls()])
        self.assertEqual(dtp.get_python_type(), list)
def test_get_python_type(self):
    """``ArrayDataType.get_python_type`` is ``list`` for each element type tried."""
    for element_cls in (FloatDataType, StringDataType, DateDataType):
        dtp = ArrayDataType(element_data_type=element_cls())
        self.assertEqual(dtp.get_python_type(), list)
def test_get_numpy_type(self):
    """``ArrayDataType.get_numpy_type`` is ``np.ndarray`` for each element type tried."""
    for element_cls in (FloatDataType, StringDataType, DateDataType):
        dtp = ArrayDataType(element_data_type=element_cls())
        self.assertEqual(dtp.get_numpy_type(), np.ndarray)
def test__get_numpy_dtypes(self):
    """Check the ``(field_name, dtype)`` pairs from ``ListDataType._get_numpy_dtypes``.

    Field names are the element positions as strings. An array element
    is expected to be reported as ``np.ndarray``.
    """
    # Float only.
    dtp = ListDataType(element_data_types=[FloatDataType()])
    expected = [('0', '<f8')]
    self.assertEqual(dtp._get_numpy_dtypes(), expected)

    # Float + array of strings.
    dtp = ListDataType(element_data_types=[
        FloatDataType(),
        ArrayDataType(element_data_type=StringDataType()),
    ])
    expected = [('0', '<f8'), ('1', np.ndarray)]
    self.assertEqual(dtp._get_numpy_dtypes(), expected)

    # Float + array of strings + month-resolution date.
    dtp = ListDataType(element_data_types=[
        FloatDataType(),
        ArrayDataType(element_data_type=StringDataType()),
        DateDataType(resolution='M'),
    ])
    expected = [('0', '<f8'), ('1', np.ndarray), ('2', '<M8[M]')]
    self.assertEqual(dtp._get_numpy_dtypes(), expected)
def test__transform_child_value(self):
    """Check ``TreeRow._transform_child_value`` for float, string, date and None.

    Each of the first three cases converts one value with the 'numpy'
    and 'python' methods and expects ``ValueError`` for the unknown
    method 'no'. The last case checks the value produced for ``None``
    by each leaf type under both methods.
    """
    transform = TreeRow._transform_child_value

    # Case 1: numeric string -> float.
    float_raw = '120.28'
    float_leaf = ChildNode('case1', FloatDataType())
    for method in ('numpy', 'python'):
        self.assertEqual(float(float_raw),
                         transform(float_raw, float_leaf, method))
    with self.assertRaises(ValueError):
        transform(float_raw, float_leaf, 'no')

    # Case 2: integer -> string.
    string_raw = 40
    string_leaf = ChildNode('case2', StringDataType())
    for method in ('numpy', 'python'):
        self.assertEqual(str(string_raw),
                         transform(string_raw, string_leaf, method))
    with self.assertRaises(ValueError):
        transform(string_raw, string_leaf, 'no')

    # Case 3: date string -> datetime64 (numpy) / datetime (python).
    date_raw = '2018-01-04'
    date_leaf = ChildNode(
        'case3', DateDataType(resolution='D', format_string="%Y-%m-%d"))
    self.assertEqual(np.datetime64(date_raw),
                     transform(date_raw, date_leaf, 'numpy'))
    self.assertEqual(datetime.strptime(date_raw, "%Y-%m-%d"),
                     transform(date_raw, date_leaf, 'python'))
    with self.assertRaises(ValueError):
        transform(date_raw, date_leaf, 'no')

    # Case 4: None is replaced by a per-type, per-method missing value.
    missing = None
    self.assertTrue(np.isnan(transform(missing, float_leaf, 'numpy')))
    self.assertTrue(transform(missing, float_leaf, 'python') is None)
    self.assertEqual(transform(missing, string_leaf, 'numpy'), 'nan')
    self.assertEqual(transform(missing, string_leaf, 'python'), 'None')
    self.assertTrue(np.isnat(transform(missing, date_leaf, 'numpy')))
    self.assertEqual(transform(missing, date_leaf, 'python'), '')
def test_build_python_value(self):
    """Check ``ArrayDataType.build_python_value`` for flat and nested arrays.

    Results are compared element-wise against NumPy arrays of the
    matching dtype.
    """
    # Flat arrays: float elements, then string elements.
    for element_cls, dtype_code in ((FloatDataType, '<f8'),
                                    (StringDataType, '<U200')):
        dtp = ArrayDataType(element_data_type=element_cls())
        built = dtp.build_python_value([1, 2, 3])
        wanted = np.array([1, 2, 3], dtype_code)
        self.assertTrue((built == wanted).all())

    # Array of arrays of strings: each inner array is checked separately,
    # converting a fresh copy of the input every time.
    dtp = ArrayDataType(element_data_type=ArrayDataType(
        element_data_type=StringDataType()))
    for position, inner_items in enumerate((["tra", "check"], ["what"])):
        built = dtp.build_python_value([["tra", "check"], ["what"]])
        wanted = np.array(inner_items, '<U200')
        self.assertTrue((built[position] == wanted).all())
def test_eq(self):
    """Two ``ListDataType`` objects built from the same spec must be equal."""

    def single():
        # One float element.
        return ListDataType(element_data_types=[FloatDataType()])

    def pair():
        # Float followed by string.
        return ListDataType(
            element_data_types=[FloatDataType(), StringDataType()])

    def nested():
        # Float, string, and an inner list (float array + string).
        return ListDataType(element_data_types=[
            FloatDataType(),
            StringDataType(),
            ListDataType(element_data_types=[
                ArrayDataType(element_data_type=FloatDataType()),
                StringDataType(),
            ]),
        ])

    # Each spec is instantiated twice so that two distinct objects are
    # compared.
    for build in (single, pair, nested):
        self.assertEqual(build(), build())
def get_data_types():
    """Return one sample instance of each data type used by the tests.

    Returns:
        A 9-tuple, in this order: a generic ``DataType`` with
        ``numpy_dtype='<i8'``, a string type, a float type, a
        day-resolution date, a second-resolution date, a float array, a
        string array, a (float, string, date) list and a
        (string, string, date) list.
    """
    return (
        DataType(numpy_dtype='<i8',
                 python_dtype=int,
                 numpy_na_value=np.nan,
                 python_na_value=None),
        StringDataType(),
        FloatDataType(),
        DateDataType(resolution='D'),
        DateDataType(resolution='s'),
        ArrayDataType(element_data_type=FloatDataType()),
        ArrayDataType(element_data_type=StringDataType()),
        ListDataType(element_data_types=[
            FloatDataType(), StringDataType(), DateDataType(),
        ]),
        ListDataType(element_data_types=[
            StringDataType(), StringDataType(), DateDataType(),
        ]),
    )
def test__assert_transformation_possible(self):
    """Check which key lists ``TreeRow._assert_transformation_possible`` rejects.

    The fork has a leaf named 'c2' at the top level and another 'c2'
    inside the 'f1' sub-fork. Every key list containing 'c2' is expected
    to raise ``RuntimeError``; the lists without it must pass silently.
    """
    schema_fork = ForkNode('base', [
        ChildNode('c1', StringDataType()),
        ChildNode('c2', FloatDataType()),
        ForkNode('f1', [ChildNode('c2', DateDataType())]),
    ])
    check = TreeRow._assert_transformation_possible

    rejected = (['c2'], ['c1', 'c2'], ['f1', 'c1', 'c2'])
    for keys in rejected:
        with self.assertRaises(RuntimeError):
            check(keys, schema_fork)

    accepted = (['c1'], ['c1', 'f1'])
    for keys in accepted:
        check(keys, schema_fork)
def test_get_numpy_type(self):
    """``StringDataType.get_numpy_type`` must equal ``np.dtype('<U128')``."""
    actual = StringDataType().get_numpy_type()
    expected = np.dtype('<U128')
    self.assertEqual(actual, expected)
def test_get_python_type(self):
    """``StringDataType.get_python_type`` must be the built-in ``str``."""
    self.assertEqual(StringDataType().get_python_type(), str)
def test_eq(self):
    """Two default-constructed ``StringDataType`` objects must be equal."""
    first, second = StringDataType(), StringDataType()
    self.assertEqual(first, second)
def test_is_nullable(self):
    """``StringDataType.is_nullable`` must reflect the ``nullable`` argument."""
    self.assertFalse(StringDataType(nullable=False).is_nullable())
    self.assertTrue(StringDataType(nullable=True).is_nullable())
def test_build_numpy_value(self):
    """``StringDataType.build_numpy_value`` must return string inputs unchanged."""
    dtp = StringDataType()
    for text in ("1234567890123", "123", "tra2"):
        self.assertEqual(dtp.build_numpy_value(text), text)
def base_dict_json_same_schema_types():
    """Return a nested dict that maps field names to data type objects.

    Two of the top-level entries ('level1-fork' and 'level1-fork2') are
    nested dicts, and the deepest level holds an array of trees and a
    list of ten trees.
    """

    def float_string_list(level):
        # Five references to one float type followed by five references
        # to one string type (list repetition shares the instance).
        return ListDataType(
            [FloatDataType()] * 5 + [StringDataType()] * 5, level=level)

    def list_tree(index, first_leaf_name, first_leaf_cls):
        # One tree element of 'level3-list_tree'; only the first leaf
        # differs between the two halves of the list.
        return TreeDataType(
            base_fork=ForkNode(
                name="level3-list_tree_{}".format(index),
                children=[
                    ChildNode(name=first_leaf_name,
                              data_type=first_leaf_cls()),
                    ChildNode(name="level3-list-string",
                              data_type=StringDataType()),
                ],
                level=5,
            )
        )

    schema_types = {
        "level1-string": StringDataType(),
        "level1-float": FloatDataType(),
        "level1-date": StringDataType(),
        "level1-array_float": ArrayDataType(FloatDataType()),
        "level1-array_string": ArrayDataType(StringDataType()),
        "level1-list_float_string": float_string_list(2),
        "level1-fork": {
            "level2-string": StringDataType(),
            "level2-float": FloatDataType(),
            "level2-date": StringDataType(),
            "level2-array_float": ArrayDataType(FloatDataType()),
            "level2-array_string": ArrayDataType(StringDataType()),
            "level2-list_float_string": float_string_list(3),
        },
        "level1-fork2": {
            "level2-float": FloatDataType(),
            "level2-fork": {
                "level3-float": FloatDataType(),
                "level3-array_tree": ArrayDataType(
                    TreeDataType(
                        base_fork=ForkNode(
                            name="level3-array_tree",
                            children=[
                                ChildNode(name="level3-array-float",
                                          data_type=FloatDataType()),
                                ChildNode(name="level3-array-string",
                                          data_type=StringDataType()),
                            ],
                            level=5,
                        )
                    )
                ),
                # Trees 0-4 start with a float leaf, trees 5-9 with a
                # 'date' leaf that is typed as a string.
                "level3-list_tree": ListDataType(
                    [list_tree(x, "level3-list-float", FloatDataType)
                     for x in range(5)]
                    + [list_tree(x, "level3-list-date", StringDataType)
                       for x in range(5, 10)],
                    level=4,
                ),
            },
        },
    }
    return schema_types
def test_is_nullable(self):
    """``ArrayDataType.is_nullable`` must reflect the ``nullable`` argument."""
    for flag, check in ((False, self.assertFalse), (True, self.assertTrue)):
        dtp = ArrayDataType(element_data_type=StringDataType(),
                            nullable=flag)
        check(dtp.is_nullable())
def test_apply_schema(self):
    """Check ``TreeRow.apply_schema`` on one valid and two invalid schemas.

    Each case builds a row from raw data, verifies that the schema
    present after ``build_row`` differs from the target schema, sets the
    target schema and applies it with the 'numpy' method. Case 1 expects
    the converted row; cases 2 and 3 use a schema whose shape disagrees
    with the data and expect ``RuntimeError``.
    """
    # Case 1
    input_data_1 = {
        "l1-f": "120.9",
        "l1-s": 34,
        "l1-d": "2018-01-04",
        "f": {
            "l2-f": "-120.9",
            "l2-s": 'YES',
            "l2-a": ["2018-01-04"]
        }
    }
    output_data_1_exp = {
        "l1-f": 120.9,
        "l1-s": "34.0",
        "l1-d": np.datetime64("2018-01-04"),
        "f": {
            "l2-f": -120.9,
            "l2-s": 'YES',
            "l2-a": [np.datetime64("2018-01-04")],
            'l2-missing': 'nan'
        }
    }
    fork_1 = ForkNode('base', [
        ChildNode('l1-f', FloatDataType()),
        ChildNode('l1-s', StringDataType()),
        ChildNode('l1-d',
                  DateDataType(resolution='D', format_string="%Y-%m-%d")),
        ForkNode('f', [
            ChildNode('l2-f', FloatDataType()),
            ChildNode('l2-s', StringDataType()),
            ChildNode(
                'l2-a',
                ArrayDataType(
                    DateDataType(resolution='D',
                                 format_string="%Y-%m-%d"))),
            ChildNode('l2-missing', StringDataType())
        ])
    ])
    tr_1 = TreeRow(input_data_1)
    schema_1 = TreeSchema(base_fork_node=fork_1)
    # unittest assertion instead of a bare ``assert`` so the check still
    # runs when Python is started with -O.
    self.assertIsNone(tr_1.row)
    tr_1 = tr_1.build_row(input_data_1, 'numpy')
    # Before the target schema is applied neither the row nor the schema
    # matches the expectation.
    self.assertNotEqual(tr_1.row, output_data_1_exp)
    self.assertNotEqual(tr_1.get_schema(), schema_1)
    tr_1 = tr_1.set_schema(schema_1)
    tr_1 = tr_1.apply_schema('numpy')
    self.assertEqual(tr_1.row, output_data_1_exp)

    # Case 2: the data has a dict where the schema expects a leaf.
    input_data_2 = {'f': {'float': 20}}
    fork_2 = ForkNode('base', [ChildNode('f', FloatDataType())])
    tr_2 = TreeRow(input_data_2)
    schema_2 = TreeSchema(base_fork_node=fork_2)
    self.assertIsNone(tr_2.row)
    tr_2 = tr_2.build_row(input_data_2, 'numpy')
    self.assertNotEqual(tr_2.get_schema(), schema_2)
    tr_2 = tr_2.set_schema(schema_2)
    with self.assertRaises(RuntimeError):
        tr_2.apply_schema('numpy')

    # Case 3: the data has a scalar where the schema expects a fork.
    input_data_3 = {'f': 20}
    fork_3 = ForkNode(
        'base', [ForkNode('f', [ChildNode('float', FloatDataType())])])
    tr_3 = TreeRow(input_data_3)
    schema_3 = TreeSchema(base_fork_node=fork_3)
    self.assertIsNone(tr_3.row)
    tr_3 = tr_3.build_row(input_data_3, 'numpy')
    self.assertNotEqual(tr_3.get_schema(), schema_3)
    tr_3 = tr_3.set_schema(schema_3)
    with self.assertRaises(RuntimeError):
        tr_3.apply_schema('numpy')
def test_build_python_value(self):
    """``StringDataType.build_python_value`` must return the string form of its input."""
    dtp = StringDataType()
    # NOTE(review): the integer case appears twice with identical input
    # and expectation; possibly a copy-paste where a different input was
    # intended - confirm before changing.
    cases = ((10, "10"), (10, "10"), ("tra2", "tra2"))
    for raw, expected in cases:
        self.assertEqual(dtp.build_python_value(raw), expected)