def test_expand_empty(self):
    """Check that expanding a multivalued column skips empty lists.

    Record ``"0"`` has an empty ``SUBVAL`` list and is expected to be
    absent from the result. Record ``"1"`` is expected to yield one row
    per ``SUBVAL`` element, exposing the element's ``ID_SUB`` value, with
    the ``SUBVAL`` column itself no longer present.
    """
    # The fixtures are read with load_jsonl, so every record sits on its
    # own line; the explicit "\n" keeps the record delimiter visible and
    # safe from whitespace reflowing.
    data_test = (
        '{ "ID": "0", "SUBVAL": [] }\n'
        '{ "ID": "1", "SUBVAL": [ {"ID_SUB":"0"}, {"ID_SUB":"1"}, {"ID_SUB":"2"} ] }\n'
    )
    data_expected = (
        '{ "ID": "1", "ID_SUB": "0" }\n'
        '{ "ID": "1", "ID_SUB": "1" }\n'
        '{ "ID": "1", "ID_SUB": "2" }\n'
    )

    df = records.load_jsonl(
        inpt.from_str(data_test),
        [
            records.SchemaField("ID"),
            records.SchemaField("SUBVAL"),
        ],
    )
    # NOTE(review): the None entry presumably means "each element of the
    # list" and "ID_SUB" the key read from it -- confirm against
    # expand_multivalued.
    df = records.expand_multivalued(
        df,
        {
            "ID_SUB": ["SUBVAL", None, "ID_SUB"],
        },
    )

    df_expected = records.load_jsonl(
        inpt.from_str(data_expected),
        [
            records.SchemaField("ID"),
            records.SchemaField("ID_SUB"),
        ],
    )
    pandas.testing.assert_frame_equal(df_expected, df)
def test_basic_no_drop(self):
    """Check that ``drop_mv=False`` keeps the multivalued source column.

    Each of the two input records has three ``SUBVAL`` entries, so six
    rows are expected. Every row still carries the full ``SUBVAL`` list
    next to the single value exposed as ``ID_SUB``.
    """
    # The fixtures are read with load_jsonl, so every record sits on its
    # own line; the explicit "\n" keeps the record delimiter visible and
    # safe from whitespace reflowing.
    data_test = (
        '{ "ID": "0", "SUBVAL": [ "0", "1", "2" ] }\n'
        '{ "ID": "1", "SUBVAL": [ "0", "1", "2" ] }\n'
    )
    data_expected = (
        '{ "ID": "0", "SUBVAL": [ "0", "1", "2" ], "ID_SUB": "0" }\n'
        '{ "ID": "0", "SUBVAL": [ "0", "1", "2" ], "ID_SUB": "1" }\n'
        '{ "ID": "0", "SUBVAL": [ "0", "1", "2" ], "ID_SUB": "2" }\n'
        '{ "ID": "1", "SUBVAL": [ "0", "1", "2" ], "ID_SUB": "0" }\n'
        '{ "ID": "1", "SUBVAL": [ "0", "1", "2" ], "ID_SUB": "1" }\n'
        '{ "ID": "1", "SUBVAL": [ "0", "1", "2" ], "ID_SUB": "2" }\n'
    )

    df = records.load_jsonl(
        inpt.from_str(data_test),
        [
            records.SchemaField("ID"),
            records.SchemaField("SUBVAL"),
        ],
    )
    # NOTE(review): the None entry presumably means "each element of the
    # list" -- confirm against expand_multivalued.
    df = records.expand_multivalued(
        df,
        {
            "ID_SUB": ["SUBVAL", None],
        },
        drop_mv=False,
    )

    df_expected = records.load_jsonl(
        inpt.from_str(data_expected),
        [
            records.SchemaField("ID"),
            records.SchemaField("SUBVAL"),
            records.SchemaField("ID_SUB"),
        ],
    )
    pandas.testing.assert_frame_equal(df_expected, df)
def test_vtt_strict(self):
    """Check a translation table loaded with ``strict=True``.

    Column ``B`` starts as the strings ``"0"``..``"9"`` and is expected to
    become ``"10"``..``"1"`` after translation. Column ``A`` has no table
    attached and is expected to stay unchanged. The frame is compared
    after ``translate`` without using a return value, i.e. the update is
    expected to happen in place.
    """
    df = pandas.DataFrame({"A": range(10), "B": range(10)}, dtype=str)

    # One CSV row per line: a header followed by old/new value pairs.
    # The final row maps "10", which does not occur in column B.
    table_csv = "\n".join(
        [
            "old-val,new-val",
            "0,10",
            "1,9",
            "2,8",
            "3,7",
            "4,6",
            "5,5",
            "6,4",
            "7,3",
            "8,2",
            "9,1",
            "10,0",
        ]
    )

    vt = value_translator.ValueTranslator()
    vt.add_vtt(
        "B",
        value_translator.load_from_csv(inpt.from_str(table_csv), strict=True),
    )
    vt.translate(df)

    pandas.testing.assert_frame_equal(
        pandas.DataFrame(
            {
                "A": range(10),
                "B": range(10, 0, -1),
            },
            dtype=str,
        ),
        df,
    )
def test_load_flatten_error(self):
    """Check that loading fails when the flattened field holds two values.

    ``B_MV`` is declared with the ``records.flatten_mv`` transform and its
    nested ``B_MS`` list carries two entries; loading this record is
    expected to raise ``ValueError``.
    """
    inpt_str = '{"A":"0","B_MV":[{"B_MS":[{"B":"1"},{"B":"2"}]}],"C":"2"}'
    with self.assertRaises(ValueError):
        # The result is deliberately not bound: only the exception matters.
        records.load_jsonl(
            inpt.from_str(inpt_str),
            (
                records.SchemaField("A"),
                records.SchemaField("B_MV", transform=records.flatten_mv),
                records.SchemaField("C"),
            ),
        )
def test_load_jsonl_transform(self):
    """Check that a list of field transforms is applied in list order.

    The asserted value ``PREFIX-TEST-SUFFIX`` is what results from
    appending the suffix first, then prepending the prefix, and finally
    upper-casing the whole string.
    """
    source = inpt.from_str('{"A":"test"}')
    pipeline = [
        lambda v: f"{v}-suffix",
        lambda v: f"prefix-{v}",
        lambda v: v.upper(),
    ]
    schema = (records.SchemaField("A", transform=pipeline),)

    df = records.load_jsonl(source, schema)

    self.assertEqual(df.at[0, "A"], "PREFIX-TEST-SUFFIX")
def test_load_csv_basic(self):
    """Check that ``load_csv`` exposes the schema columns of a CSV row.

    The input has a fourth column ``D`` that is not part of the schema;
    only ``A``, ``B`` and ``C`` of the first row are verified.
    """
    source = inpt.from_str("A,B,C,D\n0,1,2,3")
    schema = [records.SchemaField(name) for name in ("A", "B", "C")]

    df = records.load_csv(source, schema)

    for column, expected in (("A", "0"), ("B", "1"), ("C", "2")):
        self.assertEqual(df.at[0, column], expected)
def test_load_flatten(self):
    """Check that ``flatten_mv`` reduces a singly nested value to a scalar.

    ``B_MV`` wraps exactly one ``B`` value in nested lists and objects;
    after loading, the cell is expected to hold that value, ``"1"``. The
    plain fields ``A`` and ``C`` are expected to load as given.
    """
    source = inpt.from_str('{"A":"0","B_MV":[{"B_MS":[{"B":"1"}]}],"C":"2"}')
    schema = (
        records.SchemaField("A"),
        records.SchemaField("B_MV", transform=records.flatten_mv),
        records.SchemaField("C"),
    )

    df = records.load_jsonl(source, schema)

    for column, expected in (("A", "0"), ("B_MV", "1"), ("C", "2")):
        self.assertEqual(df.at[0, column], expected)
def test_load_jsonl_basic(self):
    """Check that ``load_jsonl`` exposes each schema field of a record.

    A single JSON object with the keys ``A``, ``B`` and ``C`` is loaded
    and the first row is checked column by column.
    """
    source = inpt.from_str('{"A":"0","B":"1","C":"2"}')
    schema = [records.SchemaField(name) for name in ("A", "B", "C")]

    df = records.load_jsonl(source, schema)

    for column, expected in (("A", "0"), ("B", "1"), ("C", "2")):
        self.assertEqual(df.at[0, column], expected)
def test_from_str(self):
    """Check a translation table built from an in-memory CSV string.

    The table only covers the values ``"0"``..``"4"``. Those entries of
    column ``B`` are expected to be replaced (``"10"`` down to ``"6"``),
    while ``"5"``..``"9"`` and the whole of column ``A`` are expected to
    stay as they are.
    """
    # NOTE(review): another test_from_str exists in this file; if both
    # live in the same class the later definition shadows the earlier one
    # and it never runs -- confirm they belong to different test classes.
    df = pandas.DataFrame({"A": range(10), "B": range(10)}, dtype=str)

    # One CSV row per line: a header followed by old/new value pairs.
    table_csv = "\n".join(
        [
            "old-val,new-val",
            "0,10",
            "1,9",
            "2,8",
            "3,7",
            "4,6",
        ]
    )

    vt = value_translator.ValueTranslator()
    vt.add_vtt("B", value_translator.load_from_csv(inpt.from_str(table_csv)))
    vt.translate(df)

    pandas.testing.assert_frame_equal(
        pandas.DataFrame(
            {
                "A": range(10),
                "B": list(range(10, 5, -1)) + list(range(5, 10)),
            },
            dtype=str,
        ),
        df,
    )
def test_from_str(self):
    """Check that a string-backed input reads back its exact text."""
    text = "This is a test"
    source = inpt.from_str(text)
    with source.open() as handle:
        content = handle.read()
        self.assertEqual(content, text)