def test_actuals_are_loaded(api_w_actuals):
    """Compare the ``student_classes`` target data with the expected rows.

    Only the columns named in the expected table are compared. The trailing
    ``# ...`` markers inside the table are kept on the rows they follow in
    the original text (presumably scenario labels that ``markdown_to_df``
    ignores -- confirm against the helper).
    """
    api = api_w_actuals
    expected = markdown_to_df(
        """
        | card_id | name | school_name | class_name | season |
        | - | - | - | - | - |
        | stu1 | Buffy | Sunnydale | Applied Stabby | Fall 2001 | # BasicDenormalization
        | stu2 | Willow | Sunnydale | Good Spells | Spring 2002 |
        | stu3 | Bill | San Dimas | Station | Fall 2002 |
        | stu4 | Ted | San Dimas | Being Excellent | Fall 2002 |
        | stu1 | Buffy | Sunnydale | Applied Stabby | Summer 2002 | # MissingClasses
        | stu2 | Willow | Sunnydale | Good Spells | Summer 2002 |
        | stu1 | Buffy | Sunnydale | Applied Stabby | Summer 2002 |
        | stu2 | Willow | Sunnydale | Good Spells | Summer 2002 | # MultipleClasses
        | stu2 | Willow | Sunnydale | Season 6 Spells | Summer 2002 |
        | stu3 | Bill | San Dimas | Station | Summer 2002 |
        | stu4 | Ted | San Dimas | Being Excellent | Summer 2002 |
        | stu4 | Ted | San Dimas | Station | Summer 2002 |
        | stu1 | Buffy | Sunnydale | Applied Stabby | Fall 2001 | # IdConcatenation
        | stu2 | Willow | Sunnydale | Good Spells | Spring 2002 |
        | stu3 | Bill | San Dimas | Station | Fall 2002 |
        | stu4 | Ted | San Dimas | Being Excellent | Fall 2002 |
        """
    )
    # Restrict the actual frame to the expected columns before comparing.
    actual = api.spec["targets"]["student_classes"].data[expected.columns]
    assert_frame_equal(actual, expected)
def test_overriding_defaults(identifiers, cases):
    """A column supplied in the stacked table wins over the source default.

    The source declares ``last_name`` with default ``"Jones"``; the stacked
    rows carry ``"Not Jones"`` and that is what the source data must hold.
    """
    source = Source(
        defaults={"last_name": "Jones"},
        id_mapping={
            "id": {
                "identifier": identifiers["student"],
                "attribute": "id"
            }
        },
    )
    source.stack(
        cases[0],
        markdown_to_df("""
            | id | first_name | last_name |
            | - | - | - |
            | s1 | Bob | Not Jones |
            | s2 | Nancy | Not Jones |
            """),
    )

    actual = source.data
    # Named ids in the input are expected to come back as generated ids.
    expected = markdown_to_df("""
        | id | first_name | last_name |
        | - | - | - |
        | {s1} | Bob | Not Jones |
        | {s2} | Nancy | Not Jones |
        """.format(
        s1=identifiers["student"].generate(case=cases[0], named_id="s1")["id"],
        s2=identifiers["student"].generate(case=cases[0], named_id="s2")["id"],
    ))
    assert_frame_equal(actual, expected)
def test_setting_values(identifiers, cases):
    """``values`` passed to ``stack`` add a constant column to the data.

    The stacked table has no ``last_name`` column; ``values`` supplies
    ``"Summers"`` and every expected row carries it.
    """
    source = Source(id_mapping={
        "id": {
            "identifier": identifiers["student"],
            "attribute": "id"
        }
    })
    source.stack(
        cases[0],
        markdown_to_df("""
            | id | first_name |
            | - | - |
            | s1 | Bob |
            | s2 | Nancy |
            """),
        values={"last_name": "Summers"},
    )

    actual = source.data
    expected = markdown_to_df("""
        | id | first_name | last_name |
        | - | - | - |
        | {s1} | Bob | Summers |
        | {s2} | Nancy | Summers |
        """.format(
        s1=identifiers["student"].generate(case=cases[0], named_id="s1")["id"],
        s2=identifiers["student"].generate(case=cases[0], named_id="s2")["id"],
    ))
    assert_frame_equal(actual, expected)
def test_factories_stack_a_source(identifiers, sources):
    """Generating a factory puts its ``students`` table into the source.

    After ``factory.generate("TestCase")`` the ``students`` source data,
    restricted to the expected columns, must hold the factory rows with the
    named ids replaced by the ids generated for case ``"TestCase"``.
    """
    factory = Factory(
        data={
            "students": {
                "table": """
                    | id | first_name |
                    | - | - |
                    | s1 | Buffy |
                    | s2 | Willow |
                    """
            }
        },
        sources=sources,
    )
    factory.generate("TestCase")

    expected = markdown_to_df("""
        | id | first_name |
        | - | - |
        | {s1} | Buffy |
        | {s2} | Willow |
        """.format(
        s1=identifiers["student"].generate(case="TestCase", named_id="s1")["id"],
        s2=identifiers["student"].generate(case="TestCase", named_id="s2")["id"],
    ))
    # The source may carry more columns than the factory table specifies.
    actual = sources["students"].data[expected.columns]
    assert_frame_equal(actual, expected)
def test_multiple_identifers_are_translated(source_w_multiple_ids, identifiers, cases):
    """Every identifier column of a source is translated when stacking.

    The ``id`` and ``uuid`` columns are expected to resolve to the ``id``
    and ``uuid`` attributes of the student identifier, and
    ``organization_id`` to the ``id`` attribute of the organization
    identifier, all for ``cases[0]``.
    """
    source_w_multiple_ids.stack(
        cases[0],
        markdown_to_df("""
            | id | uuid | organization_id |first_name |
            | - | - | - | - |
            | s1 | s1 | o1 | Bob |
            | s2 | s2 | o1 | Nancy |
            """),
    )

    actual = source_w_multiple_ids.data
    expected = markdown_to_df("""
        | id | uuid | organization_id | first_name |
        | - | - | - | - |
        | {s1} | {su1} | {o1} | Bob |
        | {s2} | {su2} | {o1} | Nancy |
        """.format(
        s1=identifiers["student"].generate(case=cases[0], named_id="s1")["id"],
        s2=identifiers["student"].generate(case=cases[0], named_id="s2")["id"],
        su1=identifiers["student"].generate(case=cases[0], named_id="s1")["uuid"],
        su2=identifiers["student"].generate(case=cases[0], named_id="s2")["uuid"],
        o1=identifiers["organization"].generate(case=cases[0], named_id="o1")["id"],
    ))
    assert_frame_equal(actual, expected)
def test_multiple_embedded_identifiers_are_translated(identifiers, cases):
    """Identifier references embedded in a cell value are each translated.

    The ``prefixed_id`` cells contain two ``{identifier.attribute[name]}``
    references joined by ``-``; the expected value joins the generated
    organization and student ids the same way.
    """
    source = Source(
        id_mapping={
            "id": {
                "identifier": identifiers["student"],
                "attribute": "id"
            }
        },
        identifiers=identifiers,
    )
    # This table is NOT passed through str.format: the braces are part of
    # the cell text handed to the source.
    source.stack(
        cases[0],
        markdown_to_df("""
            | id | prefixed_id | first_name |
            | - | - | - |
            | s1 | {organization.id[o1]}-{student.id[s1]} | Bob |
            | s2 | {organization.id[o1]}-{student.id[s2]} | Nancy |
            """),
    )

    actual = source.data
    expected = markdown_to_df("""
        | id | prefixed_id | first_name |
        | - | - | - |
        | {s1} | {o1}-{s1} | Bob |
        | {s2} | {o1}-{s2} | Nancy |
        """.format(
        s1=identifiers["student"].generate(case=cases[0], named_id="s1")["id"],
        s2=identifiers["student"].generate(case=cases[0], named_id="s2")["id"],
        o1=identifiers["organization"].generate(case=cases[0], named_id="o1")["id"],
    ))
    assert_frame_equal(actual, expected)
def test_inheritance_w_new_data(sources):
    """A factory's own table replaces the table inherited for that source.

    The composite factory inherits ``students`` from the base factory but
    also declares its own ``students`` table; its dataframe must equal the
    composite's table.
    """
    base_factory = Factory(
        data={
            "students": {
                "table": """
                    | id | first_name |
                    | - | - |
                    | s1 | Buffy |
                    | s2 | Willow |
                    """
            }
        },
        sources=sources,
    )

    modified_table = """
        | id | first_name | last_name |
        | - | - | - |
        | s1 | Buffy | Summers |
        | s2 | Xander | Harris |
        """

    composite_factory = Factory(
        data={"students": {
            "table": deepcopy(modified_table)
        }},
        inherit_from=[base_factory],
        sources=sources,
    )

    expected = markdown_to_df(modified_table)
    actual = composite_factory.data["students"]["dataframe"]
    assert_frame_equal(actual, expected)
def test_null_identifiers_go_to_the_right_case(multiple_identifier_target, stu, cases):
    """
    If an identifying column can be null, then there is no way to associate
    it with a case unless there is another non-null identifying column.

    Here the second record has a null ``uuid`` but a non-null ``id``, and it
    must still show up in the data for ``cases[1]``.
    """
    multiple_identifier_target.load_actual([
        {
            "id": stu["c1stu1"]["id"],
            "uuid": stu["c1stu1"]["uuid"],
            "first_name": "Buffy",
        },
        {
            "id": stu["c2stu2"]["id"],
            "uuid": None,
            "first_name": "Willow"
        },
    ])

    actual = multiple_identifier_target.case_data(cases[1])
    # "{NULL}" is literal cell text here (no str.format call); presumably it
    # is the null marker understood by markdown_to_df -- confirm in helper.
    expected = markdown_to_df("""
        | id | uuid | first_name |
        | - | - | - |
        | stu2 | {NULL} | Willow |
        """)
    assert_frame_equal(actual, expected)
def test_empty_data_can_be_loaded_with_columns_specified(simple_target):
    """Loading no records with explicit columns yields an empty table.

    The expected frame has the ``id`` and ``first_name`` columns and no
    rows; the ``__dtspec_case__`` column is dropped before comparing.
    """
    simple_target.load_actual([], columns=["id", "first_name"])

    actual = simple_target.data.drop(columns="__dtspec_case__")
    expected = markdown_to_df("""
        | id | first_name |
        | - | - |
        """)
    assert_frame_equal(actual, expected)
def test_scenario_case_factories_can_override(identifiers, sources, student_factory,
                                              organization_factory):
    """A case factory can override inherited data for one source only.

    The ``StudentOrg`` case inherits from both fixture factories and
    declares its own ``students`` table. After generation the ``students``
    source must hold that table, and the ``organizations`` source must hold
    the two organizations listed in the second expected table.
    """
    scenario = Scenario(
        cases={
            "StudentOrg":
            Case(factory=Factory(
                sources=sources,
                inherit_from=[student_factory, organization_factory],
                data={
                    "students": {
                        "table": """
                            | id | organization_id | first_name |
                            | - | - | - |
                            | s1 | o1 | Bill |
                            | s2 | o1 | Ted |
                            """
                    }
                },
            ))
        })
    scenario.generate()

    # Students: rows come from the case factory's own table.
    expected = markdown_to_df("""
        | id | organization_id | first_name |
        | - | - | - |
        | {s1} | {o1} | Bill |
        | {s2} | {o1} | Ted |
        """.format(
        s1=identifiers["student"].generate(case=scenario.cases["StudentOrg"],
                                           named_id="s1")["id"],
        s2=identifiers["student"].generate(case=scenario.cases["StudentOrg"],
                                           named_id="s2")["id"],
        o1=identifiers["organization"].generate(
            case=scenario.cases["StudentOrg"], named_id="o1")["id"],
    ))
    actual = sources["students"].data
    assert_frame_equal(actual, expected)

    # Organizations: not declared by the case factory itself.
    expected = markdown_to_df("""
        | id | name |
        | - | - |
        | {o1} | San Dimas High |
        | {o2} | Alaska Military Academy |
        """.format(
        o1=identifiers["organization"].generate(
            case=scenario.cases["StudentOrg"], named_id="o1")["id"],
        o2=identifiers["organization"].generate(
            case=scenario.cases["StudentOrg"], named_id="o2")["id"],
    ))
    actual = sources["organizations"].data
    assert_frame_equal(actual, expected)
def test_target_can_be_split_into_case(simple_target, simple_data, cases):
    """``case_data`` returns the rows of the loaded data for one case.

    After loading ``simple_data``, the data for ``cases[1]`` must be the two
    rows in the expected table.
    """
    simple_target.load_actual(simple_data)

    actual = simple_target.case_data(cases[1])
    expected = markdown_to_df("""
        | id | first_name |
        | - | - |
        | stu1 | Faith |
        | stu2 | Willow |
        """)
    assert_frame_equal(actual, expected)
def test_factories_stack_sources(identifiers, sources):
    """One factory can stack data onto several sources at once.

    The factory declares ``students`` and ``organizations`` tables. After
    generation both sources are checked, each with one column dropped that
    the expected table does not list (``external_id`` and ``uuid``).
    """
    factory = Factory(
        data={
            "students": {
                "table": """
                    | id | organization_id | first_name |
                    | - | - | - |
                    | s1 | o1 | Buffy |
                    | s2 | o1 | Willow |
                    """
            },
            "organizations": {
                "table": """
                    | id | name |
                    | - | - |
                    | o1 | Sunnydale High |
                    """
            },
        },
        sources=sources,
    )
    factory.generate("TestCase")

    expected_students = markdown_to_df("""
        | id | organization_id | first_name |
        | - | - | - |
        | {s1} | {o1} | Buffy |
        | {s2} | {o1} | Willow |
        """.format(
        s1=identifiers["student"].generate(case="TestCase", named_id="s1")["id"],
        s2=identifiers["student"].generate(case="TestCase", named_id="s2")["id"],
        o1=identifiers["organization"].generate(case="TestCase", named_id="o1")["id"],
    ))
    actual_students = sources["students"].data.drop(columns=["external_id"])

    expected_organizations = markdown_to_df("""
        | id | name |
        | - | - |
        | {o1} | Sunnydale High |
        """.format(o1=identifiers["organization"].generate(
        case="TestCase", named_id="o1")["id"]))
    actual_organizations = sources["organizations"].data.drop(columns=["uuid"])

    assert_frame_equal(actual_students, expected_students)
    assert_frame_equal(actual_organizations, expected_organizations)
def test_actual_data_is_loaded_ids_translated(simple_target, simple_data):
    """Loaded actual data is exposed with named ids in the ``id`` column.

    After loading ``simple_data`` and dropping ``__dtspec_case__``, the
    target data must equal the four expected rows (``stu1``/``stu2``
    appearing twice -- presumably once per case; confirm against the
    ``simple_data`` fixture).
    """
    simple_target.load_actual(simple_data)

    actual = simple_target.data.drop(columns=["__dtspec_case__"])
    expected = markdown_to_df("""
        | id | first_name |
        | - | - |
        | stu1 | Buffy |
        | stu2 | Willow |
        | stu1 | Faith |
        | stu2 | Willow |
        """)
    assert_frame_equal(actual, expected)
def test_source_without_identifier_generates_data(cases):
    """A source built with no arguments returns stacked data unchanged.

    The same markdown table is used for the stacked input and for the
    expected output.
    """
    table = """
        | date | season |
        | - | - |
        | 2001-09-08 | Fall 2001 |
        | 2002-01-09 | Spring 2002 |
        """

    source = Source()
    source.stack(cases[0], markdown_to_df(table))

    actual = source.data
    expected = markdown_to_df(table)
    assert_frame_equal(actual, expected)
def test_inheritance_w_multiple_composite_sources(sources):
    """A composite factory can override one source and add another.

    The base factory declares ``students`` only. The composite declares a
    different ``students`` table plus a new ``organizations`` table; both
    of its dataframes must equal the composite's own tables.
    """
    base_factory = Factory(
        data={
            "students": {
                "table": """
                    | id | first_name |
                    | - | - |
                    | s1 | Buffy |
                    | s2 | Willow |
                    """
            }
        },
        sources=sources,
    )

    modified_students_table = """
        | id | first_name | last_name |
        | - | - | - |
        | s1 | Buffy | Summers |
        | s2 | Xander | Harris |
        """

    new_organizations_table = """
        | id | name |
        | - | - |
        | o1 | Sunnydale High |
        """

    composite_factory = Factory(
        data={
            "students": {
                "table": deepcopy(modified_students_table)
            },
            "organizations": {
                "table": deepcopy(new_organizations_table)
            },
        },
        inherit_from=[base_factory],
        sources=sources,
    )

    expected = markdown_to_df(modified_students_table)
    actual = composite_factory.data["students"]["dataframe"]
    assert_frame_equal(actual, expected)

    expected = markdown_to_df(new_organizations_table)
    actual = composite_factory.data["organizations"]["dataframe"]
    assert_frame_equal(actual, expected)
def test_inheritance_defaults_are_overridden(identifiers, sources):
    """``values`` from a base factory and its composite are both applied.

    The base factory sets ``first_name`` and the composite sets
    ``last_name`` for the same ``students`` table; the generated row must
    carry both. ``external_id`` and ``organization_id`` are dropped from
    the source data before comparing.
    """
    base_factory = Factory(
        data={
            "students": {
                "table": """
                    | id |
                    | - |
                    | s1 |
                    """,
                "values": {
                    "first_name": "Bob"
                },
            }
        },
        sources=sources,
    )

    composite_factory = Factory(
        data={
            "students": {
                "table": """
                    | id |
                    | - |
                    | s1 |
                    """,
                "values": {
                    "last_name": "Loblaw"
                },
            }
        },
        inherit_from=[base_factory],
        sources=sources,
    )
    composite_factory.generate("TestCase")

    expected = markdown_to_df("""
        | id | first_name | last_name |
        | - | - | - |
        | {s1} | Bob | Loblaw |
        """.format(s1=identifiers["student"].generate(case="TestCase",
                                                      named_id="s1")["id"]))
    actual = sources["students"].data.drop(
        columns=["external_id", "organization_id"])
    assert_frame_equal(actual, expected)
def test_defaults_override_identifiers(identifiers, cases):
    """
    If a column is marked as an identifier, but is given a default, then
    the default will be used (e.g., it will not revert to anonymous id
    generation).

    Both stacked rows must share the single id generated for the default
    named id, and that named id must equal ``source.defaults["id"]``.
    """
    source = Source(
        id_mapping={
            "id": {
                "identifier": identifiers["student"],
                "attribute": "id"
            }
        },
        defaults={"id": "stu1"},
    )
    source.stack(
        cases[0],
        markdown_to_df("""
            | first_name |
            | - |
            | Bob |
            | Still Bob |
            """),
    )

    # The identifier's cache is keyed by the builtin id() of the case
    # object; take the first named id recorded for this case.
    generated_id = list(identifiers["student"].cached_ids[id(
        cases[0])].named_ids.values())[0]["id"]

    actual = source.data
    expected = markdown_to_df("""
        | first_name | id |
        | - | - |
        | Bob | {s1} |
        | Still Bob | {s1} |
        """.format(s1=generated_id))
    assert_frame_equal(actual, expected)

    generated_name_id = list(identifiers["student"].cached_ids[id(
        cases[0])].named_ids.keys())[0]
    assert generated_name_id == source.defaults["id"]
def test_scenarios_generate_case_data_over_multiple_cases(
        identifiers, sources, student_factory, organization_factory):
    """A scenario generates data for each of its cases.

    One case inherits only the student factory and the other only the
    organization factory. Ids in each expected table are generated against
    the case that produced the rows.
    """
    scenario = Scenario(
        cases={
            "SimpleStudent":
            Case(factory=Factory(sources=sources,
                                 inherit_from=[student_factory])),
            "SimpleOrganization":
            Case(factory=Factory(sources=sources,
                                 inherit_from=[organization_factory])),
        })
    scenario.generate()

    expected = markdown_to_df("""
        | id | first_name |
        | - | - |
        | {s1} | Bill |
        | {s2} | Ted |
        """.format(
        s1=identifiers["student"].generate(
            case=scenario.cases["SimpleStudent"], named_id="s1")["id"],
        s2=identifiers["student"].generate(
            case=scenario.cases["SimpleStudent"], named_id="s2")["id"],
    ))
    actual = sources["students"].data.drop(columns="organization_id")
    assert_frame_equal(actual, expected)

    expected = markdown_to_df("""
        | id | name |
        | - | - |
        | {o1} | San Dimas High |
        | {o2} | Alaska Military Academy |
        """.format(
        o1=identifiers["organization"].generate(
            case=scenario.cases["SimpleOrganization"], named_id="o1")["id"],
        o2=identifiers["organization"].generate(
            case=scenario.cases["SimpleOrganization"], named_id="o2")["id"],
    ))
    actual = sources["organizations"].data
    assert_frame_equal(actual, expected)
def test_scenarios_stack_case_data(identifiers, sources, student_factory):
    """Data from several cases is stacked into the same source.

    ``SimpleStudent`` inherits the student factory as is; ``AltStudent``
    inherits it and declares its own one-row ``students`` table. The source
    must hold the rows of both cases, and the named id ``s1`` must resolve
    to a separately generated id for each case.
    """
    scenario = Scenario(
        cases={
            "SimpleStudent":
            Case(factory=Factory(sources=sources,
                                 inherit_from=[student_factory])),
            "AltStudent":
            Case(factory=Factory(
                sources=sources,
                inherit_from=[student_factory],
                data={
                    "students": {
                        "table": """
                            | id | first_name |
                            | - | - |
                            | s1 | Napoleon |
                            """
                    }
                },
            )),
        })
    scenario.generate()

    expected = markdown_to_df("""
        | id | first_name |
        | - | - |
        | {s1} | Bill |
        | {s2} | Ted |
        | {as1} | Napoleon |
        """.format(
        s1=identifiers["student"].generate(
            case=scenario.cases["SimpleStudent"], named_id="s1")["id"],
        s2=identifiers["student"].generate(
            case=scenario.cases["SimpleStudent"], named_id="s2")["id"],
        as1=identifiers["student"].generate(case=scenario.cases["AltStudent"],
                                            named_id="s1")["id"],
    ))
    actual = sources["students"].data.drop(columns="organization_id")
    assert_frame_equal(actual, expected)
def test_inheritance_wo_new_data(sources):
    """A factory with no data of its own exposes the inherited dataframe.

    The composite factory only inherits from the base factory, so its
    ``students`` dataframe must equal the base factory's.
    """
    base_factory = Factory(
        data={
            "students": {
                "table": """
                    | id | first_name |
                    | - | - |
                    | s1 | Buffy |
                    | s2 | Willow |
                    """
            }
        },
        sources=sources,
    )

    composite_factory = Factory(inherit_from=[base_factory], sources=sources)

    expected = base_factory.data["students"]["dataframe"]
    actual = composite_factory.data["students"]["dataframe"]
    assert_frame_equal(actual, expected)
def test_sources_stack(simple_source, identifiers, cases):
    """Stacking twice appends the second table after the first.

    The same named ids (``s1``, ``s2``) are stacked for two different
    cases; each must resolve to the id generated for its own case.
    """
    simple_source.stack(
        cases[0],
        markdown_to_df("""
            | id | first_name |
            | - | - |
            | s1 | Bob |
            | s2 | Nancy |
            """),
    )
    simple_source.stack(
        cases[1],
        markdown_to_df("""
            | id | first_name |
            | - | - |
            | s1 | Bobob |
            | s2 | Nanci |
            """),
    )

    actual = simple_source.data
    # Placeholder naming: s<case number><student number>.
    expected = markdown_to_df("""
        | id | first_name |
        | - | - |
        | {s11} | Bob |
        | {s12} | Nancy |
        | {s21} | Bobob |
        | {s22} | Nanci |
        """.format(
        s11=identifiers["student"].generate(case=cases[0], named_id="s1")["id"],
        s12=identifiers["student"].generate(case=cases[0], named_id="s2")["id"],
        s21=identifiers["student"].generate(case=cases[1], named_id="s1")["id"],
        s22=identifiers["student"].generate(case=cases[1], named_id="s2")["id"],
    ))
    assert_frame_equal(actual, expected)
def test_multiple_inheritance(sources):
    """A factory can inherit data from more than one base factory.

    One base declares ``students`` and the other ``organizations``; the
    composite, which declares no data itself, must expose both dataframes
    unchanged.
    """
    base1_factory = Factory(
        data={
            "students": {
                "table": """
                    | id | first_name |
                    | - | - |
                    | s1 | Buffy |
                    | s2 | Willow |
                    """
            }
        },
        sources=sources,
    )
    base2_factory = Factory(
        data={
            "organizations": {
                "table": """
                    | id | name |
                    | - | - |
                    | o1 | Sunnydale High |
                    """
            }
        },
        sources=sources,
    )

    composite_factory = Factory(inherit_from=[base1_factory, base2_factory],
                                sources=sources)

    expected = base1_factory.data["students"]["dataframe"]
    actual = composite_factory.data["students"]["dataframe"]
    assert_frame_equal(actual, expected)

    expected = base2_factory.data["organizations"]["dataframe"]
    actual = composite_factory.data["organizations"]["dataframe"]
    assert_frame_equal(actual, expected)
def test_identifiers_generate_defaults(identifiers, cases):
    """
    If a column is marked as an identifier column, but is not given a
    specific named id, then "anonymous" named ids will be generated when
    the data is stacked.

    The expected ids are read back from the identifier's cache for
    ``cases[0]`` and matched to the rows by position.
    """
    source = Source(id_mapping={
        "id": {
            "identifier": identifiers["student"],
            "attribute": "id"
        }
    })
    source.stack(
        cases[0],
        markdown_to_df("""
            | first_name |
            | - |
            | Bob |
            | Nancy |
            """),
    )

    # The identifier's cache is keyed by the builtin id() of the case
    # object.
    anonymous_ids = [
        v["id"] for v in identifiers["student"].cached_ids[id(
            cases[0])].named_ids.values()
    ]

    actual = source.data
    expected = markdown_to_df("""
        | first_name | id |
        | - | - |
        | Bob | {s1} |
        | Nancy | {s2} |
        """.format(s1=anonymous_ids[0], s2=anonymous_ids[1]))
    assert_frame_equal(actual, expected)