def run_cleaning_per_file_name(file_name):
    """Load an Excel file into a DataHolder and run the first cleaning passes.

    The steps, in order: print the incoming file name, load the workbook
    through ``ExcelLoader.load_excel``, register every returned sheet in a
    new ``DataHolder``, locate headers, snapshot the state with a memento,
    drop deviating rows and finally let ``DateColIdentifier`` identify and
    generate date columns.

    Args:
        file_name: Name of the Excel file, handed to ``ExcelLoader.load_excel``.

    Returns:
        None. The function has no return statement.
    """
    print(file_name)
    logger = logging.getLogger("svm_writer")

    sheet_records, file_name = ExcelLoader.load_excel(file_name)

    # The holder is named after everything before the FIRST dot of the
    # file name returned by the loader.
    holder_name = file_name.split(".")[0]
    dh = DataHolder(holder_name)

    for record in sheet_records:
        cell_values = pd.DataFrame(columns=record.headers, data=record.row_vals)
        cell_types = pd.DataFrame(columns=record.headers, data=record.xls_types)
        dh.add_sheet(
            record.sheet_name,
            cell_values,
            cell_types,
            orig_sheet_name=record.sheet_name,
        )

    # meta_dh is not used again within this function.
    dh, meta_dh = HeaderFinder.find_headers(dh)

    # Snapshot the state before any cleaning changes it.
    dh.create_memento()

    # Locate deviating rows and drop them.
    dh = DevRowFinder.delete_deviating_rows(dh)

    # Identify date columns and add the generated ones.
    DateColIdentifier.identify_and_gen_date_cols(dh, svm_logger=logger)
Example #2
0
class DataHolderTest(TestCase):
    """Tests for DataHolder updates, mementos, card ids and serialization."""

    # One cell edit per sheet:
    # (sheet index, row label, column, new df_data value, new df_profiles value)
    _SHEET_EDITS = (
        (0, 0, "col2", "5", 5),
        (1, 0, "col1", "5", 5),
        (2, 1, "col1", "19", 19),
    )

    def setUp(self):
        """Create a holder with two sheets named "first" and one named "second"."""
        self.names = ["first", "second"]
        self.dh = DataHolder("test")

        # (sheet name, original sheet name, string 'col2', integer 'col2')
        fixtures = (
            (self.names[0], "1", ["3", "4"], [3, 4]),
            (self.names[0], "2", ["1", "1"], [1, 1]),
            (self.names[1], "2", ["15", "16"], [15, 16]),
        )
        for sheet_name, orig_name, col2_text, col2_ints in fixtures:
            text_frame = pd.DataFrame(data={'col1': ["1", "2"], 'col2': col2_text})
            int_frame = pd.DataFrame(data={'col1': [1, 2], 'col2': col2_ints})
            self.dh.add_sheet(sheet_name, text_frame, int_frame, orig_sheet_name=orig_name)

    def _edit_every_sheet(self):
        """Apply the edits in ``_SHEET_EDITS`` and write every sheet back."""
        for dh_ind, df_data, df_profiles in self.dh.enumerate():
            for sheet_ind, row, col, text_value, int_value in self._SHEET_EDITS:
                if dh_ind == sheet_ind:
                    df_data.loc[row, col] = text_value
                    df_profiles.loc[row, col] = int_value

            # Every sheet is written back, whether or not it was edited.
            self.dh.update_with_ind(dh_ind, df_data, df_profiles)

    def test_set_card_ids(self):
        """RowParser.set_card_ids should assign card ids 0, 1, 1 to the sheets."""
        groups = [{"group_id": "one"}, {"group_id": "two"}]
        RowParser.set_card_ids(groups, self.dh)

        expected_ids = [0, 1, 1]
        for struct, expected in zip(self.dh.data_struct_list, expected_ids):
            self.assertEqual(struct.card_id, expected)

    def test_updating(self):
        """After updates, data_dict, data_struct_list and id_dict must agree."""
        self._edit_every_sheet()

        # (sheet name, position within that name's list, position in the flat list)
        locations = (
            (self.names[0], 0, 0),
            (self.names[0], 1, 1),
            (self.names[1], 0, 2),
        )
        for sheet_name, dict_pos, list_pos in locations:
            assert_frame_equal(self.dh.data_dict[sheet_name][dict_pos].df_data,
                               self.dh.data_struct_list[list_pos].df_data)
        for sheet_name, dict_pos, list_pos in locations:
            assert_frame_equal(self.dh.data_dict[sheet_name][dict_pos].df_profiles,
                               self.dh.data_struct_list[list_pos].df_profiles)

        # Each structure must be filed under its own name.
        for sheet_name in self.dh.data_dict:
            for struct in self.dh.data_dict[sheet_name]:
                self.assertEqual(sheet_name, struct.name)

        # The id lookup must resolve to the very same structures.
        for struct in self.dh.data_struct_list:
            self.assertEqual(struct, self.dh.id_dict[struct.id])

    def test_mementos(self):
        """Comparing mementos taken around an edit should flag only cell (0, 1)."""
        self.dh.create_memento()
        self._edit_every_sheet()
        self.dh.create_memento()

        diff_dict_list = SheetStateComparer.compare_states(self.dh.mementos[0], self.dh.mementos[1])
        first_diff = diff_dict_list[0]["diff_array"]
        for row in range(2):
            for col in range(2):
                expected = "Corrected" if (row, col) == (0, 1) else "No change"
                self.assertEqual(first_diff[row][col].change, expected)

    def test_serialization(self):
        """encode() followed by decode() should keep frames, roles and ids."""
        first_struct = self.dh.data_struct_list[0]
        first_struct.roles.append("Claims Paid")
        first_struct.df_data.sort_values("col1", ascending=False, inplace=True)

        restored = DataHolder.decode(self.dh.encode())

        for ind in range(3):
            assert_frame_equal(self.dh.data_struct_list[ind].df_data,
                               restored.data_struct_list[ind].df_data)
        for ind in range(3):
            assert_frame_equal(self.dh.data_struct_list[ind].df_profiles,
                               restored.data_struct_list[ind].df_profiles)

        self.assertEqual(restored.data_struct_list[0].roles[0], "Claims Paid")

        # Ids must be conserved by the round trip.
        for ind, original in enumerate(self.dh.data_struct_list):
            self.assertEqual(restored.data_struct_list[ind].id, original.id)