def build_triangle_from_table(dh):
    new_dh = DataHolder(dh.name)
    pool = DebuggablePool(pp.N_CORES)
    # First find all date cols and see if one of them has the target structure.
    for ds in dh.data_struct_list:
        id_col, hori_date_col, vert_date_col = TriangleFromTableBuilder.do_the_magic(
            ds, pool)
        # cut each id into one row
        cut_list = TriangleFromTableBuilder.make_cut_list(ds.df_data[id_col])
        # use the cut_list to insert all elements
        tr_cols = pd.Series(ds.df_profiles.iloc[0, :] ==
                            SheetTypeDefinitions.TRIANGLE_ELEMENT,
                            index=ds.df_profiles.columns)
        pad_header_mapping = TriangleFromTableBuilder.make_pad_header_mapping(
            ds, hori_date_col)
        vert_col_tup = (vert_date_col, ds.df_data[vert_date_col])
        hori_col_tup = (hori_date_col, ds.df_data[hori_date_col])
        id_col_tup = (id_col, ds.df_data[id_col])
        func = partial(TriangleFromTableBuilder.apply_cuts, cut_list,
                       vert_col_tup, hori_col_tup, id_col_tup,
                       pad_header_mapping)
        tr_col_tup_list = [(col_name, ds.df_data[col_name])
                           for col_name in tr_cols.index[tr_cols]]
        out = pool.map(func, tr_col_tup_list)
        for temp_df_data, temp_df_profiles, name in out:
            new_dh.add_sheet(name, temp_df_data, temp_df_profiles)
    pool.close()
    return new_dh
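
# The per-column fan-out above is plain functools.partial plus Pool.map over
# (name, column) tuples. A minimal, self-contained sketch of the same pattern,
# with generic names rather than the project's API:
from functools import partial
from multiprocessing import Pool

def scale_column(factor, col_tup):
    # col_tup is (name, values); factor is fixed across workers via partial
    name, values = col_tup
    return [v * factor for v in values], name

if __name__ == "__main__":
    func = partial(scale_column, 10)
    col_tups = [("a", [1, 2]), ("b", [3, 4])]
    with Pool(2) as pool:
        for scaled, name in pool.map(func, col_tups):
            print(name, scaled)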
    def name_and_scrub_triangle(dh, new_dh_dict, meta_dh=None):
        new_dh = DataHolder(dh.name)
        word_set_list = list()
        for ds in dh:
            word_set_list.append(
                SubTriangler.identify_category_name(ds, meta_dh))
        if meta_dh is not None and meta_dh.n > 0:
            SubTriangler.divide_meta_data(dh, meta_dh, word_set_list)
        # Find the most unique name
        for i in range(len(word_set_list)):
            ds = dh.data_struct_list[i]
            difference = word_set_list[i].copy()
            for j in range(len(word_set_list)):
                other = dh.data_struct_list[j]
                if j != i and ds.orig_sheet_name == other.orig_sheet_name:
                    difference = difference.difference(word_set_list[j])
            if len(difference) > 0:
                stringified = sorted([str(el) for el in difference])
                name = " ".join(stringified)
                name = name.translate(SubTriangler.remove_digits)
            else:
                name = str(i)
            if ds.name != ds.orig_sheet_name:
                name = ds.name + " " + name

            new_dh.add_sheet(name,
                             ds.df_data,
                             ds.df_profiles,
                             orig_sheet_name=ds.orig_sheet_name)
        new_dh_dict[dh.name] = new_dh
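
# The "most unique name" step is a pairwise set difference against sheets
# sharing an original sheet name. In isolation, with made-up word sets:
word_sets = [{"paid", "motor", "2004"}, {"paid", "motor", "2005"}, {"incurred", "motor"}]
for i, words in enumerate(word_sets):
    difference = words.copy()
    for j, other in enumerate(word_sets):
        if j != i:
            difference -= other
    print(i, " ".join(sorted(str(el) for el in difference)) or str(i))
# 0 -> "2004", 1 -> "2005", 2 -> "incurred"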
Example 3
    def test_fill_hollow_str_cols(self):
        dh = DataHolder("test")
        df_data = pd.DataFrame(data={'col1': [1, "", "", "j", "", "", "6", "", "b",
                                              "g", "", "", "j", "", "", "6", "", "b", "", ""],
                                     '1992': ["", "", "", "j", "", "", "6", "", "b",
                                              "g", "", "", "j", "", "", "6", "", "b", "g", "hrumpff"]})

        df_profiles = pd.DataFrame(data={'col1': [2, 0, 0, 1, 0, 0, 1, 0, 1,
                                                  1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
                                         '1992': [0, 0, 0, 1, 0, 0, 1, 0, 1,
                                                  1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1]})
        dh.add_sheet("test", df_data, df_profiles)
        StringFiller.fill_hollow_str_cols(dh)
        dh_sol = DataHolder("test")
        df_data_sol = pd.DataFrame(data={'col1': [1, 1, 1, "j", "j", "j", "6", "6", "b",
                                                  "g", "g", "g", "j", "j", "j", "6", "6", "b", "b", "b"],
                                         '1992': ["", "", "", "j", "j", "j", "6", "6", "b",
                                                  "g", "g", "g", "j", "j", "j", "6", "6", "b", "g", "hrumpff"]})

        df_profiles_sol = pd.DataFrame(data={'col1': [2, 2, 2, 1, 1, 1, 1, 1, 1,
                                                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                                             '1992': [0, 0, 0, 1, 1, 1, 1, 1, 1,
                                                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

        dh_sol.add_sheet("test", df_data_sol, df_profiles_sol)
        self.assertTrue(dh.equals(dh_sol))
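
# fill_hollow_str_cols itself is not shown, but judging from this fixture it
# behaves like a forward fill over empty cells. A rough pandas equivalent
# (an assumption, not the project's implementation):
import pandas as pd

s = pd.Series([1, "", "", "j", "", "", "6"])
print(s.replace("", pd.NA).ffill().tolist())  # [1, 1, 1, 'j', 'j', 'j', '6']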
Example 4
    def numify_dates(dh):
        meta_dh = DataHolder(dh.name + "_meta")
        for ds in dh:
            d_cols = DateColIdentifier.identify_marked_date_cols(ds)
            date_data = ds.df_data[d_cols.index[d_cols]]

            meta_data = pd.DataFrame()
            meta_profiles = pd.DataFrame()
            for name, col in date_data.iteritems():
                ds.df_profiles.loc[:, name] = SheetTypeDefinitions.STRING_DATE
                is_float = col.map(lambda x: isinstance(x, float))
                if is_float.all():
                    # every entry is already numeric; just cast
                    ds.df_data[name] = col.astype(int)
                else:
                    # back up the raw column, then reduce each non-float entry
                    # to its longest numeral (keeping int types was problematic here)
                    meta_data[name] = col.copy()
                    meta_profiles[name] = ds.df_profiles[name].copy()
                    temp_col = col.copy()
                    not_float = np.logical_not(is_float)
                    temp_col[not_float] = temp_col[not_float].map(longest_numeral)
                    temp_col = temp_col.astype(int)
                    ds.df_data[name] = temp_col
            meta_dh.add_sheet("date_backup", meta_data, meta_profiles,
                              orig_sheet_name=ds.orig_sheet_name)
        return dh, meta_dh
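
# longest_numeral is defined elsewhere in the project; a plausible
# regex-based stand-in (hypothetical, for illustration only):
import re

def longest_numeral(text):
    # Return the longest run of digits in `text` as an int, 0 if none.
    runs = re.findall(r"\d+", str(text))
    return int(max(runs, key=len)) if runs else 0

print(longest_numeral("Q1 1992"))     # 1992
print(longest_numeral("31.12.2007"))  # 2007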
Example 5
    def make_sol_dict():
        """
        Run present pipeline and save the merge results
        :return:
        """
        file_names = ["FORMAT3_Copy of KommuneMTPLforTriangle.xls",
            "C Triangulations analysis R2017 GC20161109.xls",
            "EVOLUTION 2017 _ M+F - Triangles cat nat brut net.xls",
            "Bsp8 _ Dreiecke aus GCNA für CU1.4.1.xls",
            "Analysis MTPL MOD.xls",
            "Bsp6 _ Dreiecke aus GCNA für CU1.4.1.xls",
            "FORMAT6_sinistres.xls",
            "FORMAT1_LOSSES-MTPL-OVER-500-GROUP-2005_modified.xls"]
        solutions_dict = dict()
        raw_dict = dict()
        for file_name in file_names:
            sr_list, file_name = ExcelLoader.load_excel(pdir.RESOURCES_DIR + "/raw_test_files/" + file_name)
            dh = DataHolder()
            for sr in sr_list:
                dh.add_sheet(sr.sheet_name, pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                             pd.DataFrame(columns=sr.headers, data=sr.xls_types), orig_sheet_name=sr.sheet_name)

            dh = SheetPreProcessor.separate_components(dh)
            raw_dict[file_name] = dh.encode()
            dh = HorizontalMerger.horizontal_merge(dh)
            #temp_path = pdir.RESOURCES_DIR + "/temp/"
            #dh.write_excel(temp_path + file_name)
            solutions_dict[file_name] = dh
        solutions_dict = MergePararametersOptimizer.make_ind_col_dict(solutions_dict)
        with open(pdir.RESOURCES_DIR + "/test/merge_solutions.obj", "wb") as temp_file:
            pickle.dump(solutions_dict, temp_file)
        with open(pdir.RESOURCES_DIR + "/test/raw_test.obj", "wb") as temp_file:
            pickle.dump(raw_dict, temp_file)
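
# Reading the pickled fixtures back is the mirror image of the dumps above
# (same path under pdir.RESOURCES_DIR is assumed):
import pickle

with open(pdir.RESOURCES_DIR + "/test/merge_solutions.obj", "rb") as temp_file:
    solutions_dict = pickle.load(temp_file)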
    def chop_triangles_horizontally(dh):
        """
        checks for repeating header and splits ds:s
        :param dh: DataHolder
        :return: DataHolder
        """
        chop, chop_lists = TriangleChopper.make_occurrence_list(dh)

        if not chop:
            return dh
        else:
            new_dh = DataHolder(dh.name)
            for ind, ds in enumerate(dh):
                occurrence_list = chop_lists[ind]
                if any(occurrence_list):
                    for i in range(1, np.max(occurrence_list) + 1):
                        bools = np.logical_or(occurrence_list == 0,
                                              occurrence_list == i)
                        df_data = ds.df_data[ds.df_data.columns[bools]].copy()
                        df_profiles = ds.df_profiles[
                            ds.df_profiles.columns[bools]].copy()
                        new_dh.add_sheet(ds.name,
                                         df_data,
                                         df_profiles,
                                         orig_sheet_name=ds.orig_sheet_name)

                else:
                    new_dh.add_ds(ds)
            return new_dh
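
# The column split keeps shared columns (occurrence 0) alongside each repeat
# group i; the masking is plain numpy logical_or over the occurrence list:
import numpy as np
import pandas as pd

df = pd.DataFrame([[1, 2, 3, 4, 5]], columns=list("abcde"))
occurrence_list = np.array([0, 1, 1, 2, 2])  # 0 = shared, 1/2 = repeat groups
for i in range(1, np.max(occurrence_list) + 1):
    bools = np.logical_or(occurrence_list == 0, occurrence_list == i)
    print(i, list(df.columns[bools]))  # 1 -> ['a', 'b', 'c'], 2 -> ['a', 'd', 'e']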
Example 7
    def merge_with_merges_list(dh, merges):
        new_dh = DataHolder(dh.name)
        for merge in merges:
            profiles = None
            data = None
            name_set = set()
            for ind in merge:

                ds = dh.id_dict[ind]
                name_set.add(ds.name)
                if profiles is None:
                    profiles = ds.df_profiles.copy()
                    data = ds.df_data.copy()
                else:
                    temp_profiles = ds.df_profiles.copy()
                    temp_data = ds.df_data.copy()
                    # TODO: generalize to positions other than the first
                    #if profiles.shape[1] > temp_profiles.shape[1]:
                    #    for header in profiles.columns[temp_profiles.shape[1]:]:
                    #        temp_profiles[header] = SheetTypeDefinitions.ZERO_FLOAT
                    #        temp_data[header] = 0.0
                    profiles = pd.concat([profiles, temp_profiles], sort=True)
                    profiles.fillna(SheetTypeDefinitions.ZERO_FLOAT,
                                    inplace=True)
                    data = pd.concat([data, temp_data], sort=True)
            new_dh.add_sheet("_".join(list(name_set)),
                             data,
                             profiles,
                             orig_sheet_name=ds.orig_sheet_name)
        return new_dh
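
# Stacking sheets with partially overlapping headers leans on pd.concat
# aligning columns by name and on fillna for the gaps. In isolation:
import pandas as pd

a = pd.DataFrame({"x": [1], "y": [2]})
b = pd.DataFrame({"y": [3], "z": [4]})
merged = pd.concat([a, b], sort=True)
merged.fillna(0.0, inplace=True)
print(merged)  # columns x, y, z; the missing cells are filled with 0.0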
    def vertical_category_division(ds, new_dh_dict, meta_dh):
        # Find the category column. Heuristics:
        # - should contain strings (for now) (used)
        # - roughly periodic, i.e. repetitive entries (used)
        # - some entries may vary slightly (used)
        # - the period may vary slightly (not checked for now; should be checked in a new if statement)
        # - should get tag matches in the dict (not checked for now)
        df_data = ds.df_data
        df_profiles = ds.df_profiles
        orig_name = ds.orig_sheet_name
        for col_name, col in df_data.iteritems():

            string_ratio = np.sum(
                df_profiles[col_name].values == SheetTypeDefinitions.STRING
            ) / df_profiles[col_name].values.size
            if string_ratio > pp.MIN_STRING_RATIO_CAT_COL:
                # check periodic potential
                string_col = col.astype(str)
                unique, counts = np.unique(string_col, return_counts=True)
                ratio = np.max(counts) / col.size
                if (pp.MIN_RATIO_LARGEST_CAT < ratio < pp.MAX_RATIO_LARGEST_CAT
                        and len(unique) < pp.MAX_N_CATS):
                    if col_name in new_dh_dict:
                        new_dh = new_dh_dict[col_name]
                    else:
                        new_dh = DataHolder(col_name)
                        new_dh_dict[col_name] = new_dh
                    match_dict = SubTriangler.component_finder(unique)

                    # now load the new_dh

                    for name in match_dict:
                        cond = np.array([
                            string_col.values == sub_name
                            for sub_name in match_dict[name]
                        ]).any(axis=0)
                        sub_df_data = df_data[cond].drop(
                            columns=[string_col.name])
                        sub_df_profiles = df_profiles[cond].drop(
                            columns=[string_col.name])
                        if name == "" or np.sum(cond) < 4:
                            new_ds = DataStruct(sub_df_data,
                                                sub_df_profiles,
                                                name,
                                                orig_sheet_name=orig_name)
                            for split in new_ds.col_split_ds():
                                if not np.all(split.df_profiles ==
                                              SheetTypeDefinitions.EMPTY_STRING
                                              ) and not (np.all(
                                                  split.df_data == "")):
                                    meta_dh.add_ds(split)
                        else:
                            new_dh.add_sheet(ds.name + " - " + name,
                                             sub_df_data,
                                             sub_df_profiles,
                                             orig_sheet_name=orig_name)
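
# The category-column test reduces to np.unique with return_counts plus a
# largest-category ratio, bounded by the pp.* thresholds. Standalone:
import numpy as np

col = np.array(["paid", "paid", "reserved", "paid", "reserved", "incurred"])
unique, counts = np.unique(col, return_counts=True)
ratio = np.max(counts) / col.size
print(unique, counts, ratio)  # 3 categories, the largest covers 0.5 of the rows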
Example 9
def main(file, settings):

    print(file)
    sr_list, file_name = ExcelLoader.load_excel(file)
    dh = DataHolder(file_name)
    for sr in sr_list:
        dh.add_sheet(sr.sheet_name,
                     pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                     pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                     orig_sheet_name=sr.sheet_name)
    dummy, new_dh = TrianglePipeline.table_triangle_pipeline_dh(dh)
    temp_path = pdir.RESOURCES_DIR + "/temp/"
    new_dh.write_excel(temp_path + file_name)
Example 10
def perform_horizontal_merge(dh, distances):
    new_dh = DataHolder(dh.name)
    # make a greedy merge
    merged_set = set()
    for part in distances:
        if part[0] > pp.MAX_HORIZONTAL_MERGE_DISTANCE:
            break
        merge = [part[1]] + part[2]
        if len(set(merge).intersection(merged_set)) == 0:
            if len(part[2]) > 1:
                # Merge vertically!
                df_data_list = [
                    dh.id_dict[df_id].df_data for df_id in part[2]
                ]
                df_profiles_list = [
                    dh.id_dict[df_id].df_profiles for df_id in part[2]
                ]
                df_data = pd.concat(df_data_list, axis=0, sort=True)
                df_profiles = pd.concat(df_profiles_list, axis=0, sort=True)
            else:
                df_data = dh.id_dict[part[2][0]].df_data
                df_profiles = dh.id_dict[part[2][0]].df_profiles
            df_data = pd.concat([dh.id_dict[part[1]].df_data, df_data],
                                axis=1, sort=True)
            df_profiles = pd.concat(
                [dh.id_dict[part[1]].df_profiles, df_profiles],
                axis=1, sort=True)
            df_data = df_data.reindex(sorted(df_data.columns), axis=1)
            df_profiles = df_profiles.reindex(sorted(df_profiles.columns),
                                              axis=1)
            merged_set.update(merge)
            new_dh.add_sheet(
                dh.id_dict[merge[0]].name,
                df_data,
                df_profiles,
                orig_sheet_name=dh.id_dict[merge[0]].orig_sheet_name)
    # add the remaining
    for id_key in dh.id_dict:
        if id_key not in merged_set:
            new_dh.add_sheet(
                dh.id_dict[id_key].name,
                dh.id_dict[id_key].df_data,
                dh.id_dict[id_key].df_profiles,
                orig_sheet_name=dh.id_dict[id_key].orig_sheet_name)
    return new_dh
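
# The greedy pass takes candidates in distance order and skips any candidate
# that touches an already-merged id. The bookkeeping in miniature:
candidates = [(0.1, 1, [2, 3]), (0.2, 2, [4]), (0.3, 5, [6])]  # (distance, anchor, partners)
merged_set = set()
accepted = []
for dist, anchor, partners in sorted(candidates):
    merge = [anchor] + partners
    if len(set(merge).intersection(merged_set)) == 0:
        merged_set.update(merge)
        accepted.append(merge)
print(accepted)  # [[1, 2, 3], [5, 6]] -- the (0.2, 2, [4]) candidate is blocked by id 2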
def run_test_per_file_name(file_name):

    print(file_name)
    logger = logging.getLogger("svm_writer")
    sr_list, file_name = ExcelLoader.load_excel(file_name)
    dh = DataHolder(file_name.split(".")[0])
    for sr in sr_list:
        dh.add_sheet(sr.sheet_name,
                     pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                     pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                     orig_sheet_name=sr.sheet_name)
    dh = SheetPreProcessor.pre_strip(dh)
    DateColIdentifier.identify_and_gen_date_cols(dh,
                                                 replace_col=False,
                                                 svm_logger=logger)
Example 12
    def testFindTriangleHeaders(self):
        names = ["First", "Second"]
        dh = DataHolder("test")
        d1 = pd.DataFrame(data={'col1' + ps.HEADER_PLACE_HOLDER: ["1", "2", 1],
                                'col2' + ps.HEADER_PLACE_HOLDER: [3, "2", "3b"],
                                'col3' + ps.HEADER_PLACE_HOLDER: ["brum2", "4", 4],
                                'col4' + ps.HEADER_PLACE_HOLDER: [24, "4", "brum25"],
                                })
        d2 = d1.copy()
        d2.iloc[:, :] = 1

        dh.add_sheet(names[0], d1, d2)
        dh = TriangleHeaderFinder.find_triangle_headers(dh, test_settings=True)
        headers = list(dh.data_struct_list[0].df_data.columns)
        self.assertEqual(headers, ["col11", "col23", "col3brum2", "col424"])
Example 13
    def testDistrMatching(self):
        test_distr = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5])
        test_distr = test_distr / mad(test_distr)
        test_distr = test_distr - np.mean(test_distr)
        distr = {ps.CAT_RESERVED_NAME: test_distr}
        dh = DataHolder("test")
        dh.add_sheet(self.names[0], pd.DataFrame(data=[1, 1, 1, 1, 2, 2, 2, 3, 3, 4]),
                     pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 10))
        dh.add_sheet(self.names[0], pd.DataFrame(data=[5, 9, 3, 7, 18]),
                     pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 5))
        matches = list()
        for ds_id in dh.id_dict:
            matches.append(InputMatcher.compare_with_distribution(
                ds_id, ps.CAT_RESERVED_NAME, dh, distr))
        # The first entry is more similar to the reference, so its match should score higher
        self.assertTrue(matches[0] > matches[1])
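
# `mad` here is assumed to be a median-absolute-deviation function such as
# scipy.stats.median_abs_deviation (the import sits outside the snippet);
# the reference distribution is MAD-scaled and then mean-centred:
import numpy as np
from scipy.stats import median_abs_deviation

x = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5])
x = x / median_abs_deviation(x)
x = x - np.mean(x)
print(np.isclose(x.mean(), 0.0))  # True: centred, MAD-scaled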
Example 14
    def post(self, request):
        sr_list = jsonpickle.decode(request.data['sr_list'])
        dhName = request.data['dhName']
        selected_sheets = request.data['selected_sheets']

        data_holder = DataHolder(dhName)

        for sr in sr_list:
            if sr.sheet_name in selected_sheets:
                data_holder.add_sheet(sr.sheet_name,
                                      pd.DataFrame(columns=sr.headers,
                                                   data=sr.row_vals),
                                      pd.DataFrame(columns=sr.headers,
                                                   data=sr.xls_types),
                                      orig_sheet_name=sr.sheet_name)

        encoded = data_holder.encode()

        return Response(encoded, status=200)
Example 15
    def test_col_identification(self):
        df_data = pd.DataFrame(data={'col1': ["1", "", "1991", "1992", "2007", "rew", "1993", "1994", "1995x"],
                                     'col2': [43, 1994, 2015, 1994, 7, 2015, 1994, 1999, 2015],
                                     '1991': [1993, 1, 6, 1993, 1, 6, 1993, 1, 6],
                                     '1992': ["g", "r", "h", "j", "t", "f", "6", "p", "6"],
                                     '1993': ["1", "1993", "6", "1993", "", "rew", "1993", "1994", ""]})

        df_profiles = pd.DataFrame(data={'col1': [1, 1, 1, 1, 1, 1, 1, 1, 1],
                                         'col2': [2, 2, 2, 2, 2, 2, 2, 2, 2],
                                         '1991': [2, 2, 2, 2, 2, 2, 2, 2, 2],
                                         '1992': [1, 1, 1, 1, 1, 1, 1, 1, 1],
                                         '1993': [1, 1, 1, 1, 1, 1, 1, 1, 1]})
        dh = DataHolder('test')
        dh.add_sheet('test', df_data, df_profiles)
        dh = DateFiller.identify_and_gen_date_cols(dh, replace_col=False)
        dh = ColTypeIdentifier.identify_col_types(dh)
        profiles = dh.data_struct_list[0].df_profiles
        self.assertEqual(profiles.iloc[1, 1], SheetTypeDefinitions.STRING_DATE)
        self.assertEqual(profiles.iloc[0, 2], SheetTypeDefinitions.TRIANGLE_ELEMENT)
        self.assertEqual(profiles.iloc[0, 3], SheetTypeDefinitions.ID_ELEMENT)
def run_cleaning_per_file_name(file_name):

    print(file_name)
    logger = logging.getLogger("svm_writer")
    sr_list, file_name = ExcelLoader.load_excel(file_name)
    dh = DataHolder(file_name.split(".")[0])
    for sr in sr_list:
        dh.add_sheet(sr.sheet_name,
                     pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                     pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                     orig_sheet_name=sr.sheet_name)
    dh, meta_dh = HeaderFinder.find_headers(dh)

    # save original state
    dh.create_memento()

    # Find and remove deviating rows
    dh = DevRowFinder.delete_deviating_rows(dh)

    # Identify and add date col
    DateColIdentifier.identify_and_gen_date_cols(dh, svm_logger=logger)
Example 17
    def testTurnTriangle(self):
        dh = DataHolder("test")
        data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [0, 0, 2], '1991': [0, 0, 0], '1992': [1, 0, 0]})
        prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6], '1991': [6, 6, 6], '1992': [2, 6, 6]})
        dh.add_sheet("test", data, prof)
        data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [1, 1, 0], '1991': [0, 0, 0], '1992': [1, 0, 0]})
        prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6], '1991': [6, 6, 6], '1992': [2, 6, 6]})
        dh.add_sheet("test", data, prof)
        data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [3, 1, 0], '1991': [0, 0, 0], '1992-': [1, 0, 0]})
        prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6], '1991': [6, 6, 6], '1992-': [2, 6, 6]})
        dh.add_sheet("test", data, prof)
        tr_cols_dict = {
            tuple(dh.data_struct_list[0].df_data.columns) + dh.data_struct_list[0].df_data.shape:
                pd.Series([False, True, True, True], index=dh.data_struct_list[0].df_data.columns),
            tuple(dh.data_struct_list[2].df_data.columns) + dh.data_struct_list[2].df_data.shape:
                pd.Series([False, True, True, True], index=dh.data_struct_list[2].df_data.columns),
        }
        dh_copy = dh.copy_without_memory()
        TriangleStripper.turn_triangle(dh, tr_cols_dict, alt_min_score=0.6)

        for ds in dh_copy.data_struct_list:
            tr_cols = tr_cols_dict[tuple(ds.df_data.columns) + ds.df_data.shape]
            tri_part = ds.df_data[tr_cols.index[tr_cols]].values
            ds.df_data[tr_cols.index[tr_cols]] = np.transpose(tri_part)

        for ds, ds_copy in zip(dh.data_struct_list, dh_copy.data_struct_list):
            self.assertTrue(ds.df_data.equals(ds_copy.df_data))
            self.assertTrue(ds.df_profiles.equals(ds_copy.df_profiles))
Example 18
    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        if self.request.session.get('data_holder'):
            sr_list = jsonpickle.decode(
                self.request.session.get('data_holder'))
            data_holder = DataHolder()
            for sr in sr_list:
                data_holder.add_sheet(
                    sr.sheet_name,
                    pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                    pd.DataFrame(columns=sr.headers, data=sr.xls_types))
            context["diff_dicts"], data_holder = \
                CleaningPipeline.clean_data_dh(data_holder)
        else:
            sheets = DataSheet.objects.filter(
                owner=self.request.user).order_by('sheet_name')
            context["diff_dicts"], data_holder = CleaningPipeline.clean_data(
                sheets)

        return context
    def find_triangles(dh, **kwargs):
        return_meta = kwargs.get('return_meta', False)

        triangle_dh = DataHolder(dh.name)
        rest_dh = DataHolder(dh.name + '_non-triangular')
        for ds in dh.data_struct_list:
            df_data, df_profiles = ds.df_data, ds.df_profiles
            # now select triangles in some smart way
            is_tri = TriangleFinder.is_triangle(ds, **kwargs)
            if is_tri:
                TriangleFinder.add_triangle_to_dh(ds, triangle_dh)
            else:
                rest_dh.add_sheet(ds.name,
                                  df_data,
                                  df_profiles,
                                  orig_sheet_name=ds.orig_sheet_name)
        # Now get the triangle-similar data structs
        triangle_similar = TriangleFinder.find_triangles_by_similarity(
            triangle_dh, rest_dh)
        if len(triangle_similar) > 0:
            rest_copy = rest_dh.copy_without_memory()
            rest_dh = DataHolder(rest_copy.name)
            for ds in rest_copy:
                if ds.id in triangle_similar:
                    TriangleFinder.add_triangle_to_dh(ds, triangle_dh)
                else:
                    rest_dh.add_sheet(ds.name,
                                      ds.df_data,
                                      ds.df_profiles,
                                      orig_sheet_name=ds.orig_sheet_name)
        if return_meta:
            return triangle_dh, rest_dh
        else:
            return triangle_dh
Example 20
def run_test_per_file_name(in_obj, in_tup, form):
    """
    Performs a fixture test for one file. The encapsulators determine the scope of the test.
    :param in_obj:
    :param in_tup:
    :param form:
    :return:
    """
    if not pp.LOG_SVM_FEATURES:
        print(in_tup)
    sr_list, file_name = ExcelLoader.load_excel(pdir.RESOURCES_DIR +
                                                "/raw_test_files/" +
                                                in_tup[0])
    dh = DataHolder(file_name.split(".")[0])
    for sr in sr_list:
        dh.add_sheet(sr.sheet_name,
                     pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                     pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                     orig_sheet_name=sr.sheet_name)
    # Choose an encapsulator class to determine the test scope
    #dhce = DataHolderCallTestEncapsulator(pdir.RESOURCES_DIR + '/temp/pickles/', pdir.RESOURCES_DIR + '/test/pickles/')
    #dhce = DataHolderCallSaveEncapsulator(pdir.RESOURCES_DIR + '/test/pickles/')
    #dhce = DataHolderCallOutputEncapsulator(pdir.RESOURCES_DIR + '/left_triangles/' + in_tup[0], in_tup[1])
    dhce = DataHolderCallEncapsulator()
    #dhce = DataHolderCallTimeEncapsulator()
    if form == "triangle_table":
        dummy, dh = TrianglePipeline.table_triangle_pipeline_dh(dh, dhce)
    elif form == "triangle":
        dummy, dh = TrianglePipeline.triangle_pipeline_dh(
            dh,
            dhce,
            tri_type=in_tup[1]["tri_type"],
            n_outputs=in_tup[1]["n_outputs"])
    elif form == "cleaning":
        dummy, dh = CleaningPipeline.clean_data_dh(dh)
    ToDiscComparer.compare_to_disc(in_obj, file_name, dh)
Example 21
class SubTrianglerTest(TestCase):

    def setUp(self):
        self.names = ["first", "second"]
        self.dh = DataHolder("test")
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["3", "4"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
        self.dh.add_sheet(self.names[0], d1, d2)
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["1", "1"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [1, 1]})
        self.dh.add_sheet(self.names[0], d1, d2)
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["15", "16"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [15, 16]})
        self.dh.add_sheet(self.names[1], d1, d2)


    def test_component_finder(self):
        # make a list of similar and dissimilar strings
        strings = ["asdfgh", "sdfgh", "qwert", "asdgh", "qwerty", "asdfgh"]
        match_dict = SubTriangler.component_finder(strings)
        test_set1 = set(["asdfgh", "sdfgh", "asdgh", "asdfgh"])
        test_set2 = set(["qwert", "qwerty"])
        self.assertEqual(test_set1, set(match_dict["sdfgh"]))
        self.assertEqual(test_set2, set(match_dict["qwert"]))
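
# component_finder is not shown; grouping by string similarity can be
# sketched with difflib (an illustrative stand-in, not the real algorithm):
from difflib import SequenceMatcher

strings = ["asdfgh", "sdfgh", "qwert", "asdgh", "qwerty", "asdfgh"]
groups = {}
for s in strings:
    for key in groups:
        if SequenceMatcher(None, s, key).ratio() > 0.7:
            groups[key].append(s)
            break
    else:
        groups[s] = [s]
print(groups)  # two clusters, matching the test sets above (group keys may differ)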
Example 22
class DataHolderTest(TestCase):

    def setUp(self):
        self.names = ["first", "second"]
        self.dh = DataHolder("test")
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["3", "4"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
        self.dh.add_sheet(self.names[0], d1, d2, orig_sheet_name="1")
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["1", "1"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [1, 1]})
        self.dh.add_sheet(self.names[0], d1, d2, orig_sheet_name="2")
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["15", "16"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [15, 16]})
        self.dh.add_sheet(self.names[1], d1, d2, orig_sheet_name="2")

    def test_set_card_ids(self):
        trngs = []
        trngs.append({"group_id": "one"})
        trngs.append({"group_id": "two"})
        RowParser.set_card_ids(trngs, self.dh)
        # print(self.dh)
        true_ids = [0, 1, 1]
        for ds, ind in zip(self.dh.data_struct_list, true_ids):
            self.assertEqual(ds.card_id, ind)

    def test_updating(self):

        for dh_ind, df_data, df_profiles in self.dh.enumerate():
            if dh_ind == 0:
                df_data.loc[0, "col2"] = "5"
                df_profiles.loc[0, "col2"] = 5

            if dh_ind == 1:
                df_data.loc[0, "col1"] = "5"
                df_profiles.loc[0, "col1"] = 5

            if dh_ind == 2:
                df_data.loc[1, "col1"] = "19"
                df_profiles.loc[1, "col1"] = 19

            self.dh.update_with_ind(dh_ind, df_data, df_profiles)

        assert_frame_equal(self.dh.data_dict[self.names[0]][0].df_data, self.dh.data_struct_list[0].df_data)
        assert_frame_equal(self.dh.data_dict[self.names[0]][1].df_data, self.dh.data_struct_list[1].df_data)
        assert_frame_equal(self.dh.data_dict[self.names[1]][0].df_data, self.dh.data_struct_list[2].df_data)

        assert_frame_equal(self.dh.data_dict[self.names[0]][0].df_profiles, self.dh.data_struct_list[0].df_profiles)
        assert_frame_equal(self.dh.data_dict[self.names[0]][1].df_profiles, self.dh.data_struct_list[1].df_profiles)
        assert_frame_equal(self.dh.data_dict[self.names[1]][0].df_profiles, self.dh.data_struct_list[2].df_profiles)
        for key in self.dh.data_dict:
            for d_struct in self.dh.data_dict[key]:
                self.assertEqual(key, d_struct.name)

        for ds in self.dh.data_struct_list:
            self.assertEqual(ds, self.dh.id_dict[ds.id])


    def test_mementos(self):

        self.dh.create_memento()
        for dh_ind, df_data, df_profiles in self.dh.enumerate():
            if dh_ind == 0:
                df_data.loc[0, "col2"] = "5"
                df_profiles.loc[0, "col2"] = 5

            if dh_ind == 1:
                df_data.loc[0, "col1"] = "5"
                df_profiles.loc[0, "col1"] = 5

            if dh_ind == 2:
                df_data.loc[1, "col1"] = "19"
                df_profiles.loc[1, "col1"] = 19

            self.dh.update_with_ind(dh_ind, df_data, df_profiles)

        self.dh.create_memento()

        diff_dict_list = SheetStateComparer.compare_states(self.dh.mementos[0], self.dh.mementos[1])
        for i in range(2):
            for j in range(2):
                el = diff_dict_list[0]["diff_array"][i][j]
                if i == 0 and j == 1:
                    self.assertEqual(el.change, "Corrected")
                else:
                    self.assertEqual(el.change, "No change")

    def test_serialization(self):
        self.dh.data_struct_list[0].roles.append("Claims Paid")
        self.dh.data_struct_list[0].df_data.sort_values("col1", ascending=False, inplace=True)
        serialized = self.dh.encode()
        data_framed = DataHolder.decode(serialized)

        assert_frame_equal(self.dh.data_struct_list[0].df_data, data_framed.data_struct_list[0].df_data)
        assert_frame_equal(self.dh.data_struct_list[1].df_data, data_framed.data_struct_list[1].df_data)
        assert_frame_equal(self.dh.data_struct_list[2].df_data, data_framed.data_struct_list[2].df_data)

        assert_frame_equal(self.dh.data_struct_list[0].df_profiles, data_framed.data_struct_list[0].df_profiles)
        assert_frame_equal(self.dh.data_struct_list[1].df_profiles, data_framed.data_struct_list[1].df_profiles)
        assert_frame_equal(self.dh.data_struct_list[2].df_profiles, data_framed.data_struct_list[2].df_profiles)
        self.assertEqual(data_framed.data_struct_list[0].roles[0], "Claims Paid")
        # Test conservation of ids
        for ind in range(len(self.dh.data_struct_list)):
            self.assertEqual(data_framed.data_struct_list[ind].id, self.dh.data_struct_list[ind].id)
Example 23
    def horizontal_category_division(ds, new_dh_dict, meta_dh):
        # find potential category rows
        # for now, look for strings
        str_ratio = (ds.df_profiles == SheetTypeDefinitions.STRING).sum(
            axis=1) / ds.df_profiles.shape[1]
        cat_rows = str_ratio >= pp.MIN_STRING_RATIO_CAT_ROW
        for ind in cat_rows.index[cat_rows]:
            cat_row = ds.df_data.loc[ind, :]
            unique, counts = np.unique(cat_row, return_counts=True)
            ratio = np.max(counts) / cat_row.size
            if ratio < 0.5 and len(unique) / cat_row.size < 0.5:
                row_name = "Row " + str(ind)
                if row_name in new_dh_dict:
                    new_dh = new_dh_dict[row_name]
                else:
                    new_dh = DataHolder(row_name)
                    new_dh_dict[row_name] = new_dh
                match_dict = SubTriangler.component_finder(unique)
                rev_match_dict = dict()
                for key, val in match_dict.items():
                    for item in val:
                        rev_match_dict[item] = key
                count_dict = {}
                for key, val in match_dict.items():
                    active = np.isin(unique, val)
                    count_dict[key] = np.sum(counts[active])
                # get number of data_structs to make
                headers_dict = {}
                for key, val in count_dict.items():
                    if val > pp.MIN_YEARS_SPANNED:
                        headers_dict[key] = []
                len_array = np.zeros(len(headers_dict), dtype=int)
                for enum, key in enumerate(headers_dict):
                    for name, val in cat_row.iteritems():
                        if (rev_match_dict[val] not in headers_dict
                                or rev_match_dict[val] == key):
                            headers_dict[key].append(name)
                    len_array[enum] = len(headers_dict[key])

                # Now fill the dh
                # First, if same length, find optimal header naming
                same_length = np.std(len_array) == 0
                if same_length:
                    out_headers = deepcopy(headers_dict)
                    for i in range(len_array[0]):
                        i_headers = np.array(
                            [val[i] for val in headers_dict.values()])
                        missing = np.array([
                            "Missing header" in header for header in i_headers
                        ])
                        if np.any(missing) and np.any(np.logical_not(missing)):
                            header = i_headers[np.logical_not(missing)][0]
                            for key in out_headers:
                                out_headers[key][i] = header

                for key, val in headers_dict.items():
                    df_data = ds.df_data.loc[ds.df_data.index != ind, val]
                    df_profiles = ds.df_profiles.loc[ds.df_data.index != ind,
                                                     val]
                    if same_length:
                        df_data = pd.DataFrame(df_data.values,
                                               index=df_data.index,
                                               columns=out_headers[key])
                        df_profiles = pd.DataFrame(df_profiles.values,
                                                   index=df_profiles.index,
                                                   columns=out_headers[key])
                    new_dh.add_sheet(ds.name + " - " + key,
                                     df_data,
                                     df_profiles,
                                     orig_sheet_name=ds.orig_sheet_name)
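
# Inverting match_dict so each raw label points back at its canonical key
# is a small reshaping step, isolated here:
match_dict = {"paid": ["paid", "Paid ", "PAID"], "reserved": ["reserved", "Reserved"]}
rev_match_dict = {item: key for key, val in match_dict.items() for item in val}
print(rev_match_dict["PAID"])  # paid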
Example 24
class InputMatcherTest(TestCase):
    def setUp(self):
        self.trngs = [{
            'headers': ["Year", "unit"],
            'categories': [{
                'name': 'Claim - Incurred',
                'type': 'sum',
                'from': [ps.CAT_PAID_NAME, ps.CAT_RESERVED_NAME]
            }, {
                'name': ps.CAT_PAID_NAME,
                'type': 'independent',
                'from': []
            }, {
                'name': ps.CAT_RESERVED_NAME,
                'type': 'independent',
                'from': []
            }],
            "group_id":
            0,
            "type":
            "single loss"
        }, {
            'headers': ["Year", "unit"],
            'categories': [{
                'name': ps.CAT_PREMIUM_NAME,
                'type': 'independent',
                'from': []
            }],
            "group_id":
            0,
            "type":
            "single loss"
        }]

        self.names = [
            "Premium_", "Premium", "Total Outstanding 2004", "Paid",
            "Total Incurred"
        ]
        self.dh = DataHolder()
        self.dh.add_sheet(self.names[0], pd.DataFrame(data=[0]),
                          pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[1], pd.DataFrame(data=[0]),
                          pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[2], pd.DataFrame(data=[0]),
                          pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[3], pd.DataFrame(data=[0]),
                          pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[4], pd.DataFrame(data=[0]),
                          pd.DataFrame(data=[0]))

    def testTextualMatching(self):
        self.dh, dummy, dummy = RowParser.set_card_ids(self.trngs, self.dh)
        InputMatcher.match_triangles_to_output(self.trngs, self.dh)
        self.assertEqual(
            self.dh.id_dict[self.trngs[0]["connection"][ps.CAT_PAID_NAME]
                            ["data_struct_ids"][0]].name, self.names[3])
        self.assertEqual(
            self.dh.id_dict[self.trngs[0]["connection"][ps.CAT_RESERVED_NAME]
                            ["data_struct_ids"][0]].name, self.names[2])
        self.assertEqual(
            self.dh.id_dict[self.trngs[1]["connection"][ps.CAT_PREMIUM_NAME]
                            ["data_struct_ids"][0]].name, self.names[1])
Example 25
    def post(self, request):

        # Receive the name of the file
        filename = request.data.get('fileName')
        # Build data holder
        sr_list = jsonpickle.decode(request.data['sr_list'])
        selected_sheets = request.data['selected_sheets']

        data_holder = DataHolder(filename)

        for sr in sr_list:
            if sr.sheet_name in selected_sheets:
                data_holder.add_sheet(sr.sheet_name,
                                      pd.DataFrame(columns=sr.headers,
                                                   data=sr.row_vals),
                                      pd.DataFrame(columns=sr.headers,
                                                   data=sr.xls_types),
                                      orig_sheet_name=sr.sheet_name)

        response_data = {}
        #data_holder.to_pickle_file(pdir.TEMP_DIR + "from_views.pickle")
        if data_holder is None:
            raise ValueError("No data holder found")
        elif data_holder.n == 0:
            raise ValueError("No sheets in data holder")

        # Receive triangle formats
        triangles = request.data.get('triangles')
        user_defined_triangles = triangles['templates']
        outputFormats = triangles['output_formats']
        n_outputs = triangles['number_of_outputs']
        input_format = triangles['inputFormat']
        tr_type = user_defined_triangles[0]['type']
        # if tr_type == "single":
        #     n_outputs = len(user_defined_triangles)
        # else:
        #     n_outputs = int(len(user_defined_triangles)/2)
        try:
            if input_format[0] == 'triangle':
                #print(tr_type, n_outputs)
                data_holder_dict, data_holder = TrianglePipeline.triangle_pipeline_dh(
                    data_holder, tri_type=tr_type, n_outputs=n_outputs)
            else:
                data_holder_dict, data_holder = TrianglePipeline.table_triangle_pipeline_dh(
                    data_holder)
            #DataHolder manipulation
            data_holder, group_ids, sheet_names = RowParser.set_card_ids(
                user_defined_triangles, data_holder)
            user_defined_triangles = InputMatcher.match_triangles_to_output(
                user_defined_triangles, data_holder)
            user_defined_triangles = RowParser.parse_output_from_triangle_forms(
                user_defined_triangles, data_holder)
        except DataHolderException as err:
            data = {}
            data['message'] = err.message
            data['dh'] = err.dh
            return Response({'response_error': data})

        SheetWriter.trngs_to_existing_excel(
            user_defined_triangles, pdir.TEMP_DIR + ps.OUTPUT_NAME + filename)

        # Unsure if all of this is needed
        response_data["group_ids"] = group_ids
        response_data['output_triangles'] = user_defined_triangles
        #Building list for initial rendering

        response_data["unit_triangles"] = \
            ConnectDataAPIView.make_unit_triangle_list(data_holder)
        response_data["str_data_holder"] = data_holder.encode()
        if len(data_holder_dict) > 1:
            response_data["str_data_holder_dict"] = {
                key: val.encode()
                for key, val in data_holder_dict.items()
            }
        else:
            response_data["str_data_holder_dict"] = {
                data_holder.name: response_data["str_data_holder"]
            }

        return Response({'data': response_data})
Example 26
class InputMatcherTest(TestCase):

    def setUp(self):

        self.names = ["Premium_", "Premium", "Total Outstanding 2004", "Paid", "Total Incurred"]
        self.dh = DataHolder("test")
        self.dh.add_sheet(self.names[0], pd.DataFrame(data=[0]), pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[1], pd.DataFrame(data=[0]), pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[2], pd.DataFrame(data=[0]), pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[3], pd.DataFrame(data=[0]), pd.DataFrame(data=[0]))
        self.dh.add_sheet(self.names[4], pd.DataFrame(data=[0]), pd.DataFrame(data=[0]))


    def testDistrMatching(self):
        test_distr = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5])
        test_distr = test_distr / mad(test_distr)
        test_distr = test_distr - np.mean(test_distr)
        distr = {ps.CAT_RESERVED_NAME: test_distr}
        dh = DataHolder("test")
        dh.add_sheet(self.names[0], pd.DataFrame(data=[1, 1, 1, 1, 2, 2, 2, 3, 3, 4]),
                     pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 10))
        dh.add_sheet(self.names[0], pd.DataFrame(data=[5, 9, 3, 7, 18]),
                     pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 5))
        matches = list()
        for ds_id in dh.id_dict:
            matches.append(InputMatcher.compare_with_distribution(
                ds_id, ps.CAT_RESERVED_NAME, dh, distr))
        # The first entry is more similar to the reference, so its match should score higher
        self.assertTrue(matches[0] > matches[1])

    def test_headers_are_padded(self):
        n_padded = 11
        headers = [str(num + 1).zfill(pp.N_DIGITS_HEADER_PADDING) + ". " + ps.HEADER_PLACE_HOLDER
                   for num in range(n_padded)]
        headers.append("date001.2")
        bools = InputMatcher.headers_are_padded(headers)
        truth = [True for i in range(n_padded)]
        truth.append(False)
        self.assertTrue(np.all(np.array(truth) == np.array(bools)))
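
# The padded headers in this fixture follow an "NNN. name" scheme built with
# str.zfill; assuming pp.N_DIGITS_HEADER_PADDING is 3:
N_DIGITS = 3  # stand-in for pp.N_DIGITS_HEADER_PADDING
headers = [str(num + 1).zfill(N_DIGITS) + ". col" for num in range(3)]
print(headers)  # ['001. col', '002. col', '003. col']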