def build_triangle_from_table(dh):
    new_dh = DataHolder(dh.name)
    pool = DebuggablePool(pp.N_CORES)
    # First find all date cols and check whether one of them has the target structure.
    for ds in dh.data_struct_list:
        id_col, hori_date_col, vert_date_col = TriangleFromTableBuilder.do_the_magic(
            ds, pool)
        # Cut the table so that each id ends up in one row.
        cut_list = TriangleFromTableBuilder.make_cut_list(ds.df_data[id_col])
        # Use the cut_list to insert all elements.
        tr_cols = pd.Series(ds.df_profiles.iloc[0, :] ==
                            SheetTypeDefinitions.TRIANGLE_ELEMENT,
                            index=ds.df_profiles.columns)
        pad_header_mapping = TriangleFromTableBuilder.make_pad_header_mapping(
            ds, hori_date_col)
        vert_col_tup = (vert_date_col, ds.df_data[vert_date_col])
        hori_col_tup = (hori_date_col, ds.df_data[hori_date_col])
        id_col_tup = (id_col, ds.df_data[id_col])
        func = partial(TriangleFromTableBuilder.apply_cuts, cut_list,
                       vert_col_tup, hori_col_tup, id_col_tup,
                       pad_header_mapping)
        tr_col_tup_list = [(col_name, ds.df_data[col_name])
                           for col_name in tr_cols.index[tr_cols]]
        out = pool.map(func, tr_col_tup_list)
        for temp_df_data, temp_df_profiles, name in out:
            new_dh.add_sheet(name, temp_df_data, temp_df_profiles)
    pool.close()
    return new_dh
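A minimal usage sketch for the builder above, assuming only the DataHolder behavior visible elsewhere in this listing; the column names and profile values are illustrative, not from the source:

    import pandas as pd

    # Hypothetical flat claims table; df_profiles carries one type code per cell,
    # as in the test examples further down.
    df_data = pd.DataFrame({"claim_id": [1, 1, 2], "origin": [2000, 2000, 2001],
                            "dev": [2000, 2001, 2001], "paid": [10.0, 4.0, 7.0]})
    df_profiles = df_data.copy()  # placeholder type codes
    dh = DataHolder("claims")
    dh.add_sheet("sheet1", df_data, df_profiles)
    triangle_dh = build_triangle_from_table(dh)  # one output sheet per triangle column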
Example #2
    def name_and_scrub_triangle(dh, new_dh_dict, meta_dh=None):
        new_dh = DataHolder(dh.name)
        word_set_list = list()
        for ds in dh:
            word_set_list.append(
                SubTriangler.identify_category_name(ds, meta_dh))
        if meta_dh is not None and meta_dh.n > 0:
            SubTriangler.divide_meta_data(dh, meta_dh, word_set_list)
        # Find the most unique name
        for i in range(len(word_set_list)):
            ds = dh.data_struct_list[i]
            difference = word_set_list[i].copy()
            for j in range(len(word_set_list)):
                if j != i and ds.orig_sheet_name == dh.data_struct_list[
                        j].orig_sheet_name:
                    difference = difference.difference(word_set_list[j])
            if len(difference) > 0:
                stringified = sorted([str(el) for el in difference])
                name = " ".join(stringified)
                name = name.translate(SubTriangler.remove_digits)
            else:
                name = str(i)
            if ds.name != ds.orig_sheet_name:
                name = ds.name + " " + name

            new_dh.add_sheet(name,
                             ds.df_data,
                             ds.df_profiles,
                             orig_sheet_name=ds.orig_sheet_name)
        new_dh_dict[dh.name] = new_dh
Example #3
    def setUp(self):
        self.names = ["Premium_", "Premium", "Total Outstanding 2004", "Paid", "Total Incurred"]
        self.dh = DataHolder("test")
        for name in self.names:
            self.dh.add_sheet(name, pd.DataFrame(data=[0]), pd.DataFrame(data=[0]))
Example #4
    def make_sol_dict():
        """
        Run present pipeline and save the merge results
        :return:
        """
        file_names = ["FORMAT3_Copy of KommuneMTPLforTriangle.xls",
            "C Triangulations analysis R2017 GC20161109.xls",
            "EVOLUTION 2017 _ M+F - Triangles cat nat brut net.xls",
            "Bsp8 _ Dreiecke aus GCNA für CU1.4.1.xls",
            "Analysis MTPL MOD.xls",
            "Bsp6 _ Dreiecke aus GCNA für CU1.4.1.xls",
            "FORMAT6_sinistres.xls",
            "FORMAT1_LOSSES-MTPL-OVER-500-GROUP-2005_modified.xls"]
        solutions_dict = dict()
        raw_dict = dict()
        for file_name in file_names:
            sr_list, file_name = ExcelLoader.load_excel(pdir.RESOURCES_DIR + "/raw_test_files/" + file_name)
            dh = DataHolder()
            for sr in sr_list:
                dh.add_sheet(sr.sheet_name, pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                             pd.DataFrame(columns=sr.headers, data=sr.xls_types), orig_sheet_name=sr.sheet_name)

            dh = SheetPreProcessor.separate_components(dh)
            raw_dict[file_name] = dh.encode()
            dh = HorizontalMerger.horizontal_merge(dh)
            solutions_dict[file_name] = dh
        solutions_dict = MergePararametersOptimizer.make_ind_col_dict(solutions_dict)
        with open(pdir.RESOURCES_DIR + "/test/merge_solutions.obj", "wb") as temp_file:
            pickle.dump(solutions_dict, temp_file)
        with open(pdir.RESOURCES_DIR + "/test/raw_test.obj", "wb") as temp_file:
            pickle.dump(raw_dict, temp_file)
Example #5
    def numify_dates(dh):
        meta_dh = DataHolder(dh.name + "_meta")
        for ds in dh:
            d_cols = DateColIdentifier.identify_marked_date_cols(ds)
            date_data = ds.df_data[d_cols.index[d_cols]]

            meta_data = pd.DataFrame()
            meta_profiles = pd.DataFrame()
            for name, col in date_data.items():
                ds.df_profiles.loc[:, name] = SheetTypeDefinitions.STRING_DATE
                types = col.map(lambda x: isinstance(x, float))
                if types.all():
                    # Everything is already numeric; just cast to int.
                    ds.df_data[name] = col.astype(int)
                else:
                    # Back up the original column, then keep only the numeric part.
                    # NOTE: there is a known problem with keeping int types in the sheet.
                    meta_data[name] = col.copy()
                    meta_profiles[name] = ds.df_profiles[name].copy()
                    temp_col = col.copy()
                    temp_col[np.logical_not(types)] = temp_col[
                        np.logical_not(types)].map(longest_numeral)
                    temp_col = temp_col.astype(int)
                    ds.df_data[name] = temp_col
                    meta_dh.add_sheet("date_backup", meta_data, meta_profiles,
                                      orig_sheet_name=ds.orig_sheet_name)
        return dh, meta_dh
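longest_numeral is used above but never defined in this listing; a plausible sketch consistent with that usage (extracting the longest digit run from a mixed date string) might be:

    import re

    def longest_numeral(x):
        # E.g. "Q4 1994" -> 1994; returning 0 for digit-free input is an assumption.
        runs = re.findall(r"\d+", str(x))
        return int(max(runs, key=len)) if runs else 0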
Example #6
    def find_triangle_headers(dh, **kwargs):
        test_settings = kwargs.get('test_settings', False)
        return_meta = kwargs.get('return_meta', False)
        meta_dh = DataHolder(dh.name + "_meta")

        for ds in dh.data_struct_list:
            # only do this for potential triangles:
            if ds.df_data.shape[0] >= pp.MIN_ROWS_TRIANGLE or test_settings:
                headers, pd_ind = TriangleHeaderFinder.find_ds_headers(ds)
                HeaderFinder.insert_headers(headers, pd_ind, ds.df_data, ds.df_profiles)
                ds.df_data = ds.df_data.reindex(sorted(ds.df_data.columns), axis=1)
                ds.df_profiles = ds.df_profiles.reindex(sorted(ds.df_profiles.columns), axis=1)
                # now remove unnecessary rows
                ds = TriangleHeaderFinder.remove_stray_rows(ds, pd_ind)
                if return_meta and ds is not None:
                    for split in ds.col_split_ds():
                        if not np.all(split.df_profiles == SheetTypeDefinitions.EMPTY_STRING) and not (np.all(split.df_data == "")):
                            meta_dh.add_ds(split)
        if return_meta:
            return dh, meta_dh
        else:
            return dh
Example #7
    def merge_with_merges_list(dh, merges):
        new_dh = DataHolder(dh.name)
        for merge in merges:
            profiles = None
            data = None
            name_set = set()
            for ind in merge:
                ds = dh.id_dict[ind]
                name_set.add(ds.name)
                if profiles is None:
                    profiles = ds.df_profiles.copy()
                    data = ds.df_data.copy()
                else:
                    temp_profiles = ds.df_profiles.copy()
                    temp_data = ds.df_data.copy()
                    # TODO: generalize to positions other than the first
                    profiles = pd.concat([profiles, temp_profiles], sort=True)
                    profiles.fillna(SheetTypeDefinitions.ZERO_FLOAT,
                                    inplace=True)
                    data = pd.concat([data, temp_data], sort=True)
            new_dh.add_sheet("_".join(name_set),
                             data,
                             profiles,
                             orig_sheet_name=ds.orig_sheet_name)
        return new_dh
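A hedged usage sketch: merges is assumed to be a list of id groups, each id a key of dh.id_dict. Note that sheets absent from every group do not reach the returned holder:

    merges = [[0, 1]]  # hypothetical ids: merge sheets 0 and 1 into one sheet
    merged_dh = merge_with_merges_list(dh, merges)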
Example #8
    def vertical_category_division(ds, new_dh_dict, meta_dh):
        # Find the category column. Heuristics:
        # - should contain strings, for now (used)
        # - roughly periodic, i.e. repetitive entries (used)
        # - some entries may vary slightly (used)
        # - the period may vary slightly (not checked yet; should be checked in a new if statement)
        # - should get tag matches in the dict (not checked yet)
        df_data = ds.df_data
        df_profiles = ds.df_profiles
        orig_name = ds.orig_sheet_name
        for col_name, col in df_data.items():

            string_ratio = np.sum(
                df_profiles[col_name].values == SheetTypeDefinitions.STRING
            ) / df_profiles[col_name].values.size
            if string_ratio > pp.MIN_STRING_RATIO_CAT_COL:
                # check periodic potential
                string_col = col.astype(str)
                unique, counts = np.unique(string_col, return_counts=True)
                ratio = np.max(counts) / col.size
                if pp.MIN_RATIO_LARGEST_CAT < ratio < pp.MAX_RATIO_LARGEST_CAT and len(
                        unique) < pp.MAX_N_CATS:
                    if col_name in new_dh_dict:
                        new_dh = new_dh_dict[col_name]
                    else:
                        new_dh = DataHolder(col_name)
                        new_dh_dict[col_name] = new_dh
                    match_dict = SubTriangler.component_finder(unique)

                    # now load the new_dh

                    for name in match_dict:
                        cond = np.array([
                            string_col.values == sub_name
                            for sub_name in match_dict[name]
                        ]).any(axis=0)
                        sub_df_data = df_data[cond].drop(
                            columns=[string_col.name])
                        sub_df_profiles = df_profiles[cond].drop(
                            columns=[string_col.name])
                        if name == "" or np.sum(cond) < 4:
                            new_ds = DataStruct(sub_df_data,
                                                sub_df_profiles,
                                                name,
                                                orig_sheet_name=orig_name)
                            for split in new_ds.col_split_ds():
                                if not np.all(split.df_profiles ==
                                              SheetTypeDefinitions.EMPTY_STRING
                                              ) and not (np.all(
                                                  split.df_data == "")):
                                    meta_dh.add_ds(split)
                        else:
                            new_dh.add_sheet(ds.name + " - " + name,
                                             sub_df_data,
                                             sub_df_profiles,
                                             orig_sheet_name=orig_name)
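As a worked example of the ratio checks above (the pp thresholds are project constants not shown here): a 12-row column whose most frequent entry appears 4 times has ratio = 4/12 ≈ 0.33, so it is treated as a category column only if MIN_RATIO_LARGEST_CAT < 0.33 < MAX_RATIO_LARGEST_CAT and the number of distinct values stays below MAX_N_CATS.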
Example #9
    def testTurnTriangle(self):
        dh = DataHolder("test")
        data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [0, 0, 2], '1991': [0, 0, 0], '1992': [1, 0, 0]})
        prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6], '1991': [6, 6, 6], '1992': [2, 6, 6]})
        dh.add_sheet("test", data, prof)
        data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [1, 1, 0], '1991': [0, 0, 0], '1992': [1, 0, 0]})
        prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6], '1991': [6, 6, 6], '1992': [2, 6, 6]})
        dh.add_sheet("test", data, prof)
        data = pd.DataFrame(data={'col1': ["", "", 0], 'col2': [3, 1, 0], '1991': [0, 0, 0], '1992-': [1, 0, 0]})
        prof = pd.DataFrame(data={'col1': [1, 1, 6], 'col2': [6, 6, 6], '1991': [6, 6, 6], '1992-': [2, 6, 6]})
        dh.add_sheet("test", data, prof)
        tr_cols_dict = {
            tuple(dh.data_struct_list[0].df_data.columns) + dh.data_struct_list[0].df_data.shape:
                pd.Series([False, True, True, True], index=dh.data_struct_list[0].df_data.columns),
            tuple(dh.data_struct_list[2].df_data.columns) + dh.data_struct_list[0].df_data.shape:
                pd.Series([False, True, True, True], index=dh.data_struct_list[2].df_data.columns),
        }
        dh_copy = dh.copy_without_memory()
        TriangleStripper.turn_triangle(dh, tr_cols_dict, alt_min_score=0.6)

        for ds in dh_copy.data_struct_list:
            tr_cols = tr_cols_dict[tuple(ds.df_data.columns) + ds.df_data.shape]
            tri_part = ds.df_data[tr_cols.index[tr_cols]].values
            ds.df_data[tr_cols.index[tr_cols]] = np.transpose(tri_part)

        for ds, ds_copy in zip(dh.data_struct_list, dh_copy.data_struct_list):
            self.assertTrue(ds.df_data.equals(ds_copy.df_data))
            self.assertTrue(ds.df_profiles.equals(ds_copy.df_profiles))
Example #10
    def test_fill_hollow_str_cols(self):
        dh = DataHolder("test")
        df_data = pd.DataFrame(data={'col1': [1, "", "", "j", "", "", "6", "", "b",
                                              "g", "", "", "j", "", "", "6", "", "b", "", ""],
                                     '1992': ["", "", "", "j", "", "", "6", "", "b",
                                              "g", "", "", "j", "", "", "6", "", "b", "g", "hrumpff"]})

        df_profiles = pd.DataFrame(data={'col1': [2, 0, 0, 1, 0, 0, 1, 0, 1,
                                                  1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
                                         '1992': [0, 0, 0, 1, 0, 0, 1, 0, 1,
                                                  1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1]})
        dh.add_sheet("test", df_data, df_profiles)
        StringFiller.fill_hollow_str_cols(dh)
        dh_sol = DataHolder("test")
        df_data_sol = pd.DataFrame(data={'col1': [1, 1, 1, "j", "j", "j", "6", "6", "b",
                                                  "g", "g", "g", "j", "j", "j", "6", "6", "b", "b", "b"],
                                         '1992': ["", "", "", "j", "j", "j", "6", "6", "b",
                                                  "g", "g", "g", "j", "j", "j", "6", "6", "b", "g", "hrumpff"]})

        df_profiles_sol = pd.DataFrame(data={'col1': [2, 2, 2, 1, 1, 1, 1, 1, 1,
                                                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                                             '1992': [0, 0, 0, 1, 1, 1, 1, 1, 1,
                                                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

        dh_sol.add_sheet("test", df_data_sol, df_profiles_sol)
        self.assertTrue(dh.equals(dh_sol))
Example #11
    def setUp(self):
        self.names = ["first", "second"]
        self.dh = DataHolder("test")
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["3", "4"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
        self.dh.add_sheet(self.names[0], d1, d2, orig_sheet_name="1")
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["1", "1"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [1, 1]})
        self.dh.add_sheet(self.names[0], d1, d2, orig_sheet_name="2")
        d1 = pd.DataFrame(data={'col1': ["1", "2"], 'col2': ["15", "16"]})
        d2 = pd.DataFrame(data={'col1': [1, 2], 'col2': [15, 16]})
        self.dh.add_sheet(self.names[1], d1, d2, orig_sheet_name="2")
Example #12
def main(file, settings):
    print(file)
    sr_list, file_name = ExcelLoader.load_excel(file)
    dh = DataHolder(file_name)
    for sr in sr_list:
        dh.add_sheet(sr.sheet_name,
                     pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                     pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                     orig_sheet_name=sr.sheet_name)
    dummy, new_dh = TrianglePipeline.table_triangle_pipeline_dh(dh)
    temp_path = pdir.RESOURCES_DIR + "/temp/"
    new_dh.write_excel(temp_path + file_name)


def run_test_per_file_name(file_name):
    print(file_name)
    logger = logging.getLogger("svm_writer")
    sr_list, file_name = ExcelLoader.load_excel(file_name)
    dh = DataHolder(file_name.split(".")[0])
    for sr in sr_list:
        dh.add_sheet(sr.sheet_name,
                     pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                     pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                     orig_sheet_name=sr.sheet_name)
    dh = SheetPreProcessor.pre_strip(dh)
    DateColIdentifier.identify_and_gen_date_cols(dh,
                                                 replace_col=False,
                                                 svm_logger=logger)
Example #14
    def testFindTriangleHeaders(self):
        names = ["First", "Second"]
        dh = DataHolder("test")
        d1 = pd.DataFrame(data={'col1' + ps.HEADER_PLACE_HOLDER: ["1", "2", 1],
                                'col2' + ps.HEADER_PLACE_HOLDER: [3, "2", "3b"],
                                'col3' + ps.HEADER_PLACE_HOLDER: ["brum2", "4", 4],
                                'col4' + ps.HEADER_PLACE_HOLDER: [24, "4", "brum25"],
                                })
        d2 = d1.copy()
        d2.iloc[:, :] = 1

        dh.add_sheet(names[0], d1, d2)
        dh = TriangleHeaderFinder.find_triangle_headers(dh, test_settings=True)
        headers = list(dh.data_struct_list[0].df_data.columns)
        self.assertEqual(headers, ["col11", "col23", "col3brum2", "col424"])
Example #15
    def setUp(self):
        self.trngs = [{
            'headers': ["Year", "unit"],
            'categories': [{
                'name': 'Claim - Incurred',
                'type': 'sum',
                'from': [ps.CAT_PAID_NAME, ps.CAT_RESERVED_NAME]
            }, {
                'name': ps.CAT_PAID_NAME,
                'type': 'independent',
                'from': []
            }, {
                'name': ps.CAT_RESERVED_NAME,
                'type': 'independent',
                'from': []
            }],
            "group_id": 0,
            "type": "single loss"
        }, {
            'headers': ["Year", "unit"],
            'categories': [{
                'name': ps.CAT_PREMIUM_NAME,
                'type': 'independent',
                'from': []
            }],
            "group_id": 0,
            "type": "single loss"
        }]

        self.names = [
            "Premium_", "Premium", "Total Outstanding 2004", "Paid",
            "Total Incurred"
        ]
        self.dh = DataHolder()
        for name in self.names:
            self.dh.add_sheet(name, pd.DataFrame(data=[0]),
                              pd.DataFrame(data=[0]))
Example #16
    def find_headers(dh):
        meta_dh = DataHolder(dh.name + "_meta")
        for ds in dh:
            df_data, df_profiles = ds.df_data, ds.df_profiles
            bin_mat = df_profiles.values[:pp.N_POSSIBLE_HEADER_ROWS, :] == 1
            one_nums = np.sum(bin_mat, axis=1)
            # Subtract identical strings so duplicated cells don't inflate the score.
            for i in range(pp.N_POSSIBLE_HEADER_ROWS):
                sub = df_data.shape[1] - len(df_data.iloc[i, :].unique())
                one_nums[i] -= sub
            header_ind = np.argmax(one_nums)
            pd_ind = df_profiles.index[header_ind]
            headers = df_data.loc[[pd_ind]]
            HeaderFinder.insert_headers(headers, pd_ind, df_data, df_profiles)
            meta_ds = HeaderFinder.remove_leading_rows(ds, pd_ind)
            meta_dh.add_ds(meta_ds)
        return dh, meta_dh
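Worked example for the header scoring above (illustrative numbers): in a 6-column sheet, a candidate row with five string-typed cells starts at a score of 5; if two of its cells hold identical text, the row has only 5 unique values, so sub = 6 - 5 = 1 and the score drops to 4. The row with the highest adjusted score becomes the header row.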
Example #17
    def post(self, request):
        sr_list = jsonpickle.decode(request.data['sr_list'])
        dhName = request.data['dhName']
        selected_sheets = request.data['selected_sheets']

        data_holder = DataHolder(dhName)

        for sr in sr_list:
            if sr.sheet_name in selected_sheets:
                data_holder.add_sheet(sr.sheet_name,
                                      pd.DataFrame(columns=sr.headers,
                                                   data=sr.row_vals),
                                      pd.DataFrame(columns=sr.headers,
                                                   data=sr.xls_types),
                                      orig_sheet_name=sr.sheet_name)

        encoded = data_holder.encode()

        return Response(encoded, status=200)
Example #18
    def test_col_identification(self):
        df_data = pd.DataFrame(data={'col1': ["1", "", "1991", "1992", "2007", "rew", "1993", "1994", "1995x"],
                                     'col2': [43, 1994, 2015, 1994, 7, 2015, 1994, 1999, 2015],
                                     '1991': [1993, 1, 6, 1993, 1, 6, 1993, 1, 6],
                                     '1992': ["g", "r", "h", "j", "t", "f", "6", "p", "6"],
                                     '1993': ["1", "1993", "6", "1993", "", "rew", "1993", "1994", ""]})

        df_profiles = pd.DataFrame(data={'col1': [1, 1, 1, 1, 1, 1, 1, 1, 1],
                                         'col2': [2, 2, 2, 2, 2, 2, 2, 2, 2],
                                         '1991': [2, 2, 2, 2, 2, 2, 2, 2, 2],
                                         '1992': [1, 1, 1, 1, 1, 1, 1, 1, 1],
                                         '1993': [1, 1, 1, 1, 1, 1, 1, 1, 1]})
        dh = DataHolder('test')
        dh.add_sheet('test', df_data, df_profiles)
        dh = DateFiller.identify_and_gen_date_cols(dh, replace_col=False)
        dh = ColTypeIdentifier.identify_col_types(dh)
        profiles = dh.data_struct_list[0].df_profiles
        self.assertEqual(profiles.iloc[1, 1], SheetTypeDefinitions.STRING_DATE)
        self.assertEqual(profiles.iloc[0, 2], SheetTypeDefinitions.TRIANGLE_ELEMENT)
        self.assertEqual(profiles.iloc[0, 3], SheetTypeDefinitions.ID_ELEMENT)
Example #19
    def chop_triangles_horizontally(dh):
        """
        checks for repeating header and splits ds:s
        :param dh: DataHolder
        :return: DataHolder
        """
        chop, chop_lists = TriangleChopper.make_occurrence_list(dh)

        if not chop:
            return dh
        else:
            new_dh = DataHolder(dh.name)
            for ind, ds in enumerate(dh):
                occurrence_list = chop_lists[ind]
                if any(occurrence_list):
                    for i in range(1, np.max(occurrence_list) + 1):
                        bools = np.logical_or(occurrence_list == 0,
                                              occurrence_list == i)
                        df_data = ds.df_data[ds.df_data.columns[bools]].copy()
                        df_profiles = ds.df_profiles[
                            ds.df_profiles.columns[bools]].copy()
                        new_dh.add_sheet(ds.name,
                                         df_data,
                                         df_profiles,
                                         orig_sheet_name=ds.orig_sheet_name)
                else:
                    new_dh.add_ds(ds)
            return new_dh
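make_occurrence_list is not shown here; from the loop above, each occurrence_list is assumed to label columns shared by all repeats with 0 and each repeated header group with 1..n. A hedged sketch of the selection step:

    import numpy as np

    # Hypothetical labels for columns [id, year_a, paid_a, year_b, paid_b]:
    occurrence_list = np.array([0, 1, 1, 2, 2])
    for i in range(1, np.max(occurrence_list) + 1):
        bools = np.logical_or(occurrence_list == 0, occurrence_list == i)
        # i == 1 keeps columns 0-2; i == 2 keeps columns 0, 3 and 4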
Example #20
    def post(self, request):
        user_defined_triangles = request.data.get("output")
        input_json = request.data.get("input")
        dh = DataHolder.decode(input_json)
        change = request.data.get("change")
        filename = request.data.get("filename")
        # Update the connection with the change variable.
        RowParser.make_changes(dh, user_defined_triangles, change)
        user_defined_triangles = RowParser.parse_output_from_triangle_forms(
            user_defined_triangles, dh)
        SheetWriter.trngs_to_existing_excel(
            user_defined_triangles, pdir.TEMP_DIR + ps.OUTPUT_NAME + filename)
        return Response({'output': user_defined_triangles})
Example #21
    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        if self.request.session.get('data_holder'):
            sr_list = jsonpickle.decode(
                self.request.session.get('data_holder'))
            data_holder = DataHolder()
            for sr in sr_list:
                data_holder.add_sheet(
                    sr.sheet_name,
                    pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                    pd.DataFrame(columns=sr.headers, data=sr.xls_types))
            context["diff_dicts"], data_holder = CleaningPipeline.clean_data_dh(
                data_holder)
        else:
            sheets = DataSheet.objects.filter(
                owner=self.request.user).order_by('sheet_name')
            context["diff_dicts"], data_holder = CleaningPipeline.clean_data(
                sheets)

        return context
Example #22
    def find_triangles(dh, **kwargs):
        return_meta = kwargs.get('return_meta', False)

        triangle_dh = DataHolder(dh.name)
        rest_dh = DataHolder(dh.name + '_non-triangular')
        for ds in dh.data_struct_list:
            df_data, df_profiles = ds.df_data, ds.df_profiles
            # now select triangles in some smart way
            is_tri = TriangleFinder.is_triangle(ds, **kwargs)
            if is_tri:
                TriangleFinder.add_triangle_to_dh(ds, triangle_dh)
            else:
                rest_dh.add_sheet(ds.name,
                                  df_data,
                                  df_profiles,
                                  orig_sheet_name=ds.orig_sheet_name)
        # Now get the triangle-similar data structs
        triangle_similar = TriangleFinder.find_triangles_by_similarity(
            triangle_dh, rest_dh)
        if len(triangle_similar) > 0:
            rest_copy = rest_dh.copy_without_memory()
            rest_dh = DataHolder(rest_copy.name)
            for ds in rest_copy:
                if ds.id in triangle_similar:
                    TriangleFinder.add_triangle_to_dh(ds, triangle_dh)
                else:
                    rest_dh.add_sheet(ds.name,
                                      ds.df_data,
                                      ds.df_profiles,
                                      orig_sheet_name=ds.orig_sheet_name)
        if return_meta:
            return triangle_dh, rest_dh
        else:
            return triangle_dh
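Usage follows from the kwargs handling above; with return_meta=True the non-triangular remainder comes back alongside the triangles:

    triangle_dh, rest_dh = TriangleFinder.find_triangles(dh, return_meta=True)
    triangle_dh = TriangleFinder.find_triangles(dh)  # triangles only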
Example #23
    def run_test_per_file_name(in_obj, in_tup, form):
        """
        Performs a fixture test for one file. The encapsulators determine the scope of the test.
        :param in_obj:
        :param in_tup:
        :param form:
        :return:
        """
        if not pp.LOG_SVM_FEATURES:
            print(in_tup)
        sr_list, file_name = ExcelLoader.load_excel(pdir.RESOURCES_DIR +
                                                    "/raw_test_files/" +
                                                    in_tup[0])
        dh = DataHolder(file_name.split(".")[0])
        for sr in sr_list:
            dh.add_sheet(sr.sheet_name,
                         pd.DataFrame(columns=sr.headers, data=sr.row_vals),
                         pd.DataFrame(columns=sr.headers, data=sr.xls_types),
                         orig_sheet_name=sr.sheet_name)
        # Choose an encapsulator class to determine the test scope:
        # dhce = DataHolderCallTestEncapsulator(pdir.RESOURCES_DIR + '/temp/pickles/', pdir.RESOURCES_DIR + '/test/pickles/')
        # dhce = DataHolderCallSaveEncapsulator(pdir.RESOURCES_DIR + '/test/pickles/')
        # dhce = DataHolderCallOutputEncapsulator(pdir.RESOURCES_DIR + '/left_triangles/' + in_tup[0], in_tup[1])
        dhce = DataHolderCallEncapsulator()
        # dhce = DataHolderCallTimeEncapsulator()
        if form == "triangle_table":
            dummy, dh = TrianglePipeline.table_triangle_pipeline_dh(dh, dhce)
        elif form == "triangle":
            dummy, dh = TrianglePipeline.triangle_pipeline_dh(
                dh,
                dhce,
                tri_type=in_tup[1]["tri_type"],
                n_outputs=in_tup[1]["n_outputs"])
        elif form == "cleaning":
            dummy, dh = CleaningPipeline.clean_data_dh(dh)
        ToDiscComparer.compare_to_disc(in_obj, file_name, dh)
Example #24
    def test_serialization(self):
        self.dh.data_struct_list[0].roles.append("Claims Paid")
        self.dh.data_struct_list[0].df_data.sort_values("col1", ascending=False, inplace=True)
        serialized = self.dh.encode()
        data_framed = DataHolder.decode(serialized)

        assert_frame_equal(self.dh.data_struct_list[0].df_data, data_framed.data_struct_list[0].df_data)
        assert_frame_equal(self.dh.data_struct_list[1].df_data, data_framed.data_struct_list[1].df_data)
        assert_frame_equal(self.dh.data_struct_list[2].df_data, data_framed.data_struct_list[2].df_data)

        assert_frame_equal(self.dh.data_struct_list[0].df_profiles, data_framed.data_struct_list[0].df_profiles)
        assert_frame_equal(self.dh.data_struct_list[1].df_profiles, data_framed.data_struct_list[1].df_profiles)
        assert_frame_equal(self.dh.data_struct_list[2].df_profiles, data_framed.data_struct_list[2].df_profiles)
        self.assertEqual(data_framed.data_struct_list[0].roles[0], "Claims Paid")
        # Test conservation of ids
        for ind in range(len(self.dh.data_struct_list)):
            self.assertEqual(data_framed.data_struct_list[ind].id, self.dh.data_struct_list[ind].id)
Example #25
    def encapsulate_call(self, function, dh, **kwargs):
        out = function(dh, **kwargs)

        if isinstance(out, DataHolder):
            dh = out
        elif isinstance(out, tuple):
            dh = out[0]
        # Read the true solution.
        sol = DataHolder.from_pickle_file(self.sol_path + dh.name + "_" +
                                          str(self.counter) + ".pickle")
        dh = dh.merge_in_original_sheets(save_sheet_names=True)
        if not dh.equals(sol):
            print(function)
            dh.write_excel(self.out_path + "candidate.xls")
            sol.write_excel(self.out_path + "solution.xls")
            self.assertTrue(dh.equals(sol))
        self.counter += 1
        return out
Example #26
    def perform_horizontal_merge(dh, distances):
        new_dh = DataHolder(dh.name)
        # Make a greedy merge.
        merged_set = set()
        for part in distances:
            if part[0] > pp.MAX_HORIZONTAL_MERGE_DISTANCE:
                break
            merge = [part[1]] + part[2]
            if len(set(merge).intersection(merged_set)) == 0:
                if len(part[2]) > 1:
                    # Merge vertically!
                    df_data_list = [
                        dh.id_dict[df_id].df_data for df_id in part[2]
                    ]
                    df_profiles_list = [
                        dh.id_dict[df_id].df_profiles for df_id in part[2]
                    ]
                    df_data = pd.concat(df_data_list, axis=0, sort=True)
                    df_profiles = pd.concat(df_profiles_list,
                                            axis=0,
                                            sort=True)
                else:
                    df_data = dh.id_dict[part[2][0]].df_data
                    df_profiles = dh.id_dict[part[2][0]].df_profiles
                df_data = pd.concat([dh.id_dict[part[1]].df_data, df_data],
                                    axis=1,
                                    sort=True)
                df_profiles = pd.concat(
                    [dh.id_dict[part[1]].df_profiles, df_profiles],
                    axis=1,
                    sort=True)
                df_data = df_data.reindex(sorted(df_data.columns), axis=1)
                df_profiles = df_profiles.reindex(sorted(df_profiles.columns),
                                                  axis=1)
                merged_set.update(merge)
                new_dh.add_sheet(
                    dh.id_dict[merge[0]].name,
                    df_data,
                    df_profiles,
                    orig_sheet_name=dh.id_dict[merge[0]].orig_sheet_name)
        # Add the remaining, unmerged sheets.
        for id_key in dh.id_dict:
            if id_key not in merged_set:
                new_dh.add_sheet(
                    dh.id_dict[id_key].name,
                    dh.id_dict[id_key].df_data,
                    dh.id_dict[id_key].df_profiles,
                    orig_sheet_name=dh.id_dict[id_key].orig_sheet_name)
        return new_dh
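From the indexing above, each entry of distances is assumed to be a tuple (distance, base_id, partner_id_list), sorted ascending by distance since the loop breaks at the first entry over the threshold. A hedged sketch with hypothetical ids:

    # Stack sheets 3 and 5 vertically, then merge the result horizontally onto sheet 7.
    distances = [(0.2, 7, [3, 5])]
    merged_dh = perform_horizontal_merge(dh, distances)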
Example #27
    def perform_vertical_chop(dh, chop_bools, chop_lists):
        new_dh = DataHolder(dh.name)
        for ind, ds in enumerate(dh):
            if chop_bools[ind]:
                cut = chop_lists[ind]
                # Don't cut too much.
                if len(cut) < pp.MAX_NUM_VERTICAL_CHOPS:
                    cut = [0] + cut.tolist()
                    for i in range(len(cut) - 1):
                        temp_df_data = ds.df_data.iloc[cut[i]:cut[i + 1], :]
                        temp_df_profiles = ds.df_profiles.iloc[
                            cut[i]:cut[i + 1], :]
                        new_ds = DataStruct(temp_df_data,
                                            temp_df_profiles,
                                            ds.name,
                                            orig_sheet_name=ds.orig_sheet_name)
                        new_dh.add_ds(new_ds)
                else:
                    new_dh.add_ds(ds)
            else:
                new_dh.add_ds(ds)
        return new_dh
Example #28
    def testDistrMatching(self):
        test_distr = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5])
        test_distr = test_distr / mad(test_distr)
        test_distr = test_distr - np.mean(test_distr)
        distr = {
            ps.CAT_RESERVED_NAME: test_distr
        }
        dh = DataHolder("test")
        dh.add_sheet(self.names[0], pd.DataFrame(data=[1, 1, 1, 1, 2, 2, 2, 3, 3, 4]),
                     pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 10))
        dh.add_sheet(self.names[0], pd.DataFrame(data=[5, 9, 3, 7, 18]),
                     pd.DataFrame(data=[SheetTypeDefinitions.TRIANGLE_ELEMENT] * 5))
        matches = list()
        for sheet_id in dh.id_dict:
            matches.append(InputMatcher.compare_with_distribution(
                sheet_id, ps.CAT_RESERVED_NAME, dh, distr))
        # The first entry is more similar to the reference, so its match should score higher.
        self.assertTrue(matches[0] > matches[1])
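mad is never imported in these snippets; one plausible reading (an assumption, not confirmed by the source) is the median absolute deviation used to scale the reference distribution, e.g.:

    # Assumption: mad denotes the median absolute deviation; scipy offers one.
    from scipy.stats import median_abs_deviation as mad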
Example #29
    def test_output(self):
        path = pdir.RESOURCES_DIR + "left_triangles/outtake/"
        for file in os.listdir(path):
            if file.endswith(".pickle"):
                print(file)
                with open(path + file, 'rb') as f:
                    read_data = pickle.load(f)
                dh = DataHolder.decode(read_data["DataHolder"])
                info_dict = read_data["extra_content"]
                print(info_dict)
                user_defined_triangles = OutputTriangleParser.generate_output_triangles(
                    info_dict)
                data_holder, group_ids, sheet_names = RowParser.set_card_ids(
                    user_defined_triangles, dh)
                user_defined_triangles = InputMatcher.match_triangles_to_output(
                    user_defined_triangles, data_holder)
                user_defined_triangles = RowParser.parse_output_from_triangle_forms(
                    user_defined_triangles, data_holder)
                head, sep, tail = file.partition(".xls")
                SheetWriter.trngs_to_excel(user_defined_triangles, head + sep)
Example #30
    def post(self, request):
        # Need to post - str_data_holder, output triangles (templates)

        str_data_holder = request.data.get('str_data_holder')
        data_holder = DataHolder.decode(str_data_holder)

        response_data = {}

        if data_holder is None:
            raise ValueError("No data holder found")
        elif data_holder.n == 0:
            raise ValueError("No sheets in data holder")

        # Receive triangle formats
        user_defined_triangles = request.data.get('templates')

        try:
            # DataHolder manipulation
            data_holder, group_ids, sheet_names = RowParser.set_card_ids(
                user_defined_triangles, data_holder)
            user_defined_triangles = InputMatcher.match_triangles_to_output(
                user_defined_triangles, data_holder)
            user_defined_triangles = RowParser.parse_output_from_triangle_forms(
                user_defined_triangles, data_holder)
        except DataHolderException as err:
            data = {'message': err.message, 'dh': err.dh}
            return Response({'response_error': data})

        response_data["group_ids"] = group_ids
        response_data['output_triangles'] = user_defined_triangles
        response_data["unit_triangles"] = ChangeDimensionAPIView.make_unit_triangle_list(
            data_holder)

        return Response({'data': response_data})