def vertical_category_division(ds, new_dh_dict, meta_dh):
    """Split ``ds`` row-wise along columns that look like category labels.

    A column qualifies when its profile is mostly STRING and its value
    distribution looks categorical: the largest label's frequency lies
    strictly between the configured ratio bounds and the number of distinct
    labels is below ``pp.MAX_N_CATS``.  Rows are then partitioned by the
    label groups found by ``SubTriangler.component_finder`` (which merges
    slightly varying spellings of the same label).

    Parameters
    ----------
    ds : DataStruct
        Source sheet; provides ``df_data``, ``df_profiles`` and names.
    new_dh_dict : dict
        Maps column name -> DataHolder; mutated in place, one holder per
        qualifying category column.
    meta_dh : DataHolder
        Receives column-splits of groups that are unnamed or tiny
        (fewer than 4 rows), skipping all-empty splits.
    """
    df_data = ds.df_data
    df_profiles = ds.df_profiles
    orig_name = ds.orig_sheet_name
    # Bug fix: DataFrame.iteritems() was removed in pandas 2.0;
    # .items() is the drop-in replacement (available since pandas 0.25).
    for col_name, col in df_data.items():
        profile_vals = df_profiles[col_name].values
        string_ratio = (
            np.sum(profile_vals == SheetTypeDefinitions.STRING)
            / profile_vals.size
        )
        if string_ratio <= pp.MIN_STRING_RATIO_CAT_COL:
            continue
        # Check the column's "periodic" potential: repeated label values.
        string_col = col.astype(str)
        unique, counts = np.unique(string_col, return_counts=True)
        ratio = np.max(counts) / col.size
        # Largest label must be common, but not dominate the column,
        # and the label set must stay small.
        if not (pp.MIN_RATIO_LARGEST_CAT < ratio < pp.MAX_RATIO_LARGEST_CAT
                and len(unique) < pp.MAX_N_CATS):
            continue
        if col_name in new_dh_dict:
            new_dh = new_dh_dict[col_name]
        else:
            new_dh = DataHolder(col_name)
            new_dh_dict[col_name] = new_dh
        # Group near-identical label spellings under one canonical name.
        match_dict = SubTriangler.component_finder(unique)
        for name in match_dict:
            # Boolean row mask: row belongs to any spelling of this label.
            cond = np.array([
                string_col.values == sub_name
                for sub_name in match_dict[name]
            ]).any(axis=0)
            sub_df_data = df_data[cond].drop(columns=[string_col.name])
            sub_df_profiles = df_profiles[cond].drop(
                columns=[string_col.name])
            if name == "" or np.sum(cond) < 4:
                # Unnamed or tiny group: keep only non-empty column
                # splits, stored as meta data.
                new_ds = DataStruct(sub_df_data,
                                    sub_df_profiles,
                                    name,
                                    orig_sheet_name=orig_name)
                for split in new_ds.col_split_ds():
                    if not np.all(split.df_profiles ==
                                  SheetTypeDefinitions.EMPTY_STRING
                                  ) and not np.all(split.df_data == ""):
                        meta_dh.add_ds(split)
            else:
                new_dh.add_sheet(ds.name + " - " + name,
                                 sub_df_data,
                                 sub_df_profiles,
                                 orig_sheet_name=orig_name)
# Esempio n. 2
# 0
    def testIdentifyDateCols(self):
        """A column carrying the transformed-date marker must be identified;
        otherwise year-like string columns win."""
        marked = '1992' + ps.TRANSFORMED_DATE_COL_NAME
        frame = pd.DataFrame(data={'col1': [1, 1, 6],
                                   'col2': [6, 6, 6],
                                   '1991': [6, 6, 6],
                                   marked: [1, 6, 6]})
        struct = DataStruct(frame, frame, "test")
        flags = DateColIdentifier.identify_marked_date_cols(struct)
        hits = flags[flags].index
        self.assertTrue(hits[0] == marked)

        frame = pd.DataFrame(data={'col1': [1993, 1, 6, 1993, 1, 6, 1993, 1, 6],
                                   'col2': ["1994 Brum", "Hä", 2015, "1994 Brum", "mjua", 2015, "1994 Brum", "1999", 2015],
                                   '1991': [6, 6, 6, 6, 6, 6, 6, 6, 6]})
        struct = DataStruct(frame, frame, "test")
        flags = DateColIdentifier.identify_marked_date_cols(struct)
        hits = flags[flags].index
        self.assertTrue(hits[0] == 'col2')
    def remove_leading_rows(ds, dh_ind):
        """Cut every row whose index precedes ``dh_ind`` out of ``ds`` (in
        place) and return those rows as a new "_meta" DataStruct."""
        before_header = ds.df_data.index[ds.df_data.index < dh_ind]
        meta_ds = DataStruct(ds.df_data.loc[before_header, :],
                             ds.df_profiles.loc[before_header, :],
                             ds.name + "_meta")
        # Mutate the original struct so callers see the trimmed frames.
        ds.df_data.drop(before_header, inplace=True)
        ds.df_profiles.drop(before_header, inplace=True)
        return meta_ds
# Esempio n. 4
# 0
    def testRightAdjust(self):
        """right_adjust must shift values in the flagged columns without
        touching the unflagged first column."""
        frame = pd.DataFrame(data={'col1': ["one", "two", "three"], 'col2': [1, 1, 1], 'col3': [1, 0, 0], 'col4': [1, 0, 0]})
        prof = pd.DataFrame(data={'col1': [1, 1, 1], 'col2': [9, 9, 9], 'col3': [9, 9, 9], 'col4': [9, 9, 9]})
        struct = DataStruct(frame, prof, "test")
        flagged = pd.Series([False, True, True, True], index=["col1", "col2", "col3", "col4"])
        TriangleStripper.right_adjust(struct, flagged)

        expected = pd.DataFrame(data={'col1': ["one", "two", "three"], 'col2': [1, 0, 0], 'col3': [1, 1, 0], 'col4': [1, 0, 1]})
        self.assertTrue(struct.df_data.equals(expected))
# Esempio n. 5
# 0
    def triangle_score(dh, key, val):
        """Score how well two DataStructs merge into one triangle.

        Compares the mean triangle-ness of the two structs on their own
        against the triangle-ness of their concatenation; returns
        ``(exp(-difference), sum_of_individual_scores)``.
        """
        first = dh.id_dict[key]
        second = dh.id_dict[val]
        merged = DataStruct(
            pd.concat([first.df_data, second.df_data], sort=True),
            pd.concat([first.df_profiles, second.df_profiles], sort=True),
            "temp")

        score_first = TriangleFinder.is_triangle(first, yield_float=True)
        score_second = TriangleFinder.is_triangle(second, yield_float=True)
        score_merged = TriangleFinder.is_triangle(merged, yield_float=True)
        penalty = (score_first + score_second) / 2 - score_merged
        return np.exp(-penalty), score_first + score_second
# Esempio n. 6
# 0
 def test_identify_month_cols(self):
     """Only the first (all-multiples-of-12-ish) column should be flagged
     as a month column."""
     raw_rows = [[0, 12, 48, 24, 24, 36, 25, 72, 0, 36, 36],
                 [0, 1, 3, 5, 4, 4, 12, 12, 19, 23, np.nan],
                 ["", "", "", "", "", "", "", "", "", "", ""]]
     prof_rows = [[2] * 11,
                  [2] * 11,
                  [1] * 11]
     frame = pd.DataFrame(raw_rows)
     prof = pd.DataFrame(prof_rows)
     # A zeroed matrix shaped like the transposed profiles.
     zero_mat = prof.T.copy()
     zero_mat.loc[:, :] = 0
     struct = DataStruct(frame.T, prof.T, 'test')
     flagged = DateColIdentifier.identify_month_cols(struct, zero_mat)
     self.assertTrue(flagged.equals(pd.Series([True, False, False])))
    def remove_stray_rows(ds, h_ind):
        """Drop the leading rows of ``ds`` (in place) up to and including
        index ``h_ind`` and return them bundled as a meta DataStruct.

        Iteration stops at the first index greater than ``h_ind``, so only
        a contiguous leading run is removed.  Returns None when there is
        nothing to remove.
        """
        junk_rows = []
        for row_ind in ds.df_data.index:
            if row_ind > h_ind:
                break
            junk_rows.append(row_ind)
        if not junk_rows:
            return None

        # Preserve the stray rows as meta-data before dropping them.
        meta_ds = DataStruct(ds.df_data.loc[junk_rows, :],
                             ds.df_profiles.loc[junk_rows, :],
                             ds.name,
                             orig_sheet_name=ds.orig_sheet_name)
        ds.df_data.drop(junk_rows, inplace=True)
        ds.df_profiles.drop(junk_rows, inplace=True)
        return meta_ds
# Esempio n. 8
# 0
 def test_select_date_col(self):
     """Among several date-typed columns, the plain string-date column
     should be selected."""
     str_d = SheetTypeDefinitions.STRING_DATE
     xl_d = SheetTypeDefinitions.XL_DATE
     string_t = SheetTypeDefinitions.STRING

     frame = pd.DataFrame(
         {"string_dates": ["1995", 2007, "1998", 513, "2013"],
          "string_dates_empty": ["", "", "", "", ""],
          "xls_date": ["1995-06-03", "2007-06-03", "1999-06-03", "2001-06-03", "2015-06-03"],
          "bs": ["b", "s", "i", "s", "n"]
          }
     )
     prof = pd.DataFrame(
         {"string_dates": [str_d] * 5,
          "string_dates_empty": [str_d] * 5,
          "xls_date": [xl_d] * 5,
          "bs": [string_t] * 5
          }
     )
     struct = DataStruct(frame, prof, "test")
     first_row = struct.df_profiles.iloc[0, :]
     chosen = RowParser.select_date_col(first_row, struct)
     self.assertEqual(chosen, 'string_dates')
# Esempio n. 9
# 0
    def testRemoveStrayRows(self):
        """remove_stray_rows(ds, 0) must drop exactly the first row and keep
        the original index labels on the remainder."""
        frame = pd.DataFrame(data={'col1': ["1", "", "1991", "1992", "2007", "rew", "1993", "1994", "1995x"],
                                   'col2': [43, 1994, 2015, 1994, 7, 2015, 1994, 1999, 2015],
                                   '1991': [1993, 1, 6, 1993, 1, 6, 1993, 1, 6],
                                   '1992': ["g", "r", "h", "j", "t", "f", "6", "p", "6"],
                                   '1993': ["1", "1993", "6", "1993", "", "rew", "1993", "1994", ""]})
        prof = pd.DataFrame(data={'col1': [1] * 9,
                                  'col2': [2] * 9,
                                  '1991': [2] * 9,
                                  '1992': [1] * 9,
                                  '1993': [1] * 9})
        struct = DataStruct(frame, prof, 'test')

        TriangleHeaderFinder.remove_stray_rows(struct, 0)

        expected = pd.DataFrame(data={'col1': ["", "1991", "1992", "2007", "rew", "1993", "1994", "1995x"],
                                      'col2': [1994, 2015, 1994, 7, 2015, 1994, 1999, 2015],
                                      '1991': [1, 6, 1993, 1, 6, 1993, 1, 6],
                                      '1992': ["r", "h", "j", "t", "f", "6", "p", "6"],
                                      '1993': ["1993", "6", "1993", "", "rew", "1993", "1994", ""]},
                                index=[1, 2, 3, 4, 5, 6, 7, 8])
        self.assertTrue(struct.df_data.equals(expected))
 def perform_vertical_chop(dh, chop_bools, chop_lists):
     """Build a new DataHolder, row-slicing each flagged DataStruct at its
     cut points.

     ``chop_bools[i]`` says whether to chop the i-th struct; ``chop_lists[i]``
     holds its cut positions.  Structs that are unflagged, or whose cut list
     reaches ``pp.MAX_NUM_VERTICAL_CHOPS``, are passed through unchanged.
     Rows past the last cut position are not carried over (matching the
     original slicing scheme).
     """
     chopped = DataHolder(dh.name)
     for pos, ds in enumerate(dh):
         # Short-circuit keeps chop_lists[pos] untouched when the flag is off.
         if not chop_bools[pos] or len(chop_lists[pos]) >= pp.MAX_NUM_VERTICAL_CHOPS:
             chopped.add_ds(ds)
             continue
         bounds = [0] + chop_lists[pos].tolist()
         for start, stop in zip(bounds, bounds[1:]):
             piece = DataStruct(ds.df_data.iloc[start:stop, :],
                                ds.df_profiles.iloc[start:stop, :],
                                ds.name,
                                orig_sheet_name=ds.orig_sheet_name)
             chopped.add_ds(piece)
     return chopped