def test_extra_fields_pd(self):
    # Write a three-data-field table, then read it back through a
    # narrower schema that omits field "c"; the extra column in the
    # database must simply be ignored by the narrow reader.
    wide_pdf = PanDatFactory(boger=[["a"], ["b", "c"]])
    wide_dat = wide_pdf.PanDat(boger=pd.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": ['a', 'b', 'c']}))
    schema = "test_pd_extra_fields"
    wide_pdf.pgsql.write_schema(
        self.engine, schema,
        forced_field_types={("boger", "c"): "text", ("boger", "a"): "float"})
    wide_pdf.pgsql.write_data(wide_dat, self.engine, schema)
    narrow_pdf = PanDatFactory(boger=[["a"], ["b"]])
    narrow_dat = narrow_pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(list(narrow_dat.boger["a"]) == [1.0, 2.0, 3.0]
                    and list(narrow_dat.boger["b"]) == [4.0, 5.0, 6.0])
    # Writing fresh rows through the narrow schema: the untouched "c"
    # column should come back holding a single (default) value.
    replacement = narrow_pdf.PanDat(boger=pd.DataFrame(
        {"a": [10, 300], "b": [40, 60]}))
    narrow_pdf.pgsql.write_data(replacement, self.engine, schema)
    round_trip = wide_pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(list(round_trip.boger["a"]) == [10, 300]
                    and list(round_trip.boger["b"]) == [40, 60])
    self.assertTrue(len(set(round_trip.boger["c"])) == 1)
def testNullsPd(self):
    """Nulls round-trip through postgres; with an infinity io flag of
    None they read back as +/-inf according to the field's bounds."""
    nullable_pdf = PanDatFactory(table=[[], ["field one", "field two"]])
    for field in ["field one", "field two"]:
        nullable_pdf.set_data_type("table", field, nullable=True)
    original = nullable_pdf.PanDat(table={
        "field one": [None, 200, 0, 300, 400],
        "field two": [100, 109, 300, None, 0]})
    schema = test_schema + "_bool_defaults_pd"
    nullable_pdf.pgsql.write_schema(self.engine, schema,
                                    include_ancillary_info=False)
    nullable_pdf.pgsql.write_data(original, self.engine, schema)
    reread = nullable_pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(nullable_pdf._same_data(
        original, reread, nans_are_same_for_data_rows=True))
    # With infinity io flag None and an inclusive +inf max, the nulls
    # already in the database should read back as +inf.
    pos_inf_pdf = PanDatFactory(table=[["field one"], ["field two"]])
    for field in ["field one", "field two"]:
        pos_inf_pdf.set_data_type("table", field, max=float("inf"),
                                  inclusive_max=True)
    pos_inf_pdf.set_infinity_io_flag(None)
    expected_pos_inf = pos_inf_pdf.PanDat(table={
        "field one": [float("inf"), 200, 0, 300, 400],
        "field two": [100, 109, 300, float("inf"), 0]})
    reread = pos_inf_pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(pos_inf_pdf._same_data(expected_pos_inf, reread))
    # The inf-bearing data itself must also survive a full round trip.
    pos_inf_pdf.pgsql.write_data(expected_pos_inf, self.engine, schema)
    reread = pos_inf_pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertTrue(pos_inf_pdf._same_data(expected_pos_inf, reread))
    # Symmetrically, an inclusive -inf min makes nulls read back as -inf.
    neg_inf_pdf = PanDatFactory(table=[["field one"], ["field two"]])
    for field in ["field one", "field two"]:
        neg_inf_pdf.set_data_type("table", field, min=-float("inf"),
                                  inclusive_min=True)
    neg_inf_pdf.set_infinity_io_flag(None)
    reread = neg_inf_pdf.pgsql.create_pan_dat(self.engine, schema)
    self.assertFalse(neg_inf_pdf._same_data(expected_pos_inf, reread))
    expected_neg_inf = neg_inf_pdf.PanDat(table={
        "field one": [-float("inf"), 200, 0, 300, 400],
        "field two": [100, 109, 300, -float("inf"), 0]})
    self.assertTrue(neg_inf_pdf._same_data(expected_neg_inf, reread))
class Outlier(Action):
    """Flags outliers in columns with numerical values (not categorical).

    Not meant for multi-variate outlier detection; can be extended in
    future.  Supported detection methods: z-score, inter-quartile range
    (IQR), and an explicit "lb,ub" range.
    """

    def __init__(self):
        # One config row per (table, column) to analyze.  'Value' holds
        # the method parameter: the z threshold, the IQR multiplier, or
        # a comma-separated "lb,ub" pair for the 'range' method.
        self.config_schema = PanDatFactory(action_settings=[
            [], ['Table', 'Column', 'Method', 'Value', 'Flag Column']
        ])
        self.config_defaults = self.config_schema.PanDat(
            action_settings=[{
                'Table': 'data',
                'Column': 'column1',
                'Method': 'zScore',
                'Value': '3',
                'Flag Column': 'flag'
            }])

    @staticmethod
    def _numeric_values(table_df, column):
        """Return *column* of *table_df* coerced to a float numpy array."""
        return np.array([float(item) for item in table_df[column]])

    @staticmethod
    def _apply_flag(table_df, col_to_analyze, flag_column, is_outlier):
        """OR each row's existing flag with is_outlier(numeric value).

        Flags accumulate: a row already flagged by a previous config row
        stays flagged regardless of what *is_outlier* returns.
        """
        table_df[flag_column] = table_df.apply(
            lambda row: bool(row[flag_column])
            or bool(is_outlier(float(row[col_to_analyze]))),
            axis=1)

    def execute_action(self):
        """ Performs outlier/anomaly detection on fields/columns based on
            zScore, inter-quartile range or bounds
        """
        config_dfs = self.read_data('config_schema')
        # parsing action config table into a dataframe
        action_config_df = config_dfs.action_settings

        # Collect, per table, every analyzed column AND every flag column
        # so the data schema covers all of them.  (Bug fix: flag columns
        # for repeat appearances of a table were previously dropped, since
        # table_flag_col_dict was only populated on first occurrence.)
        table_col_dict = {}
        table_flag_col_dict = {}
        for _, series in action_config_df.iterrows():
            table = series['Table']
            table_col_dict.setdefault(table, set()).add(str(series['Column']))
            table_flag_col_dict.setdefault(table, set()).add(
                str(series['Flag Column']))

        self.data_schema = PanDatFactory(
            **{
                table: [[], list(table_col_dict[table])
                        + list(table_flag_col_dict[table])]
                for table in table_col_dict
            })
        table_dfs = self.read_data('data_schema')

        # processing data in dataframe, one config row at a time
        for i in range(action_config_df.shape[0]):
            table_name = action_config_df['Table'].iloc[i]
            col_to_analyze = action_config_df['Column'].iloc[i]
            flag_column = action_config_df['Flag Column'].iloc[i]
            method = action_config_df['Method'].iloc[i]
            value = action_config_df['Value'].iloc[i]

            if method == 'zScore':
                z_threshold = float(value)
                table_df = getattr(table_dfs, table_name)
                values = self._numeric_values(table_df, col_to_analyze)
                std_dev = np.std(values)
                mean = np.mean(values)
                # NOTE: a constant column gives std_dev == 0; the z-score
                # is then nan (numpy division semantics, no exception) and
                # no rows get flagged — same outcome as the original code.
                self._apply_flag(
                    table_df, col_to_analyze, flag_column,
                    lambda v: abs((v - mean) / std_dev) > z_threshold)
            elif method == 'IQR':
                iqr_multiplier = float(value)
                table_df = getattr(table_dfs, table_name)
                values = self._numeric_values(table_df, col_to_analyze)
                q75, q25 = np.percentile(values, [75, 25])
                iqr = q75 - q25
                upper = q75 + iqr_multiplier * iqr
                lower = q25 - iqr_multiplier * iqr
                self._apply_flag(table_df, col_to_analyze, flag_column,
                                 lambda v: v > upper or v < lower)
            elif method == 'range':
                # 'Value' is "lb,ub"; hoist the float conversions out of
                # the per-row lambda.
                lb, ub = (float(bound) for bound in value.split(','))
                table_df = getattr(table_dfs, table_name)
                self._apply_flag(table_df, col_to_analyze, flag_column,
                                 lambda v: v > ub or v < lb)
            else:
                print("Error: Please enter a valid value in Method to use ("
                      "zScore, IQR, range).")

        # writing data
        self.write_data(table_dfs)
        # Bug fix: removed the trailing exit() call, which terminated the
        # whole interpreter as soon as this action finished.