def save_column_names_row(parms, display=True):
    """
    Save the current transform dataframe's column names to a json file.

    Parameters:
        parms   - transform input parms (first entry is the output filename)
        display - when True, add a replayable entry to the script log

    Returns:
        [opstat, filename] - operation status and the file written
    """
    opstat = opStatus()

    # pre-set so the except clause can reference filename even when
    # get_parms_for_input itself raises (previously a NameError)
    filename = ""

    try:
        fparms = get_parms_for_input(parms, dftw.df_save_row_transform_input_idList)
        filename = fparms[0]

        if len(filename) == 0:
            # default to a name derived from the current imported data source
            filename = "./" + cfg.get_config_value(cfg.CURRENT_IMPORTED_DATA_SOURCE_KEY)

        filename = filename + "_column_names.json"

        colids = cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns.tolist()

        # save the column names as a json list; the with-block closes the file
        with open(filename, 'w') as colid_file:
            json.dump(colids, colid_file)

        if display:
            # make scriptable (the import keyword was missing, so the
            # generated replay script was a SyntaxError)
            add_to_script(["# save column names row",
                           "from dfcleanser.data_transform.data_transform_dataframe_control import save_column_names_row",
                           "save_column_names_row(" + json.dumps(parms) + ",False)"], opstat)

    except Exception as e:
        opstat.store_exception("Unable to save column names file to : " + filename, e)

    return ([opstat, filename])
def display_inspect_nans():
    """
    Render the NaN-inspection view: tables of the rows and the columns
    containing the most NaN values in the current inspection dataframe.
    """
    rows_table = dcTable("Rows with most NaNs", "nansrowTable", cfg.DataInspection_ID)
    cols_table = dcTable("Columns with most NaNs", "nansTable", cfg.DataInspection_ID)

    current_df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
    diw.display_null_data(current_df, rows_table, cols_table, 120)
def remwhitespace_column_names_row(parms, display=True):
    """
    Remove leading/trailing whitespace from the column names of the
    current transform dataframe.

    Parameters:
        parms   - transform parms (kept for taskbar-dispatch compatibility)
        display - when True, add a replayable entry to the script log

    Returns:
        opstat - operation status

    NOTE(review): the previous implementation returned immediately and left
    dead code that dropped every column; this implements the behavior the
    caller reports ("Column Names Whitespace Removed Successfully").
    """
    opstat = opStatus()

    try:
        df = cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF)

        # strip surrounding whitespace from every column name
        df.columns = [str(col).strip() for col in df.columns.tolist()]

        if display:
            # make scriptable
            add_to_script(["# remove whitespace from column names",
                           "from dfcleanser.data_transform.data_transform_dataframe_control import remwhitespace_column_names_row",
                           "remwhitespace_column_names_row(" + json.dumps(parms) + ",False)"], opstat)

    except Exception as e:
        opstat.store_exception("Unable to remove whitespace from column names ", e)

    return (opstat)
def display_inspect_categories():
    """
    Render the category-inspection view: a table of existing category
    columns and a table of candidate category columns for the current
    inspection dataframe.
    """
    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:
        cat_table = dcTable("Category Columns", "catcolsTable", cfg.DataInspection_ID)
        cand_table = dcTable("Category Candidate Columns", "catcandcolsTable", cfg.DataInspection_ID)

        inspect_df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
        numcats, numcands = diw.display_df_categories(inspect_df, cat_table, cand_table)

    except Exception as e:
        opstat.store_exception("Error displaying category data\n ", e)

    clock.stop()

    if not opstat.get_status():
        display_exception(opstat)
def display_inspect_cols(parms):
    """
    Display the column-inspection view: per-column statistics on one side
    of a grid and a column-details input form on the other.

    Parameters:
        parms - selected column name, or None to default to the first column

    Returns:
        N/A
    """

    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:

        df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
        colnames = df.columns.tolist()

        # default to the first column when no column was selected
        if (not (parms is None)):
            colname = parms
        else:
            colname = colnames[0]

        # select-list definition for the column-name dropdown
        cnames = {
            'default': colname,
            'list': colnames,
            "callback": "change_inspect_cols_col",
            "size": 10
        }

        # numeric and non-numeric columns use different detail forms
        if (is_numeric_col(df, colname)):
            coldetails_form = InputForm(diw.inspect_col_input_id,
                                        diw.inspect_col_input_idList,
                                        diw.inspect_col_input_labelList,
                                        diw.inspect_col_input_typeList,
                                        diw.inspect_col_input_placeholderList,
                                        diw.inspect_col_input_jsList,
                                        diw.inspect_col_input_reqList)
        else:
            coldetails_form = InputForm(
                diw.inspect_nn_col_input_id,
                diw.inspect_nn_col_input_idList,
                diw.inspect_nn_col_input_labelList,
                diw.inspect_nn_col_input_typeList,
                diw.inspect_nn_col_input_placeholderList,
                diw.inspect_nn_col_input_jsList,
                diw.inspect_nn_col_input_reqList)

        selectDicts = []
        selectDicts.append(cnames)

        # NOTE(review): select defaults always come from the numeric-form id
        # lists, even when the non-numeric form was built - confirm intended
        get_select_defaults(coldetails_form, diw.inspect_col_input_id,
                            diw.inspect_col_input_idList,
                            diw.inspect_col_input_typeList, selectDicts)

        coldetails_form.set_shortForm(True)
        coldetails_form.set_fullparms(True)

        # grid width and button styling differ between inline and pop-up modes
        if (cfg.get_dfc_mode() == cfg.INLINE_MODE):
            coldetails_form.set_gridwidth(360)

            if (is_numeric_col(df, colname)):
                coldetails_form.set_buttonstyle({
                    "font-size": 12,
                    "height": 75,
                    "width": 85,
                    "left-margin": 2
                })
            else:
                coldetails_form.set_buttonstyle({
                    "font-size": 12,
                    "height": 75,
                    "width": 85,
                    "left-margin": 75
                })
        else:
            coldetails_form.set_gridwidth(480)

            if (is_numeric_col(df, colname)):
                coldetails_form.set_buttonstyle({
                    "font-size": 12,
                    "height": 75,
                    "width": 110,
                    "left-margin": 2
                })
            else:
                coldetails_form.set_buttonstyle({
                    "font-size": 12,
                    "height": 75,
                    "width": 110,
                    "left-margin": 110
                })

        coldetails_html = coldetails_form.get_html()

        from dfcleanser.data_cleansing.data_cleansing_widgets import display_col_stats
        col_stats_html = display_col_stats(df, colname, False, True)

        # two-pane grid : stats on the left, details form on the right
        gridclasses = ["dfc-left", "dfc-right"]
        gridhtmls = [col_stats_html, coldetails_html]

        if (cfg.get_dfc_mode() == cfg.INLINE_MODE):
            display_generic_grid("df-inspection-column-data-wrapper",
                                 gridclasses, gridhtmls)
        else:
            display_generic_grid("df-inspection-pop-up-column-data-wrapper",
                                 gridclasses, gridhtmls)

    except Exception as e:
        opstat.store_exception("Error displaying column data\n ", e)

    clock.stop()

    if (not (opstat.get_status())):
        display_exception(opstat)
def display_inspect_rows(rowid=0):
    """
    Render the row-inspection view: the current df index, row statistics,
    a 200-row sample window starting at rowid, and the open-in-excel
    taskbar.

    Parameters:
        rowid - first row of the sample window to display

    Returns:
        N/A
    """
    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:
        print("\n")

        from dfcleanser.data_transform.data_transform_dataframe_widgets import display_current_df_index

        inspect_df = cfg.get_current_chapter_df(cfg.DataInspection_ID)

        display_current_df_index(
            inspect_df,
            cfg.get_current_chapter_dfc_df_title(cfg.DataInspection_ID),
            0, True)

        stats_html = diw.display_row_stats(
            inspect_df,
            cfg.get_config_value(cfg.CURRENT_INSPECTION_DF),
            False)

        sample_html = dim.display_df_rows(inspect_df, rowid, 200)

        # open-in-excel taskbar rendered below the sample rows
        excel_tb = diw.get_inspection_openexcel_taskbar()
        excel_tb.set_gridwidth(620)
        excel_tb.set_customstyle({
            "font-size": 13,
            "height": 90,
            "width": 120,
            "left-margin": 10
        })
        excel_html = excel_tb.get_html() + "<br>"

        # remember the scroll position for the scroll up/down options
        cfg.set_config_value(cfg.CURRENT_SCROLL_ROW_KEY, rowid)

        gridclasses = ["dfc-top", "dfc-bottom", "dfc-footer"]
        gridhtmls = [stats_html, sample_html, excel_html]

        if cfg.get_dfc_mode() == cfg.INLINE_MODE:
            display_generic_grid("df-inspection-row-data-wrapper",
                                 gridclasses, gridhtmls)
        else:
            display_generic_grid("df-inspection-row-data-pop-up-wrapper",
                                 gridclasses, gridhtmls)

    except Exception as e:
        opstat.store_exception("Error displaying row data\n ", e)
        display_exception(opstat)
        import traceback
        traceback.print_exc()

    clock.stop()
def display_data_inspection(option, parms=None):
    """
    Main data-inspection dispatcher: clears the cell output, ensures the
    chapter's html inputs are defined, then routes the option id to the
    matching inspection display / processing function.

    Parameters:
        option - function option (dim.* option id)
        parms  - parms associated with the option

    Returns:
        N/A
    """

    from IPython.display import clear_output
    clear_output()

    opstat = opStatus()

    # make sure the chapter's html inputs exist before any form is rendered
    from dfcleanser.common.html_widgets import define_inputs, are_owner_inputs_defined
    if (not (are_owner_inputs_defined(cfg.DataInspection_ID))):
        define_inputs(cfg.DataInspection_ID, diw.datainspection_inputs)

    if (option == dim.MAIN_OPTION):
        drop_working_df()
        diw.display_dfc_inspection_main()
        clear_data_inspection_data()
    else:
        diw.display_inspection_main_taskbar()

    if (cfg.is_a_dfc_dataframe_loaded()):

        # the main display options share a df-selection form; record the
        # selected df before dispatching
        if ((option == dim.DISPLAY_DATATYPES_OPTION)
                or (option == dim.DISPLAY_NANS_OPTION)
                or (option == dim.DISPLAY_ROWS_OPTION)
                or (option == dim.DISPLAY_COLS_OPTION)
                or (option == dim.DISPLAY_CATEGORIES_OPTION)):

            fparms = get_parms_for_input(parms[0],
                                         diw.data_inspection_df_input_idList)

            if (len(fparms) > 0):
                cfg.set_config_value(cfg.CURRENT_INSPECTION_DF, fparms[0])

            if (not (option == dim.DISPLAY_ROWS_OPTION)):
                drop_working_df()

        if ((option == dim.DISPLAY_DATATYPES_OPTION)
                or (option == dim.DISPLAY_FULL_COLUMN_NAMES)):

            df_data_info = dim.get_df_datatypes_data(
                cfg.get_current_chapter_df(cfg.DataInspection_ID))
            display_inspect_datatypes(option, df_data_info)

        elif (option == dim.DISPLAY_NANS_OPTION):
            display_inspect_nans()

        elif (option == dim.DISPLAY_ROWS_OPTION):
            display_inspect_rows()

        elif (option == dim.DISPLAY_COLS_OPTION):
            # optional second parm selects the column to inspect
            if (len(parms) > 1):
                display_inspect_cols(parms[1])
            else:
                display_inspect_cols(None)

        elif (option == dim.DISPLAY_CATEGORIES_OPTION):
            display_inspect_categories()

        # drop rows/columns whose nan count passes a threshold
        elif ((option == dim.DROP_ROW_NANS_OPTION)
              or (option == dim.DROP_COL_NANS_OPTION)):

            thresholdType = parms[0]

            if (option == dim.DROP_ROW_NANS_OPTION):
                fparms = get_parms_for_input(parms[1],
                                             diw.drop_rows_input_idList)
            else:
                fparms = get_parms_for_input(parms[1],
                                             diw.drop_columns_input_idList)

            # parse and validate the threshold value
            if (len(fparms) > 0):
                try:
                    threshold = int(fparms[0])
                except:
                    opstat.set_status(False)
                    if (option == dim.DROP_ROW_NANS_OPTION):
                        opstat.set_errorMsg("Drop Nan Rows Threshold value '" +
                                            fparms[0] + "' is invalid")
                    else:
                        opstat.set_errorMsg("Drop Nan Cols Threshold value '" +
                                            fparms[0] + "' is invalid")
                    threshold = None
            else:
                opstat.set_status(False)
                if (option == dim.DROP_ROW_NANS_OPTION):
                    opstat.set_errorMsg(
                        "Drop Nan Rows Threshold value is not defined")
                else:
                    opstat.set_errorMsg(
                        "Drop Nan Cols Threshold value is not defined")
                threshold = None

            if (option == dim.DROP_ROW_NANS_OPTION):
                if (opstat.get_status()):
                    dropstats = drop_nan_rows(
                        cfg.get_current_chapter_df(cfg.DataInspection_ID),
                        threshold, thresholdType, opstat)

                    if (not (opstat.get_status())):
                        display_exception(opstat)
                    else:
                        if (dropstats[0] > 0):
                            display_status(str(dropstats[0]) +
                                           " Nan Rows Dropped Successfully")
                        else:
                            display_status(
                                "No Rows matching threshold were dropped")
            else:
                if (opstat.get_status()):
                    numcolsdropped = drop_nan_cols(
                        cfg.get_current_chapter_df(cfg.DataInspection_ID),
                        threshold, thresholdType, opstat)

                    if (not (opstat.get_status())):
                        display_exception(opstat)
                    else:
                        if (numcolsdropped > 0):
                            display_status(
                                str(numcolsdropped) +
                                " Columns with Nans Dropped Successfully")
                        else:
                            display_status(
                                " No Columns matching threshold were dropped")

        elif (option == dim.DISPLAY_ROW_OPTION):
            display_inspect_rows()

        elif (option == dim.DISPLAY_COL_GRAPHS):
            display_inspect_graphs(parms)

        elif (option == dim.DISPLAY_COL_OUTLIERS):
            display_inspect_outliers(parms[0])

        elif (option == dim.DISPLAY_SCROLL_TO_DF_ROW):
            diw.display_scroll_to_row()

        elif (option == dim.PROCESS_SCROLL_TO_DF_ROW):

            opstat = opStatus()

            df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
            retparms = get_row_id_for_df(df, parms,
                                         diw.scroll_df_rows_input_idList,
                                         opstat)

            if (opstat.get_status()):
                # NOTE(review): both arms are identical - presumably one was
                # meant to behave differently; confirm intended behavior
                if (retparms[1] == 0):
                    display_inspect_rows(retparms[0])
                else:
                    display_inspect_rows(retparms[0])
            else:
                diw.display_scroll_to_row()
                display_exception(opstat)

        # scroll the sample window down one 200-row page
        elif (option == dim.SCROLL_DF_ROWS_DOWN):

            new_row_id = cfg.get_config_value(cfg.CURRENT_SCROLL_ROW_KEY)

            if (new_row_id is None):
                new_row_id = 0
            else:
                new_row_id = new_row_id + 200

                # past the end : stay on the current page
                df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
                if (new_row_id > len(df)):
                    new_row_id = cfg.get_config_value(
                        cfg.CURRENT_SCROLL_ROW_KEY)

            display_inspect_rows(new_row_id)

        # scroll the sample window up one 200-row page
        elif (option == dim.SCROLL_DF_ROWS_UP):

            new_row_id = cfg.get_config_value(cfg.CURRENT_SCROLL_ROW_KEY)

            if (new_row_id is None):
                new_row_id = 0
            else:
                new_row_id = new_row_id - 200
                if (new_row_id < 0):
                    new_row_id = 0

            display_inspect_rows(new_row_id)

        elif (option == dim.DISPLAY_DF_ROW):
            print("dim.DISPLAY_DF_ROW")

        # display rows for a df selected from another chapter
        elif (option == dim.DISPLAY_DF_ROW_REMOTE):

            chapterid = parms[0]
            #print("chapterId",chapterid)

            # map the requesting chapter to its current-df config value
            new_config_df = None

            if (chapterid == cfg.DataInspection_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_INSPECTION_DF)
            elif (chapterid == cfg.DataCleansing_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_CLEANSE_DF)
            elif (chapterid == cfg.DataTransform_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_TRANSFORM_DF)
            elif (chapterid == cfg.DataExport_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_EXPORT_DF)
            elif (chapterid == cfg.DataImport_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_IMPORT_DF)
            elif (chapterid == cfg.SWGeocodeUtility_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_GEOCODE_DF)
            elif (chapterid == cfg.SWDFSubsetUtility_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_SUBSET_DF)

            cfg.set_config_value(cfg.CURRENT_INSPECTION_DF, new_config_df)
            display_inspect_rows()

    else:
        # no dfc dataframe loaded : clear the selection and tell the user
        cfg.drop_config_value(cfg.CURRENT_INSPECTION_DF)

        if (not (option == dim.MAIN_OPTION)):
            cfg.display_no_dfs(cfg.DataInspection_ID)

    from dfcleanser.common.display_utils import display_pop_up_buffer
    display_pop_up_buffer()
def display_df_subset_setup():
    """
    Display the current-df subset setup view: a column statistics table on
    top, a heading, and the subset input form below.

    Returns: N/A
    """
    df_title = cfg.get_config_value(cfg.CURRENT_SUBSET_DF)
    df = cfg.get_dfc_dataframe_df(df_title)

    col_stats_table = get_column_stats_table(df_title, df)

    from dfcleanser.common.html_widgets import InputForm
    subset_form = InputForm(get_subset_input_id, get_subset_input_idList,
                            get_subset_input_labelList, get_subset_input_typeList,
                            get_subset_input_placeholderList, get_subset_input_jsList,
                            get_subset_input_reqList)

    selectDicts = []
    selectDicts.append(cfg.get_dfc_dataframes_select_list(cfg.SWDFSubsetUtility_ID))

    current_df = cfg.get_current_chapter_df(cfg.SWDFSubsetUtility_ID)

    # blank first entry so "no column selected" is a valid choice
    cols_name_list = [" "] + current_df.columns.tolist()

    selectDicts.append({
        "default": cols_name_list[0],
        "list": cols_name_list,
        "callback": "change_subset_cols"
    })
    selectDicts.append({"default": "Keep", "list": ["Keep", "Drop"]})

    get_select_defaults(subset_form, get_subset_input_form[0],
                        get_subset_input_form[1], get_subset_input_form[3],
                        selectDicts)

    subset_form.set_shortForm(False)
    subset_form.set_gridwidth(680)
    subset_form.set_custombwidth(140)
    subset_form.set_fullparms(True)

    form_html = subset_form.get_html()
    heading_html = "<div>Get Dataframe Subset</div><br></br>"

    gridclasses = ["dfc-top", "dfcleanser-common-grid-header", "dfc-bottom"]
    gridhtmls = [col_stats_table, heading_html, form_html]

    print("\n")
    display_generic_grid("sw-utils-subset-wrapper", gridclasses, gridhtmls)
def drop_duplicate_rows(parms, display=True):
    """
    Drop duplicate rows from the current transform dataframe.

    Parameters:
        parms   - transform input parms (columns, Keep/Drop selector, keep flag)
        display - when True, add a replayable entry to the script log

    Returns:
        opstat - operation status
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_drop_dups_transform_input_idList)

    colnames = fparms[0]
    if len(colnames) == 0:
        colnames = None

    # "Drop" : the named columns ARE the duplicate-check subset;
    # otherwise the subset is every column EXCEPT the named ones
    drop = (fparms[2] == "Drop")

    keep = fparms[3]
    if keep == "False":
        keep = False

    df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

    if (colnames is not None) and (not drop):
        # invert the selection : check duplicates on the complement
        colnames = [c for c in df.columns.tolist() if c not in colnames]

    if opstat.get_status():
        try:
            df.drop_duplicates(colnames, keep=keep, inplace=True)

            if display:
                # make scriptable (import keyword was missing from the
                # generated script line)
                add_to_script(["# drop duplicate rows",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import drop_duplicate_rows",
                               "drop_duplicate_rows(" + json.dumps(parms) + ",False)"], opstat)

        except Exception as e:
            # str() - colnames may be None or a list; "+" with a raw str
            # raised TypeError and masked the real failure
            opstat.store_exception("Unable to drop duplicate rows : " + str(colnames), e)

    return (opstat)
def append_to_df_index(parms, display=True):
    """
    Append one or more columns to the current transform dataframe's index.

    Parameters:
        parms   - transform input parms ([column names], drop flag, verify flag)
        display - when True, add a replayable entry to the script log

    Returns:
        opstat - operation status
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_append_index_transform_input_idList)

    # "[col1,col2]" -> "col1,col2"
    colspec = fparms[0].lstrip("[").rstrip("]")

    if len(colspec) == 0:
        # test before splitting : "".split(",") yields [""], so the
        # original length check on the split list could never fire
        opstat.set_status(False)
        opstat.set_errorMsg("column names list is empty")
    else:
        colnames = colspec.split(",")

        df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

        drop = (fparms[2] == "True")
        verify = (fparms[3] == "True")

        try:
            df.set_index(keys=colnames, drop=drop, append=True, inplace=True,
                         verify_integrity=verify)

            if display:
                # make scriptable (import keyword was missing from the
                # generated script line)
                add_to_script(["# append to df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import append_to_df_index",
                               "append_to_df_index(" + json.dumps(parms[1]) + ",False)"], opstat)

        except Exception as e:
            # str() - colnames is a list; "+" with a raw str raised
            # TypeError and masked the real failure
            opstat.store_exception("Unable to append to df index : " + str(colnames), e)

    return (opstat)
def set_df_index(parms, display=True):
    """
    Set the current transform dataframe's index to the given column(s)
    and write the updated dataframe back to the dfc registry.

    Parameters:
        parms   - transform input parms ([column names], drop flag, verify flag)
        display - when True, add a replayable entry to the script log

    Returns:
        opstat - operation status
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_set_index_transform_input_idList)
    colspec = fparms[0]

    if len(colspec) == 0:
        opstat.set_status(False)
        opstat.set_errorMsg("column names list is empty")
    else:
        # "[col1,col2]" -> ["col1", "col2"]
        colnames = colspec.lstrip("[").rstrip("]").split(",")

        drop_flag = (fparms[2] == "True")
        verify_flag = (fparms[3] == "True")

        try:
            df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            df.set_index(colnames, drop=drop_flag, append=True, inplace=True,
                         verify_integrity=verify_flag)

            cfg.set_dfc_dataframe_df(cfg.get_config_value(cfg.CURRENT_TRANSFORM_DF), df)

            if display:
                # make scriptable
                add_to_script(["# set df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control set_df_index",
                               "set_df_index(" + json.dumps(parms[1]) + ",False)"], opstat)

        except Exception as e:
            opstat.store_exception("Unable to set index of column(s) : " + str(colnames), e)

    return (opstat)
def reset_df_index(parms, display=True):
    """
    Reset (remove) levels of the current transform dataframe's index.

    Parameters:
        parms   - transform input parms ([level names or "All"], keep flag)
        display - when True, add a replayable entry to the script log

    Returns:
        opstat - operation status
    """
    opstat = opStatus()

    df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

    fparms = get_parms_for_input(parms, dftw.df_reset_index_transform_input_idList)

    level_spec = fparms[0]

    if len(level_spec) > 0:
        # "[lvl1,lvl2]" -> ["lvl1", "lvl2"]
        levels = level_spec.lstrip("[").rstrip("]").split(",")

        if levels[0] == "All":
            # expand "All" into every named index level
            levels = [name for name in df.index.names if name is not None]
    else:
        levels = None

    # fparms[2] == "True" keeps the index values as data columns
    drop_flag = (fparms[2] != "True")

    if opstat.get_status():
        try:
            df.reset_index(level=levels, drop=drop_flag, inplace=True)

            if display:
                # make scriptable
                add_to_script(["# reset df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control reset_df_index",
                               "reset_df_index(" + json.dumps(parms[1]) + ",False)"], opstat)

        except Exception as e:
            opstat.store_exception("Unable to reset df index : ", e)

    return (opstat)
def process_df_transform(optionid, parms, display=True):
    """
    Dispatch a dataframe transform option: column-names-row handling,
    index manipulation, sorting, and duplicate-row dropping.

    Parameters:
        optionid - transform option id (dtm.PROCESS_* / dtm.DF_TRANSFORM_*)
        parms    - parms for the selected option
        display  - display flag passed through to scriptable transforms

    Returns:
        N/A

    NOTE(review): the option checks below mix bare "if" with "elif" (the
    chain restarts at SAVE_COLUMN_NAMES_ROW, DROP_COLUMN_NAMES_ROW and
    WHITESPACE_COLUMN_NAMES). Each optionid still matches at most one
    branch, but confirm the breaks are intentional before extending.
    """

    opstat = opStatus()

    # show the column names row
    if (optionid == dtm.PROCESS_SHOW_COLUMN_NAMES_ROW):

        dftw.display_dataframe_col_names_taskbar()
        print("\n")

        col_names_table = dcTable("Column Names ", "cnamesTable",
                                  cfg.DataTransform_ID)
        col_names_table.set_table_column_parms({"font": 12})
        col_names_table.set_note("None")

        display_column_names(cfg.get_current_chapter_df(cfg.DataTransform_ID),
                             col_names_table, None)

    # save column names row to a file
    if (optionid == dtm.PROCESS_SAVE_COLUMN_NAMES_ROW):

        [opstat, filename] = save_column_names_row(parms)

        dftw.display_dataframe_col_names_taskbar()

        if (opstat.get_status()):
            display_status_note("Column Names Row Saved Successfully to : " + filename)
            clear_dataframe_transform_cfg_values()
        else:
            display_exception(opstat)

    # add column names row
    elif (optionid == dtm.PROCESS_ADD_COLUMN_NAMES_ROW):

        opstat = add_column_names_row(parms)

        dftw.display_dataframe_col_names_taskbar()
        print("\n")

        if (opstat.get_status()):

            clear_dataframe_transform_cfg_values()
            display_status_note("Column Names Row Added Successfully")

            col_names_table = dcTable("Column Names ", "cnamesTable",
                                      cfg.DataTransform_ID)
            col_names_table.set_table_column_parms({"font": 12})
            col_names_table.set_note("None")

            # NOTE(review): other branches pass cfg.DataTransform_ID here;
            # cfg.CURRENT_TRANSFORM_DF looks like a config key - confirm
            # which argument get_current_chapter_df expects
            display_column_names(cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF),
                                 col_names_table, None)
        else:
            display_main_option([[0, 0]])
            display_exception(opstat)

    # change column names
    elif (optionid == dtm.PROCESS_CHANGE_COLUMN_NAMES):

        opstat = change_column_names(parms)

        dftw.display_dataframe_col_names_taskbar()
        print("\n")

        if (opstat.get_status()):

            clear_dataframe_transform_cfg_values()
            display_status_note("Column Names Changed Successfully")

            col_names_table = dcTable("Column Names ", "cnamesTable",
                                      cfg.DataTransform_ID)
            col_names_table.set_table_column_parms({"font": 12})
            col_names_table.set_note("None")

            display_column_names(cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF),
                                 col_names_table, None)
        else:
            display_exception(opstat)

    # drop the column names row
    if (optionid == dtm.PROCESS_DROP_COLUMN_NAMES_ROW):

        opstat = drop_column_names_row()

        dftw.display_dataframe_col_names_taskbar()
        print("\n")

        if (opstat.get_status()):
            display_status_note("Column Names Row Dropped Successfully")
            clear_dataframe_transform_cfg_values()
        else:
            display_exception(opstat)

    # remove whitespace from column names
    if (optionid == dtm.PROCESS_WHITESPACE_COLUMN_NAMES):

        opstat = remwhitespace_column_names_row(parms)

        dftw.display_dataframe_col_names_taskbar()
        print("\n")

        if (opstat.get_status()):
            display_status_note("Column Names Whitespace Removed Successfully")
            clear_dataframe_transform_cfg_values()
        else:
            display_exception(opstat)

    # set the df index
    elif (optionid == dtm.PROCESS_SET_DF_INDEX):

        opstat = set_df_index(parms)

        dftw.display_dataframe_indices_taskbar()
        print("\n")

        if (opstat.get_status()):
            clear_dataframe_transform_cfg_values()
            display_status_note("df Index Set Successfully")
        else:
            display_exception(opstat)

        dftw.display_current_df_index(cfg.get_current_chapter_df(cfg.DataTransform_ID),
                                      cfg.get_current_chapter_dfc_df_title(cfg.DataTransform_ID))
        dftw.display_remote_df(cfg.DataTransform_ID)

    # reset the df index
    elif (optionid == dtm.PROCESS_RESET_DF_INDEX):

        opstat = reset_df_index(parms)

        dftw.display_dataframe_indices_taskbar()
        print("\n")

        if (opstat.get_status()):
            clear_dataframe_transform_cfg_values()
            display_status_note("df Index Reset Successfully")
        else:
            display_exception(opstat)

        dftw.display_current_df_index(cfg.get_current_chapter_df(cfg.DataTransform_ID),
                                      cfg.get_current_chapter_dfc_df_title(cfg.DataTransform_ID))
        dftw.display_remote_df(cfg.DataTransform_ID)

    # append column(s) to the df index
    elif (optionid == dtm.PROCESS_APPEND_TO_INDEX):

        opstat = append_to_df_index(parms)

        dftw.display_dataframe_indices_taskbar()
        print("\n")

        if (opstat.get_status()):
            clear_dataframe_transform_cfg_values()
            display_status_note("df Index Appended to Successfully")
        else:
            dftw.display_dataframe_options([[4, 0]])
            display_exception(opstat)

        dftw.display_current_df_index(cfg.get_current_chapter_df(cfg.DataTransform_ID),
                                      cfg.get_current_chapter_dfc_df_title(cfg.DataTransform_ID))
        dftw.display_remote_df(cfg.DataTransform_ID)

    # sort the df by its index
    elif (optionid == dtm.PROCESS_SORT_DF_INDEX):

        opstat = sort_df_index(parms)

        dftw.display_dataframe_indices_taskbar()
        print("\n")

        if (opstat.get_status()):
            clear_dataframe_transform_cfg_values()
            display_status_note("Dataframe Sorted by index Successfully")
        else:
            display_exception(opstat)

        dftw.display_current_df_index(cfg.get_current_chapter_df(cfg.DataTransform_ID),
                                      cfg.get_current_chapter_dfc_df_title(cfg.DataTransform_ID))
        dftw.display_remote_df(cfg.DataTransform_ID)

    # sort by a column
    elif (optionid == dtm.PROCESS_SORT_COLUMN):

        opstat = process_sort_by_column(parms, display)

        dftw.display_dataframe_transform_main()
        print("\n")

        if (opstat.get_status()):
            clear_dataframe_transform_cfg_values()
            # on success the errorMsg field carries the status text
            display_status_note(opstat.get_errorMsg())
        else:
            display_main_option([[0, 0]])
            display_exception(opstat)

    # drop duplicate rows
    elif (optionid == dtm.PROCESS_DROP_DUPLICATE_ROWS):

        # capture before/after row counts to report how many were dropped
        df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
        start_rows = len(df)

        opstat = drop_duplicate_rows(parms, display)

        end_rows = len(df)

        dftw.display_dataframe_transform_main()
        print("\n")

        if (opstat.get_status()):
            clear_dataframe_transform_cfg_values()
            display_status_note(str(start_rows - end_rows) +
                                " Duplicate Rows Dropped Successfully")
        else:
            display_exception(opstat)

    # return
    elif (optionid == dtm.DF_TRANSFORM_RETURN):
        dftw.display_dataframe_transform_main()

    # help
    elif (optionid == dtm.DF_TRANSFORM_HELP):
        print("help")
def add_column_names_row(parms, display=True):
    """
    Set the column names of the current transform dataframe, either from a
    previously saved json file or from an explicit comma-separated list.

    Parameters:
        parms   - transform input parms [filename, column-name list]
        display - when True, add a replayable entry to the script log

    Returns:
        opstat - operation status
    """
    opstat = opStatus()

    try:
        fparms = get_parms_for_input(parms, dftw.df_add_row_transform_input_idList)

        filename = fparms[0]
        collist = fparms[1]

        if len(filename) == 0:
            filename = "None"

        if len(collist) == 0:
            collist = "None"
        else:
            collist = collist.replace("'", "").split(",")

        if (filename == "None") and (collist == "None"):
            opstat.set_status(False)
            opstat.set_errorMsg("No Column List or filename defined")
        else:
            if not (filename == "None"):
                try:
                    # saved column-names file is a json list
                    # (see save_column_names_row)
                    with open(filename, 'r') as colid_file:
                        colids = json.load(colid_file)
                except Exception as e:
                    opstat.store_exception("Unable to open column names file" + filename, e)
            else:
                colids = collist

            # only apply when the names were successfully loaded; previously
            # a failed file read fell through and raised NameError on
            # colids, clobbering the stored exception
            if opstat.get_status():
                cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns = colids

                if display:
                    # make scriptable - replay with the original parms so the
                    # call matches this function's (parms, display) signature
                    # (the previous script passed 3 positional args)
                    add_to_script(["# Add Column Names Row",
                                   "from dfcleanser.data_transform.data_transform_dataframe_control import add_column_names_row",
                                   "add_column_names_row(" + json.dumps(parms) + ",False)"], opstat)

    except Exception as e:
        opstat.store_exception("Unable to add column names", e)

    return (opstat)
def process_sort_by_column(parms, display=True):
    """
    Sort the current transform dataframe by a single column.

    Parameters:
        parms   - sort input parms (column, ascending, kind, na position,
                  reset-row-ids flag)
        display - when True, add a replayable entry to the script log

    Returns:
        opstat - operation status (on success errorMsg carries the
                 status text displayed by the caller)
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.sort_column_input_idList)

    colname = fparms[0]
    sortorder = (fparms[1] == "True")

    # select-list values arrive wrapped in single quotes
    sortkind = fparms[2].strip("'")
    naposition = fparms[3].strip("'")

    resetrowids = (fparms[4] == "True")

    if opstat.get_status():
        try:
            df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            df.sort_values(colname, axis=0, ascending=sortorder, inplace=True,
                           kind=sortkind, na_position=naposition)

            if resetrowids:
                # reset the row ids in place; the previous code called
                # reset_df_index() without its required parms argument,
                # which raised a TypeError
                df.reset_index(drop=True, inplace=True)

            if display:
                # make scriptable
                add_to_script(["# sort by column ",
                               "from dfcleanser.data_transform.data_transform_columns_control import process_sort_by_column",
                               "process_sort_by_column(" + json.dumps(parms) + ",False)"], opstat)

            opstat.set_errorMsg("df sorted by column '" + colname + "' successfully.")

        except Exception as e:
            opstat.store_exception("Sort df By Column Error : " + colname, e)

    cfg.drop_config_value(dftw.sort_column_input_id + "Parms")

    return (opstat)
def find_matching_rows(df_title, column_type, cols_lists, vals_lists, opstat):
    """
    Find rows in the current inspection dataframe matching the given
    column names and value lists, and register the matching rows as a new
    dfc dataframe.

    Parameters:
        df_title    - title for the new dfc dataframe holding the matches
        column_type - 0 for numeric columns, otherwise string columns
        cols_lists  - column names to match on
        vals_lists  - per-column value lists in string form (e.g. "[1,2]")
        opstat      - status object

    Returns:
        number of matching rows
    """

    from dfcleanser.common.common_utils import is_int_col

    df = cfg.get_current_chapter_df(cfg.DataInspection_ID)

    clock = RunningClock()
    clock.start()

    import pandas as pd

    # running boolean row mask, AND-combined across columns
    # NOTE(review): pd.Series() with no dtype warns on newer pandas -
    # confirm intended dtype
    final_criteria = pd.Series()

    for i in range(len(cols_lists)):

        # parse the "[v1,v2,...]" string into individual values
        vals_list = vals_lists[i]
        vals_list = vals_list.replace("[", "")
        vals_list = vals_list.replace("]", "")
        vals_list = vals_list.split(",")

        col_vals_list = []

        # coerce the raw string values to the column's type
        if (column_type == 0):

            if (is_int_col(df, cols_lists[i])):
                try:
                    for j in range(len(vals_list)):
                        col_vals_list.append(int(vals_list[j]))
                except:
                    #print("int excx",j)
                    opstat.set_status(False)
            else:
                try:
                    for j in range(len(vals_list)):
                        col_vals_list.append(float(vals_list[j]))
                except:
                    #print("float excx",j)
                    opstat.set_status(False)
        else:
            try:
                for j in range(len(vals_list)):
                    col_vals_list.append(str(vals_list[j]))
            except:
                opstat.set_status(False)

        if (opstat.get_status()):

            if (len(col_vals_list) > 0):

                if (column_type == 0):

                    # numeric columns : exact membership test
                    try:
                        current_criteria = df[cols_lists[i]].isin(
                            col_vals_list)
                        num_ccs = [
                            i for i in current_criteria.index
                            if current_criteria[i]
                        ]
                    except:
                        opstat.set_status(False)
                        opstat.set_errorMsg(
                            "failed to get current criteria subset " +
                            cols_lists[i])

                    if (opstat.get_status()):

                        try:
                            # AND this column's mask into the running mask
                            if (len(final_criteria) > 0):
                                final_criteria = final_criteria & current_criteria
                            else:
                                final_criteria = current_criteria

                            num_fcs = [
                                i for i in final_criteria.index
                                if final_criteria[i]
                            ]
                        except:
                            opstat.set_status(False)
                            opstat.set_errorMsg(
                                "failed to get final criteria subset " +
                                cols_lists[i])
                else:

                    # string columns : substring containment per value
                    # NOTE(review): successive values are AND-combined, so
                    # several values for one column only match rows
                    # containing all of them - confirm OR was not intended
                    for k in range(len(col_vals_list)):

                        current_criteria = df[cols_lists[i]].str.contains(
                            col_vals_list[k])

                        try:
                            if (len(final_criteria) > 0):
                                final_criteria = final_criteria & current_criteria
                            else:
                                final_criteria = current_criteria
                        except:
                            opstat.set_status(False)
                            opstat.set_errorMsg(
                                "failed to get final criteria subset " +
                                cols_lists[i])
            else:
                opstat.set_status(False)
                opstat.set_errorMsg("no valid column_values entered for " +
                                    cols_lists[i])
        else:
            opstat.set_status(False)
            opstat.set_errorMsg("invalid column_values entered for " +
                                cols_lists[i])

    clock.stop()

    # indices of rows where the combined mask is True
    num_trues = [i for i in final_criteria.index if final_criteria[i]]

    if (len(num_trues) > 0):

        # register the matching rows as a new dfc dataframe
        search_df = df[final_criteria].copy()
        #print("search_df",len(search_df))
        search_df.reset_index(drop=True, inplace=True)

        from dfcleanser.common.cfg import dfc_dataframe, add_dfc_dataframe

        search_df_notes = "search subset from " + cfg.get_config_value(
            cfg.CURRENT_INSPECTION_DF)
        new_dfcdf = dfc_dataframe(df_title, search_df, search_df_notes)
        add_dfc_dataframe(new_dfcdf)

    return (len(num_trues))
def get_subset_df():
    """Return the dataframe currently assigned to the Subset utility chapter."""
    return cfg.get_current_chapter_df(cfg.SWDFSubsetUtility_ID)