def upperCase_df_column(dftitle, dfcolname):
    """
    * ------------------------------------------------------------------------
    * function : convert string column to upper case
    *
    * parms :
    *   dftitle   - dataframe title
    *   dfcolname - dataframe column to convert
    *
    * returns :
    *   Successful : upper cased columns list
    *   Error      : opstat
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()
    df = cfg.get_dfc_dataframe_df(dftitle)

    try:
        # bug fix: map() is lazy in Python 3, so any .upper() failure (e.g. a
        # non-string cell) would escape this try/except and surface at the
        # consumer; build the list eagerly so errors are caught here
        return [x.upper() for x in df[dfcolname]]
    except Exception as e:
        opstat.store_exception(
            "'upperCase_df_column' error : " + dftitle + " " + dfcolname, e)

    return (opstat)
def display_inspect_outliers(colname):
    """
    * --------------------------------------------------------------------------
    * function : display the inspect outliers option
    *
    * parms :
    *   colname - column to inspect for outliers
    *
    * returns :
    *   N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    display_inspect_cols(colname)

    outliers_html = diw.get_simple_outliers(
        cfg.get_current_chapter_df(cfg.DataInspection_ID),
        colname, opstat, display=False)

    print("\n")

    # the wrapper id differs between inline and pop-up display modes
    if cfg.get_dfc_mode() == cfg.INLINE_MODE:
        wrapper_id = "df-inspection-outliers-data-wrapper"
    else:
        wrapper_id = "df-inspection-outliers-pop-up-data-wrapper"

    display_generic_grid(wrapper_id, ["dfc-main"], [outliers_html])
def normalize_df_column(dftitle, dfcolname):
    """
    * ------------------------------------------------------------------------
    * function : normalize a dataframe column
    *
    * parms :
    *   dftitle   - dataframe title
    *   dfcolname - dataframe column to normalize
    *
    * returns :
    *   Successful : normalized column values (2-D array, one column)
    *   Error      : opstat
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()
    df = cfg.get_dfc_dataframe_df(dftitle)

    from sklearn.preprocessing import MinMaxScaler

    try:
        scaler = MinMaxScaler()
        # bug fix: fit_transform requires 2-D input; df[dfcolname] is a 1-D
        # Series and raises ValueError, so select a single-column frame
        scaled_values = scaler.fit_transform(df[[dfcolname]])
        return (scaled_values)
    except Exception as e:
        opstat.store_exception(
            "'normalize_df_column' error : " + dftitle + " " + dfcolname, e)

    return (opstat)
def drop_column_names_row(display=True):
    """
    * --------------------------------------------------------
    * function : drop the column names row
    *
    * parms :
    *   display - display flag
    *
    * returns :
    *   opstat
    * --------------------------------------------------------
    """

    opstat = opStatus()

    try:
        df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

        # NOTE(review): dropping every label on axis=1 removes ALL columns
        # (and their data) from the dataframe — confirm this is intended
        all_columns = df.columns.tolist()
        df.drop(labels=all_columns, axis=1, inplace=True)

        if display:
            #make scriptable
            add_to_script(
                ["# change column names",
                 "from dfcleanser.data_transform.data_transform_dataframe_control change_column_names",
                 "drop_column_names(False)"], opstat)

    except Exception as e:
        opstat.store_exception("Unable to change column names ", e)

    return (opstat)
def absolute_df_column(dftitle, dfcolname):
    """
    * ------------------------------------------------------------------------
    * function : convert dataframe column to absolute value
    *
    * parms :
    *   dftitle   - dataframe title
    *   dfcolname - dataframe column to take absolute values of
    *
    * returns :
    *   Successful : col list of abs values
    *   Error      : opstat
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()
    df = cfg.get_dfc_dataframe_df(dftitle)

    import numpy as np

    try:
        # bug fix: removed 'colabsolutes = np.array()' — np.array() with no
        # argument raises TypeError, crashing before the try block was reached
        colabsolutes = np.absolute(df[dfcolname])
        return (colabsolutes)
    except Exception as e:
        opstat.store_exception(
            "'absolute_df_column' error : " + dftitle + " " + dfcolname, e)

    return (opstat)
def get_df_geocode_center(dftitle, dfcolname):
    """
    * ------------------------------------------------------------------------
    * function : get the center point of a dataframe locations column
    *
    * parms :
    *   dftitle   - dataframe name
    *   dfcolname - list of 1 (combined coords) or 2 (lat, long) column names
    *
    * returns :
    *   center point if no exception
    *   opStatus object if exception
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()

    import json

    geocoords = []
    df = cfg.get_dfc_dataframe_df(dftitle)

    # bug fix: was 'len(dfcolname == 1)', which compares the list to 1 and
    # takes len() of the resulting bool (TypeError) instead of testing length
    if (len(dfcolname) == 1):
        geocoords = df[dfcolname[0]].tolist()
        if (type(geocoords[0]) == str):
            # NOTE(review): json.dumps serializes the list to a string;
            # presumably json.loads was intended for parsing stored
            # coordinate strings — confirm against get_geocode_center
            geocoords = json.dumps(geocoords)
    elif (len(dfcolname) == 2):
        geolats = df[dfcolname[0]].tolist()
        if (type(geolats[0]) == str):
            geolats = json.dumps(geolats)

        geolongs = df[dfcolname[1]].tolist()
        if (type(geolongs[0]) == str):
            geolongs = json.dumps(geolongs)

        # pair lats and longs into [lat, long] coordinate rows
        for i in range(len(geolats)):
            geocoords.append([geolats[i], geolongs[i]])
    else:
        opstat.set_status(False)
        opstat.set_errorMsg(
            "get_df_geocode_center Error : column names list is invalid")

    if (opstat.get_status()):
        return (get_geocode_center(geocoords, opstat))
    else:
        return (opstat)
def sort_df_index(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : sort df indices
    *
    * parms :
    *   parms   - transform parms
    *   display - display flag
    *
    * returns :
    *   opstat
    * --------------------------------------------------------
    """

    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_sort_index_transform_input_idList)

    # parse the "[a,b,...]" levels string into a list, or None when empty
    raw_levels = fparms[0]
    if len(raw_levels) > 0:
        levels = raw_levels.lstrip("[").rstrip("]").split(",")
    else:
        levels = None

    ascending = (fparms[2] == "True")
    kind = fparms[3]
    na_position = fparms[4]

    if opstat.get_status():
        try:
            df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            df.sort_index(axis=0, level=levels, ascending=ascending,
                          inplace=True, kind=kind, na_position=na_position)

            if display:
                #make scriptable
                add_to_script(
                    ["# set row ids column",
                     "from dfcleanser.data_transform.data_transform_dataframe_control sort_df_index",
                     "sort_df_index(" + json.dumps(parms[1]) + ",False)"],
                    opstat)

        except Exception as e:
            opstat.store_exception("Unable to sort df index : ", e)

    return (opstat)
def change_column_names(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : change column names
    *
    * parms :
    *   parms   - transform parms
    *   display - display flag
    *
    * returns :
    *   opstat
    * --------------------------------------------------------
    """

    opstat = opStatus()

    try:
        fparms = get_parms_for_input(parms, dftw.df_change_row_transform_input_idList)

        ccolname = fparms[0]
        ncolname = fparms[1]

        if ((len(ccolname) < 1) or (len(ncolname) < 1)):
            opstat.set_status(False)
            if (len(ccolname) < 1):
                opstat.set_errorMsg("current_column_name is invalid")
            else:
                # bug fix: this branch previously reported
                # "current_column_name is invalid" for an empty NEW name too
                opstat.set_errorMsg("new_column_name is invalid")
        else:
            collist = cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns.tolist()

            try:
                found = collist.index(ccolname)
            except:
                opstat.set_status(False)
                opstat.set_errorMsg("current_column_name is not in df")

            if (opstat.get_status()):
                # replace the old name in place and push the full list back
                collist[found] = ncolname
                cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns = collist

                if (display):
                    #make scriptable
                    add_to_script(
                        ["# change column names",
                         "from dfcleanser.data_transform.data_transform_dataframe_control change_column_names",
                         "change_column_names(" + json.dumps(parms) + ",False)"],
                        opstat)

    except Exception as e:
        opstat.store_exception("Unable to change column names ", e)

    return (opstat)
def export_sql_table(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : export a sql table into pandas dataframe
    *
    * parms :
    *   parms   - sql parms
    *   display - display flag
    *
    * returns : N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    dew.display_export_main_taskbar()
    print("export_sql_table", parms)

    save_data_export_start()

    clock = RunningClock()
    clock.start()

    # build the db connection dict for the currently selected db
    dbid = cfg.get_config_value(cfg.CURRENT_DB_ID_KEY)
    dbcondict = set_dbcon_dict(dbid, get_stored_con_Parms(dbid))

    sqltableparms = dew.get_sqltable_export_inputs(parms)
    (export_notes, opstat) = export_pandas_sqltable(
        sqltableparms, dbcondict, dew.pandas_export_sqltable_id)

    clock.stop()

    if not opstat.get_status():
        display_exception(opstat)
        return

    # stringify the parms for display and append the connector notes
    for i in range(len(sqltableparms)):
        sqltableparms[i] = get_string_value(sqltableparms[i])

    sqltableparms = sqltableparms[0:8]
    sqltableparms.append(export_notes)

    sqltablelabels = dew.pandas_export_sqltable_labelList[0:8]
    sqltablelabels.append("DB Connector String")

    display_data_export_parms("Pandas SQL Table Export Parms",
                              sqltablelabels, sqltableparms,
                              cfg.DataExport_ID, sqltableparms[1], True)
def process_county_cities(parms):
    """
    * --------------------------------------------------------------------------
    * function : display the cities for a selected state/county
    *
    * parms :
    *   parms - county cities input parms
    *
    * returns : N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    fparms = get_parms_for_input(parms, suzw.county_cities_input_idList)
    state = fparms[0][:2]
    county = fparms[1]

    cfg.set_config_value(suzw.county_cities_input_id + "Parms", fparms)

    suzw.display_get_cities_for_county(parms)
    print("\n")

    citiesHeader = [""]
    citiesRows = []
    citiesWidths = [20, 80]
    citiesAligns = ["left", "left"]

    primary_cities = suzm.get_cities_for_county(state, county,
                                                city_type=suzm.ANY_CITY_TYPE)
    if primary_cities is not None:
        citiesRows.append(["US Zipcode Cities", str(primary_cities)])

    from dfcleanser.common.table_widgets import dcTable, get_row_major_table, ROW_MAJOR, SCROLL_DOWN

    table_title = ("Cities For " + str(county) + " - " +
                   str(suzm.get_state_name(state).upper()))

    cities_table = dcTable(table_title, 'citiescodesid',
                           cfg.SWZipcodeUtility_ID, citiesHeader, citiesRows,
                           citiesWidths, citiesAligns)
    cities_table.set_small(True)
    cities_table.set_checkLength(False)
    cities_table.set_border(True)
    cities_table.set_tabletype(ROW_MAJOR)
    cities_table.set_rowspertable(50)

    citiesHtml = get_row_major_table(cities_table, SCROLL_DOWN, False)

    display_generic_grid("display-geocode-coords-wrapper",
                         ["dfc-top"], [citiesHtml])
def save_column_names_row(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : save column names row to a file
    *
    * parms :
    *   parms   - transform parms
    *   display - display flag
    *
    * returns :
    *   [opstat, filename]
    * --------------------------------------------------------
    """

    opstat = opStatus()

    # bug fix: pre-bind filename — the except handler and the final return
    # both reference it, which raised NameError when parms parsing failed
    filename = ""

    try:
        fparms = get_parms_for_input(parms, dftw.df_save_row_transform_input_idList)
        filename = fparms[0]

        if (len(filename) == 0):
            # default the filename from the imported data source name
            filename = "./" + cfg.get_config_value(cfg.CURRENT_IMPORTED_DATA_SOURCE_KEY)
            #filename = filename.replace(".","_")
            filename = filename + "_column_names.json"

        # see if save col names row
        if (len(filename) > 0):
            colids = cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns.tolist()

            # save the index row as file
            with open(filename, 'w') as colid_file:
                json.dump(colids, colid_file)

            if (display):
                #make scriptable
                add_to_script(
                    ["# save column names row",
                     "from dfcleanser.data_transform.data_transform_dataframe_control save_column_names_row",
                     "save_column_names_row(" + json.dumps(parms) + ",False)"],
                    opstat)

    except Exception as e:
        opstat.store_exception("Unable to save column names file to : " + filename, e)

    return ([opstat, filename])
def get_trig_values_for_column(dftitle, dfcolname, trigfunc):
    """
    * ------------------------------------------------------------------------
    * function : get trig column values
    *
    * parms :
    *   dftitle   - dataframe title
    *   dfcolname - dataframe column to apply trig function to
    *   trigfunc  - trig function to apply
    *               ('sin','cos','tan','arcsin','arccos','arctan')
    *
    * returns :
    *   Successful : col list of trig values (None if trigfunc unknown)
    *   Error      : opstat
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()
    df = cfg.get_dfc_dataframe_df(dftitle)

    try:
        import numpy as np

        # bug fix: removed 'trigcol = np.array()' — np.array() with no
        # argument raises TypeError inside the try, so every call used to
        # fall into the exception path and return opstat
        trig_dispatch = {'sin': np.sin, 'cos': np.cos, 'tan': np.tan,
                         'arcsin': np.arcsin, 'arccos': np.arccos,
                         'arctan': np.arctan}

        fn = trig_dispatch.get(trigfunc)
        if fn is None:
            # unknown function name: preserve original None return
            return None

        return fn(df[dfcolname])

    except Exception as e:
        opstat.store_exception(
            "'get_trig_values_for_column' error : " + dftitle + " " +
            dfcolname + " " + trigfunc, e)

    return (opstat)
def process_custom_export(fparms, exportId, display=True):
    """
    * --------------------------------------------------------------------------
    * function : custom export
    *
    * parms :
    *   fparms   - export parms (fparms[0] is the custom code string)
    *   exportId - export id
    *   display  - display flag
    *
    * returns : opstat
    * --------------------------------------------------------
    """

    opstat = opStatus()

    # NOTE(review): newlines are replaced with "<br/>" BEFORE the code is
    # exec'd — any multi-line snippet becomes one invalid line; confirm
    # whether this replace was intended for display purposes only
    fparms[0] = fparms[0].replace("\n", "<br/>")

    try:
        # SECURITY: exec of caller-supplied code — must never be exposed to
        # untrusted input
        exec(fparms[0])
    except Exception as e:
        opstat.store_exception("Unable to export custom", e)

    if (opstat.get_status()):

        if (display):
            #make scriptable
            script = [
                "# Export Custom ",
                "from dfcleanser.data_export.data_export_control import process_custom_export",
                "process_custom_export(" + json.dumps(fparms) + "," + str(exportId) + ",False)"
            ]
            add_to_script(script, opstat)

        if (len(fparms) > 0):
            # record that the most recent export was a custom one
            cfg.set_config_value(exportId + "Parms", "custom")
            cfg.set_config_value(cfg.CURRENT_EXPORTED_FILE_NAME_KEY, "custom", True)

    return (opstat)
def test_export_sql_db_connector(driverid, sqlinputparms):
    """
    * --------------------------------------------------------------------------
    * function : test the sql db connector
    *
    * parms :
    *   driverid      - pandas export identifier
    *   sqlinputparms - connection string
    *
    * returns : N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    try:
        export_test_sql_db_connector(driverid, sqlinputparms)
    except Exception as err:
        # only report when the connection attempt actually failed
        opstat.store_exception("DB Connection failed ", err)
        display_exception(opstat)
def random_float_range(dftitle, randomFloatLower, randomFloatUpper):
    """
    * ------------------------------------------------------------------------
    * function : generate column of random floats in a range
    *
    * parms :
    *   dftitle          - dataframe title
    *   randomFloatLower - random float lower range value
    *   randomFloatUpper - random float upper range value
    *
    * returns :
    *   Successful : cols list of random floats
    *   Error      : opstat
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()
    df = cfg.get_dfc_dataframe_df(dftitle)

    import random

    # bug fixes: 'np.array()' with no argument raises TypeError before the
    # try; ndarray has no .append; and random.randrange only accepts integer
    # arguments — random.uniform is the float-range equivalent
    colrandfloats = []

    try:
        for _ in range(len(df)):
            colrandfloats.append(
                random.uniform(float(randomFloatLower), float(randomFloatUpper)))
        return (colrandfloats)
    except Exception as e:
        opstat.store_exception(
            "'random_float_range' error : " + dftitle + " " +
            str(randomFloatLower) + " " + str(randomFloatUpper), e)

    return (opstat)
def save_datastructures_file(self, dstype, creator):
    """
    Persist one datastructures store (dict or list, dfc- or user-created)
    to its backing json file; report any failure via opStatus.
    """

    try:
        with open(self.get_datastructures_file_name(dstype, creator), 'w') as datastructures_file:
            # select the store matching the (dstype, creator) pair
            if dstype == DICT_ID:
                store = self.dictStore if creator == DFC_CREATED else self.userdictStore
            else:
                store = self.listStore if creator == DFC_CREATED else self.userlistStore

            json.dump(store, datastructures_file)

    except Exception as e:
        opstat = opStatus()
        opstat.store_exception(
            "Unable to save file " + self.get_datastructures_file_name(dstype, creator), e)
        display_exception(opstat)
def convert_df_column_to_degrees_or_radians(dftitle, dfcolname, degrees):
    """
    * ------------------------------------------------------------------------
    * function : convert dataframe column to degrees or radians
    *
    * parms :
    *   dftitle   - dataframe title
    *   dfcolname - dataframe column to convert
    *   degrees   - True  - convert to degrees
    *               False - convert to radians
    *
    * returns :
    *   Successful : converted column values list
    *   Error      : opstat
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()
    df = cfg.get_dfc_dataframe_df(dftitle)

    import numpy as np

    try:
        # bug fix: removed 'colvalues = np.array()' — np.array() with no
        # argument raises TypeError, crashing before the try block was reached
        if (degrees):
            colvalues = np.degrees(df[dfcolname])
        else:
            colvalues = np.radians(df[dfcolname])

        return (colvalues)

    except Exception as e:
        opstat.store_exception(
            "'convert_df_column_to_degrees_or_radians' error : " + dftitle +
            " " + dfcolname + " " + str(degrees), e)

    return (opstat)
def round_df_column(dftitle, dfcolname, decimals):
    """
    * ------------------------------------------------------------------------
    * function : round float column to decimals range
    *
    * parms :
    *   dftitle   - dataframe title
    *   dfcolname - dataframe column to round
    *   decimals  - rounding precision
    *               0 - round to int
    *
    * returns :
    *   Successful : rounded col vals list
    *   Error      : opstat
    *
    * Notes :
    *   dfcleanser generic function
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()
    df = cfg.get_dfc_dataframe_df(dftitle)

    import numpy as np

    try:
        # bug fixes: removed 'np.array()' pre-init (TypeError with no args),
        # and 'np.round_(df[dfcolname, decimals])' indexed the frame with a
        # tuple instead of passing decimals as the rounding precision
        if (decimals == 0):
            dfrounds = np.rint(df[dfcolname])
        else:
            dfrounds = np.round(df[dfcolname], decimals)

        return (dfrounds)

    except Exception as e:
        opstat.store_exception(
            "'round_df_column' error : " + dftitle + " " + dfcolname + " " +
            str(decimals), e)

    return (opstat)
def export_custom(parms):
    """
    * --------------------------------------------------------------------------
    * function : export a custom
    *
    * parms :
    *   parms - sql parms (parms[0] is the function id)
    *
    * returns :
    *   (dispstats, opstat)
    * --------------------------------------------------------
    """

    functionid = parms[0]

    opstat = opStatus()
    dispstats = False

    if functionid == 1:
        # run the custom export and report stats
        opstat = process_custom_export(parms[1], dew.custom_export_id,
                                       display=True)
        dispstats = True

    elif functionid == 2:
        # stash the custom code and redisplay the export form
        cfg.set_config_value(dew.custom_export_id + "Parms",
                             "# custom export\n" + parms[1])
        display_export_forms(dem.EXPORT_CUSTOM_ONLY)

    elif functionid == 3:
        # clear any stored custom code
        cfg.drop_config_value(dew.custom_export_id + "Parms")
        display_export_forms(dem.EXPORT_CUSTOM_ONLY)

    elif functionid == 5:
        display_export_forms(dem.EXPORT_CUSTOM_ONLY, -1, True)

    return (dispstats, opstat)
def display_inspect_categories():
    """
    * --------------------------------------------------------------------------
    * function : display the inspect categories option
    *
    * parms :
    *
    * returns :
    *   N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:
        category_table = dcTable("Category Columns", "catcolsTable",
                                 cfg.DataInspection_ID)
        candidate_table = dcTable("Category Candidate Columns",
                                  "catcandcolsTable", cfg.DataInspection_ID)

        numcats, numcands = diw.display_df_categories(
            cfg.get_current_chapter_df(cfg.DataInspection_ID),
            category_table, candidate_table)

    except Exception as e:
        opstat.store_exception("Error displaying category data\n ", e)

    clock.stop()

    if not opstat.get_status():
        display_exception(opstat)
def load_datastructures_file(self, dstype, creator):
    """
    Load one datastructures store (dict or list, dfc- or user-created) from
    its backing json file; on failure reset the store and report the error.
    """

    try:
        with open(self.get_datastructures_file_name(dstype, creator), 'r') as datastructures_file:
            loaded = json.load(datastructures_file)

            # assign into the store matching the (dstype, creator) pair
            if dstype == DICT_ID:
                if creator == DFC_CREATED:
                    self.dictStore = loaded
                else:
                    self.userdictStore = loaded
            else:
                if creator == DFC_CREATED:
                    self.listStore = loaded
                else:
                    self.userlistStore = loaded

    except Exception as e:
        # NOTE(review): list stores are also reset to {} (not []) here —
        # confirm that is intended
        if dstype == DICT_ID:
            if creator == DFC_CREATED:
                self.dictStore = {}
            else:
                self.userdictStore = {}
        else:
            if creator == DFC_CREATED:
                self.listStore = {}
            else:
                self.userlistStore = {}

        opstat = opStatus()
        opstat.store_exception(
            "Unable to load common file : " + self.get_datastructures_file_name(dstype, creator), e)
        display_exception(opstat)
def display_list_maint(keyValue=None, loadfile=None):
    """
    * ------------------------------------------------------------------------
    * function : display the user lists maintenance form
    *
    * parms :
    *   keyValue - list name
    *   loadfile - optional json file of list values to preload
    *
    * -------------------------------------------------------------------------
    """

    opstat = opStatus()

    # pick the input form variant: plain maintenance vs load-from-file
    if (loadfile is None):
        list_maint_input_form = InputForm(
            maint_list_utility_input_id, maint_list_utility_input_idList,
            maint_list_utility_input_labelList,
            maint_list_utility_input_typeList,
            maint_list_utility_input_placeholderList,
            maint_list_utility_input_jsList, maint_list_utility_input_reqList)
    else:
        list_maint_input_form = InputForm(
            maint_list_file_utility_input_id,
            maint_list_file_utility_input_idList,
            maint_list_file_utility_input_labelList,
            maint_list_file_utility_input_typeList,
            maint_list_file_utility_input_placeholderList,
            maint_list_file_utility_input_jsList,
            maint_list_file_utility_input_reqList)

    selectDicts = []

    from dfcleanser.sw_utilities.sw_utility_model import get_lists_names, USER_CREATED
    list_names = get_lists_names(USER_CREATED)

    #print("list_names",list_names)

    if (not (list_names is None)):

        # default selection: caller-supplied key or the first user list
        if (keyValue is None):
            def_list = list_names[0]
        else:
            def_list = keyValue

        sellist = swum.get_List(def_list, USER_CREATED)

        # render the selected list values as a "[a,b,...]" display string
        dsstr = "["
        for i in range(len(sellist)):
            dsstr = dsstr + str(sellist[i])
            if (i == (len(sellist) - 1)):
                dsstr = dsstr + "]"
            else:
                dsstr = dsstr + ","

    else:
        list_names = ["No User lists defined"]
        def_list = "No User lists defined"
        sellist = "User defined list"

    listssel = {
        "default": def_list,
        "list": list_names,
        "callback": "select_list"
    }
    selectDicts.append(listssel)

    from dfcleanser.common.common_utils import get_select_defaults
    get_select_defaults(list_maint_input_form, maint_list_utility_input_id,
                        maint_list_utility_input_idList,
                        maint_list_utility_input_typeList, selectDicts)

    list_maint_input_form.set_gridwidth(700)

    # button margin differs between the two form variants
    if (loadfile is None):
        list_maint_input_form.set_buttonstyle({
            "font-size": 13, "height": 75, "width": 90, "left-margin": 20
        })
    else:
        list_maint_input_form.set_buttonstyle({
            "font-size": 13, "height": 75, "width": 90, "left-margin": 205
        })

    list_maint_input_form.set_fullparms(True)

    cfg.drop_config_value(maint_list_utility_input_id + "Parms")
    cfg.drop_config_value(maint_list_utility_input_id + "ParmsProtect")

    if (not (loadfile is None)):

        import json
        from dfcleanser.common.common_utils import does_file_exist

        if (does_file_exist(loadfile)):

            try:
                with open(loadfile, 'r') as ds_file:
                    ds = json.load(ds_file)
                    ds_file.close()

                # rebuild the display string from the file contents
                dsstr = "["
                for i in range(len(ds)):
                    dsstr = dsstr + str(ds[i])
                    if (i == (len(ds) - 1)):
                        dsstr = dsstr + "]"
                    else:
                        dsstr = dsstr + ","

            except Exception as e:
                opstat.set_status(False)
                opstat.set_errorMsg("Error processing user file to load" + loadfile)
                opstat.set_exception(e)

        else:
            opstat.set_status(False)
            opstat.set_errorMsg("invalid user file to load" + loadfile)

    if (opstat.get_status()):

        # NOTE(review): when no user lists are defined and loadfile is None,
        # dsstr is referenced here without ever being assigned — confirm
        # upstream guarantees at least one user list on that path
        if (loadfile is None):
            cfg.set_config_value(maint_list_utility_input_id + "Parms",
                                 [def_list, "", dsstr, ""])
        else:
            cfg.set_config_value(maint_list_utility_input_id + "Parms",
                                 [def_list, "", dsstr, loadfile])

        cfg.set_config_value(maint_list_utility_input_id + "ParmsProtect",
                             [True, False, True, True])

        help_note = "To add a user list enter parms and values above and click on 'Add User List'.</br>To update the current list change values and click on 'Update User List'"

        from dfcleanser.common.common_utils import get_help_note_html
        list_maint_notes_html = get_help_note_html(help_note, 80, 75, None)

        list_maint_html = ""
        list_maint_html = list_maint_input_form.get_html()

        list_maint_title_html = "<div>User Lists</div><br></br>"

        gridclasses = [
            "dfcleanser-common-grid-header", "dfc-bottom", "dfc-footer"
        ]
        gridhtmls = [
            list_maint_title_html, list_maint_html, list_maint_notes_html
        ]

        #print(list_maint_html)
        #print(list_maint_notes_html)

        print("\n")
        display_generic_grid("sw-utils-listdict-wrapper", gridclasses,
                             gridhtmls)

    else:
        display_exception(opstat)
        # NOTE(review): the log text says "User Dict" in the lists form —
        # presumably copied from display_dict_maint; confirm wording
        add_error_to_log("[Get User Dict from File] " + loadfile + str(sys.exc_info()[0].__name__))
def display_dict_maint(keyValue=None, loadfile=None):
    """
    * ------------------------------------------------------------------------
    * function : display the user dicts maintenance form
    *
    * parms :
    *   keyValue - dict name
    *   loadfile - optional json file of dict values to preload
    *
    * -------------------------------------------------------------------------
    """

    #print("display_dict_maint",keyValue,loadfile)

    opstat = opStatus()

    # pick the input form variant: plain maintenance vs load-from-file
    if (loadfile is None):
        dict_maint_input_form = InputForm(
            maint_dict_utility_input_id, maint_dict_utility_input_idList,
            maint_dict_utility_input_labelList,
            maint_dict_utility_input_typeList,
            maint_dict_utility_input_placeholderList,
            maint_dict_utility_input_jsList, maint_dict_utility_input_reqList)
    else:
        dict_maint_input_form = InputForm(
            maint_dict_file_utility_input_id,
            maint_dict_file_utility_input_idList,
            maint_dict_file_utility_input_labelList,
            maint_dict_file_utility_input_typeList,
            maint_dict_file_utility_input_placeholderList,
            maint_dict_file_utility_input_jsList,
            maint_dict_file_utility_input_reqList)

    selectDicts = []

    from dfcleanser.sw_utilities.sw_utility_model import get_dicts_names, USER_CREATED
    dict_names = get_dicts_names(USER_CREATED)

    #print("dict_names",dict_names)

    if (not (dict_names is None)):

        # default selection: caller-supplied key or the first user dict
        if (keyValue is None):
            def_dict = dict_names[0]
        else:
            def_dict = keyValue

        seldict = swum.get_Dict(def_dict, USER_CREATED)
        keys = list(seldict.keys())

        # these two built-in dicts are displayed with sorted keys
        if ((def_dict == "Country_Codes") or (def_dict == "Language_Codes")):
            keys.sort()

        seldict = swum.get_pretty_dict(seldict, keys)

    else:
        dict_names = ["No User dicts defined"]
        def_dict = "No User dicts defined"
        seldict = "User defined dict"

    dictssel = {
        "default": def_dict,
        "list": dict_names,
        "callback": "select_dict"
    }
    selectDicts.append(dictssel)

    from dfcleanser.common.common_utils import get_select_defaults
    get_select_defaults(dict_maint_input_form, maint_dict_utility_input_id,
                        maint_dict_utility_input_idList,
                        maint_dict_utility_input_typeList, selectDicts)

    dict_maint_input_form.set_gridwidth(700)
    #dict_maint_input_form.set_custombwidth(110)

    # button margin differs between the two form variants
    if (loadfile is None):
        dict_maint_input_form.set_buttonstyle({
            "font-size": 13, "height": 75, "width": 90, "left-margin": 20
        })
    else:
        dict_maint_input_form.set_buttonstyle({
            "font-size": 13, "height": 75, "width": 90, "left-margin": 205
        })

    dict_maint_input_form.set_fullparms(True)

    cfg.drop_config_value(maint_dict_utility_input_id + "Parms")
    cfg.drop_config_value(maint_dict_utility_input_id + "ParmsProtect")

    if (not (loadfile is None)):

        import json
        #from dfcleanser.common.common_utils import does_file_exist
        #print("does_file_exist",does_file_exist(loadfile))

        try:
            with open(loadfile, 'r') as ds_file:
                ds = json.load(ds_file)
                ds_file.close()

            keys = list(ds.keys())
            seldict = swum.get_pretty_dict(ds, keys)
            #print(seldict)

        except Exception as e:
            # NOTE(review): unlike display_list_maint, set_status(False) is
            # not called here, so the failure branch below may never be
            # taken — confirm whether set_errorMsg/set_exception flip status
            opstat.set_errorMsg("invalid user file to load " + loadfile)
            opstat.set_exception(e)

    if (opstat.get_status()):

        if (loadfile is None):
            cfg.set_config_value(maint_dict_utility_input_id + "Parms",
                                 [def_dict, "", seldict, ""])
        else:
            cfg.set_config_value(maint_dict_utility_input_id + "Parms",
                                 [def_dict, "", seldict, loadfile])

        cfg.set_config_value(maint_dict_utility_input_id + "ParmsProtect",
                             [True, False, True, True])

        help_note = "To add a user dict enter parms and values above and click on 'Add User Dict'.</br>To update the current dict change values and click on 'Update User Dict'"

        from dfcleanser.common.common_utils import get_help_note_html
        dict_maint_notes_html = get_help_note_html(help_note, 80, 75, None)

        # NOTE(review): this assignment is dead — immediately overwritten
        dict_maint_html = "Fill in new user dict parms or update currently displayed user dict."
        dict_maint_html = dict_maint_input_form.get_html()

        dict_maint_title_html = "<div>User Dicts</div><br></br>"

        gridclasses = [
            "dfcleanser-common-grid-header", "dfc-bottom", "dfc-footer"
        ]
        gridhtmls = [
            dict_maint_title_html, dict_maint_html, dict_maint_notes_html
        ]

        #print(dict_maint_html)
        #print(dict_maint_notes_html)

        print("\n")
        display_generic_grid("sw-utils-listdict-wrapper", gridclasses,
                             gridhtmls)

    else:
        display_exception(opstat)
        add_error_to_log("[Get User Dict from File] " + loadfile + str(sys.exc_info()[0].__name__))
def display_inspect_cols(parms):
    """
    * --------------------------------------------------------------------------
    * function : display the inspect cols option
    *
    * parms :
    *   parms - column name to inspect (None selects the first column)
    *
    * returns :
    *   N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:
        df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
        colnames = df.columns.tolist()

        if (not (parms is None)):
            colname = parms
        else:
            colname = colnames[0]

        # column selector dropdown definition
        cnames = {
            'default': colname,
            'list': colnames,
            "callback": "change_inspect_cols_col",
            "size": 10
        }

        # numeric and non-numeric columns use different detail forms
        if (is_numeric_col(df, colname)):
            coldetails_form = InputForm(diw.inspect_col_input_id,
                                        diw.inspect_col_input_idList,
                                        diw.inspect_col_input_labelList,
                                        diw.inspect_col_input_typeList,
                                        diw.inspect_col_input_placeholderList,
                                        diw.inspect_col_input_jsList,
                                        diw.inspect_col_input_reqList)
        else:
            coldetails_form = InputForm(
                diw.inspect_nn_col_input_id, diw.inspect_nn_col_input_idList,
                diw.inspect_nn_col_input_labelList,
                diw.inspect_nn_col_input_typeList,
                diw.inspect_nn_col_input_placeholderList,
                diw.inspect_nn_col_input_jsList,
                diw.inspect_nn_col_input_reqList)

        selectDicts = []
        selectDicts.append(cnames)
        get_select_defaults(coldetails_form, diw.inspect_col_input_id,
                            diw.inspect_col_input_idList,
                            diw.inspect_col_input_typeList, selectDicts)

        coldetails_form.set_shortForm(True)
        coldetails_form.set_fullparms(True)

        # sizing differs between inline and pop-up modes, and by column type
        if (cfg.get_dfc_mode() == cfg.INLINE_MODE):
            coldetails_form.set_gridwidth(360)
            if (is_numeric_col(df, colname)):
                coldetails_form.set_buttonstyle({
                    "font-size": 12, "height": 75, "width": 85,
                    "left-margin": 2
                })
            else:
                coldetails_form.set_buttonstyle({
                    "font-size": 12, "height": 75, "width": 85,
                    "left-margin": 75
                })
        else:
            coldetails_form.set_gridwidth(480)
            if (is_numeric_col(df, colname)):
                coldetails_form.set_buttonstyle({
                    "font-size": 12, "height": 75, "width": 110,
                    "left-margin": 2
                })
            else:
                coldetails_form.set_buttonstyle({
                    "font-size": 12, "height": 75, "width": 110,
                    "left-margin": 110
                })

        coldetails_html = coldetails_form.get_html()

        from dfcleanser.data_cleansing.data_cleansing_widgets import display_col_stats
        col_stats_html = display_col_stats(df, colname, False, True)

        gridclasses = ["dfc-left", "dfc-right"]
        gridhtmls = [col_stats_html, coldetails_html]

        if (cfg.get_dfc_mode() == cfg.INLINE_MODE):
            display_generic_grid("df-inspection-column-data-wrapper",
                                 gridclasses, gridhtmls)
        else:
            display_generic_grid("df-inspection-pop-up-column-data-wrapper",
                                 gridclasses, gridhtmls)

    except Exception as e:
        opstat.store_exception("Error displaying column data\n ", e)

    clock.stop()

    if (not (opstat.get_status())):
        display_exception(opstat)
def display_inspect_rows(rowid=0):
    """
    * --------------------------------------------------------------------------
    * function : display the inspect rows option
    *
    * parms :
    *   rowid - row id to start the display at (default 0)
    *
    * returns :
    *   N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:
        print("\n")

        from dfcleanser.data_transform.data_transform_dataframe_widgets import display_current_df_index
        display_current_df_index(
            cfg.get_current_chapter_df(cfg.DataInspection_ID),
            cfg.get_current_chapter_dfc_df_title(cfg.DataInspection_ID), 0,
            True)

        row_stats_html = diw.display_row_stats(
            cfg.get_current_chapter_df(cfg.DataInspection_ID),
            cfg.get_config_value(cfg.CURRENT_INSPECTION_DF), False)

        # 200 is presumably the display window row count — confirm against
        # dim.display_df_rows
        sample_row_html = dim.display_df_rows(
            cfg.get_current_chapter_df(cfg.DataInspection_ID), rowid, 200)

        rows_openexcel_tb = diw.get_inspection_openexcel_taskbar()
        rows_openexcel_tb.set_gridwidth(620)
        rows_openexcel_tb.set_customstyle({
            "font-size": 13, "height": 90, "width": 120, "left-margin": 10
        })

        rows_openexcel_html = rows_openexcel_tb.get_html()
        rows_openexcel_html = (rows_openexcel_html + "<br>")

        # remember the scroll position for subsequent scroll requests
        cfg.set_config_value(cfg.CURRENT_SCROLL_ROW_KEY, rowid)

        gridclasses = ["dfc-top", "dfc-bottom", "dfc-footer"]
        gridhtmls = [row_stats_html, sample_row_html, rows_openexcel_html]

        if (cfg.get_dfc_mode() == cfg.INLINE_MODE):
            display_generic_grid("df-inspection-row-data-wrapper",
                                 gridclasses, gridhtmls)
        else:
            display_generic_grid("df-inspection-row-data-pop-up-wrapper",
                                 gridclasses, gridhtmls)

    except Exception as e:
        opstat.store_exception("Error displaying row data\n ", e)
        display_exception(opstat)

        import traceback
        traceback.print_exc()

    clock.stop()
def display_data_inspection(option, parms=None):
    """
    * --------------------------------------------------------------------------
    * function : main data inspection processing - dispatches every data
    *            inspection chapter option to its handler
    *
    * parms :
    *   option - function option (a dim.*_OPTION id)
    *   parms  - associated parms; meaning depends on option
    *            (typically html input values or a column/row id)
    *
    * returns :
    *   N/A - renders html as a side effect
    * --------------------------------------------------------
    """

    from IPython.display import clear_output
    clear_output()

    opstat = opStatus()

    # make sure this chapter's html input definitions are registered once
    from dfcleanser.common.html_widgets import define_inputs, are_owner_inputs_defined
    if (not (are_owner_inputs_defined(cfg.DataInspection_ID))):
        define_inputs(cfg.DataInspection_ID, diw.datainspection_inputs)

    if (option == dim.MAIN_OPTION):
        drop_working_df()
        diw.display_dfc_inspection_main()
        clear_data_inspection_data()
    else:
        diw.display_inspection_main_taskbar()

    if (cfg.is_a_dfc_dataframe_loaded()):

        # these options carry the target dataframe title in parms[0];
        # stash it as the current inspection df before dispatching
        if ((option == dim.DISPLAY_DATATYPES_OPTION) or
                (option == dim.DISPLAY_NANS_OPTION) or
                (option == dim.DISPLAY_ROWS_OPTION) or
                (option == dim.DISPLAY_COLS_OPTION) or
                (option == dim.DISPLAY_CATEGORIES_OPTION)):

            fparms = get_parms_for_input(parms[0],
                                         diw.data_inspection_df_input_idList)
            if (len(fparms) > 0):
                cfg.set_config_value(cfg.CURRENT_INSPECTION_DF, fparms[0])

            # the rows view keeps the working df alive for scrolling
            if (not (option == dim.DISPLAY_ROWS_OPTION)):
                drop_working_df()

        if ((option == dim.DISPLAY_DATATYPES_OPTION) or
                (option == dim.DISPLAY_FULL_COLUMN_NAMES)):
            df_data_info = dim.get_df_datatypes_data(
                cfg.get_current_chapter_df(cfg.DataInspection_ID))
            display_inspect_datatypes(option, df_data_info)

        elif (option == dim.DISPLAY_NANS_OPTION):
            display_inspect_nans()

        elif (option == dim.DISPLAY_ROWS_OPTION):
            display_inspect_rows()

        elif (option == dim.DISPLAY_COLS_OPTION):
            # optional parms[1] is a specific column name to inspect
            if (len(parms) > 1):
                display_inspect_cols(parms[1])
            else:
                display_inspect_cols(None)

        elif (option == dim.DISPLAY_CATEGORIES_OPTION):
            display_inspect_categories()

        elif ((option == dim.DROP_ROW_NANS_OPTION) or
                (option == dim.DROP_COL_NANS_OPTION)):

            # parms[0] is the threshold type, parms[1] the raw input values
            thresholdType = parms[0]

            if (option == dim.DROP_ROW_NANS_OPTION):
                fparms = get_parms_for_input(parms[1],
                                             diw.drop_rows_input_idList)
            else:
                fparms = get_parms_for_input(parms[1],
                                             diw.drop_columns_input_idList)

            if (len(fparms) > 0):
                try:
                    threshold = int(fparms[0])
                except:
                    # non-numeric threshold entered by the user
                    opstat.set_status(False)
                    if (option == dim.DROP_ROW_NANS_OPTION):
                        opstat.set_errorMsg("Drop Nan Rows Threshold value '" +
                                            fparms[0] + "' is invalid")
                    else:
                        opstat.set_errorMsg("Drop Nan Cols Threshold value '" +
                                            fparms[0] + "' is invalid")
                    threshold = None
            else:
                opstat.set_status(False)
                if (option == dim.DROP_ROW_NANS_OPTION):
                    opstat.set_errorMsg(
                        "Drop Nan Rows Threshold value is not defined")
                else:
                    opstat.set_errorMsg(
                        "Drop Nan Cols Threshold value is not defined")
                threshold = None

            if (option == dim.DROP_ROW_NANS_OPTION):
                if (opstat.get_status()):
                    dropstats = drop_nan_rows(
                        cfg.get_current_chapter_df(cfg.DataInspection_ID),
                        threshold, thresholdType, opstat)
                    # drop_nan_rows may flip opstat on failure
                    if (not (opstat.get_status())):
                        display_exception(opstat)
                    else:
                        if (dropstats[0] > 0):
                            display_status(
                                str(dropstats[0]) +
                                " Nan Rows Dropped Successfully")
                        else:
                            display_status(
                                "No Rows matching threshold were dropped")
            else:
                if (opstat.get_status()):
                    numcolsdropped = drop_nan_cols(
                        cfg.get_current_chapter_df(cfg.DataInspection_ID),
                        threshold, thresholdType, opstat)
                    if (not (opstat.get_status())):
                        display_exception(opstat)
                    else:
                        if (numcolsdropped > 0):
                            display_status(
                                str(numcolsdropped) +
                                " Columns with Nans Dropped Successfully")
                        else:
                            display_status(
                                " No Columns matching threshold were dropped")

        elif (option == dim.DISPLAY_ROW_OPTION):
            display_inspect_rows()

        elif (option == dim.DISPLAY_COL_GRAPHS):
            display_inspect_graphs(parms)

        elif (option == dim.DISPLAY_COL_OUTLIERS):
            display_inspect_outliers(parms[0])

        elif (option == dim.DISPLAY_SCROLL_TO_DF_ROW):
            diw.display_scroll_to_row()

        elif (option == dim.PROCESS_SCROLL_TO_DF_ROW):

            opstat = opStatus()
            df = cfg.get_current_chapter_df(cfg.DataInspection_ID)

            # resolve the user-entered row id/label into a row offset
            retparms = get_row_id_for_df(df, parms,
                                         diw.scroll_df_rows_input_idList,
                                         opstat)

            if (opstat.get_status()):
                # NOTE(review): both branches are identical - retparms[1]
                # appears to have no effect here; confirm intent
                if (retparms[1] == 0):
                    display_inspect_rows(retparms[0])
                else:
                    display_inspect_rows(retparms[0])
            else:
                diw.display_scroll_to_row()
                display_exception(opstat)

        elif (option == dim.SCROLL_DF_ROWS_DOWN):

            new_row_id = cfg.get_config_value(cfg.CURRENT_SCROLL_ROW_KEY)

            if (new_row_id is None):
                new_row_id = 0
            else:
                # page down one window (200 rows); clamp at end of df
                new_row_id = new_row_id + 200
                df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
                if (new_row_id > len(df)):
                    new_row_id = cfg.get_config_value(
                        cfg.CURRENT_SCROLL_ROW_KEY)

            display_inspect_rows(new_row_id)

        elif (option == dim.SCROLL_DF_ROWS_UP):

            new_row_id = cfg.get_config_value(cfg.CURRENT_SCROLL_ROW_KEY)

            if (new_row_id is None):
                new_row_id = 0
            else:
                # page up one window (200 rows); clamp at start of df
                new_row_id = new_row_id - 200
                if (new_row_id < 0):
                    new_row_id = 0

            display_inspect_rows(new_row_id)

        elif (option == dim.DISPLAY_DF_ROW):
            print("dim.DISPLAY_DF_ROW")

        elif (option == dim.DISPLAY_DF_ROW_REMOTE):

            # called from another chapter; map that chapter's current df
            # to the inspection chapter before displaying rows
            chapterid = parms[0]
            #print("chapterId",chapterid)

            new_config_df = None

            if (chapterid == cfg.DataInspection_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_INSPECTION_DF)
            elif (chapterid == cfg.DataCleansing_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_CLEANSE_DF)
            elif (chapterid == cfg.DataTransform_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_TRANSFORM_DF)
            elif (chapterid == cfg.DataExport_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_EXPORT_DF)
            elif (chapterid == cfg.DataImport_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_IMPORT_DF)
            elif (chapterid == cfg.SWGeocodeUtility_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_GEOCODE_DF)
            elif (chapterid == cfg.SWDFSubsetUtility_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_SUBSET_DF)

            cfg.set_config_value(cfg.CURRENT_INSPECTION_DF, new_config_df)
            display_inspect_rows()

    else:
        # no dataframe loaded - clear stale state and tell the user
        cfg.drop_config_value(cfg.CURRENT_INSPECTION_DF)
        if (not (option == dim.MAIN_OPTION)):
            cfg.display_no_dfs(cfg.DataInspection_ID)

    from dfcleanser.common.display_utils import display_pop_up_buffer
    display_pop_up_buffer()
def display_inspect_datatypes(option, df_data_info):
    """
    * --------------------------------------------------------------------------
    * function : display the datatypes option - renders the column datatypes
    *            table and a bar chart of type counts
    *
    * parms :
    *   option       - display option; dim.DISPLAY_FULL_COLUMN_NAMES suppresses
    *                  the datatypes table header object
    *   df_data_info - 3-element list: [type names, type counts, extra info]
    *                  as produced by dim.get_df_datatypes_data - TODO confirm
    *                  element semantics against that helper
    *
    * returns :
    *   N/A - renders html and a matplotlib figure as a side effect
    * --------------------------------------------------------
    """

    opstat = opStatus()

    # spinner shown while the html and chart are being built
    clock = RunningClock()
    clock.start()

    try:

        if (not (option == dim.DISPLAY_FULL_COLUMN_NAMES)):
            data_types_table = dcTable("Column Data Types", "datatypesTable",
                                       cfg.DataInspection_ID)
        else:
            data_types_table = None

        data_types_html = diw.display_df_datatypes(data_types_table,
                                                   df_data_info[0],
                                                   df_data_info[1],
                                                   df_data_info[2],
                                                   option, False)

        gridclasses = ["dfc-main"]
        gridhtmls = [data_types_html]

        # wrapper id differs between inline and pop-up display modes
        if (cfg.get_dfc_mode() == cfg.INLINE_MODE):
            display_generic_grid("df-inspection-wrapper",
                                 gridclasses, gridhtmls)
        else:
            display_generic_grid("df-inspection-pop-up-wrapper",
                                 gridclasses, gridhtmls)

        print("\n")

        # plotting imports are local so the chapter loads without
        # matplotlib unless this option is used
        import matplotlib.pyplot as plt
        import numpy as np

        font = {'fontsize': 14}
        font2 = {'fontsize': 18}

        # strip the "datetime." prefix so tick labels stay short
        objects = []
        for i in range(len(df_data_info[0])):
            ttype = str(df_data_info[0][i])
            ttype = ttype.replace("datetime.", "")
            objects.append(ttype)

        y_pos = np.arange(len(objects))

        plt.bar(y_pos, df_data_info[1], align='center', alpha=0.5,
                color='#428bca')
        plt.xticks(y_pos, objects, rotation='vertical')
        plt.ylabel('Type Counts', fontdict=font)
        plt.xlabel('Data Types', fontdict=font2)
        plt.title('Column Data Types', fontdict=font2)
        plt.show()

    except Exception as e:
        opstat.store_exception("Error displaying data types\n ", e)

    clock.stop()

    if (not (opstat.get_status())):
        display_exception(opstat)
def display_system_environment(funcId, parms=None):
    """
    * --------------------------------------------------------------------------
    * function : display system environment screens - dispatches every
    *            system chapter function id to its handler
    *
    * parms :
    *   funcId - display func id (a sysm.* id)
    *   parms  - associated parms; meaning depends on funcId
    *
    * returns :
    *   N/A - renders html as a side effect
    * --------------------------------------------------------
    """

    # dfcleanser not initialised yet - only show the main taskbar
    if (not (cfg.check_if_dc_init())):
        sysw.display_system_main_taskbar()
        return
    else:
        # make sure this chapter's html input definitions are registered once
        from dfcleanser.common.html_widgets import define_inputs, are_owner_inputs_defined
        if (not (are_owner_inputs_defined(cfg.System_ID))):
            define_inputs(cfg.System_ID, sysw.system_inputs)

    if (funcId == sysm.DISPLAY_MAIN):
        display_main_tb()
        clear_system_data()

    # force the EULA screen until it has been accepted
    if (not (funcId == sysm.PROCESS_EULA)):
        if (not isEULA_read()):
            #display_system_main_taskbar()
            sysw.display_EULA()
            return

    if (funcId == sysm.DISPLAY_CHAPTERS):
        sysw.display_system_chapters_taskbar()

    if (funcId == sysm.RESET_CHAPTERS):
        sysw.display_system_main_taskbar()
        initialize_notebook()

    if (funcId == sysm.PROCESS_CHAPTERS):

        # parms[0] is a stringified list of '"True"'/'"False"' checkbox
        # values; convert to a 1/0 flag list for reload_dfcleanser
        parms[0] = parms[0].replace("[", "")
        parms[0] = parms[0].replace("]", "")
        utils_cbs = parms[0].split(",")

        utilscbs = []
        for i in range(len(utils_cbs)):
            if (utils_cbs[i] == '"True"'):
                utilscbs.append(1)
            else:
                utilscbs.append(0)

        from dfcleanser.system.load import reload_dfcleanser
        reload_dfcleanser([utilscbs])

        clear_cell()
        sysw.display_system_main_taskbar()

    elif (funcId == sysm.DISPLAY_DATAFRAMES):
        # optional parms[0] is a dataframe title to pre-select
        if (not (parms is None)):
            title = parms[0]
        else:
            title = None
        sysw.display_system_main_taskbar()
        sysw.display_df_dataframes(title)

    elif (funcId == sysm.DISPLAY_ADD_DATAFRAME):
        sysw.display_system_main_taskbar()
        # clear any stale add-df form values before showing the form
        cfg.drop_config_value(sysw.dfmgr_add_input_id + "Parms")
        sysw.display_add_df_input()

    elif (funcId == sysm.PROCESS_DATAFRAME):

        # parms[0] is the df-manager sub-function id, parms[1] the inputs
        fid = parms[0]

        from dfcleanser.common.common_utils import get_parms_for_input
        fparms = get_parms_for_input(parms[1], sysw.dfmgr_input_idList)

        dftitle = None

        if (fid == sysm.DROP_DATAFRAME):
            cfg.drop_dfc_dataframe(fparms[0])
        elif (fid == sysm.SET_DATAFRAME):
            print("sysm.SET_DATAFRAME")
        elif (fid == sysm.UPDATE_DATAFRAME):
            cfg.set_dfc_dataframe_notes(fparms[0], fparms[3])
            dftitle = fparms[0]
        elif (fid == sysm.RENAME_DATAFRAME):
            cfg.rename_dfc_dataframe(
                cfg.get_config_value(cfg.CURRENT_DF_DISPLAYED_KEY),
                fparms[0])
            dftitle = fparms[0]

        sysw.display_system_main_taskbar()
        sysw.display_df_dataframes(dftitle)

    elif (funcId == sysm.PROCESS_ADD_DATAFRAME):

        opstat = opStatus()

        from dfcleanser.common.common_utils import get_parms_for_input
        fparms = get_parms_for_input(parms[1], sysw.dfmgr_add_input_idList)

        dftitle = fparms[0]
        dfobject = fparms[1]
        dfnotes = fparms[2]

        if (not (len(dftitle)) > 0):
            opstat.set_status(False)
            opstat.set_errorMsg("Invalid df title parm")
        else:
            if (len(dfobject) > 0):
                try:
                    # register the df via javascript in the notebook kernel
                    add_df_js = ("add_new_dfc_df('" + dftitle + "', '" +
                                 dfobject + "', '" + dfnotes + "');")
                    run_jscript(add_df_js, "fail to add dataframe : ")
                except Exception:
                    opstat.set_status(False)
                    opstat.set_errorMsg(
                        "Unable to add df to dfc manager : " +
                        str(sys.exc_info()[0].__name__))
            else:
                opstat.set_status(False)
                opstat.set_errorMsg("Invalid df name parm")

        if (opstat.get_status()):
            sysw.display_df_dataframes(dftitle)
        else:
            # re-show the form with the error so the user can retry
            display_status(opstat.get_errorMsg())
            sysw.display_add_df_input()

    elif (funcId == sysm.DISPLAY_SYSTEM):
        display_main_tb()
        sysw.show_sys_info()

    elif (funcId == sysm.DISPLAY_OFFLINE):
        display_main_tb()
        sysw.display_offline()

    elif (funcId == sysm.DISPLAY_ABOUT):
        sysw.display_system_main_taskbar()
        sysw.show_about_info()

    elif (funcId == sysm.DISPLAY_DFC_FILES):
        sysw.display_system_main_taskbar()
        sysw.display_dfc_files_form()

    elif (funcId == sysm.DISPLAY_EULA):
        display_main_tb()
        sysw.display_EULA()

    elif (funcId == sysm.DISPLAY_README):
        display_main_tb()
        sysw.display_README()

    elif (funcId == sysm.PROCESS_EULA):
        # user accepted the EULA - persist the flag
        display_main_tb()
        cfg.set_config_value(cfg.EULA_FLAG_KEY, "true")

    elif (funcId == sysm.EXIT_SETUP):
        from dfcleanser.system.load import unload_dfcleanser
        unload_dfcleanser()
        return
def get_geocode_center(geocoords):
    """
    * ------------------------------------------------------------------------
    * function : get the center point of a list of [lat,lng] locations
    *
    * parms :
    *   geocoords - geocode locations list; each entry is a [lat, lng] pair.
    *               Values may be numbers or numeric strings; pairs that do
    *               not parse as floats are ignored.
    *
    * returns :
    *   Successful : [centralLatitude, centralLongitude] list
    *   Error      : opStatus object (empty/invalid input or exception)
    *
    * Notes :
    *   dfcleanser generic function
    *
    *   Fixes vs. previous version:
    *   - 'len(geocoords > 0)' compared a list to an int (TypeError), so the
    *     success path could never run; now 'len(...) > 0'.
    *   - invalid entries were popped from the list while iterating over
    *     range(len(...)), shifting indices; now filtered into a new list.
    * -------------------------------------------------------------------------
    """

    import math

    try:
        # keep only pairs whose lat/lng both parse as floats
        valid_coords = []
        for coord in geocoords:
            try:
                lat = float(coord[0])
                lng = float(coord[1])
            except Exception:
                continue
            valid_coords.append((lat, lng))

        if (len(valid_coords) > 0):

            # average the unit vectors of each location on the sphere
            x = float(0)
            y = float(0)
            z = float(0)

            for lat, lng in valid_coords:
                latitude = lat * math.pi / 180
                longitude = lng * math.pi / 180

                x = x + math.cos(latitude) * math.cos(longitude)
                y = y + math.cos(latitude) * math.sin(longitude)
                z = z + math.sin(latitude)

            x = x / len(valid_coords)
            y = y / len(valid_coords)
            z = z / len(valid_coords)

            # convert the mean vector back to lat/lng
            centralLongitude = math.atan2(y, x)
            centralSquareRoot = math.sqrt(x * x + y * y)
            centralLatitude = math.atan2(z, centralSquareRoot)

            centralLatitude = centralLatitude * 180 / math.pi
            centralLongitude = centralLongitude * 180 / math.pi

            return ([centralLatitude, centralLongitude])

        else:
            # opStatus is only built on the error paths
            opstat = opStatus()
            opstat.set_status(False)
            opstat.set_errorMsg(
                "Calculate Geocode Center Error : geocoords list is empty")
            return (opstat)

    except Exception:
        opstat = opStatus()
        opstat.set_status(False)
        opstat.set_errorMsg("Calculate Geocode Center Exception : " +
                            str(sys.exc_info()[0].__name__))
        return (opstat)
def display_script_exception(e):
    """
    * --------------------------------------------------------------------------
    * function : display a script execution exception to the user
    *
    * parms :
    *   e - the exception instance raised while running the script
    *
    * returns :
    *   () - empty tuple
    * --------------------------------------------------------
    """
    # wrap the exception in an opStatus and render it
    status = opStatus()
    status.store_exception("Unable to run script", e)
    display_exception(status)

    return ()