コード例 #1
0
def save_column_names_row(parms,display=True):
    """
    * -------------------------------------------------------------------------- 
    * function : save the column names of the current transform df to a json file
    * 
    * parms :
    *   parms     -   transform parms - the first input value is the file name;
    *                 when it is empty a default name is built from the
    *                 current imported data source config value
    *   display   -   display flag - when True the call is also added to the script
    *
    * returns : 
    *  [opstat, filename]  -  operation status and the file name used
    *                         (filename is "" if the parms could not be read)
    * --------------------------------------------------------
    """

    opstat      =   opStatus() 
    
    # bound before the try so the except handler and the return can always use it
    filename    =   ""
    
    try :
    
        fparms      =   get_parms_for_input(parms,dftw.df_save_row_transform_input_idList)
        filename    =   fparms[0]
            
        if(len(filename) == 0) :
            # no file name given : derive one from the imported data source name
            filename = "./" + cfg.get_config_value(cfg.CURRENT_IMPORTED_DATA_SOURCE_KEY) + "_column_names.json"

        # NOTE(review): sibling functions pass cfg.DataTransform_ID to
        # get_current_chapter_df - confirm CURRENT_TRANSFORM_DF is the intended key
        colids = cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns.tolist()

        # the with block closes the file, no explicit close is needed
        with open(filename, 'w') as colid_file :
            json.dump(colids,colid_file)
                
        if(display) :
                    
            #make scriptable
            add_to_script(["# save column names row",
                           "from dfcleanser.data_transform.data_transform_dataframe_control import save_column_names_row",
                           "save_column_names_row(" + json.dumps(parms) + ",False)"],opstat)
                
    except Exception as e:
        # str() keeps the handler itself from failing on a non-string file name
        opstat.store_exception("Unable to save column names file to : " + str(filename),e)

    return([opstat, filename])
コード例 #2
0
def display_inspect_nans():
    """
    * -------------------------------------------------------------------------- 
    * function : display the NaN inspection tables for the current
    *            data inspection dataframe
    * 
    * parms :
    *  N/A
    *
    * returns : 
    *  N/A
    * --------------------------------------------------------
    """

    owner_id = cfg.DataInspection_ID

    # one table for the rows and one for the columns with the most NaNs
    rows_table = dcTable("Rows with most NaNs", "nansrowTable", owner_id)
    cols_table = dcTable("Columns with most NaNs", "nansTable", owner_id)

    current_df = cfg.get_current_chapter_df(owner_id)
    diw.display_null_data(current_df, rows_table, cols_table, 120)
コード例 #3
0
def remwhitespace_column_names_row(parms,display=True):
    """
    * -------------------------------------------------------- 
    * function : placeholder - NOT IMPLEMENTED
    *
    *   Going by the name this is presumably meant to remove whitespace
    *   from the column names; at present it changes nothing and only
    *   returns a fresh status object.
    * 
    * parms :
    *   parms     -   transform parms (currently unused)
    *   display   -   display flag    (currently unused)
    *
    * returns : 
    *  opstat - a newly created operation status object
    * --------------------------------------------------------
    """

    opstat = opStatus() 

    # TODO : implement. The previous body sat behind an unconditional return
    # and was never executed; it dropped every column of the dataframe rather
    # than touching the column names, so it has been removed.
    return(opstat)
コード例 #4
0
def display_inspect_categories():
    """
    * -------------------------------------------------------------------------- 
    * function : display the category and category candidate column
    *            tables for the current data inspection dataframe
    * 
    * parms :
    *  N/A
    *
    * returns : 
    *  N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:

        inspection_id = cfg.DataInspection_ID

        categories_table = dcTable("Category Columns", "catcolsTable",
                                   inspection_id)
        candidates_table = dcTable("Category Candidate Columns",
                                   "catcandcolsTable", inspection_id)

        current_df = cfg.get_current_chapter_df(inspection_id)

        # the two counts are unpacked but not used any further here
        category_count, candidate_count = diw.display_df_categories(
            current_df, categories_table, candidates_table)

    except Exception as e:
        opstat.store_exception("Error displaying category data\n ", e)

    clock.stop()

    if opstat.get_status():
        return

    # something failed above : show the stored exception
    display_exception(opstat)
コード例 #5
0
def display_inspect_cols(parms):
    """
    * -------------------------------------------------------------------------- 
    * function : display the column statistics and the column details
    *            form for one column of the current inspection dataframe
    * 
    * parms :
    *   parms   -   name of the column to inspect, or None to use the
    *               first column of the dataframe
    *
    * returns : 
    *  N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:

        df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
        all_columns = df.columns.tolist()

        # an explicitly requested column wins, otherwise take the first one
        colname = all_columns[0] if parms is None else parms

        column_select = {
            'default': colname,
            'list': all_columns,
            "callback": "change_inspect_cols_col",
            "size": 10
        }

        # numeric and non numeric columns use different form definitions
        if is_numeric_col(df, colname):
            form_spec = (diw.inspect_col_input_id,
                         diw.inspect_col_input_idList,
                         diw.inspect_col_input_labelList,
                         diw.inspect_col_input_typeList,
                         diw.inspect_col_input_placeholderList,
                         diw.inspect_col_input_jsList,
                         diw.inspect_col_input_reqList)
        else:
            form_spec = (diw.inspect_nn_col_input_id,
                         diw.inspect_nn_col_input_idList,
                         diw.inspect_nn_col_input_labelList,
                         diw.inspect_nn_col_input_typeList,
                         diw.inspect_nn_col_input_placeholderList,
                         diw.inspect_nn_col_input_jsList,
                         diw.inspect_nn_col_input_reqList)

        coldetails_form = InputForm(*form_spec)

        # NOTE(review): the select defaults are always resolved with the
        # numeric form's id and type lists, also when the non numeric
        # form was built above - confirm this is intended
        get_select_defaults(coldetails_form, diw.inspect_col_input_id,
                            diw.inspect_col_input_idList,
                            diw.inspect_col_input_typeList, [column_select])

        coldetails_form.set_shortForm(True)
        coldetails_form.set_fullparms(True)

        # grid width, button width and the non numeric left margin
        # depend on the display mode
        if cfg.get_dfc_mode() == cfg.INLINE_MODE:
            grid_width, button_width, non_numeric_margin = 360, 85, 75
        else:
            grid_width, button_width, non_numeric_margin = 480, 110, 110

        coldetails_form.set_gridwidth(grid_width)

        left_margin = 2 if is_numeric_col(df, colname) else non_numeric_margin
        coldetails_form.set_buttonstyle({
            "font-size": 12,
            "height": 75,
            "width": button_width,
            "left-margin": left_margin
        })

        coldetails_html = coldetails_form.get_html()

        from dfcleanser.data_cleansing.data_cleansing_widgets import display_col_stats
        col_stats_html = display_col_stats(df, colname, False, True)

        # stats on the left, details form on the right
        if cfg.get_dfc_mode() == cfg.INLINE_MODE:
            wrapper_id = "df-inspection-column-data-wrapper"
        else:
            wrapper_id = "df-inspection-pop-up-column-data-wrapper"

        display_generic_grid(wrapper_id, ["dfc-left", "dfc-right"],
                             [col_stats_html, coldetails_html])

    except Exception as e:
        opstat.store_exception("Error displaying column data\n ", e)

    clock.stop()

    if opstat.get_status():
        return

    display_exception(opstat)
コード例 #6
0
def display_inspect_rows(rowid=0):
    """
    * -------------------------------------------------------------------------- 
    * function : display the index, the row stats, a block of rows and
    *            the open excel taskbar for the current inspection df
    * 
    * parms :
    *   rowid   -   row id handed to the row display and stored as the
    *               current scroll row (default 0)
    *
    * returns : 
    *  N/A
    * --------------------------------------------------------
    """

    opstat = opStatus()

    clock = RunningClock()
    clock.start()

    try:

        print("\n")

        from dfcleanser.data_transform.data_transform_dataframe_widgets import display_current_df_index
        index_df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
        index_df_title = cfg.get_current_chapter_dfc_df_title(
            cfg.DataInspection_ID)
        display_current_df_index(index_df, index_df_title, 0, True)

        stats_df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
        stats_df_title = cfg.get_config_value(cfg.CURRENT_INSPECTION_DF)
        row_stats_html = diw.display_row_stats(stats_df, stats_df_title, False)

        rows_df = cfg.get_current_chapter_df(cfg.DataInspection_ID)
        sample_row_html = dim.display_df_rows(rows_df, rowid, 200)

        # open excel taskbar shown below the rows
        excel_taskbar = diw.get_inspection_openexcel_taskbar()
        excel_taskbar.set_gridwidth(620)
        excel_taskbar.set_customstyle({
            "font-size": 13,
            "height": 90,
            "width": 120,
            "left-margin": 10
        })
        excel_taskbar_html = excel_taskbar.get_html() + "<br>"

        # remember the row being shown for the scroll options
        cfg.set_config_value(cfg.CURRENT_SCROLL_ROW_KEY, rowid)

        if cfg.get_dfc_mode() == cfg.INLINE_MODE:
            wrapper_id = "df-inspection-row-data-wrapper"
        else:
            wrapper_id = "df-inspection-row-data-pop-up-wrapper"

        display_generic_grid(
            wrapper_id,
            ["dfc-top", "dfc-bottom", "dfc-footer"],
            [row_stats_html, sample_row_html, excel_taskbar_html])

    except Exception as e:
        opstat.store_exception("Error displaying row data\n ", e)
        display_exception(opstat)

        # also dump the full stack trace
        import traceback
        traceback.print_exc()

    clock.stop()
コード例 #7
0
def display_data_inspection(option, parms=None):
    """
    * -------------------------------------------------------------------------- 
    * function : main data inspection processing
    *
    *   Clears the cell output, makes sure the inspection input forms are
    *   defined, shows either the main display or the inspection taskbar
    *   and then dispatches on option.  When no dfc dataframe is loaded the
    *   current inspection df config value is dropped instead.
    * 
    * parms :
    *   option  -   function option (compared against the dim option constants)
    *   parms   -   associated parms - the layout depends on the option
    *
    * returns : 
    *  N/A
    * --------------------------------------------------------
    """

    from IPython.display import clear_output
    clear_output()

    opstat = opStatus()

    from dfcleanser.common.html_widgets import define_inputs, are_owner_inputs_defined
    if (not (are_owner_inputs_defined(cfg.DataInspection_ID))):
        define_inputs(cfg.DataInspection_ID, diw.datainspection_inputs)

    if (option == dim.MAIN_OPTION):
        drop_working_df()
        diw.display_dfc_inspection_main()
        clear_data_inspection_data()
    else:
        diw.display_inspection_main_taskbar()

    if (cfg.is_a_dfc_dataframe_loaded()):

        # the main display options carry the df selection form in parms[0]
        if ((option == dim.DISPLAY_DATATYPES_OPTION)
                or (option == dim.DISPLAY_NANS_OPTION)
                or (option == dim.DISPLAY_ROWS_OPTION)
                or (option == dim.DISPLAY_COLS_OPTION)
                or (option == dim.DISPLAY_CATEGORIES_OPTION)):

            fparms = get_parms_for_input(parms[0],
                                         diw.data_inspection_df_input_idList)

            if (len(fparms) > 0):
                cfg.set_config_value(cfg.CURRENT_INSPECTION_DF, fparms[0])

            if (not (option == dim.DISPLAY_ROWS_OPTION)):
                drop_working_df()

        if ((option == dim.DISPLAY_DATATYPES_OPTION)
                or (option == dim.DISPLAY_FULL_COLUMN_NAMES)):
            df_data_info = dim.get_df_datatypes_data(
                cfg.get_current_chapter_df(cfg.DataInspection_ID))
            display_inspect_datatypes(option, df_data_info)

        elif (option == dim.DISPLAY_NANS_OPTION):
            display_inspect_nans()

        elif (option == dim.DISPLAY_ROWS_OPTION):
            display_inspect_rows()

        elif (option == dim.DISPLAY_COLS_OPTION):
            # an optional second parm names the column to show
            if (len(parms) > 1):
                display_inspect_cols(parms[1])
            else:
                display_inspect_cols(None)

        elif (option == dim.DISPLAY_CATEGORIES_OPTION):
            display_inspect_categories()

        elif ((option == dim.DROP_ROW_NANS_OPTION)
              or (option == dim.DROP_COL_NANS_OPTION)):

            drop_rows = (option == dim.DROP_ROW_NANS_OPTION)

            # only the Rows/Cols word differs between the error messages
            nan_kind = "Rows" if drop_rows else "Cols"

            thresholdType = parms[0]

            if (drop_rows):
                fparms = get_parms_for_input(parms[1],
                                             diw.drop_rows_input_idList)
            else:
                fparms = get_parms_for_input(parms[1],
                                             diw.drop_columns_input_idList)

            threshold = None

            if (len(fparms) > 0):
                try:
                    threshold = int(fparms[0])
                except (ValueError, TypeError):
                    # only conversion failures are reported as invalid input;
                    # str() keeps the message build safe for non string values
                    opstat.set_status(False)
                    opstat.set_errorMsg("Drop Nan " + nan_kind +
                                        " Threshold value '" +
                                        str(fparms[0]) + "' is invalid")
            else:
                opstat.set_status(False)
                opstat.set_errorMsg("Drop Nan " + nan_kind +
                                    " Threshold value is not defined")

            if (drop_rows):

                if (opstat.get_status()):
                    dropstats = drop_nan_rows(
                        cfg.get_current_chapter_df(cfg.DataInspection_ID),
                        threshold, thresholdType, opstat)

                # the status is checked again because the drop call gets opstat
                if (not (opstat.get_status())):
                    display_exception(opstat)
                else:
                    if (dropstats[0] > 0):
                        display_status(
                            str(dropstats[0]) +
                            " Nan Rows Dropped Successfully")
                    else:
                        display_status(
                            "No Rows matching threshold were dropped")

            else:

                if (opstat.get_status()):
                    numcolsdropped = drop_nan_cols(
                        cfg.get_current_chapter_df(cfg.DataInspection_ID),
                        threshold, thresholdType, opstat)

                if (not (opstat.get_status())):
                    display_exception(opstat)
                else:
                    if (numcolsdropped > 0):
                        display_status(
                            str(numcolsdropped) +
                            " Columns with Nans Dropped Successfully")
                    else:
                        display_status(
                            " No Columns matching threshold were dropped")

        elif (option == dim.DISPLAY_ROW_OPTION):
            display_inspect_rows()

        elif (option == dim.DISPLAY_COL_GRAPHS):
            display_inspect_graphs(parms)

        elif (option == dim.DISPLAY_COL_OUTLIERS):
            display_inspect_outliers(parms[0])

        elif (option == dim.DISPLAY_SCROLL_TO_DF_ROW):
            diw.display_scroll_to_row()

        elif (option == dim.PROCESS_SCROLL_TO_DF_ROW):

            opstat = opStatus()

            df = cfg.get_current_chapter_df(cfg.DataInspection_ID)

            retparms = get_row_id_for_df(df, parms,
                                         diw.scroll_df_rows_input_idList,
                                         opstat)

            if (opstat.get_status()):

                # retparms[0] is the row to show; the former test on
                # retparms[1] ran the same call in both branches
                display_inspect_rows(retparms[0])

            else:

                diw.display_scroll_to_row()
                display_exception(opstat)

        elif (option == dim.SCROLL_DF_ROWS_DOWN):

            new_row_id = cfg.get_config_value(cfg.CURRENT_SCROLL_ROW_KEY)

            if (new_row_id is None):
                new_row_id = 0
            else:
                new_row_id = new_row_id + 200

                df = cfg.get_current_chapter_df(cfg.DataInspection_ID)

                # len(df) itself is already past the last row, so stay on
                # the current block when the next one would start there
                if (new_row_id >= len(df)):
                    new_row_id = cfg.get_config_value(
                        cfg.CURRENT_SCROLL_ROW_KEY)

            display_inspect_rows(new_row_id)

        elif (option == dim.SCROLL_DF_ROWS_UP):

            new_row_id = cfg.get_config_value(cfg.CURRENT_SCROLL_ROW_KEY)

            if (new_row_id is None):
                new_row_id = 0
            else:
                # never scroll above the first row
                new_row_id = max(new_row_id - 200, 0)

            display_inspect_rows(new_row_id)

        elif (option == dim.DISPLAY_DF_ROW):

            # placeholder : only prints the option name
            print("dim.DISPLAY_DF_ROW")

        elif (option == dim.DISPLAY_DF_ROW_REMOTE):

            # parms[0] is the id of the chapter that asked for the display;
            # that chapter's current df becomes the inspection df
            chapterid = parms[0]

            new_config_df = None

            if (chapterid == cfg.DataInspection_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_INSPECTION_DF)
            elif (chapterid == cfg.DataCleansing_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_CLEANSE_DF)
            elif (chapterid == cfg.DataTransform_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_TRANSFORM_DF)
            elif (chapterid == cfg.DataExport_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_EXPORT_DF)
            elif (chapterid == cfg.DataImport_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_IMPORT_DF)
            elif (chapterid == cfg.SWGeocodeUtility_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_GEOCODE_DF)
            elif (chapterid == cfg.SWDFSubsetUtility_ID):
                new_config_df = cfg.get_config_value(cfg.CURRENT_SUBSET_DF)

            cfg.set_config_value(cfg.CURRENT_INSPECTION_DF, new_config_df)

            display_inspect_rows()

    else:

        # no dataframe loaded : forget the current inspection df
        cfg.drop_config_value(cfg.CURRENT_INSPECTION_DF)

        if (not (option == dim.MAIN_OPTION)):
            cfg.display_no_dfs(cfg.DataInspection_ID)

    from dfcleanser.common.display_utils import display_pop_up_buffer
    display_pop_up_buffer()
コード例 #8
0
def display_df_subset_setup():
    """
    * -------------------------------------------------------------------------- 
    * function : display the column stats table and the get subset
    *            input form for the current subset dataframe
    * 
    * parms :
    *  N/A
    *
    * returns : N/A
    * --------------------------------------------------------
    """
    df_title = cfg.get_config_value(cfg.CURRENT_SUBSET_DF)
    df = cfg.get_dfc_dataframe_df(df_title)

    col_stats_table = get_column_stats_table(df_title, df)

    from dfcleanser.common.html_widgets import InputForm
    subset_input_form = InputForm(get_subset_input_id,
                                  get_subset_input_idList,
                                  get_subset_input_labelList,
                                  get_subset_input_typeList,
                                  get_subset_input_placeholderList,
                                  get_subset_input_jsList,
                                  get_subset_input_reqList)

    # select list of the available dataframes
    dataframes_select = cfg.get_dfc_dataframes_select_list(
        cfg.SWDFSubsetUtility_ID)

    # column select : a blank entry first, then every column of the df
    subset_df = cfg.get_current_chapter_df(cfg.SWDFSubsetUtility_ID)
    column_choices = [" ", *subset_df.columns.tolist()]
    columns_select = {
        "default": column_choices[0],
        "list": column_choices,
        "callback": "change_subset_cols"
    }

    keep_drop_select = {"default": "Keep", "list": ["Keep", "Drop"]}

    get_select_defaults(subset_input_form, get_subset_input_form[0],
                        get_subset_input_form[1], get_subset_input_form[3],
                        [dataframes_select, columns_select, keep_drop_select])

    subset_input_form.set_shortForm(False)
    subset_input_form.set_gridwidth(680)
    subset_input_form.set_custombwidth(140)
    subset_input_form.set_fullparms(True)

    form_html = subset_input_form.get_html()
    heading_html = "<div>Get Dataframe Subset</div><br></br>"

    print("\n")
    display_generic_grid(
        "sw-utils-subset-wrapper",
        ["dfc-top", "dfcleanser-common-grid-header", "dfc-bottom"],
        [col_stats_table, heading_html, form_html])
コード例 #9
0
def drop_duplicate_rows(parms,display=True):
    """
    * -------------------------------------------------------------------------- 
    * function : drop duplicate rows from the current transform df (in place)
    * 
    * parms :
    *   parms     -   transform parms - input values used :
    *                   [0] column names ("" means use all columns)
    *                   [2] "Drop" to compare on the named columns, anything
    *                       else to compare on all the other columns
    *                   [3] keep value handed to drop_duplicates
    *                       (the string "False" is converted to False)
    *   display   -   display flag - when True the call is also added to the script
    *
    * returns : 
    *  opstat - operation status
    * --------------------------------------------------------
    """
    
    opstat = opStatus()
    
    fparms      =   get_parms_for_input(parms,dftw.df_drop_dups_transform_input_idList)

    colnames    =   fparms[0]
    if(len(colnames) == 0) :
        colnames    =   None
        
    drop        =   (fparms[2] == "Drop")
    
    keep        =   fparms[3]
    if(keep == "False") :
        keep    =   False
        
    df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            
    if((colnames is not None) and (not drop)) :
        # compare on every column that was NOT named
        # NOTE(review): colnames is used as returned by get_parms_for_input;
        # if that is a plain string the test below is a substring match - confirm
        colnames    =   [col for col in df.columns.tolist() if col not in colnames]
                
    if(opstat.get_status()) :
        
        try : 
            
            df.drop_duplicates(subset=colnames,keep=keep,inplace=True)
                
            if(display) :
                #make scriptable
                add_to_script(["# drop duplicate rows",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import drop_duplicate_rows",
                               "drop_duplicate_rows("+ json.dumps(parms) + ",False)"],opstat)
        
        except Exception as e: 
            # colnames may be a list or None here, so convert before concatenating
            opstat.store_exception("Unable to drop duplicate rows : " + str(colnames),e)

    return(opstat)
コード例 #10
0
def append_to_df_index(parms,display=True):
    """
    * -------------------------------------------------------------------------- 
    * function : append column(s) to the index of the current transform df (in place)
    * 
    * parms :
    *   parms     -   transform parms - input values used :
    *                   [0] column names as one comma separated string,
    *                       optionally wrapped in [ ]
    *                   [2] "True" to drop the columns moved into the index
    *                   [3] "True" to verify the integrity of the new index
    *   display   -   display flag - when True the call is also added to the script
    *
    * returns : 
    *  opstat - operation status
    * --------------------------------------------------------
    """
    
    opstat = opStatus()

    fparms      =   get_parms_for_input(parms,dftw.df_append_index_transform_input_idList)

    colnames_text   =   fparms[0].lstrip("[").rstrip("]")
    
    # test before splitting : "".split(",") gives [""] which is never empty
    if(len(colnames_text) == 0) :
        opstat.set_status(False)
        opstat.set_errorMsg("column names list is empty")
        
    else :
        
        colnames    =   colnames_text.split(",")
        
        df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
        
        drop        =   (fparms[2] == "True")
        verify      =   (fparms[3] == "True")
    
        try :
            
            df.set_index(keys=colnames,drop=drop,append=True,inplace=True,verify_integrity=verify)
                
            if(display) :
                #make scriptable
                # NOTE(review): parms[1] is scripted here while other transforms
                # script the whole parms - confirm which form replays correctly
                add_to_script(["# append to df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import append_to_df_index",
                               "append_to_df_index(" + json.dumps(parms[1]) + ",False)"],opstat)
        
        except Exception as e: 
            # colnames is a list, so convert before concatenating
            opstat.store_exception("Unable to append to df index : " + str(colnames),e)

    return(opstat)
コード例 #11
0
def set_df_index(parms,display=True):
    """
    * -------------------------------------------------------------------------- 
    * function : set the index of the current transform df from column(s)
    *            and store the df back under the current transform df title
    * 
    * parms :
    *   parms     -   transform parms - input values used :
    *                   [0] column names as one comma separated string,
    *                       optionally wrapped in [ ]
    *                   [2] "True" to drop the columns moved into the index
    *                   [3] "True" to verify the integrity of the new index
    *   display   -   display flag - when True the call is also added to the script
    *
    * returns : 
    *  opstat - operation status
    * --------------------------------------------------------
    """

    opstat = opStatus()
    
    fparms      =   get_parms_for_input(parms,dftw.df_set_index_transform_input_idList)

    colnames    =   fparms[0]
    
    if(len(colnames) == 0) :
        opstat.set_status(False)
        opstat.set_errorMsg("column names list is empty")
        return(opstat)
        
    colnames    =   colnames.lstrip("[").rstrip("]").split(",")
    
    drop        =   (fparms[2] == "True")
    verify      =   (fparms[3] == "True")
                
    if(opstat.get_status()) :

        try : 
            
            df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            
            # NOTE(review): append=True keeps the existing index levels and adds
            # the new ones - confirm this is wanted for "set" as opposed to "append"
            df.set_index(colnames,drop=drop,append=True,inplace=True,verify_integrity=verify)
            
            cfg.set_dfc_dataframe_df(cfg.get_config_value(cfg.CURRENT_TRANSFORM_DF),df)
            
            if(display) :
                #make scriptable
                add_to_script(["# set df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import set_df_index",
                               "set_df_index(" + json.dumps(parms[1]) + ",False)"],opstat)

        except Exception as e: 
            opstat.store_exception("Unable to set index of column(s) : " + str(colnames),e)

    return(opstat)
コード例 #12
0
def reset_df_index(parms,display=True):
    """
    * -------------------------------------------------------------------------- 
    * function : reset index level(s) of the current transform df (in place)
    * 
    * parms :
    *   parms     -   transform parms - input values used :
    *                   [0] index levels as one comma separated string,
    *                       optionally wrapped in [ ]; "All" as first entry
    *                       selects every named index level; "" passes
    *                       level=None to reset_index
    *                   [2] "True" calls reset_index with drop=False,
    *                       anything else with drop=True
    *   display   -   display flag - when True the call is also added to the script
    *
    * returns : 
    *  opstat - operation status
    * --------------------------------------------------------
    """
    
    opstat = opStatus() 
    
    df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
    
    fparms      =   get_parms_for_input(parms,dftw.df_reset_index_transform_input_idList)
    
    drop_levels     =   fparms[0]
    
    if(len(drop_levels) > 0) :
        
        drop_levels     =   drop_levels.lstrip("[").rstrip("]").split(",")
        
        if(drop_levels[0] == "All") :
            # every index level that has a name
            # NOTE(review): this is an empty list when no level is named -
            # confirm how reset_index should behave in that case
            drop_levels     =   [name for name in df.index.names if name is not None]
        
    else :
        
        drop_levels     =   None
    
    # the form value is the inverse of the reset_index drop argument
    drop  =   not (fparms[2] == "True")
    
    if(opstat.get_status()) :

        try :
            
            df.reset_index(level=drop_levels,drop=drop,inplace=True)
                
            if(display) :
                #make scriptable
                add_to_script(["# reset df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import reset_df_index",
                               "reset_df_index(" + json.dumps(parms[1]) + ",False)"],opstat)
    
        except Exception as e: 
            opstat.store_exception("Unable to reset df index : ",e)

    return(opstat)
コード例 #13
0
def process_df_transform(optionid,parms,display=True) :
    """
    * -------------------------------------------------------------------------- 
    * function : run the selected dataframe transform option and display 
    *            the matching taskbar, status note or exception
    * 
    * parms :
    *   optionid  -   transform option (compared against the dtm option ids)
    *   parms     -   transform parms handed to the selected transform
    *   display   -   display flag, forwarded only to the sort by column 
    *                 and drop duplicate rows transforms
    *
    * returns : 
    *  N/A
    * --------------------------------------------------------
    """
    
    def build_names_table() :
        # column names table shared by the show/add/change options
        names_table = dcTable("Column Names ","cnamesTable",cfg.DataTransform_ID)
        names_table.set_table_column_parms({"font":12})
        names_table.set_note("None")
        return(names_table)
    
    def show_df_index() :
        # current index display followed by the remote df display
        dftw.display_current_df_index(cfg.get_current_chapter_df(cfg.DataTransform_ID),
                                      cfg.get_current_chapter_dfc_df_title(cfg.DataTransform_ID))
        dftw.display_remote_df(cfg.DataTransform_ID)
    
    opstat  =   opStatus()
    
    # show column names row
    if(optionid == dtm.PROCESS_SHOW_COLUMN_NAMES_ROW) :
        
        dftw.display_dataframe_col_names_taskbar()
        print("\n")
        
        names_table = build_names_table()
        display_column_names(cfg.get_current_chapter_df(cfg.DataTransform_ID),names_table,None)  

    # save column names row
    if(optionid == dtm.PROCESS_SAVE_COLUMN_NAMES_ROW) :
        
        opstat, saved_to    =   save_column_names_row(parms)
        
        dftw.display_dataframe_col_names_taskbar()

        if(not opstat.get_status()) :
            display_exception(opstat)
        else :
            display_status_note("Column Names Row Saved Successfully to : " + saved_to) 
            clear_dataframe_transform_cfg_values()
        
    # add column names row
    elif(optionid == dtm.PROCESS_ADD_COLUMN_NAMES_ROW) :
    
        opstat     =   add_column_names_row(parms) 
        
        dftw.display_dataframe_col_names_taskbar()
        print("\n")
        
        if(not opstat.get_status()) :
            
            display_main_option([[0,0]])
            display_exception(opstat)
            
        else :
            
            clear_dataframe_transform_cfg_values()
            display_status_note("Column Names Row Added Successfully")
            
            names_table = build_names_table()
            display_column_names(cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF),names_table,None)    
    
    # change column names
    elif(optionid == dtm.PROCESS_CHANGE_COLUMN_NAMES) :
        
        opstat = change_column_names(parms)
        
        dftw.display_dataframe_col_names_taskbar()
        print("\n")
        
        if(not opstat.get_status()) :
            
            display_exception(opstat)
            
        else :
            
            clear_dataframe_transform_cfg_values()
            display_status_note("Column Names Changed Successfully")
            
            names_table = build_names_table()
            display_column_names(cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF),names_table,None)    

    # drop column names row
    if(optionid == dtm.PROCESS_DROP_COLUMN_NAMES_ROW) :
        
        opstat      =   drop_column_names_row()
        
        dftw.display_dataframe_col_names_taskbar()
        print("\n")
            
        if(not opstat.get_status()) :
            display_exception(opstat)
        else :
            display_status_note("Column Names Row Dropped Successfully")
            clear_dataframe_transform_cfg_values()
            
    # remove whitespace from column names
    if(optionid == dtm.PROCESS_WHITESPACE_COLUMN_NAMES) :
        
        opstat      =   remwhitespace_column_names_row(parms)
        
        dftw.display_dataframe_col_names_taskbar()
        print("\n")
            
        if(not opstat.get_status()) :
            display_exception(opstat)
        else :
            display_status_note("Column Names Whitespace Removed Successfully")
            clear_dataframe_transform_cfg_values()
    
    # set df index
    elif(optionid == dtm.PROCESS_SET_DF_INDEX) :
        
        opstat = set_df_index(parms) 
        
        dftw.display_dataframe_indices_taskbar()
        print("\n")
                
        if(not opstat.get_status()) :
            display_exception(opstat)
        else :
            clear_dataframe_transform_cfg_values()
            display_status_note("df Index Set Successfully")
            
        show_df_index()
                
    # reset df index
    elif(optionid == dtm.PROCESS_RESET_DF_INDEX) :
        
        opstat = reset_df_index(parms)
        
        dftw.display_dataframe_indices_taskbar()
        print("\n")
            
        if(not opstat.get_status()) :
            display_exception(opstat)
        else :
            clear_dataframe_transform_cfg_values()
            display_status_note("df Index Reset Successfully")
            
        show_df_index()
            
    # append to df index
    elif(optionid == dtm.PROCESS_APPEND_TO_INDEX) :
        
        opstat = append_to_df_index(parms)
        
        dftw.display_dataframe_indices_taskbar()
        print("\n")
            
        if(not opstat.get_status()) :
            dftw.display_dataframe_options([[4,0]])
            display_exception(opstat)
        else :
            clear_dataframe_transform_cfg_values()
            display_status_note("df Index Appended to Successfully")
            
        show_df_index()
            
    # sort df by index
    elif(optionid == dtm.PROCESS_SORT_DF_INDEX) :
        
        opstat = sort_df_index(parms)
        
        dftw.display_dataframe_indices_taskbar()
        print("\n")
            
        if(not opstat.get_status()) :
            display_exception(opstat)
        else :
            clear_dataframe_transform_cfg_values()
            display_status_note("Dataframe Sorted by index Successfully")
        
        show_df_index()

    # sort by column
    elif(optionid == dtm.PROCESS_SORT_COLUMN) :

        opstat = process_sort_by_column(parms,display)
        
        dftw.display_dataframe_transform_main()
        print("\n")
            
        if(not opstat.get_status()) :
            display_main_option([[0,0]])
            display_exception(opstat) 
        else :
            clear_dataframe_transform_cfg_values()
            # the sort transform stores its success note in the opstat message
            display_status_note(opstat.get_errorMsg())
            
    # drop duplicate rows
    elif(optionid == dtm.PROCESS_DROP_DUPLICATE_ROWS) :
        
        df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
        
        # row counts are taken from the same df object before and after
        rows_before =   len(df)

        opstat = drop_duplicate_rows(parms,display)
        
        rows_after  =   len(df)
        
        dftw.display_dataframe_transform_main()
        print("\n")
            
        if(not opstat.get_status()) :
            display_exception(opstat) 
        else :
            clear_dataframe_transform_cfg_values()
            display_status_note(str(rows_before-rows_after) + " Duplicate Rows Dropped Successfully")
    
    # return
    elif(optionid == dtm.DF_TRANSFORM_RETURN) :
        
        dftw.display_dataframe_transform_main()
        
    # help
    elif(optionid == dtm.DF_TRANSFORM_HELP) :
        print("help")
コード例 #14
0
def add_column_names_row(parms,display=True):
    """
    * -------------------------------------------------------------------------- 
    * function : add a column names row
    * 
    * parms :
    *   parms     -   transform parms
    *   display   -   display flag; when set the operation is also 
    *                 recorded through add_to_script
    *
    * returns : 
    *  opStatus object for the operation
    *
    * notes :
    *  fparms[0] is a json file holding the column names and fparms[1] a 
    *  comma separated list of column names. The file wins when both are 
    *  given; when neither is given the status is set to failed.
    * --------------------------------------------------------
    """
    
    opstat = opStatus() 
    
    try :

        fparms      =   get_parms_for_input(parms,dftw.df_add_row_transform_input_idList)
        filename    =   fparms[0]
        collist     =   fparms[1]
            
        if(len(filename) == 0) :
            filename = "None"
                
        if(len(collist) == 0 ) :
            collist = "None"
        else :
            collist =   collist.replace("'","").split(",")
                
        if( (filename == "None") and (collist == "None") ) :
            
            opstat.set_status(False)
            opstat.set_errorMsg("No Column List or filename defined")
            return(opstat)
            
        if(not(filename == "None")) :
                
            try :
                    
                with open(filename, 'r') as colid_file :
                    colids = json.load(colid_file)
    
            except Exception as e: 
                # stop here so the file error is reported as is instead of 
                # being replaced by a follow-on error on the missing names
                opstat.store_exception("Unable to open column names file : " + filename,e)
                return(opstat)
                    
        else :
                
            colids = collist    
                
        cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns = colids
                    
        if(display) :
                        
            #make scriptable : replay with the original parms so that the 
            #call matches the (parms,display) signature
            add_to_script(["# Add Column Names Row",
                           "from dfcleanser.data_transform.data_transform_dataframe_control import add_column_names_row",
                           "add_column_names_row(" + json.dumps(parms) + ",False)"],opstat)
    
    except Exception as e: 
        opstat.store_exception("Unable to add column names",e)
    
    return(opstat)
コード例 #15
0
def process_sort_by_column(parms,display=True) :
    """
    * -------------------------------------------------------------------------- 
    * function : sort by column transform option
    * 
    * parms :
    *   parms   -   associated parms
    *   display -   display results flag; when set the operation is 
    *               recorded through add_to_script and the success note 
    *               is stored in the returned status message
    *
    * returns : 
    *  opStatus object for the operation
    *
    * notes :
    *  fparms order : column name, ascending flag ("True"/other), sort 
    *  kind, na position, reset row ids flag ("True"/other). The sort is 
    *  done in place on the current DataTransform df.
    * --------------------------------------------------------
    """
    
    opstat  =   opStatus()

    fparms      =   get_parms_for_input(parms,dftw.sort_column_input_idList)
    
    colname     =   fparms[0]
    sortorder   =   (fparms[1] == "True")
    
    # kind and na position may arrive wrapped in single quotes
    sortkind    =   fparms[2].strip("'")
    naposition  =   fparms[3].strip("'")

    resetrowids =   (fparms[4] == "True")

    if(opstat.get_status()) :

        try : 
            
            df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            df.sort_values(colname,axis=0,ascending=sortorder,inplace=True,kind=sortkind,na_position=naposition)
        
            if(resetrowids) :
                # renumber the rows directly : reset_df_index needs transform 
                # parms that are not available here
                df.reset_index(drop=True,inplace=True)
            
            if(display) :
            
                #make scriptable
                add_to_script(["# sort by column ",
                               "from dfcleanser.data_transform.data_transform_columns_control import process_sort_by_column",
                               "process_sort_by_column(" + json.dumps(parms) + ",False)"],opstat)
                
                opstat.set_errorMsg("df sorted by column '" + colname + "' successfully.")
        
        except Exception as e:
            opstat.store_exception("Sort df By Column Error : "+colname,e)
    
    cfg.drop_config_value(dftw.sort_column_input_id+"Parms")
    
    return(opstat)
コード例 #16
0
def _build_match_criteria(df, column_type, cols_lists, vals_lists, opstat):
    """
    * -------------------------------------------------------------------------- 
    * function : build the combined boolean row mask for find_matching_rows
    * 
    * parms :
    *   df          -   dataframe to search
    *   column_type -   0 : values are converted to int or float, 
    *                   otherwise values are used as strings
    *   cols_lists  -   col names
    *   vals_lists  -   per column string of comma separated values
    *   opstat      -   status object, set to failed with a message on error
    *
    * returns : 
    *  boolean Series, or None when no criteria was collected
    * --------------------------------------------------------
    """

    from dfcleanser.common.common_utils import is_int_col

    final_criteria = None

    for i, colname in enumerate(cols_lists):

        vals_list = vals_lists[i].replace("[", "").replace("]", "").split(",")

        if (column_type == 0):
            convert = int if is_int_col(df, colname) else float
        else:
            convert = str

        col_vals_list = []

        try:
            col_vals_list = [convert(val) for val in vals_list]
        except ValueError:
            opstat.set_status(False)

        # stop at the first failing column so that its message is the one
        # reported instead of being overwritten by later columns
        if (not opstat.get_status()):
            opstat.set_status(False)
            opstat.set_errorMsg("invalid column_values entered for " + colname)
            break

        if (not col_vals_list):
            opstat.set_status(False)
            opstat.set_errorMsg("no valid column_values entered for " + colname)
            break

        if (column_type == 0):

            # a row matches when its value is any of the entered values
            try:
                column_criteria = [df[colname].isin(col_vals_list)]
            except Exception:
                opstat.set_status(False)
                opstat.set_errorMsg("failed to get current criteria subset " +
                                    colname)
                break

        else:

            # a row has to contain every entered value; na=False keeps the
            # mask strictly boolean when the column holds missing values
            column_criteria = [
                df[colname].str.contains(val, na=False)
                for val in col_vals_list
            ]

        try:
            for criteria in column_criteria:
                if (final_criteria is None):
                    final_criteria = criteria
                else:
                    final_criteria = final_criteria & criteria
        except Exception:
            opstat.set_status(False)
            opstat.set_errorMsg("failed to get final criteria subset " +
                                colname)
            break

    return (final_criteria)


def find_matching_rows(df_title, column_type, cols_lists, vals_lists, opstat):
    """
    * -------------------------------------------------------------------------- 
    * function : find rows in df matching the col names and values
    * 
    * parms :
    *   df_title    -   title given to the dfc dataframe holding the matches
    *   column_type -   0 : values are converted to int or float, 
    *                   otherwise values are used as strings
    *   cols_lists  -   col names
    *   vals_lists  -   per column string of comma separated values
    *   opstat      -   status object, set to failed with a message on error
    *
    * returns : 
    *  number of matching rows
    *
    * notes :
    *  The criteria of all columns are and-ed together. When at least one 
    *  row matches, a copy of the matching rows with a fresh index is 
    *  registered as a new dfc dataframe under df_title. As before, the 
    *  criteria collected ahead of a failing column are still applied, so 
    *  callers should check opstat.
    * --------------------------------------------------------
    """

    df = cfg.get_current_chapter_df(cfg.DataInspection_ID)

    clock = RunningClock()
    clock.start()

    # always stop the clock, also when the criteria build raises
    try:
        final_criteria = _build_match_criteria(df, column_type, cols_lists,
                                               vals_lists, opstat)
    finally:
        clock.stop()

    if (final_criteria is None):
        return (0)

    num_matches = int(final_criteria.sum())

    if (num_matches > 0):

        search_df = df[final_criteria].copy()
        search_df.reset_index(drop=True, inplace=True)

        from dfcleanser.common.cfg import dfc_dataframe, add_dfc_dataframe

        search_df_notes = "search subset from " + cfg.get_config_value(
            cfg.CURRENT_INSPECTION_DF)
        new_dfcdf = dfc_dataframe(df_title, search_df, search_df_notes)
        add_dfc_dataframe(new_dfcdf)

    return (num_matches)
コード例 #17
0
def get_subset_df() :
    """
    * -------------------------------------------------------------------------- 
    * function : get the current df of the subset utility chapter
    * 
    * parms :
    *  N/A
    *
    * returns : 
    *  value of cfg.get_current_chapter_df for cfg.SWDFSubsetUtility_ID
    * --------------------------------------------------------
    """
    
    subset_df   =   cfg.get_current_chapter_df(cfg.SWDFSubsetUtility_ID)
    
    return(subset_df)