def check_dataframe_diff(df1: pd.DataFrame, df2: pd.DataFrame, logger: logging.Logger = None, max_count: int = 10, ignore_same: bool = False):
    """
    Compare two DataFrames and log every difference found in their
    shape, index, columns and cell values.

    Params::
        df1, df2: DataFrames to compare.
        logger: logger to report through; created lazily when None.
        max_count: maximum number of differing value pairs shown per column.
        ignore_same: when True, suppress the '... is same.' info messages.
    """
    if logger is None:
        # Imported here on purpose: set_logger is not always wanted at module load time.
        from kkutils.util.com import set_logger
        logger = set_logger(__name__)
    # nan == nan evaluates to False inside a Series, so fill NaNs before comparing.
    df1, df2 = df1.copy().fillna(-999), df2.copy().fillna(-999)
    logger.info("check dataframe shape.", color=["BOLD", "GREEN"])
    logger.info(f"df1 shape: {df1.shape}")
    logger.info(f"df2 shape: {df2.shape}")

    logger.info("check dataframe index.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.index, df2.index
    # BUG FIX: the original wrote `~(ndf1 == ndf2).sum() > 0`, i.e. the bitwise NOT
    # of the match count; that is negative for any count >= 0 and thus never > 0,
    # so same-shape element-wise mismatches were never reported. Parenthesize the
    # negation so mismatches are actually counted. (Short-circuit `or` keeps the
    # element-wise comparison from running on differently sized indexes.)
    if (ndf1.shape[0] != ndf2.shape[0]) or ((~(ndf1 == ndf2)).sum() > 0):
        logger.warning("index is different.")
        same_index = values_include(ndf1, ndf2)
        logger.debug(f"same index: {same_index}")
        logger.warning(f"only df1 index: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 index: {values_not_include(ndf1, ndf2)}")
    elif not ignore_same:
        logger.info("index is same.", color=["BOLD", "BLUE"])

    logger.info("check dataframe columns.", color=["BOLD", "GREEN"])
    ndf1, ndf2 = df1.columns, df2.columns
    same_columns = values_include(ndf1, ndf2)
    if (ndf1.shape[0] != ndf2.shape[0]) or ((~(ndf1 == ndf2)).sum() > 0):
        logger.warning("columns is different.")
        logger.debug(f"same columns: {same_columns}")
        # BUG FIX: these two messages said "index" (copy-paste from the block above).
        logger.warning(f"only df1 columns: {values_not_include(ndf2, ndf1)}")
        logger.warning(f"only df2 columns: {values_not_include(ndf1, ndf2)}")
    elif not ignore_same:
        logger.info("columns is same.", color=["BOLD", "BLUE"])

    logger.info("we check only same indexes and same columns", color=["BOLD", "GREEN"])
    # Restrict both frames to the shared index/columns, aligned to df1's order.
    df1 = df1.loc[df1.index.isin(df2.index), df1.columns.isin(df2.columns)]
    df2 = df2.loc[df1.index, df1.columns]

    logger.info("check whole data.", color=["BOLD", "GREEN"])
    for x in same_columns:
        sebool = (df1[x] == df2[x])
        n_diff = (~sebool).sum()
        if n_diff > 0:
            # NOTE: the original message logged the difference count twice; log it once.
            logger.warning(
                f'"{x}" is different. different count: {n_diff}. '
                f'different index: {df1.index[~sebool]}. '
                f'different values: {[(_x, _y, ) for _x, _y in zip(df1.loc[~sebool, x].iloc[:max_count].values, df2.loc[~sebool, x].iloc[:max_count].values)]}'
            )
        elif not ignore_same:
            logger.info(f'"{x}" is same.', color=["BOLD", "BLUE"])
def __init__(self, connection_string: str, max_disp_len: int=100, log_level="info", logfilepath: str=None):
    """
    Establish a connection to a PostgreSQL database.

    Params::
        connection_string:
            connection string, e.g.
            host=172.18.10.2 port=5432 dbname=boatrace user=postgres password=postgres
            When None, a dummy (connection-less) instance is created.
        max_disp_len: maximum length used when displaying SQL.
        log_level: log level passed to set_logger.
        logfilepath: optional path to a log file.
    """
    self.con = None if connection_string is None else psycopg2.connect(connection_string)
    self.max_disp_len = max_disp_len
    # insert/update/delete statements are queued here and executed in one batch later.
    self.sql_list = []
    self.logger = set_logger(_logname+".Psgre."+str(id(self.con)), log_level=log_level, internal_log=False, logfilepath=logfilepath)
    if connection_string is None:
        self.logger.info("dummy connection is established.")
    else:
        # Redact everything from "password" onwards before logging.
        # BUG FIX: str.find returns -1 when "password" is absent, and the old
        # unconditional slice then chopped the last character; log the full
        # (password-free) string in that case instead.
        idx = connection_string.find("password")
        self.logger.info(f'connection is established. {connection_string if idx < 0 else connection_string[:idx]}')
import numpy as np import pandas as pd from typing import List # local package from kkutils.lib.ml.procs import MyAsType from kkutils.util.com import check_type, is_callable, set_logger logger = set_logger(__name__) __all__ = [ "ProcRegistry", ] class ProcRegistry(object): def __init__(self, colname_explain: np.ndarray, colname_answer: np.ndarray): super().__init__() self.processing = {} self.default_proc(colname_explain, colname_answer) def default_proc(self, colname_explain: np.ndarray, colname_answer: np.ndarray): logger.info("START") check_type(colname_explain, [np.ndarray]) check_type(colname_answer, [np.ndarray]) self.processing["default_x"] = {} self.processing["default_x"]["type"] = "x" self.processing["default_x"]["cols"] = colname_explain self.processing["default_x"]["proc"] = [] self.processing["default_y"] = {}