def query_multiple(self, queries, reuse_odbc_connection=False, args=[], batch_size=1,
                   continue_on_error=False):
    """
    Runs query_pandas over a list of queries: queries are grouped into batches of
    `batch_size`, each batch is joined with " UNION ALL " and executed as one query,
    and the resulting dataframes are concatenated.
    If query does not start with "SELECT" then query will be transformed to
    "SELECT * FROM {query}".
    When continue_on_error is set, a (dataframe, errors) tuple is returned instead
    of a single dataframe.
    """
    ret = []
    errors = []
    i = 0
    for query in batch(queries, batch_size):
        sql = " UNION ALL ".join(query)
        try:
            df = None
            df = self.query_pandas(sql, reuse_odbc_connection, args=args)
            i = i + 1
            logger.info("Batches completion progress:" + str(i) + "/" + str(len(queries) / batch_size)
                        + " batch size:" + str(batch_size)
                        + " " + str(i * batch_size) + "/" + str(len(queries)))
        except Exception as e:
            logger.debug("ERROR during execution of query:" + str(("ERROR", query, e)))
            if continue_on_error:
                if batch_size > 1:
                    # Re-run the failing batch query by query to isolate the bad statement.
                    df, error = self.query_multiple(query,
                                                    reuse_odbc_connection=reuse_odbc_connection,
                                                    args=args, batch_size=1,
                                                    continue_on_error=True)
                    errors.extend(error)
                else:
                    return (None, [("ERROR", query, e)])
            else:
                raise
        if df is not None:
            ret.append(df)
    if continue_on_error:
        return (pandas.concat(ret), errors)
    else:
        return pandas.concat(ret)
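
# Illustrative usage sketch for query_multiple (not part of the original module).
# `connector` stands for an instance of the connector class defined in this module;
# the table and column names are made up for the example.
def _example_query_multiple(connector):
    queries = [
        "SELECT id, amount FROM sales_2023 WHERE region = 'EU'",
        "SELECT id, amount FROM sales_2023 WHERE region = 'US'",
        "SELECT id, amount FROM sales_2023 WHERE region = 'APAC'",
    ]
    # Send two queries per round-trip (joined with UNION ALL) and collect errors
    # instead of aborting on the first failure.
    df, errors = connector.query_multiple(queries, batch_size=2, continue_on_error=True)
    for _, query, exc in errors:
        print("Failed query:", query, exc)
    return df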
def get_spark_session(master, spark_params={}):
    """
    Creates (or reuses) a SparkSession for the given master URL, applying the
    provided Spark configuration parameters.
    """
    app_name = ""
    conf = SparkConf() \
        .setAppName(app_name) \
        .setMaster(master)
    for (key, value) in spark_params.items():
        conf = conf.set(key, value)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    logger.debug("getOrCreate'd Spark session")
    return spark
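
# Illustrative usage sketch for get_spark_session (assumed, not from the original
# module). The master URL and Spark settings below are arbitrary example values.
def _example_get_spark_session():
    spark = get_spark_session("local[*]", spark_params={
        "spark.executor.memory": "2g",
        "spark.sql.shuffle.partitions": "8",
    })
    return spark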
def query_pandas(self, query, reuse_odbc_connection=False, args=[]):
    """
    Reads database source to pandas dataframe.
    Depending on the configuration of the connector either odbc, jdbc via spark
    or jaydebeapi jdbc will be used.

    :param reuse_odbc_connection: if set, the connection will be reused
        (currently works only for ODBC sources)
    """
    logger.debug(f"Executing query on connector {str(self)}:" + query + " with args:" + str(args))
    if self.supports_odbc:
        logger.debug(" using ODBC")
        conn = self.get_odbc_connection(reuse_odbc_connection)

        def pd_read_sql(conn):
            return pandas.read_sql(query, conn, coerce_float=False, parse_dates=None, params=args)

        try:
            return pd_read_sql(conn)
        except Exception as ex:
            if self.is_connection_broken_error(ex):
                logger.info("Connection is broken, reconnecting.")
                conn = self.get_odbc_connection(False)
                return pd_read_sql(conn)
            else:
                raise ex
    else:
        if len(args) > 0:
            raise Exception("Query args not supported for spark or jaydebeapi.")
        if self.jdbc_access_method == "spark":
            logger.debug("\nWarning: conversion from spark df to pandas needed.")
            return self.query_spark(query).toPandas()
        else:
            logger.debug("\nUsing jaydebeapi jdbc access method")
            return self.read_jdbc_to_pd_df(query, self.jdbc_driver,
                                           self.get_jdbc_conn_string(),
                                           [self.username, self.get_password()])
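
# Illustrative usage sketch for query_pandas (assumed, not from the original module).
# `connector` is a hypothetical connector instance; with an ODBC source, positional
# query parameters can be passed through `args` using "?" placeholders.
def _example_query_pandas(connector):
    df = connector.query_pandas(
        "SELECT * FROM customers WHERE country = ? AND created_at >= ?",
        reuse_odbc_connection=True,
        args=["PL", "2024-01-01"],
    )
    return df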
def execute_statement(self, statement, add_col_names=False, reuse_odbc_connection=False,
                      ignore_error=False, args=[], commit=False):
    if self.supports_odbc:
        conn = self.get_odbc_connection(reuse_odbc_connection)
        cursor = conn.cursor()
        try:
            logger.debug(f"Executing statement:{statement}")
            cursor.execute(statement, *args)
        except DatabaseError as ex:
            logger.info(f"DB Exception caught:{ex}.\nReconnecting.")
            conn = self.get_odbc_connection(reuse_odbc_connection=False)
            # Recreate the cursor on the fresh connection before retrying.
            cursor = conn.cursor()
            cursor.execute(statement, *args)
        except Exception as ex:
            if ignore_error:
                logger.debug(f"Ignored exception when running {statement}:" + str(ex))
            else:
                raise ex
        res = []
        rowcount = cursor.rowcount
        try:
            recs = cursor.fetchall()
            if add_col_names:
                fields = tuple(map(lambda x: x[0], cursor.description))
                recs.insert(0, fields)
            res.append(recs)
        except pyodbc.ProgrammingError:
            # The statement produced no resultset (e.g. plain DML/DDL).
            pass
        # Collect any additional resultsets; the first one was already fetched above.
        while cursor.nextset():
            try:
                recs = cursor.fetchall()
                if add_col_names:
                    fields = tuple(map(lambda x: x[0], cursor.description))
                    recs.insert(0, fields)
                res.append(recs)
            except pyodbc.ProgrammingError:
                continue
        if commit:
            conn.execute("COMMIT;")
        return (rowcount, res)
    else:
        if self.jdbc_access_method == "spark":
            logger.debug("\nWarning: conversion from spark df to pandas needed.")
            return self.query_spark(statement).toPandas()
        else:
            logger.debug("\nUsing jaydebeapi jdbc access method")
            return self.read_jdbc_to_pd_df(statement, self.jdbc_driver,
                                           self.get_jdbc_conn_string(),
                                           [self.username, self.get_password()])
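
# Illustrative usage sketch for execute_statement (assumed, not from the original
# module). It shows how the (rowcount, resultsets) return value can be unpacked;
# the table name is hypothetical and the example assumes an ODBC driver that
# accepts multi-statement batches.
def _example_execute_statement(connector):
    rowcount, resultsets = connector.execute_statement(
        "UPDATE staging_orders SET processed = 1 WHERE processed = 0; "
        "SELECT COUNT(*) FROM staging_orders;",
        add_col_names=True,
        commit=True,
    )
    print("Rows affected by the first statement:", rowcount)
    for recs in resultsets:
        print(recs)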
def convert_pandas_df_to_spark(spark, pandas_df, spark_register_name=None,
                               convert_via_parquet=True, persist=True):
    if convert_via_parquet:
        # Round-trip through a temporary parquet file instead of converting in memory.
        tmp = tempfile.mkdtemp()
        path = os.path.join(tmp, 'temp_convert_pandas_df_to_spark.parquet')
        pandas_df.to_parquet(path)
        convertedToSpark = spark.read.parquet(path)
        logger.debug("Saved temp parquet file:" + path)
    else:
        convertedToSpark = DataFrameConverter().get_spark_df(spark, pandas_df)
    if spark_register_name is not None:
        convertedToSpark.registerTempTable(spark_register_name)
    if persist:
        convertedToSpark.persist()
    return convertedToSpark
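
# Illustrative usage sketch for convert_pandas_df_to_spark (assumed, not from the
# original module): converts a small pandas frame via a temporary parquet file and
# registers it as a temp table for Spark SQL. `spark` is an active SparkSession.
def _example_convert_pandas_df_to_spark(spark):
    import pandas
    pdf = pandas.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
    convert_pandas_df_to_spark(spark, pdf,
                               spark_register_name="example_table",
                               convert_via_parquet=True)
    return spark.sql("SELECT COUNT(*) AS cnt FROM example_table")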
def query_spark(self, query, register_temp_table=None):
    if self.supports_jdbc:
        logger.debug(f"Executing query (JDBC) on connector {str(self)}:" + query)
        ret = self.get_spark_connector().get_df_from_jdbc(self.get_jdbc_conn_string(),
                                                          query,
                                                          self.db_dialect.get_jdbc_driver(),
                                                          self.username,
                                                          self.get_password())
    else:
        logger.debug("Warning: conversion from pandas df to spark needed. Can be slow.")
        logger.debug(f"Executing query (ODBC) on connector {str(self)}:" + query)
        pdf = self.query_pandas(query)
        is_empty = pdf.empty
        ret = None
        if not is_empty:
            ret = self.get_spark_connector().convert_pandas_df_to_spark(pdf)
        else:
            logger.debug("No data returned.")
    if register_temp_table is not None and ret is not None:
        logger.debug("Registering temp table as:" + register_temp_table)
        ret.registerTempTable(register_temp_table)
    return ret
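
# Illustrative usage sketch for query_spark (assumed, not from the original module).
# `connector` is a hypothetical connector instance, `spark` an active SparkSession;
# the table and temp table names are arbitrary.
def _example_query_spark(connector, spark):
    sdf = connector.query_spark("SELECT product_id, price FROM products",
                                register_temp_table="products_tmp")
    if sdf is not None:
        # The registered temp table can now be referenced from Spark SQL.
        return spark.sql("SELECT COUNT(*) AS cnt FROM products_tmp")
    return None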
def compare_data_frames(df_left, df_right, keys, left_name, right_name, cols_to_compare,
                        keys_fuzzy_match=[], keys_fuzzy_match_comparers=None,
                        make_copy=True, merge_validate=None):
    import pandas as pd
    from functools import reduce
    from Levenshtein import distance as levenshtein_distance

    if keys_fuzzy_match_comparers and len(keys_fuzzy_match) != len(keys_fuzzy_match_comparers):
        raise Exception(
            "keys_fuzzy_match_comparers has to be None to use the default fuzzy field comparer, "
            "or have the same length as keys_fuzzy_match.")

    def default_fuzzy_comp(field_1, field_2):
        field_1_last_part = field_1.split("_")[-1]
        field_2_last_part = field_2.split("_")[-1]
        # Fields are considered equal when they differ by at most one character
        # and their last "_"-separated parts are equal.
        return levenshtein_distance(field_1, field_2) <= 1 \
            and field_1_last_part == field_2_last_part

    if not keys_fuzzy_match_comparers:
        keys_fuzzy_match_comparers = [default_fuzzy_comp for x in keys_fuzzy_match]

    if make_copy:
        df_left = df_left.copy()
        df_right = df_right.copy()

    l_name = left_name
    r_name = right_name
    left_name = ("_" if left_name != "" else "") + left_name
    right_name = ("_" if right_name != "" else "") + right_name
    columns = list(df_left.columns)
    keys_to_merge = [k for k in keys if k not in keys_fuzzy_match]

    df_left['idx'] = df_left.reset_index().index
    df_right['idx'] = df_right.reset_index().index
    df_left = df_left.rename(columns=dict([(c, c + left_name)
                                           for c in list(df_left.columns)
                                           if c not in keys_to_merge]), inplace=False)
    df_right = df_right.rename(columns=dict([(c, c + right_name)
                                             for c in list(df_right.columns)
                                             if c not in keys_to_merge]), inplace=False)

    compared_df = pd.merge(df_left, df_right, on=keys_to_merge,
                           suffixes=[left_name, right_name],
                           indicator="row_source",
                           how="inner" if len(keys_fuzzy_match) > 0 else "outer",
                           validate=merge_validate)

    keys_fuzzy_match_doubled = []
    if len(keys_fuzzy_match) > 0:
        keys_fuzzy_match_doubled = reduce(lambda x, y: x + y,
                                          [[kfm + left_name, kfm + right_name]
                                           for kfm in keys_fuzzy_match])

        def filter_rows(r):
            for idx, kfm in enumerate(keys_fuzzy_match):
                field_1 = r[kfm + left_name]
                field_2 = r[kfm + right_name]
                ret_comp = keys_fuzzy_match_comparers[idx](field_1, field_2)
                if not ret_comp:
                    return False
            return True

        filtering_array = compared_df.apply(filter_rows, axis=1)
        if len(filtering_array) > 0:
            compared_df = compared_df[filtering_array]

    # Add the left-only and right-only rows, i.e. rows which were not matched.
    df_left["row_source"] = "left_only"
    df_right["row_source"] = "right_only"
    compared_df = pd.concat([
        compared_df,
        df_left[~df_left["idx" + left_name].isin(compared_df["idx" + left_name])]
    ])
    compared_df = pd.concat([
        compared_df,
        df_right[~df_right["idx" + right_name].isin(compared_df["idx" + right_name])]
    ])
    compared_df['row_source'] = compared_df.row_source.astype(str)
    compared_df.loc[compared_df["row_source"] == "left_only", "row_source"] = l_name + "_only"
    compared_df.loc[compared_df["row_source"] == "right_only", "row_source"] = r_name + "_only"

    def sort_columns(c):
        cr = c.replace(left_name, "").replace(right_name, "")
        if cr in cols_to_compare:
            ret = (cols_to_compare.index(cr) + 1) * 10 + (0 if left_name in c else 1)
        else:
            if cr not in columns:
                ret = 0
            else:
                ret = (columns.index(cr) + 1) * 100 + (0 if left_name in c else 1)
        logger.debug("Column sorted: " + c + " -> " + str(ret))
        return ret

    cols = ["row_source"] + keys_to_merge + keys_fuzzy_match_doubled + \
        sorted([col for col in compared_df
                if col not in list(set(keys_to_merge).union(keys_fuzzy_match_doubled)) + ["row_source"]],
               key=sort_columns)
    logger.debug("compare_data_frames: cols:" + str(cols))

    compare_sorted_df = compared_df[cols]
    with pd.option_context('mode.chained_assignment', None):
        for col_comp in cols_to_compare:
            compare_sorted_df[col_comp + "_diff"] = ~(compare_sorted_df[col_comp + left_name]
                                                      == compare_sorted_df[col_comp + right_name])
        compare_sorted_df["diff"] = compare_sorted_df[[cl + "_diff" for cl in cols_to_compare]].any(axis=1)
        compare_sorted_df["diff_cnt"] = compare_sorted_df[[cl + "_diff" for cl in cols_to_compare]].sum(axis=1)
    compare_sorted_df = compare_sorted_df.drop(columns=["idx" + left_name, "idx" + right_name])
    return compare_sorted_df
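
# Illustrative usage sketch for compare_data_frames (assumed, not from the original
# module; requires the Levenshtein package used by the function). Two small frames
# are compared on the exact key `id` and the fuzzy key `code`; all values are made
# up for the example.
def _example_compare_data_frames():
    import pandas as pd
    old = pd.DataFrame({"id": [1, 2, 3],
                        "code": ["AB_X", "CD_Y", "EF_Z"],
                        "amount": [10.0, 20.0, 30.0]})
    new = pd.DataFrame({"id": [1, 2, 4],
                        "code": ["AB_X", "CE_Y", "GH_Z"],
                        "amount": [10.0, 25.0, 40.0]})
    result = compare_data_frames(old, new,
                                 keys=["id", "code"],
                                 left_name="old", right_name="new",
                                 cols_to_compare=["amount"],
                                 keys_fuzzy_match=["code"])
    # `row_source` tells whether a row matched or exists only in one frame;
    # `amount_diff`, `diff` and `diff_cnt` summarise the per-column differences.
    return result[["row_source", "id", "code_old", "code_new",
                   "amount_old", "amount_new", "amount_diff", "diff", "diff_cnt"]]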