Example #1
    def query_multiple(self, queries, reuse_odbc_connection=False, args=[], batch_size=1, continue_on_error=False):
        """
            Runs query_pandas. If query does not start with "SELECT" then query will be transformed to "SELECT * FROM {query}"
        """
        ret=[]
        errors=[]
        i=0
        for query in batch(queries, batch_size):
            sql=" UNION ALL ".join(query)
            try:
                df=None
                df=self.query_pandas(sql, reuse_odbc_connection, args=args)
                i += 1
                total_batches = -(-len(queries) // batch_size)  # ceiling division
                logger.info(f"Batches completion progress: {i}/{total_batches}  batch size: {batch_size}  {min(i * batch_size, len(queries))}/{len(queries)}")
            except Exception as e:
                logger.debug("ERROR during execution of query:"+ str(("ERROR", query, e)))
                if continue_on_error:
                    if batch_size>1:
                        df, error=self.query_multiple(query, reuse_odbc_connection=reuse_odbc_connection, args=args, batch_size=1, continue_on_error=True)
                        errors.extend(error)
                    else:

                        return (None, [("ERROR", query, e)])
                else:
                    raise
            if df is not None:
                ret.append(df)
        if continue_on_error:
            return (pandas.concat(ret), errors)
        else:
            return pandas.concat(ret)
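
A minimal usage sketch (assuming `conn` is an instance of this connector class; the table and queries are illustrative):

# hypothetical usage: run many small SELECTs in batches of 50, keeping per-query errors
queries = [f"SELECT * FROM sales WHERE region_id = {i}" for i in range(200)]
df, errors = conn.query_multiple(queries, batch_size=50, continue_on_error=True)
print(df.shape, len(errors))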
Example #2
from pyspark import SparkConf
from pyspark.sql import SparkSession


def get_spark_session(master, spark_params={}):
    app_name = ""
    conf = SparkConf() \
        .setAppName(app_name) \
        .setMaster(master)

    for (key, value) in spark_params.items():
        conf = conf.set(key, value)

    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    logger.debug("getOrCreate'd Spark session")
    return spark
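
For example, a sketch of how this helper might be called (the master URL and config values are illustrative):

# hypothetical usage: local Spark session with one extra setting
spark = get_spark_session("local[*]", {"spark.sql.shuffle.partitions": "8"})
print(spark.version)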
Example #3
    def query_pandas(self, query, reuse_odbc_connection=False, args=[]):
        """
            Reads database source to pandas dataframe.
            Depending on the configuration of the connector either odbc, jdbc via spark or jaydebeapi jdbc will be used

            :param reuse_odbc_connection: if set connection will be reused (currently works only for ODBC sources)
        """
        logger.debug(f"Executing query on connector {str(self)}:" + query +" with args:"+str(args))
        if self.supports_odbc:
             logger.debug(" using ODBC")
             conn = self.get_odbc_connection(reuse_odbc_connection)
             def pd_read_sql(conn):
                 return pandas.read_sql(query, conn, coerce_float=False, parse_dates=None, params=args)
             try:
                return pd_read_sql(conn)
             except Exception as ex:
                if self.is_connection_broken_error(ex):
                    logger.info("Connection is broken, reconnecting.")
                    conn = self.get_odbc_connection(False)
                    return pd_read_sql(conn)
                else:
                    raise ex
        else:
            if len(args) > 0:
                raise Exception("Query args not supported for spark or jaydebeapi.")
            if self.jdbc_access_method=="spark":
                logger.debug("\nWarning: conversion from spark df to pandas needed.")
                return self.query_spark(query).toPandas()
            else:
                logger.debug("\nUsing jaydebeapi jdbc access method")
                return self.read_jdbc_to_pd_df(query, self.jdbc_driver, self.get_jdbc_conn_string(),[self.username, self.get_password()])
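
A usage sketch (assuming `conn` is a configured connector; table and parameter values are illustrative):

# hypothetical usage: parameterised query into a pandas dataframe over ODBC
df = conn.query_pandas("SELECT * FROM orders WHERE status = ?", args=["OPEN"])
print(len(df))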
Example #4
    def execute_statement(self, statement, add_col_names=False, reuse_odbc_connection=False, ignore_error=False, args=[], commit=False):

        if self.supports_odbc:
            conn = self.get_odbc_connection(reuse_odbc_connection )
            cursor = conn.cursor()
            try:
                logger.debug(f"Executing statement:{statement}")
                cursor.execute(statement, *args)
            except DatabaseError as ex:
                logger.info(f"DB Exception caught:{ex}.\nReconnecting.")
                conn = self.get_odbc_connection(reuse_odbc_connection=False)
                cursor = conn.cursor()
                cursor.execute(statement, *args)
            except Exception as ex:
                if ignore_error:
                    logger.debug(f"Ignored exception when running {statement}:" + str(ex))
                else:
                    raise ex


            res = []
            rowcount = cursor.rowcount
            try:
                recs = cursor.fetchall()
                if add_col_names:
                    fields = tuple(map(lambda x: x[0], cursor.description))
                    recs.insert(0, fields)
                res.append(recs)
            except pyodbc.ProgrammingError:
                pass
            while cursor.nextset():  # advance past the result set fetched above to any additional result sets
                try:
                    recs = cursor.fetchall()
                    if add_col_names:
                        fields = tuple(map(lambda x: x[0], cursor.description))
                        recs.insert(0, fields)
                    res.append(recs)
                except pyodbc.ProgrammingError:
                    continue
            if commit:
                conn.execute("COMMIT;")
            return (rowcount, res)
        else:
             if self.jdbc_access_method=="spark":
                print("\nWarning: conversion from spark df to pandas needed.")
                return self.query_spark(statement).toPandas()
             else:
                print("\nUsing jaydebeapi jdbc access method")
                return self.read_jdbc_to_pd_df(statement, self.jdbc_driver, self.get_jdbc_conn_string(),[self.username, self.get_password()])
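
A usage sketch (assuming `conn` is a connector with ODBC support; the statement is illustrative):

# hypothetical usage: run an UPDATE with a bound parameter, commit, and inspect the row count
rowcount, resultsets = conn.execute_statement(
    "UPDATE orders SET status = 'CLOSED' WHERE status = ?", args=["OPEN"], commit=True)
print(rowcount, len(resultsets))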
Example #5
    def sort_columns(c):
        # sort key for output columns: unrecognised columns first, then the compared columns
        # (paired left/right), then the remaining original columns
        cr = c.replace(left_name, "").replace(right_name, "")

        if cr in cols_to_compare:
            ret = (cols_to_compare.index(cr) + 1) * 10 + (0 if left_name in c
                                                          else 1)
        else:
            if cr not in columns:
                ret = 0
            else:
                ret = (columns.index(cr) + 1) * 100 + (0 if left_name in c else
                                                       1)

        logger.debug("Column sorted: " + c + " -> " + str(ret))

        return ret
Example #6
def convert_pandas_df_to_spark(spark,
                               pandas_df,
                               spark_register_name=None,
                               convert_via_parquet=True,
                               persist=True):
    if convert_via_parquet:
        tmp = tempfile.mkdtemp()
        path = os.path.join(tmp, 'temp_convert_pandas_df_to_spark.parquet')
        pandas_df.to_parquet(path)
        convertedToSpark = spark.read.parquet(path)
        logger.debug("Saved temp parquet file:" + path)
    else:
        convertedToSpark = DataFrameConverter().get_spark_df(spark, pandas_df)
    if spark_register_name is not None:
        convertedToSpark.registerTempTable(spark_register_name)
    if persist:
        convertedToSpark.persist()
    return convertedToSpark
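
A usage sketch (assuming an existing SparkSession `spark`; writing the temporary parquet file requires pyarrow or fastparquet):

# hypothetical usage: convert a small pandas frame and expose it to Spark SQL
import pandas
pdf = pandas.DataFrame({"id": [1, 2], "name": ["a", "b"]})
sdf = convert_pandas_df_to_spark(spark, pdf, spark_register_name="tmp_names")
spark.sql("SELECT count(*) FROM tmp_names").show()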
Example #7
    def query_spark(self, query, register_temp_table=None):
        if self.supports_jdbc:
            logger.debug(f"Executing query (JDBC) on connector {str(self)}:" + query)
            ret = self.get_spark_connector().get_df_from_jdbc(self.get_jdbc_conn_string(),
                                                              query,
                                                              self.db_dialect.get_jdbc_driver(),
                                                              self.username,
                                                              self.get_password())
        else:
            print("Warning: conversion from pandas df to spark needed. Can be slow.")
            print(f"Executing query (ODBC) on connector {str(self)}:" + query)
            pdf = self.query_pandas(query)
            is_empty = pdf.empty
            ret = None
            if not is_empty:
                ret = self.get_spark_connector().convert_pandas_df_to_spark(pdf)
            else:
                print("No data returned.")
        if register_temp_table is not None and ret is not None:
            print("Registering temp table as:" + register_temp_table)
            ret.registerTempTable(register_temp_table)
        return ret
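
A usage sketch (assuming `conn` is a connector with a Spark connector configured; the table name is illustrative):

# hypothetical usage: load a query result as a Spark dataframe and register it for SQL
sdf = conn.query_spark("SELECT * FROM orders", register_temp_table="orders_tmp")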
Example #8
def compare_data_frames(df_left,
                        df_right,
                        keys,
                        left_name,
                        right_name,
                        cols_to_compare,
                        keys_fuzzy_match=[],
                        keys_fuzzy_match_comparers=None,
                        make_copy=True,
                        merge_validate=None):
    import pandas as pd
    from functools import reduce
    from Levenshtein import distance as levenshtein_distance
    if keys_fuzzy_match_comparers and len(keys_fuzzy_match) != len(keys_fuzzy_match_comparers):
        raise Exception(
            "keys_fuzzy_match_comparers has to be None (to use the default fuzzy field comparer) "
            "or have the same length as keys_fuzzy_match."
        )

    def default_fuzzy_comp(field_1, field_2):
        field_1_last_part = field_1.split("_")[-1]
        field_2_last_part = field_2.split("_")[-1]
        # fields match when they differ by at most one character and their last "_"-separated parts are equal
        return levenshtein_distance(field_1, field_2) <= 1 \
                and field_1_last_part == field_2_last_part

    if not keys_fuzzy_match_comparers:
        keys_fuzzy_match_comparers = [
            default_fuzzy_comp for x in keys_fuzzy_match
        ]

    if make_copy:
        df_left = df_left.copy()
        df_right = df_right.copy()
    l_name = left_name
    r_name = right_name
    left_name = ("_" if left_name != "" else "") + left_name
    right_name = ("_" if right_name != "" else "") + right_name

    columns = list(df_left.columns)

    keys_to_merge = [k for k in keys if k not in keys_fuzzy_match]
    df_left['idx'] = df_left.reset_index().index
    df_right['idx'] = df_right.reset_index().index
    df_left = df_left.rename(columns=dict([(c, c + left_name)
                                           for c in list(df_left.columns)
                                           if c not in keys_to_merge]),
                             inplace=False)
    df_right = df_right.rename(columns=dict([(c, c + right_name)
                                             for c in list(df_right.columns)
                                             if c not in keys_to_merge]),
                               inplace=False)

    compared_df = pd.merge(
        df_left,
        df_right,
        on=keys_to_merge,
        suffixes=[left_name, right_name],
        indicator="row_source",
        how="inner" if len(keys_fuzzy_match) > 0 else "outer",
        validate=merge_validate)
    keys_fuzzy_match_doubled = []
    if len(keys_fuzzy_match) > 0:
        keys_fuzzy_match_doubled = reduce(lambda x, y: x + y,
                                          [[kfm + left_name, kfm + right_name]
                                           for kfm in keys_fuzzy_match])

        def filter_rows(r):
            for idx, kfm in enumerate(keys_fuzzy_match):
                field_1 = r[kfm + left_name]
                field_2 = r[kfm + right_name]

                ret_comp = keys_fuzzy_match_comparers[idx](field_1, field_2)
                if not ret_comp:
                    return False
            return True

        filtering_array = compared_df.apply(filter_rows, axis=1)
        if len(filtering_array) > 0:
            compared_df = compared_df[filtering_array]

        # add left-only and right-only rows,
        # i.e. rows which were not matched by the fuzzy join
        df_left["row_source"] = "left_only"
        df_right["row_source"] = "right_only"
        compared_df = pd.concat([
            compared_df,
            df_left[~df_left["idx" + left_name].isin(compared_df["idx" +
                                                                 left_name])]
        ])
        compared_df = pd.concat([
            compared_df,
            df_right[~df_right["idx" +
                               right_name].isin(compared_df["idx" +
                                                            right_name])]
        ])

    compared_df['row_source'] = compared_df.row_source.astype(str)
    compared_df.loc[compared_df["row_source"] == "left_only",
                    "row_source"] = l_name + "_only"
    compared_df.loc[compared_df["row_source"] == "right_only",
                    "row_source"] = r_name + "_only"

    def sort_columns(c):
        cr = c.replace(left_name, "").replace(right_name, "")

        if cr in cols_to_compare:
            ret = (cols_to_compare.index(cr) + 1) * 10 + (0 if left_name in c
                                                          else 1)
        else:
            if cr not in columns:
                ret = 0
            else:
                ret = (columns.index(cr) + 1) * 100 + (0 if left_name in c else
                                                       1)

        logger.debug("Column sorted: " + c + " -> " + str(ret))

        return ret

    cols = ["row_source"] + keys_to_merge + keys_fuzzy_match_doubled +\
           sorted([col for col in compared_df if col not in list(set(keys_to_merge).union(keys_fuzzy_match_doubled))+ ["row_source"]], key=sort_columns)
    logger.debug("compare_data_frames: cols:" + str(cols))
    compare_sorted_df = compared_df[cols]
    with pd.option_context('mode.chained_assignment', None):
        for col_comp in cols_to_compare:
            compare_sorted_df[
                col_comp +
                "_diff"] = ~(compare_sorted_df[col_comp + left_name]
                             == compare_sorted_df[col_comp + right_name])
        compare_sorted_df["diff"] = compare_sorted_df[[
            cl + "_diff" for cl in cols_to_compare
        ]].any(axis=1)
        compare_sorted_df["diff_cnt"] = compare_sorted_df[[
            cl + "_diff" for cl in cols_to_compare
        ]].sum(axis=1)
    compare_sorted_df = compare_sorted_df.drop(
        columns=["idx" + left_name, "idx" + right_name])
    return compare_sorted_df
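
A usage sketch of the comparison helper (data and column names are illustrative; a module-level `logger` is assumed, as in the snippets above):

# hypothetical usage: compare two versions of the same table on the key "id"
import pandas as pd
old = pd.DataFrame({"id": [1, 2, 3], "amount": [10, 20, 30]})
new = pd.DataFrame({"id": [1, 2, 4], "amount": [10, 25, 40]})
diff = compare_data_frames(old, new, keys=["id"], left_name="old", right_name="new",
                           cols_to_compare=["amount"])
print(diff[["row_source", "id", "amount_old", "amount_new", "amount_diff"]])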