def test_nanvl(data_gen):
    s1 = gen_scalar(data_gen, force_no_nulls=True)
    data_type = data_gen.data_type
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).select(
            f.nanvl(f.col('a'), f.col('b')),
            f.nanvl(f.col('a'), s1.cast(data_type)),
            f.nanvl(f.lit(None).cast(data_type), f.col('b')),
            f.nanvl(f.lit(float('nan')).cast(data_type), f.col('b'))))
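# Context for the test above: F.nanvl(c1, c2) returns c1 unless c1 is NaN, in
# which case it returns c2. It is defined for floating-point columns and,
# unlike coalesce, leaves NULLs untouched. A minimal standalone sketch with
# illustrative data:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, 9.0), (float("nan"), 9.0), (None, 9.0)], ["a", "b"])

df.select(F.nanvl("a", "b").alias("r")).show()
# +----+
# |   r|
# +----+
# | 1.0|   'a' is not NaN, so it is kept
# | 9.0|   NaN replaced by 'b'
# |null|   NULL passes through untouched
# +----+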
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special handle floating point types because Spark's count treats nan as a valid value,
    # whereas pandas count doesn't include nan.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
def nan_to_null(self, index_ops: IndexOpsLike) -> IndexOpsLike:
    # Special handle floating point types because Spark's count treats nan as a valid value,
    # whereas pandas count doesn't include nan.
    return index_ops._with_new_scol(
        F.nanvl(index_ops.spark.column, SF.lit(None)),
        field=index_ops._internal.data_fields[0].copy(nullable=True),
    )
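# The comment above captures the gap these helpers bridge: Spark's count treats
# NaN as a value, pandas' count does not. A minimal sketch of the NaN-to-NULL
# trick (assumes a live SparkSession named `spark`):
import pandas as pd
from pyspark.sql import functions as F

pdf = pd.DataFrame({"x": [1.0, float("nan"), 3.0]})
sdf = spark.createDataFrame([(1.0,), (float("nan"),), (3.0,)], ["x"])

print(pdf["x"].count())                  # 2 -- pandas skips NaN
sdf.select(F.count("x")).show()          # 3 -- Spark counts the NaN row
sdf.select(
    F.count(F.nanvl("x", F.lit(None)))   # 2 -- NaN -> NULL, count skips NULLs
).show()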
def _reduce_for_stat_function(self, sfun, only_numeric):
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf
    data_columns = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            # types. Converting the NaNs is used in a few places, it should be in utils.
            # Special handle floating point types because Spark's count treats nan as a valid
            # value, whereas pandas count doesn't include nan.
            if isinstance(spark_type, (DoubleType, FloatType)):
                stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                data_columns.append(ks.name)
            elif isinstance(spark_type, NumericType) or not only_numeric:
                stat_exprs.append(sfun(ks._scol).alias(ks.name))
                data_columns.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        sdf = sdf.select(*groupkey_cols).distinct()
    sdf = sdf.sort(*groupkey_cols)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=data_columns,
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    return DataFrame(internal)
def _reduce_for_stat_function(self, sfun, only_numeric):
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf
    data_columns = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            # types. Converting the NaNs is used in a few places, it should be in utils.
            # Special handle floating point types because Spark's count treats nan as a valid
            # value, whereas pandas count doesn't include nan.
            if isinstance(spark_type, (DoubleType, FloatType)):
                stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                data_columns.append(ks.name)
            elif isinstance(spark_type, NumericType) or not only_numeric:
                stat_exprs.append(sfun(ks._scol).alias(ks.name))
                data_columns.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        sdf = sdf.select(*groupkey_cols).distinct()
    sdf = sdf.sort(*groupkey_cols)
    metadata = Metadata(data_columns=data_columns,
                        index_map=[('__index_level_{}__'.format(i), s.name)
                                   for i, s in enumerate(groupkeys)])
    return DataFrame(sdf, metadata)
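# The same NaN-to-NULL conversion applied inside a grouped aggregation, as in
# the two revisions above. A minimal sketch (assumes a live SparkSession named
# `spark`):
from pyspark.sql import functions as F

sdf = spark.createDataFrame(
    [("a", 1.0), ("a", float("nan")), ("b", 2.0)], ["k", "v"])

sdf.groupby("k").agg(
    F.count("v").alias("spark_count"),                       # NaN counted
    F.count(F.nanvl("v", F.lit(None))).alias("pandas_like")  # NaN skipped
).sort("k").show()
# k=a: spark_count=2, pandas_like=1
# k=b: spark_count=1, pandas_like=1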
def fillspark(hist, df):
    import pyspark.sql.functions as fcns

    indexes = []
    for axis in hist._group + hist._fixed:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))

        if isinstance(axis, histbook.axis.groupby):
            indexes.append(exprcol)

        elif isinstance(axis, histbook.axis.groupbin):
            scaled = (exprcol - float(axis.origin)) * (1.0 / float(axis.binwidth))
            if axis.closedlow:
                discretized = fcns.floor(scaled)
            else:
                discretized = fcns.ceil(scaled) - 1
            indexes.append(fcns.nanvl(
                discretized * float(axis.binwidth) + float(axis.origin),
                fcns.lit("NaN")))

        elif isinstance(axis, histbook.axis.bin):
            scaled = (exprcol - float(axis.low)) * (int(axis.numbins) / (float(axis.high) - float(axis.low)))
            if axis.closedlow:
                discretized = fcns.floor(scaled) + 1
            else:
                discretized = fcns.ceil(scaled)
            indexes.append(fcns.when(
                fcns.isnull(exprcol) | fcns.isnan(exprcol),
                int(axis.numbins) + 2).otherwise(
                    fcns.greatest(fcns.lit(0),
                                  fcns.least(fcns.lit(int(axis.numbins) + 1),
                                             discretized))))

        elif isinstance(axis, histbook.axis.intbin):
            indexes.append(fcns.greatest(
                fcns.lit(0),
                fcns.least(fcns.lit(int(axis.max) - int(axis.min) + 1),
                           fcns.round(exprcol - int(axis.min) + 1))))

        elif isinstance(axis, histbook.axis.split):
            def build(x, i):
                if i < len(axis.edges):
                    if axis.closedlow:
                        return build(x.when(exprcol < float(axis.edges[i]), i), i + 1)
                    else:
                        return build(x.when(exprcol <= float(axis.edges[i]), i), i + 1)
                else:
                    return x.otherwise(i)
            indexes.append(build(fcns.when(
                fcns.isnull(exprcol) | fcns.isnan(exprcol),
                len(axis.edges) + 1), 0))

        elif isinstance(axis, histbook.axis.cut):
            indexes.append(fcns.when(exprcol, 0).otherwise(1))

        else:
            raise AssertionError(axis)

    aliasnum = [-1]

    def alias(x):
        aliasnum[0] += 1
        return x.alias("@" + str(aliasnum[0]))

    index = alias(fcns.struct(*indexes))
    selectcols = [index]

    if hist._weightoriginal is not None:
        weightcol = tocolumns(df, histbook.instr.totree(hist._weightparsed))

    for axis in hist._profile:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))
        if hist._weightoriginal is None:
            selectcols.append(alias(exprcol))
            selectcols.append(alias(exprcol * exprcol))
        else:
            selectcols.append(alias(exprcol * weightcol))
            selectcols.append(alias(exprcol * exprcol * weightcol))

    if hist._weightoriginal is None:
        df2 = df.select(*selectcols)
    else:
        selectcols.append(alias(weightcol))
        selectcols.append(alias(weightcol * weightcol))
        df2 = df.select(*selectcols)

    aggs = [fcns.sum(df2[n]) for n in df2.columns[1:]]
    if hist._weightoriginal is None:
        aggs.append(fcns.count(df2[df2.columns[0]]))

    def getornew(content, key, nextaxis):
        if key in content:
            return content[key]
        elif isinstance(nextaxis, histbook.axis.GroupAxis):
            return {}
        else:
            return numpy.zeros(hist._shape, dtype=histbook.hist.COUNTTYPE)

    def recurse(index, columns, axis, content):
        if len(axis) == 0:
            content += columns
        elif isinstance(axis[0], (histbook.axis.groupby, histbook.axis.groupbin)):
            content[index[0]] = recurse(
                index[1:], columns, axis[1:],
                getornew(content, index[0], axis[1] if len(axis) > 1 else None))
            if isinstance(axis[0], histbook.axis.groupbin) and None in content:
                content["NaN"] = content[None]
                del content[None]
        elif isinstance(axis[0], (histbook.axis.bin, histbook.axis.intbin, histbook.axis.split)):
            i = index[0] - (1 if not axis[0].underflow else 0)
            if int(i) < axis[0].totbins:
                recurse(index[1:], columns, axis[1:], content[int(i)])
        elif isinstance(axis[0], histbook.axis.cut):
            recurse(index[1:], columns, axis[1:], content[0 if index[0] else 1])
        else:
            raise AssertionError(axis[0])
        return content

    query = df2.groupBy(df2[df2.columns[0]]).agg(*aggs)

    def wait():
        for row in query.collect():
            recurse(row[0], row[1:], hist._group + hist._fixed, hist._content)
    return wait
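# The bin branch above routes underflow to slot 0, overflow to numbins + 1,
# and NaN/NULL to numbins + 2 before clamping. A minimal standalone sketch of
# that index expression, with illustrative axis values (assumes a live
# SparkSession named `spark`):
from pyspark.sql import functions as F

numbins, low, high = 4, 0.0, 2.0
df = spark.createDataFrame(
    [(-1.0,), (0.3,), (1.9,), (5.0,), (float("nan"),)], ["x"])

scaled = (F.col("x") - low) * (numbins / (high - low))
bin_idx = (
    F.when(F.isnull("x") | F.isnan("x"), numbins + 2)  # NaN/NULL slot
    .otherwise(F.greatest(F.lit(0),                    # clamp underflow to 0
               F.least(F.lit(numbins + 1),             # clamp overflow
                       F.floor(scaled) + 1))))         # closed-low binning

df.select("x", bin_idx.alias("bin")).show()
# -1.0 -> 0 (underflow), 0.3 -> 1, 1.9 -> 4, 5.0 -> 5 (overflow), NaN -> 6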
def _fit(self, ratings_df):
    '''
    Fit ALS model using reviews as training data.

    Parameters
    ==========
    ratings_df (pyspark.sql.DataFrame)
        Data used to train recommender model. Columns are 'user', 'item',
        and 'rating'. Values of user and item must be numeric. Values of
        rating range from 1 to 5.

    Returns
    =======
    RecommenderModel
    '''
    # avg_rating_df = (
    #     ratings_df
    #     .groupBy()
    #     .avg(self.getRatingCol())
    #     .withColumnRenamed('avg({})'.format(self.getRatingCol()),
    #                        'avg_rating')
    # )
    # print('Fit starting!')
    start_time = time.monotonic()

    # print('ratings_df')
    # ratings_df.show()
    rating_stats_df = (
        ratings_df
        .agg(
            F.avg(self.getRatingCol()).alias('avg_rating'),
            F.stddev_samp(self.getRatingCol()).alias('stddev_rating')
        )
    )
    # print('ratings_stats_df:')
    # rating_stats_df.show()

    # if not self.getUseALS():
    #     self.setLambda_1(0.0)
    #     self.setLambda_2(0.0)

    item_bias_df = (
        ratings_df
        .crossJoin(rating_stats_df)
        .withColumn(
            'diffs_item_rating',
            F.col(self.getRatingCol()) - F.col('avg_rating')
        )
        .groupBy(self.getItemCol())
        .agg(
            F.avg(F.col('diffs_item_rating')).alias('avg_diffs_item_rating'),
            # stddev_samp yields NaN for single-row groups; substitute a large
            # sentinel so the bias term shrinks toward zero instead of NaN.
            F.nanvl(
                F.stddev_samp(F.col('diffs_item_rating')),
                F.lit(2.147483647E9)
            ).alias('stddev_diffs_item_rating'),
            F.count("*").alias('count_item_rating')
        )
        .withColumn(
            'stderr_diffs_item_rating',
            (self.getLambda_1() + F.col('stddev_diffs_item_rating'))
            / F.sqrt('count_item_rating')
        )
        .withColumn(
            'item_bias',
            F.col('avg_diffs_item_rating')
            / (1 + F.col('stderr_diffs_item_rating'))
        )
        .select(
            self.getItemCol(),
            'item_bias',
            'avg_diffs_item_rating',
            'stderr_diffs_item_rating',
            'stddev_diffs_item_rating',
            'count_item_rating'
        )
    )
    # print('item_bias_df:')
    # item_bias_df.show(5)
    # item_bias_df.printSchema()
    # print('item_bias_df NaN')
    # item_bias_df.where(F.isnan("item_bias")).show()

    user_bias_df = (
        ratings_df
        .crossJoin(rating_stats_df)
        .join(item_bias_df, on=self.getItemCol())
        .withColumn(
            'diffs_user_rating',
            F.col(self.getRatingCol()) - F.col('avg_rating') - F.col('item_bias')
        )
        .groupBy(self.getUserCol())
        .agg(
            F.avg(F.col('diffs_user_rating')).alias('avg_diffs_user_rating'),
            F.nanvl(
                F.stddev_samp(F.col('diffs_user_rating')),
                F.lit(2.147483647E9)
            ).alias('stddev_diffs_user_rating'),
            F.count("*").alias('count_user_rating')
        )
        .withColumn(
            'stderr_diffs_user_rating',
            (self.getLambda_2() + F.col('stddev_diffs_user_rating'))
            / F.sqrt('count_user_rating')
        )
        .withColumn(
            'user_bias',
            F.col('avg_diffs_user_rating')
            / (1 + F.col('stderr_diffs_user_rating'))
        )
        .select(
            self.getUserCol(),
            'user_bias',
            'avg_diffs_user_rating',
            'stderr_diffs_user_rating',
            'stddev_diffs_user_rating',
            'count_user_rating'
        )
    )
    # print('user_bias_df:')
    # user_bias_df.show(5)
    # print('user_bias_df NaN')
    # user_bias_df.where(F.isnan("user_bias")).show()

    if self.getUseALS():
        if self.getUseBias():
            residual_df = (
                ratings_df
                .crossJoin(rating_stats_df)
                .join(user_bias_df, on=self.getUserCol())
                .join(item_bias_df, on=self.getItemCol())
                .withColumn(
                    self.getRatingCol(),
                    F.col(self.getRatingCol()) - F.col('avg_rating')
                    - F.col('user_bias') - F.col('item_bias')
                )
                .select(
                    self.getUserCol(),
                    self.getItemCol(),
                    self.getRatingCol()
                )
            )
        else:
            residual_df = ratings_df
            # self.setColdStartStrategy('drop')

        residual_stats_df = (
            residual_df
            .agg(
                F.avg(F.col(self.getRatingCol())).alias('avg_residual'),
                F.stddev(F.col(self.getRatingCol())).alias('stddev_residual')
            )
        )
        # print('residual_df')
        # residual_df.show()
        # print('residual_df NaN')
        # residual_df.where(F.isnan("rating")).show()
        # print('residual_stats_df')
        # residual_stats_df.show()

        als_model = ALS(
            rank=self.getRank(),
            maxIter=self.getMaxIter(),
            regParam=self.getRegParam(),
            numUserBlocks=self.getNumUserBlocks(),
            numItemBlocks=self.getNumItemBlocks(),
            implicitPrefs=self.getImplicitPrefs(),
            alpha=self.getAlpha(),
            userCol=self.getUserCol(),
            itemCol=self.getItemCol(),
            ratingCol=self.getRatingCol(),
            nonnegative=self.getNonnegative(),
            checkpointInterval=self.getCheckpointInterval(),
            intermediateStorageLevel=self.getIntermediateStorageLevel(),
            finalStorageLevel=self.getFinalStorageLevel()
        )

        recommender = als_model.fit(residual_df)
    else:
        recommender = None
        residual_stats_df = None

    print('Fit done in {} seconds'.format(time.monotonic() - start_time))

    return RecommenderModel(
        self.getUseALS(),
        self.getUseBias(),
        self.getLambda_3(),
        # self.getColdStartStrategy(),
        recommender,
        rating_stats_df,
        residual_stats_df,
        user_bias_df,
        item_bias_df
    )
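# Why nanvl guards stddev_samp above: the sample standard deviation of a
# single-row group is undefined and comes back NaN, which would otherwise
# propagate through the bias arithmetic. The large sentinel makes the
# shrinkage denominator huge, pushing that bias toward zero. A minimal sketch
# (assumes a live SparkSession named `spark`):
from pyspark.sql import functions as F

df = spark.createDataFrame([("a", 1.0), ("b", 2.0), ("b", 4.0)], ["k", "v"])

df.groupBy("k").agg(
    F.stddev_samp("v").alias("raw"),
    F.nanvl(F.stddev_samp("v"), F.lit(2.147483647E9)).alias("guarded")
).sort("k").show()
# k=a: raw=NaN (single row), guarded=2.147483647E9
# k=b: raw=1.414...,         guarded=1.414...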
def main(spark):
    # sql declaration
    bsad = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsad"
    bsid = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsid"
    bseg = "select * from 200836_az_fi1_1051041.bseg_nw"
    knb1 = "select bukrs, akont, kunnr from 200836_az_fi1_1051041.knb1 " \
           "WHERE knb1.AKONT is NOT NULL and knb1.akont<>''"
    tka02 = "select bukrs, kokrs from 200836_az_fi1_1051041.tka02 " \
            "where tka02.GSBER is null or trim(tka02.GSBER)=''"
    lgl_ent_ldgr = "select lgl_ent_ldgr_id, lgl_ent_ldgr_cd from radar.lgl_ent_ldgr where src_sys_cd='1051041'"
    fscl_prd = "select fscl_prd_id, fscl_prd_yr_nr, fscl_prd_sqn_nr, fscl_prd_yr_vrnt_txt from radar.fscl_prd" \
               " where src_sys_cd='1051041' and fscl_prd_typ_cd='Month'"
    vbrk = "select vbeln, fkart from 200836_az_fi1_1051041.vbrk"
    t001 = "select * from 200836_az_fi1_1051041.t001"

    # create dataframes
    df_bsad = spark.sql(bsad).persist(StorageLevel(True, True, False, False, 1))
    df_bsid = spark.sql(bsid).persist(StorageLevel(True, True, False, False, 1))
    df_bseg = spark.sql(bseg)
    df_knb1 = spark.sql(knb1)
    df_tka02 = spark.sql(tka02)
    df_lgl_ent_ldgr = spark.sql(lgl_ent_ldgr)
    df_fscl_prd = spark.sql(fscl_prd)
    df_vbrk = spark.sql(vbrk)
    df_t001 = spark.sql(t001)

    # temporary table
    df_bsad_bsid = df_bsad.select(df_bsad.bukrs, df_bsad.belnr, df_bsad.gjahr, df_bsad.buzei) \
        .unionAll(df_bsid.select(df_bsid.bukrs, df_bsid.belnr, df_bsid.gjahr, df_bsid.buzei))

    df_stg_bsad_bsid_bseg = df_bsad_bsid.join(
        df_bseg,
        [df_bsad_bsid.bukrs == df_bseg.bukrs,
         df_bsad_bsid.belnr == df_bseg.belnr,
         df_bsad_bsid.gjahr == df_bseg.gjahr,
         df_bsad_bsid.buzei == df_bseg.buzei],
        'inner') \
        .select(df_bseg.menge, df_bseg.bukrs, df_bseg.belnr, df_bseg.gjahr, df_bseg.buzei)

    sql_rate = "select from_curr, to_curr, budat, curr_from_dt, CURR_DRVD_EXCH_RATE from ( " \
               "select stg.from_curr, stg.to_curr, budat, stg.curr_from_dt, stg.CURR_DRVD_EXCH_RATE, " \
               "row_number() over(partition by stg.from_curr,stg.to_curr,budat order by stg.curr_from_dt desc) as rn " \
               "from ( select distinct bsad.budat from ( select budat from 200836_az_fi1_1051041.bsad " \
               "union all select budat from 200836_az_fi1_1051041.bsid )bsad )src " \
               "inner join 200836_az_fi1_1051041.stg_tcurf_tcurr stg on budat>=stg.curr_from_dt " \
               ")src1 where rn=1"
    df_rate = spark.sql(sql_rate).persist(StorageLevel(True, True, False, False, 1))

    # joins
    df_master = df_bsad.select("bukrs", "belnr", "gjahr", "buzei", "blart", "rebzg", "cpudt", "budat",
                               "shkzg", "sgtxt", "kunnr", "prctr", "kostl", "waers", "dmbtr",
                               "monat", "wrbtr", "dmbe2") \
        .unionAll(df_bsid.select("bukrs", "belnr", "gjahr", "buzei", "blart", "rebzg", "cpudt", "budat",
                                 "shkzg", "sgtxt", "kunnr", "prctr", "kostl", "waers", "dmbtr",
                                 "monat", "wrbtr", "dmbe2"))

    # alias the two uses of df_rate so the group-currency and local-currency
    # rate joins can be referenced separately as stg1 and stg2
    df_rate_grp = df_rate.alias("stg1")
    df_rate_lcl = df_rate.alias("stg2")

    joined_df = df_master.join(df_t001, df_master.bukrs == df_t001.bukrs, 'inner') \
        .join(df_vbrk, df_vbrk.vbeln == df_master.rebzg, 'left_outer') \
        .join(df_knb1, [df_master.bukrs == df_knb1.bukrs, df_master.kunnr == df_knb1.kunnr], 'left_outer') \
        .join(df_fscl_prd, [df_master.gjahr == df_fscl_prd.fscl_prd_yr_nr,
                            df_master.monat == df_fscl_prd.fscl_prd_sqn_nr,
                            df_fscl_prd.fscl_prd_yr_vrnt_txt == df_t001.periv], 'left_outer') \
        .join(df_lgl_ent_ldgr, df_lgl_ent_ldgr.lgl_ent_ldgr_cd == df_t001.ktopl, 'left_outer') \
        .join(df_tka02, df_master.bukrs == df_tka02.bukrs, 'left_outer') \
        .join(df_stg_bsad_bsid_bseg, [df_stg_bsad_bsid_bseg.bukrs == df_master.bukrs,
                                      df_stg_bsad_bsid_bseg.belnr == df_master.belnr,
                                      df_stg_bsad_bsid_bseg.gjahr == df_master.gjahr,
                                      df_stg_bsad_bsid_bseg.buzei == df_master.buzei], 'left_outer') \
        .join(df_rate_grp, [f.col("stg1.from_curr") == df_master.waers,
                            f.col("stg1.to_curr") == f.lit('USD'),
                            f.col("stg1.budat") == df_master.budat], 'left_outer') \
        .join(df_rate_lcl, [f.col("stg2.from_curr") == df_master.waers,
                            f.col("stg2.to_curr") == df_t001.waers,
                            f.col("stg2.budat") == df_master.budat], 'left_outer') \
        .filter("blart not in ('CT', 'CZ', 'DZ', 'FZ', 'PA', 'ZP', 'ZU')")

    # please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
    df_sb_ldgr_actv = joined_df.select(
        f.lit("1051041").alias("p_src_sys_cd"),
        f.concat(df_master.bukrs, df_master.belnr,
                 df_master.gjahr.cast("string"),
                 df_master.buzei.cast("string")).alias("sb_ldgr_actv_dcmt_nr"),
        f.lit("AR").alias("sb_ldgr_acct_typ_cd"),
        df_master.blart.alias("sb_ldgr_actv_typ_cd"),
        df_master.rebzg.alias("sb_ldgr_actv_inv_dcmt_nr"),
        f.when(df_master.rebzg != '', df_vbrk.fkart).alias("sb_ldgr_actv_inv_dcmt_typ_dn"),
        f.lit(None).alias("sb_ldgr_actv_csh_dcmt_nr"),
        f.lit(None).alias("sb_ldgr_actv_csh_dcmt_typ_dn"),
        f.concat(df_knb1.bukrs, f.lit('---'), df_knb1.akont,
                 f.lit('---')).alias("sb_ldgr_actv_gnrl_ldgr_acct_cd"),
        df_master.cpudt.alias("sb_ldgr_actv_crt_ts"),
        df_master.cpudt.alias("sb_ldgr_actv_etry_ts"),
        df_master.budat.alias("sb_ldgr_actv_to_be_pstg_ts"),
        df_master.shkzg.alias("acct_trsn_itm_etry_typ_cd"),
        df_master.sgtxt.alias("sb_ldgr_actv_cmt"),
        f.concat(df_master.bukrs, df_master.belnr,
                 df_master.gjahr.cast("string")).alias("sb_ldgr_actv_trsn_btch_id"),
        df_t001.land1.alias("ctry_cd"),
        df_master.bukrs.alias("entrs_lgl_ent_nr"),
        df_lgl_ent_ldgr.lgl_ent_ldgr_id.alias("lgl_ent_ldgr_id"),
        df_master.kunnr.alias("sb_ldgr_acct_id"),
        df_fscl_prd.fscl_prd_id.alias("fscl_prd_id"),
        f.when((f.nanvl(df_tka02.kokrs, f.lit("")) == "")
               | (f.nanvl(df_master.prctr, f.lit("")) == ""),
               f.lit(None)).otherwise(
            f.concat(df_tka02.kokrs, df_master.prctr)).alias("pft_cntr_cd"),
        f.when((f.nanvl(df_tka02.kokrs, f.lit("")) == "")
               | (f.nanvl(df_master.kostl, f.lit("")) == ""),
               f.lit(None)).otherwise(
            f.concat(df_tka02.kokrs, df_master.kostl)).alias("pstg_obj_nn_cntrct_id"),
        f.lit(None).alias("pstg_obj_cntrct_id"),
        f.lit(None).alias("prt_id"),
        f.lit(None).alias("prod_id"),
        df_stg_bsad_bsid_bseg.menge.alias("sb_ldgr_actv_qty"),
        f.lit('M').alias("curr_exch_rate_typ_cd"),
        f.lit(None).alias("sb_ldgr_actv_lcl_to_grp_curr_exch_rate"),
        f.when(df_master.waers == 'USD', f.lit(1.0)).otherwise(
            f.col("stg1.CURR_DRVD_EXCH_RATE")).alias("sb_ldgr_actv_trsn_to_grp_curr_exch_rate"),
        f.when(df_master.waers == 'USD', f.lit(1.0)).otherwise(
            f.col("stg2.CURR_DRVD_EXCH_RATE")).alias("sb_ldgr_actv_trsn_to_lcl_curr_exch_rate"),
        df_t001.waers.alias("sb_ldgr_actv_lcl_curr_cd"),
        df_master.waers.alias("sb_ldgr_actv_trsn_curr_cd"),
        df_master.dmbtr.alias("sb_ldgr_actv_lcl_curr_amt"),
        df_master.wrbtr.alias("sb_ldgr_actv_trsn_curr_amt"),
        df_master.dmbe2.alias("sb_ldgr_actv_grp_curr_amt"),
        f.current_timestamp(),
        f.lit('ic_fi1_1051041_sb_ldgr_actv.hql'),
        f.lit('N').alias("radar_dlt_ind"),
        f.lit(None).alias("sb_ldgr_actv_gnrl_ldgr_acct_typ_dn"),
        f.lit("COMPANY").alias("entrs_lgl_ent_bsn_rol_cd"),
        f.lit("1051041").alias("src_sys_cd"))

    # insert
    df_sb_ldgr_actv.write.mode("overwrite").insertInto("radar.sb_ldgr_actv", overwrite=True)

    # test data
    spark.sql("select * from radar.sb_ldgr_actv where src_sys_cd=1051041 and "
              "sb_ldgr_acct_typ_cd='AR'").take(10)
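# A caveat on the pattern above: nanvl handles floating-point NaN, not NULL,
# and Spark implicitly casts string inputs to double, so
# f.nanvl(df_tka02.kokrs, f.lit("")) on a string column is unlikely to
# blank-fill NULLs the way coalesce would. A minimal contrast (assumes a live
# SparkSession named `spark`):
from pyspark.sql import functions as F

df = spark.createDataFrame([(None, float("nan")), ("X", 1.0)], ["s", "d"])

df.select(
    F.coalesce("s", F.lit("?")).alias("null_filled"),  # NULL -> '?'
    F.nanvl("d", F.lit(0.0)).alias("nan_filled"),      # NaN  -> 0.0
).show()
# null_filled: '?' then 'X'
# nan_filled : 0.0 then 1.0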
def main(spark):
    # sql declaration
    acct_trsn = "select acct_trsn_id,curr_exch_rate_typ_cd,acct_trsn_typ_cd,acct_trsn_bckpst_ind," \
                "rvrsl_rsn_cd from radar.acct_trsn where src_sys_cd='1051041'"
    acct_trsn_itm = "select acct_trsn_id, acct_trsn_itm_etry_typ_cd from radar.acct_trsn_itm where src_sys_cd='1051041'"
    rvrsl_rsn = "select rvrsl_rsn_cd,rvrsl_rsn_nm,rvrsl_rsn_dn from radar.rvrsl_rsn where src_sys_cd='1051041'"
    curr_exch_rate_typ = "select curr_exch_rate_typ_cd,curr_exch_rate_typ_nm,curr_exch_rate_typ_dn" \
                         " from radar.curr_exch_rate_typ where src_sys_cd='1051041'"
    acct_trsn_typ = "select acct_trsn_typ_cd,acct_trsn_typ_nm,acct_trsn_typ_dn" \
                    " from radar.acct_trsn_typ where src_sys_cd='1051041'"
    acct_trsn_itm_etry_typ = "select acct_trsn_itm_etry_typ_cd,acct_trsn_itm_etry_typ_nm,acct_trsn_itm_etry_typ_dn" \
                             " from radar.acct_trsn_itm_etry_typ where src_sys_cd='1051041'"

    # create dataframes
    df_at = spark.sql(acct_trsn)
    df_ati = spark.sql(acct_trsn_itm)
    df_rr = spark.sql(rvrsl_rsn)
    df_cert = spark.sql(curr_exch_rate_typ)
    df_att = spark.sql(acct_trsn_typ)
    df_atiet = spark.sql(acct_trsn_itm_etry_typ)
    df_ss = spark.sql("select * from radar.src_sys")

    # join conditions
    cond_rr = [df_at.rvrsl_rsn_cd == df_rr.rvrsl_rsn_cd]
    cond_cert = [df_at.curr_exch_rate_typ_cd == df_cert.curr_exch_rate_typ_cd]
    cond_att = [df_at.acct_trsn_typ_cd == df_att.acct_trsn_typ_cd]
    cond_atiet = [df_ati.acct_trsn_itm_etry_typ_cd == df_atiet.acct_trsn_itm_etry_typ_cd]

    # joins
    # at inner join ati
    df_master = df_at.join(df_ati, df_at.acct_trsn_id == df_ati.acct_trsn_id, "inner")

    joined_df = df_master.join(df_rr, cond_rr, 'left_outer') \
        .join(df_cert, cond_cert, 'left_outer') \
        .join(df_att, cond_att, 'left_outer') \
        .join(df_atiet, cond_atiet, 'left_outer') \
        .join(df_ss, df_ss.src_sys_cd == "1051041", 'left_outer')

    spark.sql(
        "create temporary function gnrl_ldgr_acct_trsn_atr_dmnsn_sequence as "
        "'org.apache.hadoop.hive.contrib.udf.UDFRowSequence'"
    )

    # please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
    df_gnrl_ldgr_acct_trsn_atr_dmnsn = joined_df.select(
        f.nanvl(df_at.curr_exch_rate_typ_cd, f.lit("?")).alias("curr_exch_rate_typ_cd"),
        f.coalesce(df_cert.curr_exch_rate_typ_nm, df_cert.curr_exch_rate_typ_dn,
                   f.lit("NO VALUE")).alias("curr_exch_rate_typ_nm"),
        f.coalesce(df_cert.curr_exch_rate_typ_dn, df_cert.curr_exch_rate_typ_nm,
                   f.lit("NO VALUE")).alias("curr_exch_rate_typ_dn"),
        f.nanvl(df_at.acct_trsn_typ_cd, f.lit("?")).alias("acct_trsn_typ_cd"),
        f.coalesce(df_att.acct_trsn_typ_nm, df_att.acct_trsn_typ_dn,
                   f.lit("NO VALUE")).alias("acct_trsn_typ_nm"),
        f.coalesce(df_att.acct_trsn_typ_dn, df_att.acct_trsn_typ_nm,
                   f.lit("NO VALUE")).alias("acct_trsn_typ_dn"),
        f.nanvl(
            f.when(df_ati.acct_trsn_itm_etry_typ_cd == "S", f.lit("DR"))
             .when(df_ati.acct_trsn_itm_etry_typ_cd == "H", f.lit("CR"))
             .otherwise(df_ati.acct_trsn_itm_etry_typ_cd),
            f.lit("NO VALUE")).alias("acct_trsn_itm_etry_typ_cd"),
        f.coalesce(df_atiet.acct_trsn_itm_etry_typ_nm, df_atiet.acct_trsn_itm_etry_typ_dn,
                   f.lit("NO VALUE")).alias("acct_trsn_itm_etry_typ_nm"),
        f.nanvl(
            f.when(df_at.acct_trsn_bckpst_ind == "X", f.lit("Y")).otherwise(f.lit("N")),
            f.lit("?")).alias("acct_trsn_bckpst_ind"),
        f.when(f.nanvl(df_at.rvrsl_rsn_cd, f.lit("")) == "", f.lit("?"))
         .otherwise(df_at.rvrsl_rsn_cd).alias("rvrsl_rsn_cd"),
        f.coalesce(df_rr.rvrsl_rsn_nm, df_rr.rvrsl_rsn_dn, f.lit("NO VALUE")).alias("rvrsl_rsn_nm"),
        f.coalesce(df_rr.rvrsl_rsn_dn, df_rr.rvrsl_rsn_nm, f.lit("NO VALUE")).alias("rvrsl_rsn_dn"),
        f.lit("1051041"),
        f.nanvl(df_ss.src_sys_nm, f.lit("NO VALUE")).alias("src_sys_nm"),
        f.current_timestamp(),
        f.lit('rz_gnrl_ldgr_acct_trsn_atr_dmnsn.hql'),
        f.lit('N'),
        f.lit("1051041"))

    df_hardcode = spark.sql(
        "select -1051041, '?', 'NO VALUE', 'NO VALUE', '?', 'NO VALUE', 'NO VALUE', '?', "
        "'NO VALUE', 'N', '?', 'NO VALUE', 'NO VALUE', '1051041', ss.src_sys_nm, "
        "from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), "
        "'rz_gnrl_ldgr_acct_trsn_atr_dmnsn.hql', 'N', '1051041' from radar.src_sys ss "
        "where ss.src_sys_cd=1051041")

    df_gnrl_ldgr_acct_trsn_atr_dmnsn.createOrReplaceTempView("tmp")
    spark.sql("select cast( rpad('1051041',18,'0') as bigint )+gnrl_ldgr_acct_trsn_atr_dmnsn_sequence() as acct_trsn_atr_ky, "
              "tmp.* from tmp").unionAll(df_hardcode).drop_duplicates() \
        .createOrReplaceTempView("gnrl_ldgr_acct_trsn_atr_dmnsn")

    # insert data
    spark.sql(
        "insert overwrite table radar_rz.gnrl_ldgr_acct_trsn_atr_dmnsn partition (src_sys_cd) "
        "select * from gnrl_ldgr_acct_trsn_atr_dmnsn"
    )

    # test data
    spark.sql("select * from radar_rz.gnrl_ldgr_acct_trsn_atr_dmnsn limit 10").show()
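# The S/H recode above chains `when` with nanvl as a last resort; since the
# codes are strings, a NULL code slips past nanvl, whereas
# coalesce(..., f.lit("NO VALUE")) would catch it. A minimal sketch of the
# recode itself (assumes a live SparkSession named `spark`):
from pyspark.sql import functions as F

df = spark.createDataFrame([("S",), ("H",), ("Z",), (None,)], ["cd"])

df.select(
    F.when(F.col("cd") == "S", "DR")
     .when(F.col("cd") == "H", "CR")
     .otherwise(F.col("cd")).alias("etry_typ")
).show()
# S -> DR, H -> CR, Z -> Z, NULL -> NULL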