Example #1
def test_nanvl(data_gen):
    # Non-null scalar from the same generator, used as a literal fallback value.
    s1 = gen_scalar(data_gen, force_no_nulls=True)
    data_type = data_gen.data_type
    # Check that GPU and CPU agree on nanvl over column/column, column/scalar,
    # null-literal/column, and NaN-literal/column combinations.
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).select(
            f.nanvl(f.col('a'), f.col('b')),
            f.nanvl(f.col('a'), s1.cast(data_type)),
            f.nanvl(f.lit(None).cast(data_type), f.col('b')),
            f.nanvl(f.lit(float('nan')).cast(data_type), f.col('b'))))
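For reference, a minimal, self-contained sketch (toy data invented for illustration, not part of the test above) of what F.nanvl does: it returns its first argument unless that value is NaN, in which case it falls back to the second; nulls are not NaN and pass through unchanged.

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(1.0, 2.0), (float('nan'), 3.0), (None, 4.0)], ['a', 'b'])
# Expected rows: 1.0 (kept), 3.0 (NaN replaced by b), null (null is not NaN)
df.select(f.nanvl('a', 'b').alias('nanvl_a_b')).show()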
Example #2
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special handle floating point types because Spark's count treats nan as a valid value,
    # whereas Pandas count doesn't include nan.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
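A hedged, self-contained illustration (toy data, independent of the module above) of why _count_expr wraps float columns in nanvl: Spark's count treats NaN as a countable value, so converting NaN to null first makes count skip it, matching pandas semantics.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1.0,), (float('nan'),), (None,)], ['x'])
df.select(
    F.count('x').alias('spark_count'),                         # 2: NaN is counted
    F.count(F.nanvl('x', F.lit(None))).alias('pandas_style'),  # 1: NaN excluded
).show()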
Example #3
def nan_to_null(self, index_ops: IndexOpsLike) -> IndexOpsLike:
    # Special handle floating point types because Spark's count treats nan as a valid value,
    # whereas pandas count doesn't include nan.
    return index_ops._with_new_scol(
        F.nanvl(index_ops.spark.column, SF.lit(None)),
        field=index_ops._internal.data_fields[0].copy(nullable=True),
    )
Example #4
    def _reduce_for_stat_function(self, sfun, only_numeric):
        groupkeys = self._groupkeys
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        sdf = self._kdf._sdf

        data_columns = []
        if len(self._agg_columns) > 0:
            stat_exprs = []
            for ks in self._agg_columns:
                spark_type = ks.spark_type
                # TODO: we should have a function that takes dataframes and converts the numeric
                # types. Converting the NaNs is used in a few places, it should be in utils.
                # Special handle floating point types because Spark's count treats nan as a valid
                # value, whereas Pandas count doesn't include nan.
                if isinstance(spark_type, DoubleType) or isinstance(
                        spark_type, FloatType):
                    stat_exprs.append(
                        sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                    data_columns.append(ks.name)
                elif isinstance(spark_type, NumericType) or not only_numeric:
                    stat_exprs.append(sfun(ks._scol).alias(ks.name))
                    data_columns.append(ks.name)
            sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
        else:
            sdf = sdf.select(*groupkey_cols).distinct()
        sdf = sdf.sort(*groupkey_cols)
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=data_columns,
                                  index_map=[('__index_level_{}__'.format(i),
                                              s.name)
                                             for i, s in enumerate(groupkeys)])
        return DataFrame(internal)
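The same NaN-to-null conversion matters for aggregates other than count; here is a small standalone sketch (invented data, not koalas internals): Spark's avg propagates NaN, while avg over the nanvl-converted column skips it, which is what pandas' mean does by default.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [('a', 1.0), ('a', float('nan')), ('b', 2.0)], ['key', 'val'])
df.groupby('key').agg(
    F.avg('val').alias('avg_raw'),                                # NaN for key 'a'
    F.avg(F.nanvl('val', F.lit(None))).alias('avg_like_pandas'),  # 1.0 for key 'a'
).show()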
Example #5
    def _reduce_for_stat_function(self, sfun, only_numeric):
        groupkeys = self._groupkeys
        groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                         for i, s in enumerate(groupkeys)]
        sdf = self._kdf._sdf

        data_columns = []
        if len(self._agg_columns) > 0:
            stat_exprs = []
            for ks in self._agg_columns:
                spark_type = ks.spark_type
                # TODO: we should have a function that takes dataframes and converts the numeric
                # types. Converting the NaNs is used in a few places, it should be in utils.
                # Special handle floating point types because Spark's count treats nan as a valid
                # value, whereas Pandas count doesn't include nan.
                if isinstance(spark_type, DoubleType) or isinstance(spark_type, FloatType):
                    stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                    data_columns.append(ks.name)
                elif isinstance(spark_type, NumericType) or not only_numeric:
                    stat_exprs.append(sfun(ks._scol).alias(ks.name))
                    data_columns.append(ks.name)
            sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
        else:
            sdf = sdf.select(*groupkey_cols).distinct()
        sdf = sdf.sort(*groupkey_cols)
        metadata = Metadata(data_columns=data_columns,
                            index_map=[('__index_level_{}__'.format(i), s.name)
                                       for i, s in enumerate(groupkeys)])
        return DataFrame(sdf, metadata)
Example #6
def fillspark(hist, df):
    import pyspark.sql.functions as fcns

    indexes = []
    for axis in hist._group + hist._fixed:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))

        if isinstance(axis, histbook.axis.groupby):
            indexes.append(exprcol)

        elif isinstance(axis, histbook.axis.groupbin):
            scaled = (exprcol - float(axis.origin)) * (1.0 /
                                                       float(axis.binwidth))
            if axis.closedlow:
                discretized = fcns.floor(scaled)
            else:
                discretized = fcns.ceil(scaled) - 1
            indexes.append(
                fcns.nanvl(
                    discretized * float(axis.binwidth) + float(axis.origin),
                    fcns.lit("NaN")))

        elif isinstance(axis, histbook.axis.bin):
            scaled = (exprcol -
                      float(axis.low)) * (int(axis.numbins) /
                                          (float(axis.high) - float(axis.low)))
            if axis.closedlow:
                discretized = fcns.floor(scaled) + 1
            else:
                discretized = fcns.ceil(scaled)
            indexes.append(
                fcns.when(
                    fcns.isnull(exprcol) | fcns.isnan(exprcol),
                    int(axis.numbins) + 2).otherwise(
                        fcns.greatest(
                            fcns.lit(0),
                            fcns.least(fcns.lit(int(axis.numbins) + 1),
                                       discretized))))

        elif isinstance(axis, histbook.axis.intbin):
            indexes.append(
                fcns.greatest(
                    fcns.lit(0),
                    fcns.least(fcns.lit(int(axis.max) - int(axis.min) + 1),
                               fcns.round(exprcol - int(axis.min) + 1))))

        elif isinstance(axis, histbook.axis.split):

            def build(x, i):
                if i < len(axis.edges):
                    if axis.closedlow:
                        return build(x.when(exprcol < float(axis.edges[i]), i),
                                     i + 1)
                    else:
                        return build(
                            x.when(exprcol <= float(axis.edges[i]), i), i + 1)
                else:
                    return x.otherwise(i)

            indexes.append(
                build(
                    fcns.when(
                        fcns.isnull(exprcol) | fcns.isnan(exprcol),
                        len(axis.edges) + 1), 0))

        elif isinstance(axis, histbook.axis.cut):
            indexes.append(fcns.when(exprcol, 0).otherwise(1))

        else:
            raise AssertionError(axis)

    aliasnum = [-1]

    def alias(x):
        aliasnum[0] += 1
        return x.alias("@" + str(aliasnum[0]))

    index = alias(fcns.struct(*indexes))

    selectcols = [index]
    if hist._weightoriginal is not None:
        weightcol = tocolumns(df, histbook.instr.totree(hist._weightparsed))
    for axis in hist._profile:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))
        if hist._weightoriginal is None:
            selectcols.append(alias(exprcol))
            selectcols.append(alias(exprcol * exprcol))
        else:
            selectcols.append(alias(exprcol * weightcol))
            selectcols.append(alias(exprcol * exprcol * weightcol))

    if hist._weightoriginal is None:
        df2 = df.select(*selectcols)
    else:
        selectcols.append(alias(weightcol))
        selectcols.append(alias(weightcol * weightcol))
        df2 = df.select(*selectcols)

    aggs = [fcns.sum(df2[n]) for n in df2.columns[1:]]
    if hist._weightoriginal is None:
        aggs.append(fcns.count(df2[df2.columns[0]]))

    def getornew(content, key, nextaxis):
        if key in content:
            return content[key]
        elif isinstance(nextaxis, histbook.axis.GroupAxis):
            return {}
        else:
            return numpy.zeros(hist._shape, dtype=histbook.hist.COUNTTYPE)

    def recurse(index, columns, axis, content):
        if len(axis) == 0:
            content += columns

        elif isinstance(axis[0],
                        (histbook.axis.groupby, histbook.axis.groupbin)):
            content[index[0]] = recurse(
                index[1:], columns, axis[1:],
                getornew(content, index[0],
                         axis[1] if len(axis) > 1 else None))
            if isinstance(axis[0], histbook.axis.groupbin) and None in content:
                content["NaN"] = content[None]
                del content[None]

        elif isinstance(
                axis[0],
            (histbook.axis.bin, histbook.axis.intbin, histbook.axis.split)):
            i = index[0] - (1 if not axis[0].underflow else 0)
            if int(i) < axis[0].totbins:
                recurse(index[1:], columns, axis[1:], content[int(i)])

        elif isinstance(axis[0], histbook.axis.cut):
            recurse(index[1:], columns, axis[1:],
                    content[0 if index[0] else 1])

        else:
            raise AssertionError(axis[0])

        return content

    query = df2.groupBy(df2[df2.columns[0]]).agg(*aggs)

    def wait():
        for row in query.collect():
            recurse(row[0], row[1:], hist._group + hist._fixed, hist._content)

    return wait
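A standalone sketch (sample data and bin parameters made up, no histbook dependency) of the bin-index pattern used above for histbook.axis.bin: NaN or null values go to an overflow slot, and everything else is clamped into the range [0, numbins + 1].

from pyspark.sql import SparkSession
import pyspark.sql.functions as fcns

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(0.05,), (0.55,), (2.0,), (float("nan"),), (None,)], ["x"])

low, high, numbins = 0.0, 1.0, 10
scaled = (fcns.col("x") - low) * (numbins / (high - low))
discretized = fcns.floor(scaled) + 1  # closedlow=True branch above
index = fcns.when(
    fcns.isnull("x") | fcns.isnan("x"), numbins + 2
).otherwise(
    fcns.greatest(fcns.lit(0), fcns.least(fcns.lit(numbins + 1), discretized)))
df.select("x", index.alias("bin_index")).show()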
Example #7
    def _fit(self, ratings_df):
        '''
        Fit ALS model using reviews as training data.

        Parameters
        ==========
        ratings_df      (pyspark.sql.DataFrame) Data used to train recommender
                        model. Columns are 'user', 'item', and 'rating'. Values
                        of user and item must be numeric. Values of rating
                        range from 1 to 5.

        Returns
        =======
        self
        '''
        # avg_rating_df = (
        #     ratings_df
        #     .groupBy()
        #     .avg(self.getRatingCol())
        #     .withColumnRenamed('avg({})'.format(self.getRatingCol()),
        #                        'avg_rating')
        # )

        # print('Fit starting!')

        start_time = time.monotonic()

        # print('ratings_df')
        # ratings_df.show()

        rating_stats_df = (
            ratings_df
            .agg(
                F.avg(self.getRatingCol()).alias('avg_rating'),
                F.stddev_samp(self.getRatingCol()).alias('stddev_rating')
            )
        )

        # print('ratings_stats_df:')
        # rating_stats_df.show()

        # if not self.getUseALS():
        #     self.setLambda_1(0.0)
        #     self.setLambda_2(0.0)

        item_bias_df = (
            ratings_df
            .crossJoin(rating_stats_df)
            .withColumn(
                'diffs_item_rating',
                F.col(self.getRatingCol()) - F.col('avg_rating')
            )
            .groupBy(self.getItemCol())
            .agg(
                F.avg(F.col('diffs_item_rating')).alias('avg_diffs_item_rating'),
                F.nanvl(
                    F.stddev_samp(F.col('diffs_item_rating')),
                    F.lit(2.147483647E9)
                ).alias('stddev_diffs_item_rating'),
                F.count("*").alias('count_item_rating')
            )
            .withColumn(
                'stderr_diffs_item_rating',
                (self.getLambda_1() + F.col('stddev_diffs_item_rating'))
                / F.sqrt('count_item_rating')
            )
            .withColumn(
                'item_bias',
                F.col('avg_diffs_item_rating')
                / (1 +  F.col('stderr_diffs_item_rating'))
            )
            .select(
                self.getItemCol(),
                'item_bias',
                'avg_diffs_item_rating',
                'stderr_diffs_item_rating',
                'stddev_diffs_item_rating',
                'count_item_rating'
            )
        )

        # print('item_bias_df:')
        # item_bias_df.show(5)

        # item_bias_df.printSchema()

        # print('item_bias_df NaN')
        # item_bias_df.where(F.isnan("item_bias")).show()

        user_bias_df = (
            ratings_df
            .crossJoin(rating_stats_df)
            .join(item_bias_df, on=self.getItemCol())
            .withColumn(
                'diffs_user_rating',
                F.col(self.getRatingCol()) - F.col('avg_rating') - F.col('item_bias')
            )
            .groupBy(self.getUserCol())
            .agg(
                F.avg(F.col('diffs_user_rating')).alias('avg_diffs_user_rating'),
                F.nanvl(
                    F.stddev_samp(F.col('diffs_user_rating')),
                    F.lit(2.147483647E9)
                ).alias('stddev_diffs_user_rating'),
                F.count("*").alias('count_user_rating')
            )
            .withColumn(
                'stderr_diffs_user_rating',
                (self.getLambda_2() + F.col('stddev_diffs_user_rating'))
                / F.sqrt('count_user_rating')
            )
            .withColumn(
                'user_bias',
                F.col('avg_diffs_user_rating')
                / (1 + F.col('stderr_diffs_user_rating'))
            )
            .select(
                self.getUserCol(),
                'user_bias',
                'avg_diffs_user_rating',
                'stderr_diffs_user_rating',
                'stddev_diffs_user_rating',
                'count_user_rating'
            )
        )

        # print('user_bias_df:')
        # user_bias_df.show(5)

        # print('user_bias_df NaN')
        # user_bias_df.where(F.isnan("user_bias")).show()

        if self.getUseALS():
            if self.getUseBias():
                residual_df = (
                    ratings_df
                    .crossJoin(rating_stats_df)
                    .join(user_bias_df, on=self.getUserCol())
                    .join(item_bias_df, on=self.getItemCol())
                    .withColumn(
                        self.getRatingCol(),
                        F.col(self.getRatingCol())
                        - F.col('avg_rating')
                        - F.col('user_bias')
                        - F.col('item_bias')
                    )
                    .select(
                        self.getUserCol(),
                        self.getItemCol(),
                        self.getRatingCol()
                    )
                )

            else:
                residual_df = ratings_df
                # self.setColdStartStrategy('drop')

            residual_stats_df = (
                residual_df
                .agg(
                    F.avg(F.col(self.getRatingCol())).alias('avg_residual'),
                    F.stddev(F.col(self.getRatingCol())).alias('stddev_residual')
                )
            )

            # print('residual_df')
            # residual_df.show()

            # print('residual_df NaN')
            # residual_df.where(F.isnan("rating")).show()

            # print('residual_stats_df')
            # residual_stats_df.show()

            als_model = ALS(
                rank=self.getRank(),
                maxIter=self.getMaxIter(),
                regParam=self.getRegParam(),
                numUserBlocks=self.getNumUserBlocks(),
                numItemBlocks=self.getNumItemBlocks(),
                implicitPrefs=self.getImplicitPrefs(),
                alpha=self.getAlpha(),
                userCol=self.getUserCol(),
                itemCol=self.getItemCol(),
                ratingCol=self.getRatingCol(),
                nonnegative=self.getNonnegative(),
                checkpointInterval=self.getCheckpointInterval(),
                intermediateStorageLevel=self.getIntermediateStorageLevel(),
                finalStorageLevel=self.getFinalStorageLevel()
            )

            recommender = als_model.fit(residual_df)

        else:
            recommender = None
            residual_stats_df = None

        print('Fit done in {} seconds'.format(time.monotonic() - start_time))

        return(
            RecommenderModel(
                self.getUseALS(), self.getUseBias(), self.getLambda_3(),
                # self.getColdStartStrategy(),
                recommender, rating_stats_df, residual_stats_df,
                user_bias_df, item_bias_df
            )
        )
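A hedged sketch (toy ratings data, column names invented) of why the aggregations in _fit wrap stddev_samp in nanvl: a group with only a single rating can make stddev_samp come out as NaN, and replacing that NaN with the large sentinel 2.147483647E9 drives the corresponding bias term toward zero instead of propagating NaN.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
ratings_df = spark.createDataFrame(
    [(1, 4.0), (1, 5.0), (2, 3.0)], ["item", "rating"])  # item 2 has one row
ratings_df.groupBy("item").agg(
    F.stddev_samp("rating").alias("stddev_raw"),          # NaN for item 2
    F.nanvl(F.stddev_samp("rating"),
            F.lit(2.147483647E9)).alias("stddev_guarded"),  # sentinel instead
).show()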
def main(spark):

    # sql declaration
    bsad = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsad"
    bsid = "select bukrs,belnr,gjahr,buzei, blart,rebzg,cpudt,budat, shkzg,sgtxt,kunnr,prctr, kostl,waers,dmbtr, " \
           "monat, wrbtr,dmbe2 from 200836_az_fi1_1051041.bsid"
    bseg = "select * from 200836_az_fi1_1051041.bseg_nw"
    knb1 = "select bukrs, akont, kunnr from 200836_az_fi1_1051041.knb1 WHERE knb1.AKONT is NOT NULL and knb1.akont<>''"
    tka02 = "select bukrs, kokrs from 200836_az_fi1_1051041.tka02 where tka02.GSBER is null or trim(tka02.GSBER)=''"
    lgl_ent_ldgr = "select lgl_ent_ldgr_id, lgl_ent_ldgr_cd from radar.lgl_ent_ldgr where src_sys_cd='1051041'"
    fscl_prd = "select fscl_prd_id, fscl_prd_yr_nr, fscl_prd_sqn_nr, fscl_prd_yr_vrnt_txt from radar.fscl_prd" \
               " where src_sys_cd='1051041' and fscl_prd_typ_cd='Month'"
    vbrk = "select vbeln, fkart from 200836_az_fi1_1051041.vbrk"
    t001 = "select * from 200836_az_fi1_1051041.t001"

    # create dataframes
    df_bsad = spark.sql(bsad).persist(StorageLevel(True, True, False, False,
                                                   1))
    df_bsid = spark.sql(bsid).persist(StorageLevel(True, True, False, False,
                                                   1))
    df_bseg = spark.sql(bseg)
    df_knb1 = spark.sql(knb1)
    df_tka02 = spark.sql(tka02)
    df_lgl_ent_ldgr = spark.sql(lgl_ent_ldgr)
    df_fscl_prd = spark.sql(fscl_prd)
    df_vbrk = spark.sql(vbrk)
    df_t001 = spark.sql(t001)

    # temporary table
    df_bsad_bsid = df_bsad.select(df_bsad.bukrs, df_bsad.belnr, df_bsad.gjahr, df_bsad.buzei)\
        .unionAll(df_bsid.select(df_bsid.bukrs, df_bsid.belnr, df_bsid.gjahr, df_bsid.buzei))
    df_stg_bsad_bsid_bseg = df_bsad_bsid.join(
        df_bseg,[df_bsad_bsid.bukrs == df_bseg.bukrs, df_bsad_bsid.belnr == df_bseg.belnr,
                 df_bsad_bsid.gjahr == df_bseg.gjahr, df_bsad_bsid.buzei == df_bseg.buzei], 'inner')\
        .select(df_bseg.menge, df_bseg.bukrs, df_bseg.belnr, df_bseg.gjahr, df_bseg.buzei)

    sql_rate = "select from_curr, to_curr, budat, curr_from_dt, CURR_DRVD_EXCH_RATE from ( select stg.from_curr, " \
               "stg.to_curr, budat, stg.curr_from_dt, stg.CURR_DRVD_EXCH_RATE, " \
               "row_number() over(partition by stg.from_curr,stg.to_curr,budat order by stg.curr_from_dt desc) as rn" \
               " from ( select distinct bsad.budat from ( select budat from 200836_az_fi1_1051041.bsad" \
               " union all select budat from 200836_az_fi1_1051041.bsid )bsad )src" \
               "inner join 200836_az_fi1_1051041.stg_tcurf_tcurr stg where budat>=stg.curr_from_dt" \
               ")src1 where rn=1"
    df_rate = spark.sql(sql_rate).persist(
        StorageLevel(True, True, False, False, 1))

    # joins
    # at inner join ati
    df_master = df_bsad.select("bukrs", "belnr", "gjahr", "buzei", "blart", "rebzg", "cpudt", "budat", "shkzg", "sgtxt",
                               "kunnr", "prctr", "kostl", "waers", "dmbtr", "monat", "wrbtr", "dmbe2")\
        .unionAll(df_bsid.select("bukrs", "belnr", "gjahr", "buzei", "blart", "rebzg", "cpudt", "budat", "shkzg",
                                 "sgtxt", "kunnr", "prctr", "kostl", "waers", "dmbtr", "monat", "wrbtr", "dmbe2"))
    joined_df = df_master.join(df_t001, df_master.bukrs == df_t001.bukrs, 'inner') \
        .join(df_vbrk, df_vbrk.vbeln == df_master.rebzg, 'left_outer') \
        .join(df_knb1, [df_master.bukrs == df_knb1.bukrs, df_master.kunnr == df_knb1.kunnr], 'left_outer') \
        .join(df_fscl_prd, [df_master.gjahr == df_fscl_prd.fscl_prd_yr_nr,
                            df_master.monat == df_fscl_prd.fscl_prd_sqn_nr,
                            df_fscl_prd.fscl_prd_yr_vrnt_txt == df_t001.periv], 'left_outer') \
        .join(df_lgl_ent_ldgr, df_lgl_ent_ldgr.lgl_ent_ldgr_cd == df_t001.ktopl, 'left_outer') \
        .join(df_tka02, df_master.bukrs == df_tka02.bukrs, 'left_outer') \
        .join(df_stg_bsad_bsid_bseg, [df_stg_bsad_bsid_bseg.bukrs == df_master.bukrs,
                                      df_stg_bsad_bsid_bseg.belnr == df_master.belnr,
                                      df_stg_bsad_bsid_bseg.gjahr == df_master.gjahr,
                                      df_stg_bsad_bsid_bseg.buzei == df_master.buzei], 'left_outer') \
        .join(df_rate, [df_rate.from_curr == df_master.waers,
                        df_rate.to_curr == 'USD',
                        df_rate.budat == df_master.budat], 'left_outer').alias("stg1") \
        .join(df_rate, [df_rate.from_curr == df_master.waers,
                        df_rate.to_curr == df_t001.waers,
                        df_rate.budat == df_master.budat], 'left_outer').alias("stg2") \
        .filter("blart not in ('DZ', 'ZP', 'CT', 'CZ', 'DZ', 'FZ', 'PA', 'ZP', 'ZU')")

    # please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
    df_sb_ldgr_actv = joined_df.select(
        f.lit("1051041").alias("p_src_sys_cd"),
        f.concat(df_master.bukrs, df_master.belnr,
                 df_master.gjahr.cast("string"),
                 df_master.buzei.cast("string")).alias("sb_ldgr_actv_dcmt_nr"),
        f.lit("AR").alias("sb_ldgr_acct_typ_cd"),
        df_master.blart.alias("sb_ldgr_actv_typ_cd"),
        df_master.rebzg.alias("sb_ldgr_actv_inv_dcmt_nr"),
        f.when(df_master.rebzg != '',
               df_vbrk.fkart).alias("sb_ldgr_actv_inv_dcmt_typ_dn"),
        f.lit(None).alias("sb_ldgr_actv_csh_dcmt_nr"),
        f.lit(None).alias("sb_ldgr_actv_csh_dcmt_typ_dn"),
        f.concat(df_knb1.bukrs, f.lit('---'), df_knb1.akont,
                 f.lit('---')).alias("sb_ldgr_actv_gnrl_ldgr_acct_cd"),
        df_master.cpudt.alias("sb_ldgr_actv_crt_ts"),
        df_master.cpudt.alias("sb_ldgr_actv_etry_ts"),
        df_master.budat.alias("sb_ldgr_actv_to_be_pstg_ts"),
        df_master.shkzg.alias("acct_trsn_itm_etry_typ_cd"),
        df_master.sgtxt.alias("sb_ldgr_actv_cmt"),
        f.concat(
            df_master.bukrs, df_master.belnr,
            df_master.gjahr.cast("string")).alias("sb_ldgr_actv_trsn_btch_id"),
        df_t001.land1.alias("ctry_cd"),
        df_master.bukrs.alias("entrs_lgl_ent_nr"),
        df_lgl_ent_ldgr.lgl_ent_ldgr_id.alias("lgl_ent_ldgr_id"),
        df_master.kunnr.alias("sb_ldgr_acct_id"),
        df_fscl_prd.fscl_prd_id.alias("fscl_prd_id"),
        f.when((f.nanvl(df_tka02.kokrs, f.lit("")) == "")
               | (f.nanvl(df_master.prctr, f.lit("")) == ""),
               f.lit(None)).otherwise(
                   f.concat(df_tka02.kokrs,
                            df_master.prctr)).alias("pft_cntr_cd"),
        f.when((f.nanvl(df_tka02.kokrs, f.lit("")) == "")
               | (f.nanvl(df_master.kostl, f.lit("")) == ""),
               f.lit(None)).otherwise(
                   f.concat(df_tka02.kokrs,
                            df_master.kostl)).alias("pstg_obj_nn_cntrct_id"),
        f.lit(None).alias("pstg_obj_cntrct_id"),
        f.lit(None).alias("prt_id"),
        f.lit(None).alias("prod_id"),
        df_stg_bsad_bsid_bseg.menge.alias("sb_ldgr_actv_qty"),
        f.lit('M').alias("curr_exch_rate_typ_cd"),
        f.lit(None).alias("sb_ldgr_actv_lcl_to_grp_curr_exch_rate"),
        f.when(df_master.waers == 'USD',
               f.lit(1.0)).otherwise(
                   f.col("stg1.CURR_DRVD_EXCH_RATE")).alias(
                       "sb_ldgr_actv_trsn_to_grp_curr_exch_rate"),
        f.when(df_master.waers == 'USD',
               f.lit(1.0)).otherwise(
                   f.col("stg2.CURR_DRVD_EXCH_RATE")).alias(
                       "sb_ldgr_actv_trsn_to_lcl_curr_exch_rate"),
        df_t001.waers.alias("sb_ldgr_actv_lcl_curr_cd"),
        df_master.waers.alias("sb_ldgr_actv_trsn_curr_cd"),
        df_master.dmbtr.alias("sb_ldgr_actv_lcl_curr_amt"),
        df_master.wrbtr.alias("sb_ldgr_actv_trsn_curr_amt"),
        df_master.dmbe2.alias("sb_ldgr_actv_grp_curr_amt"),
        f.current_timestamp(), f.lit('ic_fi1_1051041_sb_ldgr_actv.hql'),
        f.lit('N').alias("radar_dlt_ind"),
        f.lit(None).alias("sb_ldgr_actv_gnrl_ldgr_acct_typ_dn"),
        f.lit("COMPANY").alias("entrs_lgl_ent_bsn_rol_cd"),
        f.lit("1051041").alias("src_sys_cd"))

    # insert
    df_sb_ldgr_actv.write.mode("overwrite").insertInto("radar.sb_ldgr_actv",
                                                       overwrite=True)

    # test data
    spark.sql(
        "select * from radar.sb_ldgr_actv where src_sys_cd=1051041 and sb_ldgr_acct_typ_cd='AR'"
    ).take(10)
def main(spark):

    # sql declaration
    acct_trsn = "select acct_trsn_id,curr_exch_rate_typ_cd,acct_trsn_typ_cd,acct_trsn_bckpst_ind," \
                "rvrsl_rsn_cd from radar.acct_trsn where src_sys_cd='1051041'"
    acct_trsn_itm = "select acct_trsn_id, acct_trsn_itm_etry_typ_cd from radar.acct_trsn_itm where src_sys_cd='1051041'"
    rvrsl_rsn = "select rvrsl_rsn_cd,rvrsl_rsn_nm,rvrsl_rsn_dn from radar.rvrsl_rsn where src_sys_cd='1051041'"
    curr_exch_rate_typ = "select curr_exch_rate_typ_cd,curr_exch_rate_typ_nm,curr_exch_rate_typ_dn" \
                         " from radar.curr_exch_rate_typ where src_sys_cd='1051041'"
    acct_trsn_typ = "select acct_trsn_typ_cd,acct_trsn_typ_nm,acct_trsn_typ_dn" \
                    " from radar.acct_trsn_typ where src_sys_cd='1051041'"
    acct_trsn_itm_etry_typ = "select acct_trsn_itm_etry_typ_cd,acct_trsn_itm_etry_typ_nm,acct_trsn_itm_etry_typ_dn" \
                             " from radar.acct_trsn_itm_etry_typ where src_sys_cd='1051041'"

    # create dataframes
    df_at = spark.sql(acct_trsn)
    df_ati = spark.sql(acct_trsn_itm)
    df_rr = spark.sql(rvrsl_rsn)
    df_cert = spark.sql(curr_exch_rate_typ)
    df_att = spark.sql(acct_trsn_typ)
    df_atiet = spark.sql(acct_trsn_itm_etry_typ)
    df_ss = spark.sql("select * from radar.src_sys")

    # join conditions
    cond_rr = [df_at.rvrsl_rsn_cd == df_rr.rvrsl_rsn_cd]
    cond_cert = [df_at.curr_exch_rate_typ_cd == df_cert.curr_exch_rate_typ_cd]
    cond_att = [df_at.acct_trsn_typ_cd == df_att.acct_trsn_typ_cd]
    cond_atiet = [
        df_ati.acct_trsn_itm_etry_typ_cd == df_atiet.acct_trsn_itm_etry_typ_cd
    ]

    # joins
    # at inner join ati
    df_master = df_at.join(df_ati, df_at.acct_trsn_id == df_ati.acct_trsn_id,
                           "inner")
    joined_df = df_master.join(df_rr, cond_rr, 'left_outer') \
        .join(df_cert, cond_cert, 'left_outer') \
        .join(df_att, cond_att, 'left_outer') \
        .join(df_atiet, cond_atiet, 'left_outer') \
        .join(df_ss, df_ss.src_sys_cd == "1051041", 'left_outer')

    spark.sql(
        "create temporary function gnrl_ldgr_acct_trsn_atr_dmnsn_sequence as 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence'"
    )

    # please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
    df_gnrl_ldgr_acct_trsn_atr_dmnsn = joined_df.select(
        f.nanvl(df_at.curr_exch_rate_typ_cd,
                f.lit("?")).alias("curr_exch_rate_typ_cd"),
        f.coalesce(df_cert.curr_exch_rate_typ_nm,
                   df_cert.curr_exch_rate_typ_dn,
                   f.lit("NO VALUE")).alias("curr_exch_rate_typ_nm"),
        f.coalesce(df_cert.curr_exch_rate_typ_dn,
                   df_cert.curr_exch_rate_typ_nm,
                   f.lit("NO VALUE")).alias("curr_exch_rate_typ_dn"),
        f.nanvl(df_at.acct_trsn_typ_cd, f.lit("?")).alias("acct_trsn_typ_cd"),
        f.coalesce(df_att.acct_trsn_typ_nm, df_att.acct_trsn_typ_dn,
                   f.lit("NO VALUE")).alias("acct_trsn_typ_nm"),
        f.coalesce(df_att.acct_trsn_typ_dn, df_att.acct_trsn_typ_nm,
                   f.lit("NO VALUE")).alias("acct_trsn_typ_dn"),
        f.nanvl(
            f.when(df_ati.acct_trsn_itm_etry_typ_cd == "S", f.lit("DR")).when(
                df_ati.acct_trsn_itm_etry_typ_cd == "H",
                f.lit("CR")).otherwise(df_ati.acct_trsn_itm_etry_typ_cd),
            f.lit("NO VALUE")).alias("acct_trsn_itm_etry_typ_cd"),
        f.coalesce(df_atiet.acct_trsn_itm_etry_typ_nm,
                   df_atiet.acct_trsn_itm_etry_typ_dn,
                   f.lit("NO VALUE")).alias("acct_trsn_itm_etry_typ_nm"),
        f.nanvl(
            f.when(df_at.acct_trsn_bckpst_ind == "X",
                   f.lit("Y")).otherwise(f.lit("N")),
            f.lit("?")).alias("acct_trsn_bckpst_ind"),
        f.when(f.nanvl(df_at.rvrsl_rsn_cd, f.lit("")) == "",
               f.lit("?")).otherwise(df_at.rvrsl_rsn_cd).alias("rvrsl_rsn_cd"),
        f.coalesce(df_rr.rvrsl_rsn_nm, df_rr.rvrsl_rsn_dn,
                   f.lit("NO VALUE")).alias("rvrsl_rsn_nm"),
        f.coalesce(df_rr.rvrsl_rsn_dn, df_rr.rvrsl_rsn_nm,
                   f.lit("NO VALUE")).alias("rvrsl_rsn_dn"), f.lit("1051041"),
        f.nanvl(df_ss.src_sys_nm, f.lit("NO VALUE")).alias("src_sys_nm"),
        f.current_timestamp(), f.lit('rz_gnrl_ldgr_acct_trsn_atr_dmnsn.hql'),
        f.lit('N'), f.lit("1051041"))

    df_hardcode = spark.sql(
        "select -1051041, '?', 'NO VALUE', 'NO VALUE', '?', 'NO VALUE', 'NO VALUE', '?', "
        "'NO VALUE', 'N', '?', 'NO VALUE', 'NO VALUE', '1051041', ss.src_sys_nm, "
        "from_unixtime(unix_timestamp(), 'yyyy-mm-dd hh:mm:ss'), "
        "'rz_gnrl_ldgr_acct_trsn_atr_dmnsn.hql', 'N', '1051041' from radar.src_sys ss "
        "where ss.src_sys_cd=1051041")

    df_gnrl_ldgr_acct_trsn_atr_dmnsn.createOrReplaceTempView("tmp")
    spark.sql("select cast( rpad('1051041',18,'0') as bigint )+gnrl_ldgr_acct_trsn_atr_dmnsn_sequence() as acct_trsn_atr_ky, "
              "tmp.* from tmp").unionAll(df_hardcode).drop_duplicates()\
        .createOrReplaceTempView("gnrl_ldgr_acct_trsn_atr_dmnsn")

    # insert data
    spark.sql(
        "insert overwrite table radar_rz.gnrl_ldgr_acct_trsn_atr_dmnsn partition (src_sys_cd) select * from gnrl_ldgr_acct_trsn_atr_dmnsn"
    )
    # test Data
    spark.sql("select * from radar_rz.gnrl_ldgr_acct_trsn_atr_dmnsn limit 10"
              ).show()