Example #1
    def baseline_model(self, df_processed):
        cond_dvs = F.when(col("dvs_p30days") > 400,
                          400).otherwise(col("dvs_p30days"))


        stats=df_processed\
        .withColumn("dvs_p30days",cond_dvs)\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .groupby(['grt_l2_cat_name'])\
        .agg(F.avg(col("dvs_p30days")).alias("avg_dvs_p30days")
            ,F.stddev(col("dvs_p30days")).alias("std_deal_view")
            ,F.round(F.avg(col("dvs_p30days"))).cast('integer').alias("avg_deal_view")
            ,F.max("dvs_p30days").alias("max_deal_view"))

        w = Window.partitionBy(F.col('consumer_id')).orderBy(
            F.col('normalized_dvs_p30days').desc())

        df_final=df_processed\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .join(stats,on='grt_l2_cat_name')\
        .withColumn('normalized_dvs_p30days',(col('dvs_p30days')-col('avg_deal_view'))/col('std_deal_view'))\
        .withColumn('normalized_dvs_p30days_rank',F.row_number().over(w))


        df_micro=df_final\
        .filter(col('normalized_dvs_p30days')>=0)\
        .filter(col('normalized_dvs_p30days_rank')==1)\
        .filter(col('grt_l2_purchaser_14d')==0)

        return df_micro
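
The snippet above relies on module-level imports (pyspark.sql.functions as F, col, Window) that are not shown. Below is a minimal, self-contained sketch of the pattern it implements: cap a 30-day view count, z-score it against per-category statistics, and keep each consumer's top-ranked category with row_number. Column names are taken from the snippet; the sample rows are purely illustrative.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("c1", "beauty", 500), ("c2", "beauty", 80),
     ("c1", "travel", 120), ("c2", "travel", 40)],
    ["consumer_id", "grt_l2_cat_name", "dvs_p30days"])

# Cap the 30-day view count at 400, as the original does with when/otherwise
capped = toy.withColumn("dvs_p30days", F.least(F.col("dvs_p30days"), F.lit(400)))

# Per-category mean and standard deviation
stats = capped.groupBy("grt_l2_cat_name").agg(
    F.avg("dvs_p30days").alias("avg_deal_view"),
    F.stddev("dvs_p30days").alias("std_deal_view"))

# z-score each row against its category, then rank categories per consumer
w = Window.partitionBy("consumer_id").orderBy(F.col("normalized_dvs").desc())
(capped.join(stats, "grt_l2_cat_name")
    .withColumn("normalized_dvs",
                (F.col("dvs_p30days") - F.col("avg_deal_view")) / F.col("std_deal_view"))
    .withColumn("rank", F.row_number().over(w))
    .show())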
Example #2
    def age_calculator(self, columns, age_colname):
        """
        @columns: a string of column name
        @age_colname: a string of the new column of age, e.g.'age'
        """
        assert isinstance(columns,
                          str), "Error: columns argument must be a string!"
        assert isinstance(
            age_colname,
            str), "Error: the name for age column argument must be a string!"

        df = self._df.select(
            columns,
            from_unixtime(unix_timestamp(columns,
                                         'MM/dd/yyyy')).alias('with time'))
        df = df.withColumn('without time', df['with time'].cast(DateType()))
        df = df.withColumnRenamed(columns, 'tmp')
        df = df.withColumn(age_colname, (
            (datediff(current_date(), col('without time')).cast(FloatType())) /
            365).cast(IntegerType()))
        self._df = self._df.join(
            df,
            col(columns) == col('tmp'),
            'leftouter').drop('tmp').drop('with time').drop('without time')

        return self
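
age_calculator derives age as elapsed days divided by 365, cast to an integer (leap years are ignored, as in the original). A standalone sketch of the same idea, using to_date with an explicit format instead of from_unixtime(unix_timestamp(...)); the birth_date column and sample values are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, datediff, to_date
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([("01/15/1990",), ("07/04/2001",)], ["birth_date"])

# Same approach as age_calculator: days since the date divided by 365, cast to int
people = people.withColumn(
    "age",
    (datediff(current_date(), to_date(col("birth_date"), "MM/dd/yyyy")) / 365)
    .cast(IntegerType()))
people.show()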
Example #3
 def _preprocess(self):
     self._df = self._df[self._REQUIRED_COLUMNS]
     self._df = self._df.toDF(*self._HEADER_NAMES)
     self._df = self._df.withColumn("logged_datetime", to_timestamp(
         self._df.logged_datetime, 'yyyy/MM/dd'))
     self._df = self._df.withColumn("processed_datetime", current_date())
     self._df = self._df.withColumn('processed_datetime',
                                    to_timestamp(self._df.processed_datetime))
     self._df = self._df.withColumn(
         "source_port", self._df["source_port"].cast(IntegerType()))
     self._df = self._df.withColumn(
         "destination_port", self._df["destination_port"].cast(IntegerType()))
     self._df = self._df.withColumn(
         "bytes_sent", self._df["bytes_sent"].cast(LongType()))
     self._df = self._df.withColumn(
         "bytes_received", self._df["bytes_received"].cast(LongType()))
     self._df = self._df.withColumn(
         "packets_received", self._df["packets_received"].cast(LongType()))
     self._df = self._df.withColumn(
         "packets_sent", self._df["packets_sent"].cast(LongType()))
     self._df = self._df.withColumn(
         "time_elapsed", self._df["time_elapsed"].cast(LongType()))
     self._df = self._df.withColumn(
         "repeat_count", self._df["repeat_count"].cast(LongType()))
     self._df = self._df.withColumn("count_events", lit(1))
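
_preprocess repeats one withColumn per cast. A small refactor sketch (not the project's code) that drives the casts from a column-to-type mapping; only three of the columns above are reproduced to keep it short.

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, LongType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("443", "125", "20")],
                           ["source_port", "bytes_sent", "repeat_count"])

# One loop instead of one withColumn per cast; extend the dict for the full schema
casts = {"source_port": IntegerType(), "bytes_sent": LongType(), "repeat_count": LongType()}
for name, dtype in casts.items():
    df = df.withColumn(name, df[name].cast(dtype))
df.printSchema()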
Example #4
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if column argument a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting if column if in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(
                    date_format(
                        unix_timestamp(column, dateFormat).cast("timestamp"),
                        Format), current_date()) / 12), 4).alias(nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
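
mag() above is presumably an absolute-value helper defined elsewhere in that project. A standalone sketch of the same age-in-years computation using F.abs and to_date; the born_date column, its yyyy-MM-dd format and the sample rows are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
clients = spark.createDataFrame([("1985-03-20",), ("2000-11-02",)], ["born_date"])

# months_between(now, birth_date) / 12, formatted to 4 decimals, as ageCalculate does
clients = clients.withColumn(
    "age",
    F.format_number(
        F.abs(F.months_between(F.current_date(),
                               F.to_date(F.col("born_date"), "yyyy-MM-dd"))) / 12, 4))
clients.show()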
Example #5
def process_expense_file(file_path):
  tempDf = spark.read.load(file_path, format="csv", header="true", escape='"')
  tempDf = tempDf.filter(tempDf["Tags"].isNotNull()).select(
    col("SubscriptionName"),
    col("ResourceGroup"),
    col("ResourceLocation"),
    col("ProductName"),
    col("PreTaxCost"),
    col("ResourceType"),
    col("UsageDateTime"),
    split(col("Tags"), ",").alias("tags")
  )
  return tempDf.withColumn("costCenterNumber", lit(get_cost_center(tempDf["tags"])))\
          .withColumn("team", lit(get_team(tempDf["tags"])))\
          .withColumn("usageMonth", month(tempDf['UsageDateTime']))\
          .withColumn("usageYear", year(tempDf['UsageDateTime']))\
          .withColumn("processedDate", current_date())\
          .select(
    col("processedDate"),
    col("SubscriptionName"),
    col("ResourceGroup"),
    col("ResourceLocation"),
    col("ProductName"),
    col("ResourceType"),
    col("costCenterNumber"),
    col("team"),
    col("usageMonth"),
    col("usageYear"),
    col("UsageDateTime"),
    col("PreTaxCost")
  ).filter("costCenterNumber is not NULL")
Example #6
    def put(self, df):
        """Writes data to final CSV file and stops spark session. 

        Arguments: 
            df[spark datafarme] -- spark dataframe that
        """
        try:
            self.logger.info('Begin putting data to disk // {}'.format(
                self.etl_id))

            jdbc_params = self._inst_jdbc_params()
            jdbc_params['dbtable'] = 'final_data'

            df = df.withColumn('process_date', F.current_date())

            df.write.format('jdbc').options(**jdbc_params)\
                .mode('overwrite')\
                .save()

            self.logger.info('Job complete // {}'.format(self.etl_id))

            self.spark_session.stop()

        except Exception as e:
            self.logger.error('{} // {}'.format(e, self.etl_id))
Example #7
    def getReadRDBMSDataFrame():

        from_format = "dd/MMM/yyyy:HH:mm:ss"
        to_format = 'dd-MM-yyyy'
        storeIncrementalValue(0, "ignore")

        df = spark.read.format("csv") \
            .option("inferSchema", True) \
            .option("header", True) \
            .load(argv[17])

        df = df.filter(df.createdt.isNotNull() & (df.createdt != "null"))

        result = df.withColumn("createdt", f.from_unixtime(f.unix_timestamp(f.col("createdt"), from_format), to_format)) \
            .withColumn("value",f.col("value").cast(f.StringType()))\
            .withColumn("score",f.col("score").cast(f.StringType()))\
            .withColumn("regioncode",f.col("regioncode").cast(f.StringType()))\
            .withColumn("status",f.col("status").cast(f.StringType()))\
            .withColumn("count",f.col("count").cast(f.StringType()))\
            .withColumn("statuscode",f.col("statuscode").cast(f.StringType()))\
            .select("id", "username", "amount", "ip", "createdt", "value", "score", "regioncode", "status", "method",
                    "key", "count", "type", "site", "statuscode") \
            .withColumn("rdbms_current_date", f.current_date()) \
            .filter(f.col("id") > getIncrementalValue())

        result = result.withColumn(
            "id",
            f.col("id").cast("string")).na.fill("Not_Applicable")

        max_value = df.agg({"id": "max"}).collect()[0]
        storeIncrementalValue(max_value["max(id)"], "overwrite")

        result.printSchema()

        return result
Example #8
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if column argument a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting if column if in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(
            nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
Example #9
def county_reality_supply():
    # Last week's supply volume of this item spec, by county and tier
    try:
        print(f"{str(dt.now())} last week's supply volume of this item spec, by county and tier")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id",
                                            "cust_seg")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its counties (as a list)
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # Get last week's actual supply volume
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")

        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\
            .join(co_cust,"cust_id")

        # column name of the value to compute
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \
                                .agg(f.sum("qty_allocco").alias(colName))

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "city", "gears_data_marker", colName
        ]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county,"sale_center_id")\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .withColumnRenamed("cust_seg","gears")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
Example #10
def get_crm_cust_log(spark):
    # crm_cust_log: customer change-history table
    # change_type:  CO_CUST.STATUS    -> status change
    #               CO_CUST.CUST_SEG  -> tier change
    crm_cust_log = spark.sql("select cust_id,change_type,change_frm,change_to,audit_date from DB2_DB2INST1_CRM_CUST_LOG") \
        .withColumn("audit_date", f.to_date("audit_date", "yyyyMMdd")) \
        .withColumn("day_diff",f.datediff(f.current_date(), col("audit_date"))) \
        .withColumn("change_frm", f.regexp_replace(col("change_frm"), "(zz)|(ZZ)", "31")) \
        .withColumn("change_to", f.regexp_replace(col("change_to"), "(zz)|(ZZ)", "31"))
    return crm_cust_log
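
A hypothetical follow-up grounded in the comments above: keep only the tier changes (CO_CUST.CUST_SEG) audited within the last 30 days. It assumes the Hive table is reachable and that f/col are the usual pyspark.sql.functions imports used by the function.

# Hypothetical usage of get_crm_cust_log:
recent_tier_changes = get_crm_cust_log(spark) \
    .where((col("change_type") == "CO_CUST.CUST_SEG") & (col("day_diff") <= 30))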
Example #11
def create_oracle_output():
    df_hcp_match_final = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.path'))

    df_hcp_yesterdays_data = \
        spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.path')) \
            .filter(F.col('full_mk').isNotNull()) \
            .filter(F.col('prop_mk').isNotNull()) \
            .filter(F.col('pnr_mk').isNotNull())

    join_fields = ['lastmodified_datetime', 'full_mk', 'prop_mk', 'pnr_mk', 'match_level', 'new_rate_bucket']

    df_hcp_yesterday = df_hcp_yesterdays_data \
        .withColumn('match_flag', F.lit('Y')) \
        .join(df_hcp_match_final.select(join_fields),
              on=join_fields) \
        .persist()

    df_hcp_today = df_hcp_match_final \
        .withColumn('dl_last_modified_date', F.current_date()) \
        .withColumn('rk', F.lit(1)) \
        .withColumn('rk_f', F.lit(1)) \
        .join(df_hcp_yesterday.select(join_fields + ['match_flag']),
              on=join_fields,
              how='left') \
        .filter(F.col('match_flag').isNull())

    # TODO: Remove redundant fields and remnants of deduping keys which only contain a single value
    df_hcp_oracle = \
        cast_fields_for_oracle_output(df_hcp_yesterday) \
            .union(cast_fields_for_oracle_output(df_hcp_today)) \
            .withColumn('dwh_rk',
                        F.row_number().over(Window
                                            .partitionBy(F.col('pnr_mk'))
                                            .orderBy(F.col('pnr_mk').desc()))) \
            .filter(F.col('dwh_rk') == 1) \
            .drop('dwh_rk')

    # TODO: Investigate partitioning of Oracle data suitable for end-user
    # Dump to staging path since Spark cannot simultaneously read and write to the same path
    dump_dataframe(df=df_hcp_oracle,
                   path=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path'),
                   table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.table'))
    df_hcp_yesterday.unpersist()
    df_hcp_oracle = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path'))
    dump_dataframe(df=df_hcp_oracle,
                   path=config.get(config_set, 'hcp.txns.new.fin.f.d2.path'),
                   table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.table'))

    # Oracle table for EDM
    dump_dataframe(df=df_hcp_oracle
                   .withColumnRenamed('rate_bucket', 'original_rate_bucket')
                   .withColumnRenamed('new_rate_bucket', 'rate_bucket'),
                   path=config.get(config_set, 'hcp.txns.to.dwh.f.path'),
                   table_name=config.get(config_set, 'hcp.txns.to.dwh.f.table')
                   )
Example #12
def adobe_traffic_by_channel(df_traffic_by_channel):
    return (df_traffic_by_channel.select(
        df_traffic_by_channel['id'], df_traffic_by_channel['createdat'],
        df_traffic_by_channel['reportsuiteid'],
        df_traffic_by_channel['reportsuitename'],
        df_traffic_by_channel['server'], df_traffic_by_channel['data'],
        df_traffic_by_channel['hour'], df_traffic_by_channel['channel'],
        df_traffic_by_channel['visits'], df_traffic_by_channel['orders'],
        df_traffic_by_channel['ordersserialized'],
        df_traffic_by_channel['revenue']).withColumn(
            'dataprocessamento', f.trunc(f.current_date(), 'month')))
Example #13
def prepare_features(df):
    df = df.withColumn(
        'exclam',
        length('review_body') -
        length(regexp_replace('review_body', r'\!', '')))
    df = df.withColumn('age',
                       datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    select_cols = df.select(
        ['star_rating', 'helfulness', 'age', 'review_length',
         'label']).na.fill(0)
    return select_cols
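
A quick, self-contained check of the exclamation-mark trick used above: the count of '!' is the difference in length before and after stripping them. The sample reviews are illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.functions import length, regexp_replace

spark = SparkSession.builder.getOrCreate()
reviews = spark.createDataFrame([("Great product!!!",), ("meh",)], ["review_body"])

# Count '!' by comparing the length before and after stripping them
reviews.withColumn(
    "exclam",
    length("review_body") - length(regexp_replace("review_body", r"\!", ""))
).show()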
Example #14
def merge(spark, update, tableName, cols, key):
    """
    将DataFrame和delta表进行merge操作,insert操作要求DataFrame必须包含delta表所有的列(0.5版本)
    当我们使用merge操作更新/插入delta表其中几列时,指定在DataFrame中不存在的列的值为null。

    注:DataFrame中要写入delta表的列要和delta表一样
    :param spark,SparkSession实例
    :param update,spark DataFrame
    :param tableName,要更新的delta表
    """
    # 如果没有dt列,创建当前日期的dt列
    if "dt" not in cols:
        update = update.withColumn("dt", f.current_date())
        cols.append("dt")

    # 1. Build the merge condition
    mergeExpr = f"origin.{key}=update.{key}"
    print(f"merge expression:{mergeExpr}")

    # 2. Build the update expressions
    updateExpr = {}
    for c in cols:
        updateExpr[c] = f"update.{c}"

    print(f"update expression:{updateExpr}")

    origin = DeltaTable.forPath(spark, tableName)
    origin_cols = origin.toDF().columns

    # 3. Build the insert expressions
    insertExpr = {}
    for origin_col in origin_cols:
        if origin_col in cols:
            insertExpr[origin_col] = f"update.{origin_col}"
        else:
            # Column not provided: insert a null value (the SQL null literal, not a string)
            insertExpr[origin_col] = "null"

    print(f"insert expression:{insertExpr}")

    # for origin_col in origin_cols:
    #     if origin_col not in cols:
    #         update=update.withColumn(origin_col,f.lit(None))

    origin.alias("origin") \
        .merge(update.alias("update"), mergeExpr) \
        .whenMatchedUpdate(set=updateExpr) \
        .whenNotMatchedInsert(values=insertExpr) \
        .execute()
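
A hypothetical call of merge(), assuming a Delta-enabled SparkSession named spark, an existing Delta table at the (illustrative) path /tmp/delta/users keyed on id, and the module-level imports the function relies on (DeltaTable from delta.tables and pyspark.sql.functions as f).

# Update/insert only "id" and "name"; merge() stamps "dt" with the current date
# (it is not in cols) and inserts null for any other column of the table.
update_df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
merge(spark, update_df, "/tmp/delta/users", cols=["id", "name"], key="id")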
Example #15
 def test_current_date(self, input_df, spark_session):
     """Substitute the cleansed values with the current date"""
     cleaning_definitions = dict(status=dict(
         elements=["active", "inactive"], default=F.current_date()))
     expected_output_df = spark_session.createDataFrame([
         Row(id=1, status="active"),
         Row(id=2, status=str(dt.date.today())),
         Row(id=3, status=str(dt.date.today())),
         Row(id=4, status="inactive"),
         Row(id=5, status=None),
         Row(id=6, status=str(dt.date.today())),
     ])
     output_df = EnumCleaner(
         cleaning_definitions=cleaning_definitions).transform(input_df)
     assert_df_equality(expected_output_df, output_df)
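
The input_df fixture is not shown in this snippet; the sketch below is a guess at its shape, inferred from the expected output (ids 2, 3 and 6 hold values outside ["active", "inactive"] and are substituted with the current date, while the null in id 5 is left alone). The literal values "pending", "unknown" and "archived" are placeholders, and it assumes the test module already imports pytest and Row.

@pytest.fixture()
def input_df(spark_session):
    return spark_session.createDataFrame([
        Row(id=1, status="active"),
        Row(id=2, status="pending"),    # outside the allowed elements
        Row(id=3, status="unknown"),    # outside the allowed elements
        Row(id=4, status="inactive"),
        Row(id=5, status=None),         # nulls are not substituted
        Row(id=6, status="archived"),   # outside the allowed elements
    ])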
Example #16
        def _years_between(_new_col_name, attr):
            _date_format = attr[0]
            _col_name = attr[1]

            return F.format_number(
                F.abs(
                    F.months_between(
                        F.date_format(
                            F.unix_timestamp(_col_name, _date_format).cast("timestamp"),
                            format_dt),
                        F.current_date()) / 12),
                4).alias(_new_col_name)
Example #17
    def getRandomUserWebApiDataFrame():

        final_list = []
        user_set = set()
        results_json = dict()
        randomuserurl = argv[7]
        ALLOWED_RANDOM_USER = int(argv[8])

        while True:
            resp = json.loads(
                urllib.request.urlopen(randomuserurl).read().decode('utf-8'))
            for randomuser in resp['results']:
                username = removeDigits(randomuser.get('user').get('username'))
                if username not in user_set:
                    user_set.add(username)
                    final_list.append(randomuser)
                if len(user_set) == ALLOWED_RANDOM_USER:
                    break

            if len(user_set) == ALLOWED_RANDOM_USER:
                break

        results_json["results"] = final_list
        json_df = spark.read.json(sc.parallelize([results_json]))
        users = json_df.withColumn("results", f.explode(f.col("results"))) \
            .withColumn("r_username", f.regexp_replace("results.user.username", '[0-9]', '')) \
            .withColumn("dob", f.from_unixtime(f.unix_timestamp(f.col('results.user.dob').cast('timestamp')), 'dd-MM-yyyy')) \
            .withColumn("user_current_date", f.current_date()) \
            .select("r_username", "results.user.gender", "results.user.name.title",
                    f.col("results.user.name.first").alias("firstname"),
                    f.col("results.user.name.last").alias("lastname"),
                    f.col("results.user.location.street").alias("location_street"),
                    f.col("results.user.location.city").alias("location_city"),
                    f.col("results.user.location.state").alias("location_state"),
                    f.col("results.user.location.zip", ).alias("location_zip"),
                    "results.user.email", "results.user.password", "results.user.salt", "results.user.md5",
                    "results.user.sha1", "results.user.sha256", "results.user.registered", "dob", "results.user.phone",
                    "results.user.cell", f.col("results.user.picture.large").alias("picture_large"),
                    f.col("results.user.picture.medium").alias("picture_medium"), "results.user.picture.thumbnail",
                    "user_current_date")

        return users
Example #18
def main(inp, outp):
    listings = spark.read.csv(inp, header=True, schema=schema)
    listings = listings.drop('host_id', 'host_name', 'neighbourhood_group',
                             'reviews_per_month',
                             'calculated_host_listings_count',
                             'availability_365')

    # listings.describe(['minimum_nights']).show()

    # Give a range to minimum_nights
    listings = listings.filter("minimum_nights > 0 AND minimum_nights <= 30")

    # Remove listings with last_review posted more than 2 years ago
    listings = listings.withColumn('curr', f.current_date())
    listings = listings.withColumn(
        'diff', f.datediff(f.col('curr'), f.col('last_review')))
    cleaned_listings = listings.filter(listings['diff'] < 730).drop(
        'curr', 'diff', 'number_of_reviews', 'last_review')
    ''' Coalescing is fine, as the listing data only contains Airbnb locations in Vancouver '''
    cleaned_listings.coalesce(1).write.csv(outp, mode='overwrite', header=True)
Example #19
 def createAgeDateColumns(self, df):
     """Criar colunas de idade do cliente e data do pedido.
     
     Args:
         df: Dataframe
     
     Returns:
         Dataframe.
     """
     log(
         "INFO",
         f"[createAgeDateColumns] Criando colunas de idade e data do pedido."
     )
     return (df.withColumn(
         "data_pedido_date",
         date_format(
             from_unixtime(col('data_pedido')),
             "yyyy-MM-dd")).withColumn('idade', (months_between(
                 current_date(), col('data_nascimento_cliente')) / 12).cast(
                     IntegerType())))
Example #20
def get_grade_cons_except():
    print(f"{str(dt.now())} 近30天零售户档位与周边消费水平不符")
    try:
        consume_level=get_all_consume_level(spark)\
                             .select("city","cust_id","consume_level")
        cust_cluster=get_cust_cluster(spark)

        # Only keep currently valid retailers; drop those that were valid before but are invalid now
        valid_co_cust=get_valid_co_cust(spark).select("cust_id","sale_center_id")

        # Retailer tiers over the last 30 days; a retailer with several tiers within 30 days triggers a separate warning for each
        cust_seg=spark.sql("select * from DB2_DB2INST1_CO_CUST")\
             .where(f.datediff(f.current_date(),col("dt"))<=30)\
             .where(col("com_id").isin(["011114305","011114306","011114302"]))\
             .withColumn("cust_seg",f.regexp_replace("cust_seg","ZZ","31"))\
             .select("com_id","cust_id","cust_seg")\
             .join(valid_co_cust,"cust_id")\
             .dropDuplicates(["cust_id","cust_seg"])

        cols = {"value": "retail_rim_cons",
                "level1_code": "classify_level1_code",
                }
        cust_seg_cons=cust_seg.join(consume_level,"cust_id")\
                             .withColumnRenamed("consume_level",cols["value"])\
                             .join(cust_cluster,"cust_id")\
                             .select("city","cust_id","cust_seg","cluster_index","sale_center_id",cols["value"])

        result=except_grade(cust_seg_cons,cols,["city","cust_seg","cluster_index"],[3,4,5])\
                .withColumnRenamed("cust_seg","retail_grade")\
                .withColumnRenamed("mean","city_grade_cons_avg") \
                .withColumn(cols["level1_code"], f.lit("YJFL003"))

        values = [
                 "avg_orders_plus3", "avg_orders_minu3", "avg_orders_plus4",
                 "avg_orders_minu4", "avg_orders_plus5", "avg_orders_minu5",
                 "warning_level_code", "cust_id", "city", "sale_center_id",
                 "retail_grade", "city_grade_cons_avg"
                 ] + list(cols.values())
        result.foreachPartition(lambda x:write_hbase2(x,values,hbase))
    except Exception:
        tb.print_exc()
Example #21
 def execute(self):
     """
     Method overrides the execute method of the abstract Task class. Method classifies the
     recipe based on the difficulty. Difficulty is classified based on the total time required
     for prepping and cooking the food.
     If sum(prep time + cooking time) > 1 Hour is as Hard.
     If sum(prep time + cooking time) > 30 minutes and less than an hour is as Medium
     If sum(prep time + cooking time) < 30 minutes as Easy
     As 'Unknown' for all other cases.
     :return: pyspark.sql.DataFrame
                     Returns a Spark's DataFrame with the transformation for the recipe.
     """
     return self.data_frame \
         .withColumn('cookTimeInMinutes',
                     when(lower(col('cookTime')).endswith('h'),
                          regexp_extract('cookTime', r'\d+', 0) * 60)
                     .when(lower(col('cookTime')).endswith('m'),
                           regexp_extract('cookTime', r'\d+', 0)).otherwise("")) \
         .withColumn('prepTimeInMinutes',
                     when(lower(col('prepTime')).endswith('h'),
                          regexp_extract('prepTime', r'\d+', 0) * 60)
                     .when(lower(col('prepTime')).endswith('m'),
                           regexp_extract('prepTime', r'\d+', 0)).otherwise("")) \
         .withColumn('difficulty',
                     when(expr('(cookTimeInMinutes + prepTimeInMinutes) > 60'), "Hard")
                     .when(expr('(cookTimeInMinutes + prepTimeInMinutes) <= 60 and '
                                '(cookTimeInMinutes + prepTimeInMinutes) >= 30'), 'Medium')
                     .when(expr('(cookTimeInMinutes + prepTimeInMinutes) < 30'), "Easy")
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes > 60'), 'Hard')
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes >= 30 and  prepTimeInMinutes <= 60'),
                           'Medium')
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes < 30'), 'Easy')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes > 60'), 'Hard')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes >= 30 and  cookTimeInMinutes <= 60'),
                           'Medium')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes < 30'), 'Easy')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes = ""'), 'Unknown')
                     .otherwise("Unknown")) \
         .withColumn('date_of_execution', current_date()) \
         .drop('cookTimeInMinutes') \
         .drop('prepTimeInMinutes')
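
A condensed, standalone sketch of the same classification on toy ISO-8601 durations. It keeps only the hour/minute extraction and the three main thresholds and skips the empty-string fallback branches of the original; the sample durations are illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, lower, regexp_extract, when

spark = SparkSession.builder.getOrCreate()
recipes = spark.createDataFrame([("PT1H", "PT15M"), ("PT20M", "PT5M")],
                                ["cookTime", "prepTime"])

def minutes(c):
    # "PT1H" -> 60, "PT15M" -> 15 (simplified: only the first number is read)
    return (when(lower(col(c)).endswith("h"), regexp_extract(col(c), r"\d+", 0) * 60)
            .when(lower(col(c)).endswith("m"), regexp_extract(col(c), r"\d+", 0)))

(recipes
 .withColumn("totalMinutes", minutes("cookTime") + minutes("prepTime"))
 .withColumn("difficulty",
             when(col("totalMinutes") > 60, "Hard")
             .when(col("totalMinutes") >= 30, "Medium")
             .when(col("totalMinutes") < 30, "Easy")
             .otherwise("Unknown"))
 .withColumn("date_of_execution", current_date())
 .show())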
Example #22
def get_co_co_01(spark, scope: list, filter="day"):
    """

    :param spark: SparkSession
    :param scope: [lower, upper]  date range to filter on
    :param filter: filter granularity, one of "day", "week", "month"; default "day"
    :return:
    """
    # Fetch co_co_01.  unique_kind: 90 = return order, 10 = normal order.  pmt_status: 0 = unpaid, 1 = payment complete
    co_co_01 = spark.sql(
        "select  * from DB2_DB2INST1_CO_CO_01 where (unique_kind = 90 and pmt_status=0) or (unique_kind=10 and pmt_status=1)") \
        .where(col("com_id").isin(cities)) \
        .where(col("born_date").rlike(r"\d{8}")) \
        .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
        .withColumn("today", f.current_date()) \
        .withColumn("qty_sum", col("qty_sum").cast("float")) \
        .withColumn("amt_sum", col("amt_sum").cast("float"))

    lower = scope[0]
    upper = scope[1]
    if lower >= 0 and upper >= lower:
        if filter in ["day", "week", "month"]:
            if filter == "day":
                co_co_01 = co_co_01.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \
                    .where((col("day_diff") >= lower) & (col("day_diff") <= upper))

            elif filter == "week":
                co_co_01=co_co_01.withColumn("week_diff", week_diff("today", "born_date"))\
                         .where((col("week_diff") >= lower) & (col("week_diff") <= upper))
            else:
                co_co_01 = co_co_01.withColumn("month_diff",
                                               month_diff_udf(f.year(col("born_date")), f.month(col("born_date")),
                                                              f.year(col("today")), f.month(col("today")))) \
                    .where((col("month_diff") >= lower) & (col("month_diff") <= upper))
        else:
            raise Exception("filter must be one of 'day', 'week' or 'month'")
    else:
        raise Exception("lower must be >= 0 and upper must be >= lower")

    return co_co_01
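
Hypothetical calls, assuming the module-level names referenced above (cities, week_diff, month_diff_udf, spark) and the Hive table are available; the scope values are illustrative.

# Orders born between today and 6 days ago
orders_this_week = get_co_co_01(spark, [0, 6], filter="day")
# Orders born one to four weeks ago
orders_past_month = get_co_co_01(spark, [1, 4], filter="week")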
Example #23
def get_co_co_line(spark, scope: list, filter="day"):
    """

        :param spark: SparkSession
        :param scope: [lower, upper]  date range to filter on
        :param filter: filter granularity, one of "day", "week", "month"; default "day"
        :return:
        """
    co_co_line = spark.sql(
        "select * from DB2_DB2INST1_CO_CO_LINE") \
        .where(col("com_id").isin(cities))\
        .where(col("born_date").rlike(r"\d{8}"))\
        .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
        .withColumn("today", f.current_date()) \
        .withColumn("qty_ord", col("qty_ord").cast("float")) \
        .withColumn("price", col("price").cast("float"))\
        .withColumn("qty_rsn",col("qty_rsn").cast("float"))

    lower = scope[0]
    upper = scope[1]
    if lower >= 0 and upper >= lower:
        if filter in ["day", "week", "month"]:
            if filter == "day":
                co_co_line = co_co_line.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \
                    .where((col("day_diff") >= lower) & (col("day_diff") <= upper))

            elif filter == "week":
                co_co_line = co_co_line.withColumn("week_diff", week_diff("today", "born_date")) \
                    .where((col("week_diff") >= lower) & (col("week_diff") <= upper))
            else:
                co_co_line = co_co_line.withColumn("month_diff",
                                               month_diff_udf(f.year(col("born_date")), f.month(col("born_date")),
                                                              f.year(col("today")), f.month(col("today")))) \
                    .where((col("month_diff") >= lower) & (col("month_diff") <= upper))
        else:
            raise Exception("filter must be one of 'day', 'week' or 'month'")
    else:
        raise Exception("lower must be >= 0 and upper must be >= lower")

    return co_co_line
Example #24
def adobe_traffic_by_department(df_traffic_by_department, df_deparadepto):
    df_depara = (df_deparadepto.filter(
        (df_deparadepto['tabela'] == 'departamento')
        & (df_deparadepto['campo'] == 'iddepartamento')).select(
            df_deparadepto['de'], df_deparadepto['para']))

    return (df_traffic_by_department.join(
        df_depara,
        df_traffic_by_department['iddepartament'] == df_depara['de'],
        'left').select(df_traffic_by_department['id'],
                       df_traffic_by_department['createdat'],
                       df_traffic_by_department['reportsuiteid'],
                       df_traffic_by_department['reportsuitename'],
                       df_traffic_by_department['server'],
                       df_traffic_by_department['data'],
                       df_traffic_by_department['hour'],
                       df_traffic_by_department['iddepartament'],
                       f.trim(df_depara['para']).alias('namedepartment'),
                       df_traffic_by_department['visits'],
                       df_traffic_by_department['orders'],
                       df_traffic_by_department['ordersserialized'],
                       df_traffic_by_department['revenue']).withColumn(
                           'dataprocessamento',
                           f.trunc(f.current_date(), 'month')))
Example #25
    def test_create(self):
        data = [
            (1001, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (2202, None, None, "1980-05-03", 45.0, False),  # Add a few nulls
            (2235, "", "", "1980-05-03", 43.0, True)
        ]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])

        # Ensure that a Hyper file can be created with date and timestamp columns
        df = df.withColumn("hire_date", current_date())
        df = df.withColumn("last_updated", current_timestamp())

        creator = Creator(df, 'employees', False)
        hyper_file_path = creator.create()
        assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper")
        tables = TestUtils.get_tables(
            "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (len(tables) == 1)
        num_rows = TestUtils.get_row_count(
            "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (num_rows == 5)
Example #26
def main(input_dir, output_dir):
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    df = read_pg_dump(spark, input_dir)

    # apply some other transformations to obtain the final table
    result = (
        df.withColumn("parts", F.split("table_name", "_")).withColumn(
            "dimension", F.from_json("dimension", DIMENSION_SCHEMA)).
        select(
            F.current_date().alias("ingest_date"),
            "aggregate_type",
            "ds_nodash",
            # parts[:2] form aggregate_type, this is parsed from the filename
            F.col("parts").getItem(2).alias("channel"),
            F.col("parts").getItem(3).alias("version"),
            # parts[-1] is ds_nodash, this is parsed from the filename
            "dimension.*",
            from_pg_array_udf("aggregate"),
        ))
    # -RECORD 0-----------------------------------------------------
    # aggregate_type | submission_date
    # channel        | nightly
    # version        | 43
    # ds_nodash      | 20191201
    # os             | Windows_NT
    # child          | false
    # label          |
    # metric         | A11Y_INSTANTIATED_FLAG
    # osVersion      | 10.0
    # application    | Firefox
    # architecture   | x86
    # aggregate      | "[0, 2, 0, 2, 2]"

    result.repartition(1).write.parquet(output_dir, mode="overwrite")
Example #27
            jdbc_params = {
                "url": ut.get_mysql_jdbc_url(app_secret),
                "lowerBound": "1",
                "upperBound": "100",
                "dbtable": src_conf["mysql_conf"]["dbtable"],
                "numPartitions": "2",
                "partitionColumn": src_conf["mysql_conf"]["partition_column"],
                "user": app_secret["mysql_conf"]["username"],
                "password": app_secret["mysql_conf"]["password"]
            }
            txn_df = spark\
                .read.format("jdbc")\
                .option("driver", "com.mysql.cj.jdbc.Driver")\
                .options(**jdbc_params)\
                .load() \
                .withColumn("ins_dt", functions.current_date())
            txn_df.show()

            txn_df.write \
                .mode('overwrite')\
                .partitionBy("INS_DT")  \
                .parquet("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/staging/SB")

            print("\nWriting SB data to S3 <<")

        elif src == "OL":
            print("\nReading OL data from sftp >>")
            txn_df2 = spark.read.format("com.springml.spark.sftp") \
                .option("host", app_secret["sftp_conf"]["hostname"]) \
                .option("port", app_secret["sftp_conf"]["port"]) \
                .option("username", app_secret["sftp_conf"]["username"]) \
Example #28
                "partitionColumn": src_conf["mysql_conf"]["partition_column"],
                "user": app_secret["mysql_conf"]["username"],
                "password": app_secret["mysql_conf"]["password"]
            }
            # print(jdbcParams)

            # use the ** operator/un-packer to treat a python dictionary as **kwargs
            print(
                "\nReading data from MySQL DB using SparkSession.read.format(),"
            )
            txnDF = spark\
                .read.format("jdbc")\
                .option("driver", "com.mysql.cj.jdbc.Driver")\
                .options(**jdbc_params)\
                .load()\
                .withColumn('ins_date', current_date())

            # add the current date to the above df and then you write it
            txnDF.show()
            txnDF\
                .write.mode("append")\
                .partitionBy('ins_date') \
                .format("parquet") \
                .save(src_loc)

        elif src == 'OL':
            # put the code here to pull rewards data from SFTP server and write it to s3
            ol_txn_df = spark.read\
                .format("com.springml.spark.sftp")\
                .option("host", app_secret["sftp_conf"]["hostname"])\
                .option("port", app_secret["sftp_conf"]["port"])\
Example #29
def color_locator(column, color_string):
  return locate(color_string.upper(), column)\
          .cast("boolean")\
          .alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to be a Column type

df.select(*selectedColumns).where(expr("is_white OR is_red"))\
  .select("Description").show(3, False)


# COMMAND ----------

from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")


# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)
Example #30
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, current_date, \
    current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show()

    spark.read.jdbc

    spark.range(5)\
         .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\
         .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show()

    spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'),
                          to_date(lit('2019-01-05')).alias('date2'))\
                  .select(datediff(col('date2'), col('date1')),
                          months_between(col('date2'), col('date1'))).show()

    spark.range(5).withColumn('date', to_date(
        lit('2019-XX-XX'))).show()  # Does not raise an exception

    spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \
                  .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \
                  .withColumn('date_comp3', to_date(lit('2019-01-01')) > "'2019-01-02'").show()

    spark.range(5).select(current_timestamp().alias("timestamp")).select(
        hour(col('timestamp')))
Example #31
from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad
df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = "     hello     "

df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

#regular expressions

#working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()

dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()

#convert string to date, default format is 'yyyy-MM-dd'
spark.range(1).select(
    to_date(lit("2019-02-01")).alias("start_date"),
    to_date(lit("2019-03-06")).alias("end_date")).select(
        datediff(col("start_date"), col("end_date"))).show()

cleanDateDF = spark.range(1).withColumn("date1", current_date())
date_format = 'yyyy-MM-dd'