Example #1
    def baseline_model(self, df_processed):
        cond_dvs = F.when(col("dvs_p30days") > 400,
                          400).otherwise(col("dvs_p30days"))


        stats=df_processed\
        .withColumn("dvs_p30days",cond_dvs)\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .groupby(['grt_l2_cat_name'])\
        .agg(F.avg(col("dvs_p30days")).alias("avg_dvs_p30days")
            ,F.stddev(col("dvs_p30days")).alias("std_deal_view")
            ,F.round(F.avg(col("dvs_p30days"))).cast('integer').alias("avg_deal_view")
            ,F.max("dvs_p30days").alias("max_deal_view"))

        w = Window.partitionBy(F.col('consumer_id')).orderBy(
            F.col('normalized_dvs_p30days').desc())

        df_final=df_processed\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .join(stats,on='grt_l2_cat_name')\
        .withColumn('normalized_dvs_p30days',(col('dvs_p30days')-col('avg_deal_view'))/col('std_deal_view'))\
        .withColumn('normalized_dvs_p30days_rank',F.row_number().over(w))


        df_micro=df_final\
        .filter(col('normalized_dvs_p30days')>=0)\
        .filter(col('normalized_dvs_p30days_rank')==1)\
        .filter(col('grt_l2_purchaser_14d')==0)

        return df_micro
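
The snippet above relies on module-level imports (pyspark.sql.functions as F, col, Window) that are not shown. Below is a minimal, self-contained sketch of the pattern it implements: cap a 30-day view count, z-score it against per-category statistics, and keep each consumer's top-ranked category with row_number. Column names are taken from the snippet; the sample rows are purely illustrative.

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("c1", "beauty", 500), ("c2", "beauty", 80),
     ("c1", "travel", 120), ("c2", "travel", 40)],
    ["consumer_id", "grt_l2_cat_name", "dvs_p30days"])

# Cap the 30-day view count at 400, as the original does with when/otherwise
capped = toy.withColumn("dvs_p30days", F.least(F.col("dvs_p30days"), F.lit(400)))

# Per-category mean and standard deviation
stats = capped.groupBy("grt_l2_cat_name").agg(
    F.avg("dvs_p30days").alias("avg_deal_view"),
    F.stddev("dvs_p30days").alias("std_deal_view"))

# z-score each row against its category, then rank categories per consumer
w = Window.partitionBy("consumer_id").orderBy(F.col("normalized_dvs").desc())
(capped.join(stats, "grt_l2_cat_name")
    .withColumn("normalized_dvs",
                (F.col("dvs_p30days") - F.col("avg_deal_view")) / F.col("std_deal_view"))
    .withColumn("rank", F.row_number().over(w))
    .show())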
Example #2
    def age_calculator(self, columns, age_colname):
        """
        @columns: a string of column name
        @age_colname: a string of the new column of age, e.g.'age'
        """
        assert isinstance(columns,
                          str), "Error: columns argument must be a string!"
        assert isinstance(
            age_colname,
            str), "Error: the name for age column argument must be a string!"

        df = self._df.select(
            columns,
            from_unixtime(unix_timestamp(columns,
                                         'MM/dd/yyyy')).alias('with time'))
        df = df.withColumn('without time', df['with time'].cast(DateType()))
        df = df.withColumnRenamed(columns, 'tmp')
        df = df.withColumn(age_colname, (
            (datediff(current_date(), col('without time')).cast(FloatType())) /
            365).cast(IntegerType()))
        self._df = self._df.join(
            df,
            col(columns) == col('tmp'),
            'leftouter').drop('tmp').drop('with time').drop('without time')

        return self
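
age_calculator derives age as elapsed days divided by 365, cast to an integer (leap years are ignored, as in the original). A standalone sketch of the same idea, using to_date with an explicit format instead of from_unixtime(unix_timestamp(...)); the birth_date column and sample values are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, datediff, to_date
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([("01/15/1990",), ("07/04/2001",)], ["birth_date"])

# Same approach as age_calculator: days since the date divided by 365, cast to int
people = people.withColumn(
    "age",
    (datediff(current_date(), to_date(col("birth_date"), "MM/dd/yyyy")) / 365)
    .cast(IntegerType()))
people.show()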
Example #3
 def _preprocess(self):
     self._df = self._df[self._REQUIRED_COLUMNS]
     self._df = self._df.toDF(*self._HEADER_NAMES)
     self._df = self._df.withColumn("logged_datetime", to_timestamp(
         self._df.logged_datetime, 'yyyy/MM/dd'))
     self._df = self._df.withColumn("processed_datetime", current_date())
     self._df = self._df.withColumn('processed_datetime',
                                    to_timestamp(self._df.processed_datetime))
     self._df = self._df.withColumn(
         "source_port", self._df["source_port"].cast(IntegerType()))
     self._df = self._df.withColumn(
         "destination_port", self._df["destination_port"].cast(IntegerType()))
     self._df = self._df.withColumn(
         "bytes_sent", self._df["bytes_sent"].cast(LongType()))
     self._df = self._df.withColumn(
         "bytes_received", self._df["bytes_received"].cast(LongType()))
     self._df = self._df.withColumn(
         "packets_received", self._df["packets_received"].cast(LongType()))
     self._df = self._df.withColumn(
         "packets_sent", self._df["packets_sent"].cast(LongType()))
     self._df = self._df.withColumn(
         "time_elapsed", self._df["time_elapsed"].cast(LongType()))
     self._df = self._df.withColumn(
         "repeat_count", self._df["repeat_count"].cast(LongType()))
     self._df = self._df.withColumn("count_events", lit(1))
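
_preprocess repeats one withColumn per cast. A small refactor sketch (not the project's code) that drives the casts from a column-to-type mapping; only three of the columns above are reproduced to keep it short.

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, LongType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("443", "125", "20")],
                           ["source_port", "bytes_sent", "repeat_count"])

# One loop instead of one withColumn per cast; extend the dict for the full schema
casts = {"source_port": IntegerType(), "bytes_sent": LongType(), "repeat_count": LongType()}
for name, dtype in casts.items():
    df = df.withColumn(name, df[name].cast(dtype))
df.printSchema()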
Example #4
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if column argument a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting if column if in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(
                    date_format(
                        unix_timestamp(column, dateFormat).cast("timestamp"),
                        Format), current_date()) / 12), 4).alias(nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
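
mag() above is presumably an absolute-value helper defined elsewhere in that project. A standalone sketch of the same age-in-years computation using F.abs and to_date; the born_date column, its yyyy-MM-dd format and the sample rows are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
clients = spark.createDataFrame([("1985-03-20",), ("2000-11-02",)], ["born_date"])

# months_between(now, birth_date) / 12, formatted to 4 decimals, as ageCalculate does
clients = clients.withColumn(
    "age",
    F.format_number(
        F.abs(F.months_between(F.current_date(),
                               F.to_date(F.col("born_date"), "yyyy-MM-dd"))) / 12, 4))
clients.show()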
Example #5
def process_expense_file(file_path):
  tempDf = spark.read.load(file_path, format="csv", header="true", escape='"')
  tempDf = tempDf.filter(tempDf["Tags"].isNotNull()).select(
    col("SubscriptionName"),
    col("ResourceGroup"),
    col("ResourceLocation"),
    col("ProductName"),
    col("PreTaxCost"),
    col("ResourceType"),
    col("UsageDateTime"),
    split(col("Tags"), ",").alias("tags")
  )
  return tempDf.withColumn("costCenterNumber", lit(get_cost_center(tempDf["tags"])))\
          .withColumn("team", lit(get_team(tempDf["tags"])))\
          .withColumn("usageMonth", month(tempDf['UsageDateTime']))\
          .withColumn("usageYear", year(tempDf['UsageDateTime']))\
          .withColumn("processedDate", current_date())\
          .select(
    col("processedDate"),
    col("SubscriptionName"),
    col("ResourceGroup"),
    col("ResourceLocation"),
    col("ProductName"),
    col("ResourceType"),
    col("costCenterNumber"),
    col("team"),
    col("usageMonth"),
    col("usageYear"),
    col("UsageDateTime"),
    col("PreTaxCost")
  ).filter("costCenterNumber is not NULL")
Example #6
    def put(self, df):
        """Writes data to final CSV file and stops spark session. 

        Arguments: 
            df[spark datafarme] -- spark dataframe that
        """
        try:
            self.logger.info('Begin putting data to disk // {}'.format(
                self.etl_id))

            jdbc_params = self._inst_jdbc_params()
            jdbc_params['dbtable'] = 'final_data'

            df = df.withColumn('process_date', F.current_date())

            df.write.format('jdbc').options(**jdbc_params)\
                .mode('overwrite')\
                .save()

            self.logger.info('Job complete // {}'.format(self.etl_id))

            self.spark_session.stop()

        except Exception as e:
            self.logger.error('{} // {}'.format(e, self.etl_id))
Example #7
    def getReadRDBMSDataFrame():

        from_format = "dd/MMM/yyyy:HH:mm:ss"
        to_format = 'dd-MM-yyyy'
        storeIncrementalValue(0, "ignore")

        df = spark.read.format("csv") \
            .option("inferSchema", True) \
            .option("header", True) \
            .load(argv[17])

        df = df.filter(df.createdt.isNotNull() & (df.createdt != "null"))

        result = df.withColumn("createdt", f.from_unixtime(f.unix_timestamp(f.col("createdt"), from_format), to_format)) \
            .withColumn("value",f.col("value").cast(f.StringType()))\
            .withColumn("score",f.col("score").cast(f.StringType()))\
            .withColumn("regioncode",f.col("regioncode").cast(f.StringType()))\
            .withColumn("status",f.col("status").cast(f.StringType()))\
            .withColumn("count",f.col("count").cast(f.StringType()))\
            .withColumn("statuscode",f.col("statuscode").cast(f.StringType()))\
            .select("id", "username", "amount", "ip", "createdt", "value", "score", "regioncode", "status", "method",
                    "key", "count", "type", "site", "statuscode") \
            .withColumn("rdbms_current_date", f.current_date()) \
            .filter(f.col("id") > getIncrementalValue())

        result = result.withColumn(
            "id",
            f.col("id").cast("string")).na.fill("Not_Applicable")

        max_value = df.agg({"id": "max"}).collect()[0]
        storeIncrementalValue(max_value["max(id)"], "overwrite")

        result.printSchema()

        return result
Example #8
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if column argument a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting if column if in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(
            nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
Example #9
def county_reality_supply():
    # Last week's supply volume of this item spec, by county and tier
    try:
        print(f"{str(dt.now())} last week's supply volume of this item spec, by county and tier")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id",
                                            "cust_seg")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its counties (as a list)
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # Get last week's actual supply volume
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")

        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\
            .join(co_cust,"cust_id")

        # column name of the value to compute
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \
                                .agg(f.sum("qty_allocco").alias(colName))

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "city", "gears_data_marker", colName
        ]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county,"sale_center_id")\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .withColumnRenamed("cust_seg","gears")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
Example #10
def get_crm_cust_log(spark):
    # crm_cust_log: customer change-history table
    # change_type:  CO_CUST.STATUS    -> status change
    #               CO_CUST.CUST_SEG  -> tier change
    crm_cust_log = spark.sql("select cust_id,change_type,change_frm,change_to,audit_date from DB2_DB2INST1_CRM_CUST_LOG") \
        .withColumn("audit_date", f.to_date("audit_date", "yyyyMMdd")) \
        .withColumn("day_diff",f.datediff(f.current_date(), col("audit_date"))) \
        .withColumn("change_frm", f.regexp_replace(col("change_frm"), "(zz)|(ZZ)", "31")) \
        .withColumn("change_to", f.regexp_replace(col("change_to"), "(zz)|(ZZ)", "31"))
    return crm_cust_log
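
A hypothetical follow-up grounded in the comments above: keep only the tier changes (CO_CUST.CUST_SEG) audited within the last 30 days. It assumes the Hive table is reachable and that f/col are the usual pyspark.sql.functions imports used by the function.

# Hypothetical usage of get_crm_cust_log:
recent_tier_changes = get_crm_cust_log(spark) \
    .where((col("change_type") == "CO_CUST.CUST_SEG") & (col("day_diff") <= 30))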
Example #11
def create_oracle_output():
    df_hcp_match_final = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.path'))

    df_hcp_yesterdays_data = \
        spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.path')) \
            .filter(F.col('full_mk').isNotNull()) \
            .filter(F.col('prop_mk').isNotNull()) \
            .filter(F.col('pnr_mk').isNotNull())

    join_fields = ['lastmodified_datetime', 'full_mk', 'prop_mk', 'pnr_mk', 'match_level', 'new_rate_bucket']

    df_hcp_yesterday = df_hcp_yesterdays_data \
        .withColumn('match_flag', F.lit('Y')) \
        .join(df_hcp_match_final.select(join_fields),
              on=join_fields) \
        .persist()

    df_hcp_today = df_hcp_match_final \
        .withColumn('dl_last_modified_date', F.current_date()) \
        .withColumn('rk', F.lit(1)) \
        .withColumn('rk_f', F.lit(1)) \
        .join(df_hcp_yesterday.select(join_fields + ['match_flag']),
              on=join_fields,
              how='left') \
        .filter(F.col('match_flag').isNull())

    # TODO: Remove redundant fields and remnants of deduping keys which only contain a single value
    df_hcp_oracle = \
        cast_fields_for_oracle_output(df_hcp_yesterday) \
            .union(cast_fields_for_oracle_output(df_hcp_today)) \
            .withColumn('dwh_rk',
                        F.row_number().over(Window
                                            .partitionBy(F.col('pnr_mk'))
                                            .orderBy(F.col('pnr_mk').desc()))) \
            .filter(F.col('dwh_rk') == 1) \
            .drop('dwh_rk')

    # TODO: Investigate partitioning of Oracle data suitable for end-user
    # Dump to staging path since Spark cannot simultaneously read and write to the same path
    dump_dataframe(df=df_hcp_oracle,
                   path=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path'),
                   table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.table'))
    df_hcp_yesterday.unpersist()
    df_hcp_oracle = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path'))
    dump_dataframe(df=df_hcp_oracle,
                   path=config.get(config_set, 'hcp.txns.new.fin.f.d2.path'),
                   table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.table'))

    # Oracle table for EDM
    dump_dataframe(df=df_hcp_oracle
                   .withColumnRenamed('rate_bucket', 'original_rate_bucket')
                   .withColumnRenamed('new_rate_bucket', 'rate_bucket'),
                   path=config.get(config_set, 'hcp.txns.to.dwh.f.path'),
                   table_name=config.get(config_set, 'hcp.txns.to.dwh.f.table')
                   )
Example #12
def adobe_traffic_by_channel(df_traffic_by_channel):
    return (df_traffic_by_channel.select(
        df_traffic_by_channel['id'], df_traffic_by_channel['createdat'],
        df_traffic_by_channel['reportsuiteid'],
        df_traffic_by_channel['reportsuitename'],
        df_traffic_by_channel['server'], df_traffic_by_channel['data'],
        df_traffic_by_channel['hour'], df_traffic_by_channel['channel'],
        df_traffic_by_channel['visits'], df_traffic_by_channel['orders'],
        df_traffic_by_channel['ordersserialized'],
        df_traffic_by_channel['revenue']).withColumn(
            'dataprocessamento', f.trunc(f.current_date(), 'month')))
Example #13
def prepare_features(df):
    df = df.withColumn(
        'exclam',
        length('review_body') -
        length(regexp_replace('review_body', r'\!', '')))
    df = df.withColumn('age',
                       datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    select_cols = df.select(
        ['star_rating', 'helfulness', 'age', 'review_length',
         'label']).na.fill(0)
    return select_cols
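
A quick, self-contained check of the exclamation-mark trick used above: the count of '!' is the difference in length before and after stripping them. The sample reviews are illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.functions import length, regexp_replace

spark = SparkSession.builder.getOrCreate()
reviews = spark.createDataFrame([("Great product!!!",), ("meh",)], ["review_body"])

# Count '!' by comparing the length before and after stripping them
reviews.withColumn(
    "exclam",
    length("review_body") - length(regexp_replace("review_body", r"\!", ""))
).show()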
Example #14
def merge(spark, update, tableName, cols, key):
    """
    将DataFrame和delta表进行merge操作,insert操作要求DataFrame必须包含delta表所有的列(0.5版本)
    当我们使用merge操作更新/插入delta表其中几列时,指定在DataFrame中不存在的列的值为null。

    注:DataFrame中要写入delta表的列要和delta表一样
    :param spark,SparkSession实例
    :param update,spark DataFrame
    :param tableName,要更新的delta表
    """
    # 如果没有dt列,创建当前日期的dt列
    if "dt" not in cols:
        update = update.withColumn("dt", f.current_date())
        cols.append("dt")

    # 1. Build the merge condition
    mergeExpr = f"origin.{key}=update.{key}"
    print(f"merge expression:{mergeExpr}")

    # 2. Build the update expressions
    updateExpr = {}
    for c in cols:
        updateExpr[c] = f"update.{c}"

    print(f"update expression:{updateExpr}")

    origin = DeltaTable.forPath(spark, tableName)
    origin_cols = origin.toDF().columns

    # 3. Build the insert expressions
    insertExpr = {}
    for origin_col in origin_cols:
        if origin_col in cols:
            insertExpr[origin_col] = f"update.{origin_col}"
        else:
            # Column not provided: insert a null value (the SQL null literal, not a string)
            insertExpr[origin_col] = "null"

    print(f"insert expression:{insertExpr}")

    # for origin_col in origin_cols:
    #     if origin_col not in cols:
    #         update=update.withColumn(origin_col,f.lit(None))

    origin.alias("origin") \
        .merge(update.alias("update"), mergeExpr) \
        .whenMatchedUpdate(set=updateExpr) \
        .whenNotMatchedInsert(values=insertExpr) \
        .execute()
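
A hypothetical call of merge(), assuming a Delta-enabled SparkSession named spark, an existing Delta table at the (illustrative) path /tmp/delta/users keyed on id, and the module-level imports the function relies on (DeltaTable from delta.tables and pyspark.sql.functions as f).

# Update/insert only "id" and "name"; merge() stamps "dt" with the current date
# (it is not in cols) and inserts null for any other column of the table.
update_df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
merge(spark, update_df, "/tmp/delta/users", cols=["id", "name"], key="id")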
Example #15
 def test_current_date(self, input_df, spark_session):
     """Substitute the cleansed values with the current date"""
     cleaning_definitions = dict(status=dict(
         elements=["active", "inactive"], default=F.current_date()))
     expected_output_df = spark_session.createDataFrame([
         Row(id=1, status="active"),
         Row(id=2, status=str(dt.date.today())),
         Row(id=3, status=str(dt.date.today())),
         Row(id=4, status="inactive"),
         Row(id=5, status=None),
         Row(id=6, status=str(dt.date.today())),
     ])
     output_df = EnumCleaner(
         cleaning_definitions=cleaning_definitions).transform(input_df)
     assert_df_equality(expected_output_df, output_df)
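
The input_df fixture is not shown in this snippet; the sketch below is a guess at its shape, inferred from the expected output (ids 2, 3 and 6 hold values outside ["active", "inactive"] and are substituted with the current date, while the null in id 5 is left alone). The literal values "pending", "unknown" and "archived" are placeholders, and it assumes the test module already imports pytest and Row.

@pytest.fixture()
def input_df(spark_session):
    return spark_session.createDataFrame([
        Row(id=1, status="active"),
        Row(id=2, status="pending"),    # outside the allowed elements
        Row(id=3, status="unknown"),    # outside the allowed elements
        Row(id=4, status="inactive"),
        Row(id=5, status=None),         # nulls are not substituted
        Row(id=6, status="archived"),   # outside the allowed elements
    ])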
Example #16
        def _years_between(_new_col_name, attr):
            _date_format = attr[0]
            _col_name = attr[1]

            return F.format_number(
                F.abs(
                    F.months_between(
                        F.date_format(
                            F.unix_timestamp(_col_name, _date_format).cast("timestamp"),
                            format_dt),
                        F.current_date()) / 12),
                4).alias(_new_col_name)
Example #17
    def getRandomUserWebApiDataFrame():

        final_list = []
        user_set = set()
        results_json = dict()
        randomuserurl = argv[7]
        ALLOWED_RANDOM_USER = int(argv[8])

        while True:
            resp = json.loads(
                urllib.request.urlopen(randomuserurl).read().decode('utf-8'))
            for randomuser in resp['results']:
                username = removeDigits(randomuser.get('user').get('username'))
                if username not in user_set:
                    user_set.add(username)
                    final_list.append(randomuser)
                if len(user_set) == ALLOWED_RANDOM_USER:
                    break

            if len(user_set) == ALLOWED_RANDOM_USER:
                break

        results_json["results"] = final_list
        json_df = spark.read.json(sc.parallelize([results_json]))
        users = json_df.withColumn("results", f.explode(f.col("results"))) \
            .withColumn("r_username", f.regexp_replace("results.user.username", '[0-9]', '')) \
            .withColumn("dob", f.from_unixtime(f.unix_timestamp(f.col('results.user.dob').cast('timestamp')), 'dd-MM-yyyy')) \
            .withColumn("user_current_date", f.current_date()) \
            .select("r_username", "results.user.gender", "results.user.name.title",
                    f.col("results.user.name.first").alias("firstname"),
                    f.col("results.user.name.last").alias("lastname"),
                    f.col("results.user.location.street").alias("location_street"),
                    f.col("results.user.location.city").alias("location_city"),
                    f.col("results.user.location.state").alias("location_state"),
                    f.col("results.user.location.zip", ).alias("location_zip"),
                    "results.user.email", "results.user.password", "results.user.salt", "results.user.md5",
                    "results.user.sha1", "results.user.sha256", "results.user.registered", "dob", "results.user.phone",
                    "results.user.cell", f.col("results.user.picture.large").alias("picture_large"),
                    f.col("results.user.picture.medium").alias("picture_medium"), "results.user.picture.thumbnail",
                    "user_current_date")

        return users
Example #18
def main(inp, outp):
    listings = spark.read.csv(inp, header=True, schema=schema)
    listings = listings.drop('host_id', 'host_name', 'neighbourhood_group',
                             'reviews_per_month',
                             'calculated_host_listings_count',
                             'availability_365')

    # listings.describe(['minimum_nights']).show()

    # Give a range to minimum_nights
    listings = listings.filter("minimum_nights > 0 AND minimum_nights <= 30")

    # Remove listings with last_review posted more than 2 years ago
    listings = listings.withColumn('curr', f.current_date())
    listings = listings.withColumn(
        'diff', f.datediff(f.col('curr'), f.col('last_review')))
    cleaned_listings = listings.filter(listings['diff'] < 730).drop(
        'curr', 'diff', 'number_of_reviews', 'last_review')
    ''' Coalescing is fine, as the listing data only contains Airbnb locations in Vancouver '''
    cleaned_listings.coalesce(1).write.csv(outp, mode='overwrite', header=True)
Example #19
 def createAgeDateColumns(self, df):
     """Criar colunas de idade do cliente e data do pedido.
     
     Args:
         df: Dataframe
     
     Returns:
         Dataframe.
     """
     log(
         "INFO",
         f"[createAgeDateColumns] Criando colunas de idade e data do pedido."
     )
     return (df.withColumn(
         "data_pedido_date",
         date_format(
             from_unixtime(col('data_pedido')),
             "yyyy-MM-dd")).withColumn('idade', (months_between(
                 current_date(), col('data_nascimento_cliente')) / 12).cast(
                     IntegerType())))
Example #20
def get_grade_cons_except():
    print(f"{str(dt.now())} 近30天零售户档位与周边消费水平不符")
    try:
        consume_level=get_all_consume_level(spark)\
                             .select("city","cust_id","consume_level")
        cust_cluster=get_cust_cluster(spark)

        # Only keep currently valid retailers; drop those that were valid before but are invalid now
        valid_co_cust=get_valid_co_cust(spark).select("cust_id","sale_center_id")

        # Retailer tiers over the last 30 days; a retailer with several tiers within 30 days triggers a separate warning for each
        cust_seg=spark.sql("select * from DB2_DB2INST1_CO_CUST")\
             .where(f.datediff(f.current_date(),col("dt"))<=30)\
             .where(col("com_id").isin(["011114305","011114306","011114302"]))\
             .withColumn("cust_seg",f.regexp_replace("cust_seg","ZZ","31"))\
             .select("com_id","cust_id","cust_seg")\
             .join(valid_co_cust,"cust_id")\
             .dropDuplicates(["cust_id","cust_seg"])

        cols = {"value": "retail_rim_cons",
                "level1_code": "classify_level1_code",
                }
        cust_seg_cons=cust_seg.join(consume_level,"cust_id")\
                             .withColumnRenamed("consume_level",cols["value"])\
                             .join(cust_cluster,"cust_id")\
                             .select("city","cust_id","cust_seg","cluster_index","sale_center_id",cols["value"])

        result=except_grade(cust_seg_cons,cols,["city","cust_seg","cluster_index"],[3,4,5])\
                .withColumnRenamed("cust_seg","retail_grade")\
                .withColumnRenamed("mean","city_grade_cons_avg") \
                .withColumn(cols["level1_code"], f.lit("YJFL003"))

        values = [
                 "avg_orders_plus3", "avg_orders_minu3", "avg_orders_plus4",
                 "avg_orders_minu4", "avg_orders_plus5", "avg_orders_minu5",
                 "warning_level_code", "cust_id", "city", "sale_center_id",
                 "retail_grade", "city_grade_cons_avg"
                 ] + list(cols.values())
        result.foreachPartition(lambda x:write_hbase2(x,values,hbase))
    except Exception:
        tb.print_exc()
Example #21
 def execute(self):
     """
     Method overrides the execute method of the abstract Task class. Method classifies the
     recipe based on the difficulty. Difficulty is classified based on the total time required
     for prepping and cooking the food.
     If sum(prep time + cooking time) > 1 Hour is as Hard.
     If sum(prep time + cooking time) > 30 minutes and less than an hour is as Medium
     If sum(prep time + cooking time) < 30 minutes as Easy
     As 'Unknown' for all other cases.
     :return: pyspark.sql.DataFrame
                     Returns a Spark's DataFrame with the transformation for the recipe.
     """
     return self.data_frame \
         .withColumn('cookTimeInMinutes',
                     when(lower(col('cookTime')).endswith('h'),
                          regexp_extract('cookTime', r'\d+', 0) * 60)
                     .when(lower(col('cookTime')).endswith('m'),
                           regexp_extract('cookTime', r'\d+', 0)).otherwise("")) \
         .withColumn('prepTimeInMinutes',
                     when(lower(col('prepTime')).endswith('h'),
                          regexp_extract('prepTime', r'\d+', 0) * 60)
                     .when(lower(col('prepTime')).endswith('m'),
                           regexp_extract('prepTime', r'\d+', 0)).otherwise("")) \
         .withColumn('difficulty',
                     when(expr('(cookTimeInMinutes + prepTimeInMinutes) > 60'), "Hard")
                     .when(expr('(cookTimeInMinutes + prepTimeInMinutes) <= 60 and '
                                '(cookTimeInMinutes + prepTimeInMinutes) >= 30'), 'Medium')
                     .when(expr('(cookTimeInMinutes + prepTimeInMinutes) < 30'), "Easy")
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes > 60'), 'Hard')
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes >= 30 and  prepTimeInMinutes <= 60'),
                           'Medium')
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes < 30'), 'Easy')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes > 60'), 'Hard')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes >= 30 and  cookTimeInMinutes <= 60'),
                           'Medium')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes < 30'), 'Easy')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes = ""'), 'Unknown')
                     .otherwise("Unknown")) \
         .withColumn('date_of_execution', current_date()) \
         .drop('cookTimeInMinutes') \
         .drop('prepTimeInMinutes')
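
A condensed, standalone sketch of the same classification on toy ISO-8601 durations. It keeps only the hour/minute extraction and the three main thresholds and skips the empty-string fallback branches of the original; the sample durations are illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, lower, regexp_extract, when

spark = SparkSession.builder.getOrCreate()
recipes = spark.createDataFrame([("PT1H", "PT15M"), ("PT20M", "PT5M")],
                                ["cookTime", "prepTime"])

def minutes(c):
    # "PT1H" -> 60, "PT15M" -> 15 (simplified: only the first number is read)
    return (when(lower(col(c)).endswith("h"), regexp_extract(col(c), r"\d+", 0) * 60)
            .when(lower(col(c)).endswith("m"), regexp_extract(col(c), r"\d+", 0)))

(recipes
 .withColumn("totalMinutes", minutes("cookTime") + minutes("prepTime"))
 .withColumn("difficulty",
             when(col("totalMinutes") > 60, "Hard")
             .when(col("totalMinutes") >= 30, "Medium")
             .when(col("totalMinutes") < 30, "Easy")
             .otherwise("Unknown"))
 .withColumn("date_of_execution", current_date())
 .show())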
Example #22
def get_co_co_01(spark, scope: list, filter="day"):
    """

    :param spark: SparkSession
    :param scope: [lower, upper]  date range to filter on
    :param filter: filter granularity, one of "day", "week", "month"; default "day"
    :return:
    """
    # Fetch co_co_01.  unique_kind: 90 = return order, 10 = normal order.  pmt_status: 0 = unpaid, 1 = payment complete
    co_co_01 = spark.sql(
        "select  * from DB2_DB2INST1_CO_CO_01 where (unique_kind = 90 and pmt_status=0) or (unique_kind=10 and pmt_status=1)") \
        .where(col("com_id").isin(cities)) \
        .where(col("born_date").rlike(r"\d{8}")) \
        .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
        .withColumn("today", f.current_date()) \
        .withColumn("qty_sum", col("qty_sum").cast("float")) \
        .withColumn("amt_sum", col("amt_sum").cast("float"))

    lower = scope[0]
    upper = scope[1]
    if lower >= 0 and upper >= lower:
        if filter in ["day", "week", "month"]:
            if filter == "day":
                co_co_01 = co_co_01.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \
                    .where((col("day_diff") >= lower) & (col("day_diff") <= upper))

            elif filter == "week":
                co_co_01=co_co_01.withColumn("week_diff", week_diff("today", "born_date"))\
                         .where((col("week_diff") >= lower) & (col("week_diff") <= upper))
            else:
                co_co_01 = co_co_01.withColumn("month_diff",
                                               month_diff_udf(f.year(col("born_date")), f.month(col("born_date")),
                                                              f.year(col("today")), f.month(col("today")))) \
                    .where((col("month_diff") >= lower) & (col("month_diff") <= upper))
        else:
            raise Exception("filter must be one of 'day', 'week' or 'month'")
    else:
        raise Exception("lower must be >= 0 and upper must be >= lower")

    return co_co_01
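
Hypothetical calls, assuming the module-level names referenced above (cities, week_diff, month_diff_udf, spark) and the Hive table are available; the scope values are illustrative.

# Orders born between today and 6 days ago
orders_this_week = get_co_co_01(spark, [0, 6], filter="day")
# Orders born one to four weeks ago
orders_past_month = get_co_co_01(spark, [1, 4], filter="week")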
Example #23
def get_co_co_line(spark, scope: list, filter="day"):
    """

        :param spark: SparkSession
        :param scope: [lower, upper]  date range to filter on
        :param filter: filter granularity, one of "day", "week", "month"; default "day"
        :return:
        """
    co_co_line = spark.sql(
        "select * from DB2_DB2INST1_CO_CO_LINE") \
        .where(col("com_id").isin(cities))\
        .where(col("born_date").rlike(r"\d{8}"))\
        .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
        .withColumn("today", f.current_date()) \
        .withColumn("qty_ord", col("qty_ord").cast("float")) \
        .withColumn("price", col("price").cast("float"))\
        .withColumn("qty_rsn",col("qty_rsn").cast("float"))

    lower = scope[0]
    upper = scope[1]
    if lower >= 0 and upper >= lower:
        if filter in ["day", "week", "month"]:
            if filter == "day":
                co_co_line = co_co_line.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \
                    .where((col("day_diff") >= lower) & (col("day_diff") <= upper))

            elif filter == "week":
                co_co_line = co_co_line.withColumn("week_diff", week_diff("today", "born_date")) \
                    .where((col("week_diff") >= lower) & (col("week_diff") <= upper))
            else:
                co_co_line = co_co_line.withColumn("month_diff",
                                               month_diff_udf(f.year(col("born_date")), f.month(col("born_date")),
                                                              f.year(col("today")), f.month(col("today")))) \
                    .where((col("month_diff") >= lower) & (col("month_diff") <= upper))
        else:
            raise Exception("filter must be one of 'day', 'week' or 'month'")
    else:
        raise Exception("lower must be >= 0 and upper must be >= lower")

    return co_co_line
Example #24
def adobe_traffic_by_department(df_traffic_by_department, df_deparadepto):
    df_depara = (df_deparadepto.filter(
        (df_deparadepto['tabela'] == 'departamento')
        & (df_deparadepto['campo'] == 'iddepartamento')).select(
            df_deparadepto['de'], df_deparadepto['para']))

    return (df_traffic_by_department.join(
        df_depara,
        df_traffic_by_department['iddepartament'] == df_depara['de'],
        'left').select(df_traffic_by_department['id'],
                       df_traffic_by_department['createdat'],
                       df_traffic_by_department['reportsuiteid'],
                       df_traffic_by_department['reportsuitename'],
                       df_traffic_by_department['server'],
                       df_traffic_by_department['data'],
                       df_traffic_by_department['hour'],
                       df_traffic_by_department['iddepartament'],
                       f.trim(df_depara['para']).alias('namedepartment'),
                       df_traffic_by_department['visits'],
                       df_traffic_by_department['orders'],
                       df_traffic_by_department['ordersserialized'],
                       df_traffic_by_department['revenue']).withColumn(
                           'dataprocessamento',
                           f.trunc(f.current_date(), 'month')))
Example #25
    def test_create(self):
        data = [
            (1001, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (2202, None, None, "1980-05-03", 45.0, False),  # Add a few nulls
            (2235, "", "", "1980-05-03", 43.0, True)
        ]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])

        # Ensure that a Hyper file can be created with date and timestamp columns
        df = df.withColumn("hire_date", current_date())
        df = df.withColumn("last_updated", current_timestamp())

        creator = Creator(df, 'employees', False)
        hyper_file_path = creator.create()
        assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper")
        tables = TestUtils.get_tables(
            "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (len(tables) == 1)
        num_rows = TestUtils.get_row_count(
            "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (num_rows == 5)
Example #26
def main(input_dir, output_dir):
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    df = read_pg_dump(spark, input_dir)

    # apply some other transformations to obtain the final table
    result = (
        df.withColumn("parts", F.split("table_name", "_")).withColumn(
            "dimension", F.from_json("dimension", DIMENSION_SCHEMA)).
        select(
            F.current_date().alias("ingest_date"),
            "aggregate_type",
            "ds_nodash",
            # parts[:2] form aggregate_type, this is parsed from the filename
            F.col("parts").getItem(2).alias("channel"),
            F.col("parts").getItem(3).alias("version"),
            # parts[-1] is ds_nodash, this is parsed from the filename
            "dimension.*",
            from_pg_array_udf("aggregate"),
        ))
    # -RECORD 0-----------------------------------------------------
    # aggregate_type | submission_date
    # channel        | nightly
    # version        | 43
    # ds_nodash      | 20191201
    # os             | Windows_NT
    # child          | false
    # label          |
    # metric         | A11Y_INSTANTIATED_FLAG
    # osVersion      | 10.0
    # application    | Firefox
    # architecture   | x86
    # aggregate      | "[0, 2, 0, 2, 2]"

    result.repartition(1).write.parquet(output_dir, mode="overwrite")
Example #27
            jdbc_params = {
                "url": ut.get_mysql_jdbc_url(app_secret),
                "lowerBound": "1",
                "upperBound": "100",
                "dbtable": src_conf["mysql_conf"]["dbtable"],
                "numPartitions": "2",
                "partitionColumn": src_conf["mysql_conf"]["partition_column"],
                "user": app_secret["mysql_conf"]["username"],
                "password": app_secret["mysql_conf"]["password"]
            }
            txn_df = spark\
                .read.format("jdbc")\
                .option("driver", "com.mysql.cj.jdbc.Driver")\
                .options(**jdbc_params)\
                .load() \
                .withColumn("ins_dt", functions.current_date())
            txn_df.show()

            txn_df.write \
                .mode('overwrite')\
                .partitionBy("INS_DT")  \
                .parquet("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/staging/SB")

            print("\nWriting SB data to S3 <<")

        elif src == "OL":
            print("\nReading OL data from sftp >>")
            txn_df2 = spark.read.format("com.springml.spark.sftp") \
                .option("host", app_secret["sftp_conf"]["hostname"]) \
                .option("port", app_secret["sftp_conf"]["port"]) \
                .option("username", app_secret["sftp_conf"]["username"]) \
Example #28
                "partitionColumn": src_conf["mysql_conf"]["partition_column"],
                "user": app_secret["mysql_conf"]["username"],
                "password": app_secret["mysql_conf"]["password"]
            }
            # print(jdbcParams)

            # use the ** operator/un-packer to treat a python dictionary as **kwargs
            print(
                "\nReading data from MySQL DB using SparkSession.read.format(),"
            )
            txnDF = spark\
                .read.format("jdbc")\
                .option("driver", "com.mysql.cj.jdbc.Driver")\
                .options(**jdbc_params)\
                .load()\
                .withColumn('ins_date', current_date())

            # add the current date to the above df and then you write it
            txnDF.show()
            txnDF\
                .write.mode("append")\
                .partitionBy('ins_date') \
                .format("parquet") \
                .save(src_loc)

        elif src == 'OL':
            # put the code here to pull rewards data from SFTP server and write it to s3
            ol_txn_df = spark.read\
                .format("com.springml.spark.sftp")\
                .option("host", app_secret["sftp_conf"]["hostname"])\
                .option("port", app_secret["sftp_conf"]["port"])\
Example #29
def color_locator(column, color_string):
  return locate(color_string.upper(), column)\
          .cast("boolean")\
          .alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to be a Column type

df.select(*selectedColumns).where(expr("is_white OR is_red"))\
  .select("Description").show(3, False)


# COMMAND ----------

from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")


# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)
Example #30
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, current_date, \
    current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show()

    spark.read.jdbc

    spark.range(5)\
         .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\
         .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show()

    spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'),
                          to_date(lit('2019-01-05')).alias('date2'))\
                  .select(datediff(col('date2'), col('date1')),
                          months_between(col('date2'), col('date1'))).show()

    spark.range(5).withColumn('date', to_date(
        lit('2019-XX-XX'))).show()  # Does not raise an exception

    spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \
                  .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \
                  .withColumn('date_comp3', to_date(lit('2019-01-01')) > "'2019-01-02'").show()

    spark.range(5).select(current_timestamp().alias("timestamp")).select(
        hour(col('timestamp')))
Example #31
from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad
df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = "     hello     "

df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

#regular expressions

#working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()

dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()

#convert string to date, default format is 'yyyy-MM-dd'
spark.range(1).select(
    to_date(lit("2019-02-01")).alias("start_date"),
    to_date(lit("2019-03-06")).alias("end_date")).select(
        datediff(col("start_date"), col("end_date"))).show()

cleanDateDF = spark.range(1).withColumn("date1", current_date())
date_format = 'yyyy-MM-dd'