Example #1
def __appendAggKey(tsdf, freq = None):
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    # compute timestamp columns
    sec_col = f.second(f.col(tsdf.ts_col))
    min_col = f.minute(f.col(tsdf.ts_col))
    hour_col = f.hour(f.col(tsdf.ts_col))

    if (freq == SEC):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(" "), f.lpad(hour_col, 2, '0'), f.lit(':'), f.lpad(min_col, 2, '0'), f.lit(':'), f.lpad(sec_col, 2, '0')).cast("timestamp")
    elif (freq == MIN):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '), f.lpad(hour_col, 2, '0'), f.lit(':'), f.lpad(min_col, 2, '0'), f.lit(':'), f.lit('00')).cast("timestamp")
    elif (freq == HR):
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '), f.lpad(hour_col, 2, '0'), f.lit(':'), f.lit('00'), f.lit(':'), f.lit('00')).cast("timestamp")
    elif (freq == DAY):
        agg_key = f.col(tsdf.ts_col).cast("date").cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.TSDF(df, tsdf.ts_col, partition_cols = tsdf.partitionCols)
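A minimal standalone sketch of the same padding pattern (illustrative only, not part of tempo): f.lpad zero-pads the hour and minute components so the concatenated string casts back to a minute-truncated timestamp. The DataFrame and column names below are assumptions for the demo.

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2021-03-01 09:05:07",)], ["event_ts"]) \
         .withColumn("event_ts", f.col("event_ts").cast("timestamp"))

# minute-level key: date + zero-padded hour and minute, seconds fixed to 00
minute_key = f.concat(
    f.col("event_ts").cast("date"), f.lit(" "),
    f.lpad(f.hour("event_ts"), 2, "0"), f.lit(":"),
    f.lpad(f.minute("event_ts"), 2, "0"), f.lit(":"), f.lit("00")
).cast("timestamp")

df.withColumn("agg_key", minute_key).show(truncate=False)
# agg_key -> 2021-03-01 09:05:00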
Example #2
    def vwap(self, frequency='m', volume_col="volume", price_col="price"):
        # set pre_vwap as self or enrich with the frequency
        pre_vwap = self.df
        print('input schema:')
        pre_vwap.printSchema()
        if frequency == 'm':
            pre_vwap = self.df.withColumn(
                "time_group",
                f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0'),
                         f.lit(':'),
                         f.lpad(f.minute(f.col(self.ts_col)), 2, '0')))
        elif frequency == 'H':
            pre_vwap = self.df.withColumn(
                "time_group",
                f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0')))
        elif frequency == 'D':
            pre_vwap = self.df.withColumn(
                "time_group",
                f.concat(f.lpad(f.dayofmonth(f.col(self.ts_col)), 2, '0')))

        group_cols = ['time_group']
        if self.partitionCols:
            group_cols.extend(self.partitionCols)
        vwapped = (pre_vwap.withColumn(
            "dllr_value",
            f.col(price_col) * f.col(volume_col)).groupby(group_cols).agg(
                f.sum('dllr_value').alias("dllr_value"),
                f.sum(volume_col).alias(volume_col),
                f.max(price_col).alias("_".join(["max", price_col]))).withColumn(
                    "vwap",
                    f.col("dllr_value") / f.col(volume_col)))

        return TSDF(vwapped, self.ts_col, self.partitionCols)
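A minimal standalone sketch of the VWAP aggregation above on a plain DataFrame (the data and column names are illustrative, not part of the library):

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
trades = spark.createDataFrame(
    [("09:30", 10.0, 100), ("09:30", 10.2, 300), ("09:31", 10.1, 200)],
    ["time_group", "price", "volume"])

vwap = (trades
        .withColumn("dllr_value", f.col("price") * f.col("volume"))
        .groupBy("time_group")
        .agg(f.sum("dllr_value").alias("dllr_value"),
             f.sum("volume").alias("volume"))
        .withColumn("vwap", f.col("dllr_value") / f.col("volume")))
vwap.show()
# 09:30 -> (10.0*100 + 10.2*300) / 400 = 10.15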
Example #3
def read_customer_hierarchy_from_db(
    spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """
    Reads customer hierarchy data from hierarchies_customer_curr_hierarchy_int_00_001 and changes a few columns

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to read data from Hive

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with customer hierarchy data
    """

    # Read customer hierarchy data - for retailer name & ID

    cust_hier = spark.table("hierarchies_customer_curr_hierarchy_int_00_001")

    # Rename Dist channel ID column
    cust_hier = cust_hier.withColumnRenamed("distribution_channel_id",
                                            "distribution_channel_id_cust")

    # pad 0s to store_id col
    cust_hier = cust_hier.withColumn(
        "store_id",
        lpad(cust_hier["store_id"], 10, "0").alias("store_id"))
    cust_hier.createOrReplaceTempView("hierarchies_customer")

    return cust_hier
Example #4
def read_product_hierarchy_from_db(
    spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """
    Reads product hierarchy data from hierarchies_product_curr_hierarchy_int_00_001 and changes a few columns

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to read data from Hive

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with product hierarchy data
    """

    # Translates SKU to PPG
    prod_hier = spark.table("hierarchies_product_curr_hierarchy_int_00_001")
    # Rename Dist channel ID column
    prod_hier = prod_hier.withColumnRenamed("distribution_channel_id",
                                            "distribution_channel_id_prod")
    # pad 0s to SKU_Id col
    prod_hier = prod_hier.withColumn(
        "sku_id",
        lpad(prod_hier["sku_id"], 18, "0").alias("sku_id"))

    prod_hier.createOrReplaceTempView("hierarchies_product")

    return prod_hier
Example #5
def compile_lpad(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    length = op.length.op().value
    pad = op.pad.op().value
    return F.lpad(src_column, length, pad)
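For reference, the compiled expression above amounts to a direct call to pyspark.sql.functions.lpad; a hedged sketch with made-up data:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("7",), ("42",)], ["code"])
df.select(F.lpad(F.col("code"), 5, "0").alias("code_padded")).show()
# code_padded -> 00007, 00042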
Example #6
def create_train_data():

    w1 = Window.orderBy("uid")
    w2 = Window.partitionBy("seg").orderBy("uid")
    df_train = (
        spark.read.csv(os.path.join("datasets", "train.csv"),
                       header=True, schema=schema)
        .withColumn("uid", monotonically_increasing_id())
        .withColumn("idx", row_number().over(w1).cast(IntegerType()))
        .withColumn("seg",
                    fn.floor((fn.col("idx") - 1) / 150000).cast(IntegerType()))
        .withColumn("no", row_number().over(w2).cast(IntegerType()))
        .withColumn("name",
                    fn.concat(lit("raw_"),
                              fn.lpad(fn.col("seg"), 4, "0").cast(StringType())))
        .withColumn("set", lit(0)))

    df_train.createOrReplaceTempView("data")
    df_train_f = spark.sql("""
    SELECT uid, set, seg, no, name, x, y FROM data 
    ORDER BY set, seg, no, uid
    """)

    df_train_f = df_train_f.repartition(1)
    df_train_f.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.parquet"))
Example #7
 def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     column_spec = lpad(
         col=self.column.get_column_spec(source_df=source_df,
                                         current_column=current_column),
         len=self.length,
         pad=self.pad,
     )
     return column_spec
Example #8
 def assign_sk(self, df: DataFrame, orderByCol: str):
     now = datetime.now()  # current date and time
     fmt = '%y%m%d%H'
     yymmddhh = now.strftime(fmt)
     df_with_row_num = df.withColumn(
         "row_num",
         row_number().over(Window.orderBy(col(orderByCol))))
     sk_df = df_with_row_num.select(
         concat(lit(yymmddhh), lpad(col("row_num"), 5,
                                    "0")).cast("long").alias("sys_sk"),
         col("*")).drop(col("row_num"))
     return sk_df
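A standalone sketch of the surrogate-key pattern above, with hypothetical data: a yymmddhh prefix is concatenated with a zero-padded row number and cast to a long.

from datetime import datetime
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, concat, lit, lpad, row_number

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("b",), ("a",), ("c",)], ["name"])

prefix = datetime.now().strftime('%y%m%d%H')
sk_df = (df.withColumn("row_num", row_number().over(Window.orderBy(col("name"))))
           .select(concat(lit(prefix), lpad(col("row_num"), 5, "0"))
                   .cast("long").alias("sys_sk"), col("*"))
           .drop("row_num"))
sk_df.show()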
Example #9
def _setup_dataframe(spark,
                     sqlContext,
                     dataset_multiplier_factor,
                     append_ids=True) -> pyspark.sql.DataFrame:
    """Setup a pyspark dataframe to run against.

    Then creates a PySpark dataframe, and crossjoins with a table of length :dataset_multiplier_factor:
    to increase the volume of data for benchmarking.

    Returns:
        A Pyspark dataframe with random phrases for string distance testing.
    """
    df = _fetch_phrase_pairs()

    logger.info(f'{len(df):,} word pairs')

    pyspark_df = spark.createDataFrame(df, ['left', 'right'])

    pyspark_df = pyspark_df.repartition(10)
    pyspark_df.cache().count()

    logger.debug('Increasing data volume')

    range_df = sqlContext.range(dataset_multiplier_factor)

    if append_ids:

        range_df = range_df.withColumn('id_string',
                                       ps_funcs.lpad('id', 12, "0"))

        pyspark_df = range_df.crossJoin(pyspark_df).select(
            ps_funcs.concat_ws(' ', ps_funcs.col('left'),
                               ps_funcs.col('id_string')).alias('left'),
            ps_funcs.concat_ws(' ', ps_funcs.col('right'),
                               ps_funcs.col('id_string')).alias('right'))
    else:
        pyspark_df = range_df.crossJoin(pyspark_df).select(
            ps_funcs.col('left'), ps_funcs.col('right'))

    pyspark_df = pyspark_df.repartition(__DATASET_PARTITIONS)
    record_count = pyspark_df.cache().count()

    logger.info(f'Generated dataframe with {record_count:,} records')

    sample_data = pyspark_df.sample(withReplacement=False,
                                    fraction=0.01).limit(1).collect()
    logger.info(f'Sample of benchmarking data: {sample_data}')

    return pyspark_df
Example #10
def test_auto_mapper_lpad(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1234"),
            (2, "1234567"),
            (3, "123456789"),
        ],
        ["member_id", "empi"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.lpad(column=A.column("empi"), length=9, pad="0"))

    # Assert
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)

    assert str(sql_expressions["my_column"]) == str(
        lpad(col=col("b.empi"), len=9, pad="0").alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "000001234"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "001234567"
    )

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "123456789"
    )
Example #11
def generate_customers_from_hierarchies(hierarchy_list):
    """
    Accepts a list of hierarchy numbers and returns a list of customer numbers
    """
    hierarchy_list = hierarchy_list.withColumnRenamed('_c0', 'input_lvl_1')
    hierarchy_list = hierarchy_list.withColumn(
        'input_lvl_1', F.lpad(hierarchy_list['input_lvl_1'], 10, '0'))

    pricing_hierarchy = spark.read.parquet(location +
                                           'extracts/pricing_hierarchy_table')

    hierarchy_list = (hierarchy_list.join(
        pricing_hierarchy, hierarchy_list.input_lvl_1 ==
        pricing_hierarchy.higherlevel_customer_hkunnr,
        'left_outer').drop('higherlevel_customer_hkunnr'))
    hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr',
                                                      'input_lvl_2')

    hierarchy_list = (hierarchy_list.join(
        pricing_hierarchy, hierarchy_list.input_lvl_2 ==
        pricing_hierarchy.higherlevel_customer_hkunnr,
        'left_outer').drop('higherlevel_customer_hkunnr'))
    hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr',
                                                      'input_lvl_3')

    hierarchy_list = (hierarchy_list.join(
        pricing_hierarchy, hierarchy_list.input_lvl_3 ==
        pricing_hierarchy.higherlevel_customer_hkunnr,
        'left_outer').drop('higherlevel_customer_hkunnr'))
    hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr',
                                                      'input_lvl_4')

    hierarchy_list = (hierarchy_list.join(
        pricing_hierarchy, hierarchy_list.input_lvl_4 ==
        pricing_hierarchy.higherlevel_customer_hkunnr,
        'left_outer').drop('higherlevel_customer_hkunnr'))
    hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr',
                                                      'input_lvl_5')

    def pick_highest_hierarchy(input_1, input_2, input_3, input_4, input_5):
        if (input_5):
            return input_5
        elif (input_4):
            return input_4
        elif (input_3):
            return input_3
        elif (input_2):
            return input_2
        else:
            return input_1

    hierarchy_selector_UDF = F.udf(pick_highest_hierarchy, StringType())
    hierarchy_list = hierarchy_list.withColumn(
        'customer_kunnr',
        hierarchy_selector_UDF(hierarchy_list.input_lvl_1,
                               hierarchy_list.input_lvl_2,
                               hierarchy_list.input_lvl_3,
                               hierarchy_list.input_lvl_4,
                               hierarchy_list.input_lvl_5))

    return hierarchy_list.selectExpr('customer_kunnr as _c0')
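If the unmatched joins above leave the input_lvl_* columns null (which a left outer join does), the same "pick the deepest available level" logic could be expressed without a Python UDF via F.coalesce; a hedged alternative sketch:

hierarchy_list = hierarchy_list.withColumn(
    'customer_kunnr',
    F.coalesce(hierarchy_list.input_lvl_5, hierarchy_list.input_lvl_4,
               hierarchy_list.input_lvl_3, hierarchy_list.input_lvl_2,
               hierarchy_list.input_lvl_1))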
Example #12
if (customer_option == '1'):
    customer_input = spark.read.csv("gs://data-discovery/" + customer_file)

elif (customer_option == '2'):
    customer_input = generate_customers_from_hierarchies(
        spark.read.csv("gs://data-discovery/" + customer_file))

# Combine all of the eligibility files into one dataframe
customer_eligibility = filter_by_customer_eligibility(
    spark.read.parquet(location +
                       'eligibility_files/customer_eligibility_table'))

# Filter the eligibility dataframe to the relevant customers
if (customer_option == '1' or customer_option == '2'):
    padded_customer_input = customer_input.withColumn(
        'customer_kunnr', F.lpad(customer_input['_c0'], 10, '0')).drop('_c0')

    customer_eligibility = (customer_eligibility.drop(
        'vertical_mkt_desc').drop('customer_market_description').drop(
            'customer_submarket_description').drop('cot_desc').drop(
                'district_name_bztxt'))

    filtered_eligibility = customer_eligibility.join(
        F.broadcast(padded_customer_input), 'customer_kunnr')

else:
    customer_attributes_dict = create_customer_attributes_dict(
        customer_attributes)
    filtered_eligibility = filter_by_attributes(customer_eligibility,
                                                customer_attributes_dict)
Example #13
def get_events_by_tier_month(
    spark,
    start_date,
    end_date,
    tiers_raw=None,
    remove_raw=None,
    skims_raw=None,
    only_valid_files=False,
    verbose=False,
):
    """
        Generate a pandas dataframe containing data_tier_name, month, nevents
        for the given time period.
        It will add virtual tiers based on the skims.
        args:
            - spark: Spark session
            - start_date: String with the date in format yyyy/MM/dd
            - end_date: String with the date in format yyyy/MM/dd
            - tiers_raw: List of tiers
            - remove_raw: List of remove patterns
            - skims_raw: List of skim patterns
            - only_valid_files: True if you want to take into account only the valid files.
            - verbose: True if you want additional output messages, default False.
    """
    if tiers_raw is None:
        tiers_raw = [".*"]
    if skims_raw is None:
        skims_raw = []
    if remove_raw is None:
        remove_raw = []
    tiers = "^({})$".format("|".join(
        ["{}".format(tier.strip()) for tier in tiers_raw]))
    skims_rlike = (".*-({})-.*".format("|".join([l.strip() for l in skims_raw
                                                 ])) if skims_raw else "^$")
    remove_rlike = (".*({}).*".format("|".join(
        [l.strip().lower() for l in remove_raw])) if remove_raw else "^$")
    tables = spark_utils.dbs_tables(spark, tables=["ddf", "bdf", "fdf", "dtf"])
    if verbose:
        logger.info("remove %s", remove_rlike)
        logger.info("skims %s", skims_rlike)
        for k in tables:
            # tables[k].cache()
            logger.info(k)
            tables[k].printSchema()
            tables[k].show(5, truncate=False)
    datablocks_file_events_df = spark.sql(
        """SELECT sum(fdf.f_event_count) as f_event_count,
                             max(ddf.d_data_tier_id) as d_data_tier_id,
                             d_dataset,
                             b_block_name,
                             max(b_creation_date) as b_creation_date,
                             max(b_block_size) as size
                          FROM ddf JOIN bdf on ddf.d_dataset_id = bdf.b_dataset_id
                                   JOIN fdf on bdf.b_block_id = fdf.f_block_id
                          WHERE d_is_dataset_valid = 1
                          {}
                          group by d_dataset, b_block_name
                      """.format(
            "AND f_is_file_valid = 1" if only_valid_files else ""))
    filter_field = "b_creation_date"
    datablocks_file_events_df = (datablocks_file_events_df.withColumn(
        filter_field, from_unixtime(filter_field)).filter(
            filter_field +
            " between '{}' AND '{}' ".format(start_date.replace(
                "/", "-"), end_date.replace("/", "-"))).withColumn(
                    "month",
                    concat(year(filter_field),
                           lpad(month(filter_field), 2, "0"))))

    datablocks_file_events_df.registerTempTable("dbfe_df")
    # Union of two queries:
    # - The first query will get all the selected data tiers,
    #   excluding the datasets that match the skims
    # - The second query will get all the selected data tiers,
    #   but only the datasets that match the skims.
    grouped = spark.sql("""
        select month, data_tier_name, sum(f_event_count) as nevents
        from dbfe_df join dtf on data_tier_id = d_data_tier_id
        where
            data_tier_name rlike '{tiers}'
            and lower(d_dataset) not rlike '{remove}'
            and d_dataset not rlike '{skims}'
            group by month, data_tier_name
        UNION
        select month,
             concat(data_tier_name, '/',regexp_extract(d_dataset,'{skims}',1)) AS data_tier_name,
             sum(f_event_count) as nevents
        from dbfe_df join dtf on dtf.data_tier_id = d_data_tier_id
        where
            data_tier_name rlike '{tiers}'
            and lower(d_dataset) not rlike '{remove}'
            and d_dataset rlike '{skims}'
            group by month, concat(data_tier_name, '/',regexp_extract(d_dataset,'{skims}',1))
        """.format(tiers=tiers, remove=remove_rlike, skims=skims_rlike))
    return grouped.toPandas()
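A standalone sketch of the "YYYYMM" month-key pattern used above (year concatenated with a zero-padded month), with illustrative data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lpad, month, year

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2020-03-15 10:00:00",)], ["b_creation_date"]) \
         .withColumn("b_creation_date", col("b_creation_date").cast("timestamp"))

df.select(concat(year("b_creation_date"),
                 lpad(month("b_creation_date"), 2, "0")).alias("month")).show()
# month -> 202003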
Example #14
def process_covid_country_fact(spark, covid19_lake, output_data):
    """
    Write countries covid data to parquet files on S3.
    
    Keyword arguments:
    spark -- a spark session
    covid19_lake -- the script reads data from S3 or public datalake
    output_data -- S3 path where the country fact table is written as partitioned parquet
    """
    # get filepath to country fact table
    covid_country_data = covid19_lake + 'tableau-jhu/csv/COVID-19-Cases.csv'

    # define the data frames
    country_data_df = spark.read.load(
        covid_country_data, format="csv", sep=",",
        inferSchema="true", header="true")
    country_data_df = country_data_df.dropDuplicates()
    split_date = F.split(country_data_df["Date"], '/')
    country_data_df = country_data_df.withColumn(
        'Month',
        split_date.getItem(0))
    country_data_df = country_data_df.withColumn(
        'Day',
        split_date.getItem(1))
    country_data_df = country_data_df.withColumn(
        'Year',
        split_date.getItem(2))

    country_data_df.show(5)

    country_data_df = country_data_df.select(
        country_data_df.Country_Region, \
        country_data_df.Cases, \
        country_data_df.Case_Type, \
        country_data_df.Combined_Key, \
        country_data_df.Date, \
        F.lpad(country_data_df.Month,2,'0').alias('Month'), \
        F.lpad(country_data_df.Day,2,'0').alias('Day'), \
        F.lpad(country_data_df.Year,4,'0').alias('Year'), \
        F.to_date(
            F.concat_ws(
                '-', F.lpad(country_data_df.Month, 2, '0'),
                F.lpad(country_data_df.Day, 2, '0'),
                F.lpad(country_data_df.Year, 4, '0')), \
            'MM-dd-yyyy').alias('Date_format'))

    country_data_df.createOrReplaceTempView("country_data")
    country_data_df.createOrReplaceTempView("country_p_data")

    country_data_df.show(5)

    fact_covid_country_confirmed = spark.sql(
        "SELECT DISTINCT \
            bd.country_region as country_name, \
            to_date(bd.Date_format) as date, \
            case when bd.case_type = 'Confirmed' \
                then (bd.cases - bp.cases) end as confirmed_cases, \
            case when bd.case_type = 'Deaths' \
                then (bd.cases - bp.cases) end as death_cases, \
            case when bd.case_type = 'Confirmed' \
                then bd.cases end as sum_confirmed_cases, \
            case when bd.case_type = 'Deaths' \
                then bd.cases end as sum_death_cases \
        FROM country_data bd \
            JOIN country_p_data bp \
                ON (date_add(to_date(bd.Date_format), -1) \
                     = to_date(bp.Date_format) \
                    AND bd.case_type = bp.case_type \
                    AND bd.country_region = bp.country_region \
                    AND bd.combined_key = bp.combined_key) \
        WHERE \
            bd.case_type = 'Confirmed' \
            AND bp.case_type = 'Confirmed'"
        )

    fact_covid_country_confirmed.show(10)

    fact_covid_country_death = spark.sql(
        "SELECT DISTINCT \
            bd.country_region as country_name, \
            to_date(bd.Date_format) as date, \
            case when bd.case_type = 'Confirmed' \
                then (bd.cases - bp.cases) end as confirmed_cases, \
            case when bd.case_type = 'Deaths' \
                then (bd.cases - bp.cases) end as death_cases, \
            case when bd.case_type = 'Confirmed' \
                then bd.cases end as sum_confirmed_cases, \
            case when bd.case_type = 'Deaths' \
                then bd.cases end as sum_death_cases \
        FROM country_data bd \
            JOIN country_p_data bp \
                ON (date_add(to_date(bd.Date_format),-1) \
                    = to_date(bp.Date_format) \
                    AND bd.case_type = bp.case_type \
                    AND bd.country_region = bp.country_region \
                    ANd bd.combined_key = bp.combined_key) \
        WHERE \
            bd.country_region not in ('Brazil', 'US') \
            AND bp.country_region not in ('Brazil', 'US') \
            AND bd.case_type = 'Deaths' \
            AND bp.case_type = 'Deaths'"
        )
    
    fact_covid_country = fact_covid_country_confirmed \
        .unionByName(fact_covid_country_death)

    fact_covid_country.createOrReplaceTempView("fact_covid_country")

    fact_covid_country = spark.sql(
        "SELECT \
            country_name, \
            date, \
            sum(confirmed_cases) as confirmed_cases, \
            sum(death_cases) as death_cases, \
            sum(sum_confirmed_cases) as sum_confirmed_cases, \
            sum(sum_death_cases) as sum_death_cases \
        FROM fact_covid_country \
        GROUP BY \
            country_name, date"
        )

    fact_covid_country.show(10)

    # write tables to partitioned parquet files
    fact_covid_country.write.mode('append') \
        .partitionBy('country_name', 'date') \
        .parquet(output_data + 'fact_covid_country')

    count_fact = {}
    count_fact['country'] = spark.sql(
        "SELECT COUNT (DISTINCT country_name) as countries \
        FROM fact_covid_country").collect()[0].countries
    count_fact['date'] = spark.sql(
        "SELECT COUNT (DISTINCT date) as dates \
        FROM fact_covid_country").collect()[0].dates
    return count_fact
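The Month/Day/Year handling above follows a common normalization pattern: split an "M/D/YYYY" string, zero-pad the pieces with lpad, and rebuild a proper date with to_date. A standalone hedged sketch with made-up data:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("5/19/2020",)], ["Date"])
split_date = F.split(F.col("Date"), "/")
df = (df.withColumn("Month", F.lpad(split_date.getItem(0), 2, "0"))
        .withColumn("Day", F.lpad(split_date.getItem(1), 2, "0"))
        .withColumn("Year", F.lpad(split_date.getItem(2), 4, "0"))
        .withColumn("Date_format",
                    F.to_date(F.concat_ws("-", "Month", "Day", "Year"),
                              "MM-dd-yyyy")))
df.show()
# Date=5/19/2020, Month=05, Day=19, Year=2020, Date_format=2020-05-19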
Example #15
def process_covid_dimension(spark, input_data, covid19_lake, output_data):
    """
    Write country, province and time dimensions to parquet files on S3.

    Keyword arguments:
    spark -- a spark session
    input_data -- the script reads data from S3 or public datalake
    covid19_lake -- the script reads data from S3 or public datalake
    output_data -- the script writes dimension to partitioned parquet on S3
    """

    # get filepath to dimensions
    covid_global_data = covid19_lake + 'tableau-jhu/csv/COVID-19-Cases.csv'
    covid_brazil_data = input_data + 'COVID-19-Brazil.csv.gz'
    brazil_provinces = input_data + 'provinces_brazil.csv'

    # define the data frames
    global_data_df = spark.read.load(
        covid_global_data, format="csv", sep=",",
        inferSchema="true", header="true")
    global_data_df = global_data_df.dropDuplicates()
    global_data_df.createOrReplaceTempView("global_data")

    brazil_data_df = spark.read.load(
        covid_brazil_data, format="csv", sep=",",
        inferSchema="true", header="true")
    brazil_data_df = brazil_data_df.dropDuplicates()
    brazil_data_df.createOrReplaceTempView("brazil_data")

    brazil_provinces_df = spark.read.load(
        brazil_provinces, format="csv", sep=",",
        inferSchema="true", header="true")
    brazil_provinces_df = brazil_provinces_df.dropDuplicates()
    brazil_provinces_df.createOrReplaceTempView("brazil_provinces")

    date_df = global_data_df.filter("country_region = 'Brazil'")
    date_df = date_df.select("Date")
    date_df = date_df.dropDuplicates()
    split_date = F.split(date_df["Date"],'/')
    date_df = date_df.withColumn('Month', split_date.getItem(0))
    date_df = date_df.withColumn('Day', split_date.getItem(1))
    date_df = date_df.withColumn('Year', split_date.getItem(2))
    date_df = date_df.select(
        date_df.Date,
        F.lpad(date_df.Month,2,'0').alias('Month'),
        F.lpad(date_df.Day,2,'0').alias('Day'),
        F.lpad(date_df.Year,4,'0').alias('Year'))
    date_df = date_df.select(
        date_df.Date,
        date_df.Month,
        date_df.Day,
        date_df.Year,
        F.to_date(F.concat_ws('-',date_df.Month, date_df.Day, date_df.Year),
                  'MM-dd-yyyy').alias('Date_format'))
    date_df = date_df.select(
        date_df.Date,
        date_df.Date_format,
        date_df.Year,
        F.weekofyear(date_df.Date_format).alias('Week'),
        date_df.Month,
        date_df.Day,
        F.dayofweek(date_df.Date_format).alias('Week_Day'))
    date_df.show(2)
    date_df.createOrReplaceTempView("date")

    # extract columns to create tables
    dim_country = spark.sql(
        "SELECT DISTINCT \
            Country_Region, \
            iso2, \
            iso3, \
            sum(Population_Count) as population_count \
        FROM global_data \
        WHERE \
            case_type = 'Deaths' \
            AND Date = '5/19/2020' \
        GROUP BY \
            Country_Region, \
            iso2, \
            iso3"
        )

    dim_province_us = spark.sql(
        "SELECT DISTINCT \
            country_region, \
            province_state, \
            max(Population_Count) as population_count \
        FROM global_data \
        WHERE \
            case_type = 'Deaths' \
            AND country_region = 'US' \
            AND Date = '5/19/2020' \
        GROUP BY \
            Country_Region, \
            province_state"
        )

    dim_province_br = spark.sql(
        "SELECT DISTINCT \
            'Brazil' as country_region, \
            bp.province_state as province_state, \
            sum(estimated_population_2019) as population_count \
        FROM brazil_data bd \
            JOIN brazil_provinces bp ON bd.state = bp.state \
        WHERE \
            place_type = 'state' \
            AND date = '2020-05-19' \
        GROUP BY \
            Country_Region, \
            province_state"
        )

    dim_province = dim_province_us.unionByName(dim_province_br)

    dim_time = spark.sql(
        "SELECT DISTINCT \
            date, year, week, month, day, week_day \
        FROM date"
        )

    # write tables to partitioned parquet files
    dim_country.write.mode('overwrite') \
        .partitionBy('country_region') \
        .parquet(output_data + 'dim_country')

    dim_province.write.mode('overwrite') \
        .partitionBy('country_region', 'province_state') \
        .parquet(output_data + 'dim_province')

    dim_time.write.mode('overwrite') \
        .partitionBy('year','month') \
        .parquet(output_data + 'dim_time')

    count_dim = {}
    count_dim['country'] = spark.sql(
        "SELECT COUNT (DISTINCT country_region) as countries \
        FROM global_data").collect()[0].countries
    count_dim['date'] = spark.sql(
        "SELECT COUNT (DISTINCT date) as dates \
        FROM global_data").collect()[0].dates
    return count_dim
Example #16
def main():
    # Download data
    now = datetime.datetime.now().strftime("%m-%d-%H-%M-%S")
    folder = config['local_folder']
    os.system(f"rm -r {folder}")
    # Daily dataset
    ##big_table_name = 'vartefact.forecast_sprint4_add_dm_to_daily'

    # Promo dataset
    big_table_name = f"{config['database']}.forecast_sprint4_promo_mecha_v4"
    file_name = 'test_to_delete_' + now
    download_data(folder=folder,
                  big_table_name=big_table_name,
                  file_name=file_name)

    os.system(
        f"cp {config['config_folder']}/calendar.pkl {config['local_folder']}")
    # Preprocess the dataset
    big_table = file_name + '.csv'
    sql_table = big_table_name
    target_value = 'dm_sales_qty'
    dataset_name = 'dataset_test'
    preprocess_promo(folder=folder,
                     big_table=big_table,
                     sql_table=sql_table,
                     target_value=target_value,
                     dataset_name=dataset_name)

    # Train the model
    desc = 'promo_test'
    data_name = dataset_name + '.pkl'
    target_value = 'dm_sales_qty'
    date_stop_train = config['date_stop_train']
    learning_rate = 0.3

    folder_name = train(desc=desc,
                        folder=folder,
                        data_name=data_name,
                        target_value=target_value,
                        learning_rate=learning_rate,
                        date_stop_train=date_stop_train)

    # Make a prediction
    prediction(folder_name, folder, data_name, target_value, learning_rate,
               date_stop_train)

    # save csv as table
    print('saving csv as table')
    file_list = os.listdir(config['local_folder'])

    for f in file_list:
        if desc in f:
            result_file_name = f
    result_csv_path = f"{config['local_folder']}/{result_file_name}/promo_sales_order_prediction_by_item_store_dm.csv"
    os.system(
        f"hadoop fs -rm promo_sales_order_prediction_by_item_store_dm.csv")
    os.system(f"hadoop fs -put -f {result_csv_path}")

    spark = SparkSession \
        .builder \
        .appName("Forecast_saveastable") \
        .config("spark.sql.warehouse.dir", warehouse_location) \
        .config("spark.num.executors", '10') \
        .config("spark.executor.memory", '15G') \
        .config("spark.executor.cores", '20') \
        .enableHiveSupport() \
        .getOrCreate()

    sqlContext = SQLContext(spark)

    spark_df = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').load(
            f"promo_sales_order_prediction_by_item_store_dm.csv")
    spark_df = spark_df.withColumn("item_id",
                                   spark_df["item_id"].cast(IntegerType()))
    spark_df = spark_df.withColumn("sub_id",
                                   spark_df["sub_id"].cast(IntegerType()))
    spark_df = spark_df.withColumn(
        "current_dm_theme_id",
        spark_df["current_dm_theme_id"].cast(IntegerType()))
    spark_df = spark_df.withColumn("store_code",
                                   lpad(spark_df['store_code'], 3, '0'))
    spark_df = spark_df.withColumn("sales",
                                   spark_df["sales"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "sales_prediction", spark_df["sales_prediction"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "squared_error_predicted",
        spark_df["squared_error_predicted"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "std_dev_predicted", spark_df["std_dev_predicted"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "confidence_interval_max",
        spark_df["confidence_interval_max"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "order_prediction", spark_df["order_prediction"].cast(FloatType()))
    spark_df = spark_df.withColumn('sub_family_code',
                                   regexp_replace('sub_family_code', r'\.0$', ''))
    spark_df.write.mode('overwrite').saveAsTable(
        f"{config['database']}.promo_sales_order_prediction_by_item_store_dm")
    spark.stop()

    sql = f""" invalidate metadata {config['database']}.promo_sales_order_prediction_by_item_store_dm """
    impalaexec(sql)
    print('csv saved in the table')
  .option("inferSchema","true")\
  .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')
# check the data schema
df.printSchema()

# initcap: splits the given string on whitespace and capitalizes the first letter of each word
df.select(initcap(col("Description"))).show(2, False)
#lower // upper
df.select(lower(col("StockCode"))).show(2)
# adding and removing whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("   HELLO   ")).alias("ltrim"),
    rtrim(lit("   HELLO   ")).alias("rtrim"),
    trim(lit("   HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2)

## regular expressions
# replace matching values in the Description column with the value COLOR
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean"), col("Description")).show(2)

# translate given characters to other characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "WHI", "123")).show(2)

# extract color names
Example #18
df.selectExpr(
        'Description',
        'lower(Description)',
        'upper(lower(Description))').show(2)

# select description, lower(Description), upper(lower(Description)) from dfTable


from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
        ltrim(lit('         HELLO           ')).alias('ltrim'),
        rtrim(lit('         HELLO           ')).alias('rtrim'),
        trim(lit('         HELLO           ')).alias('trim'),
        lpad(lit('HELLO'), 3, ' ').alias('lp'),
        rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
        'ltrim(         "HELLO"           ) as ltrim',
        'rtrim(         "HELLO"           ) as rtrim',
        'trim(         "HELLO"           ) as trim',
        'lpad("HELLO", 3, " ") as lp',
        'rpad("HELLO", 10, " ") as rp').show(2)

# select 
#   ltrim('     HELLO       '),
#   rtrim('     HELLO       '),
#   trim('      HELLO       '),
#   lpad('HELLO', 3, ' '),
#   rpad('HELLO', 10, ' ')
print("Start %d Cities Cluster Test, NSlaves = %d" % (NCities, NSlaves),
      flush=True)
print("Execution time #0 %f" % (time.time() - startTime), flush=True)

# ## Read two mapping files into dataframes
# - Read files from Amazon S3 bucket into Spark dataframes
# - Format columns as required to enable joins to dataset below

# read and process city FIPS to county FIPS mapping file
city_to_fips = spark.read.format("org.apache.spark.csv").option("header","true") \
                    .csv(latlonFilename)
#                          .csv("/home/ubuntu/project/data/uscitiesLatLongFIPS.csv")

city_to_fips = city_to_fips.withColumn(
    "county_FIPS", f.lpad(city_to_fips['county_FIPS'], 5, "0"))
city_to_fips = city_to_fips.drop("city", "zip", "id", "source", "population")
city_to_fips = city_to_fips.withColumn(
    "city_ascii", f.regexp_replace('city_ascii', 'Saint', 'St.'))
city_to_fips = city_to_fips.withColumnRenamed("city_ascii","CityName") \
                           .withColumnRenamed("state_name","StateDesc") \
                           .withColumnRenamed("county_FIPS","FIPS")

print((city_to_fips.count(), len(city_to_fips.columns)))
city_to_fips.limit(5).toPandas()

# read and process commuting zone to county FIPS mapping file
cz_to_fips = spark.read.format("org.apache.spark.csv").option("header","true").option("delimiter", "\t") \
                  .csv(lmaFilename)
#                        .csv("/home/ubuntu/project/data/1990LMAascii.csv")
Example #20
def process_temperatures_data(spark, input_data, output_data):
    """
    input_parameters:
        - spark: Spark session object
        - input_data: s3a route to public udacity input data
        - output_data: s3 route to private results 
        
    output parameters:
        - none
    
    Description:
    
    Reads the input source for temperature data and performs ETL and wrangling.
    Store final information in output source with parquet format
    """
    
    # get filepath to temperatures data file
    temp_data_path = input_data+"city_temperature.zip"
    print ("Getting Path to Temperatures ...")
    
    # Read in the data here
    df3 = pd.read_csv(temp_data_path,sep=',',low_memory=False, compression='zip')

    # Filter for only US cities
    df3=df3[df3['Country']=='US']

    # Drop unsued columns and duplicates
    df3.drop('Region',  axis='columns', inplace=True)
    df3.drop('Country',  axis='columns', inplace=True)
    df3=df3.drop_duplicates()

    # Create a reverse dict from SAS i94addrl
    i94addrl_r = dict(zip(i94addrl.values(),i94addrl.keys()))

    # Convert to spark Dataframe
    df3_spark=spark.createDataFrame(df3)

    # Define a lookup UDF and create the StateCode column based on the State name
    def getState(key, default=None):
        if str(key).upper() in i94addrl_r:
            return i94addrl_r[str(key).upper()]
        return default
    get_i94addrl_r = udf(lambda x: getState(x))
    df3_spark = df3_spark.withColumn("stateCode",get_i94addrl_r(df3_spark.State))

    # Create CityCode with concatenation
    df3_spark = df3_spark.withColumn('cityCode', sf.concat(sf.col('City'),sf.lit('_'), sf.col('stateCode')))

    # Group observation date into a single column
    df3_spark = df3_spark.withColumn('temperatureDate', 
                        sf.concat(sf.col('Year'),sf.lit('-'), sf.lpad(df3_spark['Month'],2,'0'),
                        sf.lit('-'), sf.lpad(df3_spark['Day'],2,'0')))

    # Remove unused columns
    df3_spark=df3_spark.drop('State')
    df3_spark=df3_spark.drop('City')

    # Remove null values
    df3_spark=df3_spark.na.drop()

    # Write to parquet file (pending partition by STATECODE / YEAR / MONTH)
    df3_spark.write.mode('append').partitionBy("stateCode","Year", "Month").parquet(output_data+"temperatures/temperatures.parquet")
    print ("Writing Temperatures Table to Parquet")
    print ("Number of rows written: ",df3_spark.count())
Example #21
    def transform(self, sources: dict) -> DataFrame:
        ri = self.invoice_dataframe(sources['rptt_invoice'])

        rst = self.read_source(source=sources['rptm_sbu_subset_txt'])

        cmf = self.read_source(source=sources['customer_mapping'])
        cmf = cmf.withColumnRenamed('sales_rep', 'sales_rep_override')
        cmf = cmf.withColumnRenamed('sales_rep_id', 'sales_rep_id_override')
        cmf = cmf.withColumnRenamed('end_market', 'cmf_end_market')

        mmf = self.read_source(source=sources['material_mapping'])
        srtr = self.read_source(source=sources['sales_rep_to_region'])

        rsrt = self.read_source(source=sources['rptm_sales_rep_txt'])
        rsrt = rsrt.withColumnRenamed('med_desc', 'sales_rep_original')

        edataA = self.read_source(source=sources['exclusion_dataA'])
        edataA = edataA.withColumnRenamed('sold_customer_id',
                                          'edataA_sold_customer_id')

        # Source contains system_id/material_id pairs that need to be excluded
        excmat = self.read_source(source=sources['exclude_mat'])

        cerd = self.read_source(source=sources['currency_exchange_rates'])
        cerd = fixCurExchangeToAvg(self, cerd)
        cers = cerd.select('currency_code_from', 'cur_year', 'cur_month',
                           'conversion_rate_multiplier')
        cers = cers.withColumnRenamed('currency_code_from',
                                      'std_currency_code_from')
        cers = cers.withColumnRenamed('cur_year', 'std_cur_year')
        cers = cers.withColumnRenamed('cur_month', 'std_cur_month')
        cers = cers.withColumnRenamed('conversion_rate_multiplier',
                                      'std_conversion_rate_multiplier')

        dcust_sold = self.read_source(source=sources['dim_customer'])
        dcust_sold = dcust_sold.withColumnRenamed('dim_customer_id',
                                                  'sold_dim_customer_id')

        dcust_ship = self.read_source(source=sources['dim_customer'])
        dcust_ship = dcust_ship.withColumnRenamed('dim_customer_id',
                                                  'ship_dim_customer_id')

        dcust_brand = self.read_source(source=sources['dim_customer'])
        dcust_brand = dcust_brand.withColumnRenamed('dim_customer_id',
                                                    'brand_dim_customer_id')

        dloc_ship = self.read_source(source=sources['dim_location'])
        dloc_ship = dloc_ship.withColumnRenamed('dim_location_id',
                                                'ship_from_dim_location_id')

        dloc_inv = self.read_source(source=sources['dim_location'])
        dloc_inv = dloc_inv.withColumnRenamed('dim_location_id',
                                              'invoice_dim_location_id')

        dmat = self.read_source(source=sources['dim_material'])
        dmat = dmat.withColumnRenamed('dim_material_id',
                                      'ship1_dim_material_id')

        df = (ri.join(excmat, [
            excmat.material_id == ri.ship1_material_id_int, excmat.system
            == ri.system_id
        ], 'left_anti').join(
            rst, [rst.sbu_subset_id == ri.sbu_subset_id], 'left_outer').join(
                mmf, [mmf.material == ri.mmf_material],
                'left_outer').join(dmat, [
                    dmat.billing_system == ri.system_id, dmat.material_id
                    == ri.ship_mat1_id, dmat.end_market_or_prime == F.when(
                        ri.prime_flag == 1, 'Prime').otherwise('Non-Prime')
                ], 'left_outer').join(cmf, [
                    F.upper(F.trim(cmf.sold_to_ship_to))
                    == ri.commercial_print_customer_key,
                    F.upper(F.trim(cmf.cmf_end_market)) == F.upper(
                        dmat.end_market)
                ], 'left_outer').join(srtr, [
                    srtr.sales_rep_id == cmf.sales_rep_id_override
                ], 'left_outer').join(cerd, [
                    cerd.currency_code_from == ri.currency_id, cerd.cur_year
                    == ri.inv_year, cerd.cur_month == ri.inv_month
                ], 'left_outer').join(cers, [
                    cers.std_currency_code_from == ri.std_cost_currency_id,
                    cers.std_cur_year == ri.inv_year, cers.std_cur_month
                    == ri.inv_month
                ], 'left_outer').join(dcust_sold, [
                    dcust_sold.billing_system == ri.system_id,
                    dcust_sold.customer_id == ri.sold_customer_id
                ], 'left_outer').join(dcust_ship, [
                    dcust_ship.billing_system == ri.system_id,
                    dcust_ship.customer_id == ri.ship_customer_id
                ], 'left_outer').join(dcust_brand, [
                    dcust_brand.billing_system == ri.system_id,
                    dcust_brand.customer_id == ri.brand_owner
                ], 'left_outer').join(dloc_ship, [
                    dloc_ship.location_id == ri.ship_location_id
                ], 'left_outer').join(
                    dloc_inv, [dloc_inv.location_id == ri.mfg_location_id],
                    'left_outer').join(edataA, [
                        edataA.edataA_sold_customer_id
                        == ri.sold_customer_id_lstrip_0, ri.system_id == 'S3',
                        ri.rev_acct_id == 'R6000'
                    ], 'left_anti').join(
                        rsrt, [rsrt.sales_rep_id == ri.ri_sales_rep_id],
                        'left_outer').select(
                            ri.system_id, ri.invoice_id, ri.line_number,
                            ri.month, ri.source_type, ri.rev_acct_id,
                            ri.weight_qty, ri.currency_id,
                            ri.std_cost_currency_id, ri.inv_date,
                            ri.quality_class, ri.sale_type,
                            ri.invoice_line_value, ri.line_qty,
                            ri.invoice_uom_id, ri.inv_line_std_cost, ri.period,
                            ri.year, ri.sales_order, ri.ri_sales_rep_id,
                            ri.line_desc1, rst.med_desc, mmf.cp_subset,
                            cmf.channel, cmf.drop_ship_into_stock,
                            cmf.sales_rep_override, cmf.cmf_end_market,
                            cmf.sales_rep_id_override,
                            cerd.conversion_rate_multiplier,
                            cers.std_conversion_rate_multiplier,
                            dmat.ship1_dim_material_id, dmat.product_code,
                            dmat.force_product_code, dmat.nominal_basis_weight,
                            dmat.material_id, dmat.end_market,
                            dloc_ship.ship_from_dim_location_id,
                            dloc_inv.invoice_dim_location_id,
                            dcust_ship.ship_dim_customer_id,
                            dcust_sold.sold_dim_customer_id,
                            dcust_brand.brand_dim_customer_id,
                            rsrt.sales_rep_original, srtr.region,
                            ri.invoice_volume))

        df = df.where(
            "case when system_id = 'S3' then product_code else '~' end not in ('SC', 'CR')"
        )

        df = df.withColumn('iptmeta_source_system', F.lit('dataA'))
        df = df.withColumn('bol_number', F.lit(MISSING_NUMBER))

        df = df.withColumn(
            'product_sold_flag',
            F.when((df.weight_qty.isNull()) | (df.weight_qty == 0),
                   F.lit('N')).otherwise(F.lit('Y')))

        df = df.withColumn(
            'fx_conversion_to_usd',
            F.coalesce(
                F.when(df.currency_id == 'USD', 1).otherwise(
                    df.conversion_rate_multiplier.cast(T.DoubleType())),
                F.lit(MISSING_NUMBER)))

        df = df.withColumn(
            'std_fx_conversion_to_usd',
            F.coalesce(
                F.when(df.std_cost_currency_id == 'USD', 1).otherwise(
                    df.std_conversion_rate_multiplier.cast(T.DoubleType())),
                F.lit(MISSING_NUMBER)))

        df = df.withColumn('grade', df.product_code)

        df = df.withColumn('invoice_date', F.to_date(df.inv_date))

        df = prime_enrich(df)

        df = df.withColumn('sales_order_number',
                           F.coalesce(df.sales_order, F.lit('0')))

        df = df.withColumn(
            'sale_type',
            F.when(df.sale_type == 'I', F.lit('Internal')).when(
                df.sale_type == 'E',
                F.lit('External')).otherwise(df.sale_type))

        df = df.withColumn(
            'subset',
            F.coalesce(df.cp_subset, df.med_desc, F.lit(NOT_APPLICABLE_DESC)))

        df = (
            df.withColumn(
                'claims',
                F.when(df.rev_acct_id.isin('R4900', 'R4350'),
                       df.invoice_line_value * df.fx_conversion_to_usd).
                otherwise(MISSING_NUMBER)).withColumn(
                    'discounts',
                    F.when(df.rev_acct_id.isin('R4500'),
                           df.invoice_line_value *
                           df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)).
            withColumn("freight_invoice_calc", F.lit('actual')).withColumn(
                'freight_invoice',
                F.when(df.rev_acct_id.isin('R8200'),
                       df.invoice_line_value * df.fx_conversion_to_usd).
                otherwise(MISSING_NUMBER)).withColumn(
                    'freight_upcharge',
                    F.when(df.rev_acct_id.isin('R0300'),
                           df.invoice_line_value *
                           df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)).
            withColumn(
                'gross_price',
                F.when(df.rev_acct_id.isin('R0100', 'R0500', 'R0700', 'R0105'),
                       df.invoice_line_value * df.fx_conversion_to_usd).
                otherwise(MISSING_NUMBER)).withColumn(
                    'other_deductions',
                    F.when(df.rev_acct_id.isin('R5300'),
                           df.invoice_line_value *
                           df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)).
            withColumn(
                'standard_cost',
                F.coalesce(
                    df.inv_line_std_cost *
                    df.std_fx_conversion_to_usd,
                    F.lit(MISSING_NUMBER))).withColumn(
                        'rebates',
                        F.when(
                            df.rev_acct_id.isin(
                                'R4110', 'R4130'), df.invoice_line_value *
                            df.fx_conversion_to_usd).otherwise(MISSING_NUMBER))
            # TODO Confirm exclusions and/or data predicate should be here
            .withColumn(
                'service_allowances',
                F.when(df.rev_acct_id.isin('R6000'), df.invoice_line_value *
                       df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)))

        df = df.withColumn(
            'msf',
            F.when(df.invoice_uom_id == 'MSF',
                   df.line_qty).when(df.invoice_uom_id == 'M2',
                                     df.line_qty * .0107639).otherwise(0))

        df = df.withColumn('nominal_tons',
                           df.nominal_basis_weight * df.msf / 2000)

        df = df.withColumn(
            'net_price',
            df.gross_price + df.discounts + df.rebates + df.claims +
            df.freight_upcharge + df.other_deductions + df.service_allowances)

        df = df.withColumn(
            'standard_gross_margin',
            df.net_price - (df.standard_cost + df.freight_invoice))

        df = dataA_sales_rep_override(df)
        df = df.withColumn(
            'sales_rep_id',
            F.coalesce(df.sales_rep_id_override, df.ri_sales_rep_id,
                       F.lit(MISSING_NUMBER)))

        df = (
            df.withColumn(
                'ship_from_dim_location_id',
                F.coalesce(
                    df.ship_from_dim_location_id,
                    F.lit(MISSING_STRING_ID))).withColumn(
                        'invoice_dim_location_id',
                        F.coalesce(
                            df.invoice_dim_location_id,
                            F.lit(MISSING_STRING_ID))).withColumn(
                                'ship1_dim_material_id',
                                F.coalesce(
                                    df.ship1_dim_material_id,
                                    F.lit(MISSING_STRING_ID))).withColumn(
                                        'channel',
                                        F.coalesce(
                                            df.channel,
                                            F.lit(MISSING_DESC))).withColumn(
                                                'drop_ship_into_stock',
                                                F.coalesce(
                                                    df.drop_ship_into_stock,
                                                    F.lit(MISSING_DESC))).
            withColumn('region', F.coalesce(
                df.region, F.lit(MISSING_DESC))).withColumn(
                    'ship_dim_customer_id',
                    F.coalesce(
                        df.ship_dim_customer_id,
                        F.lit(MISSING_STRING_ID))).withColumn(
                            'sold_dim_customer_id',
                            F.coalesce(
                                df.sold_dim_customer_id,
                                F.lit(MISSING_STRING_ID))).withColumn(
                                    'brand_dim_customer_id',
                                    F.coalesce(
                                        df.brand_dim_customer_id,
                                        F.lit(MISSING_STRING_ID))).withColumn(
                                            'invoice_period',
                                            F.lpad(df.month, 6, '0')))

        df = (df.withColumnRenamed(
            'system_id', 'billing_system').withColumnRenamed(
                'rev_acct_id', 'invoice_line_code').withColumnRenamed(
                    'invoice_id', 'invoice_number').withColumnRenamed(
                        'line_number',
                        'invoice_line_number').withColumnRenamed(
                            'source_type',
                            'invoice_source_type').withColumnRenamed(
                                'channel',
                                'commercial_print_channel').withColumnRenamed(
                                    'drop_ship_into_stock',
                                    'commercial_print_mode').withColumnRenamed(
                                        'region', 'commercial_print_region').
              withColumnRenamed('currency_id',
                                'invoiced_currency').withColumnRenamed(
                                    'weight_qty',
                                    'actual_tons').withColumnRenamed(
                                        'period',
                                        'report_month').withColumnRenamed(
                                            'year',
                                            'report_year').withColumnRenamed(
                                                'line_desc1',
                                                'invoice_line_desc_1'))

        df = df.select(
            df.billing_system, df.invoice_number, df.invoice_line_number,
            df.invoice_period, df.invoice_source_type, df.invoice_line_code,
            df.iptmeta_source_system, df.product_sold_flag,
            df.commercial_print_channel, df.commercial_print_mode,
            df.fx_conversion_to_usd, df.grade, df.invoice_date,
            df.ship_from_dim_location_id, df.invoiced_currency,
            df.ship1_dim_material_id, df.prime, df.sales_order_number,
            df.sale_type, df.sales_representative, df.ship_dim_customer_id,
            df.sold_dim_customer_id, df.brand_dim_customer_id, df.subset,
            df.actual_tons, df.claims, df.discounts, df.freight_invoice,
            df.freight_invoice_calc, df.freight_upcharge, df.gross_price,
            df.msf, df.net_price, df.nominal_tons, df.other_deductions,
            df.rebates, df.service_allowances, df.standard_cost,
            df.standard_gross_margin, df.invoice_dim_location_id,
            df.commercial_print_region, df.invoice_volume, df.invoice_uom_id,
            df.bol_number, df.report_month, df.report_year, df.sales_rep_id,
            df.invoice_line_desc_1)

        return df
Example #22
def generate_data(sale_hdfs_path, store_path):

    spark = SparkSession.builder.\
    master("local[*]").\
    appName("spark_df").\
    config("spark.some.config.option", "some-value").\
    getOrCreate()

    # sale qty
    try:
        salesfile = "hdfs://10.230.0.11:8020" + sale_hdfs_path
        sale_df = spark.read.parquet(salesfile)
    except:
        salesfile = "hdfs://10.230.0.10:8020" + sale_hdfs_path
        sale_df = spark.read.parquet(salesfile)

    sele_col = ["store_id", "good_gid", "date", "fill_sale_qty"]
    sale_df = sale_df.select(*sele_col)
    # set_name = ["store_id","good_gid","date","fill_sale_qty"]
    # sale_df = sale_df.toDF(*set_name)
    #94626013

    from pyspark.sql.functions import lpad
    # df22 = sale_df.withColumn("store_id", lpad(sale_df['store_id'],6,'0')).withColumn("good_gid", sale_df['good_gid'].cast('string')).cache()
    df22 = sale_df.withColumn("store_id",
                              lpad(sale_df['store_id'], 6, '0')).withColumn(
                                  "good_gid",
                                  sale_df['good_gid'].cast('string'))

    # store

    try:
        storefile = "hdfs://10.230.0.11:8020" + store_path
        store_df = spark.read.parquet(storefile)
    except Exception:
        storefile = "hdfs://10.230.0.10:8020" + store_path
        store_df = spark.read.parquet(storefile)

    # store_path = "/home/test/nby/data_dir/storeinfo.csv"
    # store_df = spark.read.format("com.databricks.spark.csv").option("inferSchema","true").option("header","true").load(store_path)
    store_col = ["store_code", "wh_name", "alcscm"]
    store_df = store_df.select(*store_col)
    store_set = ["store_id", "warename", "ALCSCM"]
    store_df = store_df.toDF(*store_set)
    from pyspark.sql import functions as F
    store_df = store_df.withColumn(
        'ALCSCM',
        F.when(store_df['ALCSCM'] == '-', '01').otherwise(store_df['ALCSCM']))

    store_df = store_df.withColumn("store_id",
                                   lpad(store_df['store_id'], 6, '0'))
    df22_join = df22.join(store_df, df22.store_id == store_df.store_id,
                          "inner").drop(store_df.store_id)

    from pyspark.sql.functions import concat, col, lit
    from pyspark.sql.functions import year, month, dayofmonth
    df22_join = df22_join.withColumn(
        "date", concat(year("date"), lit("-"), lpad(month("date"), 2, '0')))

    # 89847788

    # 3148347
    # count store
    df22_join_uniq = df22_join.dropDuplicates(['store_id', 'good_gid', 'date'])
    # 14467
    df22_join_uniq_gb = df22_join_uniq.groupBy("good_gid", "date",
                                               "ALCSCM").count()
    # sum qty
    df22_join_sum = df22_join.groupBy(
        "good_gid", "date", "ALCSCM",
        "warename").sum("fill_sale_qty").withColumnRenamed(
            'sum(fill_sale_qty)', 'sum_sale_qty')

    # df22_join_uniq_gb.filter("good_gid =='30018144' AND date =='2017-09'").show()
    df22_join_storecount = df22_join_sum.alias("a").join(
        df22_join_uniq_gb.alias("b"),
        (df22_join_sum.good_gid == df22_join_uniq_gb.good_gid) &
        (df22_join_sum.ALCSCM == df22_join_uniq_gb.ALCSCM) &
        (df22_join_sum.date == df22_join_uniq_gb.date),
        "left_outer").select("a.good_gid", "a.date", "a.ALCSCM", "a.warename",
                             "a.sum_sale_qty", "b.count")
    # df22_join.groupBy("good_gid","date","ALCSCM").agg(sum("fill_sale_qty").alias("sum_sale_qty"),func.countDistinct("good_gid","date","ALCSCM")).show()
    sale_data_top100 = df22_join_storecount.toPandas()
    sale_data_top100['good_gid'] = sale_data_top100['good_gid'].astype(str)
    sale_data_top100 = sale_data_top100.sort_values(
        by=['warename', 'good_gid', 'date'], ascending=True)
    sale_data_top100['ALCSCM'] = sale_data_top100['ALCSCM'].astype(str).map(
        lambda x: x.zfill(2))
    sale_data_top100['sale_qty_perstore'] = sale_data_top100[
        'sum_sale_qty'] / sale_data_top100['count']
    sale_data_top100['uniq_ID'] = sale_data_top100[
        'good_gid'] + '_' + sale_data_top100['ALCSCM']
    sale_data_top100 = sale_data_top100.loc[sale_data_top100.ALCSCM != '09', :]

    return sale_data_top100
Ejemplo n.º 23
0
# for the cols. However, if you want to specify a custom schema, you can use the approach below. A benefit of this
# approach is that you might have, through your pre-processing data validation stage, identified the data types
# best suited to your dataset. Additionally, you can pass in the lists below from various sources, including as
# parameters to your pipeline.
to_int_cols = [
    'WindSpeed', 'WindDirection', 'WindGust', 'Pressure',
    'SignificantWeatherCode'
]
to_long_cols = ['ForecastSiteCode', 'Visibility']
to_date_cols = ['ObservationDate']
to_double_cols = ['ScreenTemperature', 'Latitude', 'Longitude']
# the assumption was that time fields in the weather datasets were of int type and required
# formatting into an HH:mm:ss time string
clean_df = clean_df.withColumn(
    'ObservationTime',
    F.lpad(clean_df['ObservationTime'], 4, '0').substr(3, 4))
clean_df = clean_df.withColumn('ObservationTime', F.rpad(clean_df['ObservationTime'], 6, '0')).\
    withColumn("ObservationTime", (F.regexp_replace('ObservationTime',"""(\d\d)""", "$1:")).substr(0,8))

# clean_df.select('ObservationTime').distinct().show()
# using a cast function from spark to modify the data types
for col in clean_df.columns:
    try:
        if col in to_int_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('int'))
        elif col in to_long_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('long'))
        elif col in to_date_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('date'))
        elif col in to_double_cols:
            clean_df = clean_df.withColumn(col, F.col(col).cast('double'))
    except Exception:
        # if a cast cannot be applied, leave the column unchanged
        # (assumed fallback; the original snippet is truncated here)
        pass
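# A quick, purely illustrative way to confirm the casts took effect:
# clean_df.printSchema()
# clean_df.select('ObservationDate', 'ScreenTemperature', 'WindSpeed').show(5)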
Ejemplo n.º 24
0
def collect_orderby(sdf,
                    cols,
                    groupby,
                    orderby=None,
                    suffix='__collect',
                    sep='____',
                    orderby_func={},
                    dtype=StringType(),
                    ascending=True,
                    drop_null=True):
    # Null filling is not handled for now; when orderby is None, null values are dropped
    """
    Paramter:
    ----------
    sdf: pyspark dataframe to be processed
    cols: str/list of the sdf'cols to be processed
    groupby: str/list of sdf' cols to be groupbyed when collect_orderby
    orderby: str/list of sdf' cols to be orderbyed when collect_orderby
    suffix: str of cols' names converted bycollect_orderby(renamed by cols+suffix)
    sep: str of the sep when concat_ws(don't change by default)
    dtype: pyspark.sql.types of the return values
    Return:
    ----------
    sdf: pyspark dataframe of collect_list orderby
    Example:
    ----------
    sdf=collect_orderby(sdf,cols,groupby='user_id',orderby='time')
    """
    # cols: columns to collect_list
    # groupby: pass [] when there is no grouping column
    # orderby: must be string/int/float columns
    assert not orderby_func or orderby
    orderby_agg_func = []
    orderby_agg_cols = []
    # used to unify column names when an orderby column that is also collected is not a string type
    orderby_copy_cols_dict, orderby_recover_cols_dict = {}, {}
    if not isinstance(cols, list):
        cols = [cols]
    if not isinstance(groupby, list):
        groupby = [groupby]
    if orderby is None:
        orderby = []
        orderby_func = {}
    if not isinstance(orderby, list):
        orderby = [orderby]
    # if an orderby column is also being collected and is not a string type, copy it first;
    # otherwise the padding below would turn a value like 1 into '01'
    for i, c in enumerate(orderby):
        if c in cols and dict(sdf.select(orderby).dtypes)[c] != 'string':
            c_orderby = f"{c}{sep}orderby"
            sdf = sdf.withColumn(c_orderby, F.col(c))
            orderby[i] = c_orderby
            orderby_copy_cols_dict[c_orderby] = c
    if not isinstance(orderby_func, dict):
        if not isinstance(orderby_func, list):
            orderby_func = [orderby_func]
        orderby_func = dict(zip(orderby, [orderby_func] * len(orderby)))
    if not drop_null:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1]
                if len(i.split(sep)) > 1 else None  # NOTE: breaks if the original values contain sep
                for i in x
            ],
            ArrayType(dtype))
    else:
        split_udf = F.udf(
            lambda x: [
                i.split(sep)[-1]  # NOTE: breaks if the original values contain sep
                for i in x if len(i.split(sep)) > 1
            ],
            ArrayType(dtype))
    for c in [
            k for k, v in sdf.dtypes if k in cols
            and len(re.findall(re.compile(r'^(array|vector)'), v)) > 0
    ]:
        logstr(f'{c} cast to StringType')
        sdf = sdf.withColumn(c, cast2str_udf(c))  # cast array/vector columns to StringType() first
    logstr('orderby', orderby)
    if len(orderby) != 0:
        # process orderby_func
        for c, f_list in orderby_func.items():
            if not isinstance(f_list, list):
                f_list = [f_list]
            for i, f in enumerate(f_list):
                if c not in orderby:
                    continue
                if isinstance(f, str):
                    f = f_list[i] = eval(f"F.{f}")
                key = f"{c}{sep}{f.__name__}"
                orderby_agg_func.append(f(c).alias(key))
                orderby_agg_cols.append(key)
                if c in orderby_copy_cols_dict:
                    orderby_recover_cols_dict[
                        key] = f"{orderby_copy_cols_dict[c]}{sep}{f.__name__}"
        # handle non-string orderby columns
        order_int_list = [
            k for k, v in sdf.dtypes
            if k in orderby and len(re.findall(re.compile(r'(int)'), v)) > 0
        ]
        order_float_list = [
            k for k, v in sdf.dtypes if k in orderby
            and len(re.findall(re.compile(r'(float|double)'), v)) > 0
        ]
        if order_int_list:
            logstr('order_int_list', order_int_list)
            order_int_max_df = sdf.select(order_int_list).agg(
                *[F.max(c).alias(c) for c in order_int_list]).toPandas()
            order_int_max_dict = dict(
                zip(order_int_max_df.keys(),
                    order_int_max_df.values.flatten().tolist()))
            logstr('order_int_max_dict', order_int_max_dict)
            for c in order_int_list:
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()),
                        len(str(order_int_max_dict[c])), '0'))
        if order_float_list:
            logstr('order_float_list', order_float_list)
            for c in order_float_list:
                sdf = sdf.withColumn(c, F.col(c).cast(StringType()))
                max_df = sdf.select(F.split(c, r"\.").alias(c)).select([
                    F.length(F.col(c)[i]).alias(c + f"__{i}") for i in range(2)
                ]).agg(*[
                    F.max(c + f"__{i}").alias(c + f"__{i}") for i in range(2)
                ]).toPandas()
                max_dict = dict(
                    zip(max_df.keys(),
                        max_df.values.flatten().tolist()))
                logstr('max_dict', max_dict)
                sdf = sdf.withColumn(
                    c,
                    F.lpad(
                        F.col(c).cast(StringType()), max_dict[c + "__0"],
                        '0')).withColumn(
                            c,
                            F.rpad(
                                F.col(c).cast(StringType()),
                                max_dict[c + "__1"], '0'))
        agg_fun_list = [
            F.sort_array(F.collect_list(f"%s{sep}{c}" % '_'.join(orderby)),
                         asc=ascending).alias(c + '_temp') for c in cols
        ]
        # null handling here is still not ideal: missing values show up like ['a', , 'b'] in the result
        sdf = sdf.select([
            F.concat_ws(sep, *orderby, c).alias(f"%s{sep}{c}" %
                                                '_'.join(orderby))
            for c in cols
        ] + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select(
            [split_udf(c + '_temp').alias(c + suffix)
             for c in cols] + orderby_agg_cols + groupby)
    else:
        agg_fun_list = [F.collect_list(c).alias(c + '_temp') for c in cols]
        sdf = sdf.select(cols + groupby + orderby)
        sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func))
        sdf = sdf.select([
            F.col(c + '_temp').cast(ArrayType(dtype)).alias(c + suffix)
            for c in cols
        ] + orderby_agg_cols + groupby)
    for c1, c2 in orderby_recover_cols_dict.items():
        sdf = sdf.withColumnRenamed(c1, c2)
    return sdf
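# A minimal usage sketch for collect_orderby (illustrative only; it assumes an active SparkSession
# named `spark` and that the helpers used above, e.g. logstr and cast2str_udf, are importable):
#
#   data = [('u1', 2, 'b'), ('u1', 1, 'a'), ('u2', 1, 'c')]
#   sdf = spark.createDataFrame(data, ['user_id', 'time', 'item'])
#   out = collect_orderby(sdf, 'item', groupby='user_id', orderby='time')
#   out.show()   # item__collect for u1 is ['a', 'b'], ordered by time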
Ejemplo n.º 25
0
pd.set_option('display.width', 110)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 500)

print("Start %d Cities Cluster Test, NSlaves = %d" % (NCities, NSlaves), flush=True)
print("Execution time #0 %f" % (time.time() - startTime),flush=True)

# ## Read two mapping files into dataframes
# - Read files from Amazon S3 bucket into Spark dataframes
# - Format columns as required to enable joins to dataset below

# read and process city FIPS to county FIPS mapping file
city_to_fips = spark.read.format("org.apache.spark.csv").option("header","true") \
                    .csv(latlonFilename)
#                          .csv("/home/ubuntu/project/data/uscitiesLatLongFIPS.csv")

city_to_fips = city_to_fips.withColumn("county_FIPS", f.lpad(city_to_fips['county_FIPS'],5,"0"))
city_to_fips = city_to_fips.drop("city","zip","id","source","population")
city_to_fips = city_to_fips.withColumn("city_ascii", f.regexp_replace('city_ascii', 'Saint', 'St.'))
city_to_fips = city_to_fips.withColumnRenamed("city_ascii","CityName") \
                           .withColumnRenamed("state_name","StateDesc") \
                           .withColumnRenamed("county_FIPS","FIPS")

print((city_to_fips.count(), len(city_to_fips.columns)))
city_to_fips.limit(5).toPandas()


# read and process commuting zone to county FIPS mapping file
cz_to_fips = spark.read.format("org.apache.spark.csv").option("header","true").option("delimiter", "\t") \
                  .csv(lmaFilename)
#                        .csv("/home/ubuntu/project/data/1990LMAascii.csv")
    
Ejemplo n.º 26
0
# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import translate
Ejemplo n.º 27
0
    def execute(self, spark: SparkSession, args: Namespace):

        # select and cleanup the raw events for the data date
        events = spark \
            .table(f'{args.input_database_name}.{self.partner}') \
            .withColumnRenamed('impressionid', 'impression_id') \
            .withColumnRenamed('sourceid', 'measurement_source_id') \
            .withColumnRenamed('advertiserid', 'partner_measured_advertiser_id') \
            .withColumnRenamed('campaignid', 'partner_measured_campaign_id') \
            .withColumn('partner_measured_channel_id', lit(None).cast(LongType())) \
            .withColumnRenamed('placementid', 'partner_measured_placement_id') \
            .withColumnRenamed('creativeid', 'partner_measured_creative_id') \
            .withColumnRenamed('mediatypeid', 'media_type_id') \
            .withColumnRenamed('inview50', 'in_view_50') \
            .withColumnRenamed('inview100', 'in_view_100') \
            .withColumnRenamed('noncontinuousplaytime', 'non_continuous_play_time') \
            .withColumnRenamed('audibleplaytime', 'audible_play_time') \
            .withColumnRenamed('fullscreen', 'full_screen') \
            .withColumnRenamed('fraudcategory', 'fraud_category') \
            .withColumnRenamed('givt', 'general_invalid') \
            .filter(get_source_partitions_condition(args.data_date, args.data_date_tz_offset)) \
            .dropDuplicates(['impression_id', 'type', 'timestamp'])

        # assemble ad sessions
        adsessions = events \
            .groupBy('impression_id') \
            .agg(
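            # derive the session's local reporting hour: shift the stored UTC hour by the job's
            # timezone offset (seconds -> hours), wrap modulo 24, and zero-pad to two characters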
            first(lpad(
                (
                    (
                        col('utchour').cast(IntegerType())
                        + lit(24 + args.data_date_tz_offset / 3600).cast(IntegerType())
                    ) % lit(24)
                ).cast(StringType()),
                2, '0'
            )).alias('hour'),
            collect_set('type').alias('events'),
            _event_prop('impression', col('site')).alias('site'),
            _event_prop('impression', col('measurement_source_id')).alias('measurement_source_id'),
            _event_prop('impression', col('partner_measured_advertiser_id')) \
                .alias('partner_measured_advertiser_id'),
            _event_prop('impression', col('partner_measured_campaign_id')).alias('partner_measured_campaign_id'),
            _event_prop('impression', col('partner_measured_channel_id')).alias('partner_measured_channel_id'),
            _event_prop('impression', col('partner_measured_placement_id')).alias('partner_measured_placement_id'),
            _event_prop('impression', col('partner_measured_creative_id')).alias('partner_measured_creative_id'),
            _event_prop('impression', col('media_type_id')).alias('media_type_id'),
            _event_prop('impression', col('endpoint')).alias('endpoint'),
            _event_prop('impression', col('suspicious')).alias('suspicious'),
            _event_prop('impression', col('general_invalid')).alias('general_invalid'),
            _event_prop('impression', col('fraud_category')).alias('fraud_category'),
            _event_prop('impression', (col('measurable') == 1) | (col('measurable') == 3)) \
                .alias('mrc_measurable'),
            _event_prop('impression', (col('measurable') == 2) | (col('measurable') == 3)) \
                .alias('groupm_measurable'),
            first(
                when(
                    (
                        ((col('endpoint') == 'display') & (col('type') == 'impression'))
                        | ((col('endpoint') == 'video') & (col('type') == 'started'))
                    ),
                    col('in_view_50')
                ),
                ignorenulls=True
            ).alias('mrc_in_view_on_load'),
            _event_prop('unload', col('in_view_50')).alias('mrc_in_view_on_unload'),
            _event_prop('first-quartile', col('in_view_50')).alias('mrc_in_view_1q'),
            _event_prop('second-quartile', col('in_view_50')).alias('mrc_in_view_2q'),
            _event_prop('thrid-quartile', col('in_view_50')).alias('mrc_in_view_3q'),
            _event_prop('completed', col('in_view_50')).alias('mrc_in_view_4q'),
            _event_prop('started', col('in_view_100')).alias('groupm_in_view_on_load'),
            _event_prop('unload', col('in_view_100')).alias('groupm_in_view_on_unload'),
            _event_prop('first-quartile', col('in_view_100')).alias('groupm_in_view_1q'),
            _event_prop('second-quartile', col('in_view_100')).alias('groupm_in_view_2q'),
            _event_prop('thrid-quartile', col('in_view_100')).alias('groupm_in_view_3q'),
            _event_prop('completed', col('in_view_100')).alias('groupm_in_view_4q'),
            ((smax('non_continuous_play_time') - smax('audible_play_time')) > lit(1000)).alias('muted'),
            _event_prop('impression', col('full_screen')).alias('full_screen')
        ) \
            .filter(array_contains(col('events'), 'impression')) \
            .withColumn('date', lit(''.join(args.data_date.split('-')))) \
            .withColumn('mrc_viewable', (
                col('mrc_measurable') & (
                    ((col('endpoint') == 'display') & array_contains(col('events'), 'in-view'))
                    | ((col('endpoint') == 'video') & array_contains(col('events'), 'in-view-50'))
                )
            )) \
            .withColumn('groupm_viewable', (
                col('groupm_measurable') & array_contains(col('events'), '100-percent-in-view')
            )) \
            .withColumn('clean', (
                ~col('suspicious') & ~col('general_invalid')
            )) \
            .select(
            'date',
            'hour',
            'impression_id',
            'site',
            'measurement_source_id',
            'partner_measured_advertiser_id',
            'partner_measured_campaign_id',
            'partner_measured_channel_id',
            'partner_measured_placement_id',
            'partner_measured_creative_id',
            'media_type_id',
            lit(None).cast(BooleanType()).alias('below_the_fold'),
            lit(None).cast(BooleanType()).alias('on_the_fold'),
            lit(None).cast(BooleanType()).alias('above_the_fold'),
            lit(None).cast(IntegerType()).alias('time_on_page'),
            lit(None).cast(IntegerType()).alias('in_view_time'),
            (col('clean') & col('mrc_viewable')).alias('in_view'),
            (col('clean') & array_contains(col('events'), 'in-view-5s')).alias('in_view_5s'),
            (col('clean') & array_contains(col('events'), 'in-view-15s')).alias('in_view_15s'),
            (col('clean') & (
                ((col('endpoint') == 'display') & ~array_contains(col('events'), 'in-view'))
                | ((col('endpoint') == 'video') & ~array_contains(col('events'), 'in-view-50'))
            )).alias('not_in_view'),
            lit(None).cast(BooleanType()).alias('never_in_view'),
            (col('clean') & col('mrc_measurable') & col('mrc_in_view_on_load')).alias('in_view_load'),
            (col('clean') & col('mrc_measurable') & col('mrc_in_view_on_unload')).alias('in_view_unload'),
            (col('clean') & array_contains(col('events'), 'first-quartile')).alias('completed_1q'),
            (col('clean') & col('mrc_measurable') & col('mrc_in_view_1q')).alias('in_view_1q'),
            (col('clean') & array_contains(col('events'), 'second-quartile')).alias('completed_2q'),
            (col('clean') & col('mrc_measurable') & col('mrc_in_view_2q')).alias('in_view_2q'),
            (col('clean') & array_contains(col('events'), 'thrid-quartile')).alias('completed_3q'),
            (col('clean') & col('mrc_measurable') & col('mrc_in_view_3q')).alias('in_view_3q'),
            (col('clean') & array_contains(col('events'), 'completed')).alias('completed_4q'),
            (col('clean') & col('mrc_measurable') & col('mrc_in_view_4q')).alias('in_view_4q'),
            (col('clean') & ~array_contains(col('events'), 'started')).alias('never_started'),
            (col('clean') & col('muted')).alias('muted'),
            (col('clean') & col('full_screen')).alias('full_screen'),
            lit(False).alias('click_through'),
            (~col('clean') & col('mrc_viewable')).alias('sivt_in_view'),
            (~col('clean') & ~col('mrc_viewable')).alias('sivt_not_in_view'),
            lit(None).cast(IntegerType()).alias('groupm_time_on_page'),
            lit(None).cast(IntegerType()).alias('groupm_in_view_time'),
            (col('clean') & col('groupm_viewable')).alias('groupm_in_view'),
            lit(False).alias('groupm_in_view_5s'),
            lit(False).alias('groupm_in_view_15s'),
            (col('clean') & ~col('groupm_viewable')).alias('groupm_not_in_view'),
            lit(False).alias('groupm_never_in_view'),
            (col('clean') & col('groupm_measurable') & col('groupm_in_view_on_load')).alias('groupm_in_view_load'),
            (col('clean') & col('groupm_measurable') & col('groupm_in_view_on_unload')) \
                .alias('groupm_in_view_unload'),
            (col('clean') & array_contains(col('events'), 'first-quartile')).alias('groupm_completed_1q'),
            (col('clean') & col('groupm_measurable') & col('groupm_in_view_1q')).alias('groupm_in_view_1q'),
            (col('clean') & array_contains(col('events'), 'second-quartile')).alias('groupm_completed_2q'),
            (col('clean') & col('groupm_measurable') & col('groupm_in_view_2q')).alias('groupm_in_view_2q'),
            (col('clean') & array_contains(col('events'), 'thrid-quartile')).alias('groupm_completed_3q'),
            (col('clean') & col('groupm_measurable') & col('groupm_in_view_3q')).alias('groupm_in_view_3q'),
            (col('clean') & array_contains(col('events'), 'completed')).alias('groupm_completed_4q'),
            (col('clean') & col('groupm_measurable') & col('groupm_in_view_4q')).alias('groupm_in_view_4q'),
            (col('clean') & ~array_contains(col('events'), 'started')).alias('groupm_never_started'),
            (col('clean') & col('muted')).alias('groupm_muted'),
            (col('clean') & col('full_screen')).alias('groupm_full_screen'),
            lit(False).alias('groupm_click_through'),
            (~col('clean') & col('groupm_viewable')).alias('groupm_sivt_in_view'),
            (~col('clean') & ~col('groupm_viewable')).alias('groupm_sivt_not_in_view'),
            'suspicious',
            lit(None).cast(BooleanType()).alias('measured'),
            lit(None).cast(BooleanType()).alias('groupm_measured'),
            'general_invalid',
            col('mrc_measurable').alias('viewability_measurement_trusted'),
            (col('suspicious') & (col('fraud_category') == lit('Sitting Duck'))).alias('sitting_duck_bot'),
            (col('suspicious') & (col('fraud_category') == lit('Standard'))).alias('standard_bot'),
            (col('suspicious') & (col('fraud_category') == lit('Volunteer'))).alias('volunteer_bot'),
            (col('suspicious') & (col('fraud_category') == lit('Profile'))).alias('profile_bot'),
            (col('suspicious') & (col('fraud_category') == lit('Masked'))).alias('masked_bot'),
            (col('suspicious') & (col('fraud_category') == lit('Nomadic'))).alias('nomadic_bot'),
            (col('suspicious') & (col('fraud_category') == lit('Other'))).alias('other_bot'),
            lit(None).cast(BooleanType()).alias('true_view_viewable'),
            lit(None).cast(BooleanType()).alias('true_view_measurable'),
            lit(None).cast(BooleanType()).alias('yahoo_gemini_billable'),
            (col('clean') & col('groupm_measurable') & array_contains(
                col('events'), '100-percent-in-view-2-seconds'
            )).alias('full_ad_in_view'),
            (col('clean') & col('groupm_measurable') & array_contains(
                col('events'), '100-percent-in-view-2-seconds'
            )).alias('publicis_in_view'),
            lit(None).cast(BooleanType()).alias('yahoo_gemini_billable_suspicious'),
            lit(None).cast(DoubleType()).alias('average_in_view_time'),
            lit(None).cast(BooleanType()).alias('in_view_lt_1s'),
            lit(None).cast(BooleanType()).alias('in_view_1s_2s'),
            lit(None).cast(BooleanType()).alias('in_view_2s_5s'),
            lit(None).cast(BooleanType()).alias('in_view_5s_10s'),
            lit(None).cast(BooleanType()).alias('in_view_10s_15s'),
            lit(None).cast(BooleanType()).alias('in_view_15s_20s'),
            lit(None).cast(BooleanType()).alias('in_view_20s_25s'),
            lit(None).cast(BooleanType()).alias('in_view_25s_30s'),
            lit(None).cast(BooleanType()).alias('in_view_30s_35s'),
            lit(None).cast(BooleanType()).alias('in_view_35s_40s'),
            lit(None).cast(BooleanType()).alias('in_view_40s_45s'),
            lit(None).cast(BooleanType()).alias('in_view_45s_50s'),
            lit(None).cast(BooleanType()).alias('in_view_ge_50s'),
            (col('mrc_measurable') | col('groupm_measurable') | ~col('clean')) \
                .alias('viewability_measured_or_fraud')
        )

        # save ad sessions
        adsessions \
            .write \
            .mode('overwrite') \
            .partitionBy('date', 'hour') \
            .format('avro') \
            .option('compression', 'snappy') \
            .save(
                f's3://{args.output_bucket_name}/{args.output_bucket_data_prefix}'
                f'/{self.partner}/'
            )
Ejemplo n.º 28
0
 temp = temp.withColumn('measure', lit('total distinct collectors'))
 temp = temp.withColumn('customer_type', lit('Overall'))
 totals = totals.union(temp.select('customer_type', 'total', 'measure'))
 totals.write.format('jdbc').options(
     url='jdbc:mysql://35.238.212.81:3306/assignment_db',
     driver='com.mysql.jdbc.Driver',
     dbtable='totals',
     user='******',
     password='******').mode('overwrite').save()
 # Make the transaction date format consistent ('/' vs '-') and replace the day part of trans_dt
 # with 01 so that monthly aggregates can be computed
 df_trans_date_slash = df.filter(df.trans_dt.contains('/'))
 split_col = split(df_trans_date_slash['trans_dt'], '/')
 df_trans_date_slash = df_trans_date_slash.withColumn(
     'month', split_col.getItem(0))
 df_trans_date_slash = df_trans_date_slash.withColumn(
     'month', lpad('month', 2, "0"))
 df_trans_date_slash = df_trans_date_slash.withColumn(
     'year', split_col.getItem(2))
 df_trans_date_slash = df_trans_date_slash.withColumn(
     'trans_dt', concat(col('month'), lit('-01-'), col('year')))
 df_trans_date_hyphen = df.filter(df.trans_dt.contains('-'))
 split_col = split(df_trans_date_hyphen['trans_dt'], '-')
 df_trans_date_hyphen = df_trans_date_hyphen.withColumn(
     'month', split_col.getItem(1))
 df_trans_date_hyphen = df_trans_date_hyphen.withColumn(
     'month', lpad('month', 2, "0"))
 df_trans_date_hyphen = df_trans_date_hyphen.withColumn(
     'year', split_col.getItem(0))
 df_trans_date_hyphen = df_trans_date_hyphen.withColumn(
     'trans_dt', concat(col('month'), lit('-01-'), col('year')))
 df = df_trans_date_hyphen.union(df_trans_date_slash)
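 # Worked example: '3/15/2019' -> month '03', year '2019' -> trans_dt '03-01-2019';
 # '2019-03-15' -> month '03', year '2019' -> trans_dt '03-01-2019', so both formats align on MM-01-YYYY.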
Ejemplo n.º 29
0
def generate_dim_date(spark, start_year=1901, number_years_out_from_start=300):
    """Create `dim_date` table containing various date feature columns.

    Args:
        spark (SparkSession): Instantiated SparkSession
        start_year (int): starting year for dim_date table.
        number_years_out_from_start (int): number out from `start_year` to increment.

    Returns:
        Spark DataFrame.
    """
    years = [start_year + i for i in range(number_years_out_from_start + 1)]
    months = [i for i in range(1, 13)]
    days = [i for i in range(1, 32)]

    years_df = spark.createDataFrame(
        pd.DataFrame({
            'year': years,
            'temp_join_key': '1'
        }))
    months_df = spark.createDataFrame(
        pd.DataFrame({
            'month': months,
            'temp_join_key': '1'
        }))
    days_df = spark.createDataFrame(
        pd.DataFrame({
            'day_of_month': days,
            'temp_join_key': '1'
        }))

    years_months_df = (years_df.join(months_df, ['temp_join_key'],
                                     how='inner'))

    years_month_days_df = (years_months_df.join(days_df, ['temp_join_key'],
                                                how='inner'))

    date_keys = (
        years_month_days_df.withColumn(
            'date',
            to_date(
                concat(col('year'), lpad(col('month'), 2, '0'),
                       lpad(col('day_of_month'), 2, '0')), 'yyyyMMdd'))
        # remove invalid dates
        .filter("date IS NOT NULL").withColumn(
            'date_key',
            regexp_replace(col('date').cast('string'), '-',
                           '').cast('integer')))

    date_features = (date_keys
                     # get `week` and `quarter`
                     .withColumn('week', weekofyear(col('date')))
                     .withColumn('quarter', quarter(col('date')))
                     # get `day_name` and `month_name`
                     .withColumn('day_name', date_format(col('date'), 'EEEE'))
                     .withColumn('month_name', date_format(col('date'), 'MMMM'))
                     # get `date_year`, `date_quarter`, `date_month`, `date_week`
                     .withColumn('date_week', expr("MIN(date) OVER(PARTITION BY week, year)"))
                     .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01'))
                     .withColumn('date_quarter', expr("MIN(date) OVER(PARTITION BY quarter, year)"))
                     .withColumn('date_year', date_format(col('date'), 'yyyy-01-01'))
                     # get `day_of_week`, `day_of_quarter`, `day_of_year`
                     .withColumn('day_of_week', dayofweek(col('date')))
                     .withColumn('day_of_quarter', datediff(col('date'), col('date_quarter')) + lit(1))
                     .withColumn('day_of_year', dayofyear(col('date')))
                     # get `weekend_flag`, `us_holiday_flag`, `business_day_flag`, `leap_year_flag`,
                     # `month_start_flag`, `month_end_flag`
                     .withColumn('weekend_flag', when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N'))
                     .withColumn('us_holiday_flag', pd_is_holiday_usa(col('date').cast('timestamp')))
                     .withColumn('us_biz_day_flag', when((col('weekend_flag') == lit('Y')) |
                                                         (col('us_holiday_flag') == lit('Y')), 'Y').otherwise('N'))
                     .withColumn('leap_year_flag',
                                 when(dayofmonth(last_day(concat(col('year'), lit('-02-01')).cast('date'))) == 29, 'Y')
                                 .otherwise('N'))
                     .withColumn('month_start_flag', when(col('day_of_month') == lit(1), 'Y').otherwise('N'))
                     .withColumn('month_end_flag', when(col('date') == last_day(col('date')), 'Y').otherwise('N'))
                     # get `pct_into_month`, `pct_into_quarter`, `pct_into_year`
                     .withColumn('pct_into_month',
                                 (col('day_of_month') / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)'))
                     .withColumn('date_quarter_end',
                                 when(col('quarter') == lit(1), concat(col('year'), lit('-03-31')))
                                 .when(col('quarter') == lit(2), concat(col('year'), lit('-06-30')))
                                 .when(col('quarter') == lit(3), concat(col('year'), lit('-09-30')))
                                 .when(col('quarter') == lit(4), concat(col('year'), lit('-12-31')))
                                 .otherwise(None)
                                 .cast('date'))
                     .withColumn('days_in_quarter', datediff(col('date_quarter_end'), col('date_quarter')) + lit(1))
                     .withColumn('pct_into_quarter',
                                 (col('day_of_quarter') / col('days_in_quarter')).cast('decimal(7, 6)'))
                     .withColumn('pct_into_year',
                                 (col('day_of_year') / when(col('leap_year_flag') == lit('Y'), 366.0).otherwise(365.0))
                                 .cast('decimal(7, 6)'))
                     # get seasons
                     .withColumn('season_northern',
                                 when(col('month').isin(12, 1, 2), 'Winter')
                                 .when(col('month').isin(3, 4, 5), 'Spring')
                                 .when(col('month').isin(6, 7, 8), 'Summer')
                                 .when(col('month').isin(9, 10, 11), 'Fall')
                                 .otherwise('UNKNOWN'))
                     .withColumn('season_southern',
                                 when(col('month').isin(6, 7, 8), 'Winter')
                                 .when(col('month').isin(9, 10, 11), 'Spring')
                                 .when(col('month').isin(12, 1, 2), 'Summer')
                                 .when(col('month').isin(3, 4, 5), 'Fall')
                                 .otherwise('UNKNOWN')))

    dim_date = (date_features.sort('date').select([
        'date_key', 'date', 'date_week', 'date_month', 'date_quarter',
        'date_year', 'day_of_week', 'day_of_month', 'day_of_quarter',
        'day_of_year', 'week', 'month', 'quarter', 'year', 'days_in_quarter',
        'day_name', 'month_name', 'season_northern', 'season_southern',
        'weekend_flag', 'us_holiday_flag', 'us_biz_day_flag',
        'month_start_flag', 'month_end_flag', 'leap_year_flag',
        'pct_into_month', 'pct_into_quarter', 'pct_into_year'
    ]))
    return dim_date
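# Illustrative usage sketch (assumes an active SparkSession `spark`, the pyspark.sql.functions used
# above already imported, and that `pd_is_holiday_usa` is a pandas UDF defined elsewhere):
#
#   dim_date = generate_dim_date(spark, start_year=2020, number_years_out_from_start=2)
#   dim_date.filter("us_holiday_flag = 'Y' AND year = 2021").select('date', 'day_name').show(5)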
Ejemplo n.º 30
0
def main():
    # Download data
    now = datetime.datetime.now().strftime("%m-%d-%H-%M-%S")
    folder = config['local_folder']
    os.system(f"rm -r {folder}")

    # Daily dataset
    ##big_table_name = 'vartefact.forecast_sprint4_add_dm_to_daily'

    # Promo dataset
    ##big_table_name = 'vartefact.forecast_sprint4_promo_mecha_v4'

    # Weekly dataset
    big_table_name = f"{config['database']}.forecast_sprint3_v10_flag_sprint4"
    file_name = 'test_to_delete_' + now
    download_data(folder=folder,
                  big_table_name=big_table_name,
                  file_name=file_name)

    # Preprocess the dataset
    big_table = file_name + '.csv'
    sql_table = big_table_name
    target_value = 'sales_qty_sum'
    dataset_name = 'dataset_test_weekly'
    preprocess(folder=folder,
               big_table=big_table,
               sql_table=sql_table,
               target_value=target_value,
               dataset_name=dataset_name)

    # Train the model
    desc = 'weekly_test'
    data_set1 = dataset_name + '_part1.pkl'
    data_set2 = dataset_name + '_part2.pkl'

    date_stop_train = config['date_stop_train']
    os.system(
        f"cp {config['config_folder']}/calendar.pkl {config['local_folder']}")
    # learning_rate = 0.3

    run_model(folder=folder,
              data_set1=data_set1,
              data_set2=data_set2,
              futur_prediction=True,
              date_stop_train=date_stop_train)
    # save csv as table
    print('saving csv as table')
    file_list = os.listdir(config['local_folder'])

    for f in file_list:
        if 'weekly_model_training_for_futur_predictions' in f:
            result_file_name = f
    result_csv_path = f"{config['local_folder']}/{result_file_name}/resulst_forecast_10w_on_the_fututre.csv"
    os.system(f"hadoop fs -rm resulst_forecast_10w_on_the_fututre.csv")
    os.system(f"hadoop fs -put -f {result_csv_path}")

    spark = SparkSession \
        .builder \
        .appName("Forecast_saveastable") \
        .config("spark.sql.warehouse.dir", warehouse_location) \
        .config("spark.num.executors", '10') \
        .config("spark.executor.memory", '15G') \
        .config("spark.executor.cores", '20') \
        .config("spark.sql.crossJoin.enable", True) \
        .enableHiveSupport() \
        .getOrCreate()

    sqlContext = SQLContext(spark)

    spark_df = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').load(f"resulst_forecast_10w_on_the_fututre.csv")
    split_col = pyspark.sql.functions.split(spark_df['full_item'], '_')
    spark_df = spark_df.withColumn('item_id', split_col.getItem(0))
    spark_df = spark_df.withColumn('sub_id', split_col.getItem(1))
    spark_df = spark_df.withColumn("item_id",
                                   spark_df["item_id"].cast(IntegerType()))
    spark_df = spark_df.withColumn("sub_id",
                                   spark_df["sub_id"].cast(IntegerType()))
    spark_df = spark_df.withColumn("week",
                                   spark_df["week"].cast(IntegerType()))
    spark_df = spark_df.withColumn(
        "train_mape_score", spark_df["train_mape_score"].cast(FloatType()))
    spark_df = spark_df.withColumn("predict_sales",
                                   spark_df["predict_sales"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "predict_sales_error_squared",
        spark_df["predict_sales_error_squared"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "predict_sales_max_confidence_interval",
        spark_df["predict_sales_max_confidence_interval"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "order_prediction", spark_df["order_prediction"].cast(FloatType()))
    spark_df = spark_df.withColumn("store_code",
                                   lpad(spark_df['store_code'], 3, '0'))
    spark_df = spark_df.withColumnRenamed('week', 'week_key')
    spark_df = spark_df.withColumnRenamed('predict_sales', 'sales_prediction')
    spark_df = spark_df.withColumnRenamed(
        'predict_sales_max_confidence_interval', 'max_confidence_interval')
    spark_df.write.mode('overwrite').saveAsTable(
        f"{config['database']}.result_forecast_10w_on_the_fututre")
    sql = f""" invalidate metadata {config['database']}.result_forecast_10w_on_the_fututre """
    impalaexec(sql)
    print('csv saved in the table')

    ############ forecast_sprint3_v10_flag_sprint4 sales information
    ## get max_week_key min_week_key
    big_table_name = f"{config['database']}.forecast_sprint3_v10_flag_sprint4"
    week_numbers_to_roll_median = 4
    sql_query = f"""
    select
        cast((cast(max(week_key) as int) - {week_numbers_to_roll_median})  as varchar(10)) as min_week_key,
        max(week_key) as max_week_key
    from {big_table_name}
    """
    week_keys = spark.sql(sql_query).toPandas()
    max_week = week_keys['max_week_key'].iloc[0]
    min_week = week_keys['min_week_key'].iloc[0]

    ## get median median_sales
    sql_query = f"""
    select sales_qty_sum, sub_id, week_key, store_code
    from {big_table_name}
    where week_key >= {min_week} 
    """
    df_from_sql = spark.sql(sql_query).toPandas()
    df_from_sql['sales_qty_sum'] = df_from_sql['sales_qty_sum'].astype(float)
    median_df = df_from_sql[['sales_qty_sum', 'sub_id',
                             'store_code']].groupby(['sub_id', 'store_code'
                                                     ]).median().reset_index()
    median_df = median_df.rename({'sales_qty_sum': 'median_sales'}, axis=1)
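    # median weekly sales per (sub_id, store_code) over the trailing weeks; it is merged into the
    # forecast below and used to cap sales_prediction / order_prediction via max_confidence_interval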
    ############ result_forecast_10w_on_the_fututre_all prediction result information
    results_table = f"{config['database']}.result_forecast_10w_on_the_fututre"
    sql_query = f"""
    select *
    from {results_table}
    """
    file_name = 'results'
    forecast_df = download_csv_by_spark(spark, sql_query, file_name)
    print('length before merge: ', len(forecast_df))
    key_columns = ['sub_id', 'store_code']
    for col in key_columns:
        median_df[col] = median_df[col].astype(float)
        forecast_df[col] = forecast_df[col].astype(float)
    forecast_df_w_sales = forecast_df.merge(median_df, how='left')
    print('after merge: ', len(forecast_df_w_sales))
    forecast_df_w_sales.loc[:, 'forecast_min_median'] = forecast_df_w_sales[[
        'median_sales', 'order_prediction'
    ]].max(axis=1)
    forecast_df_w_sales.loc[(forecast_df_w_sales['sales_prediction'] > 7)
                            & (forecast_df_w_sales['forecast_min_median'] >
                               forecast_df_w_sales['median_sales']),
                            'max_order_3xmedian'] = np.minimum(
                                3 * forecast_df_w_sales['median_sales'],
                                forecast_df_w_sales['forecast_min_median'])
    df_prediction_final = forecast_df_w_sales.copy()
    df_prediction_final['max_confidence_interval'] = df_prediction_final[[
        'max_order_3xmedian', 'forecast_min_median'
    ]].min(axis=1)
    df_prediction_final['sales_prediction'] = df_prediction_final[[
        'max_confidence_interval', 'sales_prediction'
    ]].min(axis=1)
    df_prediction_final['order_prediction'] = df_prediction_final[[
        'order_prediction', 'max_confidence_interval'
    ]].min(axis=1)

    df_prediction_final = df_prediction_final.drop(
        ['median_sales', 'forecast_min_median', 'max_order_3xmedian'], axis=1)
    df_prediction_final.describe().T
    df_prediction_final.to_csv(config['local_folder'] +
                               'df_prediction_final_roger_normal_adhoc.csv',
                               index=False)
    result_csv_path = f"{config['local_folder']}df_prediction_final_roger_normal_adhoc.csv"
    os.system(f"hadoop fs -rm df_prediction_final_roger_normal_adhoc.csv")
    os.system(f"hadoop fs -put -f {result_csv_path}")
    spark_df = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').load(f"df_prediction_final_roger_normal_adhoc.csv")
    split_col = split(spark_df['full_item'], '_')
    spark_df = spark_df.withColumn('item_id', split_col.getItem(0))
    spark_df = spark_df.withColumn('sub_id', split_col.getItem(1))
    spark_df = spark_df.withColumn("item_id",
                                   spark_df["item_id"].cast(IntegerType()))
    spark_df = spark_df.withColumn("sub_id",
                                   spark_df["sub_id"].cast(IntegerType()))
    spark_df = spark_df.withColumn("week_key",
                                   spark_df["week_key"].cast(IntegerType()))
    spark_df = spark_df.withColumn(
        "train_mape_score", spark_df["train_mape_score"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "sales_prediction", spark_df["sales_prediction"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "predict_sales_error_squared",
        spark_df["predict_sales_error_squared"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "max_confidence_interval",
        spark_df["max_confidence_interval"].cast(FloatType()))
    spark_df = spark_df.withColumn(
        "order_prediction", spark_df["order_prediction"].cast(FloatType()))
    spark_df = spark_df.withColumn("store_code",
                                   lpad(spark_df['store_code'], 3, '0'))
    spark_df.write.mode('overwrite').saveAsTable(
        f"{config['database']}.result_forecast_10w_on_the_fututre")

    sql = f""" invalidate metadata {config['database']}.result_forecast_10w_on_the_fututre """
    impalaexec(sql)
    print('csv saved in the table')
    spark.conf.set("spark.sql.crossJoin.enabled", True)
    sql = f'''
    with
        latest_prediction as (
            select
                *
            from {config['database']}.result_forecast_10w_on_the_fututre
        ),
        week_key_list as (
            select
                distinct
                week_key
            from latest_prediction
        ),
        count_predict_week_number as (
            select
                item_id, sub_id, store_code,
                count(distinct week_key) as week_number
            from latest_prediction
            group by item_id, sub_id, store_code
        ),
        get_missing_item_list as (
            select
                distinct
                item_id,
                sub_id,
                store_code
            from count_predict_week_number
            where week_number != 10
        ),
        get_missed_item_full_list as (
            select
                get_missing_item_list.item_id,
                get_missing_item_list.sub_id,
                get_missing_item_list.store_code,
                week_key_list.week_key
            from get_missing_item_list
            left join week_key_list
                on 1=1
        ),
        get_missing_item_week_list as (
            select
                distinct
                item_id,
                sub_id,
                store_code,
                week_key
            from latest_prediction
            where item_id in (select item_id from get_missing_item_list)
        ),
        get_final_list as (
            select
                get_missed_item_full_list.item_id,
                get_missed_item_full_list.sub_id,
                get_missed_item_full_list.store_code,
                get_missed_item_full_list.week_key,
                missed.item_id as m_item_id,
                missed.week_key as m_week_key
            from get_missed_item_full_list
            left join get_missing_item_week_list missed
                on  missed.item_id  = get_missed_item_full_list.item_id
                and missed.sub_id = get_missed_item_full_list.sub_id
                and missed.week_key = get_missed_item_full_list.week_key
                and missed.store_code = get_missed_item_full_list.store_code
        )
    insert into {config['database']}.result_forecast_10w_on_the_fututre 
    select
        concat(cast(item_id as string),'_',cast(sub_id as string)) as full_item,
        store_code,
        week_key,
        0 as train_mape_score,
        0 as sales_prediction,
        0 as predict_sales_error_squared,
        0 as max_confidence_interval,
        0 as order_prediction,
        item_id,
        sub_id
    from get_final_list
    where m_item_id is null
    '''
    df_get_final_list2 = spark.sql(sql).toPandas()
    spark.stop()

    sql = f""" invalidate metadata {config['database']}.result_forecast_10w_on_the_fututre """
    impalaexec(sql)
    print('csv saved in the table')
Ejemplo n.º 31
0
                f.col('dpc').isNotNull(),
                f.col('dpc'))
                             .otherwise(f.regexp_extract(f.col('housenumber'), '([0-9]+)([.*-])([0-9]+)', 3)))

df_test = df_test.withColumn('dpc', blanks_to_null('dpc'))

df_test.show(30)


# In[ ]:


#rule13
df_test = df_test.withColumn('dpc', f.coalesce(f.col('dpc'), f.lit('99')))
df_test.show(30)


# In[ ]:


#rule3
df_test = df_test.withColumn('dpc', f.lpad('dpc', 2, '0'))
df_test.show(30)
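# Worked example for the two rules above: a null dpc is first defaulted to '99' (rule 13),
# and a single-digit value such as '7' is then left-padded to '07' (rule 3).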


# In[ ]: