Example #1
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def get_filtered_by_month(data: DataFrame) -> DataFrame:
    """
    Transforms "start_date - end_date" periods into year and month number columns.

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|      year|     month|
    +---+----------+----------+
    |  5|      2018|         1|
    +---+----------+----------+
    """

    # one output row per calendar month covered by [start_date, end_date]
    transformed_data = data \
        .withColumn("start_date", F.trunc("start_date", "month")) \
        .withColumn("monthsDiff", F.months_between("end_date", "start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("pos", "val")) \
        .withColumn("month_date", F.expr("add_months(start_date, pos)")) \
        .withColumn("year", F.year("month_date")) \
        .withColumn("month", F.month("month_date")) \
        .drop("start_date", "end_date", "monthsDiff", "repeat", "val", "pos", "month_date") \
        .dropDuplicates()

    return transformed_data
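# A minimal usage sketch for get_filtered_by_month, assuming an active
# SparkSession named `spark`; the sample row mirrors the docstring, not any
# real dataset.
import datetime

sample = spark.createDataFrame(
    [(5, datetime.date(2018, 1, 1), datetime.date(2018, 1, 9))],
    ["key", "start_date", "end_date"],
)
get_filtered_by_month(sample).show()
# +---+----+-----+
# |key|year|month|
# +---+----+-----+
# |  5|2018|    1|
# +---+----+-----+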
def _generate_select_expression_for_TimestampMonth(source_column,
                                                   name):  # noqa: N802
    """
    Used for anonymizing. Keeps the age but obscures the explicit birthday.
    This custom datatype requires a :any:`pyspark.sql.types.TimestampType` column as input.
    The datetime value will be set to the first day of the month.

    Example
    -------
    >>> from pyspark.sql import Row
    >>> from datetime import datetime
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df = spark.createDataFrame(
    ...     [Row(birthday=datetime(2019, 2, 9, 2, 45)),
    ...      Row(birthday=None),
    ...      Row(birthday=datetime(1988, 1, 31, 8))]
    ... )
    >>>
    >>> mapping = [("birthday", "birthday", "TimestampMonth")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(birthday=datetime.datetime(2019, 2, 1, 0, 0)),
     Row(birthday=None),
     Row(birthday=datetime.datetime(1988, 1, 1, 0, 0))]
    """
    return F.trunc(source_column, "month").cast(T.TimestampType()).alias(name)
from pyspark.sql.functions import add_months, from_unixtime, sum as _sum, to_timestamp, trunc
from pyspark.sql.types import DateType, DoubleType
from pyspark.sql.window import Window


def getGroupedPointOwners():
    # `spark`, `environment` and `files_directory` are defined elsewhere in
    # the original module
    if environment == "local":
        pass
    elif environment == "cluster":
        pass
    else:
        raise AssertionError(
            "Bad environment variable (environment = {}). Should be local or cluster"
            .format(environment))

    points_df = spark.read.format("orc").load(files_directory)

    # convert the epoch-milliseconds `period` column to a timestamp string
    points_df = points_df.withColumn(
        "period",
        from_unixtime(points_df["period"] / 1000, 'yyyy-MM-dd HH:mm:ss'))

    points_df = points_df.withColumn("qty",
                                     points_df["qty"].cast(DoubleType()))
    points_df = points_df.withColumn('month', trunc(points_df['period'], 'MM'))

    points_df = points_df.groupby(
        ['organisationid', 'customerid', 'typeid', 'month']).sum('qty')

    points_df = points_df.withColumn(
        "cumulativeSum",
        _sum('sum(qty)').over(
            Window.partitionBy(['organisationid', 'customerid',
                                'typeid']).orderBy('month')))

    points_df = points_df.withColumn('aggdate',
                                     add_months(points_df['month'], 1))
    points_df = points_df.withColumn('aggdate_ts',
                                     to_timestamp(points_df['aggdate']))
    points_df = points_df.withColumn('aggdate_date',
                                     points_df['month'].cast(DateType()))
    points_df = points_df.withColumn("qty", points_df["cumulativeSum"])

    points_df = points_df.drop('cumulativeSum')
    points_df = points_df.drop('sum(qty)')
    points_df = points_df.drop('month')

    points_df.show(100)

    return points_df
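# The month-truncation plus running-total pattern used above, shown in
# isolation: a minimal sketch over hypothetical in-memory rows instead of the
# ORC source (assumes a SparkSession named `spark`).
from pyspark.sql import functions as F
from pyspark.sql.window import Window

demo = spark.createDataFrame(
    [(1, 10.0, "2021-01-05"), (1, 20.0, "2021-01-20"), (1, 5.0, "2021-02-03")],
    ["customerid", "qty", "period"],
)
(demo
 .withColumn("month", F.trunc(F.to_date("period"), "MM"))  # first day of each month
 .groupBy("customerid", "month")
 .agg(F.sum("qty").alias("month_qty"))
 .withColumn("running_qty",
             F.sum("month_qty").over(
                 Window.partitionBy("customerid").orderBy("month")))
 .show())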
Example #4
def adobe_traffic_by_channel(df_traffic_by_channel):
    return (df_traffic_by_channel.select(
        df_traffic_by_channel['id'], df_traffic_by_channel['createdat'],
        df_traffic_by_channel['reportsuiteid'],
        df_traffic_by_channel['reportsuitename'],
        df_traffic_by_channel['server'], df_traffic_by_channel['data'],
        df_traffic_by_channel['hour'], df_traffic_by_channel['channel'],
        df_traffic_by_channel['visits'], df_traffic_by_channel['orders'],
        df_traffic_by_channel['ordersserialized'],
        df_traffic_by_channel['revenue']).withColumn(
            'dataprocessamento', f.trunc(f.current_date(), 'month')))
Example #5
def adobe_traffic_by_department(df_traffic_by_department, df_deparadepto):
    df_depara = (df_deparadepto.filter(
        (df_deparadepto['tabela'] == 'departamento')
        & (df_deparadepto['campo'] == 'iddepartamento')).select(
            df_deparadepto['de'], df_deparadepto['para']))

    return (df_traffic_by_department.join(
        df_depara,
        df_traffic_by_department['iddepartament'] == df_depara['de'],
        'left').select(df_traffic_by_department['id'],
                       df_traffic_by_department['createdat'],
                       df_traffic_by_department['reportsuiteid'],
                       df_traffic_by_department['reportsuitename'],
                       df_traffic_by_department['server'],
                       df_traffic_by_department['data'],
                       df_traffic_by_department['hour'],
                       df_traffic_by_department['iddepartament'],
                       f.trim(df_depara['para']).alias('namedepartment'),
                       df_traffic_by_department['visits'],
                       df_traffic_by_department['orders'],
                       df_traffic_by_department['ordersserialized'],
                       df_traffic_by_department['revenue']).withColumn(
                           'dataprocessamento',
                           f.trunc(f.current_date(), 'month')))
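# The de/para (from/to) lookup join used above, shown in isolation with
# hypothetical rows; column names follow the function above and `spark` is an
# assumed SparkSession.
from pyspark.sql import functions as f

traffic = spark.createDataFrame([(1, "10")], ["id", "iddepartament"])
depara = spark.createDataFrame(
    [("departamento", "iddepartamento", "10", " Electronics ")],
    ["tabela", "campo", "de", "para"],
)
mapping = (depara
           .filter((depara['tabela'] == 'departamento')
                   & (depara['campo'] == 'iddepartamento'))
           .select('de', 'para'))
(traffic
 .join(mapping, traffic['iddepartament'] == mapping['de'], 'left')
 .select('id', f.trim(mapping['para']).alias('namedepartment'))
 .withColumn('dataprocessamento', f.trunc(f.current_date(), 'month'))
 .show())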
Example #6
## sellers who received orders in each of the first six months of 1997
# Spark SQL version
spark.sql(
    'select s.companyName,count(distinct trunc(o.orderdate,"month")) as monthcount \
    from orders o \
        inner join order_details od \
            on o.orderid=od.orderid \
        inner join products p \
            on od.productid= p.productid \
        inner join suppliers s \
            on p.supplierid =s.supplierid \
        where \
            o.orderdate between   to_date("1997-01-01") and to_date("1997-06-30") \
        group by s.companyName \
        having monthcount=6').show()

# equivalent Python code using DataFrames
orders = dataframes['orders']
order_details = dataframes['order_details']
products = dataframes['products']
suppliers = dataframes['suppliers']

joined = orders.filter(orders.OrderDate >= '1997-01-01').filter(orders.OrderDate <= '1997-06-30')\
    .join(order_details, orders.OrderID == order_details.OrderID)\
    .join(products, order_details.ProductID == products.ProductID)\
    .join(suppliers, products.SupplierID == suppliers.SupplierID)\
    .groupBy(suppliers.CompanyName)\
    .agg(F.countDistinct(F.trunc(orders.OrderDate, "month")).alias('monthcount'))
joined.filter(joined.monthcount == 6)\
    .show()
Example #7
  max_table_name_key = 0

  # set dataframe for inserted records
  df_inserted_table_name =  s_df_table_name\
                          .alias("silver")\
                          .join(g_df_table_name.alias("gold"),\
                               (s_df_table_name.Forecast_Date==g_df_table_name.Forecast_Date) & \
                               (s_df_table_name.Snapshot_Date==g_df_table_name.Snapshot_Date) & \
                               (s_df_table_name.End_of_Month==g_df_table_name.End_of_Month) & \
                               (s_df_table_name.Sub_Region_Code==g_df_table_name.Sub_Region_Code) &\
                               (s_df_table_name.Snp_Seq_No==g_df_table_name.Snp_Seq_No) ,"left_anti")\
                          .where(s_df_table_name.IBIOperationType != "Delete")\
                          .selectExpr("NULL as MergeKey1","NULL as MergeKey2","NULL as MergeKey3","NULL as MergeKey4", "NULL as MergeKey5",\
                                      "silver.*", "silver.IBIUpdatedDate as s_IBIUpdatedDate", "silver.IBIUpdatedBy as s_IBIUpdatedBy")

  df_inserted_table_name = df_inserted_table_name.withColumn("ValidFrom",trunc("End_of_Month", "month"))
  df_inserted_table_name = df_inserted_table_name.withColumn("ValidTo",df_inserted_table_name.End_of_Month)

  
  df_inserted_table_name = df_inserted_table_name.withColumn("IBIChangeReason",lit("New record has been created in source system."))
  df_inserted_table_name = df_inserted_table_name.withColumn("IBIOperationType",lit("Insert"))
  df_inserted_table_name = df_inserted_table_name.withColumn("IsDeleted",lit("false"))
  df_inserted_table_name = df_inserted_table_name.withColumn("IsLatest",lit("true"))
  
  # set dataframe for modified records
  df_modified_table_name =  s_df_table_name\
                          .alias("silver")\
                          .join(df_inserted_table_name.alias("inserted"),\
                               (s_df_table_name.Forecast_Date==df_inserted_table_name.Forecast_Date) & \
                               (s_df_table_name.Snapshot_Date==df_inserted_table_name.Snapshot_Date) & \
                               (s_df_table_name.End_of_Month==df_inserted_table_name.End_of_Month) & \