def filter_fiscal_year_and_month(
        df: DataFrame,
        fiscal_year: str,
        month_abbr: str,
        date_fmt: str = 'MM/dd/yyyy',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    filtered = (
        df
        .withColumn(
            '_fiscal_year',
            F.coalesce(
                F.year(F.add_months(
                    F.to_date(filter_column_year, date_fmt), 3)),
                F.year(F.add_months(
                    F.to_date(filter_column_month, date_fmt), 3))
            ))
        .withColumn(
            '_month_abbr',
            F.coalesce(
                F.upper(F.date_format(
                    F.to_date(filter_column_year, date_fmt), 'MMM')),
                F.upper(F.date_format(
                    F.to_date(filter_column_month, date_fmt), 'MMM'))
            ))
        .where(F.col('_fiscal_year') == fiscal_year)
        .where(F.col('_month_abbr') == month_abbr)
    )  # yapf: disable
    return filtered
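# A minimal usage sketch for filter_fiscal_year_and_month (added, not part of the
# original snippet). The sample column values below are assumptions chosen to match
# the function's defaults ('MM/dd/yyyy' format, fiscal year = calendar date + 3 months).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [('10/15/2020', None), (None, '11/02/2020')],
    'voucher_creation_date string, shipment_pickup_date string')

# 10/15/2020 shifted by +3 months falls in 2021, month abbreviation 'OCT',
# so only the first row survives the filter.
filter_fiscal_year_and_month(sample_df, fiscal_year='2021', month_abbr='OCT').show()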
def test_add_months_function(self):
    dt = datetime.date(2021, 12, 27)

    # Note: a plain Python int gets converted to a LongType column;
    # LongType is not supported by add_months, so cast to Integer explicitly.
    df = self.spark.createDataFrame([Row(date=dt, add=2)], "date date, add integer")

    self.assertTrue(
        all(
            df.select(
                add_months(df.date, df.add) == datetime.date(2022, 2, 27),
                add_months(df.date, "add") == datetime.date(2022, 2, 27),
                add_months(df.date, 3) == datetime.date(2022, 3, 27),
            ).first()))
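# A small complementary check (added here, not from the original test suite):
# add_months clamps to the last valid day when the target month is shorter,
# e.g. 2022-01-31 plus one month yields 2022-02-28.
def test_add_months_end_of_month(self):
    df = self.spark.createDataFrame([Row(date=datetime.date(2022, 1, 31))],
                                    "date date")
    self.assertTrue(
        df.select(add_months(df.date, 1) == datetime.date(2022, 2, 28)).first()[0])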
def get_date_dimension(spark, year, month, logger):
    """Create the dataframe for the date dimension for this year and month"""
    start_date = datetime.date(int(year), int(month), 1)
    basedates_df = spark.createDataFrame([(start_date,)], ['start_date'])
    basedates_df = basedates_df.withColumn('end_date',
                                           add_months(basedates_df.start_date, 1))
    end_date = basedates_df.collect()[0][1]
    numdays = (end_date - start_date).days
    datelist = [start_date + datetime.timedelta(days=x) for x in range(numdays)]
    datesofmonth_df = spark.createDataFrame(datelist, DateType())\
        .withColumnRenamed('value', 'date')
    date_df = datesofmonth_df.withColumn('str_weekday',
                                         date_format(datesofmonth_df.date, 'EEEE'))\
        .selectExpr("date as date_id",
                    "day(date) as day",
                    "weekofyear(date) as week",
                    "month(date) as month",
                    "year(date) as year",
                    "str_weekday as weekday")\
        .distinct()\
        .orderBy('date_id')
    date_df.persist()
    logger.info("date_df row count: {}".format(date_df.count()))
    return date_df
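# A minimal usage sketch for get_date_dimension (assumed setup, not from the original
# module): builds the February 2021 date dimension, which should contain 28 rows.
import datetime
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import add_months, date_format
from pyspark.sql.types import DateType

spark = SparkSession.builder.getOrCreate()
logger = logging.getLogger(__name__)
date_dim = get_date_dimension(spark, 2021, 2, logger)
date_dim.show(5)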
def getGroupedPointOwners():
    if environment == "local":
        pass
    elif environment == "cluster":
        pass
    else:
        raise AssertionError(
            "Bad environment variable (environment = {}). Should be local or cluster"
            .format(environment))

    points_df = spark.read.format("orc").load(files_directory)

    # df modification
    # points_df = points_df.withColumn("qty", points_df["qty"].cast(DoubleType()))
    # points_df = points_df.withColumn("period_full_date",
    #                                  from_unixtime(points_df["period"] / 1000, 'yyyy-MM-dd hh:mm:ss'))
    points_df = points_df.withColumn(
        "period",
        from_unixtime(points_df["period"] / 1000, 'yyyy-MM-dd hh:mm:ss'))
    # print(points_df.printSchema)
    # points_df.show(10)
    #
    # points_stats = points_df \
    #     .groupBy(["period_year_month", "organisationid", "customerid", "typeid"]) \
    #     .agg(_sum("qty").alias("total_qty")).orderBy(desc("period_year_month"))
    points_df = points_df.withColumn("qty", points_df["qty"].cast(DoubleType()))
    points_df = points_df.withColumn('month', trunc(points_df['period'], 'MM'))
    points_df = points_df.groupby(
        ['organisationid', 'customerid', 'typeid', 'month']).sum('qty')
    points_df = points_df.withColumn(
        "cumulativeSum",
        _sum('sum(qty)').over(
            Window.partitionBy(['organisationid', 'customerid',
                                'typeid']).orderBy('month')))
    points_df = points_df.withColumn('aggdate', add_months(points_df['month'], 1))
    points_df = points_df.withColumn('aggdate_ts', to_timestamp(points_df['aggdate']))
    points_df = points_df.withColumn('aggdate_date',
                                     points_df['month'].cast(DateType()))
    points_df = points_df.withColumn("qty", points_df["cumulativeSum"])
    points_df = points_df.drop('cumulativeSum')
    points_df = points_df.drop('sum(qty)')
    points_df = points_df.drop('month')
    points_df.show(100)
    return points_df
def add_fiscal_year_and_month_abbr(
        df,
        date_fmt: str = 'yyyy/MM/dd',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    expr_mapping = {
        '_fiscal_year': (F.coalesce(
            F.year(F.add_months(F.to_date(filter_column_year, date_fmt), 3)),
            F.year(F.add_months(F.to_date(filter_column_month, date_fmt), 3)))),
        '_month_abbr': (F.coalesce(
            F.upper(
                F.date_format(F.to_date(filter_column_year, date_fmt), 'MMM')),
            F.upper(
                F.date_format(F.to_date(filter_column_month, date_fmt), 'MMM'))))
    }
    select_expr = build_col_expr(expr_mapping)
    transformed = df.select(F.expr('*'), *select_expr)
    return transformed
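# build_col_expr is not shown in this snippet; a plausible minimal implementation
# (an assumption, not the original helper) turns the name -> Column mapping into
# a list of aliased Column expressions that can be unpacked into select(...):
def build_col_expr(expr_mapping):
    # Alias each expression with its target column name
    return [expr.alias(name) for name, expr in expr_mapping.items()]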
def preprocess_educations(df, output_path):
    """
    Preprocess the education-history records of resumes.
    :param df: education-history data, as a DataFrame
    :param output_path: output file path
    :return:
    """
    # Output columns after preprocessing
    output_columns = [
        "resume_id", "school_name", "school_area", "major", "category",
        "subcategory", "edu_start_date", "edu_end_date", "edu_start_year",
        "edu_end_year", "degree", "edu_index"
    ]

    # Normalize school name, major and degree
    df = df.withColumn("school_name", F.udf(normalization_school_name)(df.school))
    df = df.withColumn("school_area", F.udf(mark_school_area)(df.school_name))
    df = df.withColumn("major", F.udf(normalization_major)(df.major))
    df = df.withColumn("degree", F.udf(normalization_degree)(df.degree))
    # df = df.withColumn("major_category", F.udf(get_major_level2)(df.major, df.degree))
    df = df.withColumn(
        "subjects",
        F.udf(normalized_major_tuple,
              ArrayType(ArrayType(StringType())))(df.major, df.degree))
    df = df.withColumn("subjects", F.explode("subjects")) \
        .withColumn("category", F.col("subjects")[0]) \
        .withColumn("subcategory", F.col("subjects")[1]) \
        .withColumn("major", F.col("subjects")[2])

    # Standardize date formats
    df = df.withColumn("start_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.start_time))
    df = df.withColumn("end_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.end_time))
    df = df.withColumn("edu_start_date", F.date_format(df.start_time, "yyyy-MM"))
    df = df.withColumn("edu_end_date", F.date_format(df.end_time, "yyyy-MM"))
    df = df.withColumn(
        "edu_end_date",
        F.when(df.edu_end_date.isNotNull(), df.edu_end_date).otherwise(
            F.date_format(F.add_months(df.edu_start_date, 48), "yyyy-MM")))
    df = df.withColumn("edu_start_year", F.year(df.start_time))
    df = df.withColumn("edu_end_year", F.year(df.end_time))
    df = df.withColumn("edu_index", df.index)

    # Drop rows where school_name, major and degree are all missing
    df = df.dropna(how="all", subset=["school_name", "major", "degree"])

    # Save the preprocessed education records to the given output path
    df.select(output_columns).write.mode('overwrite').save(output_path)
def cad_to_usd_rate(currency_exchange_rates: DataFrame,
                    fiscal_year: str,
                    month_abbr: str) -> float:
    """Currently returns latest exchange rate for the given month."""
    filtered = (
        currency_exchange_rates
        .where(F.col('currency_code_from') == 'CAD')
        .where(F.col('currency_code_to') == 'USD')
        .where(F.year(F.add_months(F.to_date(
            'effective_date', 'yyyyMMdd'), 3)) == fiscal_year)
        .where(F.upper(F.date_format(F.to_date(
            'effective_date', 'yyyyMMdd'), 'MMM')) == month_abbr)
        .sort('effective_date', ascending=False)
    )  # yapf: disable
    return filtered.first().conversion_rate_multiplier
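# A quick illustration (added, not from the original module) of how the 3-month shift
# above maps a calendar effective_date into the fiscal year and month abbreviation used
# for filtering. The sample date is an assumption.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([('20201015',)], ['effective_date']).select(
    F.year(F.add_months(F.to_date('effective_date', 'yyyyMMdd'), 3)).alias('fiscal_year'),
    F.upper(F.date_format(F.to_date('effective_date', 'yyyyMMdd'), 'MMM')).alias('month_abbr')
).show()
# 2020-10-15 shifted by +3 months lands in calendar 2021, so fiscal_year=2021, month_abbr='OCT'.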
def create_scoring_output(pargs, params):
    """
    Generate final renewal scoring CSV file for MD dashboard
    :inputs: renewal_scoring_output
    :outputs: md_prediction_1_firstyr_renewal
    """
    # Read and refine scoring output table
    model_result = spark.read.parquet(
        data_paths["renewal_scoring_output"].format(run_mode=run['run_mode'],
                                                    run_id=run['run_id']))
    model_result = model_result.dropDuplicates(['imc_no'])
    output_tbl = model_result.groupBy(
        ["n_signup_year", "n_signup_month"]).count().withColumnRenamed(
            'count', 'num_population')

    # Create new fields for MD dashboard
    output_prob = model_result.groupBy([
        "n_signup_year", "n_signup_month"
    ]).sum('renewal_label').withColumnRenamed('sum(renewal_label)', 'num_renew')
    output_tbl = output_tbl.join(output_prob, ["n_signup_year", "n_signup_month"])
    output_tbl = output_tbl.withColumn(
        'renewal_pct', output_tbl['num_renew'] / output_tbl['num_population'])
    output_tbl = output_tbl.withColumn('signup_month',
                                       F.concat(F.col("n_signup_year"), F.lit(""),
                                                F.col("n_signup_month"))) \
        .withColumn('signup_month', F.to_date(F.col('signup_month'), 'yyyyMM')) \
        .drop(*["n_signup_year", "n_signup_month"]) \
        .withColumn('month', F.add_months('signup_month', 15))
    output_tbl = output_tbl.withColumn(
        'signup_month',
        F.date_format(F.col("signup_month"), "yyyyMM")).withColumn(
            'month', F.date_format(F.col("month"), "yyyyMM")).orderBy('signup_month')

    # Write final table
    output_tbl.repartition(1).write.format("com.databricks.spark.csv").mode('overwrite') \
        .csv(data_paths['md_prediction_1_firstyr_renewal'].format(run_mode=run['run_mode'],
                                                                  run_id=run['run_id']),
             header='true')
"make_timestamp(YEAR, MONTH, DAY, HOUR, MINUTE, SECOND) as THE_TIMESTAMP") ts.show(truncate=False) # COMMAND ---------- # Add 1 month to the next dt column df = spark.createDataFrame([('2015-04-08', )], ['dt']) # Expected: # +----------+ # |next_month| # +----------+ # |2015-05-08| # +----------+ # Answer df.select(F.add_months(df.dt, 1).alias('next_month')).show() # COMMAND ---------- # Get the current date and current timestamp: # Expected like this: # +--------------+-----------------------+ # |current_date()|current_timestamp() | # +--------------+-----------------------+ # |2021-01-24 |2021-01-24 22:27:35.045| # +--------------+-----------------------+ # Answer df.select(F.current_date(), F.current_timestamp()).show(truncate=False) # COMMAND ----------
# .where(f"date=='{date}'") #Credit to the professor, I leverage this piece of code from class def find_median(values_list): try: median = np.median(values_list) return round(float(median), 2) except Exception: return None categories = [ "big_box_grocers", "convenience_stores", "drinking_places", "full_service_restaurants", "limited_service_restaurants", "pharmacies_and_drug_stores", "snack_and_bakeries", "specialty_food_stores", "supermarkets_except_convenience_stores" ] for c in categories: df.join(filteredCorePlaces, ["placekey"], "inner").groupBy("date","file_name")\ .agg(F.collect_list("visits").alias('visit'), F.round(F.stddev("visits")).cast("integer").alias('std'))\ .withColumn("median", median_finder('visit'))\ .withColumn("low", when(F.col("std") > F.col("median"), 0).otherwise(F.col("median") - (F.col("std"))))\ .withColumn("high", F.col("median") + F.col("std"))\ .withColumn("year", F.year("date"))\ .withColumn("project_date", F.add_months(F.col("date"), 12))\ .sort(F.col("year"), F.col("project_date"))\ .where((F.col("year").isin(2019, 2020)) & (F.col("file_name") == c))\ .select(F.col("year"), F.col("project_date").alias('date'), F.col("median"),F.col("low"),F.col("high"))\ .coalesce(1).write.mode("overwrite").option("header",True).format("csv").save("/{}/{}".format(OUTPUT_PREFIX, c))
option("header", "true").option( "path", os.path.join(DATA_DIRECTORY, "flights", "departuredelays.csv")).load()) # %% df.show(5, False) # %% df = df.withColumn("date_iso", F.to_date(F.col("date"), "MMddHHmm")) # %% df.show(50, False) # %% # Assume the data are for 2019 df = df.withColumn("date_iso", F.add_months(F.col("date_iso"), 12 * 49)) # %% df.show(50, False) # %% df.createOrReplaceTempView("departuredelays") # %% # subquery spark.sql(""" SELECT date_iso, delay, origin, destination FROM departuredelays WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD'
def create_migration_monthly_target_variables(pargs, params):
    """
    This function creates the bns migration target variables table.
    Target variable: bns_migration_label, contains values (UP, SAME, DOWN)
    :inputs: behaviors_combined_data
    :outputs: bns_migration_3m_outcomes or bns_migration_6m_outcomes
    """
    # Load data
    behaviors_combined_data = sqlContext.read.parquet(
        data_paths['behaviors_combined_data'].format(run_mode=run['run_mode'],
                                                     run_id=run['run_id']))

    # Create time-related columns and adjust column types.
    # Generate a temporary table for SQL queries.
    behaviors_combined_data = behaviors_combined_data.withColumn(
        'mo_yr_key_no', F.to_date(F.col('mo_yr_key_no_str'), 'yyyyMM'))
    behaviors_combined_data = behaviors_combined_data.withColumn(
        'calendar_year', year(F.col('mo_yr_key_no')))
    behaviors_combined_data = behaviors_combined_data.withColumn('imc_months_after_signup', behaviors_combined_data['imc_months_after_signup'].cast('int')) \
        .withColumn('bns_pct', behaviors_combined_data['bns_pct'].cast('int'))
    behaviors_combined_data.createOrReplaceTempView("behaviors_combined")

    # Set parameters for the 3-month or 6-month tables
    if configs['target_3m']:
        monthPara = "3m"
        monthNum = "3"
        monthLag = '2'
    else:
        monthPara = "6m"
        monthNum = "6"
        monthLag = '5'

    # Generate the bns migration target table in Spark SQL
    bns_migration_eda_step_0 = sqlContext.sql(
        queries.bns_migration_eda_step0_sql.format(
            monthLag=monthLag,
            monthPara=monthPara,
            cntry_list=','.join(str(x) for x in run['cntry_key_no'])))
    bns_migration_eda_step_0 = bns_migration_eda_step_0.withColumn(
        "mo_yr_key_no_" + monthPara + "_pre",
        F.add_months('mo_yr_key_no', -int(monthNum)))
    bns_migration_eda_step_0.createOrReplaceTempView("bns_migration_eda_step_0")

    bns_migration_eda_step_1 = sqlContext.sql(
        queries.bns_migration_eda_step1_sql.format(monthPara=monthPara))
    bns_migration_eda_step_1.createOrReplaceTempView("bns_migration_eda_step_1")

    bns_migration_monthly_outcomes = sqlContext.sql(
        queries.bns_migration_outcomes_sql.format(monthLag=monthLag,
                                                  monthPara=monthPara))
    bns_migration_monthly_outcomes = bns_migration_monthly_outcomes.withColumn(
        "original_back_" + monthNum, F.add_months('mo_yr_key_no', -int(monthNum)))

    # Write final table
    writePath = data_paths['bns_migration_{monthPara}_outcomes'.format(
        monthPara=monthPara)]
    bns_migration_monthly_outcomes.write.mode('overwrite').option("maxRecordsPerFile", configs["maxRecordsPerFile"]) \
        .parquet(writePath.format(run_mode=run['run_mode'], run_id=run['run_id']))
#list_1 = ["col_1", "col_2"] #df_1 = spark.createDataFrame(list_1, StringType()) #df_1.select(substring_index("value", "_", 1)).show() # COMMAND ---------- #Date from pyspark.sql.functions import current_timestamp, current_date, date_format, dayofyear, year, month, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, date_trunc, lit orders_new_col.select( current_timestamp(), current_date(), date_format(current_timestamp(), "yyyy-MM-dd a hh:HH:mm").alias("dat"), dayofyear(current_timestamp()).alias("dayofyear_val"), month(current_timestamp()).alias("mon"), date_sub(current_timestamp(), 1).alias("date_d")).select( datediff(current_timestamp(), "date_d")).show(1, truncate=False) orders_new_col.select(add_months( current_timestamp(), -1).alias("prev_mon")).select( months_between(current_timestamp(), "prev_mon").alias("no_of_mon")).select( next_day(current_timestamp(), "Mon")).select( last_day(current_timestamp())).show(1) orders_new_col.select(date_trunc('year', lit('2020-04-01'))).show(1) # COMMAND ---------- from pyspark.sql.functions import concat_ws #orders_new_col.printSchema() #spark.sql("create database if not exists newdb") #spark.sql("show databases").show() ordersText = orders_new_col.select( concat_ws("~", "order_id", "order_date", "order_customer_id", "order_status").alias("concal_col"))
def process_history_df_mom(self, df):
    """
    Process function for history data; generates a result dataframe that
    contains the month and the number of create events.
    """
    # There are two versions of the API for CreateEvent of a repository:
    # - One is col("payload")['object'] == 'repository'
    # - Another is col("payload")['ref_type'] == 'repository'
    # try:
    df_columns = df.columns
    df_first_record = df.first()
    # keyword = 'object' if 'object' in df_first_record['payload'] else 'ref_type'
    num_create_events_df = \
        df \
        .filter((col('payload')['ref_type'] == 'repository') |
                (col('payload')['object'] == 'repository')) \
        .filter((col('type') == 'CreateEvent') | (col('type') == 'Event'))

    # Count the number of create events that happened in one month (group by month)
    num_create_events_by_month_df = num_create_events_df.groupby(
        date_trunc('month', df.created_at).alias('month_created_at')).count()

    # Calculate the growth rate of a month compared to the previous month.
    # Duplicate the dataframe: for each month in the first dataframe,
    # find the number of create events in the second dataframe
    # for the month that is one month earlier.
    # [df1] 2015-02-01 -> [df2] 2015-01-01 (1 month)
    num_create_events_by_month_df_1 = num_create_events_by_month_df.alias(
        'num_create_events_by_month_df_1')
    num_create_events_by_month_df_1 = \
        num_create_events_by_month_df_1 \
        .select(
            col('month_created_at').alias('month_created_at_1'),
            col('count').alias('count_1'))

    num_create_events_by_month_df_2 = num_create_events_by_month_df.alias(
        'num_create_events_by_month_df_2')
    num_create_events_by_month_df_2 = \
        num_create_events_by_month_df_2 \
        .select(
            col('month_created_at').alias('month_created_at_2'),
            col('count').alias('count_2'))

    joined_num_create_events_df = \
        num_create_events_by_month_df_1 \
        .withColumn(
            'last_week_month_created_at',
            date_trunc(
                'month',
                add_months(num_create_events_by_month_df_1.month_created_at_1, -1))) \
        .join(
            num_create_events_by_month_df_2,
            col('last_week_month_created_at') == col('month_created_at_2'),
            how='left_outer')
    joined_num_create_events_df.show()

    joined_num_create_events_df = joined_num_create_events_df.withColumn(
        'count_2', coalesce('count_2', 'count_1'))

    num_create_events_with_growth_rate_df = \
        joined_num_create_events_df \
        .withColumn(
            'monthly_increase_rate',
            ((joined_num_create_events_df.count_1 -
              joined_num_create_events_df.count_2) /
             joined_num_create_events_df.count_2)) \
        .select(
            'month_created_at_1',
            'count_1',
            'monthly_increase_rate')
    num_create_events_with_growth_rate_df.show()

    return num_create_events_with_growth_rate_df
print("JOB STARTED...") car_sales = spark.sql("SELECT * FROM sales.car_sales") customer_data = spark.sql("SELECT * FROM marketing.customer_data") car_installs = spark.sql("SELECT * FROM factory.car_installs") factory_data = spark.sql("SELECT * FROM factory.experimental_motors") geo_data = spark.sql("SELECT postalcode as zip, latitude, longitude FROM marketing.geo_data_xref") print("\tREAD TABLE(S) COMPLETED") #--------------------------------------------------- # APPLY FILTERS # - Remove under aged drivers (less than 16 yrs old) #--------------------------------------------------- before = customer_data.count() customer_data = customer_data.filter(col('birthdate') <= F.add_months(F.current_date(),-192)) after = customer_data.count() print(f"\tFILTER DATA (CUSTOMER_DATA): Before({before}), After ({after}), Difference ({after - before}) rows") #--------------------------------------------------- # JOIN DATA INTO ONE TABLE #--------------------------------------------------- # SQL way to do things salesandcustomers_sql = "SELECT customers.*, sales.sale_date, sales.saleprice, sales.model, sales.VIN \ FROM sales.car_sales sales JOIN marketing.customer_data customers \ ON sales.customer_id = customers.customer_id " tempTable = spark.sql(salesandcustomers_sql) if (_DEBUG_): print("\tTABLE: CAR_SALES")
@author rambabu.posa
"""
from pyspark.sql import SparkSession
from pyspark.sql import (functions as F)
import os

current_dir = os.path.dirname(__file__)
relative_path = "../../resources/data/sparkInActionData/dates.csv"
absolute_file_path = os.path.join(current_dir, relative_path)

# Creates a session on a local master
spark = SparkSession.builder.appName("add_months function") \
    .master("local[*]").getOrCreate()

df = spark.read.format("csv") \
    .option("header", True) \
    .option("inferSchema", True) \
    .load(absolute_file_path)

df = df.withColumn("add_months+2", F.add_months(F.col("date_time"), 2))

# As of now, no support for this operation in PySpark API
# df = df.withColumn("add_months+val", F.add_months(F.col("date_time"), F.col("val")))

df.show(5, False)
df.printSchema()
df.show(5)

spark.stop()
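# Added note: more recent PySpark releases do accept a Column for the months argument
# (as exercised by the add_months test earlier in this section). Assuming such a version
# and that dates.csv carries a numeric "val" column, the commented-out operation above
# would work as written:
# df = df.withColumn("add_months+val", F.add_months(F.col("date_time"), F.col("val")))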
# # Number of alarms per unit per day
# unit_day_alarm_count = unit_hour_alarm_count.withColumn('alarm_time', date_format(unit_hour_alarm_count.alarm_time, 'yyyy-MM-dd'))\
#     .groupby('unit_id', 'dept_id', 'alarm_time')\
#     .sum('count')\
#     .withColumnRenamed("sum(count)", "count")\
#     .sort('count', ascending=False)
# unit_day_alarm_count.show()

# Electricity consumption metering data
new_df = files['dev_id', 'unit_id', 'dept_id', 'voltage', 'current', 'current_power',
               'frequency', 'factor', 'current_power', 'max_power', 'reactive_power',
               'apparent_power', 'temperature_value', 'leakage_value', 'alarm_time']
new_df = new_df.filter(
    new_df['alarm_time'] > add_months(current_date(), -1)).sort('alarm_time')
new_df.show()

max_count = new_df.groupby('dev_id').agg({
    'voltage': 'max',
    'current': 'max',
    'current_power': 'max',
    'temperature_value': 'max'
})
max_count.show()

min_count = new_df.groupby('dev_id').agg({'voltage': 'min', 'current': 'min', 'current_power': 'min', 'temperature_value': 'min'})\
    .withColumnRenamed("dev_id", "dev_id2")
min_count.show()

data_count = max_count.join(min_count['min(current)', 'min(voltage)', 'min(temperature_value)', 'min(current_power)', 'dev_id2'],
                            (min_count.dev_id2 == max_count.dev_id)
def vendor_subbusiness_transactions_table(tables, transactions_date, graph_spec):
    """
    Return a dataframe that contains vendor, voucher id, sub-business unit
    for a given transactions date.

    :param tables: dictionary that maps (table_name, src_name, dst_name) or
        (table_name, node_name) to the appropriate DataFrame.
        - "table_name" is the name of the table.
        - "src_name"/"dst_name" are the `safe_name` of the
          `source_column`/`target_column` for the edge list.
        - "node_name" is the `safe_name` of the `index_column` for the node list.
    :type tables: dict
    :param transactions_date: date of the vouchers for checking
    :type transactions_date: str
    :param graph_spec: graph specification
    :type graph_spec: fncore.utils.graph_specification.GraphSpec
    :return: a dataframe that contains vendor, voucher id, sub-business unit
        for a given transactions date. The schema is as follows:
        | |-- vendor: str
        | |-- subbusiness_unit: str
        | |-- voucher_id: str
        | |-- gross_amt: decimal(27,3)
        | |-- invoice_dt: date
    :rtype: pyspark.sql.DataFrame
    """
    table_name = 'voucher sub_business_unit-entity edge'
    edge_list = graph_spec.name_to_edge_list[table_name]
    voucher_id_col_name = edge_list.index_column.safe_name.lower()
    gross_amt_col_name = (
        edge_list
        .name_to_metadata_column['GROSS_AMT_BSE']
        .safe_name
    )
    invoice_dt_col_name = (
        edge_list
        .name_to_metadata_column['INVOICE_DT']
        .safe_name
    )
    account_code_col_name = (
        edge_list
        .name_to_metadata_column['ACCOUNT']
        .safe_name
    )
    fiscal_year_col_name = (
        edge_list
        .name_to_metadata_column['ACCOUNTING_DATE']
        .safe_name
    )

    # 1. Get vendor to subbusiness unit relations
    #    subbusiness_vendor: (source) subbusiness unit, (target) vendor
    #    (metadata) voucher_id, gross_amt, invoice_dt, fiscal_year
    subbusiness_vendor = _df_col_names_lower(
        _get_dataframe_from_tables(tables, table_name, graph_spec)
    )
    vendor_subbusiness_transactions = (
        subbusiness_vendor
        .select(
            col('_canonical_id_source').alias('subbusiness_unit'),
            col('_canonical_id_target').alias('vendor'),
            col(voucher_id_col_name).alias('voucher_id'),
            col(account_code_col_name).alias('account_code'),
            col(gross_amt_col_name).alias('gross_amt'),
            to_date(col(invoice_dt_col_name)).alias('invoice_dt'),
            year(add_months(to_date(col(fiscal_year_col_name)), -3)).alias('fiscal_year'))
        .filter("invoice_dt='{}'".format(transactions_date))
        # Remove account codes starting with 1 which are EOM payments
        .filter("substr(account_code, 0, 1) != '1'")
    )

    # 2. Get vendor to vendor_name
    vendor = vendor_table(tables, graph_spec)

    # 3. Join vendor with vendor_subbusiness_transactions
    vendor_subbusiness_transactions = vendor_subbusiness_transactions.join(
        vendor, on='vendor')

    return vendor_subbusiness_transactions
output_dir = path1

glueContext = GlueContext(SparkContext.getOrCreate())

# Get DynamicFrames from the Glue Catalog
datasource0 = glueContext.create_dynamic_frame.from_catalog(database="c360view_stage",
                                                            table_name="mf_transactions_pqt",
                                                            transformation_ctx="datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database="c360view_stage",
                                                            table_name="gbank_pqt",
                                                            transformation_ctx="datasource1")

# Convert to DataFrames
df_transactions = datasource0.toDF()
df_dispositions = datasource1.toDF()
df_dispositions = df_dispositions.drop("type")

# Filter and group by time windows (last 3 and 6 months), aggregating by customer + type of transaction
df_trans_acc_aggr_l3m = df_transactions.filter(F.col("date") >= F.add_months(F.current_date(), -3)).groupBy("account_id", "type").agg(F.avg('amount'), F.count('amount'))
df_trans_acc_aggr_l6m = df_transactions.filter(F.col("date") >= F.add_months(F.current_date(), -6)).groupBy("account_id", "type").agg(F.avg('amount'), F.count('amount'))

# Join DF with customer data
df_trans_acc_aggr_l3m = df_trans_acc_aggr_l3m.join(df_dispositions, 'account_id', how="inner")
df_trans_acc_aggr_l6m = df_trans_acc_aggr_l6m.join(df_dispositions, 'account_id', how="inner")

# Renaming columns
df_trans_acc_aggr_l3m = df_trans_acc_aggr_l3m.withColumnRenamed("avg(amount)", "amount_avg").withColumnRenamed("count(amount)", "amount_count")
df_trans_acc_aggr_l6m = df_trans_acc_aggr_l6m.withColumnRenamed("avg(amount)", "amount_avg").withColumnRenamed("count(amount)", "amount_count")

# Aggregations 2
df_trans_acc_aggr_l3m = df_trans_acc_aggr_l3m.withColumnRenamed("round(avg(amount_avg), 0)", "amount_avg_trans").withColumnRenamed("sum(amount_count)", "count_trans")
df_trans_acc_aggr_l6m = df_trans_acc_aggr_l6m.withColumnRenamed("round(avg(amount_avg), 0)", "amount_avg_trans").withColumnRenamed("sum(amount_count)", "count_trans")

# Date time partition columns generation