def AggregateSignal(glueContext, dfc) -> DynamicFrameCollection:
    from pyspark.sql.functions import lit

    # Sum signal1 per (ticker, date), then tag every row with the US market.
    df = dfc.select(list(dfc.keys())[0]).toDF()
    df = (df.groupby('ticker', 'date')
            .agg({"signal1": "sum"})
            .withColumnRenamed('sum(signal1)', 'signal1'))
    df = df.withColumn('market', lit('US'))
    aws_df = DynamicFrame.fromDF(df, glueContext, "conversion")
    return DynamicFrameCollection({"CustomTransform0": aws_df}, glueContext)
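# A hedged side note (my sketch, not part of the job above): the
# agg-then-rename step in AggregateSignal can be collapsed by passing an
# aliased aggregate, which yields the same signal1 column in one call.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def aggregate_signal(df: DataFrame) -> DataFrame:
    # Same result as agg({"signal1": "sum"}) + withColumnRenamed.
    return df.groupby('ticker', 'date').agg(F.sum('signal1').alias('signal1'))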
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    from pyspark.sql.functions import lit

    # Keep one row per ticker, then tag every row with the US market.
    df = dfc.select(list(dfc.keys())[0]).toDF()
    df = df.drop_duplicates(subset=['ticker'])
    df = df.withColumn("market", lit('US'))
    aws_df = DynamicFrame.fromDF(df, glueContext, "conversion")
    return DynamicFrameCollection({"CustomTransform0": aws_df}, glueContext)
def Aggregate_Tickets(glueContext, dfc) -> DynamicFrameCollection:
    # Count tickets per court location and infraction via Spark SQL.
    # Bind the session locally instead of relying on a module-level `spark`.
    spark = glueContext.spark_session
    selected = dfc.select(list(dfc.keys())[0]).toDF()
    selected.createOrReplaceTempView("ticketcount")
    totals = spark.sql(
        "SELECT court_location AS location, "
        "infraction_description AS infraction, "
        "COUNT(infraction_code) AS total "
        "FROM ticketcount "
        "GROUP BY infraction_description, infraction_code, court_location "
        "ORDER BY court_location ASC")
    results = DynamicFrame.fromDF(totals, glueContext, "results")
    return DynamicFrameCollection({"results": results}, glueContext)
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    import pyspark.sql.functions as F

    # Convert the dynamic frame to a data frame and upper-case the title column.
    df = dfc.select(list(dfc.keys())[0]).toDF()
    df = df.withColumn("title", F.upper(F.col("title")))
    # Create a dynamic frame from the data frame.
    upperDf = DynamicFrame.fromDF(df, glueContext, "filter_votes")
    return DynamicFrameCollection({"CustomTransform0": upperDf}, glueContext)
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    from pyspark.sql import functions as sf
    from pyspark.sql.functions import col

    # Strip the first character of id: substr(2, length) keeps everything
    # from the second character onward.
    newdf = dfc.select(list(dfc.keys())[0]).toDF()
    newdf = newdf.withColumn('length', sf.length('id'))
    newdf = newdf.withColumn('IDnew', col('id').substr(sf.lit(2), col('length')))
    newdatadyc = DynamicFrame.fromDF(newdf, glueContext, 'newData')
    return DynamicFrameCollection({"CustomTransform0": newdatadyc}, glueContext)
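# For illustration only (the id value below is hypothetical, not from the
# job above): with id "A1234", substr(lit(2), length(id)) yields "1234",
# i.e. the id without its leading character.
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("A1234",)], ["id"])
demo = demo.withColumn("IDnew", sf.col("id").substr(sf.lit(2), sf.length("id")))
demo.show()  # IDnew = "1234"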
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    from pyspark.sql.functions import regexp_replace as regxx

    selected = dfc.select(list(dfc.keys())[0]).toDF()
    # Impute '?' placeholders with the most frequent value of each column.
    modeCa = selected.groupby("ca").count().orderBy("count", ascending=False).first()[0]
    newDF = selected.withColumn('ca', regxx('ca', r'\?', modeCa))
    modeThal = newDF.groupby("thal").count().orderBy("count", ascending=False).first()[0]
    newDF = newDF.withColumn('thal', regxx('thal', r'\?', modeThal))
    results = DynamicFrame.fromDF(newDF, glueContext, "results")
    return DynamicFrameCollection({"results": results}, glueContext)
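# A minimal alternative sketch (my rewrite, not from the original job): the
# same mode imputation with when/otherwise instead of a regex, which avoids
# escaping the placeholder and excludes the placeholder itself from the mode.
# Column names 'ca' and 'thal' follow the snippet above; the helper name is
# an assumption.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def impute_mode(df: DataFrame, column: str, placeholder: str = "?") -> DataFrame:
    # Mode = most frequent non-placeholder value of the column.
    mode = (df.filter(F.col(column) != placeholder)
              .groupBy(column).count()
              .orderBy(F.col("count").desc())
              .first()[0])
    return df.withColumn(
        column,
        F.when(F.col(column) == placeholder, mode).otherwise(F.col(column)))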
def write(self, dynamic_frame_or_dfc, info=""):
    if isinstance(dynamic_frame_or_dfc, DynamicFrame):
        return self.writeFrame(dynamic_frame_or_dfc, info)
    elif isinstance(dynamic_frame_or_dfc, DynamicFrameCollection):
        res_frames = [self.writeFrame(frame)
                      for frame in dynamic_frame_or_dfc.values()]
        return DynamicFrameCollection(res_frames, self._sql_ctx)
    else:
        raise TypeError("dynamic_frame_or_dfc must be an instance of "
                        "DynamicFrame or DynamicFrameCollection. Got "
                        + str(type(dynamic_frame_or_dfc)))
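# For context (an assumption about intent, not part of the library source
# quoted above): this isinstance dispatch is what lets a Glue DataSink accept
# either a single DynamicFrame or a whole DynamicFrameCollection, e.g.:
#
#   sink = glueContext.getSink(connection_type="s3", path="s3://my-bucket/out/")
#   sink.write(dyf)   # single DynamicFrame
#   sink.write(dfc)   # DynamicFrameCollection: writes every member frame
#
# The bucket path is hypothetical.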
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    from pyspark.sql.functions import input_file_name, regexp_extract

    df = dfc.select(list(dfc.keys())[0]).toDF()
    # Non-null for the 1st run (lots of files), null for the 2nd run (few files).
    df = df.withColumn("filename", input_file_name())
    # Pull the YYYY-MM-DD-HH stamp out of the source file path.
    df = df.withColumn(
        "submitteddatehour",
        regexp_extract(df['filename'],
                       r".+analytics-.+(202[01]-..-..-..).*parquet", 1))
    df = df.repartition("submitteddatehour")
    dyf = DynamicFrame.fromDF(df, glueContext, "submitteddatehour-extracted")
    return DynamicFrameCollection({"CustomTransform0": dyf}, glueContext)
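# For illustration only: the path below is hypothetical, but it matches the
# pattern used above and shows what lands in submitteddatehour.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, regexp_extract

spark = SparkSession.builder.getOrCreate()
demo = spark.range(1).withColumn(
    "filename",
    lit("s3://bucket/analytics-app/analytics-2021-03-15-09_0.parquet"))
demo.select(
    regexp_extract("filename", r".+analytics-.+(202[01]-..-..-..).*parquet", 1)
    .alias("submitteddatehour")).show()  # 2021-03-15-09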
transformation_ctx="Transform4") ## @type: Join ## @args: [keys2 = ["parking_ticket_number"], keys1 = ["ticket_number"], transformation_ctx = "Transform2"] ## @return: Transform2 ## @inputs: [frame1 = Transform4, frame2 = Transform1] Transform2 = Join.apply(frame1=Transform4, frame2=Transform1, keys2=["parking_ticket_number"], keys1=["ticket_number"], transformation_ctx="Transform2") ## @type: CustomCode ## @args: [dynamicFrameConstruction = DynamicFrameCollection({"Transform2": Transform2}, glueContext), className = Aggregate_Tickets, transformation_ctx = "Transform3"] ## @return: Transform3 ## @inputs: [dfc = Transform2] Transform3 = Aggregate_Tickets( glueContext, DynamicFrameCollection({"Transform2": Transform2}, glueContext)) ## @type: SelectFromCollection ## @args: [key = list(Transform3.keys())[0], transformation_ctx = "Transform0"] ## @return: Transform0 ## @inputs: [dfc = Transform3] Transform0 = SelectFromCollection.apply(dfc=Transform3, key=list(Transform3.keys())[0], transformation_ctx="Transform0") ## @type: DataSink ## @args: [connection_type = "s3", format = "parquet", connection_options = {"path": "s3://glue-studio-blog-537808241319/parking_tickets_count/", "compression": "gzip", "partitionKeys": []}, transformation_ctx = "DataSink0"] ## @return: DataSink0 ## @inputs: [frame = Transform0] DataSink0 = glueContext.write_dynamic_frame.from_options( frame=Transform0, connection_type="s3", format="parquet",
return DynamicFrameCollection({"results": results}, glueContext) ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [format_options = {"withHeader":True,"separator":",","quoteChar":"\""}, connection_type = "s3", format = "csv", connection_options = {"paths": ["s3://aws-glue-lab-demo-landing/aws-glue-lab-heartdisease-studio-job_02Mar2021_1614702502012/aws-glue-lab-heartdisease-studio-job_02Mar2021_1614702502012_part00000.csv"]}, transformation_ctx = "DataSource0"] ## @return: DataSource0 ## @inputs: [] DataSource0 = glueContext.create_dynamic_frame.from_options(format_options = {"withHeader":True,"separator":",","quoteChar":"\""}, connection_type = "s3", format = "csv", connection_options = {"paths": ["s3://aws-glue-lab-demo-landing/aws-glue-lab-heartdisease-studio-job_02Mar2021_1614702502012/aws-glue-lab-heartdisease-studio-job_02Mar2021_1614702502012_part00000.csv"]}, transformation_ctx = "DataSource0") ## @type: CustomCode ## @args: [dynamicFrameConstruction = DynamicFrameCollection({"DataSource0": DataSource0}, glueContext), className = MyTransform, transformation_ctx = "Transform1"] ## @return: Transform1 ## @inputs: [dfc = DataSource0] Transform1 = MyTransform(glueContext, DynamicFrameCollection({"DataSource0": DataSource0}, glueContext)) ## @type: SelectFromCollection ## @args: [key = list(Transform1.keys())[0], transformation_ctx = "Transform0"] ## @return: Transform0 ## @inputs: [dfc = Transform1] Transform0 = SelectFromCollection.apply(dfc = Transform1, key = list(Transform1.keys())[0], transformation_ctx = "Transform0") ## @type: DataSink ## @args: [connection_type = "s3", format = "csv", connection_options = {"path": "s3://aws-glue-lab-demo-tgt/", "partitionKeys": []}, transformation_ctx = "DataSink0"] ## @return: DataSink0 ## @inputs: [frame = Transform0] DataSink0 = glueContext.write_dynamic_frame.from_options(frame = Transform0, connection_type = "s3", format = "csv", connection_options = {"path": "s3://aws-glue-lab-demo-tgt/", "partitionKeys": []}, transformation_ctx = "DataSink0") job.commit()
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
DataSource0 = glueContext.create_dynamic_frame.from_catalog(
    database=args['database_name'],
    table_name=f"{args['database_table_prefix']}chain_venues",
    transformation_ctx="DataSource0")
Transform10 = ApplyMapping.apply(
    frame=DataSource0,
    mappings=[("venueid", "string", "venues_venueid", "string"),
              ("chainid", "string", "chainid", "string")],
    transformation_ctx="Transform10")
DataSource2 = glueContext.create_dynamic_frame.from_catalog(
    database=args['database_name'],
    table_name=f"{args['database_table_prefix']}chain_tickers",
    transformation_ctx="DataSource2")
Transform4 = SelectFields.apply(frame=DataSource2, paths=["ticker"],
                                transformation_ctx="Transform4")
Transform7 = MyTransform(
    glueContext, DynamicFrameCollection({"Transform4": Transform4}, glueContext))
Transform8 = SelectFromCollection.apply(dfc=Transform7,
                                        key=list(Transform7.keys())[0],
                                        transformation_ctx="Transform8")
DataSink1 = glueContext.write_dynamic_frame.from_options(
    frame=Transform8,
    connection_type="s3",
    format="csv",
    connection_options={"path": f"s3://{args['bucket']}/data/etl-output/mapping/identities/",
                        "compression": "gzip",
                        "partitionKeys": []},
    transformation_ctx="DataSink1")
Transform1 = ApplyMapping.apply(
    frame=DataSource2,
    mappings=[("chainid", "string", "chain_tickers_chainid", "string"),
              ("ticker", "string", "ticker", "string")],
    transformation_ctx="Transform1")
Transform0 = Join.apply(frame1=Transform10, frame2=Transform1,
                        keys2=["chain_tickers_chainid"], keys1=["chainid"],
                        transformation_ctx="Transform0")
Transform11 = SelectFields.apply(frame=Transform0,
                                 paths=["venues_venueid", "ticker"],
                                 transformation_ctx="Transform11")
DataSource1 = glueContext.create_dynamic_frame.from_catalog(
    database=args['database_name'],
    table_name=f"{args['database_table_prefix']}data_set",
    transformation_ctx="DataSource1")
Transform5 = ApplyMapping.apply(
    frame=DataSource1,
    mappings=[("venueid", "string", "venueid", "string"),
              ("utc_date", "string", "date", "string"),
              ("dwell", "long", "signal1", "long")],
    transformation_ctx="Transform5")
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    # Max eq_site_limit per (county, policyID) via Spark SQL.
    # Bind the session locally instead of relying on a module-level `spark`.
    spark = glueContext.spark_session
    selected = dfc.select(list(dfc.keys())[0]).toDF()
    selected.createOrReplaceTempView("CountyPolicyMx")
    totals = spark.sql(
        "SELECT county, policyID, MAX(eq_site_limit) AS Mx_eq_site_limit "
        "FROM CountyPolicyMx GROUP BY county, policyID")
    results = DynamicFrame.fromDF(totals, glueContext, "results")
    return DynamicFrameCollection({"results": results}, glueContext)
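# An equivalent sketch (my rewrite, not from the original job): the same
# aggregation in the DataFrame API, skipping the temp view. It takes the
# DataFrame that MyTransform above calls `selected`; the helper name is an
# assumption.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def max_site_limit(selected: DataFrame) -> DataFrame:
    # One row per (county, policyID) with the maximum eq_site_limit.
    return (selected
            .groupBy("county", "policyID")
            .agg(F.max("eq_site_limit").alias("Mx_eq_site_limit")))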
## @inputs: []
DataSource0 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    format="parquet",
    connection_options={
        "paths": [f"{args['SOURCE_BUCKET_URI']}/"],
        "recurse": True
    },
    transformation_ctx="DataSource0")
## @type: CustomCode
## @args: [dynamicFrameConstruction = DynamicFrameCollection({"DataSource0": DataSource0}, glueContext), className = MyTransform, transformation_ctx = "Transform0"]
## @return: Transform0
## @inputs: [dfc = DataSource0]
Transform0 = MyTransform(
    glueContext, DynamicFrameCollection({"DataSource0": DataSource0}, glueContext))
## @type: SelectFromCollection
## @args: [key = list(Transform0.keys())[0], transformation_ctx = "Transform1"]
## @return: Transform1
## @inputs: [dfc = Transform0]
Transform1 = SelectFromCollection.apply(dfc=Transform0,
                                        key=list(Transform0.keys())[0],
                                        transformation_ctx="Transform1")
## @type: ApplyMapping
## @args: [mappings = [("submitteddatehour", "string", "submitteddatehour", "string"), ("startdate", "string", "startdate", "string"), ("enddate", "string", "enddate", "string"), ("postaldistrict", "string", "postaldistrict", "string"), ("localauthority", "string", "localauthority", "string"), ("devicemodel", "string", "devicemodel", "string"), ("latestapplicationversion", "string", "latestapplicationversion", "string"), ("operatingsystemversion", "string", "operatingsystemversion", "string"), ("cumulativedownloadbytes", "int", "cumulativedownloadbytes", "int"), ("cumulativeuploadbytes", "int", "cumulativeuploadbytes", "int"), ("cumulativecellulardownloadbytes", "int", "cumulativecellulardownloadbytes", "int"), ("cumulativecellularuploadbytes", "int", "cumulativecellularuploadbytes", "int"), ("cumulativewifidownloadbytes", "int", "cumulativewifidownloadbytes", "int"), ("cumulativewifiuploadbytes", "int", "cumulativewifiuploadbytes", "int"), ("checkedin", "int", "checkedin", "int"), ("canceledcheckin", "int", "canceledcheckin", "int"), ("receivedvoidtestresult", "int", "receivedvoidtestresult", "int"), ("isisolatingbackgroundtick", "int", "isisolatingbackgroundtick", "int"), ("hashadriskycontactbackgroundtick", "int", "hashadriskycontactbackgroundtick", "int"), ("receivedpositivetestresult", "int", "receivedpositivetestresult", "int"), ("receivednegativetestresult", "int", "receivednegativetestresult", "int"), ("hasselfdiagnosedpositivebackgroundtick", "int", "hasselfdiagnosedpositivebackgroundtick", "int"), ("completedquestionnaireandstartedisolation", "int", "completedquestionnaireandstartedisolation", "int"), ("encounterdetectionpausedbackgroundtick", "int", "encounterdetectionpausedbackgroundtick", "int"), ("completedquestionnairebutdidnotstartisolation", "int", "completedquestionnairebutdidnotstartisolation", "int"), ("totalbackgroundtasks", "int", "totalbackgroundtasks", "int"), ("runningnormallybackgroundtick", "int", "runningnormallybackgroundtick", "int"), ("completedonboarding", "int", "completedonboarding", "int"), ("includesmultipleapplicationversions", "boolean", "includesmultipleapplicationversions", "boolean"), ("receivedvoidtestresultenteredmanually", "int", "receivedvoidtestresultenteredmanually", "int"), ("receivedpositivetestresultenteredmanually", "int", "receivedpositivetestresultenteredmanually", "int"), ("receivednegativetestresultenteredmanually", "int", "receivednegativetestresultenteredmanually", "int"), ("receivedvoidtestresultviapolling", "int", "receivedvoidtestresultviapolling", "int"), ("receivedpositivetestresultviapolling", "int", "receivedpositivetestresultviapolling", "int"), ("receivednegativetestresultviapolling", "int", "receivednegativetestresultviapolling", "int"), ("hasselfdiagnosedbackgroundtick", "int", "hasselfdiagnosedbackgroundtick", "int"), ("hastestedpositivebackgroundtick", "int", "hastestedpositivebackgroundtick", "int"), ("isisolatingforselfdiagnosedbackgroundtick", "int", "isisolatingforselfdiagnosedbackgroundtick", "int"), ("isisolatingfortestedpositivebackgroundtick", "int", "isisolatingfortestedpositivebackgroundtick", "int"), ("isisolatingforhadriskycontactbackgroundtick", "int", "isisolatingforhadriskycontactbackgroundtick", "int"), ("receivedriskycontactnotification", "int", "receivedriskycontactnotification", "int"), ("startedisolation", "int", "startedisolation", "int"), ("receivedpositivetestresultwhenisolatingduetoriskycontact", "int", "receivedpositivetestresultwhenisolatingduetoriskycontact", "int"), ("receivedactiveipctoken", "int", "receivedactiveipctoken", "int"), ("haveactiveipctokenbackgroundtick", "int", "haveactiveipctokenbackgroundtick", "int"), ("selectedisolationpaymentsbutton", "int", "selectedisolationpaymentsbutton", "int"), ("launchedisolationpaymentsapplication", "int", "launchedisolationpaymentsapplication", "int"), ("receivedpositivelfdtestresultviapolling", "int", "receivedpositivelfdtestresultviapolling", "int"), ("receivednegativelfdtestresultviapolling", "int", "receivednegativelfdtestresultviapolling", "int"), ("receivedvoidlfdtestresultviapolling", "int", "receivedvoidlfdtestresultviapolling", "int"), ("receivedpositivelfdtestresultenteredmanually", "int", "receivedpositivelfdtestresultenteredmanually", "int"), ("receivednegativelfdtestresultenteredmanually", "int", "receivednegativelfdtestresultenteredmanually", "int"), ("receivedvoidlfdtestresultenteredmanually", "int", "receivedvoidlfdtestresultenteredmanually", "int"), ("hastestedlfdpositivebackgroundtick", "int", "hastestedlfdpositivebackgroundtick", "int"), ("isisolatingfortestedlfdpositivebackgroundtick", "int", "isisolatingfortestedlfdpositivebackgroundtick", "int"), ("totalexposurewindowsnotconsideredrisky", "int", "totalexposurewindowsnotconsideredrisky", "int"), ("totalexposurewindowsconsideredrisky", "int", "totalexposurewindowsconsideredrisky", "int"), ("acknowledgedstartofisolationduetoriskycontact", "int", "acknowledgedstartofisolationduetoriskycontact", "int"), ("hasriskycontactnotificationsenabledbackgroundtick", "int", "hasriskycontactnotificationsenabledbackgroundtick", "int"), ("totalriskycontactremindernotifications", "int", "totalriskycontactremindernotifications", "int"), ("receivedunconfirmedpositivetestresult", "int", "receivedunconfirmedpositivetestresult", "int"), ("isisolatingforunconfirmedtestbackgroundtick", "int", "isisolatingforunconfirmedtestbackgroundtick", "int"), ("launchedtestordering", "int", "launchedtestordering", "int"), ("didhavesymptomsbeforereceivedtestresult", "int", "didhavesymptomsbeforereceivedtestresult", "int"), ("didrememberonsetsymptomsdatebeforereceivedtestresult", "int", "didrememberonsetsymptomsdatebeforereceivedtestresult", "int"), ("didaskforsymptomsonpositivetestentry", "int", "didaskforsymptomsonpositivetestentry", "int"), ("declarednegativeresultfromdct", "int", "declarednegativeresultfromdct", "int"), ("receivedpositiveselfrapidtestresultviapolling", "int", "receivedpositiveselfrapidtestresultviapolling", "int"), ("receivednegativeselfrapidtestresultviapolling", "int", "receivednegativeselfrapidtestresultviapolling", "int"), ("receivedvoidselfrapidtestresultviapolling", "int", "receivedvoidselfrapidtestresultviapolling", "int"), ("receivedpositiveselfrapidtestresultenteredmanually", "int", "receivedpositiveselfrapidtestresultenteredmanually", "int"), ("receivednegativeselfrapidtestresultenteredmanually", "int", "receivednegativeselfrapidtestresultenteredmanually", "int"), ("receivedvoidselfrapidtestresultenteredmanually", "int", "receivedvoidselfrapidtestresultenteredmanually", "int"), ("isisolatingfortestedselfrapidpositivebackgroundtick", "int", "isisolatingfortestedselfrapidpositivebackgroundtick", "int"), ("hastestedselfrapidpositivebackgroundtick", "int", "hastestedselfrapidpositivebackgroundtick", "int"), ("receivedriskyvenuem1warning", "int", "receivedriskyvenuem1warning", "int"), ("receivedriskyvenuem2warning", "int", "receivedriskyvenuem2warning", "int"), ("hasreceivedriskyvenuem2warningbackgroundtick", "int", "hasreceivedriskyvenuem2warningbackgroundtick", "int"), ("totalalarmmanagerbackgroundtasks", "int", "totalalarmmanagerbackgroundtasks", "int"), ("missingpacketslast7days", "int", "missingpacketslast7days", "int"), ("consentedtosharevenuehistory", "int", "consentedtosharevenuehistory", "int"), ("askedtosharevenuehistory", "int", "askedtosharevenuehistory", "int")], transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame = Transform1]
Transform2 = ApplyMapping.apply(
    frame=Transform1,
    mappings=[
        ("submitteddatehour", "string", "submitteddatehour", "string"),
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    import pyspark.sql.functions as F
    from pyspark.sql import Window

    dyf_soggetti = dfc.select("Soggetti")
    dyf_contratti = dfc.select("Contratti")
    dyf_credito = dfc.select("Credito")
    dyf_prodotti = dfc.select("Prodotti")
    dyf_punti_di_fornitura = dfc.select("PuntiDiFornitura")

    #### Deduplicate soggetti (subjects): keep the row with the highest
    #### key_soggetti per (nome, cognome) pair.
    deduplica_soggetti = dyf_soggetti.toDF()
    windowSpec = Window.partitionBy(
        deduplica_soggetti.nome,
        deduplica_soggetti.cognome).orderBy(F.col("key_soggetti").desc())
    deduplica_soggetti = deduplica_soggetti.withColumn(
        "rank", F.row_number().over(windowSpec)).filter(F.col("rank") == 1)

    ### Number of contracts per subject
    df_contratti_per_soggetto = dyf_contratti.toDF()
    df_contratti_per_soggetto = df_contratti_per_soggetto.groupBy(
        "key_soggetti").count()

    ### Average debt per subject: strip the trailing ' €' from importo and
    ### cast the remainder to double.
    df_debito_medio_per_cliente = dyf_credito.toDF()
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.withColumn(
        "d_importo",
        (F.col("importo").substr(
            F.lit(1), F.instr(F.col("importo"), '€') - 2)).cast('double'))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.groupBy(
        "key_soggetti").agg(F.mean("d_importo"))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.withColumnRenamed(
        "avg(d_importo)", "debito_medio")

    ### Average ELE and GAS rates, plus the earliest activation date
    ### (anno_prima_attivazione_fornitura), over currently active contracts
    df_contratti = dyf_contratti.toDF()
    df_contratti = (df_contratti
        .withColumn("ts_data_attivazione_fornitura",
                    F.to_timestamp(df_contratti.data_attivazione_fornitura))
        .withColumn("ts_data_cessazione_fornitura",
                    F.to_timestamp(df_contratti.data_cessazione_fornitura))
        .drop("key_punti_di_fornitura", "data_cessazione_fornitura",
              "codice_contratto", "key_contratti", "canale_di_vendita",
              "anno_prima_attivazione_fornitura"))
    cd = F.current_timestamp()
    # Keep only contracts that are active right now.
    df_contratti = (df_contratti
        .filter(df_contratti.ts_data_cessazione_fornitura >= cd)
        .filter(df_contratti.ts_data_attivazione_fornitura <= cd))
    df_prodotti = dyf_prodotti.toDF()
    df_prodotti = (df_prodotti
        .withColumn("ts_data_inizio_validita",
                    F.to_timestamp(df_prodotti.data_inizio_validita))
        .withColumn("ts_data_fine_validita",
                    F.to_timestamp(df_prodotti.data_fine_validita))
        .drop('data_inizio_validita', 'data_fine_validita', 'key_prodotti'))
    df_prodotti = df_prodotti.withColumnRenamed("nome_prodotto", "nome_commerciale")
    # ELE rate = average of the four electricity time bands f0..f3.
    df_prodotti = df_prodotti.withColumn(
        "ELE",
        (df_prodotti.f0 + df_prodotti.f1 + df_prodotti.f2 + df_prodotti.f3) / 4
    ).drop('f0', 'f1', 'f2', 'f3')
    df_tariffe_contratti = (df_contratti.join(df_prodotti, "nome_commerciale")
        .withColumn("realGas",
                    F.when(F.col('vettore') == 'GAS', df_prodotti.gas).otherwise(None))
        .withColumn("realEle",
                    F.when(F.col('vettore') == 'ELE', df_prodotti.ELE).otherwise(None))
        .drop('gas', 'ELE'))
    df_tariffe_soggetti = df_tariffe_contratti.groupby(
        df_tariffe_contratti.key_soggetti).agg(
        F.min("data_attivazione_fornitura"), F.mean("realGas"), F.mean("realEle"))
    df_tariffe_soggetti = (df_tariffe_soggetti
        .withColumnRenamed("min(data_attivazione_fornitura)", "data_attivazione_fornitura")
        .withColumnRenamed("avg(realGas)", "media_GAS")
        .withColumnRenamed("avg(realEle)", "media_ELE"))

    ## Compute the churn flag and the preferred sales channel per subject
    df_contratti = dyf_contratti.toDF()
    df_soggetti_canale_vendita = df_contratti.groupBy(
        df_contratti.key_soggetti, df_contratti.canale_di_vendita).agg(
        F.count(df_contratti.canale_di_vendita))
    windowSpec = Window.partitionBy("key_soggetti").orderBy(
        F.col("count(canale_di_vendita)").desc())
    df_soggetti_canale_vendita = df_soggetti_canale_vendita.withColumn(
        "rank", F.row_number().over(windowSpec)).filter(F.col("rank") == 1)
    df_soggetti_canale_vendita = df_soggetti_canale_vendita.drop(
        'count(canale_di_vendita)', 'rank')
    # A subject churned if at least one contract ended before now.
    df_contratti_churn = df_contratti.withColumn(
        "hadChurn",
        F.when(F.col('data_cessazione_fornitura') < cd, 1).otherwise(0))
    df_contratti_churn = df_contratti_churn.groupBy("key_soggetti").agg(
        F.sum("hadChurn"))
    df_contratti_churn = (df_contratti_churn
        .withColumn("Churn", F.when(F.col('sum(hadChurn)') >= 1, 1).otherwise(0))
        .drop("sum(hadChurn)"))

    ### Pivot the regions table: count supply points (key_punti_di_fornitura)
    ### per subject, one column per regione.
    df_contratti = dyf_contratti.toDF()
    df_fornitura = dyf_punti_di_fornitura.toDF()
    df_fonitura_per_contratto = (df_contratti
        .join(df_fornitura, "key_punti_di_fornitura")
        .groupBy("key_soggetti", "regione")
        .pivot("regione")
        .agg(F.count("key_punti_di_fornitura")))

    ### Prepare output
    output = (deduplica_soggetti
        .join(df_contratti_per_soggetto, "key_soggetti")
        .join(df_debito_medio_per_cliente, "key_soggetti")
        .join(df_tariffe_soggetti, "key_soggetti")
        .join(df_contratti_churn, "key_soggetti")
        .join(df_soggetti_canale_vendita, "key_soggetti")
        .join(df_fonitura_per_contratto, "key_soggetti"))
    dyf_output = DynamicFrame.fromDF(output, glueContext, "output")
    return DynamicFrameCollection({"CustomTransform0": dyf_output}, glueContext)
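# For context, a hedged sketch (variable names are assumptions inferred from
# the keys the transform selects, not taken from an original job script):
# unlike the single-input transforms above, this one must be fed a
# DynamicFrameCollection whose keys match the names it selects, e.g.:
#
#   dfc_in = DynamicFrameCollection({
#       "Soggetti": dyf_soggetti,
#       "Contratti": dyf_contratti,
#       "Credito": dyf_credito,
#       "Prodotti": dyf_prodotti,
#       "PuntiDiFornitura": dyf_punti_di_fornitura,
#   }, glueContext)
#   out = MyTransform(glueContext, dfc_in)
#   result = SelectFromCollection.apply(dfc=out, key="CustomTransform0",
#                                       transformation_ctx="Result")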