コード例 #1
0
def AggregateSignal(glueContext, dfc) -> DynamicFrameCollection:
    """Sum signal1 per (ticker, date) and tag every row with market='US'."""
    from pyspark.sql.functions import lit

    # Pull the first (and only) frame out of the collection as a DataFrame.
    source = dfc.select(list(dfc.keys())[0]).toDF()

    aggregated = (
        source.groupby('ticker', 'date')
        .agg({"signal1": "sum"})
        .withColumnRenamed('sum(signal1)', 'signal1')
        .withColumn('market', lit('US'))
    )

    converted = DynamicFrame.fromDF(aggregated, glueContext, "conversion")
    return DynamicFrameCollection({"CustomTransform0": converted}, glueContext)
コード例 #2
0
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    """Keep a single row per ticker and label every row with market='US'."""
    from pyspark.sql.functions import lit

    frame = dfc.select(list(dfc.keys())[0]).toDF()
    deduped = frame.drop_duplicates(subset=['ticker']).withColumn("market", lit('US'))
    result = DynamicFrame.fromDF(deduped, glueContext, "conversion")
    return DynamicFrameCollection({"CustomTransform0": result}, glueContext)
コード例 #3
0
def Aggregate_Tickets(glueContext, dfc) -> DynamicFrameCollection:
    """Count tickets per court location and infraction via a Spark SQL view.

    NOTE(review): relies on a module-level `spark` session being in scope.
    """
    first_key = list(dfc.keys())[0]
    tickets = dfc.select(first_key).toDF()

    # Register the frame so it can be queried with plain SQL.
    tickets.createOrReplaceTempView("ticketcount")
    counted = spark.sql(
        "select court_location as location, infraction_description as infraction, count(infraction_code) as total  FROM ticketcount group by infraction_description, infraction_code, court_location order by court_location asc"
    )

    wrapped = DynamicFrame.fromDF(counted, glueContext, "results")
    return DynamicFrameCollection({"results": wrapped}, glueContext)
コード例 #4
0
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    """Upper-case the `title` column of the collection's first frame."""
    import pyspark.sql.functions as F

    # DynamicFrame -> Spark DataFrame
    frame = dfc.select(list(dfc.keys())[0]).toDF()
    frame = frame.withColumn("title", F.upper(F.col("title")))

    # Spark DataFrame -> DynamicFrame
    upper_dyf = DynamicFrame.fromDF(frame, glueContext, "filter_votes")
    return DynamicFrameCollection({"CustomTransform0": upper_dyf}, glueContext)
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    """Strip the first character of `id` into a new `IDnew` column.

    Adds two columns to the collection's first frame:
      * length -- character length of `id`
      * IDnew  -- `id` with its first character removed (substr from pos 2)
    """
    # Fix: dropped the unused `substring` import from the original.
    from pyspark.sql import functions as sf
    from pyspark.sql.functions import col

    newdf = dfc.select(list(dfc.keys())[0]).toDF()
    newdf = newdf.withColumn('length', sf.length('id'))
    # Column.substr accepts Column arguments (the substring() function does
    # not), so the per-row length can be used directly; asking for more
    # characters than remain truncates at the end of the string.
    newdf = newdf.withColumn('IDnew',
                             col('id').substr(sf.lit(2), col('length')))
    newdatadyc = DynamicFrame.fromDF(newdf, glueContext, 'newData')
    return DynamicFrameCollection({"CustomTransform0": newdatadyc},
                                  glueContext)
コード例 #6
0
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    """Impute '?' placeholders in `ca` and `thal` with each column's mode.

    The mode is the most frequent value of the column; ties are broken
    arbitrarily by Spark's sort order. NOTE(review): this assumes '?' is
    never itself the most frequent value — confirm against the source data.
    """
    selected = dfc.select(list(dfc.keys())[0]).toDF()

    from pyspark.sql.functions import regexp_replace

    def _impute_mode(df, column):
        # Most frequent value = first row of the descending count ranking.
        mode = df.groupby(column).count().orderBy("count", ascending=False).first()[0]
        # Fix: raw string for the regex — the original '\?' is an invalid
        # escape sequence (SyntaxWarning on modern Python).
        return df.withColumn(column, regexp_replace(column, r'\?', mode))

    newDF = _impute_mode(selected, "ca")
    newDF = _impute_mode(newDF, "thal")

    results = DynamicFrame.fromDF(newDF, glueContext, "results")
    return DynamicFrameCollection({"results": results}, glueContext)
コード例 #7
0
    def write(self, dynamic_frame_or_dfc, info = ""):
        """Write a DynamicFrame, or every frame in a DynamicFrameCollection.

        :param dynamic_frame_or_dfc: a DynamicFrame or DynamicFrameCollection
        :param info: informational string forwarded to writeFrame
            (single-frame case only; the collection path does not pass it)
        :return: the writeFrame result, or a DynamicFrameCollection of the
            per-frame results
        :raises TypeError: for any other argument type
        """
        if isinstance(dynamic_frame_or_dfc, DynamicFrame):
            return self.writeFrame(dynamic_frame_or_dfc, info)

        elif isinstance(dynamic_frame_or_dfc, DynamicFrameCollection):
            res_frames = [self.writeFrame(frame)
                          for frame in dynamic_frame_or_dfc.values()]
            return DynamicFrameCollection(res_frames, self._sql_ctx)

        else:
            # Fix: the two literals previously concatenated to
            # "instance ofDynamicFrame" (missing trailing space).
            raise TypeError("dynamic_frame_or_dfc must be an instance of "
                            "DynamicFrame or DynamicFrameCollection. Got "
                            + str(type(dynamic_frame_or_dfc)))
コード例 #8
0
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    """Derive a `submitteddatehour` column from each record's source file
    name and repartition the data on it."""
    frame = dfc.select(list(dfc.keys())[0]).toDF()

    from pyspark.sql.functions import input_file_name
    from pyspark.sql.functions import regexp_extract

    # non-null for 1st run (lots of files), null for 2nd run (few files)
    frame = frame.withColumn("filename", input_file_name())
    frame = frame.withColumn(
        "submitteddatehour",
        regexp_extract(frame['filename'],
                       r".+analytics-.+(202[01]-..-..-..).*parquet", 1))

    frame = frame.repartition("submitteddatehour")

    result = DynamicFrame.fromDF(frame, glueContext, "submitteddatehour-extracted")
    return DynamicFrameCollection({"CustomTransform0": result}, glueContext)
コード例 #9
0
    transformation_ctx="Transform4")
## @type: Join
## @args: [keys2 = ["parking_ticket_number"], keys1 = ["ticket_number"], transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame1 = Transform4, frame2 = Transform1]
# Join the two upstream frames on ticket number
# (Transform4.ticket_number == Transform1.parking_ticket_number).
Transform2 = Join.apply(frame1=Transform4,
                        frame2=Transform1,
                        keys2=["parking_ticket_number"],
                        keys1=["ticket_number"],
                        transformation_ctx="Transform2")
## @type: CustomCode
## @args: [dynamicFrameConstruction = DynamicFrameCollection({"Transform2": Transform2}, glueContext), className = Aggregate_Tickets, transformation_ctx = "Transform3"]
## @return: Transform3
## @inputs: [dfc = Transform2]
# Custom transform (defined elsewhere in this file): aggregate ticket
# counts per court location / infraction.
Transform3 = Aggregate_Tickets(
    glueContext, DynamicFrameCollection({"Transform2": Transform2},
                                        glueContext))
## @type: SelectFromCollection
## @args: [key = list(Transform3.keys())[0], transformation_ctx = "Transform0"]
## @return: Transform0
## @inputs: [dfc = Transform3]
# Unwrap the single frame returned by the custom transform.
Transform0 = SelectFromCollection.apply(dfc=Transform3,
                                        key=list(Transform3.keys())[0],
                                        transformation_ctx="Transform0")
## @type: DataSink
## @args: [connection_type = "s3", format = "parquet", connection_options = {"path": "s3://glue-studio-blog-537808241319/parking_tickets_count/", "compression": "gzip", "partitionKeys": []}, transformation_ctx = "DataSink0"]
## @return: DataSink0
## @inputs: [frame = Transform0]
DataSink0 = glueContext.write_dynamic_frame.from_options(
    frame=Transform0,
    connection_type="s3",
    format="parquet",
コード例 #10
0
    return DynamicFrameCollection({"results": results}, glueContext)

## @params: [JOB_NAME]
# Resolve job arguments and bootstrap the Glue/Spark runtime.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Source: headered CSV in the landing bucket.
DataSource0 = glueContext.create_dynamic_frame.from_options(
    format_options={
        "withHeader": True,
        "separator": ",",
        "quoteChar": "\"",
    },
    connection_type="s3",
    format="csv",
    connection_options={
        "paths": [
            "s3://aws-glue-lab-demo-landing/aws-glue-lab-heartdisease-studio-job_02Mar2021_1614702502012/aws-glue-lab-heartdisease-studio-job_02Mar2021_1614702502012_part00000.csv"
        ]
    },
    transformation_ctx="DataSource0",
)

# Apply the custom transform, then unwrap its single output frame.
Transform1 = MyTransform(
    glueContext,
    DynamicFrameCollection({"DataSource0": DataSource0}, glueContext),
)
Transform0 = SelectFromCollection.apply(
    dfc=Transform1,
    key=list(Transform1.keys())[0],
    transformation_ctx="Transform0",
)

# Sink: write the result back to S3 as CSV, then commit the job bookmark.
DataSink0 = glueContext.write_dynamic_frame.from_options(
    frame=Transform0,
    connection_type="s3",
    format="csv",
    connection_options={"path": "s3://aws-glue-lab-demo-tgt/", "partitionKeys": []},
    transformation_ctx="DataSink0",
)
job.commit()
コード例 #11
0
# NOTE(review): `args` is used below but resolved earlier in the file
# (outside this excerpt); it must provide JOB_NAME, database_name,
# database_table_prefix and bucket.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# chain_venues: venueid -> chainid (venueid renamed for the later join).
DataSource0 = glueContext.create_dynamic_frame.from_catalog(database = args['database_name'], table_name = f"{args['database_table_prefix']}chain_venues", transformation_ctx = "DataSource0")

Transform10 = ApplyMapping.apply(frame = DataSource0, mappings = [("venueid", "string", "venues_venueid", "string"), ("chainid", "string", "chainid", "string")], transformation_ctx = "Transform10")

# chain_tickers: chainid -> ticker mapping.
DataSource2 = glueContext.create_dynamic_frame.from_catalog(database = args['database_name'], table_name = f"{args['database_table_prefix']}chain_tickers", transformation_ctx = "DataSource2")

Transform4 = SelectFields.apply(frame = DataSource2, paths = ["ticker"], transformation_ctx = "Transform4")

# Run the custom transform (defined elsewhere in this file) over the ticker
# list, unwrap its single output frame, and write it to S3 as gzipped CSV.
Transform7 = MyTransform(glueContext, DynamicFrameCollection({"Transform4": Transform4}, glueContext))

Transform8 = SelectFromCollection.apply(dfc = Transform7, key = list(Transform7.keys())[0], transformation_ctx = "Transform8")

DataSink1 = glueContext.write_dynamic_frame.from_options(frame = Transform8, connection_type = "s3", format = "csv", connection_options = {"path": f"s3://{args['bucket']}/data/etl-output/mapping/identities/", "compression": "gzip", "partitionKeys": []}, transformation_ctx = "DataSink1")

# Join venues to tickers via chainid.
Transform1 = ApplyMapping.apply(frame = DataSource2, mappings = [("chainid", "string", "chain_tickers_chainid", "string"), ("ticker", "string", "ticker", "string")], transformation_ctx = "Transform1")

Transform0 = Join.apply(frame1 = Transform10, frame2 = Transform1, keys2 = ["chain_tickers_chainid"], keys1 = ["chainid"], transformation_ctx = "Transform0")

Transform11 = SelectFields.apply(frame = Transform0, paths = ["venues_venueid", "ticker"], transformation_ctx = "Transform11")

# data_set: per-venue daily dwell signal, remapped to (venueid, date, signal1).
DataSource1 = glueContext.create_dynamic_frame.from_catalog(database = args['database_name'], table_name = f"{args['database_table_prefix']}data_set", transformation_ctx = "DataSource1")

Transform5 = ApplyMapping.apply(frame = DataSource1, mappings = [("venueid", "string", "venueid", "string"), ("utc_date", "string", "date", "string"), ("dwell", "long", "signal1", "long")], transformation_ctx = "Transform5")
コード例 #12
0
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    """Max eq_site_limit per (county, policyID), computed via Spark SQL.

    NOTE(review): relies on a module-level `spark` session being in scope.
    """
    policies = dfc.select(list(dfc.keys())[0]).toDF()
    # Register the frame so it can be queried with plain SQL.
    policies.createOrReplaceTempView("CountyPolicyMx")
    maxima = spark.sql("SELECT county, policyID, Max(eq_site_limit) as Mx_eq_site_limit FROM CountyPolicyMx GROUP BY county, policyID")
    results = DynamicFrame.fromDF(maxima, glueContext, "results")
    return DynamicFrameCollection({"results": results}, glueContext)
コード例 #13
0
## @inputs: []
# Source: recursively read all Parquet files under the source bucket.
# NOTE(review): `args` (with SOURCE_BUCKET_URI) is resolved earlier in the
# file, outside this excerpt.
DataSource0 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    format="parquet",
    connection_options={
        "paths": [f"{args['SOURCE_BUCKET_URI']}/"],
        "recurse": True
    },
    transformation_ctx="DataSource0")
## @type: CustomCode
## @args: [dynamicFrameConstruction = DynamicFrameCollection({"DataSource0": DataSource0}, glueContext), className = MyTransform, transformation_ctx = "Transform0"]
## @return: Transform0
## @inputs: [dfc = DataSource0]
# Apply the custom transform (defined elsewhere in this file), then unwrap
# its single output frame.
Transform0 = MyTransform(
    glueContext,
    DynamicFrameCollection({"DataSource0": DataSource0}, glueContext))
## @type: SelectFromCollection
## @args: [key = list(Transform0.keys())[0], transformation_ctx = "Transform1"]
## @return: Transform1
## @inputs: [dfc = Transform0]
Transform1 = SelectFromCollection.apply(dfc=Transform0,
                                        key=list(Transform0.keys())[0],
                                        transformation_ctx="Transform1")
## @type: ApplyMapping
## @args: [mappings = [("submitteddatehour", "string", "submitteddatehour", "string"), ("startdate", "string", "startdate", "string"), ("enddate", "string", "enddate", "string"), ("postaldistrict", "string", "postaldistrict", "string"), ("localauthority", "string", "localauthority", "string"), ("devicemodel", "string", "devicemodel", "string"), ("latestapplicationversion", "string", "latestapplicationversion", "string"), ("operatingsystemversion", "string", "operatingsystemversion", "string"), ("cumulativedownloadbytes", "int", "cumulativedownloadbytes", "int"), ("cumulativeuploadbytes", "int", "cumulativeuploadbytes", "int"), ("cumulativecellulardownloadbytes", "int", "cumulativecellulardownloadbytes", "int"), ("cumulativecellularuploadbytes", "int", "cumulativecellularuploadbytes", "int"), ("cumulativewifidownloadbytes", "int", "cumulativewifidownloadbytes", "int"), ("cumulativewifiuploadbytes", "int", "cumulativewifiuploadbytes", "int"), ("checkedin", "int", "checkedin", "int"), ("canceledcheckin", "int", "canceledcheckin", "int"), ("receivedvoidtestresult", "int", "receivedvoidtestresult", "int"), ("isisolatingbackgroundtick", "int", "isisolatingbackgroundtick", "int"), ("hashadriskycontactbackgroundtick", "int", "hashadriskycontactbackgroundtick", "int"), ("receivedpositivetestresult", "int", "receivedpositivetestresult", "int"), ("receivednegativetestresult", "int", "receivednegativetestresult", "int"), ("hasselfdiagnosedpositivebackgroundtick", "int", "hasselfdiagnosedpositivebackgroundtick", "int"), ("completedquestionnaireandstartedisolation", "int", "completedquestionnaireandstartedisolation", "int"), ("encounterdetectionpausedbackgroundtick", "int", "encounterdetectionpausedbackgroundtick", "int"), ("completedquestionnairebutdidnotstartisolation", "int", "completedquestionnairebutdidnotstartisolation", "int"), ("totalbackgroundtasks", "int", "totalbackgroundtasks", "int"), ("runningnormallybackgroundtick", "int", "runningnormallybackgroundtick", "int"), 
("completedonboarding", "int", "completedonboarding", "int"), ("includesmultipleapplicationversions", "boolean", "includesmultipleapplicationversions", "boolean"), ("receivedvoidtestresultenteredmanually", "int", "receivedvoidtestresultenteredmanually", "int"), ("receivedpositivetestresultenteredmanually", "int", "receivedpositivetestresultenteredmanually", "int"), ("receivednegativetestresultenteredmanually", "int", "receivednegativetestresultenteredmanually", "int"), ("receivedvoidtestresultviapolling", "int", "receivedvoidtestresultviapolling", "int"), ("receivedpositivetestresultviapolling", "int", "receivedpositivetestresultviapolling", "int"), ("receivednegativetestresultviapolling", "int", "receivednegativetestresultviapolling", "int"), ("hasselfdiagnosedbackgroundtick", "int", "hasselfdiagnosedbackgroundtick", "int"), ("hastestedpositivebackgroundtick", "int", "hastestedpositivebackgroundtick", "int"), ("isisolatingforselfdiagnosedbackgroundtick", "int", "isisolatingforselfdiagnosedbackgroundtick", "int"), ("isisolatingfortestedpositivebackgroundtick", "int", "isisolatingfortestedpositivebackgroundtick", "int"), ("isisolatingforhadriskycontactbackgroundtick", "int", "isisolatingforhadriskycontactbackgroundtick", "int"), ("receivedriskycontactnotification", "int", "receivedriskycontactnotification", "int"), ("startedisolation", "int", "startedisolation", "int"), ("receivedpositivetestresultwhenisolatingduetoriskycontact", "int", "receivedpositivetestresultwhenisolatingduetoriskycontact", "int"), ("receivedactiveipctoken", "int", "receivedactiveipctoken", "int"), ("haveactiveipctokenbackgroundtick", "int", "haveactiveipctokenbackgroundtick", "int"), ("selectedisolationpaymentsbutton", "int", "selectedisolationpaymentsbutton", "int"), ("launchedisolationpaymentsapplication", "int", "launchedisolationpaymentsapplication", "int"), ("receivedpositivelfdtestresultviapolling", "int", "receivedpositivelfdtestresultviapolling", "int"), 
("receivednegativelfdtestresultviapolling", "int", "receivednegativelfdtestresultviapolling", "int"), ("receivedvoidlfdtestresultviapolling", "int", "receivedvoidlfdtestresultviapolling", "int"), ("receivedpositivelfdtestresultenteredmanually", "int", "receivedpositivelfdtestresultenteredmanually", "int"), ("receivednegativelfdtestresultenteredmanually", "int", "receivednegativelfdtestresultenteredmanually", "int"), ("receivedvoidlfdtestresultenteredmanually", "int", "receivedvoidlfdtestresultenteredmanually", "int"), ("hastestedlfdpositivebackgroundtick", "int", "hastestedlfdpositivebackgroundtick", "int"), ("isisolatingfortestedlfdpositivebackgroundtick", "int", "isisolatingfortestedlfdpositivebackgroundtick", "int"), ("totalexposurewindowsnotconsideredrisky", "int", "totalexposurewindowsnotconsideredrisky", "int"), ("totalexposurewindowsconsideredrisky", "int", "totalexposurewindowsconsideredrisky", "int"), ("acknowledgedstartofisolationduetoriskycontact", "int", "acknowledgedstartofisolationduetoriskycontact", "int"), ("hasriskycontactnotificationsenabledbackgroundtick", "int", "hasriskycontactnotificationsenabledbackgroundtick", "int"), ("totalriskycontactremindernotifications", "int", "totalriskycontactremindernotifications", "int"), ("receivedunconfirmedpositivetestresult", "int", "receivedunconfirmedpositivetestresult", "int"), ("isisolatingforunconfirmedtestbackgroundtick", "int", "isisolatingforunconfirmedtestbackgroundtick", "int"), ("launchedtestordering", "int", "launchedtestordering", "int"), ("didhavesymptomsbeforereceivedtestresult", "int", "didhavesymptomsbeforereceivedtestresult", "int"), ("didrememberonsetsymptomsdatebeforereceivedtestresult", "int", "didrememberonsetsymptomsdatebeforereceivedtestresult", "int"), ("didaskforsymptomsonpositivetestentry", "int", "didaskforsymptomsonpositivetestentry", "int"), ("declarednegativeresultfromdct", "int", "declarednegativeresultfromdct", "int"), ("receivedpositiveselfrapidtestresultviapolling", "int", 
"receivedpositiveselfrapidtestresultviapolling", "int"), ("receivednegativeselfrapidtestresultviapolling", "int", "receivednegativeselfrapidtestresultviapolling", "int"), ("receivedvoidselfrapidtestresultviapolling", "int", "receivedvoidselfrapidtestresultviapolling", "int"), ("receivedpositiveselfrapidtestresultenteredmanually", "int", "receivedpositiveselfrapidtestresultenteredmanually", "int"), ("receivednegativeselfrapidtestresultenteredmanually", "int", "receivednegativeselfrapidtestresultenteredmanually", "int"), ("receivedvoidselfrapidtestresultenteredmanually", "int", "receivedvoidselfrapidtestresultenteredmanually", "int"), ("isisolatingfortestedselfrapidpositivebackgroundtick", "int", "isisolatingfortestedselfrapidpositivebackgroundtick", "int"), ("hastestedselfrapidpositivebackgroundtick", "int", "hastestedselfrapidpositivebackgroundtick", "int"), ("receivedriskyvenuem1warning", "int", "receivedriskyvenuem1warning", "int"), ("receivedriskyvenuem2warning", "int", "receivedriskyvenuem2warning", "int"), ("hasreceivedriskyvenuem2warningbackgroundtick", "int", "hasreceivedriskyvenuem2warningbackgroundtick", "int"), ("totalalarmmanagerbackgroundtasks", "int", "totalalarmmanagerbackgroundtasks", "int"), ("missingpacketslast7days", "int", "missingpacketslast7days", "int"), ("consentedtosharevenuehistory", "int", "consentedtosharevenuehistory", "int"), ("askedtosharevenuehistory", "int", "askedtosharevenuehistory", "int")], transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame = Transform1]
Transform2 = ApplyMapping.apply(
    frame=Transform1,
    mappings=[
        ("submitteddatehour", "string", "submitteddatehour", "string"),
コード例 #14
0
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    """Build one enriched row per subject from five input frames.

    Expects the collection to contain frames keyed "Soggetti" (subjects),
    "Contratti" (contracts), "Credito" (credit), "Prodotti" (products) and
    "PuntiDiFornitura" (supply points), and joins several per-subject
    aggregates onto the deduplicated subject list.

    NOTE(review): relies on module-level `F` (pyspark.sql.functions) and
    `Window` being in scope — they are not imported in this function.
    """
    dyf_soggetti = dfc.select("Soggetti")
    dyf_contratti = dfc.select("Contratti")
    dyf_credito = dfc.select("Credito")
    dyf_prodotti = dfc.select("Prodotti")
    dyf_punti_di_fornitura = dfc.select("PuntiDiFornitura")

    #### Deduplicate subjects: keep the highest key per (nome, cognome) pair.
    deduplica_soggetti = dyf_soggetti.toDF()
    windowSpec=Window.partitionBy(deduplica_soggetti.nome,deduplica_soggetti.cognome).\
    orderBy(F.col("key_soggetti").desc())
    deduplica_soggetti = deduplica_soggetti.withColumn(
        "rank",
        F.row_number().over(windowSpec)).filter(F.col("rank") == 1)

    ### Number of contracts per subject
    df_contratti_per_soggetto = dyf_contratti.toDF()
    df_contratti_per_soggetto = df_contratti_per_soggetto.groupBy(
        "key_soggetti").count()

    ### Average debt per subject
    df_debito_medio_per_cliente = dyf_credito.toDF()
    # `importo` appears to be formatted like "<amount> €": the substr keeps
    # everything up to two characters before the euro sign, then casts to
    # double. TODO(review): confirm the amount format against the source.
    df_debito_medio_per_cliente=df_debito_medio_per_cliente.withColumn("d_importo", (F.col("importo").\
            substr(F.lit(1), F.instr(F.col("importo"), '€')-2)).cast('double'))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.groupBy(
        "key_soggetti").agg(F.mean("d_importo"))
    df_debito_medio_per_cliente = df_debito_medio_per_cliente.withColumnRenamed(
        "avg(d_importo)", "debito_medio")

    ### Average ELE and GAS tariff and earliest activation date over active contracts

    df_contratti = dyf_contratti.toDF()
    df_contratti=df_contratti.withColumn("ts_data_attivazione_fornitura",F.to_timestamp (df_contratti.data_attivazione_fornitura)).\
        withColumn ("ts_data_cessazione_fornitura",F.to_timestamp (df_contratti.data_cessazione_fornitura)).\
        drop("key_punti_di_fornitura","data_cessazione_fornitura","codice_contratto","key_contratti",
            "canale_di_vendita","anno_prima_attivazione_fornitura")

    # A contract counts as active when "now" lies inside its
    # activation..cessation window.
    cd = F.current_timestamp()
    df_contratti=df_contratti.filter (df_contratti.ts_data_cessazione_fornitura >= cd).\
            filter (df_contratti.ts_data_attivazione_fornitura <= cd)

    df_prodotti = dyf_prodotti.toDF()
    df_prodotti=df_prodotti.withColumn("ts_data_inizio_validita",F.to_timestamp (df_prodotti.data_inizio_validita)).\
        withColumn ("ts_data_fine_validita",F.to_timestamp (df_prodotti.data_fine_validita)).\
        drop ('data_inizio_validita','data_fine_validita','key_prodotti')
    df_prodotti = df_prodotti.withColumnRenamed("nome_prodotto",
                                                "nome_commerciale")
    # ELE tariff = mean of the four band prices f0..f3.
    df_prodotti=df_prodotti.withColumn("ELE", (df_prodotti.f0+ df_prodotti.f1+ df_prodotti.f2+ df_prodotti.f3)/4).\
        drop('f0','f1','f2','f3')

    # Keep whichever tariff matches the contract's vector (GAS vs ELE).
    df_tariffe_contratti=df_contratti.join (df_prodotti,"nome_commerciale").\
        withColumn ("realGas",F.when (F.col('vettore')=='GAS',df_prodotti.gas).otherwise (None)).\
        withColumn ("realEle",F.when (F.col('vettore')=='ELE',df_prodotti.ELE).otherwise (None)).\
        drop ('gas','ELE')

    df_tariffe_soggetti=df_tariffe_contratti.groupby (df_tariffe_contratti.key_soggetti).\
        agg(F.min("data_attivazione_fornitura"),F.mean("realGas"),F.mean("realEle"))

    df_tariffe_soggetti=df_tariffe_soggetti.withColumnRenamed("min(data_attivazione_fornitura)","data_attivazione_fornitura").\
        withColumnRenamed("avg(realGas)","media_GAS").\
        withColumnRenamed("avg(realEle)","media_ELE")

    ## Churn index and preferred sales channel per subject
    df_contratti = dyf_contratti.toDF()
    df_soggetti_canale_vendita=df_contratti.groupBy (df_contratti.key_soggetti,df_contratti.canale_di_vendita).\
            agg (F.count(df_contratti.canale_di_vendita))

    # Rank channels by usage and keep each subject's most frequent one.
    windowSpec=Window.partitionBy("key_soggetti").\
            orderBy(F.col("count(canale_di_vendita)").desc())

    df_soggetti_canale_vendita=df_soggetti_canale_vendita.\
        withColumn("rank",F.row_number().over (windowSpec)).filter (F.col("rank")==1)

    df_soggetti_canale_vendita = df_soggetti_canale_vendita.drop(
        'count(canale_di_vendita)', 'rank')
    # Churn = 1 if the subject has at least one contract already ceased.
    df_contratti_churn = df_contratti.withColumn(
        "hadChurn",
        F.when(F.col('data_cessazione_fornitura') < cd, 1).otherwise(0))
    df_contratti_churn = df_contratti_churn.groupBy("key_soggetti").agg(
        F.sum("hadChurn"))
    df_contratti_churn=df_contratti_churn.\
        withColumn("Churn",F.when (F.col('sum(hadChurn)')>=1,1).otherwise (0)).\
        drop("sum(hadChurn)")

    ### Unpivot the regions table: one column per region with supply-point counts
    # NOTE(review): grouping by "regione" AND pivoting on it yields at most
    # one non-null region column per output row — confirm this is intended.
    df_contratti = dyf_contratti.toDF()
    df_fornitura = dyf_punti_di_fornitura.toDF()
    df_fonitura_per_contratto=df_contratti.join(df_fornitura,"key_punti_di_fornitura").\
        groupBy ("key_soggetti","regione").\
        pivot("regione").\
        agg (F.count("key_punti_di_fornitura"))

    ### Assemble the output: join every aggregate onto the deduped subjects.
    output=deduplica_soggetti.join (df_contratti_per_soggetto,"key_soggetti").\
        join(df_debito_medio_per_cliente,"key_soggetti").\
        join(df_tariffe_soggetti,"key_soggetti").\
        join(df_contratti_churn,"key_soggetti").\
        join (df_soggetti_canale_vendita,"key_soggetti").\
        join (df_fonitura_per_contratto,"key_soggetti")

    dyf_output = DynamicFrame.fromDF(output, glueContext, "output")
    return (DynamicFrameCollection({"CustomTransform0": dyf_output},
                                   glueContext))