dataPath = "file:///home/karthik/srikarthik/twitter/"
"""
On Windows, use the absolute path of the folder including the drive letter.
Example: dataPath = "C:\\filepath\\twitter\\"
Note: specify only the folder path, not a filename, otherwise the statements below throw an error.
"""

# Defining the schema
dataSchema = StructType([StructField("Text", StringType(), True),
                         StructField("Score", DoubleType(), True)])

"""
Notes:
* The method SparkSession.readStream returns a DataStreamReader used to configure the stream.
* Every streaming DataFrame must have a schema - the definition of column names and data types -
  otherwise it throws an error.
* The stream source can be files, Kafka, TCP/IP, the console, etc. Here we use a file source:
  in the example below we consume files written continuously to a pre-defined directory.
  To control how much data is pulled into Spark at once, we can set the option maxFilesPerTrigger;
  for example, we will read only one file per trigger.
* We stored tweets with the fields text and score in a text file using '|' as the delimiter,
  so we read it with the csv format and '|' as the separator. Note: streaming DataFrame
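# The notes above describe a file-source stream read with the csv format, a '|' separator,
# and maxFilesPerTrigger. A minimal sketch of that reader follows (an assumption, not part of
# the original snippet); it assumes an existing SparkSession named `spark` and the dataPath /
# dataSchema defined above.
streamingDF = (spark.readStream
               .format("csv")                    # tweets stored as '|'-delimited text
               .option("sep", "|")
               .option("maxFilesPerTrigger", 1)  # pull in one file per trigger
               .schema(dataSchema)               # streaming sources require an explicit schema
               .load(dataPath))

print(streamingDF.isStreaming)  # True for a streaming DataFrame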
pdf_system_alerts = pd.DataFrame(system_alerts)
pdf_system_regions = pd.DataFrame(system_regions)

# Datatype change: explode the station_ids list into one station_id string per row, renaming the column too
columns = ['alert_id', 'last_updated', 'station_id', 'summary', 'type']
new_pdf_system_alerts = pd.DataFrame(columns=columns)
for index, row in pdf_system_alerts.iterrows():
    if len(row.station_ids) == 1:
        new_pdf_system_alerts.loc[len(new_pdf_system_alerts)] = [
            row['alert_id'], row['last_updated'], row['station_ids'][0],
            row['summary'], row['type']]
    else:
        for val in row.station_ids[:]:
            new_pdf_system_alerts.loc[len(new_pdf_system_alerts)] = [
                row['alert_id'], row['last_updated'], val,
                row['summary'], row['type']]

schema_system_alerts = StructType([StructField("alert_id", LongType(), True),
                                   StructField("last_updated", LongType(), False),
                                   StructField("station_id", StringType(), False),
                                   StructField("summary", StringType(), False),
                                   StructField("type", StringType(), False)])
sqldf_system_alerts = sqlContext.createDataFrame([], schema_system_alerts)

# Converting the pandas DataFrames to Spark DataFrames for aggregation
sqldf_station_info = sqlContext.createDataFrame(pdf_station_info)
sqldf_station_status = sqlContext.createDataFrame(pdf_station_status)
if len(pdf_system_alerts) > 0:
    sqldf_system_alerts = sqlContext.createDataFrame(new_pdf_system_alerts)
if len(pdf_system_regions):
    sqldf_system_regions = sqlContext.createDataFrame(pdf_system_regions)

joined_info_status = sqldf_station_info.join(sqldf_station_status, ['station_id'], 'outer')
join_info_status_alert = joined_info_status.join(sqldf_system_alerts, ['station_id'], 'outer')
jdf = spark.read.json(df.select("emp_json").rdd.map(lambda s: str(s)))
jdf.show(truncate=False)
jdf.printSchema()

schema = \
    StructType([
        StructField("emp", ArrayType(
            StructType([
                StructField("address", StructType([
                    StructField("city", StringType(), True),
                    StructField("street", StringType(), True),
                    StructField("unit", StringType(), True)]), True),
                StructField("name", StructType([
                    StructField("firstName", StringType(), True),
                    StructField("lastName", StringType(), True)]), True)
            ]), True), True)
    ])

# .select(F.explode(F.col("employee")).alias("emp")) \
jdf = jdf.select(F.col("*"), F.from_json(F.col("_corrupt_record"), schema).alias("emp"))
jdf = jdf.select("emp.name.firstName", "emp.name.lastName",
def cleaner(df_union, spark):
    # Change the data type
    df_union = df_union.withColumn(
        "intervalSize", df_union["intervalSize"].cast(IntegerType()))

    # Request the configuration JSON
    base_url = "http://80f56133-default-platforms-88e5-2108999998.us-east-1.elb.amazonaws.com/platform_settings/get_configuration"
    headers = {
        "Content-Type": "application/json",
        "Transaction-Id": "test",
        "User-Id": "test",
        "owner": "test"
    }
    # Make the request using urllib
    req = request.Request(base_url, headers=headers)
    response = request.urlopen(req).read()
    response = json.loads(response)
    req_config = response["data"][0]

    # Extract the attributes and store them in variables
    for item in req_config["configPastFutureDays"]:
        if item['type'] == 'LP':
            lp_futuro = item['future']
            lp_pasado = item['past']
        else:
            re_futuro = item['future']
            re_pasado = item['past']
    ceros = req_config["withNullsOrCero"]["withCeros"]
    lista_apagon = req_config["QualityCode"]["PowerOutage"]

    def check_empty(df):
        return len(df.head(1)) > 0

    # Dictionary and function for converting UTC to readingLocalTime; this is used in step 5
    utc_dicc = {
        '0': -12, '1': -11, '2': -10, '3': -9, '4': -8, '5': -7, '6': -7,
        '7': -6, '8': -6, '9': -6, '10': -5, '11': -5, '12': -5, '13': -4,
        '14': -4, '15': -3.5, '16': -3, '17': -3, '18': -2, '19': -1,
        '20': 0, '21': 0, '22': 1, '23': 1, '24': 1, '25': 2, '26': 2,
        '27': 2, '28': 2, '29': 2, '30': 3, '31': 3, '32': 3.5, '33': 4,
        '34': 4.5, '35': 5, '36': 5.5, '37': 6, '38': 7, '39': 8, '40': 8,
        '41': 9, '42': 9.5, '43': 9.5, '44': 10, '45': 10, '46': 10,
        '47': 11, '48': 12, '49': 12, '50': -8, '51': -7, '52': -6,
        '53': -6, '54': -6, '55': -6, '56': 10
    }

    def conversor_utc_local(row):
        if not (row.readingLocalTime):
            fecha = datetime.strptime(row.readingUtcLocalTime, '%Y-%m-%d %H:%M:%S')
            fecha = fecha + timedelta(hours=utc_dicc[row.servicePointTimeZone])
            return fecha.strftime('%Y-%m-%d %H:%M:%S')
        else:
            return row.readingLocalTime

    # PART 1: FLAGS
    # Flag-based discarding: if any flag is false, the record is discarded (it is stored in another
    # dataframe that will be used to create the logs)
    print(df_union.head(1))
    print(len(df_union.head(1)))
    print(len(df_union.head(1)) > 0)
    if check_empty(df_union):
        # First check the association-state flag; if it is false, all records with that condition
        # are split off from the original dataframe into the logs dataframe
        df_logs = df_union.filter(df_union.result_rD == False).withColumn(
            "Descripcion_log", lit("Estado de asociacion"))
        df_logs.show()
        df_logs.printSchema()
        df_union = df_union.filter(df_union.result_rD == True)

        # Now split off the records that are false for the variable-state flag or the save flag
        # and concatenate them to the logs df
        df_logs_aux = df_union.filter((df_union.result_sPV == False) | (df_union.toStock == False))\
            .withColumn("Descripcion_log", lit("result variable - to stock "))
        df_logs = df_logs.union(df_logs_aux).coalesce(1)
        df_union = df_union.filter((df_union.result_sPV == True) & (df_union.toStock == True))
        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso1_banderas", header="true", emptyValue="")

    # PART 2: PAST/FUTURE DAYS
    # Based on the request response (req_config) we have to find the readings for the relevant days
    if check_empty(df_union):
        # For records where readingUtcLocalTime is null no validation is possible, so we
        # discard them from the original df and add them to the logs
        df_union = df_union.filter(df_union.readingUtcLocalTime.isNotNull())

        # Create the function to apply to each row of the df
        hoy = datetime.today()
        date_format = '%Y-%m-%d'

        def pasado_futuro(row):
            last_date = datetime.strptime(row.lastReadDate[:10], date_format)
            asoc = datetime.strptime(row.relationStartDate[:10], date_format)
            reading_time = datetime.strptime(row.readingUtcLocalTime[:10], date_format)
            if (asoc - last_date).days > 0:
                last_date = asoc
            if row.readingType == "LOAD PROFILE READING":
                return (last_date - timedelta(days=lp_pasado)) <= reading_time <= (hoy + timedelta(days=lp_futuro))
            else:
                return (last_date - timedelta(days=re_pasado)) <= reading_time <= (hoy + timedelta(days=re_futuro))

        # Apply the function to the dataframe
        udf_object = udf(pasado_futuro, BooleanType())
        df_union = df_union.withColumn(
            "lastReadDate_result",
            udf_object(struct([df_union[x] for x in df_union.columns])))

        # Now split off the records that are false in lastReadDate_result
        # and concatenate them to the logs df
        df_logs_aux = df_union.filter(df_union.lastReadDate_result == False)\
            .withColumn("Descripcion_log", lit("dias pasado-futuro"))\
            .drop("lastReadDate_result")
        df_logs = df_logs.union(df_logs_aux).coalesce(1)
        df_union = df_union.filter(df_union.lastReadDate_result == True)
        df_union = df_union.drop("lastReadDate_result")
        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso2_pasadoFuturo", header="true", emptyValue="")

    # PART 3: DUPLICATES AND INCONSISTENCIES
    # Fully repeated readings: discard all but one (step 1). But if there are readings with the same
    # key set (servicePointId, deviceId, meteringType, variableId, readingLocalTime, logNumber) yet a
    # different readingsValue, then all of those rows are discarded for data inconsistency (step 2).
if check_empty(df_union): # paso 1 (DUPLICADOS) #union_unique = union.dropDuplicates(['servicePointId','deviceId','meteringType','variableId','readingLocalTime','logNumber','readingsValue']) #df_logs_aux = union.subtract(union_unique).withColumn("Descripcion_log",lit("Duplicados")) ### Get Duplicate rows in pyspark df_logs_aux = df_union.groupBy('servicePointId', 'deviceId', 'meteringType', 'variableId', 'readingLocalTime', 'logNumber', 'readingsValue').count() df_logs_aux = df_logs_aux.filter(df_logs_aux["count"] > 1) df_logs_aux = df_logs_aux.select( col("servicePointId").alias("servicePointId_2"), col("deviceId").alias("deviceId_2"), col("meteringType").alias("meteringType_2"), col("variableId").alias("variableId_2"), col("readingLocalTime").alias("readingLocalTime_2"), col("logNumber").alias("logNumber_2"), col("readingsValue").alias("readingsValue_2")) df_logs_aux = df_logs_aux.withColumn("Descripcion_log", lit("Duplicados")) df_logs_aux = df_union.join( df_logs_aux, [(df_union.servicePointId == df_logs_aux.servicePointId_2), (df_union.deviceId == df_logs_aux.deviceId_2), (df_union.meteringType == df_logs_aux.meteringType_2), (df_union.variableId == df_logs_aux.variableId_2), (df_union.readingLocalTime == df_logs_aux.readingLocalTime_2), (df_union.logNumber.eqNullSafe(df_logs_aux.logNumber_2)), (df_union.readingsValue == df_logs_aux.readingsValue_2)], how='inner').coalesce(1) df_logs_aux = df_logs_aux.dropDuplicates([ 'servicePointId', 'deviceId', 'meteringType', 'variableId', 'readingLocalTime', 'logNumber', 'readingsValue' ]) columns_to_drop = [ "servicePointId_2", "deviceId_2", "meteringType_2", "variableId_2", "readingLocalTime_2", "logNumber_2", "readingsValue_2" ] df_logs_aux = df_logs_aux.drop(*columns_to_drop) df_union = df_union.dropDuplicates([ 'servicePointId', 'deviceId', 'meteringType', 'variableId', 'readingLocalTime', 'logNumber', 'readingsValue' ]) df_logs = df_logs.union(df_logs_aux).coalesce(1) # paso 2 (INCONSISTENCIAS) # funciona con el left anti join union_unique = df_union.dropDuplicates([ 'servicePointId', 'deviceId', 'meteringType', 'variableId', 'readingLocalTime', 'logNumber' ]) df_duplicates = df_union.subtract(union_unique).select( 'servicePointId', 'deviceId', 'meteringType', 'variableId', 'readingLocalTime', 'logNumber') df_final = union_unique.join( df_duplicates, [(union_unique.servicePointId == df_duplicates.servicePointId), (union_unique.deviceId == df_duplicates.deviceId), (union_unique.meteringType == df_duplicates.meteringType), (union_unique.variableId == df_duplicates.variableId), (union_unique.readingLocalTime == df_duplicates.readingLocalTime), (union_unique.logNumber.eqNullSafe(df_duplicates.logNumber))], how='left_anti').coalesce(1) # con esta linea salvamos la union entre nulos de pyspark, (union_unique.logNumber.eqNullSafe(df_duplicates.logNumber)) df_logs_aux = df_union.subtract(union_unique).withColumn( "Descripcion_log", lit("Valores inconsitentes")).coalesce(1) df_logs = df_logs.union(df_logs_aux).coalesce(1) df_union = df_final df_union.write.format('csv').mode("overwrite").save( "./output/cleaned/paso3_duplicados", header="true", emptyValue="") # PARTE 4: VERSIONADO # creo las columnas de editado y estimado con valores vacios df_union = df_union.withColumn("estimationReading", lit(""))\ .withColumn("estimationValid", lit(""))\ .withColumn("editionReading", lit(""))\ .withColumn("editionValid", lit("")) # verificar si el df tiene datos de REGISTERS Y LOAD PROFILE READING if check_empty( df_union.filter((df_union.readingType == 
"REGISTERS") | ( df_union.readingType == "LOAD PROFILE READING"))): df_nuevo = df_union.filter((df_union.readingType == "REGISTERS") | ( df_union.readingType == "LOAD PROFILE READING")) min_ts = df_nuevo.agg(min('readingUtcLocalTime')).collect()[0][0] # pongo todos los valores de version en 1, como si fueran nuevos, despues se van modificando df_nuevo = df_nuevo.withColumn("version", lit(1)) # este es el df que esta presente en trustdata (base de datos), hacer una consulta que traiga datos con readinType REGISTERS O # LOAD PROFILE READING y ademas que tenga una fecha comprendida entre max_ts y min_ts df_orig = spark.read.csv( "../tests/Examples/original_modificado_glue.csv", header=True, mode="DROPMALFORMED") df_orig = df_orig.filter(df_orig.readingUtcLocalTime >= min_ts) df_orig.write.format('csv').mode("overwrite").save( "./output/cleaned/trustdata_para_versionado", header="true", emptyValue="") # ATHENA # checkeo que el df en la base no este vacio para poder hacer los cruces correctamente, caso que sea vacio, automaticamente # se rellena el campo version presentes el el df procesado con 1 if len(df_orig.head(1)) > 0: df_orig = df_orig.filter((df_orig.readingType == "REGISTERS") | ( df_orig.readingType == "LOAD PROFILE READING")) # obtener la info de la ultima version de cada lectura unica df_orig = df_orig.withColumn( "version", df_orig["version"].cast(IntegerType())) df_maxs = df_orig.groupby([ 'servicePointId', "variableId", "logNumber", "readingUtcLocalTime", "meteringType", "readingsSource", "owner" ]).agg(max("version")).drop('version').withColumnRenamed( 'max(version)', 'version').coalesce(1) df_orig = df_orig.withColumnRenamed('servicePointId', 'servicePointId_drop')\ .withColumnRenamed('variableId', 'variableId_drop')\ .withColumnRenamed('logNumber', 'logNumber_drop')\ .withColumnRenamed('readingUtcLocalTime', 'readingUtcLocalTime_drop')\ .withColumnRenamed('meteringType', 'meteringType_drop')\ .withColumnRenamed('version', 'version_drop')\ .withColumnRenamed('readingsSource', 'readingsSource_drop')\ .withColumnRenamed('owner', 'owner_drop')\ # traer los datos de la ultima version de cada lectura unica df_orig = df_maxs.join( df_orig, [(df_maxs.servicePointId == df_orig.servicePointId_drop), (df_maxs.variableId == df_orig.variableId_drop), (df_maxs.readingsSource == df_orig.readingsSource_drop), (df_maxs.owner == df_orig.owner_drop), (df_maxs.meteringType == df_orig.meteringType_drop), (df_maxs.readingUtcLocalTime == df_orig.readingUtcLocalTime_drop), (df_maxs.logNumber.eqNullSafe(df_orig.logNumber_drop)), (df_maxs.version == df_orig.version_drop)], how='inner').coalesce(1) columns_to_drop = [ item for item in df_orig.columns if "_drop" in item ] df_orig = df_orig.drop(*columns_to_drop) # seleccionar los valores de interes en la base de trustdata df_orig = df_orig.select( col("servicePointId").alias("servicePointId_orig"), col("variableId").alias("variableId_orig"), col("owner").alias("owner_orig"), col("logNumber").alias("logNumber_orig"), col("meteringType").alias("meteringType_orig"), col("readingUtcLocalTime").alias("readingUtcLocalTime_orig"), col("readingsSource").alias("readingsSource_orig"), col("readingsValue").alias("readingsValue_orig"), col("version").alias("version_orig"), col("estimationReading").alias("estimationReading_orig"), col("estimationValid").alias("estimationValid_orig"), col("editionReading").alias("editionReading_orig"), col("editionValid").alias("editionValid_orig")) # left join entre el df en proceso y el df en la base de datos df_final = 
df_nuevo.join( df_orig, [(df_nuevo.servicePointId == df_orig.servicePointId_orig), (df_nuevo.variableId == df_orig.variableId_orig), (df_nuevo.readingsSource == df_orig.readingsSource_orig), (df_nuevo.owner == df_orig.owner_orig), (df_nuevo.meteringType == df_orig.meteringType_orig), (df_nuevo.readingUtcLocalTime == df_orig.readingUtcLocalTime_orig), (df_nuevo.logNumber.eqNullSafe(df_orig.logNumber_orig))], how='left_outer').coalesce(1) def versionamiento(row): if row.readingsValue_orig is None: return 1 else: if row.readingsValue_orig == row.readingsValue: return None else: return row.version_orig + 1 udf_object = udf(versionamiento, IntegerType()) df_final = df_final.withColumn( "version_ref", udf_object(struct([df_final[x] for x in df_final.columns]))) # logs y df resultante en base a los valores devueltos por la funcion de versionamiento df_logs_aux = df_final.filter( df_final.version_ref.isNull()).withColumn( "Descripcion_log", lit("Versionamiento duplicados")) df_final = df_final.filter(df_final.version_ref.isNotNull()) df_final = df_final.drop("version").withColumnRenamed( "version_ref", 'version') df_final = df_final.drop("estimationReading").withColumnRenamed( "estimationReading_orig", 'estimationReading') df_final = df_final.drop("estimationValid").withColumnRenamed( "estimationValid_orig", 'estimationValid') df_final = df_final.drop("editionReading").withColumnRenamed( "editionReading_orig", 'editionReading') df_final = df_final.drop("editionValid").withColumnRenamed( "editionValid_orig", 'editionValid') # mantengo los valores de estimado y editado de la version anterior else: df_final = df_nuevo def usage_reading(row): string = ":".join(row.multiplier.split(":")[1:]) string = string.replace("'", "\"") dicti = json.loads(string) val = float(row.readingsValue) for value in dicti.values(): if value: val = val * float(value) return val udf_object = udf(usage_reading, StringType()) df_final = df_final.withColumn( "usageReading", udf_object(struct([df_final[x] for x in df_final.columns]))) lista_columnas = [ "servicePointId", "readingType", "variableId", "deviceId", "meteringType", "readingUtcLocalTime", "readingDateSource", "readingLocalTime", "dstStatus", "channel", "unitOfMeasure", "qualityCodesSystemId", "qualityCodesCategorization", "qualityCodesIndex", "intervalSize", "logNumber", "version", "readingsValue", "primarySource", "readingsSource", "owner", "guidFile", "estatus", "registersNumber", "eventsCode", "agentId", "agentDescription", "multiplier", "deviceMaster", "deviceDescription", "deviceStatus", "serial", "accountNumber", "servicePointTimeZone", "connectionType", "relationStartDate", "relationEndDate", "deviceType", "brand", "model", "idVdi", "identReading", "idenEvent", "idDateYmd", "idDateYw", 'dateCreation', "usageReading", "estimationReading", "estimationValid", "editionReading", "editionValid" ] df_final = df_final.select(*lista_columnas) # union con los EVENTS df_union = df_union.filter((df_union.readingType == "EVENTS")) df_union = df_union.withColumn("usageReading", lit('')).select(*lista_columnas) df_union = df_union.union(df_final).coalesce(1) # logs columnas_log = df_logs.columns df_logs_aux = df_logs_aux.select(*columnas_log) df_logs = df_logs.union(df_logs_aux).coalesce(1) df_union.write.format('csv').mode("overwrite").save( "./output/cleaned/paso4_versionado", header="true", emptyValue="") # PARTE 5: RELLENO DE CEROS Y NULOS if check_empty( df_union.filter(df_union.readingType == "LOAD PROFILE READING")): # Se trabaja solo para los datos de LOAD PROFILE 
READING df_load_profile = df_union.filter( df_union.readingType == "LOAD PROFILE READING") # Crear columna quality_flag df_load_profile = df_load_profile.withColumn( "quality_flag", concat(col("qualityCodesSystemId"), lit("-"), col("qualityCodesCategorization"), lit("-"), col("qualityCodesIndex"))) variables_id = df_load_profile.select( "variableId").distinct().collect() variables_id = [row.variableId for row in variables_id] # definimos algunas funciones def elem_ranges(all_elem, elem): """ Funcion que devuelve una lista con listas incluyendo los rangos de tiempos faltantes. Input: all_elem (string): todos los timestamps separados por coma en un string elem (list): lista de los timestamp presentes en nuestro df a enriquecer """ aux_list = [] for i in elem[1:]: include, ommit = all_elem.split(i) aux_list.append(include[:-1]) all_elem = i + ommit aux_list.append(elem[-1]) return aux_list # funcion para rellenar el campo readingsValue con nulos o ceros dependiendo de si la bandera es de apagon. def func_apagon(row): if (row.quality_flag in lista_apagon) and (not row.referencia): return relleno else: return row.readingsValue_ref # creo funcion que va a rellenar en otra columna (a la que vamos a llamar validacion_intervalos) con nulo cuando el valor del # campo quality_flag no sea bandera de apagon, y un 1 cuando la bandera indique apagon # esta nueva columna nos va a permitir identificar si hay que contar o no ese registro a la hora de hacer la validacion de # intervalos def func_apagon_intervalos(row): if (row.quality_flag in lista_apagon) or (row.referencia): return "1" else: return None # aca empieza el ciclo for por cada elemento de la variable variable_Id for iteracion, variable in enumerate(variables_id): df_lp = df_load_profile.filter( df_load_profile.variableId == variable) # obtengo los limites de los tiempos y los paso a timestamp max_ts = datetime.strptime( (df_lp.agg(max('readingUtcLocalTime')).collect()[0][0]), "%Y-%m-%d %H:%M:%S") min_ts = datetime.strptime( (df_lp.agg(min('readingUtcLocalTime')).collect()[0][0]), "%Y-%m-%d %H:%M:%S") delta = max_ts - min_ts # interval indica el intervalo en minutos interval = df_lp.select("intervalSize").first()[0] mins = delta.seconds // 60 lista = [min_ts] for rep in range(1, mins // interval): lista.append(min_ts + timedelta(minutes=rep * interval)) if min_ts != max_ts: lista.append(max_ts) all_elem = [fecha.strftime('%Y-%m-%d %H:%M:%S') for fecha in lista] all_elem = ",".join(all_elem) elem = df_lp.select("readingUtcLocalTime").orderBy( "readingUtcLocalTime", ascending=True).collect() elem = [row.readingUtcLocalTime for row in elem] # genero lista que va a formar parte del df to_df = elem_ranges(all_elem, elem) to_df = [ls.split(",") for ls in to_df] # genero lista para el join, los valores son el primer elemento de cada lista de to_df to_df_join = [item[0] for item in to_df] # creo el df con las listas generadas data = zip(to_df_join, to_df) schema = StructType([ StructField('readingUtcLocalTime_2', StringType(), True), StructField('timestamps_arrays', ArrayType(StringType()), True) ]) rdd = spark.sparkContext.parallelize(data) df_timestamps = spark.createDataFrame(rdd, schema).coalesce(1) # hago un join del df timestamp explodeado con una porcion del df original para obtener un nuevo df que tenga los valores de value y las # quality flags como estan originalmente en las fechas presentes en el df original y si no existen tendran nulo. 
df_portion = df_lp.select(col("readingUtcLocalTime").alias("readingUtcLocalTime_ref") ,col("readingLocalTime").alias("readingLocalTime_ref") ,col("readingDateSource").alias("readingDateSource_ref") ,col("readingsValue").alias("readingsValue_ref") ,col("qualityCodesSystemId").alias("qualityCodesSystemId_ref") ,col("qualityCodesCategorization").alias("qualityCodesCategorization_ref") ,col("qualityCodesIndex").alias("qualityCodesIndex_ref") )\ .withColumn("referencia",lit("Original")) df_timestamps_comp = df_timestamps.withColumn( "complete_interval_ref", explode("timestamps_arrays")).select("complete_interval_ref") df_reference = df_timestamps_comp.join( df_portion, [(df_timestamps_comp.complete_interval_ref == df_portion.readingUtcLocalTime_ref)], how='left').coalesce(1) # hago un join del df original con el df timestamp para obtener los datos de los timestamp faltantes en el # df original df_lp = df_lp.join(df_timestamps, [(df_lp.readingUtcLocalTime == df_timestamps.readingUtcLocalTime_2)], how='inner').coalesce(1) df_lp = df_lp.drop("readingUtcLocalTime_2") # explode de los arrays de la columna df_lp = df_lp.withColumn("complete_interval", explode("timestamps_arrays")) df_lp = df_lp.drop("timestamps_arrays") # hago join del df original con el df de referencia, para asi tener los valores consistentes de los campos valor y quality flags df_lp = df_lp.join(df_reference, [ (df_lp.complete_interval == df_reference.complete_interval_ref) ], how='inner').coalesce(1) # hago el rellenado con nulos y ceros # veo si rellanar con ceros o con nulos if ceros: relleno = "0" else: relleno = None # Create your UDF object (which accepts func_apagon python function) udf_object = udf(func_apagon, StringType()) df_lp = df_lp.withColumn( "readingsValue_ref", udf_object(struct([df_lp[x] for x in df_lp.columns]))) # Create your UDF object (which accepts func_apagon_intervalos python function) udf_object = udf(func_apagon_intervalos, StringType()) df_lp = df_lp.withColumn( "validacion_intervalos", udf_object(struct([df_lp[x] for x in df_lp.columns]))) # Elimino algunas columnas y renombro otras para mantener el formato de salida esperado columns_to_drop = [ 'readingUtcLocalTime_ref', 'complete_interval_ref', 'readingUtcLocalTime', 'readingDateSource', 'readingLocalTime', "qualityCodesSystemId", "qualityCodesCategorization", 'qualityCodesIndex', "readingsValue", "referencia", "quality_flag" ] df_lp = df_lp.drop(*columns_to_drop) if iteracion == 0: df_load_profile_final = df_lp else: df_load_profile_final = df_load_profile.union(df_lp).coalesce( 1) df_load_profile = df_load_profile_final.withColumnRenamed( "complete_interval", "readingUtcLocalTime").withColumnRenamed( "readingLocalTime_ref", "readingLocalTime").withColumnRenamed( "readingDateSource_ref", "readingDateSource").withColumnRenamed( "qualityCodesSystemId_ref", "qualityCodesSystemId").withColumnRenamed( "qualityCodesCategorization_ref", "qualityCodesCategorization").withColumnRenamed( "qualityCodesIndex_ref", "qualityCodesIndex").withColumnRenamed( "readingsValue_ref", "readingsValue") # Aplico la funcion de transformacion de fechas al dataframe, para obtener todos los valores de readingLocalTime # y readingDateSource udf_object = udf(conversor_utc_local, StringType()) df_load_profile = df_load_profile.withColumn("readingLocalTime", udf_object(struct([df_load_profile[x] for x in df_load_profile.columns])))\ .withColumn("readingDateSource", col("readingLocalTime")) # selecciona las columnas en el orden que van y ademas hacer el union con el df original 
filtrado por # los otros tipos de lecturas df_registers_events = df_union.filter( (df_union.readingType == "REGISTERS") | (df_union.readingType == "EVENTS")) df_registers_events = df_registers_events.withColumn( "validacion_intervalos", lit("")) lista_columnas = [ "servicePointId", "readingType", "variableId", "deviceId", "meteringType", "readingUtcLocalTime", "readingDateSource", "readingLocalTime", "dstStatus", "channel", "unitOfMeasure", "qualityCodesSystemId", "qualityCodesCategorization", "qualityCodesIndex", "intervalSize", "logNumber", "version", "readingsValue", "primarySource", "readingsSource", "owner", "guidFile", "estatus", "registersNumber", "eventsCode", "agentId", "agentDescription", "multiplier", "deviceMaster", "deviceDescription", "deviceStatus", "serial", "accountNumber", "servicePointTimeZone", "connectionType", "relationStartDate", "relationEndDate", "deviceType", "brand", "model", "idVdi", "identReading", "idenEvent", "idDateYmd", "idDateYw", 'dateCreation', "validacion_intervalos", "usageReading", "estimationReading", "estimationValid", "editionReading", "editionValid" ] df_load_profile = df_load_profile.select(*lista_columnas) df_registers_events = df_registers_events.select(*lista_columnas) df_union = df_load_profile.union(df_registers_events).coalesce(1) df_union.write.format('csv').mode("overwrite").save( "./output/cleaned/paso5_rellenoNulosyCeros", header="true", emptyValue="") # PARTE 6: VALIDACION DE INTERVALOS if check_empty( df_union.filter(df_union.readingType == "LOAD PROFILE READING")): df_union = df_union.withColumn( "intervalSize", df_union["intervalSize"].cast(IntegerType())) df_loadprofile = df_union.filter( df_union.readingType == "LOAD PROFILE READING") variables_id = df_loadprofile.select("variableId").distinct().collect() variables_id = [row.variableId for row in variables_id] # aca empieza el ciclo for por cada elemento de la variable variable_Id for iteracion, variable in enumerate(variables_id): df_lp = df_loadprofile.filter( df_loadprofile.variableId == variable) # obtengo los limites de los tiempos y los paso a timestamp max_ts = datetime.strptime( (df_lp.agg(max('readingUtcLocalTime')).collect()[0][0]), "%Y-%m-%d %H:%M:%S") min_ts = datetime.strptime( (df_lp.agg(min('readingUtcLocalTime')).collect()[0][0]), "%Y-%m-%d %H:%M:%S") delta = max_ts - min_ts # interval indica el intervalo en minutos interval = df_lp.select("intervalSize").first()[0] mins = delta.seconds // 60 cant_lecturas = (mins // interval) + 1 if df_lp.filter(df_lp.validacion_intervalos.isNotNull()).count( ) == cant_lecturas: df_lp = df_lp.withColumn("usageValid", lit(True)).withColumn( "ValidationDetail", lit("")).withColumn("IsGrouped", lit(False)) else: df_lp_success = df_lp.filter( df_lp.validacion_intervalos.isNotNull()) df_lp_success = df_lp_success.withColumn( "usageValid", lit(True)).withColumn( "ValidationDetail", lit("")).withColumn("IsGrouped", lit(False)) df_lp_failure = df_lp.filter( df_lp.validacion_intervalos.isNull()) df_lp_failure = df_lp_failure.withColumn("usageValid",lit(False))\ .withColumn("ValidationDetail",lit("{'Usage':{'IntervalsError':'Interval not exist'}}"))\ .withColumn("IsGrouped",lit(False)) df_lp = df_lp_success.union(df_lp_failure).coalesce(1) df_lp = df_lp.orderBy("readingUtcLocalTime", ascending=True) if iteracion == 0: df_loadprofile_final = df_lp else: df_loadprofile_final = df_loadprofile_final.union( df_lp).coalesce(1) df_loadprofile = df_loadprofile_final # selecciona las columnas en el orden que van y ademas hacer el union con el df 
original filtrado por # los otros tipos de lecturas df_registers_events = df_union.filter( (df_union.readingType == "REGISTERS") | (df_union.readingType == "EVENTS")) df_registers_events = df_registers_events.withColumn("usageValid",lit(None))\ .withColumn("ValidationDetail",lit(""))\ .withColumn("IsGrouped",lit(None)) lista_columnas = [ "servicePointId", "readingType", "variableId", "deviceId", "meteringType", "readingUtcLocalTime", "readingDateSource", "readingLocalTime", "dstStatus", "channel", "unitOfMeasure", "qualityCodesSystemId", "qualityCodesCategorization", "qualityCodesIndex", "intervalSize", "logNumber", "version", "readingsValue", "primarySource", "readingsSource", "owner", "guidFile", "estatus", "registersNumber", "eventsCode", "agentId", "agentDescription", "multiplier", "deviceMaster", "deviceDescription", "deviceStatus", "serial", "accountNumber", "servicePointTimeZone", "connectionType", "relationStartDate", "relationEndDate", "deviceType", "brand", "model", "idVdi", "identReading", "idenEvent", "idDateYmd", "idDateYw", 'dateCreation', "usageReading", "usageValid", "ValidationDetail", "IsGrouped", "estimationReading", "estimationValid", "editionReading", "editionValid" ] df_loadprofile = df_loadprofile.select(*lista_columnas) df_registers_events = df_registers_events.select(*lista_columnas) df_union = df_loadprofile.union(df_registers_events).coalesce(1) df_union.write.format('csv').mode("overwrite").save( "./output/cleaned/paso6_validacionIntervalos", header="true", emptyValue="") # acomodo las columnas del df antes de escribirlo, y creo las columnas faltantes con valor vacio lista_columnas = [ "servicePointId", "readingType", "variableId", "deviceId", "meteringType", "readingUtcLocalTime", "readingDateSource", "readingLocalTime", "dstStatus", "channel", "unitOfMeasure", "qualityCodesSystemId", "qualityCodesCategorization", "qualityCodesIndex", "intervalSize", "logNumber", "version", "readingsValue", "primarySource", "guidFile", "estatus", "registersNumber", "eventsCode", "agentId", "agentDescription", "multiplier", "deviceMaster", "deviceDescription", "deviceStatus", "serial", "accountNumber", "servicePointTimeZone", "connectionType", "relationStartDate", "relationEndDate", "deviceType", "brand", "model", "idVdi", "identReading", "idenEvent", "idDateYmd", 'dateCreation', "usageReading", "usageValid", "ValidationDetail", "IsGrouped", "estimationReading", "estimationValid", "editionReading", "editionValid", "owner", "readingsSource", "idDateYw" ] cols_df = set(df_union.columns) for columna in set(lista_columnas).difference(cols_df): df_union = df_union.withColumn(columna, lit('')) df_union = df_union.select(*lista_columnas) # bloque obtencion de anio y semana para nomenclatura del archivo fecha = date.today() anio, sem, _ = fecha.isocalendar() anio = str(anio) sem = str(sem) # escribir csv df_union.write.format('csv').mode("overwrite").save( "./output/cleaned/final/" + anio + "/" + sem, header="true", emptyValue="") df_logs.write.format('csv').mode("overwrite").save( "./output/cleaned/removed_rows", header="true", emptyValue="") return df_union
    ).select("movie1", "movie2", "score", "numPairs")

    return result

# Get movie name by given movie id
def getMovieName(movieNames, movieId):
    result = movieNames.filter(func.col("movieID") == movieId) \
        .select("movieTitle").collect()[0]
    return result[0]

spark = SparkSession.builder.appName("MovieSimilarities").master("local[*]").getOrCreate()

movieNamesSchema = StructType([
    StructField("movieID", IntegerType(), True),
    StructField("movieTitle", StringType(), True)
])

moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)])

# Create a broadcast dataset of movieID and movieTitle.
# Apply the ISO-8859-1 charset
movieNames = spark.read \
    .option("sep", "|") \
    .option("charset", "ISO-8859-1") \
    .schema(movieNamesSchema) \
def make_df_solid(context):
    schema = StructType([StructField('name', StringType()),
                         StructField('age', IntegerType())])
    rows = [Row(name='John', age=19),
            Row(name='Jennifer', age=29),
            Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StructType, FloatType, StringType, BooleanType, ArrayType, DateType

SPARK_HOST = "spark://spark:7077"
KAFKA_HOST = "kafka:19092"

spark = SparkSession.builder.master(SPARK_HOST).appName("customer-deposits").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

BankDepositsSchema = StructType([
    StructField("accountNumber", StringType()),
    StructField("amount", FloatType()),
    StructField("dateAndTime", StringType())
])

BankCustomersSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType()),
    StructField("accountNumber", StringType()),
    StructField("location", StringType()),
])

bankDepositsRawStreamingDF = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_HOST) \
    .option("subscribe", "bank-deposits") \
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
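# A small hedged usage sketch of from_arrow_type, not part of the original snippet.
# The example types are illustrative; it assumes pyarrow is installed and the Spark type
# classes used above are in scope.
import pyarrow as pa

print(from_arrow_type(pa.int32()))             # Spark IntegerType
print(from_arrow_type(pa.list_(pa.string())))  # Spark ArrayType of StringType
print(from_arrow_type(pa.struct([("id", pa.int64()),
                                 ("name", pa.string())])))
# -> StructType with fields id: LongType, name: StringType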
def top_dataframe(aggregated_df):
    return aggregated_df.sort('Carrera').sort('Promedio', ascending=False)

if __name__ == '__main__':
    import sys

    spark = SparkSession.builder.appName('Mejores Promedios').getOrCreate()

    # Students file
    estudiante_schema = StructType([
        StructField('Carnet', IntegerType()),
        StructField('Nombre', StringType()),
        StructField('Carrera', StringType())
    ])
    estudiante_df = spark.read.csv(sys.argv[1], schema=estudiante_schema, header=False)
    estudiante_df.show()

    # Courses file
    curso_schema = StructType([
        StructField('Curso', IntegerType()),
        StructField('Creditos', IntegerType()),
        StructField('Carrera', StringType())
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", args.access_key)
if args.secret_key is not None:
    sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", args.secret_key)
spark = SparkSession(sc)

reviewsSchema = \
    StructType() \
    .add("marketplace", "string") \
    .add("customer_id", "integer") \
    .add("review_id", "string") \
    .add("product_id", "string") \
    .add("product_title", "string") \
    .add("product_category", "string") \
    .add("star_rating", "integer") \
    .add("helpful_votes", "integer") \
    .add("total_votes", "integer") \
    .add("vine", "boolean") \
    .add("verified_purchase", "boolean") \
    .add("review_headline", "string") \
    .add("review_body", "string") \
    .add("review_date", "date")

df = spark.read \
    .format('csv') \
    .option('sep', '\t') \
    .option('header', 'true') \
    .schema(reviewsSchema) \
    .load(args.source)
        ts_col.append(t)
    return window_col, ts_col

def combine_data(window_col):
    feature_matrix = np.zeros((0, 11))
    for i, item in enumerate(window_col):
        feature = rr_feature_computation(item[:, 0], item[:, 1])
        feature_matrix = np.concatenate(
            (feature_matrix, np.array(feature).reshape(-1, 11)))
    return feature_matrix

schema = StructType([
    StructField("user", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("rr_feature", ArrayType(FloatType())),
])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def rr_interval_feature_extraction(data: object) -> object:
    winsor_limit = 0.1  # FIXME - this must be passed or configurable
    mean = data['rr_interval'].mean()
    std = data['rr_interval'].std()
    data['rr_interval'] = (data['rr_interval'] - mean) / std
    window_col, ts_col = get_windows(data)
    X = combine_data(window_col)
def vectorizeBi(row, dico):
    vector_dict = {}
    for w in row.bigrams:
        if w in dico:
            vector_dict[dico[w]] = 1
    return (row.label, SparseVector(len(dico), vector_dict))

# In[321]:

from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([
    StructField('label', DoubleType(), True),
    StructField('bigramVectors', VectorUDT(), True)
])

# In[322]:

from functools import partial

print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features = dfTrain.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
features.take(1)
tt = time() - t0
print "Done in {} seconds".format(round(tt, 3))

# In[323]:
def get_hrv_features(rr_data, acceptable_percentage=50, window_length=60): """ Args: rr_data (DataStream): acceptable_percentage (int): window_length (int): Returns: """ stream_name = 'org.md2k.autosense.ecg.features' def get_metadata(): stream_metadata = Metadata() stream_metadata.set_name(stream_name).set_description("HRV Features from ECG RR interval") \ .add_input_stream(rr_data.metadata.get_name()) \ .add_dataDescriptor( DataDescriptor() .set_name("var") .set_type("double") .set_attribute("description","variance")) \ .add_dataDescriptor( DataDescriptor() .set_name("iqr") .set_type("double") .set_attribute("description","Inter Quartile Range")) \ .add_dataDescriptor( DataDescriptor() .set_name("mean") .set_type("double") .set_attribute("description","Mean RR Interval")) \ .add_dataDescriptor( DataDescriptor() .set_name("median") .set_type("double") .set_attribute("description","Median RR Interval")) \ .add_dataDescriptor( DataDescriptor() .set_name("80th") .set_type("double") .set_attribute("description","80th percentile RR Interval")) \ .add_dataDescriptor( DataDescriptor() .set_name("20th") .set_type("double") .set_attribute("description","20th percentile RR Interval")) \ .add_dataDescriptor( DataDescriptor() .set_name("heartrate") .set_type("double") .set_attribute("description","Heart Rate in BPM")) \ .add_dataDescriptor( DataDescriptor() .set_name("vlf") .set_type("double") .set_attribute("description","Very Low Frequency Energy")) \ .add_dataDescriptor( DataDescriptor() .set_name("lf") .set_type("double") .set_attribute("description","Low Frequency Energy")) \ .add_dataDescriptor( DataDescriptor() .set_name("hf") .set_type("double") .set_attribute("description","High Frequency Energy")) \ .add_dataDescriptor( DataDescriptor() .set_name("lfhf") .set_type("double") .set_attribute("description","Low frequency to High Frequency energy ratio")) \ .add_dataDescriptor( DataDescriptor() .set_name("window") .set_type("struct") .set_attribute("description","window start and end time in UTC") .set_attribute('start','start of window') .set_attribute('end','end of window')) \ .add_module( ModuleMetadata().set_name("HRV Features from ECG RR Interval") .set_attribute("url", "http://md2k.org/") .set_attribute('algorithm','ecg feature computation') .set_attribute('unit','ms') .set_author("Md Azim Ullah", "*****@*****.**")) return stream_metadata def get_rr_features(a): return np.array([ np.var(a), iqr(a), np.mean(a), np.median(a), np.percentile(a, 80), np.percentile(a, 20), 60000 / np.median(a) ]) def frequencyDomain(RRints, tmStamps, band_type=None, lf_bw=0.11, hf_bw=0.1, vlf=(0.003, 0.04), lf=(0.04, 0.15), hf=(0.15, 0.4)): """ Args: RRints: tmStamps: band_type: lf_bw: hf_bw: vlf: lf: hf: Returns: """ NNs = RRints tss = tmStamps frequency_range = np.linspace(0.001, 1, 10000) NNs = np.array(NNs) NNs = NNs - np.mean(NNs) result = signal.lombscargle(tss, NNs, frequency_range) #Pwelch w/ zero pad fxx = frequency_range pxx = result if band_type == 'adapted': vlf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and( fxx >= vlf[0], fxx < vlf[1])]))[0][0]] lf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and( fxx >= lf[0], fxx < lf[1])]))[0][0]] hf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and( fxx >= hf[0], fxx < hf[1])]))[0][0]] peak_freqs = (vlf_peak, lf_peak, hf_peak) hf = (peak_freqs[2] - hf_bw / 2, peak_freqs[2] + hf_bw / 2) lf = (peak_freqs[1] - lf_bw / 2, peak_freqs[1] + lf_bw / 2) vlf = (0.003, lf[0]) if lf[0] < 0: print( '***Warning***: Adapted LF band lower bound spills into 
negative frequency range' ) print('Lower thresold of LF band has been set to zero') print('Adjust LF and HF bandwidths accordingly') lf = (0, lf[1]) vlf = (0, 0) elif hf[0] < 0: print( '***Warning***: Adapted HF band lower bound spills into negative frequency range' ) print('Lower thresold of HF band has been set to zero') print('Adjust LF and HF bandwidths accordingly') hf = (0, hf[1]) lf = (0, 0) vlf = (0, 0) df = fxx[1] - fxx[0] vlf_power = np.trapz(pxx[np.logical_and(fxx >= vlf[0], fxx < vlf[1])], dx=df) lf_power = np.trapz(pxx[np.logical_and(fxx >= lf[0], fxx < lf[1])], dx=df) hf_power = np.trapz(pxx[np.logical_and(fxx >= hf[0], fxx < hf[1])], dx=df) totalPower = vlf_power + lf_power + hf_power #Normalize and take log vlf_NU_log = np.log((vlf_power / (totalPower - vlf_power)) + 1) lf_NU_log = np.log((lf_power / (totalPower - vlf_power)) + 1) hf_NU_log = np.log((hf_power / (totalPower - vlf_power)) + 1) lfhfRation_log = np.log((lf_power / hf_power) + 1) freqDomainFeats = { 'VLF_Power': vlf_NU_log, 'LF_Power': lf_NU_log, 'HF_Power': hf_NU_log, 'LF/HF': lfhfRation_log } return freqDomainFeats schema = StructType([ StructField("timestamp", TimestampType()), StructField("start", TimestampType()), StructField("end", TimestampType()), StructField("localtime", TimestampType()), StructField("version", IntegerType()), StructField("user", StringType()), StructField("features", ArrayType(DoubleType())) ]) @pandas_udf(schema, PandasUDFType.GROUPED_MAP) @CC_MProvAgg('org.md2k.autosense.ecg.rr', 'get_hrv_features', stream_name, ['user', 'timestamp'], ['user', 'timestamp']) def ecg_r_peak(key, data): """ Args: key: data: Returns: """ if data.shape[0] >= acceptable_percentage * window_length / 100: data = data.sort_values('time') data['time'] = 1000 * data['time'] a = data['rr'].values features = [ np.double( np.array( list(get_rr_features(a)) + list( frequencyDomain( np.array(a) / 1000, np.cumsum(a) / 1000).values()))) ] data = data[:1] data['features'] = features data['start'] = [key[2]['start']] data['end'] = [key[2]['end']] data = data[[ 'timestamp', 'localtime', 'version', 'user', 'start', 'end', 'features' ]] return data else: return pd.DataFrame([], columns=[ 'timestamp', 'localtime', 'version', 'user', 'features', 'start', 'end' ]) rr_data = rr_data.withColumn('time', F.col('timestamp').cast('double')) ecg_features = rr_data.compute(ecg_r_peak, windowDuration=window_length, startTime='0 seconds') df = ecg_features.select('timestamp', F.struct('start', 'end').alias('window'), 'localtime', 'features', 'user', 'version') df = df.withColumn('var', F.col('features').getItem(0)) df = df.withColumn('iqr', F.col('features').getItem(1)) df = df.withColumn('vlf', F.col('features').getItem(7)) df = df.withColumn('lf', F.col('features').getItem(8)) df = df.withColumn('hf', F.col('features').getItem(9)) df = df.withColumn('lfhf', F.col('features').getItem(10)) df = df.withColumn('mean', F.col('features').getItem(2)) df = df.withColumn('median', F.col('features').getItem(3)) df = df.withColumn('80th', F.col('features').getItem(4)) df = df.withColumn('20th', F.col('features').getItem(5)) ecg_features_final = df.withColumn('heartrate', F.col('features').getItem(6)) ecg_features_final = ecg_features_final.drop('features') ecg_features_final.metadata = get_metadata() feature_names = [ 'var', 'iqr', 'mean', 'median', '80th', '20th', 'heartrate', 'vlf', 'lf', 'hf', 'lfhf' ] stress_features = ecg_features_final.withColumn( 'features', F.array([F.col(i) for i in feature_names])) return stress_features
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer, RegexTokenizer, Word2Vec

spark = SparkSession.builder.appName("process_marc").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

list = []
fields = [StructField("id", StringType(), True),
          StructField("marc", StringType(), True),
          StructField("titles", ArrayType(StringType()), True),
          StructField("subjects", ArrayType(StringType()), True),
          StructField("notes", ArrayType(StringType()), True)
          ]
schema = StructType(fields)

# 10,000
# marc/data/alma_bibs__2017090800_4547647970003811_new_1.xml
# 7
# marc/data/test_small_alma_bibs__2017102418_4593081760003811_new.xml

def array_merge(titles, subjects, notes):
    new_arr = []
    new_arr.extend(titles)
    new_arr.extend(subjects)
    new_arr.extend(notes)
    return new_arr

exclude = set(string.punctuation)
# Local mode vs cluster mode:
# the driver runs in its own Java process (JVM), and so do the executors, each with its own JVM.
# spark-submit --executor-memory 3G --total-executor-cores 2 sampleSpark.py

spark = SparkSession.builder.appName('mytestSparkapp').getOrCreate()

filename = "/Users/sbommireddy/Documents/python-spark-tutorial-master/in/airports.text"

myManualSchema = StructType([
    StructField("AirportId", LongType(), True),
    StructField("AirportName", StringType(), True),
    StructField("City", StringType(), True),
    StructField("Country", StringType(), True),
    StructField("AirportCode", StringType(), True),
    StructField("ICAOCode", StringType(), True),
    StructField("Latitude", FloatType(), False),
    StructField("Longitude", FloatType(), False),
    StructField("Altitude", FloatType(), False),
    StructField("timezoneDST", StringType(), False),
    StructField("unknown", StringType(), False),
    StructField("timeZoneinOlson", StringType(), False)
])

airports = spark.read.format("csv").schema(myManualSchema).load(filename)
# option("inferSchema", "true").option("header","true")
print(airports.rdd.getNumPartitions())

# Each partition is written as a separate CSV file when you save a DataFrame to disk.
# Register the DataFrame as a table or view.
airports.createOrReplaceTempView("airports_all")
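# A hedged sketch of the partition-per-file behaviour mentioned above; the repartition count
# and the /tmp output path are illustrative assumptions, not part of the original snippet.
airports_3 = airports.repartition(3)
print(airports_3.rdd.getNumPartitions())  # 3
airports_3.write.mode("overwrite").option("header", "true").csv("/tmp/airports_out")
# /tmp/airports_out/ now holds three part-* CSV files (plus a _SUCCESS marker).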
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

spark = SparkSession.builder.appName("TotalSpentByCustomer").master("local[*]").getOrCreate()

# Create schema when reading customer-orders
customerOrderSchema = StructType([
    StructField("cust_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("amount_spent", FloatType(), True)
])

# Load up the data into a Spark dataset
customersDF = spark.read.schema(customerOrderSchema).csv("file:///SparkCourse/customer-orders.csv")

totalByCustomer = customersDF.groupBy("cust_id").agg(
    func.round(func.sum("amount_spent"), 2).alias("total_spent"))

totalByCustomerSorted = totalByCustomer.sort("total_spent")

totalByCustomerSorted.show(totalByCustomerSorted.count())

spark.stop()
#destDataDirRoot = "/home/allenk/tmp/"
srcDataDirRoot = "abfs://[email protected]/"
destDataDirRoot = "abfs://[email protected]/"

TripSchemaColList = [
    "year", "month", "day", "city", "vendor", "pickup_time", "pickup_zone",
    "dropoff_time", "dropoff_zone", "trip_time_by_sec", "fare", "tips",
    "total", "tips_rate"
]

TripSchema = StructType([
    StructField("vendor", StringType(), True),
    StructField("pickup_zone", IntegerType(), True),
    StructField("dropoff_zone", IntegerType(), True),
    StructField("trip_time_by_sec", IntegerType(), True),
    StructField("pickup_time", TimestampType(), True),
    StructField("dropoff_time", TimestampType(), True),
    StructField("fare", DoubleType(), True),
    StructField("tips", DoubleType(), True),
    StructField("total", DoubleType(), True),
    StructField("verbose", StringType(), True)
])

# Cleanse the data by removing the verbose column and creating 3 new columns: Year, Month, TipRate
def cleansingTripDataframe(sourceDF):
    sourceDF = (sourceDF.withColumn("city", lit("shanghai")).withColumn(
        "tips_rate", round(col("tips") / col("total"), 2)).withColumn(
            "year", substring(col("pickup_time"), 0, 4)).withColumn(
                "month", substring(col("pickup_time"), 6, 2)).withColumn("day",
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 16:55:00 2019

@author: willy
"""

if __name__ == '__main__':
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import explode
    from pyspark.sql.functions import split
    from pyspark.sql.types import StructType, StructField, StringType, LongType
    import findspark

    spark = SparkSession.builder.appName("StructuredNetworkWordCount").getOrCreate()

    userSchema = StructType([
        StructField("id", LongType(), True),
        StructField("text", StringType(), True),
    ])

    df = spark.readStream.schema(userSchema).json('/home/jovyan/git/personal/lab/spark101/twitter/tweets/')

    a = "/home/jovyan/git/personal/lab/spark101/twitter/twitter/final/final.txt"
    b = "/home/jovyan/git/personal/lab/spark101/twitter/twitter/final/final.txt"

    df.writeStream.format("console").outputMode("append").start().awaitTermination()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

# Create schema when reading u.data
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)])

# Load up movie data as dataframe
moviesDF = spark.read.option("sep", "\t").schema(schema).csv("ml-100k/u.data")

# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = moviesDF.groupBy("movieID").count().orderBy(desc("count"))

# Grab the top 10
topMovieIDs.show(10)

# Stop the session
spark.stop()
def main(): random.seed(2018) # spark config conf = SparkConf() conf.setMaster("local").setAppName("MemoryBasedCF") conf.set("spark.network.timeout", "3600s") conf.set("spark.executor.heartbeatInterval", "3000s") sc = SparkContext(conf=conf) #sc.setLogLevel("ERROR") sc.setCheckpointDir("checkpoint") sqlContext = SQLContext(sc) ''' load train data ''' train_path = 'PA/Restaurants/train/' train_user = train_path + 'PA_train_yelp_academic_dataset_user.csv' train_review = train_path + 'PA_train_yelp_academic_dataset_review.csv' train_business = train_path + 'PA_train_yelp_academic_dataset_business.csv' train_tips = train_path + 'PA_train_yelp_academic_dataset_tip.csv' train_checkin = train_path + 'PA_train_yelp_academic_dataset_checkin.csv' schema_review = StructType([ StructField("funny", IntegerType()), StructField("user_id", StringType()), StructField("review_id", StringType()), StructField("text", StringType()), StructField("business_id", StringType()), StructField("stars", IntegerType()), StructField("date", StringType()), StructField("useful", IntegerType()), StructField("cool", IntegerType()), StructField("1overN", DoubleType()), StructField("2overN", DoubleType()), StructField("percentile", DoubleType()) ]) rawData_review = sqlContext.read.format("com.databricks.spark.csv") \ .option("header", "true") \ .option("inferschema", "true") \ .option("mode", "DROPMALFORMED") \ .schema(schema_review) \ .load(train_review).rdd rawData_business = sqlContext.read.format("com.databricks.spark.csv")\ .option("header", "true")\ .option("inferschema", "true")\ .option("mode", "DROPMALFORMED")\ .load(train_business).rdd # Step1: find nn for users using review text userReviewVec, lshf, transformer, idMap, stop_words = findNN(rawData_review) print "Step1 Completed" # Step2: find business attr mappings busiAttrMap = findBusiAttrMapping(rawData_business) print "Step2 Completed" # Step3: get user business map userBusiMap = rawData_review.map(lambda x: (x[1], [x[4]])).reduceByKey(lambda x, y: x + y).collectAsMap() print "Step3 Completed" # Step4: for each user in knn find its business, then compute a weighted vote on their business #userReviewVec = userReviewVec.collectAsMap() #print(weightedVote(userReviewVec['IjVuk0tawvT0ygazmrBQEg'], userBusiMap, busiAttrMap)) userReviewVec = userReviewVec.map(lambda x: (x[0], weightedVote(x[1], userBusiMap, busiAttrMap))) #print userReviewVec.collectAsMap()['IjVuk0tawvT0ygazmrBQEg'] print "Step4 Completed" # Step5: find true business mapping # run train on train for test first #true_review = rawData_review.map(lambda x: (x[1], busiAttrMap[str(x[4])])) true_review = rawData_review.map(lambda x: (x[1], busiAttrMap[str(x[4])])).collectAsMap() print "Step5 Completed" # Step6: join prediction and true val #result = userReviewVec.collect() #result2 = true_review.collect()#.join(true_review) result = userReviewVec.filter(lambda x: x[0] in true_review).map(lambda x: (x[0], (x[1], true_review[x[0]]))) print "Step6 Completed" # Step7: Compute error between prediction and true mapping MSE = result.map(lambda x: computeMSE(x[1][0], x[1][1])).collect() MSE = np.mean(MSE, axis=0) RMSE = MSE ** 0.5 print "Step7 Completed" # Step8: Output the results print RMSE with open('result_train.txt', 'w') as f: f.writelines([str(RMSE)]) result = result.sortByKey() if os.path.exists('result/train_result'): shutil.rmtree('result/train_result') result.saveAsTextFile('result/train_result') print "Step8 Completed" # Step9: Run validation data valid_path = 'PA/Restaurants/valid/' valid_review = 
valid_path + 'PA_valid_yelp_academic_dataset_review.csv' valid_business = valid_path + 'PA_valid_yelp_academic_dataset_business.csv' runPrediction(sc, sqlContext, valid_review, valid_business, schema_review, userBusiMap, busiAttrMap, lshf, transformer, idMap, stop_words, "valid") print "Step9 Completed" # Step10: Run Test data test_path = 'PA/Restaurants/test/' test_review = test_path + 'PA_test_yelp_academic_dataset_review.csv' test_business = test_path + 'PA_test_yelp_academic_dataset_business.csv' runPrediction(sc, sqlContext, test_review, test_business, schema_review, userBusiMap, busiAttrMap, lshf, transformer, idMap, stop_words, "test") print "Step10 Completed" return
from pyspark.sql import SparkSession, Row, functions
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

    # Defining the schemas for the tables
    ratingsSchema = StructType([
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        StructField("rating", IntegerType(), True)
    ])

    itemSchema = StructType([
        StructField("movieId", IntegerType(), True),
        StructField("movieTitle", StringType(), True),
        StructField("releaseDate", StringType(), True),
        StructField("videoReleaseDate", StringType(), True),
        StructField("imdb", StringType(), True),
        StructField("unknown", IntegerType(), True),
        StructField("Action", IntegerType(), True),
        StructField("Adventure", IntegerType(), True),
        StructField("Animation", IntegerType(), True),
        StructField("Childrens", IntegerType(), True),
        StructField("Comedy", IntegerType(), True),
        StructField("Crime", IntegerType(), True),
        StructField("Documentary", IntegerType(), True),
        StructField("Drama", IntegerType(), True),
        StructField("Fantasy", IntegerType(), True),
        StructField("FilmNoir", IntegerType(), True),
        StructField("Horror", IntegerType(), True),
pdline = pandas.read_csv("/Users/xiejinxin/datafloder/test/copy.csv") spkdf = sqlc.createDataFrame(pdline) print(pdline) print(spkdf) spkdf.createOrReplaceTempView("tmp1") data = sqlc.sql( "select row_number() over(order by relation_year) as id, cust_level,relation_year,sex,age,cust_status,is_xyk_kk,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,if(finance_bal > 0,1,0) as label from tmp1" ) data.show() tran = data.rdd.map(lambda x: LabeledPoint( list(x)[17], SparseVector(16, [i for i in range(16)], list(x)[1:17]))) #cust_level,relation_year,sex,age,eva_bal_rmb,cnt,transamt,liucun_bal #cust_level,relation_year,sex,age,cust_status,is_xyk_kk,degree,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,finance_bal,finance_bal_bb,finance_bal_fbb,invest_bal,ldjj_bal,gz_aum_bal,b_aum_bal,gold_bal,trust_bal,insurance_bal,third_bal,loan_house_bal,loan_car_bal,loan_mana_bal,loan_stuty_bal,loan_other_bal,ola_aum_bal,b_z_cd_aum_bal,loan_z_cd,zhc_aum_bal,jer_bal,dly_bal,hxlc_bal,jeqj_bal,jegd_bal,jewy_bal,dzzh_bal,decd_bal,xfc_aum_bal,jj_tot_vol,card_xy_bal_last_m_avg,card_xy_bal_last_m_avg_y,card_swing_bal_avg,card_swing_bal_avg_y,card_swing_num_avg,card_swing_num_avg_y,corpname,tran_amt_1m,tran_num_1m,tran_amt_3m,tran_num_3m,tran_amt_6m,tran_num_6m,day_cnt,tran_wy_amt_1m,tran_wy_num_1m,tran_wy_amt_3m,tran_wy_num_3m,tran_wy_amt_6m,tran_wy_num_6m,day_wy_cnt,tran_dz_amt_1m,tran_dz_num_1m,tran_dz_amt_3m,tran_dz_num_3m,tran_dz_amt_6m,tran_dz_num_6m,day_dz_cnt,tran_atm_amt_1m,tran_atm_num_1m,tran_atm_amt_3m,tran_atm_num_3m,tran_atm_amt_6m,tran_atm_num_6m,day_atm_cnt,tran_gt_amt_1m,tran_gt_num_1m,tran_gt_amt_3m,tran_gt_num_3m,tran_gt_amt_6m,tran_gt_num_6m,day_gt_cnt,tran_pos_amt_1m,tran_pos_num_1m,tran_pos_amt_3m,tran_pos_num_3m,tran_pos_amt_6m,tran_pos_num_6m,day_pos_cnt,tran_sj_amt_1m,tran_sj_num_1m,tran_sj_amt_3m,tran_sj_num_3m,tran_sj_amt_6m,tran_sj_num_6m,day_sj_cnt,tran_dh_amt_1m,tran_dh_num_1m,tran_dh_amt_3m,tran_dh_num_3m,tran_dh_amt_6m,tran_dh_num_6m,day_dh_cnt,is_despoit,is_fixed,is_finance,is_fund,is_gz_aum,is_insurance,is_gold,is_third,is_trust,is_loan,is_cbank,is_xyk,is_finance_bb,is_finance_fbb,is_ldjj,is_loan_house,is_loan_car,is_loan_mana,is_loan_stuty,is_loan_other,is_ola_aum,is_zhc_aum,is_jer,is_dly,is_hxlc,is_jeqj,is_jewy,is_decd,is_xfc_aum,' model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10) model.save(sc, "./gbdymodelonlionev1") a = StructType([ StructField("ID", StringType(), False), StructField("cust_type", StringType(), True), StructField("cust_level", IntegerType(), True) ]) print(a)
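# The snippet above trains and saves a GradientBoostedTrees regressor but never
# evaluates it. A minimal follow-up sketch, assuming the same SparkContext `sc`
# and the `tran` RDD of LabeledPoint built above; this follows the standard MLlib
# evaluation pattern and is not code from the original.
from pyspark.mllib.tree import GradientBoostedTreesModel

# Reload the saved model and measure the training-set error.
same_model = GradientBoostedTreesModel.load(sc, "./gbdymodelonlionev1")
predictions = same_model.predict(tran.map(lambda lp: lp.features))
labels_and_preds = tran.map(lambda lp: lp.label).zip(predictions)
train_mse = labels_and_preds.map(lambda lp: (lp[0] - lp[1]) ** 2).mean()
print("Training MSE: " + str(train_mse))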
spark = SparkSession\
    .builder\
    .appName("StructuredNetworkWordCount")\
    .getOrCreate()

# Create DataFrame representing the stream of input lines from connection to host:port
"""lines = spark\
    .readStream\
    .format('socket')\
    .option('host', host)\
    .option('port', port)\
    .load()"""

#"ID","language","Date","source","len","likes","RTs","Hashtags","Usernames","Userid","name","Place","followers","friends"
user_schema = StructType().add("ID", "string").add("language", "string") \
    .add("Date", "string").add("source", "string").add("len", "string") \
    .add("likes", "integer").add("RTs", "integer").add("Hashtags", "string") \
    .add("Usernames", "string").add("Userid", "string").add("name", "string") \
    .add("Place", "string").add("followers", "integer").add("friends", "integer")

lines = spark\
    .readStream\
    .option("sep", ";")\
    .schema(user_schema)\
    .csv("hdfs://localhost:9000/stream")

# Select the user name and the friends/followers ratio for each incoming tweet
words = lines.select(
    lines.name,
    lines.friends/lines.followers
)

# Keep the maximum friends/followers ratio per user name (first 5 rows only)
wordCounts = words.groupBy(lines.name).agg({"(friends / followers)": "max"}).limit(5)
wordCounts = wordCounts.select(col("name"), col("max((friends / followers))").alias("FRRatio"))
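# The excerpt stops before any sink is attached, so the query above never runs.
# A minimal sketch of starting it on the console sink; the output mode is an
# assumption (complete mode is used so every trigger re-emits the full aggregate).
query = wordCounts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()
query.awaitTermination()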
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

spark = SparkSession \
    .builder \
    .appName("Bigdata") \
    .getOrCreate()

userSchema = StructType().add("ID", "string").add("Lang", "string").add("Date", "string") \
    .add("Source", "string").add("len", "integer").add("Likes", "integer") \
    .add("RTs", "integer").add("Hashtags", "string").add("UserMentionNames", "string") \
    .add("UserMentionID", "string").add("Name", "string").add("Place", "string") \
    .add("Followers", "integer").add("Friends", "integer")

'''csvDF = spark \
    .readStream \
    .option("sep", ";") \
    .schema(userSchema) \
    .csv("/input")
'''

dfCSV = spark.readStream.option("sep", ";").option("header", "false").schema(userSchema).csv("/stream1")
dfCSV.createOrReplaceTempView("Bigdata")
summ = spark.sql("select Name, (Followers/Friends) as FRratio from Bigdata order by FRratio desc limit 5")

# start() returns the StreamingQuery; awaitTermination must be called on that
# query object, not on the DataStreamWriter
query = summ.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(100)
spark.stop()
# Databricks notebook source
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

# COMMAND ----------

from pyspark.sql.types import *

# COMMAND ----------

userSchema = StructType([
    StructField("desc", StringType(), True),
    StructField("zip", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("timeStamp", StringType(), True),
    StructField("twp", StringType(), True),
    StructField("addr", StringType(), True)
])

# COMMAND ----------

fileStreamDf = spark.readStream.option(
    "header", "true").schema(userSchema).csv("/FileStore/tables/")

# COMMAND ----------

fileStreamDf.createOrReplaceTempView("calls")

# COMMAND ----------

# Count the number of records per title
callsPerTitle = spark.sql("select count(title), title from calls group by title")
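# COMMAND ----------

# The notebook defines the streaming aggregation but never starts a query.
# A minimal sketch using an in-memory sink so the running counts can be
# inspected with SQL; the sink choice and query name are assumptions.
query = callsPerTitle.writeStream \
    .outputMode("complete") \
    .queryName("calls_per_title") \
    .format("memory") \
    .start()

# COMMAND ----------

spark.sql("select * from calls_per_title").show()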
#Read All JSON files from a directory df3 = spark.read.json("resources/*.json") df3.show() # Define custom schema schema = StructType([ StructField("RecordNumber", IntegerType(), True), StructField("Zipcode", IntegerType(), True), StructField("ZipCodeType", StringType(), True), StructField("City", StringType(), True), StructField("State", StringType(), True), StructField("LocationType", StringType(), True), StructField("Lat", DoubleType(), True), StructField("Long", DoubleType(), True), StructField("Xaxis", IntegerType(), True), StructField("Yaxis", DoubleType(), True), StructField("Zaxis", DoubleType(), True), StructField("WorldRegion", StringType(), True), StructField("Country", StringType(), True), StructField("LocationText", StringType(), True), StructField("Location", StringType(), True), StructField("Decommisioned", BooleanType(), True), StructField("TaxReturnsFiled", StringType(), True), StructField("EstimatedPopulation", IntegerType(), True), StructField("TotalWages", IntegerType(), True), StructField("Notes", StringType(), True) ]) df_with_schema = spark.read.schema(schema) \ .json("resources/zipcodes.json") df_with_schema.printSchema() df_with_schema.show()
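# The same schema can also be supplied as a DDL-formatted string, which is often
# easier to maintain for wide, flat records. A minimal sketch; the column list is
# deliberately abbreviated here, so only the listed fields would be read.
ddl_schema = "RecordNumber INT, Zipcode INT, ZipCodeType STRING, City STRING, State STRING"
df_ddl = spark.read.schema(ddl_schema).json("resources/zipcodes.json")
df_ddl.printSchema()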
join staging_songs as s on e.song like s.title where e.page like 'NextSong' and e.song is not null and s.artist_id is not null and e.artist like s.artist_name """) insert_table_queries = [ user_table_insert, song_table_insert, artist_table_insert, time_table_insert, songplay_table_insert ] usr_schema = StructType([ \ StructField("userid", IntegerType(), False), \ StructField("firstname", StringType(), True), \ StructField("lastname", StringType(), True), \ StructField("gender", StringType(), True), \ StructField("level", StringType(), True), \ ]) song_schema = StructType([ \ StructField("song_id", StringType(), False), \ StructField("title", StringType(), True), \ StructField("artist_id", StringType(), True), \ StructField("year", IntegerType(), True), \ StructField("duration", FloatType(), True), \ ]) artist_schema = StructType([ \
schema = StructType([ StructField('geo',StringType(), True), StructField('yr1980', DoubleType(), False), StructField('yr1981', DoubleType(), False), StructField('yr1982', DoubleType(), False), StructField('yr1983', DoubleType(), False), StructField('yr1984', DoubleType(), False), StructField('yr1985', DoubleType(), False), StructField('yr1986', DoubleType(), False), StructField('yr1987', DoubleType(), False), StructField('yr1988', DoubleType(), False), StructField('yr1989', DoubleType(), False), StructField('yr1990', DoubleType(), False), StructField('yr1991', DoubleType(), False), StructField('yr1992', DoubleType(), False), StructField('yr1993', DoubleType(), False), StructField('yr1994', DoubleType(), False), StructField('yr1995', DoubleType(), False), StructField('yr1996', DoubleType(), False), StructField('yr1997', DoubleType(), False), StructField('yr1998', DoubleType(), False), StructField('yr1999', DoubleType(), False), StructField('yr2000', DoubleType(), False), StructField('yr2001', DoubleType(), False), StructField('yr2002', DoubleType(), False), StructField('yr2003', DoubleType(), False), StructField('yr2004', DoubleType(), False), StructField('yr2005', DoubleType(), False), StructField('yr2006', DoubleType(), False), StructField('yr2007', DoubleType(), False), StructField('yr2008', DoubleType(), False), StructField('yr2009', DoubleType(), False), StructField('yr2010', DoubleType(), False) ])
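# A schema this wide is usually easier to analyse in long form. A sketch of
# unpivoting the year columns with the SQL stack() function, assuming a
# DataFrame `df` was read with the schema above (the source file is not shown):
from pyspark.sql import functions as F

year_cols = ["yr{}".format(y) for y in range(1980, 2011)]
stack_expr = "stack({n}, {pairs}) as (year, value)".format(
    n=len(year_cols),
    pairs=", ".join("'{c}', {c}".format(c=c) for c in year_cols))
long_df = df.select("geo", F.expr(stack_expr))
long_df.show(5)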
# All the queries are based largely on queries found in the book: # Bill Chambers & Matei Zaharia , # Spark: The Definitive Guide: Big Data Processing Made Simple, # O'Reilly, 2018 # ------------------------------------------------------------------------------ # Apache Spark 2.x DataFrames and Python Pyspark # ------------------------------------------------------------------------------ # ------------------We first create product and customer dataframes------------- from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType # ------------------------------------------------------------------------------ # Create products dataframe manual_schema_products = StructType([ StructField('product_id', IntegerType(), False), StructField('product_name', StringType(), True), StructField('product_specs', StringType(), True), StructField('price', FloatType(), True) ]) array_temp1 = [[1, 'Apple iPhone 6', '16GB memory', 200.0], [2, 'Samsung Galaxy 5', '16 megapixel camera', 100.0]] in_memory_products_dataframe = spark.createDataFrame(array_temp1, manual_schema_products) in_memory_products_dataframe.show(2) in_memory_products_dataframe.cache() # ----------------------------------------------------------------------------- # Create customers dataframe
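# -----------------------------------------------------------------------------
# The customers dataframe below is an illustrative sketch in the same style as
# the products example; the column names and sample rows are assumptions, not
# the book's actual example.
manual_schema_customers = StructType([
    StructField('customer_id', IntegerType(), False),
    StructField('customer_name', StringType(), True),
    StructField('city', StringType(), True)
])
array_temp2 = [[100, 'Alice', 'Helsinki'],
               [101, 'Bob', 'Toronto']]
in_memory_customers_dataframe = spark.createDataFrame(array_temp2, manual_schema_customers)
in_memory_customers_dataframe.show(2)
in_memory_customers_dataframe.cache()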
def main(): spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate() args = getResolvedOptions(sys.argv, [ 'S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_OUTPUT_BUCKET', 'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET', 'S3_MODEL_KEY_PREFIX' ]) # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format spark.sparkContext._jsc.hadoopConfiguration().set( "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter") # Defining the schema corresponding to the input data. The input data does not contain the headers schema = StructType([ StructField("label", IntegerType(), True), StructField("title", StringType(), True), StructField("abstract", StringType(), True) ]) # Download the data from S3 into two separate Dataframes traindf = spark.read.csv(('s3://' + os.path.join( args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'train.csv')), header=False, schema=schema, encoding='UTF-8') validationdf = spark.read.csv(('s3://' + os.path.join( args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'test.csv')), header=False, schema=schema, encoding='UTF-8') # Tokenize the abstract column which contains the input text tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract") # Save transformed training data to CSV in S3 by converting to RDD. transformed_traindf = tokenizer.transform(traindf) transformed_train_rdd = transformed_traindf.rdd.map( lambda x: (x.label, x.tokenized_abstract)) lines = transformed_train_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile('s3://' + os.path.join( args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train')) # Similar data processing for validation dataset. transformed_validation = tokenizer.transform(validationdf) transformed_validation_rdd = transformed_validation.rdd.map( lambda x: (x.label, x.tokenized_abstract)) lines = transformed_validation_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile('s3://' + os.path.join( args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation')) # Serialize the tokenizer via MLeap and upload to S3 SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation) # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file. import zipfile with zipfile.ZipFile("/tmp/model.zip") as zf: zf.extractall("/tmp/model") # Write back the content as a .tar.gz file import tarfile with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar: tar.add("/tmp/model/bundle.json", arcname='bundle.json') tar.add("/tmp/model/root", arcname='root') s3 = boto3.resource('s3') file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz') s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
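# The script above references several names that are not imported or defined in
# this excerpt (getResolvedOptions, Tokenizer, SimpleSparkSerializer, boto3,
# csv_line, ...). A sketch of the imports it appears to rely on, plus a
# hypothetical csv_line helper; the helper's exact output format is an assumption.
import os
import sys

import boto3
from awsglue.utils import getResolvedOptions
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.ml.feature import Tokenizer
from mleap.pyspark.spark_support import SimpleSparkSerializer

def csv_line(data):
    # Hypothetical helper: emit "label,tok1 tok2 ..." for each (label, tokens) pair.
    return str(data[0]) + "," + " ".join(data[1])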