def test_validate_column_types(self):
    from pyspark.sql.functions import udf, to_json
    from pyspark.sql.column import _to_java_column

    self.assertTrue("Column" in _to_java_column("a").getClass().toString())
    self.assertTrue("Column" in _to_java_column(u"a").getClass().toString())
    self.assertTrue("Column" in _to_java_column(self.spark.range(1).id).getClass().toString())

    self.assertRaisesRegexp(
        TypeError,
        "Invalid argument, not a string or column",
        lambda: _to_java_column(1))

    class A():
        pass

    self.assertRaises(TypeError, lambda: _to_java_column(A()))
    self.assertRaises(TypeError, lambda: _to_java_column([]))

    self.assertRaisesRegexp(
        TypeError,
        "Invalid argument, not a string or column",
        lambda: udf(lambda x: x)(None))
    self.assertRaises(TypeError, lambda: to_json(1))
df = (spark.read.format('delta').load("s3://oetrta/volker/datasets/turbine/kinesis_sample/")
      .withColumn("jsonData", from_json(col("value"), jsonSchema))
      .select("key", "jsonData.*"))

# COMMAND ----------

df.cache()

# COMMAND ----------

df_sample = (df.sample(fraction=0.01).limit(500)
             .dropDuplicates(["ID"])
             .withColumn("TIMESTAMP", current_timestamp())
             .select('key', to_json(struct(col('*'))).alias('value')))
pdf = df_sample.toPandas()
display(df_sample)

# COMMAND ----------

while True:
    df_sample = (df.sample(fraction=0.01).limit(500)
                 .dropDuplicates(["ID"])
                 .withColumn("TIMESTAMP", current_timestamp())
                 .select('key', to_json(struct(col('*'))).alias('value')))
    pdf = df_sample.toPandas()
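
# COMMAND ----------

# Note: `jsonSchema` is used above but not defined in this snippet. A minimal sketch of
# what such a schema might look like follows; apart from ID and TIMESTAMP (which the
# snippet references), the field names and types are assumptions for illustration only.
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

jsonSchema = StructType([
    StructField("ID", StringType(), True),            # device identifier (used in dropDuplicates above)
    StructField("TIMESTAMP", TimestampType(), True),  # event time (overwritten with current_timestamp() above)
    StructField("AN3", DoubleType(), True),           # assumed sensor reading
    StructField("SPEED", DoubleType(), True),         # assumed sensor reading
])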
def run(spark):
    """
    This is an attempt at combining multiple video sources into a grid of images.
    WARNING: This is broken because Spark is not maintaining the time order of the images.
    This file has been superseded by the Flink/Java class MultiVideoGridJob in the
    flinkprocessor directory.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid out-of-memory, the JVM will
    # send to the Python UDF this batch size.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv('CHECKPOINT_LOCATION', '/tmp/spark_checkpoints_multi_video_grid')
    shutil.rmtree(checkpoint_location, ignore_errors=True)

    df = (spark.readStream
          .format("pravega")
          .option("controller", controller)
          .option("scope", scope)
          .option("stream", "video")
          .option("encoding", "chunked_v1")
          # .option("start_stream_cut", "earliest")
          .load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*', from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))

    fps = 2.0
    df = df.selectExpr(
        '*',
        f'timestamp(floor(cast(timestamp as double) * {fps}) / {fps}) as discrete_timestamp')
    df = df.withWatermark('discrete_timestamp', '5 second')
    df = df.drop('raw_event', 'event_string', 'event')

    thumbnail_size = (84, 84)

    @pandas_udf(returnType='binary', functionType=PandasUDFType.SCALAR)
    def decode_and_scale_image(data_series, ssrc):
        def f(data):
            in_pil = Image.open(io.BytesIO(data))
            out_pil = in_pil.resize(thumbnail_size)
            return out_pil.tobytes()
        return data_series.apply(f)

    df = df.select('*', decode_and_scale_image(df['data'], df['ssrc']).alias('image'))
    df = df.select(
        '*',
        func.to_json(func.struct(df['discrete_timestamp'], df['frame_number'], df['camera'])).alias('json'))
    df = df.repartition(1)

    grp = df.groupby(
        # window('timestamp', '1 second'),
        'discrete_timestamp',
    )

    @pandas_udf(
        returnType='timestamp timestamp, frame_number int, ssrc int, data binary, source string',
        functionType=PandasUDFType.GROUPED_MAP)
    def combine_images_into_grid(df):
        # TODO: This Pandas UDF provides incorrect results because it is called
        # before the aggregation is finalized by the watermark.
        if df.empty:
            return None
        row0 = df.iloc[0]
        num_cameras = df.camera.max() + 1
        # Determine number of images per row and column.
        grid_count = math.ceil(math.sqrt(num_cameras))
        image_width = thumbnail_size[0]
        image_height = thumbnail_size[1]
        image_mode = 'RGB'
        margin = 1
        status_width = 0

        # Create blank output image, grey background.
        out_pil = Image.new(
            'RGB',
            ((image_width + margin) * grid_count - margin + status_width,
             (image_height + margin) * grid_count - margin),
            (128, 128, 128))

        # Add images from each camera.
        def add_image(r):
            # in_pil = Image.open(io.BytesIO(r['image']))
            in_pil = Image.frombytes(image_mode, (image_width, image_height), r['image'])
            x = (r['camera'] % grid_count) * (image_width + margin)
            y = (r['camera'] // grid_count) * (image_width + margin)
            out_pil.paste(in_pil, (x, y))

        df.apply(add_image, axis=1)

        # font = ImageFont.truetype('/usr/share/fonts/truetype/freefont/FreeSans.ttf', font_size)
        # draw = ImageDraw.Draw(img)
        # draw.text((status_width, 0), 'FRAME\n%05d\nCAMERA\n %03d' % (frame_number, camera), font=font, align='center')

        out_bytesio = io.BytesIO()
        out_pil.save(out_bytesio, format='PNG', compress_level=0)
        out_bytes = out_bytesio.getvalue()

        new_row = pd.Series()
        new_row['timestamp'] = row0['discrete_timestamp']
        new_row['ssrc'] = 0
        new_row['frame_number'] = 0
        new_row['source'] = df[['camera', 'frame_number', 'timestamp']].to_json()
        new_row['data'] = out_bytes
        # new_row['data'] = b''
        return pd.DataFrame([new_row])

    # @pandas_udf(returnType='string', functionType=PandasUDFType.SCALAR)
    # def combine_images_into_grid2(json):
    #     # TODO
    #     def f(data):
    #         in_pil = Image.open(io.BytesIO(data))
    #         out_pil = in_pil.resize(thumbnail_size)
    #         return out_pil.tobytes()
    #     return data_series.apply(f)

    df = grp.apply(combine_images_into_grid)
    df = df.select(func.to_json(func.struct(df["frame_number"], df["data"])).alias("event"))

    # df = grp.agg(func.collect_list('json'))
    # df = df.selectExpr('*', '0 as ssrc')
    # window = Window.partitionBy('ssrc').orderBy('discrete_timestamp').rowsBetween(Window.unboundedPreceding, Window.currentRow)
    # df = df.select('*', func.row_number().over(window))

    # TODO: Output rows are not written in timestamp order. How can this be fixed?
    # Below gives error: Sorting is not supported on streaming DataFrames/Datasets,
    # unless it is on aggregated DataFrame/Dataset in Complete output mode
    # df = df.sortWithinPartitions(df['discrete_timestamp'])

    df.printSchema()

    if False:
        (df.writeStream
         # .trigger(processingTime='1000 milliseconds')    # limit trigger rate
         .outputMode('append')
         .format('console')
         .option('truncate', 'false')
         .option('checkpointLocation', checkpoint_location)
         .start()
         .awaitTermination())
    else:
        (df.writeStream
         .trigger(processingTime="1000 milliseconds")
         .outputMode("append")
         .format("pravega")
         .option("controller", controller)
         .option("scope", scope)
         .option("stream", "combinedvideo")
         .option("checkpointLocation", checkpoint_location)
         .start()
         .awaitTermination())
S3_BUCKET_REPLICATION_DATA = 'vtex-orders-index'
S3_BUCKET_DATALAKE = 'vtex.datalake'
S3_DATALAKE_SCHEMA_DIR = 'sample_schema/orders'

## Getting Schema's Interface from Checkout Structured Json
cleansed_df = spark.read.json(
    get_all_paths(S3_BUCKET_DATALAKE, S3_DATALAKE_SCHEMA_DIR))

## Reading data from Checkout History
df = spark.read.json(
    get_all_paths(S3_BUCKET_HISTORIC_DATA,
                  S3_BUCKET_REPLICATION_DATA + '/' + folder_prefix_filter))

## Temporary column to convert df data to JSON.
df = df.withColumn("ToJSON", to_json(struct([df[x] for x in df.columns])))

df = struct_data_frame(df, cleansed_df)
df = create_partition_columns(df)

## Deletes the temporary column ToJSON
df = df.drop("ToJSON")
df = rewriteColumnNames(df)

### Writing data into S3 bucket
#### Save table to S3 using Parquet format and partitioning by defined columns
df.repartition('ingestion_year', 'ingestion_month', 'ingestion_day', 'ingestion_hour')\
    .write\
    .partitionBy('ingestion_year', 'ingestion_month', 'ingestion_day', 'ingestion_hour')\
    .mode('append')\
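
# `get_all_paths` is used above but not defined in this snippet. A minimal sketch of
# what such a helper might look like (listing object keys under a prefix with boto3
# and returning fully-qualified s3:// paths) follows; it is an illustrative assumption,
# not the original implementation.
import boto3

def get_all_paths(bucket, prefix):
    """Return s3:// paths for every object under `prefix` in `bucket`."""
    s3 = boto3.client('s3')
    paths = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            paths.append('s3://{}/{}'.format(bucket, obj['Key']))
    return paths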
def process(time, rdd):
    print("========= %s =========" % str(time))
    # Example timestamps: 2018-05-28T13:52:07.0000000Z, 2018-05-28T13:52:35.6721175Z
    format_1 = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSS'Z'"
    a = time_py.time()
    try:
        # Get the singleton instance of SparkSession
        if (not rdd.isEmpty()):
            spark = getSparkSessionInstance(rdd.context.getConf())
            # rdd.context.clearCache()

            # Get data from the Kafka JSON into a DataFrame
            df = spark.read.json(rdd.map(lambda x: x[1]))
            df = df.rdd.repartition(100).toDF()
            print(df.count())
            df = df.withColumn('observationDate', from_unixtime(unix_timestamp('observationTime', format_1))).\
                withColumn('serverDate', from_unixtime(unix_timestamp('serverTime', format_1)))

            # Use current_date(), col("observationDate") in production
            df = df.where(datediff(col("serverTime"), col("observationDate")) < 7)

            joinUserSensor = getDataUsers(rdd.context)

            # Join data from HDFS and the stream; the join can be "inner", "leftOuter" or "rightOuter"
            joinData = df.alias('stream').join(
                joinUserSensor.alias('data'),
                col('stream.sensorId') == col('data.sensorId'),
                "leftOuter")

            dataToSend = joinData.select(
                "Type", "altitude", "coordinates_lat", "coordinates_long", "date",
                "observationTime", "observationDate", "dateSend", "serverDate",
                "heading", "location", "stream.sensorId", "serverTime", "speed",
                "speedmetric", "temp", "id_user", "name")
            dataToSend = dataToSend.withColumn("id", monotonically_increasing_id())

            # Wrap the reverse-geocoding function in a UDF
            schema4udf = StructType([
                StructField("addrs_name", StringType()),
                StructField("max_speed", IntegerType())
            ])
            reference_to_dict_udf = udf(reference_to_dict, schema4udf)

            # Look up the georeference for each position
            dataToSend = dataToSend.withColumn(
                "data_osm",
                reference_to_dict_udf(struct([dataToSend[x] for x in
                                              ['coordinates_lat', 'coordinates_long', 'Type', 'speed']])))
            dataToSend = dataToSend.select(
                "id", "Type", "altitude", "coordinates_lat", "coordinates_long", "date",
                "observationTime", "observationDate", "dateSend", "serverDate", "serverTime",
                "heading", "location", "stream.sensorId", "speed", "speedmetric", "temp",
                "id_user", "name",
                col("data_osm.addrs_name").alias("addrs_name"),
                col("data_osm.max_speed").alias("max_speed"))

            actualCoordinates = dataToSend.select("id", "coordinates_lat", "coordinates_long")

            # Load the accident black spots
            blackShapes = getBlackShapes(rdd.context)

            # Cross join them with the vehicles' current positions
            nearBlkShp = actualCoordinates.crossJoin(blackShapes)

            # Keep only the points within roughly ~1 km distance
            # How to find the nearest position efficiently?
            # https://gis.stackexchange.com/questions/8650/measuring-accuracy-of-latitude-and-longitude?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
            nearBlkShp = nearBlkShp.filter(
                (nearBlkShp.lat_min <= nearBlkShp.coordinates_lat) &
                (nearBlkShp.lat_max >= nearBlkShp.coordinates_lat) &
                (nearBlkShp.long_min <= nearBlkShp.coordinates_long) &
                (nearBlkShp.long_max >= nearBlkShp.coordinates_long))

            # Compute the distance to each black spot
            nearBlkShp = nearBlkShp.select(
                col("id"), col("Address"), col("Province"), col("Country"), col("numAccident"),
                col("lat").alias("blk_point_lat"), col("long").alias("blk_point_long"),
                dist(col('coordinates_lat'), col('coordinates_long'), col('lat'), col('long')).alias("Distance"))

            # Keep the minimum distance per vehicle
            minD4 = nearBlkShp.groupBy("id").min("Distance")

            # Keep only the rows whose distance equals that minimum
            finalNearBlkShp = minD4.alias('mins').join(
                nearBlkShp.alias('dataBlkShp'),
                (col('mins.id') == col('dataBlkShp.id')) &
                (col('mins.min(Distance)') == col('dataBlkShp.Distance')),
                "leftOuter").\
                select(col("dataBlkShp.id").alias('id'),
                       col("dataBlkShp.Address").alias('address'),
                       col("dataBlkShp.Province").alias('province'),
                       col("dataBlkShp.Country").alias('country'),
                       col("dataBlkShp.numAccident").alias('accidents'),
                       col("dataBlkShp.blk_point_lat").alias('blk_point_lat'),
                       col("dataBlkShp.blk_point_long").alias('blk_point_long'),
                       col("dataBlkShp.Distance").alias('dist_to_blk_shp'))

            # Join the black spots with the data
            dataToSend = dataToSend.alias('data').join(
                finalNearBlkShp.alias('blk_shp'),
                col('data.id') == col('blk_shp.id'),
                "leftOuter")
            dataToSend = dataToSend.select(
                col("data.Type").alias("Type"),
                col("data.altitude").alias("altitude"),
                col("data.observationTime").alias("observationTime"),
                col("data.dateSend").alias("dateSend"),
                col("data.serverTime").alias("serverTime"),
                col("data.heading").alias("heading"),
                col("data.location").alias("location"),
                col("data.sensorId").alias("sensorId"),
                col("data.speed").alias("speed"),
                col("data.speedmetric").alias("speedmetric"),
                col("data.temp").alias("temp"),
                col("data.id_user").alias("id_user"),
                col("data.name").alias("user"),
                col("data.addrs_name").alias("actual_address"),
                col("data.max_speed").alias("max_speed"),
                col("blk_shp.address").alias("blk_shp_address"),
                col("blk_shp.province").alias("blk_shp_province"),
                col("blk_shp.country").alias("blk_shp_country"),
                col("blk_shp.accidents").alias("blk_shp_accidents"),
                array(col('blk_shp.blk_point_lat'), col('blk_shp.blk_point_long')).alias("blk_shp_coordinates"),
                col("blk_shp.dist_to_blk_shp").alias("blk_shp_dist"))

            # Send data
            # dataToSend.printSchema()
            print(dataToSend.rdd.getNumPartitions())
            dataToSend.select(
                to_json(struct([dataToSend[x] for x in dataToSend.columns])).alias("value"))\
                .write.format("kafka")\
                .option("kafka.bootstrap.servers", kServer)\
                .option("topic", topicOut)\
                .save()
    except Exception as e:
        print(str(e))
        pass
    b = time_py.time()
    c = (b - a)
    print(c)
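
# `dist`, used above to compute the distance to each black spot, is not defined in this
# snippet. A plausible sketch follows, assuming a haversine great-circle distance in
# metres wrapped as a Spark UDF; the original implementation may differ.
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def haversine_m(lat1, lon1, lat2, lon2):
    """Great-circle distance in metres between two (lat, lon) points."""
    r = 6371000.0  # mean Earth radius in metres
    phi1, phi2 = math.radians(float(lat1)), math.radians(float(lat2))
    dphi = math.radians(float(lat2) - float(lat1))
    dlmb = math.radians(float(lon2) - float(lon1))
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))

dist = udf(haversine_m, DoubleType())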
out_df = out_df.withColumn("aov", F.col("revenue") / F.col("purchases")) #out_df = out_df.groupBy("start_ts")\ # .agg(\ # F.countDistinct("partyId").alias("visitors")\ # )\ # .join(\ # out_df.filter("eventType='itemBuyEvent'").groupBy("start_ts")\ # .agg(F.max("end_ts").alias("end_ts"),\ # F.sum("item_price").alias("revenue"),\ # F.countDistinct("sessionId").alias("purchases"),\ # F.sum("item_price")/F.countDistinct("sessionId")\ # )\ # ,"start_ts", "left") out_columns = list(out_df.columns) #out_columns = ["start_ts","end_ts","visitors","revenue","purchases","aov"] query = out_df\ .select(F.to_json(F.struct(*out_columns)).alias("value"))\ .writeStream \ .outputMode("update")\ .format("kafka") \ .option("checkpointLocation", "/tmp/checkpoint-write")\ .option("kafka.bootstrap.servers", kafka_bootstrap ) \ .option("topic", topic_out) \ .start()\ query.awaitTermination()
def main():
    spark = SparkSession \
        .builder \
        .appName("Divvy Bikes") \
        .enableHiveSupport() \
        .getOrCreate()
    # spark = session_spark()
    # spark.sparkContext.addPyFile("utils.zip")
    # spark.sparkContext.addPyFile("ingestion.zip")
    # spark.sparkContext.addPyFile("ml.zip")

    log4jLogger = quiet_logs(spark)
    logger = log4jLogger.LogManager.getLogger(__name__)

    logger.info("Starting predictions")

    logger.info("Fetching station data")
    train_inst = CreateDataframe(spark)
    bikes = train_inst.get_data(with_temperature=False)

    logger.info("Fetching weather forecast data")
    predict_inst = PredictDataframe(spark)
    temperature_api = predict_inst.get_data()
    temperature_inst = TemperatureDataframe(spark)
    temperature = temperature_inst.create_new_columns(temperature_api)

    logger.info("Joining station and temperature dataframes")
    dataframe_join = predict_inst.join_dataframe(bikes, temperature)

    logger.info("Building the final dataframe")
    dataframe_new_columns = train_inst.create_new_columns(dataframe_join)\
        .select("date", "from_station_id", "latitude", "month", "longitude",
                "mean_dpcapacity_start", "mean_dpcapacity_end",
                "sum_subscriber", "sum_customer", "part_time", "holiday", "week_days",
                col("weather_condition").alias("weather_description"),
                "humidity", "pressure", "temperature", "wind_speed")\
        .cache()
    dataframe_new_columns.take(1)

    logger.info("Finding the most frequent weather condition per period")
    mf_condition_part_time = train_inst.get_mf_part_time(dataframe_new_columns)
    dataframe_final = train_inst.group_target(dataframe_new_columns, mf_condition_part_time) \
        .drop("bicycle_rentals")\
        .cache()
    dataframe_final.take(1)

    logger.info("Loading model")
    model = predict_inst.get_model()

    logger.info("Running predictions")
    predictions = model.transform(dataframe_final)
    #predictions.show(truncate=False)

    logger.info("Building the final JSON")
    df_json = predict_inst.create_json(predictions)
    df_json.select(to_json(struct("*")).alias("value"))\
        .write\
        .format("kafka")\
        .option("kafka.bootstrap.servers", "quickstart.cloudera:9092")\
        .option("topic", "bikes")\
        .save()
# +--------------------+-----+--------------------+---------+
#
# In this JSON format: {"customer":"*****@*****.**","score":"28.5","email":"*****@*****.**","birthYear":"1963"}

joinedCustomerDF = customerRiskStreamingDF.withColumn("email", col("customer"))\
    .join(emailAndBirthYearStreamingDF, on="email")\
    .select("customerrisk.customer", "customerrisk.score", "email", "birthYear")
# joinedCustomerDF = customerRiskStreamingDF.join(emailAndBirthYearStreamingDF, emailAndBirthYearStreamingDF.email == customerRiskStreamingDF.customer)

joinedCustomerDF.printSchema()
# joinedCustomerDF.show(n=3)

# joinedCustomerDF.writeStream.outputMode("append").format("console")\
#     .option("truncate", "false")\
#     .start()\
#     .awaitTermination()

# joinedCustomerDF.withColumn("value", to_json())
kafkaDataFrame = joinedCustomerDF.select(
    col("email"),
    to_json(struct([joinedCustomerDF[x] for x in joinedCustomerDF.columns]))).toDF("key", "value")

# (joinedCustomerDF.select(to_json(struct([joinedCustomerDF[x] for x in joinedCustomerDF.columns])).alias("value"))
#     .writeStream
#     .format("kafka")
#     .option("kafka.bootstrap.servers", "localhost:9092")
#     .option("topic", "topic-risk-score")
#     .option("checkpointLocation", "/tmp/kafka/checkpoint")
#     .start())

kafkaDataFrame.printSchema()

# kafkaDataFrame.writeStream\
#     .outputMode("append").format("console")\
#     .option("truncate", "false")\
#     .start().awaitTermination()
topics = player_names.union(teams)

# Read the data from Kafka
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "broker:9092") \
    .option("failOnDataLoss", "false") \
    .option("subscribe", "tweets") \
    .option("startingOffsets", "earliest") \
    .load()

messages = extractTweetPayload(df, tweetSchema, payloadSchema)

wordCount = wordCountQuery(messages, "Text") \
    .join(topics, "word") \
    .select("word", "count", "category",
            to_json(struct("word", "count", "category")).alias("value"))

langCount = langCountQuery(messages, "Lang")

query = wordCount \
    .writeStream \
    .format("kafka") \
    .option("checkpointLocation", "./checkpoints") \
    .option("kafka.bootstrap.servers", "broker:9092") \
    .option("topic", "countByName") \
    .start()

query.awaitTermination()
.selectExpr("complex_map['WHITE METAL LANTERN']").show(2) # You can also explode map types, which will turn them into columns df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\ .selectExpr("explode(complex_map)").show(2) # ---------- STEP 4 ---------- # JSON # ----------------------------- # Let’s begin by creating a JSON column jsonDF = spark.range(1).selectExpr(""" '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""") # show it jsonDF.show() # You can use the get_json_object to inline query a JSON object # You can use json_tuple if this object has only one level of nesting from pyspark.sql.functions import get_json_object, json_tuple jsonDF.select( get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias('column'), json_tuple(col("jsonString"), "myJSONKey")).show(2) # You can also turn a StructType into a JSON string by using the to_json function from pyspark.sql.functions import to_json df.selectExpr("(InvoiceNo, Description) as myStruct")\ .select(to_json(col("myStruct"))).show()
def execute(self, conf_path: str, input_path: str, output_path: str,
            on_dbfs: bool) -> None:
    """
    Pipeline that sanitizes data, extracts drugs, changes the data model and
    finally saves the result to a JSON file. This is the main entrypoint of the
    package. The parameters are the job's arguments.

    Args:
        conf_path: File path of the params.json
        input_path: Folder path to read raw files
        output_path: Folder path to write files
        on_dbfs: If the Databricks Filesystem is mounted

    Returns:
        Nothing, only modifies the instanced class in place
    """
    self.load_params(conf_path)

    df_dict = Sanitizer.read_files(self.logger, self.spark, self.params, input_path)
    Sanitizer.clean_strings(self.logger, df_dict)
    df_dict = Sanitizer.clean_date(self.logger, df_dict)
    df_dict = Sanitizer.empty_str_cleaning(self.logger, df_dict)
    Sanitizer.deduplication(self.logger, df_dict,
                            self.params.get("deduplication rules"))
    Files.merge_write(self.logger, df_dict,
                      self.params.get("merge sanitized rules"),
                      path.join(output_path, "sanitized"), self.spark)

    df_dict = Files.read_delta(
        self.logger,
        set(self.params.get("csv") + self.params.get("json")),
        path.join(output_path, "sanitized"), self.spark)
    Sanitizer.deduplication(self.logger, df_dict,
                            self.params.get("deduplication rules"))
    DrugsExtractor.to_words(self.logger, df_dict, self.params.get("to words"))

    drug_df_name = self.params.get("names").get("drugs")
    drug_col_name = self.params.get("names").get("drug")
    df_dict[drug_df_name] = df_dict.get(drug_df_name).withColumn(
        drug_col_name,
        lower(col(drug_col_name))).filter(col(drug_col_name).isNotNull())

    # To be refactored: this does not work for really large drug lists because of the
    # collect to the driver (below) and the column creation (above).
    # Duplicates need to be dropped because several drugs can have different ATC codes.
    drugs_list = df_dict.get(drug_df_name).select(
        drug_col_name).drop_duplicates().toPandas()[drug_col_name].to_list()
    df_dict.pop(drug_df_name)
    for df in df_dict.values():
        df.cache()
    self.logger.info(
        "Prepared drug list and cached dataframes for following intensive computation: {}"
        .format(df_dict))

    DrugsExtractor.pivot(self.logger, drugs_list, df_dict)
    date = self.params.get("names").get("date")
    id_col = self.params.get("names").get("id")
    journal = self.params.get("names").get("journal")
    columns_kept = [date, id_col, journal]
    df_dict = DrugsExtractor.shift(self.logger, drugs_list, df_dict,
                                   drug_col_name, self.spark, columns_kept)

    # Construct publication objects and the journal object
    for df_name in self.params.get("to words").keys():
        df_dict[df_name] = df_dict.get(df_name).withColumn(
            date, col(date).cast(StringType()))
        df_dict[df_name] = df_dict.get(df_name)\
            .withColumn(id_col, struct(col(date).alias(date), col(id_col).alias(id_col)))\
            .withColumn(journal, struct(col(date).alias(date), col(journal).alias(journal)))
    self.logger.info(
        "Publication objects and journal object constructed: {}".format(df_dict))

    trial = self.params.get("names").get("clinical_trials")
    pubmed = self.params.get("names").get("pubmed")

    # Get for each drug the list of journals and publications
    # (a set is used on journal to avoid duplicates)
    merge_trial_df = \
        df_dict.get(trial).groupby(drug_col_name)\
        .agg(collect_set(col(journal)).alias(journal),
             collect_list(col(id_col)).alias(trial))\
        .withColumn(pubmed,
                    lit(None).cast(ArrayType(StructType([
                        StructField('date', StringType(), True),
                        StructField('id', StringType(), True)]))))
    self.logger.info("Created publication per drug for trials: {}".format(
        merge_trial_df))

    merge_pub_df = df_dict.get(pubmed).groupby(drug_col_name).agg(
        collect_set(col(journal)).alias(journal),
        collect_list(col(id_col)).alias(pubmed))
    self.logger.info(
        "Created publication per drug for pubmed: {}".format(merge_pub_df))

    # Merge clinical trial publications with pubmed publications by drug,
    # with their associated journals (without repetition)
    merge_path = path.join(output_path, "enriched")
    Files.merge_write(self.logger, {trial: merge_trial_df},
                      self.params.get("merge sanitized rules"), merge_path,
                      self.spark)
    delta_path = path.join(merge_path, trial)
    from delta.tables import DeltaTable
    delta_trial = DeltaTable.forPath(self.spark, delta_path)
    update_match = "trial.{0} = pub.{0}".format(drug_col_name)
    update = {
        pubmed: col(f"pub.{pubmed}"),
        journal: array_distinct(
            concat(col(f"pub.{journal}"), col(f"trial.{journal}")))
    }
    insert = {
        pubmed: col(f"pub.{pubmed}"),
        journal: col(f"pub.{journal}"),
        drug_col_name: col(f"pub.{drug_col_name}"),
        trial: lit(None)
    }
    self.logger.info(
        "Merging publications with the matching rule: {}".format(update_match))
    (delta_trial.alias("trial").merge(
        merge_pub_df.alias("pub"), update_match).whenMatchedUpdate(
            set=update).whenNotMatchedInsert(values=insert).execute())

    # Save the end result
    graph_filename = self.params.get("names").get("graph_filename")
    json_df = self.spark.read.format("delta").load(delta_path)
    # To use the filesystem mounted on Databricks with a Python process we need to
    # prefix "/dbfs/", but Spark processes don't work with this prefix.
    pythonic_path = "/dbfs" + output_path if on_dbfs else output_path
    graph_path = path.join(pythonic_path, *graph_filename)
    json_df.withColumn(journal, to_json(col(journal)))\
        .withColumn(trial, to_json(col(trial)))\
        .withColumn(pubmed, to_json(col(pubmed)))\
        .toPandas().to_json(graph_path, orient="records", date_format="iso")
    # When reading this back with Spark, multiLine needs to be enabled.
    self.logger.info("Wrote the resulting JSON to: {}".format(graph_path))
    .withColumn('x1', (F.col('N1') + F.col('geoAltitude')) * F.cos(F.radians(F.col('latitude'))) * F.cos(F.radians(F.col('longitude')))) \
    .withColumn('y1', (F.col('N1') + F.col('geoAltitude')) * F.cos(F.radians(F.col('latitude'))) * F.sin(F.radians(F.col('longitude')))) \
    .withColumn('z1', ((1 - 8.1819190842622e-2**2) * F.col('N1') + F.col('geoAltitude')) * F.sin(F.radians(F.col('latitude')))) \
    .withColumn('x2', (F.col('N2') + F.col('pred_geoAltitude')) * F.cos(F.radians(F.col('pred_latitude'))) * F.cos(F.radians(F.col('pred_longitude')))) \
    .withColumn('y2', (F.col('N2') + F.col('pred_geoAltitude')) * F.cos(F.radians(F.col('pred_latitude'))) * F.sin(F.radians(F.col('pred_longitude')))) \
    .withColumn('z2', ((1 - 8.1819190842622e-2**2) * F.col('N2') + F.col('pred_geoAltitude')) * F.sin(F.radians(F.col('pred_latitude'))))

pred_df = pred_df \
    .withColumn('dist_error', F.sqrt((F.col('x1') - F.col('x2'))**2 +
                                     (F.col('y1') - F.col('y2'))**2 +
                                     (F.col('z1') - F.col('z2'))**2) / 1000) \
    .drop('latitude', 'longitude', 'geoAltitude') \
    .drop('pred_latitude', 'pred_longitude', 'pred_geoAltitude') \
    .drop('N1', 'N2', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2')

# Write stream to console for debug purposes
# pred_df.writeStream.outputMode("append").option("truncate", False).format("console").start().awaitTermination()

# Write stream to Kafka
pred_df = pred_df \
    .select(F.to_json(F.struct("pred", "target", "dist_error", "timeAtServer", "aircraft")).alias("value"))

pred_df \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", ", ".join(kafka_config['servers'])) \
    .option("topic", kafka_config['topics'][1]) \
    .option("checkpointLocation", "checkpoint") \
    .start() \
    .awaitTermination()
def transform_patient(patient):
    now = datetime.datetime.now().strftime('%Y-%m-%d')
    return patient.na.fill("").groupBy('person_id').\
        agg(
            f.first('person_id').alias('couch_id'),
            f.to_json(f.struct(
                f.first('person_uuid').alias('uuid'),
                f.concat_ws(' ', f.first('given_name'), f.first('middle_name'),
                            f.first('family_name')).alias('display'),
                f.struct(
                    f.first('person_uuid').alias('uuid'),
                    f.concat_ws(' ', f.first('given_name'), f.first('middle_name'),
                                f.first('family_name')).alias('display'),
                    f.first('gender').alias('gender'),
                    f.first('birthdate').alias('birthdate'),
                    f.first('dead').alias('dead'),
                    (f.year(f.to_date(f.lit(now))) - f.year(f.first('birthdate'))).alias('age'),
                    f.first('death_date').alias('deathDate'),
                    f.first('cause_of_death').alias('causeOfDeath'),
                    f.first('birthdate_estimated').alias('birthdateEstimated'),
                    f.collect_set(
                        f.struct(
                            f.concat_ws(' ', f.col('person_attribute_type_name'),
                                        f.lit('='),
                                        f.col('person_attribute_value')).alias('display'),
                            f.col('person_attribute_value').alias('value'),
                            f.col('person_attribute_uuid').alias('uuid'),
                            f.col('person_attribute_voided').alias('voided'),
                            f.struct(
                                f.col('person_attribute_type_name').alias('display'),
                                f.col('person_attribute_type_uuid').alias('uuid'),
                            ).alias('attributeType')
                        )
                    ).alias('attributes')
                ).alias('person'),
                f.collect_set(
                    f.struct(
                        f.col('identifier'),
                        f.col('identifier_preferred').alias('preferred'),
                        f.struct(
                            f.col('identifier_location_name').alias('name'),
                            f.col('identifier_location_uuid').alias('uuid')
                        ).alias('location'),
                        f.struct(
                            f.col('identifier_type_name').alias('name'),
                            f.col('identifier_type_uuid').alias('uuid')
                        ).alias('identifierType'),
                    )).alias('identifiers'),
                f.struct(
                    f.first('person_address_city_village').alias('cityVillage'),
                    f.first('person_address_longitude').alias('longitude'),
                    f.first('person_address_latitude').alias('latitude'),
                    f.first('person_address_country').alias('country'),
                    f.first('person_address_county_district').alias('countyDistrict'),
                    f.first('person_address_1').alias('address1'),
                    f.first('person_address_2').alias('address2'),
                    f.first('person_address_3').alias('address3'),
                    f.first('person_address_4').alias('address4'),
                    f.first('person_address_5').alias('address5'),
                    f.first('person_address_6').alias('address6'),
                    f.first('person_address_preferred').alias('preferred')
                ).alias('preferredAddress')
            )).alias('patient')).\
        withColumn('type', f.lit('patient'))
def transform_encounter(encounter_dataframe, obs_dataframe, streaming=True, filters=None):
    if (streaming):
        orders = get_orders(filters['encounter_ids'])
        encounter_types = get_encounter_types()
        forms = get_forms(filters['form_ids'])
        encounter_providers = get_encounter_providers(
            filters['encounter_ids']).alias('encounter_provider')
        locations = get_locations(filters['location_ids']).alias('location')
        visits = get_visits(locations, filters['visit_ids'])
        patients = get_patients(filters['patient_ids'])
    else:
        forms = get_forms()
        locations = get_locations().alias('location')
        visits = get_visits(locations)
        encounter_types = get_encounter_types()
        patients = get_patients()
        orders = get_orders()
        encounter_providers = get_encounter_providers().alias('encounter_provider')

    obs = obs_dataframe.alias('obs')

    joined_encounters = encounter_dataframe.join(f.broadcast(forms), on='form_id')\
        .join(f.broadcast(locations), on='location_id')\
        .join(f.broadcast(visits), on='visit_id')\
        .join(f.broadcast(encounter_types),
              on=encounter_dataframe['encounter_type'] == encounter_types['encounter_type_id'])\
        .join(patients, on='patient_id')\
        .join(encounter_providers,
              on=encounter_providers['encounter_id'] == encounter_dataframe['encounter_id'],
              how='left')\
        .join(orders,
              on=orders['encounter_id'] == encounter_dataframe['encounter_id'],
              how='left')\
        .join(obs,
              on=obs['encounter_id'] == encounter_dataframe['encounter_id'],
              how='left')

    return joined_encounters\
        .groupBy('encounter.encounter_id').agg(
            f.first('patient_id').alias('person_id'),
            f.lit('encounter').alias('type'),
            f.first('encounter.location_id').alias('location_id'),
            f.first('person_uuid').alias('person_uuid'),
            f.col('encounter.encounter_id').cast('string').alias('couch_id'),
            f.first('encounter.uuid').alias('uuid'),
            f.first('encounter_datetime').alias('encounterdatetime'),
            f.struct(
                f.first('encounter_type_name').alias('display'),
                f.first('encounter_type_uuid').alias('uuid')
            ).alias('encountertype'),
            f.struct(
                f.first('form_name').alias('name'),
                f.first('form_uuid').alias('uuid')
            ).alias('form'),
            f.struct(
                f.first('location.location_name').alias('display'),
                f.first('location.location_uuid').alias('uuid')
            ).alias('location'),
            f.to_json(f.collect_set(
                f.when(f.col('encounter_provider_uuid').isNotNull(),
                       f.struct(
                           f.col('encounter_provider_uuid').alias('uuid'),
                           f.col('encounter_provider.provider_name').alias('display'),
                           f.struct(
                               f.col('encounter_provider.provider_uuid').alias('uuid'),
                               f.concat_ws(' ',
                                           f.col('encounter_provider.provider_identifier'),
                                           f.lit('-'),
                                           f.col('encounter_provider.provider_name')).alias('display')
                           ).alias('provider')
                       ))
            )).alias('encounterproviders'),
            f.to_json(f.struct(
                f.first('visit_uuid').alias('uuid'),
                f.first('visit.date_started').alias('dateStarted'),
                f.first('visit.date_stopped').alias('dateStopped'),
                f.struct(
                    f.first('visit_type_name').alias('name'),
                    f.first('visit_type_uuid').alias('uuid')
                ).alias('visitType'),
                f.struct(
                    f.first('visit.location_name').alias('name'),
                    f.first('visit.location_uuid').alias('uuid')
                ).alias('location'),
                f.concat_ws(' ', f.first('visit_type_name'), f.lit('@'),
                            f.first('visit.location_name'), f.lit('-'),
                            f.first('visit.date_started')).alias('display')
            )).alias('visit'),
            f.to_json(f.collect_set(
                f.when(f.col('order_uuid').isNotNull(),
                       f.struct(
                           f.col('order_uuid').alias('uuid'),
                           f.col('order_number').alias('orderNumber'),
                           f.struct(
                               f.col('orders.concept_uuid').alias('uuid'),
                               f.col('orders.concept_name').alias('display')
                           ).alias('concept'),
                           f.struct(
                               f.col('orders.provider_uuid').alias('uuid'),
                               f.concat_ws(' ', 'orders.provider_identifier',
                                           'orders.provider_name').alias('display')
                           ).alias('orderer'),
                           f.col('order_action').alias('action'),
                           f.col('orders.date_activated').alias('dateActivated'),
                           f.col('orders.date_created').alias('dateCreated'),
                           f.col('orders.urgency').alias('urgency'),
                           f.col('order_type_name').alias('type')
                       )).otherwise(None))).alias('orders'),
            f.to_json(f.collect_list(
                f.struct(
                    f.lit('obs.uuid').alias('uuid'),
                    f.col('obs_datetime').alias('obsDatetime'),
                    f.struct(
                        f.col('parent_obs_concept_uuid').alias('uuid'),
                        f.struct(
                            f.col('parent_obs_concept_name').alias('display')
                        ).alias('name')
                    ).alias('concept'),
                    f.when(f.col('value_coded').isNotNull(),
                           f.struct(
                               f.col('value_type').alias('type'),
                               f.to_json(
                                   f.struct(
                                       f.col('value_coded_concept_uuid').alias('uuid'),
                                       f.col('value_coded_concept_name').alias('display')
                                   )).alias('value')
                           )
                    ).when(f.col('value_not_coded').isNotNull(),
                           f.struct(
                               f.col('value_type').alias('type'),
                               f.col('value_not_coded').alias('value')
                           )
                    ).alias('value'),
                    f.when(f.col('groupmembers').isNotNull(),
                           f.col('groupmembers')
                    ).alias('groupMembers')
                ))).alias('obs'),
        ).withColumn('build_date', f.current_timestamp())
.option("startingOffsets", "latest") \ .load() \ .selectExpr("CAST(value as string)")\ .select(F.from_json("value", schema).alias("value"))\ .select(F.col("value.*"))\ .select("uid", F.col('visits').url.alias("urls"))\ .withColumn('domains', foo_udf(F.col('urls'))) # Infer on test data results = model.transform(st) # get string classes from encoded values converter = IndexToString(inputCol="prediction", outputCol="gender_age", labels=model.stages[1].labels) converted = converter.transform(results) #Saving to another topic query = converted\ .select(F.to_json(F.struct("uid", "gender_age")).alias("value"))\ .writeStream\ .outputMode("append")\ .format("kafka") \ .option("checkpointLocation", "file:///tmp/checkpoint")\ .option("kafka.bootstrap.servers", kafka_bootstrap ) \ .option("topic", topic_out) \ .start() query.awaitTermination()
RenamedDF = JsonDF.select(explode(F.col("TimeSerieDtos")).alias("TimeSerie"),
                          col("EntityExternalId").alias("EntityName"),
                          col("TimeSerie.Time").alias("StartDate"),
                          "TimeResolution", "TimeSerie.Tags", "TimeSerie.Value")\
    .drop("TimeSerie") \
    .withColumn('ImportDateTime', lit(nu)) \
    .withColumn("EntryId", row_number().over(window))

#display(RenamedDF)

# COMMAND ----------

# Making the export dataframes
if DINO1 == 1:
    DINO1DF = RenamedDF.withColumn('ImportDateTime', lit(nu)).withColumn('DataImportCode', lit("DINO1"))\
        .select("DataImportCode", "EntityName", "EntryId", "StartDate", "ImportDateTime",
                to_json("Location").alias("JsonValue"))

if SUN1 == 1:
    SUN1DF = RenamedDF.withColumn('ImportDateTime', lit(nu)).withColumn('DataImportCode', lit("SUN1"))\
        .select("DataImportCode", "EntityName", "EntryId", "StartDate", "ImportDateTime",
                to_json("Location").alias("JsonValue"))

if DINO2 == 1:
    DINO2DF = RenamedDF.withColumn('ImportDateTime', lit(nu)).withColumn('DataImportCode', lit("DINO2")) \
        .select("DataImportCode", "EntityName", "EntryId", "StartDate", "ImportDateTime",
                col("Valid").alias("JsonValue"))

if SUN2 == 1:
    SUN2DF = RenamedDF.withColumn('ImportDateTime', lit(nu)).withColumn('DataImportCode', lit("SUN2")) \
        .select("DataImportCode", "EntityName", "EntryId", "StartDate", "ImportDateTime",
                col("Valid").alias("JsonValue"))

if DINO3 == 1:
    DINO3DF = RenamedDF.withColumn('ImportDateTime', lit(nu)).withColumn('DataImportCode', lit("DINO3")) \
        .select("DataImportCode", "EntityName", "EntryId", "StartDate", "ImportDateTime"\
# Then we can calculate the relative SAIDI/SAIFI contribution of each outage
pw_finalized_outages = pw_finalized_outages.join(
    pw_distinct_user_id,
    F.date_trunc("day", F.from_unixtime(pw_finalized_outages["outage_time"])) ==
    F.date_trunc("day", pw_distinct_user_id["window_mid_point"]))

pw_finalized_outages = pw_finalized_outages.select(
    "outage_time", "cluster_size", "phones_reporting", "user_id",
    "outage_times", "outage_times_range", "outage_times_stddev")

pw_finalized_outages = pw_finalized_outages.withColumn(
    "relative_cluster_size", col("cluster_size") / col("phones_reporting"))

pw_finalized_with_string = pw_finalized_outages.withColumn(
    "outage_times", F.to_json("outage_times"))
pw_finalized_with_string = pw_finalized_with_string.withColumn(
    "user_id", F.to_json("user_id"))

# Okay, we should save this
#pw_finalized_with_string.repartition(1).write.format("com.databricks.spark.csv").mode('overwrite').option("header", "true").save(args.result + '/full_outage_list')

# Okay, let's filter and print this
pw_finalized_outages = pw_finalized_outages.filter(col("cluster_size") >= 2)
pw_finalized_outages.show(1000)

# We need to zero fill for every date and cluster size not already present in the dataset.
# To do this, create a dataframe for the range date_min to date_max and cluster_size
# cluster_size_min to cluster_size_max with a 0 relative SAIFI, then join it with the
# actual DF, preferentially choosing the non-zero value. A sketch of this is below.
#min_time = pw_finalized_outages.agg(F.min("outage_time")).collect()[0].__getitem__("min(outage_time)")
#max_time = pw_finalized_outages.agg(F.max("outage_time")).collect()[0].__getitem__("max(outage_time)")
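
# A minimal sketch of the zero-filling described above. It assumes hypothetical bounds
# (date_min, date_max in unix seconds, cluster_size_min, cluster_size_max) have already
# been computed and that `spark` is the active session; illustrative only, not the
# original implementation.
from pyspark.sql import functions as F

# One row per (day, cluster_size) combination, each with a 0.0 relative contribution.
days = spark.createDataFrame(
    [(d,) for d in range(date_min, date_max + 1, 86400)], ['outage_day'])
sizes = spark.createDataFrame(
    [(s,) for s in range(cluster_size_min, cluster_size_max + 1)], ['cluster_size'])
zero_fill = days.crossJoin(sizes).withColumn('zero_relative_cluster_size', F.lit(0.0))

# Left-join the real data onto the grid and prefer the non-zero value when present.
filled = (zero_fill
          .join(pw_finalized_outages
                .withColumn('outage_day',
                            F.unix_timestamp(F.date_trunc('day', F.from_unixtime('outage_time')))),
                on=['outage_day', 'cluster_size'], how='left')
          .withColumn('relative_cluster_size',
                      F.coalesce('relative_cluster_size', 'zero_relative_cluster_size'))
          .drop('zero_relative_cluster_size'))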
cvmodel = model.stages[1]
vocabulary = cvmodel.vocabulary
vocab_size = len(vocabulary)

topics = model.stages[-1].describeTopics()
topics = topics.withColumn(
    'terms',
    indices_to_terms(vocabulary)(topics.termIndices, topics.termWeights))
scores = topics.select('terms').rdd.flatMap(lambda list: list).collect()

val = Row(date=week, results=scores, subreddit=subreddit,
          vocab_size=vocab_size, num_docs=num_docs)
line = (subreddit, week, val)

week_df = spark.createDataFrame([line], cols)
week_df = week_df.withColumn('date', to_date(week_df.date))
week_df = week_df.withColumn(
    'results', to_json(week_df.results).cast(StringType()))
week_df.write.jdbc(dburl, 'newresults', mode='append',
                   properties={
                       'user': dbuser,
                       'password': dbpwd
                   })
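
# `indices_to_terms` is used above but not defined in this snippet. A plausible sketch,
# assuming it maps each LDA term index to its vocabulary word alongside its weight,
# follows; the original helper may differ.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def indices_to_terms(vocabulary):
    """Return a UDF that pairs termIndices with their vocabulary terms and weights."""
    def _map(indices, weights):
        return [[vocabulary[int(i)], str(w)] for i, w in zip(indices, weights)]
    return udf(_map, ArrayType(ArrayType(StringType())))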
dump_df_to_s3(test_ids.toDF(), 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

logger.info(f'Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

featurs_graph_df = features_df.withColumn(
    'props_values:String',
    to_json(
        struct(
            list(
                filter(lambda x: (x != TRANSACTION_ID),
                       features_df.schema.names)))))
featurs_graph_df = featurs_graph_df.select('TransactionID', 'props_values:String')

logger.info(f'Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(featurs_graph_df, glueContext, 'FeaturesDF')
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(
    features_graph_dynamic_df, 'Transaction')
features_graph_dynamic_df = SelectFields.apply(
    frame=features_graph_dynamic_df,
    paths=["~id", '~label', 'props_values:String'])
def read_data(self):
    userSchema = StructType([
        StructField('medallion', StringType()),
        StructField('pickup_time', TimestampType()),
        StructField('dropoff_time', TimestampType()),
        StructField('passenger_count', IntegerType()),
        StructField('trip_time', IntegerType()),
        StructField('trip_distance', DoubleType()),
        StructField('pickup_loc', MapType(StringType(), DoubleType())),
        StructField('dropoff_loc', MapType(StringType(), DoubleType()))
    ])

    self.df = self.spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "nycspeed12") \
        .option("startingOffsets", "earliest") \
        .option('failOnDataLoss', 'false') \
        .option('enable.auto.commit', 'false') \
        .option('group.id', 'nyc6') \
        .option('auto.offset.reset', 'earliest') \
        .option("kafka.client.id", "nycid6") \
        .option("maxOffsetsPerTrigger", 1000) \
        .load()

    self.dff = self.df.selectExpr("CAST(value as STRING) as json") \
        .select(from_json("json", userSchema).alias('data'))\
        .selectExpr(
            "data.medallion as medallion",
            "cast (data.pickup_time as timestamp)",
            "cast (data.dropoff_time as timestamp)",
            "cast (data.passenger_count as integer)",
            "cast (data.trip_time as integer)",
            "cast (data.trip_distance as float)",
            "cast (data.pickup_loc.lat as float) as pickup_loc_lat",
            "cast (data.pickup_loc.lon as float) as pickup_loc_lon",
            "cast (data.dropoff_loc.lat as float) as dropoff_loc_lat",
            "cast (data.dropoff_loc.lon as float) as dropoff_loc_lon",
        )
    print(self.dff.printSchema())

    self.windowedCounts = self.dff \
        .filter('trip_time > 0') \
        .withWatermark("pickup_time", "30 days") \
        .groupBy("medallion", window("pickup_time", "24 hours")) \
        .agg(func.sum('trip_distance').alias('sum_trip_distance'),
             func.avg('trip_distance').alias('avg_trip_distance'),
             func.sum('trip_time').alias('sum_trip_time'),
             func.avg('trip_time').alias('avg_trip_time'),
             func.sum('passenger_count').alias('sum_passenger_count'),
             func.avg('passenger_count').alias('avg_passenger_count'))

    print((self.windowedCounts
           .writeStream
           .outputMode("complete")
           .format("console")
           .option('truncate', 'false')
           .option('numRows', 20)
           .start()
           .awaitTermination()))

    query = self.windowedCounts \
        .select(to_json(struct("medallion", 'window', 'sum_trip_distance',
                               'avg_trip_distance', 'sum_trip_time', 'avg_trip_time',
                               'sum_passenger_count', 'avg_passenger_count')).alias('value')) \
        .writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("topic", "es3") \
        .option("checkpointLocation", "/tmp/kafkachkpnt/")\
        .outputMode('update') \
        .start()

    query.awaitTermination()
# COMMAND ----------

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)


# COMMAND ----------

from pyspark.sql.functions import to_json

df.selectExpr("(InvoiceNo, Description) as myStruct")\
    .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *

parseSchema = StructType((
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True)))

df.selectExpr("(InvoiceNo, Description) as myStruct")\
    .select(to_json(col("myStruct")).alias("newJSON"))\
    .select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)


# COMMAND ----------
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_servers) \
    .option("subscribe", in_topic) \
    .option("failOnDataLoss", "false") \
    .load()

df = df.withColumn("value", from_avro("value", jsonFormatSchema)) \
    .select((col("value.timestamp") / milli).alias("time").cast(TimestampType()),
            col("value.house_id"), col("value.appliance_id"),
            col("value.appliance_name"), col("value.power"))

out = df.withWatermark("time", watermark + " seconds") \
    .groupBy(window(col("time"), str(window_converted) + " seconds",
                    str(window_converted // 5) + " second"),
             "house_id", "appliance_id") \
    .agg(count("power").alias("c_all"),
         count(when(col("power") > powerthres, True)).alias("c_duty")) \
    .withColumn("duty_cycle", (col("c_duty") / col("c_all"))) \
    .withColumn("time_end", col("window.end")) \
    .drop("window", "c_all", "c_duty")

#query=out.writeStream.outputMode("append").format("console").option("truncate", False).start()

query = out.withColumn("value", to_json(struct("time_end", "house_id", "appliance_id", "duty_cycle"))) \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_servers) \
    .option("topic", out_topic) \
    .option("checkpointLocation", "checkpoints") \
    .start()

query.awaitTermination()
#!/usr/bin/env python3
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("SimpleStreamingApp").getOrCreate()

to_kafka = spark.read.json('datasets/data1.json').drop('_corrupt_record')
columns = to_kafka.columns
schema = to_kafka.schema

to_kafka \
    .select(F.to_json(F.struct(*columns)).alias('value')) \
    .write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("topic", "test_topic") \
    .save()

batch_from_kafka = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "test_topic") \
    .load()

batch_from_kafka \
    .select(batch_from_kafka['value'].cast("String")) \
df_tags = df_tags.withColumn("timestamp", functions.from_unixtime(df_tags['timestamp'])) df_tags = df_tags.withColumnRenamed('tag', 'value') df_tags = df_tags.withColumn('event_type', func.lit('tag')) all_data = df_rating.unionAll(df_tags) all_data_sorted = all_data.sort(all_data['timestamp'].asc()) all_data_sorted = all_data_sorted.withColumn( 'key', func.concat_ws('|', all_data_sorted['user_id'], all_data_sorted['movie_id'], all_data_sorted['value'], all_data_sorted['timestamp'], all_data_sorted['event_type'])) all_data_sorted = all_data_sorted.withColumn( 'value_json', func.to_json( func.struct(all_data_sorted['user_id'], all_data_sorted['movie_id'], all_data_sorted['value'], all_data_sorted['timestamp'], all_data_sorted['event_type']))) all_data_sorted.selectExpr("CAST(key AS STRING)", "CAST(value_json AS STRING) as value") \ .write \ .format("kafka") \ .option("kafka.bootstrap.servers", kafka_server) \ .option("topic", kafka_topic) \ .save() # kafka-console-consumer --bootstrap-server 199.60.17.212:9092 --topic tag_rate_small --from-beginning # {"user_id":"514","movie_id":"5247","value":"2.5","timestamp":"2018-09-23 19:44:00","event_type":"rate"} #spark-submit --packages datastax:spark-cassandra-connector:2.3.1-s_2.11,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 data_loading/event_stream_generator.py
# Select the fields that will be sent to Druid through Kafka
taxiTripsToKafka = taxiTrips.select(
    "trip_id", "taxi_id", "company", "trip_start_timestamp", "trip_end_timestamp",
    "trip_seconds", "trip_miles", "pickup_community_area", "dropoff_community_area",
    "fare", "tips", "tolls", "extras", "trip_total")

# Enrich the stream with the names of the pickup and dropoff areas and their central points (lat. and long.)
taxiTripsEnrich = taxiTripsToKafka.join(pickupAreas, 'pickup_community_area')\
    .join(dropoffAreas, 'dropoff_community_area')

# Start the query that writes the enrichment result to Kafka
queryToKafka = taxiTripsEnrich\
    .select(taxiTripsEnrich["taxi_id"].cast('string').alias("key"),
            to_json(struct("*")).alias("value"))\
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", cfg.kafka_brokers) \
    .option("topic", cfg.kafka_outTopic) \
    .option("checkpointLocation", cfg.checkpointKafka_path) \
    .outputMode("append") \
    .start()

# Start the query that writes the events to HDFS
queryToHDFS = taxiTrips.writeStream \
    .format("parquet") \
    .trigger(processingTime='15 minutes') \
    .partitionBy("year", "month") \
    .option("path", cfg.trips_path) \
    .option("checkpointLocation", cfg.checkpointHDFS_path) \
df.printSchema()
print([3] * 5)

# multi_ids_columns = ["tag_ids", "ids"]
#
# for column in multi_ids_columns:
#     df = feature.multionehot(df, column)
# df.show()
#
# def array_to_string(my_list):
#     return '[' + ','.join([str(elem) for elem in my_list]) + ']'
#
#
# array_to_string_udf = udf(array_to_string, StringType())
#
# df = df.withColumn('categorystr', array_to_string_udf("category")).drop("category")
# df = df.withColumn('categoryIdStr', array_to_string_udf("category_id")).drop("category_id")
# df.show()

# df.withColumn("features", to_json(struct($"features"))).write.csv(.
df = df.withColumn("category-onehot", pyf.to_json("category-onehot")).withColumn(
    "category_id-onehot", pyf.to_json("category_id-onehot"))

df.drop("tag_ids").drop("ids").drop("tag_texts").coalesce(1).write.format(
    "com.databricks.spark.csv").option(
    "header", "true").mode("overwrite").save("feature.csv")

df = df.toPandas()
df.to_csv('test.csv', index=False)
def transform_into_openmrs_object(self, encounter_dataframe):
    return encounter_dataframe.groupBy('encounter.encounter_id').agg(
        f.first('patient_id').alias('person_id'),
        f.lit('encounter').alias('type'),
        f.first('encounter.location_id').alias('location_id'),
        f.first('person_uuid').alias('person_uuid'),
        f.col('encounter.encounter_id').cast('string').alias('couch_id'),
        f.first('uuid').alias('uuid'),
        f.first('encounter_datetime').alias('encounterdatetime'),
        f.struct(
            f.first('encounter_type_name').alias('display'),
            f.first('encounter_type_uuid').alias('uuid')).alias('encountertype'),
        f.struct(
            f.first('form_name').alias('name'),
            f.first('form_uuid').alias('uuid')).alias('form'),
        f.struct(
            f.first('location.location_name').alias('display'),
            f.first('location.location_uuid').alias('uuid')).alias('location'),
        f.to_json(
            f.collect_set(
                f.when(
                    f.col('encounter_provider_uuid').isNotNull(),
                    f.struct(
                        f.col('encounter_provider_uuid').alias('uuid'),
                        f.col('encounter_provider.provider_name').alias('display'),
                        f.struct(
                            f.col('encounter_provider.provider_uuid').alias('uuid'),
                            f.concat_ws(
                                ' ',
                                f.col('encounter_provider.provider_identifier'),
                                f.lit('-'),
                                f.col('encounter_provider.provider_name')).alias('display')
                        ).alias('provider'))))).alias('encounterproviders'),
        f.to_json(
            f.struct(
                f.first('visit_uuid').alias('uuid'),
                f.first('visit.date_started').alias('dateStarted'),
                f.first('visit.date_stopped').alias('dateStopped'),
                f.struct(
                    f.first('visit_type_name').alias('name'),
                    f.first('visit_type_uuid').alias('uuid')).alias('visitType'),
                f.struct(
                    f.first('visit.location_name').alias('name'),
                    f.first('visit.location_uuid').alias('uuid')).alias('location'),
                f.concat_ws(' ', f.first('visit_type_name'), f.lit('@'),
                            f.first('visit.location_name'), f.lit('-'),
                            f.first('visit.date_started')).alias('display'))).alias('visit'),
        f.to_json(
            f.collect_set(
                f.when(
                    f.col('order_uuid').isNotNull(),
                    f.struct(
                        f.col('order_uuid').alias('uuid'),
                        f.col('order_number').alias('orderNumber'),
                        f.struct(
                            f.col('orders.concept_uuid').alias('uuid'),
                            f.col('orders.concept_name').alias('display')).alias('concept'),
                        f.struct(
                            f.col('orders.provider_uuid').alias('uuid'),
                            f.concat_ws(' ', 'orders.provider_identifier',
                                        'orders.provider_name').alias('display')).alias('orderer'),
                        f.col('order_action').alias('action'),
                        f.col('orders.date_activated').alias('dateActivated'),
                        f.col('orders.date_created').alias('dateCreated'),
                        f.col('orders.urgency').alias('urgency'),
                        f.col('order_type_name').alias('type'))).otherwise(None))).alias('orders'),
        f.to_json(
            f.collect_list(
                f.struct(
                    f.lit('obs_uuid_to_be_included').alias('uuid'),
                    f.col('obs_datetime').alias('obsDatetime'),
                    f.struct(
                        f.col('parent_obs_concept_uuid').alias('uuid'),
                        f.struct(
                            f.col('parent_obs_concept_name').alias('display')).alias('name')
                    ).alias('concept'),
                    f.when(
                        f.col('value_coded').isNotNull(),
                        f.struct(
                            f.col('value_type').alias('type'),
                            f.to_json(
                                f.struct(
                                    f.col('value_coded_concept_uuid').alias('uuid'),
                                    f.col('value_coded_concept_name').alias('display'))).alias('value'))
                    ).when(
                        f.col('value_not_coded').isNotNull(),
                        f.struct(
                            f.col('value_type').alias('type'),
                            f.col('value_not_coded').alias('value'))).alias('value'),
                    f.when(
                        f.col('groupmembers').isNotNull(),
                        f.col('groupmembers')).alias('groupMembers')))).alias('obs'),
    ).withColumn('build_date', f.current_timestamp())
value_df.printSchema()

# Choose prime customers and calculate total transactions and earned points
rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \
    .groupBy("value.CustomerCardNo") \
    .agg(sum("value.TotalValue").alias("TotalPurchase"),
         sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"))

# Rename column
rewards_df = rewards_df.withColumn("CustomerCardNo", expr("`value.CustomerCardNo`")) \
    .drop("value.CustomerCardNo")

# Serialization to JSON format
kafka_target_df = rewards_df.select(
    expr("CustomerCardNo as key"),
    to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"))

# Check schema
rewards_df.printSchema()

# Alternative statement for the Kafka target
# kafka_target_df = rewards_df.selectExpr("value.CustomerCardNo as key",
#                                         "to_json(struct(*)) as value")
# kafka_target_df.show(truncate=False)

# Write the stream to the Kafka topic
rewards_writer_query = kafka_target_df \
    .writeStream \
    .queryName("Rewards Writer") \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
def main(topic):
    # 1. Load data, combine keywords and tweet_urls by news_url, add id
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic)\
        .option('failOnDataLoss', 'false')\
        .option('auto.offset.reset', 'earliest')\
        .load()
    values = messages.select(messages['value'].cast('string'))
    words = values.select(
        functions.explode(functions.split(values.value, ';')).alias("words"))
    data = words.withColumn('text', functions.split('words', ',')).select('text')
    data = data.withColumn('news_id', data['text'][0])
    data = data.withColumn('news_url', data['text'][1])
    print('finish load data')

    # 2. Scrape the news_text and tweets_comments
    data = data.withColumn('news_info', udf_get_news_info(data['news_url']))
    data = data.withColumn('news_title', data['news_info'][0])
    data = data.withColumn('news_text', data['news_info'][1])
    data = data.withColumn('news_image', data['news_info'][2])
    data = data.where(data['news_title'].isNotNull() & (functions.length(data['news_title']) > 0))
    data = data.where(data['news_text'].isNotNull() & (functions.length(data['news_text']) > 0))
    # data = data.where(data['tweets_comment'].isNotNull() & (functions.length(data['tweets_comment']) > 0))  # filter reviews with no text
    print('finish scrap')

    # 3. ML pipeline: tokenization (with regular expressions) and stop-word removal
    data = data.withColumn('sentiment_scores', udf_sentiment_score(data['news_text']))
    news_regex_tokenizer = RegexTokenizer(inputCol='news_text',
                                          outputCol='news_words',
                                          pattern='[^A-Za-z]+')
    news_stopwords_remover = StopWordsRemover(
        inputCol='news_words',
        outputCol='news_tokens',
        stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
    nlp_pipeline = Pipeline(stages=[news_regex_tokenizer, news_stopwords_remover])
    model = nlp_pipeline.fit(data)
    nlp_data = model.transform(data).select('news_id', 'news_title', 'news_text',
                                            'news_image', 'news_tokens',
                                            'sentiment_scores')

    # 4. Select features
    nlp_data = nlp_data.withColumn('news_tokens', udf_morphy(nlp_data['news_tokens']))
    # nlp_data = nlp_data.withColumn('tweets_tokens', udf_morphy(nlp_data['tweets_tokens']))
    # nlp_data = nlp_data.select(nlp_data['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens'))
    nlp_data = nlp_data.where(functions.size(nlp_data['news_tokens']) > 0)
    # nlp_data = nlp_data.where(functions.size(nlp_data['tweets_tokens']) > 0)
    # nlp_data_score = nlp_data_score.withColumn('tweets_tokens', functions.split('tweets_tokens', '\s+'))
    nlp_data = nlp_data.withColumn('news_tokens', functions.concat_ws(' ', 'news_tokens'))
    print('finish scores')

    # 5. Save
    nlp_data = nlp_data.withColumn(
        'dl_value',
        functions.to_json(
            functions.struct([nlp_data[x] for x in nlp_data.columns])))
    stream = nlp_data.select(nlp_data.news_id.alias("key"), nlp_data.dl_value.alias("value"))\
        .writeStream\
        .format('kafka')\
        .outputMode('update')\
        .option('kafka.bootstrap.servers', 'localhost:9092')\
        .option("topic", "mlnews-2")\
        .option("checkpointLocation", "../check")\
        .start()
    # stream = nlp_data.writeStream.format('console').outputMode('update').start()
    stream.awaitTermination()