Example #1
    def test_validate_column_types(self):
        from pyspark.sql.functions import udf, to_json
        from pyspark.sql.column import _to_java_column

        self.assertTrue("Column" in _to_java_column("a").getClass().toString())
        self.assertTrue("Column" in _to_java_column(u"a").getClass().toString())
        self.assertTrue("Column" in _to_java_column(self.spark.range(1).id).getClass().toString())

        self.assertRaisesRegexp(
            TypeError,
            "Invalid argument, not a string or column",
            lambda: _to_java_column(1))

        class A():
            pass

        self.assertRaises(TypeError, lambda: _to_java_column(A()))
        self.assertRaises(TypeError, lambda: _to_java_column([]))

        self.assertRaisesRegexp(
            TypeError,
            "Invalid argument, not a string or column",
            lambda: udf(lambda x: x)(None))
        self.assertRaises(TypeError, lambda: to_json(1))
Example #2
df = (spark.read.format('delta').load("s3://oetrta/volker/datasets/turbine/kinesis_sample/")
     .withColumn("jsonData", from_json(col("value"), jsonSchema)) \
     .select("key","jsonData.*")
     )

# COMMAND ----------

df.cache()

# COMMAND ----------

df_sample = (df.sample(fraction=0.01).limit(500).dropDuplicates(
    ["ID"]).withColumn("TIMESTAMP", current_timestamp()).select(
        'key',
        to_json(struct(col('*'))).alias('value')))

pdf = df_sample.toPandas()

display(df_sample)

# COMMAND ----------

while True:
    df_sample = (df.sample(fraction=0.01).limit(500).dropDuplicates(
        ["ID"]).withColumn("TIMESTAMP", current_timestamp()).select(
            'key',
            to_json(struct(col('*'))).alias('value')))

    pdf = df_sample.toPandas()
Example #3
def run(spark):
    """
    This is an attempt at combining multiple video sources into a grid of images.
    WARNING: This is broken because Spark is not maintaining the time order of the images.
    This file has been superseded by the Flink/Java class MultiVideoGridJob in the flinkprocessor directory.
    """
    schema = 'timestamp timestamp, frame_number int, camera int, ssrc int, data binary'

    # To allow for large images and avoid out-of-memory errors, limit the number
    # of records per batch that the JVM sends to the Python UDF.
    spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1')

    controller = os.getenv('PRAVEGA_CONTROLLER', 'tcp://127.0.0.1:9090')
    scope = os.getenv('PRAVEGA_SCOPE', 'examples')
    checkpoint_location = os.getenv('CHECKPOINT_LOCATION',
                                    '/tmp/spark_checkpoints_multi_video_grid')
    shutil.rmtree(checkpoint_location, ignore_errors=True)

    df = (
        spark.readStream.format("pravega").option(
            "controller", controller).option("scope", scope).option(
                "stream", "video").option("encoding", "chunked_v1")
        # .option("start_stream_cut", "earliest")
        .load())

    df = df.withColumnRenamed('event', 'raw_event')
    df = df.select('*', decode('raw_event', 'UTF-8').alias('event_string'))
    df = df.select('*',
                   from_json('event_string', schema=schema).alias('event'))
    df = df.select('*', 'event.*')
    df = df.select('*', length('data'))
    fps = 2.0
    df = df.selectExpr(
        '*',
        f'timestamp(floor(cast(timestamp as double) * {fps}) / {fps}) as discrete_timestamp'
    )
    df = df.withWatermark('discrete_timestamp', '5 second')
    df = df.drop('raw_event', 'event_string', 'event')

    thumbnail_size = (84, 84)

    @pandas_udf(returnType='binary', functionType=PandasUDFType.SCALAR)
    def decode_and_scale_image(data_series, ssrc):
        def f(data):
            in_pil = Image.open(io.BytesIO(data))
            out_pil = in_pil.resize(thumbnail_size)
            return out_pil.tobytes()

        return data_series.apply(f)

    df = df.select(
        '*',
        decode_and_scale_image(df['data'], df['ssrc']).alias('image'))
    df = df.select(
        '*',
        func.to_json(
            func.struct(df['discrete_timestamp'], df['frame_number'],
                        df['camera'])).alias('json'))

    df = df.repartition(1)

    grp = df.groupby(
        # window('timestamp', '1 second'),
        'discrete_timestamp', )

    @pandas_udf(
        returnType=
        'timestamp timestamp, frame_number int, ssrc int, data binary, source string',
        functionType=PandasUDFType.GROUPED_MAP)
    def combine_images_into_grid(df):
        # TODO: This Pandas UDF provides incorrect results because it is called before the aggregation is finalized by the watermark.
        if df.empty:
            return None
        row0 = df.iloc[0]
        num_cameras = df.camera.max() + 1
        grid_count = math.ceil(math.sqrt(num_cameras))
        # Determine number of images per row and column.
        image_width = thumbnail_size[0]
        image_height = thumbnail_size[1]
        image_mode = 'RGB'
        margin = 1
        status_width = 0
        # Create blank output image, white background.

        out_pil = Image.new(
            'RGB',
            ((image_width + margin) * grid_count - margin + status_width,
             (image_height + margin) * grid_count - margin), (128, 128, 128))

        # Add images from each camera
        def add_image(r):
            # in_pil = Image.open(io.BytesIO(r['image']))
            in_pil = Image.frombytes(image_mode, (image_width, image_height),
                                     r['image'])
            x = (r['camera'] % grid_count) * (image_width + margin)
            y = (r['camera'] // grid_count) * (image_width + margin)
            out_pil.paste(in_pil, (x, y))

        df.apply(add_image, axis=1)

        # font = ImageFont.truetype('/usr/share/fonts/truetype/freefont/FreeSans.ttf', font_size)
        # draw = ImageDraw.Draw(img)
        # draw.text((status_width, 0), 'FRAME\n%05d\nCAMERA\n %03d' % (frame_number, camera), font=font, align='center')

        out_bytesio = io.BytesIO()
        out_pil.save(out_bytesio, format='PNG', compress_level=0)
        out_bytes = out_bytesio.getvalue()

        new_row = pd.Series()
        new_row['timestamp'] = row0['discrete_timestamp']
        new_row['ssrc'] = 0
        new_row['frame_number'] = 0
        new_row['source'] = df[['camera', 'frame_number',
                                'timestamp']].to_json()
        new_row['data'] = out_bytes
        # new_row['data'] = b''
        return pd.DataFrame([new_row])

    # @pandas_udf(returnType='string', functionType=PandasUDFType.SCALAR)
    # def combine_images_into_grid2(json):
    #     # TODO
    #     def f(data):
    #         in_pil = Image.open(io.BytesIO(data))
    #         out_pil = in_pil.resize(thumbnail_size)
    #         return out_pil.tobytes()
    #     return data_series.apply(f)

    df = grp.apply(combine_images_into_grid)
    df = df.select(
        func.to_json(func.struct(df["frame_number"],
                                 df["data"])).alias("event"))

    # df = grp.agg(func.collect_list('json'))
    # df = df.selectExpr('*', '0 as ssrc')
    # window = Window.partitionBy('ssrc').orderBy('discrete_timestamp').rowsBetween(Window.unboundedPreceding, Window.currentRow)
    # df = df.select('*', func.row_number().over(window))

    # TODO: Output rows are not written in timestamp order. How can this be fixed?
    # Below gives error: Sorting is not supported on streaming DataFrames/Datasets, unless it is on aggregated DataFrame/Dataset in Complete output mode
    # df = df.sortWithinPartitions(df['discrete_timestamp'])
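    # A hedged sketch of a possible workaround (not from this job): Spark does allow
    # sorting a streaming DataFrame after an aggregation in Complete output mode, e.g.:
    # df_sorted = (df.groupBy('discrete_timestamp')
    #              .agg(func.collect_list('json').alias('events'))
    #              .sort('discrete_timestamp'))
    # (df_sorted.writeStream.outputMode('complete').format('console').start())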

    df.printSchema()

    if False:
        (df.writeStream
         # .trigger(processingTime='1000 milliseconds')    # limit trigger rate
         .outputMode('append').format('console').option(
             'truncate',
             'false').option('checkpointLocation',
                             checkpoint_location).start().awaitTermination())
    else:
        (df.writeStream.trigger(processingTime="1000 milliseconds").outputMode(
            "append").format("pravega").option(
                "controller", controller).option(
                    "scope", scope).option("stream", "combinedvideo").option(
                        "checkpointLocation",
                        checkpoint_location).start().awaitTermination())
    S3_BUCKET_REPLICATION_DATA = 'vtex-orders-index'
    S3_BUCKET_DATALAKE = 'vtex.datalake'

    S3_DATALAKE_SCHEMA_DIR = 'sample_schema/orders'

    ## Getting Schema's Interface from Checkout Structured Json
    cleansed_df = spark.read.json(
        get_all_paths(S3_BUCKET_DATALAKE, S3_DATALAKE_SCHEMA_DIR))

    ## Reading data from Checkout History
    df = spark.read.json(
        get_all_paths(S3_BUCKET_HISTORIC_DATA,
                      S3_BUCKET_REPLICATION_DATA + '/' + folder_prefix_filter))

    ## Temporary column to convert df data to JSON.
    df = df.withColumn("ToJSON", to_json(struct([df[x] for x in df.columns])))

    df = struct_data_frame(df, cleansed_df)
    df = create_partition_columns(df)

    ## Deletes the temporary column ToJSON
    df = df.drop("ToJSON")

    df = rewriteColumnNames(df)

    ### Writing data into S3 bucket
    #### Save table to S3 using Parquet format and partitioning by defined columns
    df.repartition('ingestion_year','ingestion_month','ingestion_day', 'ingestion_hour')\
        .write\
        .partitionBy('ingestion_year','ingestion_month','ingestion_day', 'ingestion_hour')\
        .mode('append')\
Example #5
    def process(time, rdd):
        print("========= %s =========" % str(time))
        #2018-05-28T13:52:07.0000000Z,2018-05-28T13:52:35.6721175Z
        format_1 = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSS'Z'"
        a = time_py.time()
        
        try:
            # Get the singleton instance of SparkSession
            if (not rdd.isEmpty()):
                spark = getSparkSessionInstance(rdd.context.getConf())
                #rdd.context.clearCache()
                # Get data from kafka json to df
                df = spark.read.json(rdd.map(lambda x: x[1]))
                df = df.rdd.repartition(100).toDF()
                print(df.count())

                df = df.withColumn('observationDate', from_unixtime(unix_timestamp('observationTime', format_1))).\
                         withColumn('serverDate', from_unixtime(unix_timestamp('serverTime', format_1)))
                # Use current_date(), col("observationDate") in production
                df = df.where(datediff(col("serverTime"), col("observationDate")) < 7)

                joinUserSensor = getDataUsers(rdd.context)

                # join data from hdfs and stream
                joinData = df.alias('stream').join(joinUserSensor.alias('data'), col('stream.sensorId') == col('data.sensorId'),"leftOuter") # can be "inner", "leftOuter", "rightOuter"

                dataToSend = joinData.select("Type","altitude","coordinates_lat","coordinates_long","date","observationTime","observationDate","dateSend", "serverDate", "heading","location","stream.sensorId","serverTime","speed","speedmetric","temp","id_user", "name")
                dataToSend = dataToSend.withColumn("id", monotonically_increasing_id())

                # Convert the function into a UDF
                schema4udf = StructType([StructField("addrs_name", StringType()),
                                            StructField("max_speed", IntegerType())
                                        ])
                reference_to_dict_udf = udf(reference_to_dict, schema4udf)

                # Get the georeferencing data
                dataToSend = dataToSend.withColumn("data_osm", reference_to_dict_udf(struct([dataToSend[x] for x in ['coordinates_lat','coordinates_long', 'Type', 'speed']])))
                dataToSend = dataToSend.select("id", "Type","altitude","coordinates_lat","coordinates_long","date","observationTime","observationDate","dateSend", "serverDate",
                                   "serverTime","heading","location","stream.sensorId","speed","speedmetric","temp","id_user", "name", 
                                   col("data_osm.addrs_name").alias("addrs_name"), col("data_osm.max_speed").alias("max_speed") )

                actualCoordinates = dataToSend.select("id", "coordinates_lat","coordinates_long")
                # Load the accident black spots
                blackShapes = getBlackShapes(rdd.context)
                # Cross them with the vehicles' current positions
                nearBlkShp = actualCoordinates.crossJoin(blackShapes)
                # Keep only the points within roughly 1 km
                # How to find the nearest position efficiently?
                # https://gis.stackexchange.com/questions/8650/measuring-accuracy-of-latitude-and-longitude?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
                nearBlkShp = nearBlkShp.filter((nearBlkShp.lat_min <= nearBlkShp.coordinates_lat) & (nearBlkShp.lat_max >= nearBlkShp.coordinates_lat) & (nearBlkShp.long_min <= nearBlkShp.coordinates_long) & (nearBlkShp.long_max >= nearBlkShp.coordinates_long))
                # Compute the distances to the black spots (a sketch of the `dist` helper follows after this function)
                nearBlkShp = nearBlkShp.select(col("id"), col("Address"), col("Province"), col("Country"), col("numAccident"), 
                                               col("lat").alias("blk_point_lat"), col("long").alias("blk_point_long"),
                                               dist(col('coordinates_lat'),col('coordinates_long'),col('lat'),col('long')).alias("Distance"))
                # Keep the smallest distance
                minD4 = nearBlkShp.groupBy("id").min("Distance")
                # Now keep only the rows at that minimum distance
                finalNearBlkShp = minD4.alias('mins').join(nearBlkShp.alias('dataBlkShp'), 
                    (col('mins.id')==col('dataBlkShp.id')) & (col('mins.min(Distance)') == col('dataBlkShp.Distance')),"leftOuter").\
                    select(col("dataBlkShp.id").alias('id'), 
                    col("dataBlkShp.Address").alias('address'),
                    col("dataBlkShp.Province").alias('province'), col("dataBlkShp.Country").alias('country'), col("dataBlkShp.numAccident").alias('accidents'), 
                    col("dataBlkShp.blk_point_lat").alias('blk_point_lat'), col("dataBlkShp.blk_point_long").alias('blk_point_long'), col("dataBlkShp.Distance").alias('dist_to_blk_shp'))

                # Join the black spots with the data
                dataToSend = dataToSend.alias('data').join(finalNearBlkShp.alias('blk_shp'), col('data.id')==col('blk_shp.id'), "leftOuter")
                dataToSend = dataToSend.select(col("data.Type").alias("Type"),
                                               col("data.altitude").alias("altitude"),
                                               col("data.observationTime").alias("observationTime"),
                                               col("data.dateSend").alias("dateSend"),
                                               col("data.serverTime").alias("serverTime"),
                                               col("data.heading").alias("heading"),
                                               col("data.location").alias("location"),
                                               col("data.sensorId").alias("sensorId"),
                                               col("data.speed").alias("speed"),
                                               col("data.speedmetric").alias("speedmetric"),
                                               col("data.temp").alias("temp"),
                                               col("data.id_user").alias("id_user"),
                                               col("data.name").alias("user"),
                                               col("data.addrs_name").alias("actual_address"),
                                               col("data.max_speed").alias("max_speed"),
                                               col("blk_shp.address").alias("blk_shp_address"),
                                               col("blk_shp.province").alias("blk_shp_province"),
                                               col("blk_shp.country").alias("blk_shp_country"),
                                               col("blk_shp.accidents").alias("blk_shp_accidents"),
                                               array(col('blk_shp.blk_point_lat'),col('blk_shp.blk_point_long')).alias("blk_shp_coordinates"),
                                               col("blk_shp.dist_to_blk_shp").alias("blk_shp_dist")
                                              )

                #Send data
                #dataToSend.printSchema()
                print(dataToSend.rdd.getNumPartitions())
                dataToSend.select(to_json(struct([dataToSend[x] for x in dataToSend.columns])).alias("value")).write.format("kafka").option("kafka.bootstrap.servers", kServer).option("topic", topicOut).save()

        except Exception as e:
            print(str(e))
            pass

        b = time_py.time()
        c = (b-a)
        print(c)
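
    # A minimal sketch (an assumption; the real helper is defined elsewhere in the
    # original script) of the `dist` function used above: a haversine great-circle
    # distance UDF. Returning meters is also an assumption.
    from math import radians, sin, cos, asin, sqrt
    from pyspark.sql.functions import udf as _udf
    from pyspark.sql.types import DoubleType

    @_udf(DoubleType())
    def dist(lat1, lon1, lat2, lon2):
        # Great-circle distance between two (lat, lon) points, in meters.
        lat1, lon1, lat2, lon2 = map(radians, (float(lat1), float(lon1), float(lat2), float(lon2)))
        a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
        return 2.0 * 6371000.0 * asin(sqrt(a))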
Example #6
out_df = out_df.withColumn("aov", F.col("revenue") / F.col("purchases"))

#out_df = out_df.groupBy("start_ts")\
#                            .agg(\
#                                 F.countDistinct("partyId").alias("visitors")\
#                                 )\
#                                 .join(\
#                            out_df.filter("eventType='itemBuyEvent'").groupBy("start_ts")\
#                            .agg(F.max("end_ts").alias("end_ts"),\
#                                 F.sum("item_price").alias("revenue"),\
#                                 F.countDistinct("sessionId").alias("purchases"),\
#                                 F.sum("item_price")/F.countDistinct("sessionId")\
#                                 )\
#                                 ,"start_ts", "left")

out_columns = list(out_df.columns)
#out_columns = ["start_ts","end_ts","visitors","revenue","purchases","aov"]

query = out_df\
    .select(F.to_json(F.struct(*out_columns)).alias("value"))\
    .writeStream \
    .outputMode("update")\
    .format("kafka") \
    .option("checkpointLocation", "/tmp/checkpoint-write")\
    .option("kafka.bootstrap.servers", kafka_bootstrap ) \
    .option("topic", topic_out) \
    .start()

query.awaitTermination()
def main():

    spark = SparkSession \
        .builder \
        .appName("Divvy Bikes") \
        .enableHiveSupport() \
        .getOrCreate()
    # spark = session_spark()
    # spark.sparkContext.addPyFile("utils.zip")
    # spark.sparkContext.addPyFile("ingestion.zip")
    # spark.sparkContext.addPyFile("ml.zip")

    log4jLogger = quiet_logs(spark)
    logger = log4jLogger.LogManager.getLogger(__name__)
    logger.info("Iniciando previsões")

    logger.info("Buscando dados de estações")
    train_inst = CreateDataframe(spark)
    bikes = train_inst.get_data(with_temperature=False)

    logger.info("Buscando dados de previsão do tempo")
    predict_inst = PredictDataframe(spark)
    temperature_api = predict_inst.get_data()

    temperature_inst = TemperatureDataframe(spark)
    temperature = temperature_inst.create_new_columns(temperature_api)

    logger.info("Join dataframe de estações e temperatura")
    dataframe_join = predict_inst.join_dataframe(bikes, temperature)

    logger.info("Criando dataframe final")
    dataframe_new_columns = train_inst.create_new_columns(dataframe_join)\
                                    .select("date", "from_station_id", "latitude", "month",
                                            "longitude", "mean_dpcapacity_start", "mean_dpcapacity_end",
                                            "sum_subscriber","sum_customer", "part_time", "holiday",
                                            "week_days", col("weather_condition").alias("weather_description"),
                                            "humidity", "pressure", "temperature", "wind_speed")\
                                    .cache()
    dataframe_new_columns.take(1)

    logger.info("Buscando a condição climática mais frequente por período")
    mf_condition_part_time = train_inst.get_mf_part_time(dataframe_new_columns)

    dataframe_final = train_inst.group_target(dataframe_new_columns, mf_condition_part_time) \
                                .drop("bicycle_rentals")\
                                .cache()
    dataframe_final.take(1)

    logger.info("Carregando modelo")
    model = predict_inst.get_model()

    logger.info("Realizando previsões")
    predictions = model.transform(dataframe_final)
    #predictions.show(truncate=False)

    logger.info("Criando json final")
    df_json = predict_inst.create_json(predictions)

    df_json.select(to_json(struct("*")).alias("value"))\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", "quickstart.cloudera:9092")\
            .option("topic", "bikes")\
            .save()
# +--------------------+-----+--------------------+---------+
#
# In this JSON Format {"customer":"*****@*****.**","score":"28.5","email":"*****@*****.**","birthYear":"1963"}
joinedCustomerDF = customerRiskStreamingDF.withColumn("email", col("customer")).join(emailAndBirthYearStreamingDF, on="email").select("customerrisk.customer","customerrisk.score", "email", "birthYear")

# joinedCustomerDF = customerRiskStreamingDF.join(emailAndBirthYearStreamingDF, emailAndBirthYearStreamingDF.email== customerRiskStreamingDF.customer)

joinedCustomerDF.printSchema()
# joinedCustomerDF.show(n=3)
# joinedCustomerDF.writeStream.outputMode("append").format("console")\
#       .option("truncate", "false")\
#       .start()\
#       .awaitTermination()
# joinedCustomerDF.withColumn("value", to_json())

kafkaDataFrame = joinedCustomerDF.select(col("email"), to_json(struct([joinedCustomerDF[x] for x in joinedCustomerDF.columns]))).toDF("key","value")

# (joinedCustomerDF.select(to_json(struct([joinedCustomerDF[x] for x in joinedCustomerDF.columns])).alias("value"))
#     .writeStream
#     .format("kafka")
#     .option("kafka.bootstrap.servers", "localhost:9092")
#     .option("topic", "topic-risk-score")
#     .option("checkpointLocation", "/tmp/kafka/checkpoint")
#     .start())
kafkaDataFrame.printSchema()

# kafkaDataFrame.writeStream\
#       .outputMode("append").format("console")\
#       .option("truncate", "false")\
#       .start().awaitTermination()
Example #9
    topics = player_names.union(teams)

    # Reads the data from kafka
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "broker:9092") \
        .option("failOnDataLoss", "false") \
        .option("subscribe", "tweets") \
        .option("startingOffsets", "earliest") \
        .load()

    messages = extractTweetPayload(df, tweetSchema, payloadSchema)

    wordCount = wordCountQuery(messages, "Text") \
        .join(topics, "word") \
        .select("word", "count","category", to_json(struct("word", "count","category")).alias("value"))

    langCount = langCountQuery(messages, "Lang")

    query = wordCount \
        .writeStream \
        .format("kafka") \
        .option("checkpointLocation", "./checkpoints") \
        .option("kafka.bootstrap.servers", "broker:9092") \
        .option("topic", "countByName") \
        .start()

    query.awaitTermination()
Example #10
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

# You can also explode map types, which will turn them into columns
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)

# ---------- STEP 4 ----------
# JSON
# -----------------------------

# Let’s begin by creating a JSON column
jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

# show it
jsonDF.show()

# You can use the get_json_object to inline query a JSON object
# You can use json_tuple if this object has only one level of nesting
from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"),
                    "$.myJSONKey.myJSONValue[1]").alias('column'),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)

# You can also turn a StructType into a JSON string by using the to_json function
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct"))).show()
Example #11
    def execute(self, conf_path: str, input_path: str, output_path: str,
                on_dbfs: bool) -> None:
        """
        Pipeline that sanitizes the data, extracts the drugs, changes the data model and finally saves the result to a JSON file.
        This is the main entrypoint of the package. The parameters are the job's arguments.
        Args:
            conf_path: File path of the params.json
            input_path: Folder path to read raw files
            output_path: Folder path to write files
            on_dbfs: Whether the Databricks Filesystem is mounted

        Returns: Nothing; the instanced class is only modified in place

        """

        self.load_params(conf_path)

        df_dict = Sanitizer.read_files(self.logger, self.spark, self.params,
                                       input_path)
        Sanitizer.clean_strings(self.logger, df_dict)
        df_dict = Sanitizer.clean_date(self.logger, df_dict)
        df_dict = Sanitizer.empty_str_cleaning(self.logger, df_dict)
        Sanitizer.deduplication(self.logger, df_dict,
                                self.params.get("deduplication rules"))
        Files.merge_write(self.logger, df_dict,
                          self.params.get("merge sanitized rules"),
                          path.join(output_path, "sanitized"), self.spark)

        df_dict = Files.read_delta(
            self.logger, set(self.params.get("csv") + self.params.get("json")),
            path.join(output_path, "sanitized"), self.spark)
        Sanitizer.deduplication(self.logger, df_dict,
                                self.params.get("deduplication rules"))

        DrugsExtractor.to_words(self.logger, df_dict,
                                self.params.get("to words"))

        drug_df_name = self.params.get("names").get("drugs")
        drug_col_name = self.params.get("names").get("drug")

        df_dict[drug_df_name] = df_dict.get(drug_df_name).withColumn(
            drug_col_name,
            lower(col(drug_col_name))).filter(col(drug_col_name).isNotNull())
        # To be refactored: this does not work for a really large drug list because of the collect to the driver (below) and the column creation (above)
        # Duplicates need to be dropped because several drugs can have different ATC codes
        drugs_list = df_dict.get(drug_df_name).select(
            drug_col_name).drop_duplicates().toPandas()[drug_col_name].to_list(
            )
        df_dict.pop(drug_df_name)

        for df in df_dict.values():
            df.cache()
        self.logger.info(
            "Prepared drug list and cached dataframes for following intensive computation: {}"
            .format(df_dict))

        DrugsExtractor.pivot(self.logger, drugs_list, df_dict)

        date = self.params.get("names").get("date")
        id_col = self.params.get("names").get("id")
        journal = self.params.get("names").get("journal")
        columns_kept = [date, id_col, journal]
        df_dict = DrugsExtractor.shift(self.logger, drugs_list, df_dict,
                                       drug_col_name, self.spark, columns_kept)

        # Construct publication objects and journal object
        for df_name in self.params.get("to words").keys():
            df_dict[df_name] = df_dict.get(df_name).withColumn(
                date,
                col(date).cast(StringType()))
            df_dict[df_name] = df_dict.get(df_name).withColumn(id_col, struct(col(date).alias(date), col(id_col).alias(id_col))) \
                .withColumn(journal, struct(col(date).alias(date), col(journal).alias(journal)))
        self.logger.info(
            "Publication objects and journal object constructed: {}".format(
                df_dict))

        trial = self.params.get("names").get("clinical_trials")
        pubmed = self.params.get("names").get("pubmed")
        # For each drug, get the list of journals and publications (we use a set on journal to avoid duplicates)
        merge_trial_df = \
            df_dict.get(trial).groupby(drug_col_name)\
            .agg(collect_set(col(journal)).alias(journal), collect_list(col(id_col)).alias(trial))\
            .withColumn(pubmed, lit(None)
                        .cast(ArrayType(StructType([StructField('date', StringType(), True), StructField('id', StringType(), True)]))))
        self.logger.info("Created publication per drug for trials: {}".format(
            merge_trial_df))
        merge_pub_df = df_dict.get(pubmed).groupby(drug_col_name).agg(
            collect_set(col(journal)).alias(journal),
            collect_list(col(id_col)).alias(pubmed))
        self.logger.info(
            "Created publication per drug for pubmed: {}".format(merge_pub_df))

        # Merge clinical trial publications with pubmed publications by drug, with their associated journals (without repetition)
        merge_path = path.join(output_path, "enriched")
        Files.merge_write(self.logger, {trial: merge_trial_df},
                          self.params.get("merge sanitized rules"), merge_path,
                          self.spark)
        delta_path = path.join(merge_path, trial)
        from delta.tables import DeltaTable
        delta_trial = DeltaTable.forPath(self.spark, delta_path)
        update_match = "trial.{0} = pub.{0}".format(drug_col_name)
        update = {
            pubmed:
            col(f"pub.{pubmed}"),
            journal:
            array_distinct(
                concat(col(f"pub.{journal}"), col(f"trial.{journal}")))
        }
        insert = {
            pubmed: col(f"pub.{pubmed}"),
            journal: col(f"pub.{journal}"),
            drug_col_name: col(f"pub.{drug_col_name}"),
            trial: lit(None)
        }
        self.logger.info(
            "Merging publications with the matching rule: {}".format(
                update_match))
        (delta_trial.alias("trial").merge(
            merge_pub_df.alias("pub"), update_match).whenMatchedUpdate(
                set=update).whenNotMatchedInsert(values=insert).execute())
        # Save the end result
        graph_filename = self.params.get("names").get("graph_filename")
        json_df = self.spark.read.format("delta").load(delta_path)

        # To use the filesystem mounted on Databricks from a Python process we need the "/dbfs" prefix, but Spark processes don't work with this prefix
        pythonic_path = "/dbfs" + output_path if on_dbfs else output_path
        graph_path = path.join(pythonic_path, *graph_filename)
        json_df.withColumn(journal, to_json(col(journal))).withColumn(
            trial, to_json(col(trial))).withColumn(pubmed, to_json(
                col(pubmed))).toPandas().to_json(graph_path,
                                                 orient="records",
                                                 date_format="iso")
        # when this file is read back with Spark, the multiLine option needs to be enabled (see the read-back sketch below)

        self.logger.info("Wrote the resulting JSON to: {}".format(graph_path))
        .withColumn('x1', (F.col('N1') + F.col('geoAltitude')) * F.cos(F.radians(F.col('latitude'))) * F.cos(F.radians(F.col('longitude')))) \
        .withColumn('y1', (F.col('N1') + F.col('geoAltitude')) * F.cos(F.radians(F.col('latitude'))) * F.sin(F.radians(F.col('longitude')))) \
        .withColumn('z1', ((1 - 8.1819190842622e-2**2) * F.col('N1') + F.col('geoAltitude')) * F.sin(F.radians(F.col('latitude')))) \
        .withColumn('x2', (F.col('N2') + F.col('pred_geoAltitude')) * F.cos(F.radians(F.col('pred_latitude'))) * F.cos(F.radians(F.col('pred_longitude')))) \
        .withColumn('y2', (F.col('N2') + F.col('pred_geoAltitude')) * F.cos(F.radians(F.col('pred_latitude'))) * F.sin(F.radians(F.col('pred_longitude')))) \
        .withColumn('z2', ((1 - 8.1819190842622e-2**2) * F.col('N2') + F.col('pred_geoAltitude')) * F.sin(F.radians(F.col('pred_latitude'))))

    pred_df = pred_df \
        .withColumn('dist_error', F.sqrt((F.col('x1') - F.col('x2'))**2 + \
                                           (F.col('y1') - F.col('y2'))**2 + \
                                           (F.col('z1') - F.col('z2'))**2) / 1000) \
        .drop('latitude', 'longitude', 'geoAltitude') \
        .drop('pred_latitude', 'pred_longitude', 'pred_geoAltitude') \
        .drop('N1', 'N2', 'x1', 'y1', 'z1',  'x2', 'y2', 'z2')

    # Write stream to console for debug purposes
    # pred_df.writeStream.outputMode("append").option("truncate", False).format("console").start().awaitTermination()

    # Write stream to Kafka
    pred_df = pred_df \
        .select(F.to_json(F.struct("pred", "target", "dist_error", "timeAtServer", "aircraft")).alias("value"))

    pred_df \
       .writeStream \
       .format("kafka") \
       .option("kafka.bootstrap.servers", ", ".join(kafka_config['servers'])) \
       .option("topic", kafka_config['topics'][1]) \
       .option("checkpointLocation", "checkpoint") \
       .start() \
       .awaitTermination()
Example #13
def transform_patient(patient):
    now = datetime.datetime.now().strftime('%Y-%m-%d')
    return patient.na.fill("").groupBy('person_id').\
                   agg(
                      f.first('person_id').alias('couch_id'),
                      f.to_json(f.struct(
                                f.first('person_uuid').alias('uuid'),
                                f.concat_ws(' ', f.first('given_name'), f.first('middle_name'), f.first('family_name')).alias('display'),
                                f.struct(
                                    f.first('person_uuid').alias('uuid'),
                                    f.concat_ws(' ', f.first('given_name'), f.first('middle_name'), f.first('family_name')).alias('display'),
                                    f.first('gender').alias('gender'),
                                    f.first('birthdate').alias('birthdate'),
                                    f.first('dead').alias('dead'),
                                    (f.year(f.to_date(f.lit(now))) - f.year(f.first('birthdate'))).alias('age') ,
                                    f.first('death_date').alias('deathDate'),
                                    f.first('cause_of_death').alias('causeOfDeath'),
                                    f.first('birthdate_estimated').alias('birthdateEstimated'),
                                    f.collect_set(
                                        f.struct(
                                            f.concat_ws(' ', f.col('person_attribute_type_name'), f.lit('='), f.col('person_attribute_value')).alias('display'),
                                            f.col('person_attribute_value').alias('value'),
                                            f.col('person_attribute_uuid').alias('uuid'),
                                            f.col('person_attribute_voided').alias('voided'),
                                            f.struct(
                                                f.col('person_attribute_type_name').alias('display'),
                                                f.col('person_attribute_type_uuid').alias('uuid'),
                                            ).alias('attributeType')
                                        )
                                    ).alias('attributes')
                                ).alias('person'),
                                f.collect_set(
                                    f.struct(
                                         f.col('identifier'),
                                         f.col('identifier_preferred').alias('preferred'),
                                         f.struct(
                                             f.col('identifier_location_name').alias('name'),
                                             f.col('identifier_location_uuid').alias('uuid')
                                        ).alias('location'),
                                        f.struct(
                                            f.col('identifier_type_name').alias('name'),
                                            f.col('identifier_type_uuid').alias('uuid')
                                        ).alias('identifierType'),

                                  )).alias('identifiers'),
                                  f.struct(
                                       f.first('person_address_city_village').alias('cityVillage'),
                                       f.first('person_address_longitude').alias('longitude'),
                                       f.first('person_address_latitude').alias('latitude'),
                                       f.first('person_address_country').alias('country'),
                                       f.first('person_address_county_district').alias('countyDistrict'),
                                       f.first('person_address_1').alias('address1'),
                                       f.first('person_address_2').alias('address2'),
                                       f.first('person_address_3').alias('address3'),
                                       f.first('person_address_4').alias('address4'),
                                       f.first('person_address_5').alias('address5'),
                                       f.first('person_address_6').alias('address6'),
                                       f.first('person_address_preferred').alias('preferred')
                                  ).alias('preferredAddress')
                               )).alias('patient')).\
                      withColumn('type', f.lit('patient'))
Example #14
def transform_encounter(encounter_dataframe,
                        obs_dataframe,
                        streaming=True,
                        filters=None):

    if (streaming):
        orders = get_orders(filters['encounter_ids'])
        encounter_types = get_encounter_types()
        forms = get_forms(filters['form_ids'])
        encounter_providers = get_encounter_providers(
            filters['encounter_ids']).alias('encounter_provider')
        locations = get_locations(filters['location_ids']).alias('location')
        visits = get_visits(locations, filters['visit_ids'])
        patients = get_patients(filters['patient_ids'])

    else:
        forms = get_forms()
        locations = get_locations().alias('location')
        visits = get_visits(locations)
        encounter_types = get_encounter_types()
        patients = get_patients()
        orders = get_orders()
        encounter_providers = get_encounter_providers().alias(
            'encounter_provider')

    obs = obs_dataframe.alias('obs')
    joined_encounters = encounter_dataframe.join(f.broadcast(forms),on='form_id')\
                                  .join(f.broadcast(locations), on='location_id')\
                                  .join(f.broadcast(visits), on='visit_id')\
                                  .join(f.broadcast(encounter_types), on=encounter_dataframe['encounter_type'] == encounter_types['encounter_type_id'])\
                                  .join(patients, on='patient_id')\
                                  .join(encounter_providers, on=encounter_providers['encounter_id'] == encounter_dataframe['encounter_id'], how='left')\
                                  .join(orders, on=orders['encounter_id'] == encounter_dataframe['encounter_id'], how='left')\
                                  .join(obs, on=obs['encounter_id'] == encounter_dataframe['encounter_id'], how='left')


    return joined_encounters\
    .groupBy('encounter.encounter_id').agg(
    f.first('patient_id').alias('person_id'),
    f.lit('encounter').alias('type'),
    f.first('encounter.location_id').alias('location_id'),
    f.first('person_uuid').alias('person_uuid'),
    f.col('encounter.encounter_id').cast('string').alias('couch_id'),
    f.first('encounter.uuid').alias('uuid'),
    f.first('encounter_datetime').alias('encounterdatetime'),
    f.struct(
        f.first('encounter_type_name').alias('display'),
        f.first('encounter_type_uuid').alias('uuid')
    ).alias('encountertype'),
    f.struct(
        f.first('form_name').alias('name'),
        f.first('form_uuid').alias('uuid')
    ).alias('form'),
    f.struct(
        f.first('location.location_name').alias('display'),
        f.first('location.location_uuid').alias('uuid')
    ).alias('location'),
    f.to_json(f.collect_set(
        f.when(f.col('encounter_provider_uuid').isNotNull(), f.struct(
            f.col('encounter_provider_uuid').alias('uuid'),
            f.col('encounter_provider.provider_name').alias('display'),
            f.struct(
                f.col('encounter_provider.provider_uuid').alias('uuid'),
                f.concat_ws(' ', f.col('encounter_provider.provider_identifier'), f.lit('-'), f.col('encounter_provider.provider_name')).alias('display')
            ).alias('provider')
        ))
    )).alias('encounterproviders'),
    f.to_json(f.struct(
        f.first('visit_uuid').alias('uuid'),
        f.first('visit.date_started').alias('dateStarted'),
        f.first('visit.date_stopped').alias('dateStopped'),
        f.struct(
            f.first('visit_type_name').alias('name'),
            f.first('visit_type_uuid').alias('uuid')
        ).alias('visitType'),
        f.struct(
            f.first('visit.location_name').alias('name'),
            f.first('visit.location_uuid').alias('uuid')
        ).alias('location'),
        f.concat_ws(' ', f.first('visit_type_name'), f.lit('@'), f.first('visit.location_name'), f.lit('-'), f.first('visit.date_started'))
        .alias('display')
    )).alias('visit'),
    f.to_json(f.collect_set(
        f.when(f.col('order_uuid').isNotNull(),f.struct(
            f.col('order_uuid').alias('uuid'),
            f.col('order_number').alias('orderNumber'),
            f.struct(
                f.col('orders.concept_uuid').alias('uuid'),
                f.col('orders.concept_name').alias('display')
            ).alias('concept'),
            f.struct(
                f.col('orders.provider_uuid').alias('uuid'),
                f.concat_ws(' ', 'orders.provider_identifier', 'orders.provider_name').alias('display')
            ).alias('orderer'),
            f.col('order_action').alias('action'),
            f.col('orders.date_activated').alias('dateActivated'),
            f.col('orders.date_created').alias('dateCreated'),
            f.col('orders.urgency').alias('urgency'),
            f.col('order_type_name').alias('type')
        )
    ).otherwise(None))).alias('orders'),
    f.to_json(f.collect_list(
       f.struct(
             f.lit('obs.uuid').alias('uuid'),
             f.col('obs_datetime').alias('obsDatetime'),
             f.struct(
                 f.col('parent_obs_concept_uuid').alias('uuid'),
                 f.struct(
                 f.col('parent_obs_concept_name').alias('display'))
                 .alias('name')
             ).alias('concept'),
            f.when(f.col('value_coded').isNotNull(),
                f.struct(
                        f.col('value_type').alias('type'),
                        f.to_json(
                                  f.struct(
                                      f.col('value_coded_concept_uuid').alias('uuid'),
                                      f.col('value_coded_concept_name').alias('display')
                                  )).alias('value')
                        )
            ).when(f.col('value_not_coded').isNotNull(),
                f.struct(
                        f.col('value_type').alias('type'),
                        f.col('value_not_coded').alias('value')
                        )
            ).alias('value'),
            f.when(f.col('groupmembers').isNotNull(),
                   f.col('groupmembers')
                  ).alias('groupMembers')
    ))).alias('obs'),
    ).withColumn('build_date', f.current_timestamp())
  .option("startingOffsets", "latest") \
  .load() \
  .selectExpr("CAST(value as string)")\
  .select(F.from_json("value", schema).alias("value"))\
  .select(F.col("value.*"))\
  .select("uid", F.col('visits').url.alias("urls"))\
  .withColumn('domains', foo_udf(F.col('urls')))

# Infer on test data

results = model.transform(st)

# get string classes from encoded values
converter = IndexToString(inputCol="prediction",
                          outputCol="gender_age",
                          labels=model.stages[1].labels)
converted = converter.transform(results)

#Saving to another topic
query = converted\
 .select(F.to_json(F.struct("uid", "gender_age")).alias("value"))\
 .writeStream\
 .outputMode("append")\
 .format("kafka") \
 .option("checkpointLocation", "file:///tmp/checkpoint")\
 .option("kafka.bootstrap.servers", kafka_bootstrap ) \
 .option("topic", topic_out) \
 .start()

query.awaitTermination()
Example #16
    RenamedDF= JsonDF.select(explode(F.col("TimeSerieDtos")).alias("TimeSerie"),\
                             col("EntityExternalId").alias("EntityName"),col("TimeSerie.Time").alias("StartDate")\
                             ,"TimeResolution","TimeSerie.Tags","TimeSerie.Value").drop("TimeSerie") \
    .withColumn('ImportDateTime',lit(nu)) \
    .withColumn("EntryId", row_number().over(window))

    #display(RenamedDF)

# COMMAND ----------

# making export dataframe
if DINO1 == 1:
    DINO1DF = RenamedDF.withColumn('ImportDateTime',lit(nu)).withColumn('DataImportCode',lit("DINO1"))\
    .select ("DataImportCode", "EntityName","EntryId","StartDate","ImportDateTime"\
             ,to_json("Location").alias("JsonValue"))
if SUN1 == 1:
    SUN1DF = RenamedDF.withColumn('ImportDateTime',lit(nu)).withColumn('DataImportCode',lit("SUN1"))\
    .select ("DataImportCode", "EntityName","EntryId","StartDate","ImportDateTime",\
             to_json("Location").alias("JsonValue"))
if DINO2 == 1:
    DINO2DF = RenamedDF.withColumn('ImportDateTime',lit(nu)).withColumn('DataImportCode',lit("DINO2")) \
    .select ("DataImportCode", "EntityName","EntryId","StartDate","ImportDateTime"\
             ,col("Valid").alias("JsonValue"))
if SUN2 == 1:
    SUN2DF = RenamedDF.withColumn('ImportDateTime',lit(nu)).withColumn('DataImportCode',lit("SUN2")) \
    .select ("DataImportCode", "EntityName","EntryId","StartDate","ImportDateTime"\
             ,col("Valid").alias("JsonValue"))
if DINO3 == 1:
    DINO3DF = RenamedDF.withColumn('ImportDateTime',lit(nu)).withColumn('DataImportCode',lit("DINO3")) \
    .select ("DataImportCode", "EntityName","EntryId","StartDate","ImportDateTime"\
Example #17
#then we can calculate the relative SAIDI/SAIFI contribution of each outage
pw_finalized_outages = pw_finalized_outages.join(
    pw_distinct_user_id,
    F.date_trunc("day", F.from_unixtime(
        pw_finalized_outages["outage_time"])) == F.date_trunc(
            "day", pw_distinct_user_id["window_mid_point"]))

pw_finalized_outages = pw_finalized_outages.select(
    "outage_time", "cluster_size", "phones_reporting", "user_id",
    "outage_times", "outage_times_range", "outage_times_stddev")
pw_finalized_outages = pw_finalized_outages.withColumn(
    "relative_cluster_size",
    col("cluster_size") / col("phones_reporting"))

pw_finalized_with_string = pw_finalized_outages.withColumn(
    "outage_times", F.to_json("outage_times"))
pw_finalized_with_string = pw_finalized_with_string.withColumn(
    "user_id", F.to_json("user_id"))

#okay we should save this
#pw_finalized_with_string.repartition(1).write.format("com.databricks.spark.csv").mode('overwrite').option("header", "true").save(args.result + '/full_outage_list')

#okay let's filter and print this
pw_finalized_outages = pw_finalized_outages.filter(col("cluster_size") >= 2)
pw_finalized_outages.show(1000)

#We need to zero-fill for every date and cluster size not already present in the dataset.
#To do this, create a dataframe covering date_min to date_max and cluster_size_min to cluster_size_max with 0 relative SAIFI,
#then join it with the actual DF, preferentially choosing the non-zero value (a sketch follows below).
#min_time = pw_finalized_outages.agg(F.min("outage_time")).collect()[0].__getitem__("min(outage_time)")
#max_time = pw_finalized_outages.agg(F.max("outage_time")).collect()[0].__getitem__("max(outage_time)")
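
# A hedged sketch (an assumption, not from the original script) of the zero-fill idea
# described above: build every (day, cluster_size) combination with a 0 relative SAIFI,
# then left-join the real results and keep the non-zero value where present.
# The date range, the cluster-size range and the `spark` session name are placeholders.
days = spark.range(0, 365).select(
    F.expr("date_add(date'2020-01-01', cast(id as int))").alias("day"))
sizes = spark.range(2, 101).select(F.col("id").alias("cluster_size"))
zero_fill = days.crossJoin(sizes)

actual = pw_finalized_outages.withColumn(
    "day", F.to_date(F.from_unixtime(F.col("outage_time"))))
zero_filled = (zero_fill.join(actual, ["day", "cluster_size"], "left")
               .withColumn("relative_cluster_size",
                           F.coalesce(F.col("relative_cluster_size"), F.lit(0.0))))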
Example #18
        cvmodel = model.stages[1]
        vocabulary = cvmodel.vocabulary
        vocab_size = len(vocabulary)

        topics = model.stages[-1].describeTopics()
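        # A sketch of the `indices_to_terms` helper used below (an assumption; the
        # original defines it elsewhere). Given the CountVectorizer vocabulary it
        # returns a UDF mapping LDA term indices to their terms; the weights column
        # is accepted only so the two-argument call below matches.
        from pyspark.sql.functions import udf
        from pyspark.sql.types import ArrayType, StringType

        def indices_to_terms(vocab):
            def _terms(term_indices, term_weights):
                return [vocab[int(i)] for i in term_indices]
            return udf(_terms, ArrayType(StringType()))
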
        topics = topics.withColumn(
            'terms',
            indices_to_terms(vocabulary)(topics.termIndices,
                                         topics.termWeights))

        scores = topics.select('terms').rdd.flatMap(
            lambda list: list).collect()

        val = Row(date=week, results=scores, subreddit=subreddit, \
                  vocab_size=vocab_size, num_docs=num_docs)
        line = (subreddit, week, val)
        week_df = spark.createDataFrame([line], cols)

        week_df = week_df.withColumn('date', to_date(week_df.date))
        week_df = week_df.withColumn(
            'results',
            to_json(week_df.results).cast(StringType()))

        week_df.write.jdbc(dburl,
                           'newresults',
                           mode='append',
                           properties={
                               'user': dbuser,
                               'password': dbpwd
                           })
Example #19
dump_df_to_s3(test_ids.toDF(), 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols,
                                                 cat_cols)

logger.info(f'Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

featurs_graph_df = features_df.withColumn(
    'props_values:String',
    to_json(
        struct(
            list(
                filter(lambda x: (x != TRANSACTION_ID),
                       features_df.schema.names)))))
featurs_graph_df = featurs_graph_df.select('TransactionID',
                                           'props_values:String')

logger.info(f'Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(featurs_graph_df, glueContext,
                                                'FeaturesDF')
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(
    features_graph_dynamic_df, 'Transaction')
features_graph_dynamic_df = SelectFields.apply(
    frame=features_graph_dynamic_df,
    paths=["~id", '~label', 'props_values:String'])
Example #20
    def read_data(self):
        userSchema = StructType([
            StructField('medallion', StringType()),
            StructField('pickup_time', TimestampType()),
            StructField('dropoff_time', TimestampType()),
            StructField('passenger_count', IntegerType()),
            StructField('trip_time', IntegerType()),
            StructField('trip_distance', DoubleType()),
            StructField('pickup_loc', MapType(StringType(), DoubleType())),
            StructField('dropoff_loc', MapType(StringType(), DoubleType()))
        ])

        self.df = self.spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "localhost:9092") \
            .option("subscribe", "nycspeed12") \
            .option("startingOffsets", "earliest") \
            .option('failOnDataLoss','false') \
            .option('enable.auto.commit','false') \
            .option('group.id','nyc6') \
            .option('auto.offset.reset','earliest') \
            .option("kafka.client.id", "nycid6") \
            .option("maxOffsetsPerTrigger", 1000) \
            .load()

        self.dff = self.df.selectExpr("CAST(value as STRING) as json") \
                   .select(from_json("json", userSchema).alias('data'))\
                   .selectExpr(
                        "data.medallion as medallion",
                        "cast (data.pickup_time as timestamp)",
                        "cast (data.dropoff_time as timestamp)",
                        "cast (data.passenger_count as integer)",
                        "cast (data.trip_time as integer)",
                        "cast (data.trip_distance as float)",
                        "cast (data.pickup_loc.lat as float) as pickup_loc_lat",
                        "cast (data.pickup_loc.lon as float) as pickup_loc_lon",
                        "cast (data.dropoff_loc.lat as float) as dropoff_loc_lat",
                        "cast (data.dropoff_loc.lon as float) as dropoff_loc_lon",
                    )

        print(self.dff.printSchema())

        self.windowedCounts = self.dff \
            .filter('trip_time > 0') \
            .withWatermark("pickup_time", "30 days") \
            .groupBy("medallion",
            window("pickup_time", "24 hours")) \
            .agg(func.sum('trip_distance').alias('sum_trip_distance'),
                 func.avg('trip_distance').alias('avg_trip_distance'),
                 func.sum('trip_time').alias('sum_trip_time'),
                 func.avg('trip_time').alias('avg_trip_time'),
                 func.sum('passenger_count').alias('sum_passenger_count'),
                 func.avg('passenger_count').alias('avg_passenger_count')
                 )
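
        # Note: awaitTermination() on the console sink below blocks this method, so
        # the Kafka query defined afterwards only starts once the console query ends.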

        print((self.windowedCounts \
              .writeStream \
              .outputMode("complete") \
              .format("console") \
              .option('truncate','false')
              .option('numRows', 20)
              .start()
              .awaitTermination()
              ))

        query = self.windowedCounts \
              .select(to_json(struct("medallion",'window','sum_trip_distance',
                              'avg_trip_distance','sum_trip_time','avg_trip_time',
                              'sum_passenger_count','avg_passenger_count')).alias('value')) \
              .writeStream \
              .format("kafka") \
              .option("kafka.bootstrap.servers", "localhost:9092") \
              .option("topic", "es3") \
              .option("checkpointLocation", "/tmp/kafkachkpnt/")\
              .outputMode('update') \
              .start()
        query.awaitTermination()

# COMMAND ----------

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)


# COMMAND ----------

from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
  StructField("InvoiceNo",StringType(),True),
  StructField("Description",StringType(),True)))
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")).alias("newJSON"))\
  .select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)


# COMMAND ----------
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", kafka_servers) \
  .option("subscribe", in_topic) \
  .option("failOnDataLoss", "false") \
  .load()

df=df.withColumn("value", from_avro("value", jsonFormatSchema)) \
  .select((col("value.timestamp")/milli).alias("time") \
  .cast(TimestampType()),col("value.house_id"),col("value.appliance_id"),col("value.appliance_name"),col("value.power"))

out=df.withWatermark("time", watermark + " seconds") \
  .groupBy(window(col("time"), str(window_converted) + " seconds", str(window_converted//5) + " second"),"house_id","appliance_id") \
  .agg(count("power").alias("c_all"),count(when(col("power") > powerthres, True)).alias("c_duty")) \
  .withColumn("duty_cycle", (col("c_duty")/col("c_all"))).withColumn("time_end", col("window.end")) \
  .drop("window", "c_all", "c_duty")

#query=out.writeStream.outputMode("append").format("console").option("truncate", False).start()

query = out.withColumn("value", to_json(struct("time_end","house_id","appliance_id","duty_cycle"))) \
  .writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", kafka_servers) \
  .option("topic", out_topic) \
  .option("checkpointLocation", "checkpoints") \
  .start()

query.awaitTermination()
Example #23
#!/usr/bin/env python3

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("SimpleStreamingApp").getOrCreate()

to_kafka = spark.read.json('datasets/data1.json').drop('_corrupt_record')

columns = to_kafka.columns
schema = to_kafka.schema

to_kafka \
  .select(F.to_json(F.struct(*columns)).alias('value')) \
  .write \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "localhost:9092") \
  .option("topic", "test_topic") \
  .save()

batch_from_kafka = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "localhost:9092") \
  .option("subscribe", "test_topic") \
  .load()


batch_from_kafka \
  .select(batch_from_kafka['value'].cast("string")) \
  .show(truncate=False)  # assumed completion; the original snippet is cut off here
Example #24
0
df_tags = df_tags.withColumn("timestamp",
                             functions.from_unixtime(df_tags['timestamp']))
df_tags = df_tags.withColumnRenamed('tag', 'value')
df_tags = df_tags.withColumn('event_type', func.lit('tag'))

all_data = df_rating.unionAll(df_tags)
all_data_sorted = all_data.sort(all_data['timestamp'].asc())
all_data_sorted = all_data_sorted.withColumn(
    'key',
    func.concat_ws('|', all_data_sorted['user_id'],
                   all_data_sorted['movie_id'], all_data_sorted['value'],
                   all_data_sorted['timestamp'],
                   all_data_sorted['event_type']))
all_data_sorted = all_data_sorted.withColumn(
    'value_json',
    func.to_json(
        func.struct(all_data_sorted['user_id'], all_data_sorted['movie_id'],
                    all_data_sorted['value'], all_data_sorted['timestamp'],
                    all_data_sorted['event_type'])))

all_data_sorted.selectExpr("CAST(key AS STRING)", "CAST(value_json AS STRING) as value") \
    .write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_server) \
    .option("topic", kafka_topic) \
    .save()

# kafka-console-consumer --bootstrap-server 199.60.17.212:9092 --topic tag_rate_small --from-beginning
# {"user_id":"514","movie_id":"5247","value":"2.5","timestamp":"2018-09-23 19:44:00","event_type":"rate"}
#spark-submit --packages datastax:spark-cassandra-connector:2.3.1-s_2.11,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 data_loading/event_stream_generator.py
Example #25
0
# Select the fields that will be sent to Druid via Kafka
taxiTripsToKafka = taxiTrips.select("trip_id", "taxi_id", "company",
                                    "trip_start_timestamp",
                                    "trip_end_timestamp", "trip_seconds",
                                    "trip_miles", "pickup_community_area",
                                    "dropoff_community_area", "fare", "tips",
                                    "tolls", "extras", "trip_total")

# Enrich the stream with the names of the pickup and dropoff areas and their center points (lat. and long.)
taxiTripsEnrich = taxiTripsToKafka.join(pickupAreas, 'pickup_community_area')\
    .join(dropoffAreas, 'dropoff_community_area')

# Start the query that writes the enriched result to Kafka
queryToKafka = taxiTripsEnrich\
    .select(taxiTripsEnrich["taxi_id"].cast('string').alias("key"),
            to_json(struct("*")).alias("value"))\
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", cfg.kafka_brokers) \
    .option("topic", cfg.kafka_outTopic) \
    .option("checkpointLocation", cfg.checkpointKafka_path) \
    .outputMode("Append") \
    .start()

# Start the query that writes the events to HDFS
queryToHDFS = taxiTrips.writeStream \
        .format("parquet") \
        .trigger(processingTime='15 minutes') \
        .partitionBy("year", "month") \
        .option("path", cfg.trips_path) \
        .option("checkpointLocation", cfg.checkpointHDFS_path) \
        .start()  # assumed completion; the original snippet is cut off after the checkpoint option
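
# Keep the driver alive while both queries (Kafka and HDFS) run. This line is an addition,
# assuming the SparkSession is available as `spark`; the original example does not show it.
spark.streams.awaitAnyTermination()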
Example #26
0
    df.printSchema()

    print([3] * 5)

    # multi_ids_columns = ["tag_ids", "ids"]
    #
    # for column in multi_ids_columns:
    #     df = feature.multionehot(df, column)
    #     df.show()
    #

    # def array_to_string(my_list):
    #     return '[' + ','.join([str(elem) for elem in my_list]) + ']'
    #
    #
    # array_to_string_udf = udf(array_to_string, StringType())
    #
    # df = df.withColumn('categorystr', array_to_string_udf("category")).drop("category")
    # df = df.withColumn('categoryIdStr', array_to_string_udf("category_id")).drop("category_id")
    # df.show()
    # df.withColumn("features", to_json(struct("features"))).write.csv(...)
    df = df.withColumn("category-onehot",
                       pyf.to_json("category-onehot")).withColumn(
                           "category_id-onehot",
                           pyf.to_json("category_id-onehot"))
    df.drop("tag_ids").drop("ids").drop("tag_texts").coalesce(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").mode("overwrite").save("feature.csv")

    df = df.toPandas()
    df.to_csv('test.csv', index=False)
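
 # The method below groups flattened encounter rows by encounter_id and uses f.struct,
 # f.collect_set, f.collect_list and f.to_json to rebuild nested OpenMRS-style JSON
 # fields (encountertype, form, location, encounterproviders, visit, orders, obs)
 # for downstream serialization.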
 def transform_into_openmrs_object(self, encounter_dataframe):
     return encounter_dataframe.groupBy('encounter.encounter_id').agg(
         f.first('patient_id').alias('person_id'),
         f.lit('encounter').alias('type'),
         f.first('encounter.location_id').alias('location_id'),
         f.first('person_uuid').alias('person_uuid'),
         f.col('encounter.encounter_id').cast('string').alias('couch_id'),
         f.first('uuid').alias('uuid'),
         f.first('encounter_datetime').alias('encounterdatetime'),
         f.struct(
             f.first('encounter_type_name').alias('display'),
             f.first('encounter_type_uuid').alias('uuid')).alias(
                 'encountertype'),
         f.struct(
             f.first('form_name').alias('name'),
             f.first('form_uuid').alias('uuid')).alias('form'),
         f.struct(
             f.first('location.location_name').alias('display'),
             f.first('location.location_uuid').alias('uuid')).alias(
                 'location'),
         f.to_json(
             f.collect_set(
                 f.when(
                     f.col('encounter_provider_uuid').isNotNull(),
                     f.struct(
                         f.col('encounter_provider_uuid').alias('uuid'),
                         f.col('encounter_provider.provider_name').alias(
                             'display'),
                         f.struct(
                             f.col('encounter_provider.provider_uuid').
                             alias('uuid'),
                             f.concat_ws(
                                 ' ',
                                 f.col(
                                     'encounter_provider.provider_identifier'
                                 ), f.lit('-'),
                                 f.col('encounter_provider.provider_name')).
                             alias('display')).alias('provider'))))).alias(
                                 'encounterproviders'),
         f.to_json(
             f.struct(
                 f.first('visit_uuid').alias('uuid'),
                 f.first('visit.date_started').alias('dateStarted'),
                 f.first('visit.date_stopped').alias('dateStopped'),
                 f.struct(
                     f.first('visit_type_name').alias('name'),
                     f.first('visit_type_uuid').alias('uuid')).alias(
                         'visitType'),
                 f.struct(
                     f.first('visit.location_name').alias('name'),
                     f.first('visit.location_uuid').alias('uuid')).alias(
                         'location'),
                 f.concat_ws(' ', f.first('visit_type_name'), f.lit('@'),
                             f.first('visit.location_name'), f.lit('-'),
                             f.first('visit.date_started')).alias(
                                 'display'))).alias('visit'),
         f.to_json(
             f.collect_set(
                 f.when(
                     f.col('order_uuid').isNotNull(),
                     f.struct(
                         f.col('order_uuid').alias('uuid'),
                         f.col('order_number').alias('orderNumber'),
                         f.struct(
                             f.col('orders.concept_uuid').alias('uuid'),
                             f.col('orders.concept_name').alias(
                                 'display')).alias('concept'),
                         f.struct(
                             f.col('orders.provider_uuid').alias('uuid'),
                             f.concat_ws(' ', 'orders.provider_identifier',
                                         'orders.provider_name').alias(
                                             'display')).alias('orderer'),
                         f.col('order_action').alias('action'),
                         f.col('orders.date_activated').alias(
                             'dateActivated'),
                         f.col('orders.date_created').alias('dateCreated'),
                         f.col('orders.urgency').alias('urgency'),
                         f.col('order_type_name').alias('type'))).otherwise(
                             None))).alias('orders'),
         f.to_json(
             f.collect_list(
                 f.struct(
                     f.lit('obs_uuid_to_be_included').alias('uuid'),
                     f.col('obs_datetime').alias('obsDatetime'),
                     f.struct(
                         f.col('parent_obs_concept_uuid').alias('uuid'),
                         f.struct(
                             f.col('parent_obs_concept_name').alias(
                                 'display')).alias('name')).alias(
                                     'concept'),
                     f.when(
                         f.col('value_coded').isNotNull(),
                         f.struct(
                             f.col('value_type').alias('type'),
                             f.to_json(
                                 f.struct(
                                     f.col('value_coded_concept_uuid').
                                     alias('uuid'),
                                     f.col('value_coded_concept_name'
                                           ).alias('display'))).
                             alias('value'))).when(
                                 f.col('value_not_coded').isNotNull(),
                                 f.struct(
                                     f.col('value_type').alias('type'),
                                     f.col('value_not_coded').alias(
                                         'value'))).alias('value'),
                     f.when(
                         f.col('groupmembers').isNotNull(),
                         f.col('groupmembers')).alias(
                             'groupMembers')))).alias('obs'),
     ).withColumn('build_date', f.current_timestamp())
    value_df.printSchema()

    #Choose prime customers and compute their total purchase value and earned reward points
    rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \
        .groupBy("value.CustomerCardNo") \
        .agg(sum("value.TotalValue").alias("TotalPurchase"),
             sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"))

    #Rename column
    rewards_df = rewards_df.withColumn("CustomerCardNo", expr("`value.CustomerCardNo`")) \
                           .drop("value.CustomerCardNo")

    #Serialize to JSON format
    kafka_target_df = rewards_df.select(
        expr("CustomerCardNo as key"),
        to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"))

    #Check schema
    rewards_df.printSchema()
    # Alternative statement for kafka target
    # kafka_target_df = rewards_df.selectExpr("value.CustomerCardNo as key",
    #                                                                 "to_json(struct(*)) as value")

    # kafka_target_df.show(truncate=False)

    #Write stream to Kafka topic
    rewards_writer_query = kafka_target_df \
        .writeStream \
        .queryName("Rewards Writer") \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \

Example #30
0
def main(topic):
    # 1. Load Data, Combine keywords, tweet_urls by news_url, Add id
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic)\
        .option('failOnDataLoss', 'false')\
        .option('startingOffsets', 'earliest')\
        .load()
    values = messages.select(messages['value'].cast('string'))
    words = values.select(
        functions.explode(functions.split(values.value, ';')).alias("words"))
    data = words.withColumn('text', functions.split('words',
                                                    ',')).select('text')
    data = data.withColumn('news_id', data['text'][0])
    data = data.withColumn('news_url', data['text'][1])
    print('finish load data')

    # 2. Scrape the news_text and tweets_comments
    data = data.withColumn('news_info', udf_get_news_info(data['news_url']))
    data = data.withColumn('news_title', data['news_info'][0])
    data = data.withColumn('news_text', data['news_info'][1])
    data = data.withColumn('news_image', data['news_info'][2])
    data = data.where(data['news_title'].isNotNull()
                      & (functions.length(data['news_title']) > 0))
    data = data.where(data['news_text'].isNotNull()
                      & (functions.length(data['news_text']) > 0))
    # data = data.where(data['tweets_comment'].isNotNull() & (functions.length(data['tweets_comment']) > 0)) # filter reviews with no text
    print('finish scraping')

    # 3. ML pipeline: Tokenization (with Regular Expression) and Remove Stop Words
    data = data.withColumn('sentiment_scores',
                           udf_sentiment_score(data['news_text']))
    news_regex_tokenizer = RegexTokenizer(inputCol='news_text',
                                          outputCol='news_words',
                                          pattern='[^A-Za-z]+')
    news_stopwords_remover = StopWordsRemover(
        inputCol='news_words',
        outputCol='news_tokens',
        stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
    nlp_pipeline = Pipeline(
        stages=[news_regex_tokenizer, news_stopwords_remover])
    model = nlp_pipeline.fit(data)
    nlp_data = model.transform(data).select('news_id', 'news_title',
                                            'news_text', 'news_image',
                                            'news_tokens', 'sentiment_scores')

    # 4. Select Features
    nlp_data = nlp_data.withColumn('news_tokens',
                                   udf_morphy(nlp_data['news_tokens']))
    # nlp_data = nlp_data.withColumn('tweets_tokens', udf_morphy(nlp_data['tweets_tokens']))
    # nlp_data = nlp_data.select(nlp_data['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens'))
    nlp_data = nlp_data.where(functions.size(nlp_data['news_tokens']) > 0)
    # nlp_data = nlp_data.where(functions.size(nlp_data['tweets_tokens']) > 0)
    # nlp_data_score = nlp_data_score.withColumn('tweets_tokens', functions.split('tweets_tokens', '\s+'))
    nlp_data = nlp_data.withColumn('news_tokens',
                                   functions.concat_ws(' ', 'news_tokens'))
    print('finish scores')

    # 5. Save
    nlp_data = nlp_data.withColumn(
        'dl_value',
        functions.to_json(
            functions.struct([nlp_data[x] for x in nlp_data.columns])))

    stream = nlp_data.select(nlp_data.news_id.alias("key"),
                             nlp_data.dl_value.alias("value"))\
        .writeStream\
        .format('kafka')\
        .outputMode('update')\
        .option('kafka.bootstrap.servers', 'localhost:9092')\
        .option("topic", "mlnews-2")\
        .option("checkpointLocation", "../check")\
        .start()

    # stream = nlp_data.writeStream.format('console').outputMode('update').start()
    stream.awaitTermination()
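
# A plausible entry point (an assumption; the original example does not show how
# main() is invoked): pass the Kafka topic name on the command line.
if __name__ == '__main__':
    import sys
    main(sys.argv[1])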