Example #1
    def read_input(self):
        config = self.config
        spark = self.spark

        kafka_server = config["kafka_server"]
        kafka_topic = config["kafka_topic"]
        schema_input = config["schema_input"]

        # Read the raw Kafka stream and keep only the message value as a string
        startInput = (spark
            .readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", kafka_server)
            .option("subscribe", kafka_topic)
            .option("startingOffsets", "earliest")
            .load()
            .selectExpr("CAST(value AS STRING) as value")
          )

        # Infer the expected input schema from the sample record in the config
        inputAnomalySchema = schema_of_json(F.lit(json.dumps(schema_input)))

        # Parse the JSON payload and normalise the @timestamp column
        return (
            startInput.withColumn("data", from_json("value", inputAnomalySchema))
            .select('data.*')
            .withColumn("timestamp", to_timestamp("@timestamp"))
            .withColumn(
                "@timestamp", date_format('timestamp', constants.TIMESTAMP_FORMAT)
            )
        )
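
For context, a minimal sketch of the config dictionary this method expects; the broker, topic, and sample record below are illustrative placeholders, not values from the original project:

# Hypothetical config for read_input(); the keys mirror the lookups above,
# the values are placeholders only.
config = {
    "kafka_server": "localhost:9092",
    "kafka_topic": "anomaly-events",
    # schema_input is a sample record: schema_of_json() infers the struct from it
    "schema_input": {"@timestamp": "2021-01-01T00:00:00.000Z", "sensor": "s1", "value": 0.0},
}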
Example #2

import json
from pyspark.sql.functions import schema_of_json

def get_schema_from_json(col_name):
  # get the json string and remove the header values
  # (pivotted_df is assumed to be a DataFrame defined elsewhere in the module)
  json_typed = json.loads(pivotted_df.select(col_name).first()[0])
  del json_typed['header']

  # create the schema
  return schema_of_json(json.dumps(json_typed))
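
A hedged usage sketch for the helper above: derive the schema once, then parse the JSON column with from_json. pivotted_df comes from the snippet; the column name "payload" is only an illustrative assumption:

from pyspark.sql.functions import from_json, col

# "payload" is a placeholder name for the column holding the raw JSON string
payload_schema = get_schema_from_json("payload")
parsed_df = pivotted_df.withColumn("payload", from_json(col("payload"), payload_schema))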
Example #3
	  kafka_to_delta.py \
>log/kafka_to_delta.log 2>&1 &
"""

import pyspark.sql.functions as f
from pyspark.sql.functions import col

spark = get_spark()

# Read the raw data from Kafka
kafka_reader = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("subscribe", topic) \
    .load()

# Derive the schema from a sample JSON record (every field is inferred as string, hence the casts below)
schema = f.schema_of_json(
    """{"userId":"44303","movieId":"3338","rating":"3.5","timestamp":"2020-05-06T09:40:14.603+08:00"}"""
)

rating_df = kafka_reader \
        .withColumn("value", col("value").cast("string")) \
        .withColumn("value", f.from_json("value", schema)) \
        .selectExpr("value.movieId",
                    "cast(value.rating as float)",
                    "value.userId",
                    "cast(value.timestamp as timestamp)",
                    "timestamp as kafka_timestamp",
                    "to_date(value.timestamp) as dt"
                    )


query=rating_df.writeStream\
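
The snippet breaks off at the writeStream call. A minimal sketch of how the Delta sink could be wired up, assuming placeholder output and checkpoint paths (none of these options are taken from the original script):

# Hypothetical continuation; the original sink configuration is not shown.
query = rating_df.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", "/tmp/checkpoints/kafka_to_delta") \
    .partitionBy("dt") \
    .start("/tmp/delta/ratings")

query.awaitTermination()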
Example #4
# COMMAND ----------

df = spark.readStream.format('delta').load(untappd_raw_delta_path)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### extract venues

# COMMAND ----------

from pyspark.sql.functions import col, json_tuple, from_json, schema_of_json

schema = schema_of_json(
    '''{"venue_id":9917985,"venue_name":"Untappd at Home","venue_slug":"untappd-at-home","primary_category_key":"Residence","primary_category":"Residence","parent_category_id":"4e67e38e036454776db1fb3a","categories":{"count":1,"items":[{"category_key":"home_private","category_name":"Home (private)","category_id":"4bf58dd8d48988d103941735","is_primary":true}]},"location":{"venue_address":"","venue_city":"","venue_state":"Everywhere","venue_country":"United States","lat":34.2347,"lng":-77.9482},"contact":{"twitter":"","venue_url":""},"foursquare":{"foursquare_id":"5e7b4d99c91df60008e8b168","foursquare_url":"https://4sq.com/3bDWYuq"},"venue_icon":{"sm":"https://untappd.akamaized.net/venuelogos/venue_9917985_b3a5d245_bg_64.png","md":"https://untappd.akamaized.net/venuelogos/venue_9917985_b3a5d245_bg_88.png","lg":"https://untappd.akamaized.net/venuelogos/venue_9917985_b3a5d245_bg_176.png?v=1"},"is_verified":true}'''
)
df = df.withColumn("venue", from_json(df.venue, schema))

# COMMAND ----------
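
Since from_json turned venue into a struct, nested fields from the sample record above can be selected directly. A small sketch keeping a subset of the fields present in the sample JSON (checkin_id is assumed to exist on df, as the badges cell below uses it):

df_venues = df.select(
    col("checkin_id"),
    col("venue.venue_id"),
    col("venue.venue_name"),
    col("venue.location.venue_country").alias("venue_country"),
    col("venue.location.lat").alias("venue_lat"),
    col("venue.location.lng").alias("venue_lng"),
)

# COMMAND ----------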

# MAGIC %md
# MAGIC ### Badges

# COMMAND ----------

from pyspark.sql.functions import explode

# One row per badge earned in the check-in: explode the badge items array
df_badges = df.select(df.checkin_id, df.badges.count.alias('badge_count'),
                      df.badges.retro_status.alias('retro'),
                      explode(df.badges.items).alias('items'))
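
A possible follow-up cell flattening the exploded badge struct; selecting items.* avoids guessing the exact field names inside each badge item, since they are not shown in this notebook:

# COMMAND ----------

# Hypothetical follow-up: one flat row per badge, with all item fields expanded
df_badges_flat = df_badges.select("checkin_id", "badge_count", "retro", "items.*")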