def queryAndServeWebsockets(args: dict) -> None:
    """Query SparkSession tables and serve the results via WebSocket.

    Args:
        args (dict): Args used to query.

    Returns:
        None
    """
    spark = getRequiredParam(args, 'spark')
    query = getRequiredParam(args, 'query')
    event = args.get("event") or "new_report"
    try:
        df = spark.sql(query)
    except Exception:
        raise Exception('Invalid sql query: %s' % query)
    rdd = df.rdd.collect()

    @asyncio.coroutine
    def sendPayload():
        print('sending data...')
        websocket = yield from websockets.connect('ws://172.17.0.1:4545/socket/websocket')
        data = dict(topic="alerts:lobby", event="phx_join", payload={}, ref=None)
        yield from websocket.send(json.dumps(data))
        for entry in rdd:
            payload = {'value': entry}
            msg = dict(topic="alerts:lobby", event=event, payload=payload, ref=None)
            yield from websocket.send(json.dumps(msg))

    asyncio.get_event_loop().run_until_complete(sendPayload())
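# A hypothetical usage sketch for queryAndServeWebsockets. The temp view name
# and query below are illustrative (not defined in this module), and the
# Phoenix endpoint hardcoded above must be reachable for the coroutine to
# connect:
#
#   spark.read.parquet("/analysis").createOrReplaceTempView("reports")
#   queryAndServeWebsockets({
#       'spark': spark,                        # an active SparkSession
#       'query': 'SELECT value FROM reports',  # any valid Spark SQL query
#       'event': 'new_report',                 # optional; defaults to "new_report"
#   })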
def __handleSetup(self, args):
    """Handle the new setup method of a stream. Deprecated.

    Args:
        args (dict): Arguments used for the setup.

    Returns:
        no return.
    """
    warnings.warn('deprecated', DeprecationWarning)
    streamName = getRequiredParam(args, 'stream')
    shockAction = getRequiredParam(args, 'shock_action')
    stream = self.sources.get(streamName)
    if stream:
        fn = getAction('setup', shockAction)
        stream.setupAction = fn
        stream.setupArgs = args
    else:
        raise Exception('Stream not found!')
def parquetValueIngestion(args: dict) -> SparkDataFrame:
    """Return a parquet ingestion stream ready to be used.

    Args:
        args (dict): dict with options used to mount the stream.

    Returns:
        SparkDataFrame: parquet ingestion dataframe ready to be used.
    """
    spark = getRequiredParam(args, 'spark')
    path = getRequiredParam(args, 'path')
    mySchema = StructType() \
        .add("value", "string") \
        .add("uuid", "string") \
        .add("timestamp", "string") \
        .add("capability", "string")
    return spark.readStream \
        .format('parquet') \
        .schema(mySchema) \
        .option('path', path) \
        .load()
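# A hypothetical usage sketch: Structured Streaming file sources require a
# schema up front, which is why the four-column StructType is declared above.
# The path below is illustrative:
#
#   stream = parquetValueIngestion({
#       'spark': spark,            # an active SparkSession
#       'path': '/data/incoming',  # assumed directory watched for parquet files
#   })
#   stream.isStreaming             # True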
def socketIngestion(args: dict) -> SparkDataFrame:
    """Return a socket ingestion stream ready to be used.

    Args:
        args (dict): dict with options used to mount the stream.

    Returns:
        SparkDataFrame: socket ingestion dataframe ready to be used.
    """
    spark = getRequiredParam(args, 'spark')
    host = getRequiredParam(args, 'host')
    port = getRequiredParam(args, 'port')
    return spark.readStream.format("socket") \
        .option("host", host) \
        .option("port", port) \
        .load()
def kafkaIngestion(args: dict) -> SparkDataFrame:
    """Return a kafka ingestion stream ready to be used.

    Args:
        args (dict): dict with options used to mount the stream.

    Returns:
        SparkDataFrame: kafka ingestion dataframe ready to be used.
    """
    spark = getRequiredParam(args, 'spark')
    topic = getRequiredParam(args, 'topic')
    brokers = getRequiredParam(args, 'brokers')
    return spark.readStream.format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load() \
        .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
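# A hypothetical usage sketch (topic and broker list are illustrative). The
# selectExpr casts are needed because Kafka delivers key and value as binary:
#
#   stream = kafkaIngestion({
#       'spark': spark,               # an active SparkSession
#       'topic': 'interscity',        # assumed topic name
#       'brokers': 'localhost:9092',  # assumed bootstrap servers
#   })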
def __handlePublish(self, args):
    """Handle the new publish method of a stream.

    Args:
        args (dict): Arguments used for the publish.

    Returns:
        no return.
    """
    streamName = getRequiredParam(args, 'stream')
    stream = self.sources.get(streamName)
    shockAction = getRequiredParam(args, 'shock_action')
    if stream:
        fn = getAction('sinks', shockAction)
        stream.publishAction = fn
        stream.publishArgs = args
    else:
        raise Exception('Stream not found!')
def __handleIngestion(self, args):
    """Handle the new ingestion method of a stream.

    Args:
        args (dict): Arguments used for the ingestion.

    Returns:
        no return.
    """
    streamName = getRequiredParam(args, 'stream')
    stream = self.sources.get(streamName)
    shockAction = getRequiredParam(args, 'shock_action')
    if stream:
        args["spark"] = self.spark
        fn = getAction('ingestion', shockAction)
        stream.ingestAction = fn
        stream.ingestArgs = args
    else:
        raise Exception('Stream not found!')
def streamFilter(stream: SparkDataFrame, args: dict) -> SparkDataFrame:
    """Filter stream.

    Args:
        stream (SparkDataFrame): processed stream.
        args (dict): options to be used in the filter.

    Returns:
        SparkDataFrame: filtered stream.
    """
    query = getRequiredParam(args, 'query')
    return stream.where(query)
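# A hypothetical usage sketch: `query` is any boolean SQL expression accepted
# by DataFrame.where; the column name below is illustrative:
#
#   filtered = streamFilter(stream, {'query': "capability = 'temperature'"})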
def __newStream(self, args):
    """Creates a new Shock stream.

    The stream will be registered in the sources dict.

    Args:
        args (dict): Arguments used for the registration.

    Returns:
        no return.
    """
    name = getRequiredParam(args, 'stream')
    st = Stream(name)
    self.registerSource(name, st)
def __startStream(self, args):
    """Starts a stream.

    Args:
        args (dict): Arguments used to start the stream.

    Returns:
        no return.
    """
    streamName = getRequiredParam(args, 'stream')
    stream = self.sources.get(streamName)
    if stream:
        stream.start()
    else:
        raise Exception('Stream not found!')
def __handleAnalyze(self, args):
    """Handle the new analyze method of a stream.

    Args:
        args (dict): Arguments used for the processing.

    Returns:
        no return.
    """
    streamName = getRequiredParam(args, 'stream')
    stream = self.sources.get(streamName)
    shockAction = getRequiredParam(args, 'shock_action')
    if stream:
        fn = getAction('analyze', shockAction)
        stream.analyzeAction = fn
        stream.analyzeArgs = args
    else:
        raise Exception('Stream not found!')
def __flush(self, args):
    """Flushes pending actions. Used for sending websockets. Deprecated.

    Args:
        args (dict): Arguments used by the flush strategy.

    Returns:
        no return.
    """
    warnings.warn('deprecated', DeprecationWarning)
    args["spark"] = self.spark
    strategy = getRequiredParam(args, 'strategy')
    try:
        fn = getAction('flushes', strategy)
    except Exception:
        raise Exception('Invalid flush strategy!')
    fn(args)
def readAndServeWebsockets(args: dict) -> None:
    """Publish parquet results written in /analysis via websocket.

    Args:
        args (dict): dict with options used to read and publish.

    Returns:
        None
    """
    spark = getRequiredParam(args, 'spark')
    path = args.get("path") or "/analysis"
    event = args.get("event") or "new_report"
    sch = interscitySchema()
    try:
        df = spark.read.parquet(path)
    except Exception:
        df = spark.createDataFrame([], sch)  # empty df
        return
    rdd = df.rdd.collect()

    @asyncio.coroutine
    def sendPayload():
        websocket = yield from websockets.connect('ws://172.17.0.1:4545/socket/websocket')
        data = dict(topic="alerts:lobby", event="phx_join", payload={}, ref=None)
        yield from websocket.send(json.dumps(data))
        for entry in rdd:
            payload = {
                'uuid': entry.uuid,
                'capability': entry.capability,
                'timestamp': entry.timestamp,
                'value': entry.value
            }
            msg = dict(topic="alerts:lobby", event=event, payload=payload, ref=None)
            yield from websocket.send(json.dumps(msg))

    asyncio.get_event_loop().run_until_complete(sendPayload())
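# A hypothetical usage sketch, assuming /analysis holds parquet output from an
# earlier analyze step and the websocket endpoint above is reachable:
#
#   readAndServeWebsockets({
#       'spark': spark,          # an active SparkSession
#       'path': '/analysis',     # optional; defaults to "/analysis"
#       'event': 'new_report',   # optional; defaults to "new_report"
#   })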