import logging

import click

# get_configurator, AppSparkContext, StreamingPipeline, and CronTab are
# project-local helpers imported elsewhere in this module.


def stream_pipeline(context: click.core.Context):
    """Run the streaming pipeline: Kafka -> Spark -> MongoDB."""
    logging.info("Kafka -> Spark -> MongoDB")
    project_root = context.obj['PROJECT_ROOT']
    # NOTE: reaches into the configurator's private attribute.
    configurator = get_configurator(project_root)._configuration_data
    # Use a distinct name so the click context is not shadowed.
    spark_context = AppSparkContext(configurator)
    pipeline = StreamingPipeline(configurator, spark_context)
    pipeline.start_streaming('my_topic')
    spark_context.stop_spark_context()
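
# Minimal sketch of one way to expose stream_pipeline on a click group.
# The group name `cli` and the `--project-root` option are assumptions
# for illustration, not taken from the original module; the real entry
# point may be wired elsewhere in the project.
@click.group()
@click.option('--project-root', default='.', show_default=True,
              help='Path to the project root.')
@click.pass_context
def cli(context: click.core.Context, project_root: str) -> None:
    # Seed context.obj so subcommands can read PROJECT_ROOT, as the
    # pipeline functions above expect.
    context.ensure_object(dict)
    context.obj['PROJECT_ROOT'] = project_root


@cli.command('stream')
@click.pass_context
def stream(context: click.core.Context) -> None:
    stream_pipeline(context)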
def io_pipeline(context: click.core.Context, cron: bool):
    """Run the batch pipeline: IO -> Spark -> MongoDB."""
    logging.info("IO -> Spark -> MongoDB")
    project_root = context.obj['PROJECT_ROOT']
    configurator = get_configurator(project_root)._configuration_data
    spark_context = AppSparkContext(configurator)

    # TODO: pass params.
    if cron:
        # Additionally schedule process_inquiries as a recurring job;
        # the one-off run below still executes.
        CronTab(spark_context.process_inquiries, configurator).start()

    # Test pipeline (kept for reference):
    # df = spark_context.read_file('/amazon/data/metadata.json.gz')
    # df = df.limit(10)
    # spark_context.save(df, 'mydb', 'spark')

    # Read in data.
    metadata = spark_context.read_file('/amazon/data/metadata.json.gz')
    review = spark_context.read_file('/amazon/data/item_dedup.json.gz')

    # Main pipeline.
    spark_context.process_inquiries(review, metadata)
    spark_context.stop_spark_context()
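
# Sketch continuing the hypothetical wiring above: the `--cron` flag
# mirrors io_pipeline's `cron: bool` parameter; the flag name and the
# module filename in the examples are assumptions.
@cli.command('io')
@click.option('--cron', is_flag=True,
              help='Also schedule process_inquiries via CronTab.')
@click.pass_context
def io_cmd(context: click.core.Context, cron: bool) -> None:
    io_pipeline(context, cron)


if __name__ == '__main__':
    # Example invocations (module name is hypothetical):
    #   python pipeline_cli.py --project-root /path/to/project stream
    #   python pipeline_cli.py --project-root /path/to/project io --cron
    cli()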