Example #1
import logging

import click

# get_configurator, AppSparkContext and StreamingPipeline are project-local helpers; their imports were stripped from this extract.
@click.command()  # decorators assumed from the signature; not shown in the extract
@click.pass_context
def stream_pipeline(context: click.core.Context):
    logging.info("Kafka -> Spark -> MongoDB")
    project_root = context.obj['PROJECT_ROOT']
    configurator = get_configurator(project_root)._configuration_data
    spark_context = AppSparkContext(configurator)  # renamed: the original reassigned `context`, shadowing the click context
    pipeline = StreamingPipeline(configurator, spark_context)
    pipeline.start_streaming('my_topic')
    spark_context.stop_spark_context()
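
For context, here is a minimal sketch of the wiring this command appears to assume: a click group that seeds context.obj['PROJECT_ROOT'] before dispatching to subcommands. The group name `cli`, the `--project-root` option, and the entry-point module are hypothetical, not taken from the host project.

import click

@click.group()
@click.option('--project-root', default='.', help='Project root stored for subcommands')
@click.pass_context
def cli(context: click.core.Context, project_root: str):
    # Seed the shared object that the pipeline commands read PROJECT_ROOT from.
    context.ensure_object(dict)
    context.obj['PROJECT_ROOT'] = project_root

cli.add_command(stream_pipeline)  # exposed as `stream-pipeline` on the CLI

if __name__ == '__main__':
    cli()  # e.g. python app.py --project-root /opt/app stream-pipeline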
Example #2
import logging

import click

# get_configurator, AppSparkContext and CronTab are project-local helpers; their imports were stripped from this extract.
@click.command()  # decorators assumed from the signature; not shown in the extract
@click.option('--cron', is_flag=True)
@click.pass_context
def io_pipeline(context: click.core.Context, cron: bool):
    logging.info("IO -> Spark -> MongoDB")
    project_root = context.obj['PROJECT_ROOT']
    configurator = get_configurator(project_root)._configuration_data
    spark_context = AppSparkContext(configurator)  # renamed: the original reassigned `context`, shadowing the click context

    # TODO pass params
    if cron:
        # Schedule process_inquiries via the project's CronTab helper.
        CronTab(spark_context.process_inquiries, configurator).start()

    # TEST PIPELINE
    # df = spark_context.read_file('/amazon/data/metadata.json.gz')
    # df = df.limit(10)
    # spark_context.save(df, 'mydb', 'spark')

    # READ IN DATA
    metadata = spark_context.read_file('/amazon/data/metadata.json.gz')
    review = spark_context.read_file('/amazon/data/item_dedup.json.gz')

    # MAIN PIPELINE
    spark_context.process_inquiries(review, metadata)
    spark_context.stop_spark_context()
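
As a quick way to exercise the command wiring, click's test runner can invoke io_pipeline directly, seeding the context object it expects. This is a sketch of the invocation only; an actual run would need the project's Spark and MongoDB setup, and the PROJECT_ROOT path here is hypothetical.

from click.testing import CliRunner

runner = CliRunner()
# `obj` seeds context.obj directly, standing in for the group sketched after Example #1.
result = runner.invoke(io_pipeline, ['--cron'], obj={'PROJECT_ROOT': '/opt/app'})
print(result.exit_code, result.output)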