Example 1
def main(step_run_ref_filepath, pipeline_zip):
    # Extract any zip files to a temporary directory and add that temporary directory
    # to the site path so the contained files can be imported.
    #
    # We can't rely on pip or other packaging tools because the zipped files might not
    # even be Python packages.
    with tempfile.TemporaryDirectory() as tmp:

        print('Extracting {}'.format(pipeline_zip))
        with zipfile.ZipFile(pipeline_zip) as zf:
            zf.extractall(tmp)
        site.addsitedir(tmp)

        print('Loading step run ref')
        # We can use regular local filesystem APIs to access DBFS inside the Databricks runtime.
        with open(step_run_ref_filepath, 'rb') as handle:
            step_run_ref = pickle.load(handle)

        print('Step run ref:')
        print(step_run_ref)

        print('Setting up storage credentials')
        setup_storage(step_run_ref)

        print('Running pipeline')
        events = list(run_step_from_ref(step_run_ref))

    print('Saving events to DBFS')
    events_filepath = os.path.dirname(
        step_run_ref_filepath) + '/' + PICKLED_EVENTS_FILE_NAME
    with open(events_filepath, 'wb') as handle:
        pickle.dump(serialize_value(events), handle)
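
These main functions are the entry points that run remotely: the step launcher uploads the pickled step run ref (and, for Databricks, a zip of the job code), then invokes a script like this on the cluster. Below is a minimal sketch of the command-line shim that would call the function above; the argument order and names are assumptions, not taken from the source.

if __name__ == "__main__":
    import sys

    # Hypothetical CLI shim: argv[1] is the path of the pickled step run ref,
    # argv[2] is the uploaded code zip. Both positions are assumptions.
    main(sys.argv[1], sys.argv[2])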
Example 2
def main(step_run_ref_bucket, s3_dir_key):
    # Download and unpickle the step run ref that the plan process uploaded to S3.
    session = boto3.client("s3")
    file_manager = S3FileManager(session, step_run_ref_bucket, "")
    file_handle = S3FileHandle(step_run_ref_bucket, s3_dir_key)
    step_run_ref_data = file_manager.read_data(file_handle)

    step_run_ref = pickle.loads(step_run_ref_data)

    events_bucket = step_run_ref_bucket
    events_s3_key = os.path.dirname(
        s3_dir_key) + "/" + PICKLED_EVENTS_FILE_NAME

    def put_events(events):
        # Overwrite the pickled events object in S3 so the plan process can
        # read back the events emitted so far.
        file_obj = io.BytesIO(pickle.dumps(events))
        session.put_object(Body=file_obj,
                           Bucket=events_bucket,
                           Key=events_s3_key)

    # Set up a thread to handle writing events back to the plan process, so execution doesn't get
    # blocked on remote communication
    events_queue = Queue()
    event_writing_thread = Thread(
        target=event_writing_loop,
        kwargs=dict(events_queue=events_queue, put_events_fn=put_events),
    )
    event_writing_thread.start()

    with DagsterInstance.ephemeral() as instance:
        try:
            for event in run_step_from_ref(step_run_ref, instance):
                events_queue.put(event)
        finally:
            # Always signal the writer thread to stop and wait for it to flush,
            # even if the step raised.
            events_queue.put(DONE)
            event_writing_thread.join()
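
event_writing_loop and the DONE sentinel are referenced here but not defined in these snippets. A plausible sketch, assuming the loop simply drains the queue and re-writes the accumulated event list via put_events_fn until it sees the sentinel:

DONE = object()  # assumed sentinel matching the one put on the queue above


def event_writing_loop(events_queue, put_events_fn):
    # Assumed implementation: accumulate events as they arrive and re-write the
    # full list after each one, so the plan process sees progress incrementally.
    all_events = []
    while True:
        event_or_done = events_queue.get()
        if event_or_done is DONE:
            break
        all_events.append(event_or_done)
        put_events_fn(all_events)
    # One final write in case the last events arrived just before DONE.
    put_events_fn(all_events)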
Example 3
def main(step_run_ref_filepath, pipeline_zip):
    # Extract any zip files to a temporary directory and add that temporary directory
    # to the site path so the contained files can be imported.
    #
    # We can't rely on pip or other packaging tools because the zipped files might not
    # even be Python packages.
    with tempfile.TemporaryDirectory() as tmp:

        with zipfile.ZipFile(pipeline_zip) as zf:
            zf.extractall(tmp)
        site.addsitedir(tmp)

        # We can use regular local filesystem APIs to access DBFS inside the Databricks runtime.
        with open(step_run_ref_filepath, "rb") as handle:
            step_run_ref = pickle.load(handle)

        setup_storage(step_run_ref)

        with DagsterInstance.ephemeral() as instance:
            events = list(run_step_from_ref(step_run_ref, instance))

    events_filepath = os.path.dirname(
        step_run_ref_filepath) + "/" + PICKLED_EVENTS_FILE_NAME
    with open(events_filepath, "wb") as handle:
        pickle.dump(serialize_value(events), handle)
Example 4
def main(step_run_ref_path):
    # Load the pickled step run ref from the local filesystem.
    file_manager = LocalFileManager(".")
    file_handle = LocalFileHandle(step_run_ref_path)
    step_run_ref = pickle.loads(file_manager.read_data(file_handle))

    # Run the step and pickle the emitted events next to the step run ref,
    # where the plan process expects to find them.
    events = list(run_step_from_ref(step_run_ref))
    events_out_path = os.path.join(os.path.dirname(step_run_ref_path),
                                   PICKLED_EVENTS_FILE_NAME)
    with open(events_out_path, "wb") as events_file:
        pickle.dump(events, events_file)
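
The read-back side is not shown; presumably the plan process waits for the pickled events file to appear next to the step run ref and unpickles it. A minimal sketch under that assumption (the helper name and polling logic are hypothetical; PICKLED_EVENTS_FILE_NAME is the same constant used by the snippet above):

import os
import pickle
import time


def wait_for_events(step_run_ref_path, timeout=60.0, poll_interval=0.5):
    # Hypothetical plan-process helper: poll for the events file written by
    # main() above, then load and return the pickled event list.
    events_path = os.path.join(os.path.dirname(step_run_ref_path),
                               PICKLED_EVENTS_FILE_NAME)
    deadline = time.time() + timeout
    while not os.path.exists(events_path):
        if time.time() > deadline:
            raise RuntimeError("timed out waiting for {}".format(events_path))
        time.sleep(poll_interval)
    with open(events_path, "rb") as events_file:
        return pickle.load(events_file)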
Example 5
def main(step_run_ref_path: str) -> None:
    # Load the pickled step run ref from the local filesystem.
    file_manager = LocalFileManager(".")
    file_handle = LocalFileHandle(step_run_ref_path)
    step_run_ref = pickle.loads(file_manager.read_data(file_handle))

    # Run the step against a throwaway instance and pickle the emitted events
    # next to the step run ref, where the plan process expects to find them.
    with DagsterInstance.ephemeral() as instance:
        events = list(run_step_from_ref(step_run_ref, instance))
        events_out_path = os.path.join(os.path.dirname(step_run_ref_path),
                                       PICKLED_EVENTS_FILE_NAME)
        with open(events_out_path, "wb") as events_file:
            pickle.dump(events, events_file)
Example 6
def main(step_run_ref_bucket, s3_dir_key):
    session = boto3.client('s3')
    file_manager = S3FileManager(session, step_run_ref_bucket, '')
    file_handle = S3FileHandle(step_run_ref_bucket, s3_dir_key)
    step_run_ref_data = file_manager.read_data(file_handle)

    step_run_ref = pickle.loads(step_run_ref_data)

    events = list(run_step_from_ref(step_run_ref))
    file_obj = io.BytesIO(pickle.dumps(events))
    events_key = os.path.dirname(s3_dir_key) + '/' + PICKLED_EVENTS_FILE_NAME
    session.put_object(Body=file_obj,
                       Bucket=step_run_ref_bucket,
                       Key=events_key)
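
The corresponding read is not shown; presumably the plan process downloads the same key and unpickles it. A minimal sketch under that assumption (the helper name is hypothetical; PICKLED_EVENTS_FILE_NAME is the same constant used above):

import os
import pickle


def get_events_from_s3(s3_client, bucket, s3_dir_key):
    # Hypothetical plan-process counterpart to main() above: fetch and unpickle
    # the event list written next to the step run ref in S3.
    events_key = os.path.dirname(s3_dir_key) + '/' + PICKLED_EVENTS_FILE_NAME
    response = s3_client.get_object(Bucket=bucket, Key=events_key)
    return pickle.loads(response['Body'].read())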
Example 7
def main(step_run_ref_path: str) -> None:
    file_manager = LocalFileManager(".")
    file_handle = LocalFileHandle(step_run_ref_path)
    step_run_ref = pickle.loads(file_manager.read_data(file_handle))

    all_events: List[EventLogEntry] = []

    try:
        # Forward every event the step emits into all_events via the listener callback.
        instance = external_instance_from_step_run_ref(
            step_run_ref, event_listener_fn=all_events.append)
        # consume entire step iterator
        list(run_step_from_ref(step_run_ref, instance))
    finally:
        events_out_path = os.path.join(os.path.dirname(step_run_ref_path),
                                       PICKLED_EVENTS_FILE_NAME)
        with open(events_out_path, "wb") as events_file:
            pickle.dump(serialize_value(all_events), events_file)
Example 8
def main(
    step_run_ref_filepath,
    setup_filepath,
    dagster_job_zip,
):
    # Extract any zip files to a temporary directory and add that temporary directory
    # to the site path so the contained files can be imported.
    #
    # We can't rely on pip or other packaging tools because the zipped files might not
    # even be Python packages.
    with tempfile.TemporaryDirectory() as tmp:

        with zipfile.ZipFile(dagster_job_zip) as zf:
            zf.extractall(tmp)
        site.addsitedir(tmp)

        # We can use regular local filesystem APIs to access DBFS inside the Databricks runtime.
        with open(setup_filepath, "rb") as handle:
            databricks_config = pickle.load(handle)

        # sc and dbutils are globally defined in the Databricks runtime.
        databricks_config.setup(dbutils, sc)  # noqa pylint: disable=undefined-variable

        with open(step_run_ref_filepath, "rb") as handle:
            step_run_ref = pickle.load(handle)
        print("Running dagster job")  # noqa pylint: disable=print-call

        events_filepath = os.path.dirname(
            step_run_ref_filepath) + "/" + PICKLED_EVENTS_FILE_NAME

        def put_events(events):
            with open(events_filepath, "wb") as handle:
                pickle.dump(serialize_value(events), handle)

        # Set up a thread to handle writing events back to the plan process, so execution doesn't get
        # blocked on remote communication
        events_queue = Queue()
        event_writing_thread = Thread(
            target=event_writing_loop,
            kwargs=dict(events_queue=events_queue, put_events_fn=put_events),
        )
        event_writing_thread.start()

        with StringIO() as stderr, StringIO() as stdout, redirect_stderr(
                stderr), redirect_stdout(stdout):
            try:
                instance = external_instance_from_step_run_ref(
                    step_run_ref, event_listener_fn=events_queue.put)
                # consume iterator
                list(run_step_from_ref(step_run_ref, instance))
            except Exception:
                # ensure that exceptions make their way into stdout
                traceback.print_exc()
                raise
            finally:
                events_queue.put(DONE)
                event_writing_thread.join()
                # write final stdout and stderr
                with open(
                        os.path.dirname(step_run_ref_filepath) + "/stderr",
                        "wb") as handle:
                    stderr_str = stderr.getvalue()
                    sys.stderr.write(stderr_str)
                    handle.write(stderr_str.encode())
                with open(
                        os.path.dirname(step_run_ref_filepath) + "/stdout",
                        "wb") as handle:
                    stdout_str = stdout.getvalue()
                    sys.stdout.write(stdout_str)
                    handle.write(stdout_str.encode())
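
The stdout/stderr files and the pickled events written above are read back by the launcher outside of this script. A minimal sketch of that read-back, assuming the caller can reach the same paths with ordinary filesystem APIs (e.g., when testing locally); on a real Databricks deployment the launcher would fetch these through the DBFS API instead, and the helper name here is hypothetical:

import os


def read_captured_output(step_run_ref_filepath):
    # Hypothetical helper: read back the stdout/stderr files that main() writes
    # next to the pickled step run ref.
    base_dir = os.path.dirname(step_run_ref_filepath)
    output = {}
    for name in ("stdout", "stderr"):
        with open(base_dir + "/" + name, "rb") as handle:
            output[name] = handle.read().decode()
    return output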