Code Example #1
File: event_log.py Project: databill86/dagster
def _postgres_event_watcher_event_loop(conn_string, queue, run_id_dict):
    init_called = False
    queue.put(EventWatcherProcessStartedEvent())
    try:
        for notif in await_pg_notifications(conn_string,
                                            channels=[CHANNEL_NAME],
                                            timeout=POLLING_CADENCE,
                                            yield_on_timeout=True):
            if not init_called:
                init_called = True
                queue.put(EventWatcherStart())

            if notif is not None:
                event_record = deserialize_json_to_dagster_namedtuple(
                    notif.payload)
                if event_record.run_id in run_id_dict:
                    queue.put(EventWatcherEvent(event_record))
            else:
                # The polling window has timed out
                pass

    except Exception as e:  # pylint: disable=broad-except
        queue.put(EventWatchFailed(message=str(e)))
    finally:
        queue.put(EventWatcherEnd())
Code Example #2
def test_kitchen_sink():
    kitchen_sink = List[Dict({
        'opt_list_of_int':
        Field(List[int], is_optional=True),
        'tuple_of_things':
        Field(Tuple[int, str]),
        'nested_dict':
        Field(
            Dict({
                'list_list':
                Field(List[List[int]]),
                'nested_selector':
                Field(
                    Selector({
                        'some_field': Field(int),
                        'set': Field(Optional[Set[bool]])
                    })),
            })),
    })]

    kitchen_sink_meta = meta_from_dagster_type(kitchen_sink)

    rehydrated_meta = deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(kitchen_sink_meta))
    assert kitchen_sink_meta == rehydrated_meta
Code Example #3
def handle_execute_plan_result_raw(res):
    res_data = res['data']['executePlan']

    res_type = res_data['__typename']

    handle_error_states(res_type, res_data)

    if res_type == 'ExecutePlanSuccess':
        raw_event_records = [
            DagsterEventRecord(
                event_record.error_info,
                event_record.message,
                event_record.level,
                event_record.user_message,
                event_record.run_id,
                event_record.timestamp,
                event_record.step_key,
                event_record.pipeline_name,
                event_record.dagster_event,
            ) for event_record in [
                deserialize_json_to_dagster_namedtuple(e)
                for e in res_data['rawEventRecords']
            ]
        ]
        return raw_event_records

    raise DagsterGraphQLClientError('Unexpected result type')
Code Example #4
    def get_logs_for_run(self, run_id, cursor=-1):
        check.str_param(run_id, 'run_id')
        check.int_param(cursor, 'cursor')
        check.invariant(
            cursor >= -1,
            'Don\'t know what to do with negative cursor {cursor}'.format(cursor=cursor),
        )

        events = []
        if not os.path.exists(self.filepath_for_run_id(run_id)):
            return events

        cursor += 1  # adjust from 0-based to 1-based offset
        try:
            with self._connect(run_id) as conn:
                results = conn.cursor().execute(FETCH_EVENTS_SQL, (str(cursor),)).fetchall()
        except sqlite3.Error as err:
            six.raise_from(EventLogInvalidForRun(run_id=run_id), err)

        try:
            for (json_str,) in results:
                events.append(
                    check.inst_param(
                        deserialize_json_to_dagster_namedtuple(json_str), 'event', EventRecord
                    )
                )
        except (seven.JSONDecodeError, check.CheckError) as err:
            six.raise_from(EventLogInvalidForRun(run_id=run_id), err)

        return events
Code Example #5
    def _load_schedules(self):
        schedules_dir = os.path.join(self._base_dir)
        utils.mkdir_p(schedules_dir)

        for repository_name in os.listdir(schedules_dir):
            if not os.path.isdir(os.path.join(schedules_dir, repository_name)):
                continue

            self._schedules[repository_name] = {}
            for file in os.listdir(os.path.join(schedules_dir,
                                                repository_name)):
                if not file.endswith('.json'):
                    continue
                file_path = os.path.join(schedules_dir, repository_name, file)
                with open(file_path) as data:
                    try:
                        schedule = deserialize_json_to_dagster_namedtuple(
                            data.read())
                        self._schedules[repository_name][
                            schedule.name] = schedule

                    except Exception as ex:  # pylint: disable=broad-except
                        warnings.warn(
                            'Could not parse dagster schedule from {file_name} in {dir_name}. '
                            '{ex}: {msg}'.format(
                                file_name=file,
                                dir_name=self._base_dir,
                                ex=type(ex).__name__,
                                msg=ex,
                            ))
                        continue
Code Example #6
File: event_log.py Project: pombredanne/dagster
def watcher_thread(conn_string, queue, handlers_dict, dict_lock, watcher_thread_exit):
    done = False
    while not done and not watcher_thread_exit.is_set():
        event_list = []
        while not queue.empty():
            try:
                event_list.append(queue.get_nowait())
            except Empty:
                pass

        for event in event_list:
            if not isinstance(event, EventWatcherThreadEvents):
                warnings.warn(
                    'Event watcher thread got unexpected event {event}'.format(event=event)
                )
                continue
            if isinstance(event, EventWatcherThreadNoopEvents):
                continue
            elif isinstance(event, EventWatcherThreadEndEvents):
                done = True
            else:
                assert isinstance(event, EventWatcherEvent)
                run_id, index_str = event.payload
                index = int(index_str)
                with dict_lock:
                    handlers = handlers_dict.get(run_id, [])

                with get_conn(conn_string).cursor() as curs:
                    curs.execute(SELECT_EVENT_LOG_SQL, (index,))
                    dagster_event = deserialize_json_to_dagster_namedtuple(curs.fetchone()[0])

                for (cursor, callback) in handlers:
                    if index >= cursor:
                        callback(dagster_event)
        time.sleep(WATCHER_POLL_INTERVAL)
Code Example #7
File: test_event_log.py Project: databill86/dagster
def test_basic_event_store():
    @solid
    def return_one(_):
        return 1

    def _solids():
        return_one()

    events, _result = gather_events(_solids)

    event_log_storage = PostgresEventLogStorage.create_nuked_storage(
        get_test_conn_string())

    for event in events:
        event_log_storage.store_event(event)

    rows = fetch_all_events(get_test_conn_string())

    out_events = list(
        map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))

    assert list(map(lambda e: e.dagster_event.event_type, out_events)) == [
        DagsterEventType.PIPELINE_START,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.STEP_START,
        DagsterEventType.STEP_OUTPUT,
        DagsterEventType.STEP_SUCCESS,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.PIPELINE_SUCCESS,
    ]
Code Example #8
def test_dead_events():
    snapshot = path.join(path.dirname(path.realpath(__file__)), 'dead_events.txt')
    with open(snapshot, 'r') as fd:
        objs = []
        for line in fd.readlines():
            obj = deserialize_json_to_dagster_namedtuple(line)
            assert obj is not None
            objs.append(obj)

    assert len(objs) == 6
Code Example #9
def test_basic_solid_definition():
    @solid
    def noop_solid(_):
        pass

    solid_snap = build_solid_def_snap(noop_solid)

    assert solid_snap
    assert (deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(solid_snap)) == solid_snap)
Code Example #10
    def get_schedule_by_name(self, repository, schedule_name):
        check.inst_param(repository, 'repository', RepositoryDefinition)
        check.str_param(schedule_name, 'schedule_name')

        query = (db.select(
            [ScheduleTable.c.schedule_body]).select_from(ScheduleTable).where(
                ScheduleTable.c.repository_name == repository.name).where(
                    ScheduleTable.c.schedule_name == schedule_name))

        rows = self.execute(query)
        return deserialize_json_to_dagster_namedtuple(
            rows[0][0]) if len(rows) else None
Code Example #11
    def _load_historic_runs(self):
        for filename in glob.glob(os.path.join(self._base_dir, '*.json')):
            with open(filename, 'r') as fd:
                try:
                    pipeline_run = deserialize_json_to_dagster_namedtuple(
                        fd.read())
                    self.add_run(pipeline_run)
                except Exception as ex:  # pylint: disable=broad-except
                    print(
                        'Could not load pipeline run from {filename}, continuing.\n  Original '
                        'exception: {ex}: {msg}'.format(filename=filename,
                                                        ex=type(ex).__name__,
                                                        msg=ex))
                    continue
Code Example #12
    def get_run_by_id(self, run_id):
        '''Get a run by its id.

        Args:
            run_id (str): The id of the run

        Returns:
            Optional[PipelineRun]
        '''
        check.str_param(run_id, 'run_id')

        query = db.select([RunsTable.c.run_body]).where(RunsTable.c.run_id == run_id)
        rows = self.execute(query)
        return deserialize_json_to_dagster_namedtuple(rows[0][0]) if len(rows) else None
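Note: the run and event storage snippets in this listing only show the read side. As a rough sketch (not taken from the Dagster source; the import path, table layout, and helper name are assumptions made for illustration), the run_body column read by get_run_by_id above would be written with serialize_dagster_namedtuple, so that deserialize_json_to_dagster_namedtuple can rehydrate the record later:

# Hypothetical write-side sketch. The import path varies by Dagster version
# (dagster.core.serdes in older releases, dagster.serdes in later ones).
from dagster.serdes import serialize_dagster_namedtuple


def insert_run(conn, runs_table, pipeline_run):
    # Persist the PipelineRun namedtuple as JSON; get_run_by_id deserializes
    # this exact string back into a PipelineRun.
    conn.execute(
        runs_table.insert().values(
            run_id=pipeline_run.run_id,
            run_body=serialize_dagster_namedtuple(pipeline_run),
        )
    )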
Code Example #13
    def get_schedule_ticks_by_schedule(self, repository, schedule_name):
        check.inst_param(repository, 'repository', RepositoryDefinition)
        check.str_param(schedule_name, 'schedule_name')

        query = (db.select([
            ScheduleTickTable.c.id, ScheduleTickTable.c.tick_body
        ]).select_from(ScheduleTickTable).where(
            ScheduleTickTable.c.repository_name == repository.name).where(
                ScheduleTickTable.c.schedule_name == schedule_name))

        rows = self.execute(query)
        return list(
            map(
                lambda r: ScheduleTick(
                    r[0], deserialize_json_to_dagster_namedtuple(r[1])), rows))
Code Example #14
def test_simple_pipeline_smoke_test():
    @solid
    def solid_without_config(_):
        pass

    @pipeline
    def single_solid_pipeline():
        solid_without_config()

    config_schema_snapshot = build_config_schema_snapshot(
        single_solid_pipeline)
    assert config_schema_snapshot.all_config_snaps_by_key

    serialized = serialize_dagster_namedtuple(config_schema_snapshot)
    rehydrated_config_schema_snapshot = deserialize_json_to_dagster_namedtuple(
        serialized)
    assert config_schema_snapshot == rehydrated_config_schema_snapshot
Code Example #15
File: runs.py Project: ghosthamlet/dagster
    def on_created(self, event):
        run_id, _extension = os.path.basename(event.src_path).split('.')
        # if we already know about the run, we kicked it off
        with self._lock:
            if self._run_storage.has_run(run_id):
                return

            with open(event.src_path, 'r') as fd:
                try:
                    pipeline_run = deserialize_json_to_dagster_namedtuple(fd.read())
                    self._run_storage.add_external_run(pipeline_run, event.src_path)
                except Exception as ex:  # pylint: disable=broad-except
                    warnings.warn(
                        'Error trying to load .json metadata file in filesystem run '
                        'storage: {ex}: {msg}'.format(ex=type(ex).__name__, msg=ex)
                    )
                    return
Code Example #16
    def get_run_by_id(self, run_id):
        '''Get a run by its id.

        Args:
            run_id (str): The id of the run

        Returns:
            Optional[PipelineRun]
        '''
        check.str_param(run_id, 'run_id')

        conn = get_conn(self.conn_string)
        with conn.cursor() as curs:
            curs.execute('SELECT run_body FROM runs WHERE run_id = %s',
                         (run_id, ))
            rows = curs.fetchall()
            return deserialize_json_to_dagster_namedtuple(
                rows[0][0]) if len(rows) else None
Code Example #17
def test_kitchen_sink():
    kitchen_sink = resolve_to_config_type([{
        'opt_list_of_int':
        Field(int, is_optional=True),
        'nested_dict': {
            'list_list': [[int]],
            'nested_selector':
            Field(Selector({
                'some_field': int,
                'more_list': Noneable([bool])
            })),
        },
    }])

    kitchen_sink_meta = meta_from_dagster_type(kitchen_sink)

    rehydrated_meta = deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(kitchen_sink_meta))
    assert kitchen_sink_meta == rehydrated_meta
Code Example #18
File: event_log.py Project: databill86/dagster
    def get_logs_for_run(self, run_id, cursor=-1):
        '''Get all of the logs corresponding to a run.

        Args:
            run_id (str): The id of the run for which to fetch logs.
            cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,
                i.e., if cursor is -1, all logs will be returned. (default: -1)
        '''
        check.str_param(run_id, 'run_id')
        check.int_param(cursor, 'cursor')
        check.invariant(cursor >= -1, 'Cursor must be -1 or greater')

        with get_conn(self.conn_string).cursor() as curs:
            FETCH_SQL = 'SELECT event_body FROM event_log WHERE run_id = %s OFFSET %s;'
            curs.execute(FETCH_SQL, (run_id, cursor + 1))

            rows = curs.fetchall()
            return list(
                map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]),
                    rows))
Code Example #19
def watcher_thread(conn_string, run_id_dict, handlers_dict, dict_lock,
                   watcher_thread_exit):

    try:
        for notif in await_pg_notifications(
                conn_string,
                channels=[CHANNEL_NAME],
                timeout=POLLING_CADENCE,
                yield_on_timeout=True,
                exit_event=watcher_thread_exit,
        ):
            if notif is None:
                if watcher_thread_exit.is_set():
                    break
            else:
                run_id, index_str = notif.payload.split('_')
                if run_id not in run_id_dict:
                    continue

                index = int(index_str)
                with dict_lock:
                    handlers = handlers_dict.get(run_id, [])

                engine = create_engine(conn_string,
                                       isolation_level='AUTOCOMMIT',
                                       poolclass=db.pool.NullPool)
                try:
                    res = engine.execute(
                        db.select([
                            SqlEventLogStorageTable.c.event
                        ]).where(SqlEventLogStorageTable.c.id == index), )
                    dagster_event = deserialize_json_to_dagster_namedtuple(
                        res.fetchone()[0])
                finally:
                    engine.dispose()

                for (cursor, callback) in handlers:
                    if index >= cursor:
                        callback(dagster_event)
    except psycopg2.OperationalError:
        pass
Code Example #20
    def get_logs_for_run(self, run_id, cursor=-1):
        '''Get all of the logs corresponding to a run.

        Args:
            run_id (str): The id of the run for which to fetch logs.
            cursor (Optional[int]): Zero-indexed logs will be returned starting from cursor + 1,
                i.e., if cursor is -1, all logs will be returned. (default: -1)
        '''
        check.str_param(run_id, 'run_id')
        check.int_param(cursor, 'cursor')
        check.invariant(
            cursor >= -1,
            'Don\'t know what to do with negative cursor {cursor}'.format(cursor=cursor),
        )

        # cursor starts at 0 & auto-increment column starts at 1 so adjust
        cursor = cursor + 1

        query = (
            db.select([SqlEventLogStorageTable.c.event])
            .where(SqlEventLogStorageTable.c.run_id == run_id)
            .where(SqlEventLogStorageTable.c.id > cursor)
            .order_by(SqlEventLogStorageTable.c.id.asc())
        )

        with self.connect(run_id) as conn:
            results = conn.execute(query).fetchall()

        events = []
        try:
            for (json_str,) in results:
                events.append(
                    check.inst_param(
                        deserialize_json_to_dagster_namedtuple(json_str), 'event', EventRecord
                    )
                )
        except (seven.JSONDecodeError, check.CheckError) as err:
            six.raise_from(DagsterEventLogInvalidForRun(run_id=run_id), err)

        return events
Code Example #21
def test_basic_event_store(conn_string):
    @solid
    def return_one(_):
        return 1

    def _solids():
        return_one()

    events, _result = gather_events(_solids)

    event_log_storage = PostgresEventLogStorage.create_clean_storage(
        conn_string)

    for event in events:
        event_log_storage.store_event(event)

    rows = fetch_all_events(conn_string)

    out_events = list(
        map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))

    # messages can come out of order
    assert Counter(event_types(out_events)) == Counter([
        DagsterEventType.PIPELINE_START,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.STEP_START,
        DagsterEventType.STEP_SUCCESS,
        DagsterEventType.PIPELINE_SUCCESS,
        DagsterEventType.STEP_OUTPUT,
        DagsterEventType.ENGINE_EVENT,
    ])
    assert (sorted_event_types(out_events)) == [
        DagsterEventType.PIPELINE_START,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.STEP_START,
        DagsterEventType.STEP_OUTPUT,
        DagsterEventType.STEP_SUCCESS,
        DagsterEventType.ENGINE_EVENT,
        DagsterEventType.PIPELINE_SUCCESS,
    ]
Code Example #22
def test_solid_definition_kitchen_sink():
    @solid(
        input_defs=[
            InputDefinition('arg_one', str, description='desc1'),
            InputDefinition('arg_two', int),
        ],
        output_defs=[
            OutputDefinition(name='output_one', dagster_type=str),
            OutputDefinition(name='output_two',
                             dagster_type=int,
                             description='desc2',
                             is_required=False),
        ],
        config={'foo': int},
        description='a description',
        tags={'a_tag': 'yup'},
        required_resource_keys={'a_resource'},
    )
    def kitchen_sink_solid(_, arg_two,
                           arg_one):  # out of order to test positional_inputs
        assert arg_one
        assert arg_two
        raise Exception('should not execute')

    kitchen_sink_solid_snap = build_solid_def_snap(kitchen_sink_solid)

    assert kitchen_sink_solid_snap
    assert kitchen_sink_solid_snap.name == 'kitchen_sink_solid'
    assert len(kitchen_sink_solid_snap.input_def_snaps) == 2
    assert [inp.name for inp in kitchen_sink_solid_snap.input_def_snaps
            ] == ['arg_one', 'arg_two']
    assert [
        inp.dagster_type_key for inp in kitchen_sink_solid_snap.input_def_snaps
    ] == [
        'String',
        'Int',
    ]

    assert kitchen_sink_solid_snap.get_input_snap(
        'arg_one').description == 'desc1'

    assert [out.name for out in kitchen_sink_solid_snap.output_def_snaps] == [
        'output_one',
        'output_two',
    ]

    assert [
        out.dagster_type_key
        for out in kitchen_sink_solid_snap.output_def_snaps
    ] == [
        'String',
        'Int',
    ]

    assert kitchen_sink_solid_snap.get_output_snap(
        'output_two').description == 'desc2'
    assert kitchen_sink_solid_snap.get_output_snap(
        'output_two').is_required is False

    assert (kitchen_sink_solid_snap.config_field_snap.type_key ==
            kitchen_sink_solid.config_field.config_type.key)

    assert kitchen_sink_solid_snap.required_resource_keys == ['a_resource']
    assert kitchen_sink_solid_snap.tags == {'a_tag': 'yup'}
    assert kitchen_sink_solid.positional_inputs == ['arg_two', 'arg_one']

    assert (deserialize_json_to_dagster_namedtuple(
        serialize_dagster_namedtuple(kitchen_sink_solid_snap)) ==
            kitchen_sink_solid_snap)
Code Example #23
File: engine.py Project: mapbox/dagster
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        storage = pipeline_context.environment_dict.get('storage')

        if (celery_config.broker and not is_local_uri(celery_config.broker)
            ) or (celery_config.backend
                  and not is_local_uri(celery_config.backend)):
            check.invariant(
                storage.get('s3') or storage.get('gcs'),
                'Must use S3 or GCS storage with non-local Celery broker: {broker} '
                'and backend: {backend}'.format(broker=celery_config.broker,
                                                backend=celery_config.backend),
            )
        else:
            check.invariant(
                not storage.get('in_memory'),
                'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
            )

        pipeline_name = pipeline_context.pipeline_def.name

        handle_dict = pipeline_context.execution_target_handle.to_dict()

        instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

        environment_dict = dict(pipeline_context.environment_dict,
                                execution={'in_process': {}})

        mode = pipeline_context.mode_def.name

        run_id = pipeline_context.pipeline_run.run_id

        app = make_app(celery_config)

        task_signatures = {}  # Dict[step_key, celery.Signature]
        apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

        priority_for_step = lambda step: (-1 * int(
            step.tags.get('dagster-celery/priority', task_default_priority)))
        priority_for_key = lambda step_key: (-1 * apply_kwargs[step_key][
            'priority'])
        _warn_on_priority_misuse(pipeline_context, execution_plan)

        for step_key in execution_plan.step_keys_to_execute:
            step = execution_plan.get_step_by_key(step_key)
            priority = int(
                step.tags.get('dagster-celery/priority',
                              task_default_priority))
            queue = step.tags.get('dagster-celery/queue', task_default_queue)
            task = create_task(app)

            variables = {
                'executionParams': {
                    'selector': {
                        'name': pipeline_name
                    },
                    'environmentConfigData': environment_dict,
                    'mode': mode,
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'stepKeys': [step_key],
                }
            }
            task_signatures[step_key] = task.si(handle_dict, variables,
                                                instance_ref_dict)
            apply_kwargs[step_key] = {
                'priority': priority,
                'queue': queue,
                'routing_key': '{queue}.execute_query'.format(queue=queue),
            }

        step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
        step_success = {}
        step_errors = {}
        completed_steps = set({})  # Set[step_key]
        active_execution = execution_plan.start(sort_key_fn=priority_for_step)
        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:

            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception as e:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[
                            step_key] = serializable_error_info_from_exc_info(
                                sys.exc_info())
                        stopping = True
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        if event.is_step_success:
                            step_success[step_key] = True
                        elif event.is_step_failure:
                            step_success[step_key] = False

                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    was_success = step_success.get(step_key)
                    if was_success == True:
                        active_execution.mark_success(step_key)
                    elif was_success == False:
                        active_execution.mark_failed(step_key)
                    else:
                        # check errors list?
                        pipeline_context.log.error(
                            'Step {key} finished without success or failure event, assuming failure.'
                            .format(key=step_key))
                        active_execution.mark_failed(step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    step_results[step.key] = task_signatures[
                        step.key].apply_async(**apply_kwargs[step.key])
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.'.
                        format(),
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                'During celery execution errors occurred in workers:\n{error_list}'
                .format(error_list='\n'.join([
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
Code Example #24
File: engine.py Project: jmbrooks/dagster
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        pipeline_name = pipeline_context.pipeline_def.name

        handle_dict = pipeline_context.execution_target_handle.to_dict()

        instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

        environment_dict = dict(pipeline_context.environment_dict,
                                execution={'in_process': {}})

        mode = pipeline_context.mode_def.name

        run_id = pipeline_context.pipeline_run.run_id

        app = make_app(celery_config)

        pending_steps = execution_plan.execution_deps()

        task_signatures = {}  # Dict[step_key, celery.Signature]
        apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

        sort_by_priority = lambda step_key: (-1 * apply_kwargs[step_key][
            'priority'])

        for step_key in execution_plan.step_keys_to_execute:
            step = execution_plan.get_step_by_key(step_key)
            priority = step.metadata.get('dagster-celery/priority',
                                         task_default_priority)
            queue = step.metadata.get('dagster-celery/queue',
                                      task_default_queue)
            task = create_task(app)

            variables = {
                'executionParams': {
                    'selector': {
                        'name': pipeline_name
                    },
                    'environmentConfigData': environment_dict,
                    'mode': mode,
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'stepKeys': [step_key],
                }
            }
            task_signatures[step_key] = task.si(handle_dict, variables,
                                                instance_ref_dict)
            apply_kwargs[step_key] = {
                'priority': priority,
                'queue': queue,
                'routing_key': '{queue}.execute_query'.format(queue=queue),
            }

        step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
        completed_steps = set({})  # Set[step_key]

        while pending_steps or step_results:
            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: sort_by_priority(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                    for step_event in step_events:
                        yield deserialize_json_to_dagster_namedtuple(
                            step_event)
                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)
            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]

            pending_to_pop = []
            for step_key, requirements in pending_steps.items():
                if requirements.issubset(completed_steps):
                    pending_to_pop.append(step_key)

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            to_execute = sorted(pending_to_pop, key=sort_by_priority)
            for step_key in to_execute:
                try:
                    step_results[step_key] = task_signatures[
                        step_key].apply_async(**apply_kwargs[step_key])
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.'.
                        format(),
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            for step_key in pending_to_pop:
                if step_key in pending_steps:
                    del pending_steps[step_key]

            time.sleep(TICK_SECONDS)
Code Example #25
    def _rows_to_runs(self, rows):
        return list(map(lambda r: deserialize_json_to_dagster_namedtuple(r[0]), rows))
Code Example #26
    def get_run_by_id(self, run_id):
        path = self._known_runs[run_id]
        with open(path, 'r') as fd:
            return deserialize_json_to_dagster_namedtuple(fd.read())
Code Example #27
def test_deserialize_json_to_dagster_namedtuple_types_ok():
    unpacked_tuple = deserialize_json_to_dagster_namedtuple('{"foo": "bar"}')
    assert unpacked_tuple
    assert unpacked_tuple['foo'] == 'bar'
Code Example #28
def test_deserialize_json_to_dagster_namedtyple_invalid_types(bad_obj):
    with pytest.raises(ParameterCheckError):
        deserialize_json_to_dagster_namedtuple(bad_obj)
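Code Examples #27 and #28 pin down the input contract: deserialize_json_to_dagster_namedtuple accepts only strings (anything else raises ParameterCheckError), and a plain JSON object with no registered class comes back as an ordinary dict. For completeness, here is a minimal round-trip sketch for a custom class (MyRecord is hypothetical; the import path is an assumption, since older releases exposed these helpers under dagster.core.serdes):

from collections import namedtuple

from dagster.serdes import (
    whitelist_for_serdes,
    serialize_dagster_namedtuple,
    deserialize_json_to_dagster_namedtuple,
)


# Registering the namedtuple with the serdes whitelist lets it survive the
# round trip as the same type, as the tests above assert for Dagster's own
# snapshot classes.
@whitelist_for_serdes
class MyRecord(namedtuple('_MyRecord', 'run_id message')):
    pass


original = MyRecord(run_id='abc', message='hello')
assert deserialize_json_to_dagster_namedtuple(
    serialize_dagster_namedtuple(original)) == original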
Code Example #29
File: engine.py Project: cmrajan/dagster
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, CeleryConfig),
            'pipeline_context',
            'Expected executor_config to be CeleryConfig got {}'.format(
                pipeline_context.executor_config),
        )

        celery_config = pipeline_context.executor_config

        storage = pipeline_context.environment_dict.get('storage')

        if (celery_config.broker and not is_local_uri(celery_config.broker)
            ) or (celery_config.backend
                  and not is_local_uri(celery_config.backend)):
            check.invariant(
                storage.get('s3') or storage.get('gcs'),
                'Must use S3 or GCS storage with non-local Celery broker: {broker} '
                'and backend: {backend}'.format(broker=celery_config.broker,
                                                backend=celery_config.backend),
            )
        else:
            check.invariant(
                not storage.get('in_memory'),
                'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
            )

        app = make_app(celery_config)

        priority_for_step = lambda step: (-1 * int(
            step.tags.get('dagster-celery/priority', task_default_priority)
        ) + -1 * _get_run_priority(pipeline_context))
        priority_for_key = lambda step_key: (priority_for_step(
            execution_plan.get_step_by_key(step_key)))
        _warn_on_priority_misuse(pipeline_context, execution_plan)

        step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
        step_errors = {}
        completed_steps = set({})  # Set[step_key]
        active_execution = execution_plan.start(
            retries=pipeline_context.executor_config.retries,
            sort_key_fn=priority_for_step)
        stopping = False

        while (not active_execution.is_complete
               and not stopping) or step_results:

            results_to_pop = []
            for step_key, result in sorted(
                    step_results.items(),
                    key=lambda x: priority_for_key(x[0])):
                if result.ready():
                    try:
                        step_events = result.get()
                    except Exception as e:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[
                            step_key] = serializable_error_info_from_exc_info(
                                sys.exc_info())
                        stopping = True
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(
                            step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)
                    completed_steps.add(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context,
                                                     step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.skipped_step_events_iterator(
                    pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if stopping:
                continue

            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get('dagster-celery/queue',
                                          task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'
                        .format(step_key=step.key, queue=queue),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )
                    step_results[step.key] = _submit_task(
                        app, pipeline_context, step, queue)
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.'.
                        format(),
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()), ),
                    )
                    raise

            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                'During celery execution errors occurred in workers:\n{error_list}'
                .format(error_list='\n'.join([
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                ])),
                subprocess_error_infos=list(step_errors.values()),
            )
Code Example #30
File: log.py Project: ghosthamlet/dagster
    def from_json(json_str):
        return deserialize_json_to_dagster_namedtuple(json_str)