def __new__(cls, host, port=None, socket=None, location_name=None, use_ssl=None):
    """Validate the gRPC server coordinates and build the origin tuple.

    Args:
        host: Validated with ``check.str_param``.
        port: Validated with ``check.opt_int_param``.
        socket: Validated with ``check.opt_str_param``.
        location_name: When truthy, validated with ``check.str_param``;
            otherwise a name is produced by
            ``_assign_grpc_location_name(port, socket, host)``.
        use_ssl: Validated with ``check.opt_bool_param``. Any falsy value
            (including ``False``) is stored as ``None``.
    """
    checked_host = check.str_param(host, "host")
    checked_port = check.opt_int_param(port, "port")
    checked_socket = check.opt_str_param(socket, "socket")

    if location_name:
        resolved_name = check.str_param(location_name, "location_name")
    else:
        resolved_name = _assign_grpc_location_name(port, socket, host)

    # Only a truthy, validated flag is kept; everything else collapses to None.
    ssl_flag = None
    if check.opt_bool_param(use_ssl, "use_ssl"):
        ssl_flag = use_ssl

    return super(GrpcServerRepositoryLocationOrigin, cls).__new__(
        cls,
        checked_host,
        checked_port,
        checked_socket,
        resolved_name,
        ssl_flag,
    )
def __init__(
    self,
    max_concurrent_runs=None,
    tag_concurrency_limits=None,
    dequeue_interval_seconds=None,
    inst_data=None,
):
    """Validate the queue settings and store them on the instance.

    Args:
        max_concurrent_runs: Optional int; ``10`` is handed to
            ``check.opt_int_param`` as its third argument.
        tag_concurrency_limits: Optional list, validated with
            ``check.opt_list_param``.
        dequeue_interval_seconds: Optional int; ``5`` is handed to
            ``check.opt_int_param`` as its third argument.
        inst_data: Optional ``ConfigurableClassData`` instance.
    """
    self._inst_data = check.opt_inst_param(
        inst_data,
        "inst_data",
        ConfigurableClassData,
    )

    # Each setting is validated and stored in turn, in signature order.
    self.max_concurrent_runs = check.opt_int_param(
        max_concurrent_runs,
        "max_concurrent_runs",
        10,
    )
    self.tag_concurrency_limits = check.opt_list_param(
        tag_concurrency_limits,
        "tag_concurrency_limits",
    )
    self.dequeue_interval_seconds = check.opt_int_param(
        dequeue_interval_seconds,
        "dequeue_interval_seconds",
        5,
    )

    super().__init__()
def __init__(self, port=None, socket=None, host='localhost'):
    """Validate the connection arguments and compute the server address.

    Exactly one of ``port`` or ``socket`` must be truthy. With a port the
    address is ``host:port``; otherwise it is ``unix:`` followed by the
    absolute path of ``socket``.

    Args:
        port: Optional int port. Must not be ``None`` when
            ``seven.IS_WINDOWS`` is truthy.
        socket: Optional string socket path.
        host: Optional string host; must not be ``None`` when a port is used.
    """
    check.opt_int_param(port, 'port')
    check.opt_str_param(socket, 'socket')
    check.opt_str_param(host, 'host')

    check.invariant(
        not seven.IS_WINDOWS or port is not None,
        'You must pass a valid `port` on Windows: `socket` not supported.',
    )
    # Exclusive-or on truthiness: one of the two, never both, never neither.
    check.invariant(
        bool(port) != bool(socket),
        'You must pass one and only one of `port` or `socket`.',
    )
    check.invariant(
        not port or host is not None,
        'Must provide a hostname',
    )

    if not port:
        self._server_address = 'unix:' + os.path.abspath(socket)
    else:
        self._server_address = host + ':' + str(port)
def get_run_groups(graphene_info, filters=None, cursor=None, limit=None):
    """Return one ``DauphinRunGroup`` per run group reported by the instance.

    The ``"runs"`` entry of every group returned by
    ``instance.get_run_groups`` is replaced in place with the runs wrapped
    in the schema's ``PipelineRun`` type.

    Args:
        graphene_info: Resolver info; ``context.instance`` and ``schema``
            are read from it.
        filters: Optional ``PipelineRunsFilter``.
        cursor: Optional string cursor.
        limit: Optional int limit.
    """
    from ..schema.runs import DauphinRunGroup

    check.opt_inst_param(filters, "filters", PipelineRunsFilter)
    check.opt_str_param(cursor, "cursor")
    check.opt_int_param(limit, "limit")

    groups_by_root = graphene_info.context.instance.get_run_groups(
        filters=filters, cursor=cursor, limit=limit
    )

    def _as_graphene_run(run):
        return graphene_info.schema.type_named("PipelineRun")(run)

    # First pass: swap the stored runs for their GraphQL wrappers.
    for group in groups_by_root.values():
        group["runs"] = [_as_graphene_run(run) for run in group["runs"]]

    # Second pass: build the result objects from the updated groups.
    group_nodes = []
    for root_id, group in groups_by_root.items():
        group_nodes.append(DauphinRunGroup(root_run_id=root_id, runs=group["runs"]))
    return group_nodes
def get_runs(graphene_info, filters, cursor=None, limit=None):
    """Fetch runs and wrap each one in the schema's ``PipelineRun`` type.

    Lookup strategy:
      * a filter with exactly one run id uses ``instance.get_run_by_id`` and
        yields zero or one run;
      * a filter with any of ``pipeline_name``, ``tags``, ``status`` or
        ``run_ids`` set is forwarded to ``instance.get_runs``;
      * otherwise all runs are listed, subject to ``cursor`` and ``limit``.

    Args:
        graphene_info: Resolver info; ``context.instance`` and ``schema``
            are read from it.
        filters: Optional ``PipelineRunsFilter``.
        cursor: Optional string cursor.
        limit: Optional int limit.

    Returns:
        A list of ``PipelineRun`` GraphQL objects.
    """
    check.opt_inst_param(filters, 'filters', PipelineRunsFilter)
    check.opt_str_param(cursor, 'cursor')
    check.opt_int_param(limit, 'limit')

    instance = graphene_info.context.instance

    if filters and filters.run_ids and len(filters.run_ids) == 1:
        run = instance.get_run_by_id(filters.run_ids[0])
        runs = [run] if run else []
    elif filters and (
        filters.pipeline_name
        or filters.tags
        or filters.status
        # A filter made only of several run ids must still be applied;
        # without this it fell through to the unfiltered listing below.
        or filters.run_ids
    ):
        runs = instance.get_runs(filters, cursor, limit)
    else:
        runs = instance.get_runs(cursor=cursor, limit=limit)

    return [graphene_info.schema.type_named('PipelineRun')(run) for run in runs]
def get_pipeline_run_observable(graphene_info, run_id, after=None):
    """Build the observable backing a run-logs subscription.

    Outcomes:
      * run not found: an observable that emits one
        ``PipelineRunLogsSubscriptionFailure``;
      * the pipeline reference is not a ``DauphinPipeline``: an empty
        observable;
      * otherwise: an observable that maps each batch of events to a
        ``PipelineRunLogsSubscriptionSuccess``.

    Args:
        graphene_info: ``ResolveInfo`` for the current request.
        run_id: String id of the run.
        after: Optional int, passed on as ``after_cursor``.
    """
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(run_id, 'run_id')
    check.opt_int_param(after, 'after')

    instance = graphene_info.context.instance
    run = instance.get_run_by_id(run_id)

    if not run:

        def _emit_missing_run(observer):
            failure_type = graphene_info.schema.type_named(
                'PipelineRunLogsSubscriptionFailure'
            )
            observer.on_next(
                failure_type(
                    missingRunId=run_id,
                    message='Could not load run with id {}'.format(run_id),
                )
            )

        return Observable.create(_emit_missing_run)  # pylint: disable=E1101

    pipeline = get_dauphin_pipeline_reference_from_selector(graphene_info, run.selector)

    from ..schema.pipelines import DauphinPipeline

    if not isinstance(pipeline, DauphinPipeline):
        return Observable.empty()  # pylint: disable=no-member

    execution_plan = create_execution_plan(
        pipeline.get_dagster_pipeline(),
        run.environment_dict,
        RunConfig(mode=run.mode),
    )

    def _to_success_payload(events):
        success_type = graphene_info.schema.type_named('PipelineRunLogsSubscriptionSuccess')
        wrapped_run = graphene_info.schema.type_named('PipelineRun')(run)
        messages = [
            from_event_record(graphene_info, event, pipeline, execution_plan)
            for event in events
        ]
        return success_type(run=wrapped_run, messages=messages)

    # pylint: disable=E1101
    return Observable.create(
        PipelineRunObservableSubscribe(instance, run_id, after_cursor=after)
    ).map(_to_success_payload)
def get_pipeline_run_observable(graphene_info, run_id, after=None):
    """Build the observable backing a run-logs subscription.

    When no run record matches ``run_id`` the observable emits one
    ``GraphenePipelineRunLogsSubscriptionFailure``. Otherwise every payload
    from ``PipelineRunObservableSubscribe`` (an ``(events, loading_past)``
    pair) is mapped to a ``GraphenePipelineRunLogsSubscriptionSuccess``.

    Args:
        graphene_info: ``ResolveInfo`` for the current request.
        run_id: String id of the run.
        after: Optional int, passed on as ``after_cursor``.
    """
    from ...schema.pipelines.pipeline import GrapheneRun
    from ...schema.pipelines.subscription import (
        GraphenePipelineRunLogsSubscriptionFailure,
        GraphenePipelineRunLogsSubscriptionSuccess,
    )
    from ..events import from_event_record

    check.inst_param(graphene_info, "graphene_info", ResolveInfo)
    check.str_param(run_id, "run_id")
    check.opt_int_param(after, "after")

    instance = graphene_info.context.instance
    matching_records = instance.get_run_records(RunsFilter(run_ids=[run_id]))

    if not matching_records:

        def _emit_missing_run(observer):
            failure = GraphenePipelineRunLogsSubscriptionFailure(
                missingRunId=run_id,
                message="Could not load run with id {}".format(run_id),
            )
            observer.on_next(failure)

        return Observable.create(_emit_missing_run)  # pylint: disable=E1101

    run_record = matching_records[0]
    pipeline_run = run_record.pipeline_run

    def _to_success_payload(payload):
        batch, still_loading_past = payload
        return GraphenePipelineRunLogsSubscriptionSuccess(
            run=GrapheneRun(run_record),
            messages=[
                from_event_record(entry, pipeline_run.pipeline_name) for entry in batch
            ],
            hasMorePastEvents=still_loading_past,
        )

    # pylint: disable=E1101
    return Observable.create(
        PipelineRunObservableSubscribe(instance, run_id, after_cursor=after)
    ).map(_to_success_payload)
def get_pipeline_run_observable(graphene_info, run_id, after=None):
    """Build the observable backing a run-logs subscription.

    When the run cannot be loaded the observable emits one
    ``PipelineRunLogsSubscriptionFailure``. Otherwise each batch of events
    is mapped to a ``PipelineRunLogsSubscriptionSuccess``. An
    ``ExternalExecutionPlan`` is built from stored snapshots only when the
    run has both a pipeline snapshot id and an execution plan snapshot id;
    in every other case ``None`` is passed to ``from_event_record``.

    Args:
        graphene_info: ``ResolveInfo`` for the current request.
        run_id: String id of the run.
        after: Optional int, passed on as ``after_cursor``.
    """
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(run_id, 'run_id')
    check.opt_int_param(after, 'after')

    instance = graphene_info.context.instance
    run = instance.get_run_by_id(run_id)

    if not run:

        def _emit_missing_run(observer):
            failure_type = graphene_info.schema.type_named(
                'PipelineRunLogsSubscriptionFailure'
            )
            observer.on_next(
                failure_type(
                    missingRunId=run_id,
                    message='Could not load run with id {}'.format(run_id),
                )
            )

        return Observable.create(_emit_missing_run)  # pylint: disable=E1101

    if run.pipeline_snapshot_id and run.execution_plan_snapshot_id:
        external_execution_plan = ExternalExecutionPlan(
            execution_plan_snapshot=instance.get_execution_plan_snapshot(
                run.execution_plan_snapshot_id
            ),
            represented_pipeline=instance.get_historical_pipeline(run.pipeline_snapshot_id),
        )
    else:
        external_execution_plan = None

    def _to_success_payload(events):
        success_type = graphene_info.schema.type_named('PipelineRunLogsSubscriptionSuccess')
        wrapped_run = graphene_info.schema.type_named('PipelineRun')(run)
        messages = [
            from_event_record(event, run.pipeline_name, external_execution_plan)
            for event in events
        ]
        return success_type(run=wrapped_run, messages=messages)

    # pylint: disable=E1101
    return Observable.create(
        PipelineRunObservableSubscribe(instance, run_id, after_cursor=after)
    ).map(_to_success_payload)
def __new__(
    cls, name, pipeline_name, solid_selection, mode, min_interval=None, description=None
):
    """Validate every field and build the ``ExternalSensorData`` tuple.

    Args:
        name: Validated with ``check.str_param``.
        pipeline_name: Validated with ``check.str_param``.
        solid_selection: Validated with ``check.opt_nullable_list_param``
            against ``str``.
        mode: Validated with ``check.opt_str_param``.
        min_interval: Validated with ``check.opt_int_param``.
        description: Validated with ``check.opt_str_param``.
    """
    # Validation runs in field order; the results are passed by keyword.
    validated_fields = dict(
        name=check.str_param(name, "name"),
        pipeline_name=check.str_param(pipeline_name, "pipeline_name"),
        solid_selection=check.opt_nullable_list_param(
            solid_selection, "solid_selection", str
        ),
        mode=check.opt_str_param(mode, "mode"),
        min_interval=check.opt_int_param(min_interval, "min_interval"),
        description=check.opt_str_param(description, "description"),
    )
    return super(ExternalSensorData, cls).__new__(cls, **validated_fields)
def __new__(cls, port, socket, host, location_name, client, repository_names):
    """Validate every field and build the location handle tuple.

    Args:
        port: Validated with ``check.opt_int_param``.
        socket: Validated with ``check.opt_str_param``.
        host: Validated with ``check.str_param``.
        location_name: Validated with ``check.str_param``.
        client: Must be a ``DagsterGrpcClient`` instance.
        repository_names: Validated with ``check.set_param`` against ``str``.
    """
    from dagster.grpc.client import DagsterGrpcClient

    # Validation runs in field order; the tuple is then splatted positionally.
    validated_fields = (
        check.opt_int_param(port, 'port'),
        check.opt_str_param(socket, 'socket'),
        check.str_param(host, 'host'),
        check.str_param(location_name, 'location_name'),
        check.inst_param(client, 'client', DagsterGrpcClient),
        check.set_param(repository_names, 'repository_names', of_type=str),
    )
    return super(GrpcServerRepositoryLocationHandle, cls).__new__(cls, *validated_fields)
def _runs_query(self, filters=None, cursor=None, limit=None):
    """Build the query that selects ``run_body`` for the matching runs.

    Args:
        filters: Optional ``PipelineRunsFilter``; a default-constructed
            filter is used when omitted.
        cursor: Optional string cursor.
        limit: Optional int limit.

    Returns:
        The select after ``_add_filters_to_query`` and
        ``_add_cursor_limit_to_query`` have been applied.
    """
    filters = check.opt_inst_param(
        filters, 'filters', PipelineRunsFilter, default=PipelineRunsFilter()
    )
    check.opt_str_param(cursor, 'cursor')
    check.opt_int_param(limit, 'limit')

    # A tags filter needs the run-tags table joined in; otherwise the runs
    # table alone is enough.
    if filters.tags:
        source = RunsTable.outerjoin(
            RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id
        )
    else:
        source = RunsTable

    selection = db.select([RunsTable.c.run_body]).select_from(source)
    filtered = self._add_filters_to_query(selection, filters)
    return self._add_cursor_limit_to_query(filtered, cursor, limit)
def __new__(cls, port, socket, host, location_name, client, repository_names):
    """Validate every field and build the location handle tuple.

    Args:
        port: Validated with ``check.opt_int_param``.
        socket: Validated with ``check.opt_str_param``.
        host: Validated with ``check.str_param``.
        location_name: Validated with ``check.str_param``.
        client: Must be a ``DagsterGrpcClient`` instance.
        repository_names: Validated with ``check.set_param`` against ``str``.
    """
    from dagster.grpc.client import DagsterGrpcClient

    checked_port = check.opt_int_param(port, "port")
    checked_socket = check.opt_str_param(socket, "socket")
    checked_host = check.str_param(host, "host")
    checked_name = check.str_param(location_name, "location_name")
    checked_client = check.inst_param(client, "client", DagsterGrpcClient)
    checked_repos = check.set_param(repository_names, "repository_names", of_type=str)

    return super(GrpcServerRepositoryLocationHandle, cls).__new__(
        cls,
        checked_port,
        checked_socket,
        checked_host,
        checked_name,
        checked_client,
        checked_repos,
    )
def __new__(cls, last_tick_timestamp=None, last_run_key=None, min_interval=None, cursor=None):
    """Validate every field and build the ``SensorJobData`` tuple.

    Args:
        last_tick_timestamp: Validated with ``check.opt_float_param``.
        last_run_key: Validated with ``check.opt_str_param``.
        min_interval: Validated with ``check.opt_int_param``.
        cursor: Validated with ``check.opt_str_param``.
    """
    # Validation runs in field order; the tuple is then splatted positionally.
    validated_fields = (
        check.opt_float_param(last_tick_timestamp, "last_tick_timestamp"),
        check.opt_str_param(last_run_key, "last_run_key"),
        check.opt_int_param(min_interval, "min_interval"),
        check.opt_str_param(cursor, "cursor"),
    )
    return super(SensorJobData, cls).__new__(cls, *validated_fields)
def get_runs(graphene_info, filters, cursor=None, limit=None):
    """Fetch runs and wrap each one in ``GrapheneRun``.

    Lookup strategy:
      * a filter with exactly one run id uses ``instance.get_run_by_id`` and
        yields zero or one run;
      * a filter with any of ``pipeline_name``, ``tags``, ``statuses``,
        ``snapshot_id`` or ``run_ids`` set is forwarded to
        ``instance.get_runs``;
      * otherwise all runs are listed, subject to ``cursor`` and ``limit``.

    Args:
        graphene_info: Resolver info; ``context.instance`` is read from it.
        filters: Optional ``PipelineRunsFilter``.
        cursor: Optional string cursor.
        limit: Optional int limit.
    """
    from ..schema.pipelines.pipeline import GrapheneRun

    check.opt_inst_param(filters, "filters", PipelineRunsFilter)
    check.opt_str_param(cursor, "cursor")
    check.opt_int_param(limit, "limit")

    instance = graphene_info.context.instance

    if filters and filters.run_ids and len(filters.run_ids) == 1:
        only_run = instance.get_run_by_id(filters.run_ids[0])
        found_runs = [only_run] if only_run else []
    elif filters and any(
        (
            filters.pipeline_name,
            filters.tags,
            filters.statuses,
            filters.snapshot_id,
            filters.run_ids,
        )
    ):
        found_runs = instance.get_runs(filters, cursor, limit)
    else:
        found_runs = instance.get_runs(cursor=cursor, limit=limit)

    return [GrapheneRun(found) for found in found_runs]
def _runs_query(
    self,
    filters: PipelineRunsFilter = None,
    cursor: str = None,
    limit: int = None,
    columns: List[str] = None,
    order_by: str = None,
    ascending: bool = False,
):
    """Build the query that selects the requested columns for matching runs.

    Args:
        filters: Optional ``PipelineRunsFilter``; a default-constructed
            filter is used when omitted.
        cursor: Optional string cursor.
        limit: Optional int limit.
        columns: Optional list of ``RunsTable`` column names; ``["run_body"]``
            when ``None``.
        order_by: Optional string, forwarded to
            ``_add_cursor_limit_to_query``.
        ascending: Bool, forwarded to ``_add_cursor_limit_to_query``.

    Returns:
        The select after ``_add_filters_to_query`` and
        ``_add_cursor_limit_to_query`` have been applied.
    """
    filters = check.opt_inst_param(
        filters, "filters", PipelineRunsFilter, default=PipelineRunsFilter()
    )
    check.opt_str_param(cursor, "cursor")
    check.opt_int_param(limit, "limit")
    check.opt_list_param(columns, "columns")
    check.opt_str_param(order_by, "order_by")
    check.opt_bool_param(ascending, "ascending")

    column_names = ["run_body"] if columns is None else columns
    selected_columns = [getattr(RunsTable.c, column_name) for column_name in column_names]

    # A tags filter needs the run-tags table joined in; otherwise the runs
    # table alone is enough.
    if filters.tags:
        source = RunsTable.join(
            RunTagsTable, RunsTable.c.run_id == RunTagsTable.c.run_id
        )
    else:
        source = RunsTable

    selection = db.select(selected_columns).select_from(source)
    filtered = self._add_filters_to_query(selection, filters)
    return self._add_cursor_limit_to_query(filtered, cursor, limit, order_by, ascending)
def __new__(
    cls,
    instigator_origin_id,
    instigator_name,
    instigator_type,
    status,
    timestamp,
    run_ids=None,
    run_keys=None,
    error=None,
    skip_reason=None,
    cursor=None,
    origin_run_ids=None,
    failure_count=None,
):
    """Build the serializable payload stored for one schedule/sensor tick.

    Tick ids are provided by the storage implementation, so all other tick
    data lives in this class and can be stored independently of the id.

    Arguments:
        instigator_origin_id (str): The id of the instigator target for this tick.
        instigator_name (str): The name of the instigator for this tick.
        instigator_type (InstigatorType): The type of the instigator for this tick.
        status (TickStatus): The status of the tick, which can be updated.
        timestamp (float): The timestamp at which the instigator evaluation started.

    Keyword Arguments:
        run_ids (List[str]): Optional list of strings; presumably the runs
            created by the tick.
        run_keys (List[str]): Optional list of strings.
        error (SerializableErrorInfo): The error caught during execution. Set
            only when the status is ``TickStatus.Failure``.
        skip_reason (str): Message for why the tick was skipped.
        cursor (str): Optional string.
        origin_run_ids (List[str]): The runs originated from the schedule/sensor.
        failure_count (int): The number of times this tick has failed. If the
            status is not FAILED, this is the number of previous failures
            before it reached the current state. ``0`` is handed to
            ``check.opt_int_param`` as its third argument.
    """
    # Cross-field validation runs first. It also covers ``error`` and
    # ``skip_reason``, which are passed through below without another check.
    _validate_tick_args(instigator_type, status, run_ids, error, skip_reason)

    positional_fields = (
        check.str_param(instigator_origin_id, "instigator_origin_id"),
        check.str_param(instigator_name, "instigator_name"),
        check.inst_param(instigator_type, "instigator_type", InstigatorType),
        check.inst_param(status, "status", TickStatus),
        check.float_param(timestamp, "timestamp"),
        check.opt_list_param(run_ids, "run_ids", of_type=str),
        check.opt_list_param(run_keys, "run_keys", of_type=str),
        error,
        skip_reason,
    )

    return super(TickData, cls).__new__(
        cls,
        *positional_fields,
        cursor=check.opt_str_param(cursor, "cursor"),
        origin_run_ids=check.opt_list_param(origin_run_ids, "origin_run_ids", of_type=str),
        failure_count=check.opt_int_param(failure_count, "failure_count", 0),
    )
def get_ticks(self, origin_id, selector_id, before=None, after=None, limit=None, statuses=None):
    """Return the ticks of one instigator, ordered by timestamp descending.

    When the instigators table exists, rows match either on ``selector_id``
    or, for rows whose ``selector_id`` is NULL, on ``job_origin_id``.
    Without that table only ``job_origin_id`` is compared.

    Args:
        origin_id: String origin id.
        selector_id: Selector id; compared against the column as given.
        before: Optional float, forwarded to ``_add_filter_limit``.
        after: Optional float, forwarded to ``_add_filter_limit``.
        limit: Optional int, forwarded to ``_add_filter_limit``.
        statuses: Optional list of ``TickStatus``, forwarded to
            ``_add_filter_limit``.

    Returns:
        A list of ``InstigatorTick`` built from each row's id and
        deserialized body.
    """
    check.str_param(origin_id, "origin_id")
    check.opt_float_param(before, "before")
    check.opt_float_param(after, "after")
    check.opt_int_param(limit, "limit")
    check.opt_list_param(statuses, "statuses", of_type=TickStatus)

    ticks_newest_first = (
        db.select([JobTickTable.c.id, JobTickTable.c.tick_body])
        .select_from(JobTickTable)
        .order_by(JobTickTable.c.timestamp.desc())
    )

    if self.has_instigators_table():
        rows_without_selector = db.and_(
            # Must stay "== None": this builds a SQL expression, not a bool.
            JobTickTable.c.selector_id == None,  # noqa: E711
            JobTickTable.c.job_origin_id == origin_id,
        )
        ownership_clause = db.or_(
            JobTickTable.c.selector_id == selector_id,
            rows_without_selector,
        )
    else:
        ownership_clause = JobTickTable.c.job_origin_id == origin_id

    query = self._add_filter_limit(
        ticks_newest_first.where(ownership_clause),
        before=before,
        after=after,
        limit=limit,
        statuses=statuses,
    )

    return [
        InstigatorTick(row[0], deserialize_json_to_dagster_namedtuple(row[1]))
        for row in self.execute(query)
    ]
def get_batch_ticks(
    self,
    selector_ids: Sequence[str],
    limit: Optional[int] = None,
    statuses: Optional[Sequence[TickStatus]] = None,
) -> Mapping[str, Iterable[InstigatorTick]]:
    """Return the most recent ticks for several selectors in one query.

    Ticks are ranked per selector by timestamp, newest first. A status
    filter is applied before ranking, so ``limit`` counts only ticks with
    one of the requested statuses.

    Args:
        selector_ids: Selector ids to fetch ticks for.
        limit: Maximum rank to return per selector. ``None`` returns every
            matching tick.
        statuses: Optional statuses to restrict the ticks to.

    Returns:
        A mapping from selector id to its ``InstigatorTick`` objects, in
        ascending rank order. Selectors with no matching ticks are absent.
    """
    check.list_param(selector_ids, "selector_ids", of_type=str)
    check.opt_int_param(limit, "limit")
    check.opt_list_param(statuses, "statuses", of_type=TickStatus)

    bucket_rank_column = (
        db.func.rank()
        .over(
            order_by=db.desc(JobTickTable.c.timestamp),
            partition_by=JobTickTable.c.selector_id,
        )
        .label("rank")
    )

    ranked_ticks = (
        db.select(
            [
                JobTickTable.c.id,
                JobTickTable.c.selector_id,
                JobTickTable.c.tick_body,
                bucket_rank_column,
            ]
        )
        .select_from(JobTickTable)
        .where(JobTickTable.c.selector_id.in_(selector_ids))
    )
    if statuses:
        # The status filter has to be added while this is still a select:
        # the object returned by .alias() has no .where() method.
        ranked_ticks = ranked_ticks.where(
            JobTickTable.c.status.in_([status.value for status in statuses])
        )
    subquery = ranked_ticks.alias("subquery")

    query = db.select(
        [subquery.c.id, subquery.c.selector_id, subquery.c.tick_body]
    ).order_by(subquery.c.rank.asc())
    if limit is not None:
        # Comparing the rank with None would match no rows at all, so the
        # rank cut-off is added only when a limit was actually given.
        query = query.where(subquery.c.rank <= limit)

    rows = self.execute(query)

    results = defaultdict(list)
    for row in rows:
        tick_id = row[0]
        selector_id = row[1]
        tick_data = cast(TickData, deserialize_json_to_dagster_namedtuple(row[2]))
        results[selector_id].append(InstigatorTick(tick_id, tick_data))
    return results
def get_tunnel(self, remote_port, remote_host='localhost', local_port=None):
    """Create (but do not start) an ``SSHTunnelForwarder`` for a remote port.

    When ``self.password`` is set and not blank it is passed as
    ``ssh_password``. Otherwise no password is passed and
    ``host_pkey_directories`` is set to an empty list.

    Args:
        remote_port: Int port on the remote side of the tunnel.
        remote_host: String host on the remote side of the tunnel.
        local_port: Optional int local port. When ``None`` the local bind
            address is ``('localhost',)`` with no port.

    Returns:
        The configured ``SSHTunnelForwarder``.
    """
    check.int_param(remote_port, 'remote_port')
    check.str_param(remote_host, 'remote_host')
    check.opt_int_param(local_port, 'local_port')

    if local_port is None:
        bind_address = ('localhost',)
    else:
        bind_address = ('localhost', local_port)

    # Options shared by both authentication variants.
    tunnel_options = dict(
        ssh_port=self.remote_port,
        ssh_username=self.username,
        ssh_pkey=self.key_file,
        ssh_proxy=self.host_proxy,
        local_bind_address=bind_address,
        remote_bind_address=(remote_host, remote_port),
        logger=self.log,
    )

    if self.password and self.password.strip():
        tunnel_options['ssh_password'] = self.password
    else:
        tunnel_options['host_pkey_directories'] = []

    return SSHTunnelForwarder(self.remote_host, **tunnel_options)
def get_run_groups(graphene_info, filters=None, cursor=None, limit=None):
    """Return one ``DauphinRunGroup`` per run group reported by the instance.

    The ``'runs'`` entry of every group returned by
    ``instance.get_run_groups`` is replaced in place with the runs wrapped
    in the schema's ``PipelineRun`` type.

    Args:
        graphene_info: Resolver info; ``context.instance`` and ``schema``
            are read from it.
        filters: Optional ``PipelineRunsFilter``.
        cursor: Optional string cursor.
        limit: Optional int limit.
    """
    from ..schema.runs import DauphinRunGroup

    check.opt_inst_param(filters, 'filters', PipelineRunsFilter)
    check.opt_str_param(cursor, 'cursor')
    check.opt_int_param(limit, 'limit')

    instance = graphene_info.context.instance
    groups_by_root = instance.get_run_groups(filters=filters, cursor=cursor, limit=limit)

    # Replace the stored runs with their GraphQL wrappers before building
    # any result object.
    for group in groups_by_root.values():
        wrapped_runs = []
        for run in group['runs']:
            wrapped_runs.append(graphene_info.schema.type_named('PipelineRun')(run))
        group['runs'] = wrapped_runs

    return [
        DauphinRunGroup(root_run_id=root_id, runs=group['runs'])
        for root_id, group in groups_by_root.items()
    ]
def get_steps_to_execute(self, limit: int = None) -> List[ExecutionStep]:
    """Hand out the executable steps and mark them as in flight.

    Steps are ordered with ``self._sort_key_fn``. A truthy ``limit`` keeps
    only the first ``limit`` steps. Each returned step is moved from
    ``self._executable`` to ``self._in_flight`` and passed to
    ``self._prep_for_dynamic_outputs``.

    Args:
        limit: Optional maximum number of steps to return.

    Returns:
        The steps selected for execution, in sort order.
    """
    check.invariant(
        self._context_guard,
        "ActiveExecution must be used as a context manager",
    )
    check.opt_int_param(limit, "limit")

    self._update()

    ready_steps = [self.get_step_by_key(step_key) for step_key in self._executable]
    ready_steps.sort(key=self._sort_key_fn)

    if limit:
        ready_steps = ready_steps[:limit]

    for ready_step in ready_steps:
        self._in_flight.add(ready_step.key)
        self._executable.remove(ready_step.key)
        self._prep_for_dynamic_outputs(ready_step)

    return ready_steps
def test_opt_int_param():
    """``opt_int_param`` returns ints and ``None`` unchanged and rejects a str."""
    for accepted in (-1, 0, 1):
        assert check.opt_int_param(accepted, 'param_name') == accepted

    assert check.opt_int_param(None, 'param_name') is None

    with pytest.raises(ParameterCheckError):
        check.opt_int_param('s', 'param_name')
def test_opt_int_param():
    """``opt_int_param`` returns ints and ``None`` unchanged and rejects a str."""
    # Negative, zero and positive ints all come back as given.
    int_inputs = [-1, 0, 1]
    int_outputs = [check.opt_int_param(value, "param_name") for value in int_inputs]
    for expected, actual in zip(int_inputs, int_outputs):
        assert actual == expected

    # None is allowed and is returned as None.
    assert check.opt_int_param(None, "param_name") is None

    # A string is not an int.
    with pytest.raises(ParameterCheckError):
        check.opt_int_param("s", "param_name")
def get_runs(self, filters: PipelineRunsFilter = None, cursor: str = None, limit: int = None) -> List[PipelineRun]:
    """Return stored runs in reverse insertion order, optionally filtered.

    A run is kept only when it satisfies every filter field that is set:
    ``run_ids``, ``statuses``, ``pipeline_name``, ``mode``, ``tags`` (every
    key/value pair must match) and ``snapshot_id``.

    Args:
        filters: Optional ``PipelineRunsFilter``.
        cursor: Optional string cursor, passed to ``self._slice``.
        limit: Optional int limit, passed to ``self._slice``.

    Returns:
        The result of ``self._slice`` on the matching runs.
    """
    check.opt_inst_param(filters, "filters", PipelineRunsFilter)
    check.opt_str_param(cursor, "cursor")
    check.opt_int_param(limit, "limit")

    reversed_runs = list(self._runs.values())[::-1]

    if not filters:
        return self._slice(reversed_runs, cursor, limit)

    def _is_match(candidate):
        # Each clause is true when a set filter field rejects the candidate.
        rejected = (
            (filters.run_ids and candidate.run_id not in filters.run_ids)
            or (filters.statuses and candidate.status not in filters.statuses)
            or (filters.pipeline_name and filters.pipeline_name != candidate.pipeline_name)
            or (filters.mode and filters.mode != candidate.mode)
            or (
                filters.tags
                and not all(
                    candidate.tags.get(tag_key) == tag_value
                    for tag_key, tag_value in filters.tags.items()
                )
            )
            or (filters.snapshot_id and filters.snapshot_id != candidate.pipeline_snapshot_id)
        )
        return not rejected

    matching_runs = [candidate for candidate in reversed_runs if _is_match(candidate)]
    return self._slice(matching_runs, cursor=cursor, limit=limit)
def get_asset_events(graphene_info, asset_key, partitions=None, cursor=None, limit=None):
    """Return the step-materialization events recorded for an asset key.

    Args:
        graphene_info: Resolver info; ``context.instance`` is read from it.
        asset_key: ``AssetKey`` to look up.
        partitions: Forwarded to ``instance.events_for_asset_key`` unvalidated.
        cursor: Optional string, forwarded as ``before_cursor``.
        limit: Optional int limit.

    Returns:
        A ``GrapheneAssetsNotSupportedError`` when the instance is not asset
        aware. Otherwise a list of the events that are Dagster events whose
        type is ``STEP_MATERIALIZATION``.
    """
    from ..schema.errors import GrapheneAssetsNotSupportedError

    check.inst_param(asset_key, "asset_key", AssetKey)
    check.opt_str_param(cursor, "cursor")
    check.opt_int_param(limit, "limit")

    instance = graphene_info.context.instance
    if not instance.is_asset_aware:
        return GrapheneAssetsNotSupportedError(
            message="The configured event log storage is not asset aware."
        )

    def _is_step_materialization(event):
        return (
            event.is_dagster_event
            and event.dagster_event.event_type_value
            == DagsterEventType.STEP_MATERIALIZATION.value
        )

    asset_events = instance.events_for_asset_key(
        asset_key, partitions=partitions, before_cursor=cursor, limit=limit
    )

    # Each item is a (record id, event) pair; only the event is returned.
    return [
        event for _record_id, event in asset_events if _is_step_materialization(event)
    ]
def __new__(
    cls,
    last_tick_timestamp: Optional[float] = None,
    last_run_key: Optional[str] = None,
    min_interval: Optional[int] = None,
    cursor: Optional[str] = None,
):
    """Validate every field and build the ``SensorInstigatorData`` tuple.

    Args:
        last_tick_timestamp: Validated with ``check.opt_float_param``.
        last_run_key: Validated with ``check.opt_str_param``.
        min_interval: Validated with ``check.opt_int_param``.
        cursor: Validated with ``check.opt_str_param``.
    """
    checked_timestamp = check.opt_float_param(last_tick_timestamp, "last_tick_timestamp")
    checked_run_key = check.opt_str_param(last_run_key, "last_run_key")
    checked_interval = check.opt_int_param(min_interval, "min_interval")
    checked_cursor = check.opt_str_param(cursor, "cursor")

    return super(SensorInstigatorData, cls).__new__(
        cls,
        checked_timestamp,
        checked_run_key,
        checked_interval,
        checked_cursor,
    )
def _get_pipeline_definition_args(dag, unique_id=None):
    """Collect pipeline dependencies and solid definitions from an Airflow DAG.

    Every root task of ``dag`` is handed to ``_traverse_airflow_dag``
    together with the shared accumulators.

    Args:
        dag: The Airflow ``DAG`` to walk.
        unique_id: Optional int, forwarded to ``_traverse_airflow_dag``.

    Returns:
        A ``(pipeline_dependencies, solid_defs)`` tuple: a dict and a list.
    """
    check.inst_param(dag, 'dag', DAG)
    unique_id = check.opt_int_param(unique_id, 'unique_id')

    pipeline_dependencies, solid_defs, seen_tasks = {}, [], []

    # Roots are visited in task_id order to enforce a predictable
    # iteration order.
    for root_task in sorted(dag.roots, key=lambda task: task.task_id):
        _traverse_airflow_dag(
            root_task, seen_tasks, pipeline_dependencies, solid_defs, unique_id
        )

    return (pipeline_dependencies, solid_defs)
def __init__(
    self,
    max_concurrent_runs=None,
    tag_concurrency_limits=None,
    dequeue_interval_seconds=None,
    inst_data=None,
):
    """Validate the queue settings and store them on private attributes.

    Args:
        max_concurrent_runs: Optional int; ``10`` is handed to
            ``check.opt_int_param`` as its third argument. Must be at least
            ``-1``; per the error message, ``-1`` disables the limit.
        tag_concurrency_limits: Optional list, validated with
            ``check.opt_list_param``.
        dequeue_interval_seconds: Optional int; ``5`` is handed to
            ``check.opt_int_param`` as its third argument.
        inst_data: Optional ``ConfigurableClassData`` instance.
    """
    self._inst_data = check.opt_inst_param(
        inst_data,
        "inst_data",
        ConfigurableClassData,
    )

    self._max_concurrent_runs = check.opt_int_param(
        max_concurrent_runs,
        "max_concurrent_runs",
        10,
    )
    run_cap_is_valid = self._max_concurrent_runs >= -1
    check.invariant(
        run_cap_is_valid,
        "Negative values other than -1 (which disables the limit) for max_concurrent_runs are disallowed.",
    )

    self._tag_concurrency_limits = check.opt_list_param(
        tag_concurrency_limits,
        "tag_concurrency_limits",
    )
    self._dequeue_interval_seconds = check.opt_int_param(
        dequeue_interval_seconds,
        "dequeue_interval_seconds",
        5,
    )

    super().__init__()
def get_asset_materializations(
    graphene_info,
    asset_key,
    partitions=None,
    limit=None,
    before_timestamp=None,
    after_timestamp=None,
):
    """Return the event log entries of an asset's materializations.

    Args:
        graphene_info: Resolver info; ``context.instance`` is read from it.
        asset_key: ``AssetKey`` to look up.
        partitions: Forwarded as ``asset_partitions`` without validation.
        limit: Optional int limit.
        before_timestamp: Optional float upper bound.
        after_timestamp: Forwarded to the filter without validation.

    Returns:
        A list with the ``event_log_entry`` of every record returned by
        ``instance.get_event_records``.
    """
    check.inst_param(asset_key, "asset_key", AssetKey)
    check.opt_int_param(limit, "limit")
    check.opt_float_param(before_timestamp, "before_timestamp")

    instance = graphene_info.context.instance

    materialization_filter = EventRecordsFilter(
        event_type=DagsterEventType.ASSET_MATERIALIZATION,
        asset_key=asset_key,
        asset_partitions=partitions,
        before_timestamp=before_timestamp,
        after_timestamp=after_timestamp,
    )
    matching_records = instance.get_event_records(materialization_filter, limit=limit)

    log_entries = []
    for matching_record in matching_records:
        log_entries.append(matching_record.event_log_entry)
    return log_entries
def _add_filter_limit(self, query, before=None, after=None, limit=None, statuses=None):
    """Apply the optional time-window, limit and status clauses to a tick query.

    Every argument is applied only when it is truthy.

    Args:
        query: The select to extend.
        before: Optional float timestamp; keeps ticks strictly earlier.
        after: Optional float timestamp; keeps ticks strictly later.
        limit: Optional int row limit.
        statuses: Optional list of ``TickStatus``; keeps ticks whose status
            is one of their values.

    Returns:
        The extended query.
    """
    check.opt_float_param(before, "before")
    check.opt_float_param(after, "after")
    check.opt_int_param(limit, "limit")
    check.opt_list_param(statuses, "statuses", of_type=TickStatus)

    tick_time = JobTickTable.c.timestamp

    if before:
        upper_bound = utc_datetime_from_timestamp(before)
        query = query.where(tick_time < upper_bound)

    if after:
        lower_bound = utc_datetime_from_timestamp(after)
        query = query.where(tick_time > lower_bound)

    if limit:
        query = query.limit(limit)

    if statuses:
        status_values = [status.value for status in statuses]
        query = query.where(JobTickTable.c.status.in_(status_values))

    return query