def execute(self, env: MasonEnvironment, response: Response, dry_run: bool = True) -> OperatorResponse:
    """Validate this operator's module and, unless dry-running, run it.

    Args:
        env: Environment handed to ``self.module`` and to the operator's ``run``.
        response: Response object that collects info/error messages.
        dry_run: When true (the default) only report that the operator is
            valid; the operator itself is not run.

    Returns:
        The result of ``module.run`` for a real run, otherwise an
        ``OperatorResponse`` wrapping ``response``.
    """
    try:
        module = self.module(env)

        if not isinstance(module, OperatorDefinition):
            response.add_error(
                f"Module does not contain a valid OperatorDefinition. See /examples for sample operator implementations. \nMessage: {module.reason}"
            )
            return OperatorResponse(response)

        if dry_run:
            response.add_info(
                f"Valid Operator: {self.namespace}:{self.command} with specified parameters."
            )
            return OperatorResponse(response)

        # Kept inside the try block: a ModuleNotFoundError raised while the
        # operator runs is reported the same way as a failed module lookup.
        return module.run(env, self.config, self.parameters, response)
    except ModuleNotFoundError as e:
        response.add_error(f"Module Not Found: {e}")
        return OperatorResponse(response)
def run(self, env: MasonEnvironment, response: "Optional[Response]" = None) -> OperatorResponse:
    """Register this workflow's DAG with the configured scheduler and trigger it.

    Args:
        env: Environment forwarded to the scheduler's ``trigger_schedule``.
        response: Response that collects messages. A fresh ``Response`` is
            created per call when omitted. (The annotation is quoted so it is
            not evaluated when the function is defined.)

    Returns:
        An ``OperatorResponse`` wrapping the final response.
    """
    # A `Response()` default in the signature would be built once at definition
    # time and shared (and mutated) by every call that omits the argument.
    if response is None:
        response = Response()

    scheduler = self.config.scheduler()
    if isinstance(scheduler, SchedulerClient):
        response.add_info(
            f"Registering workflow dag {self.name} with {scheduler.client.name()}."
        )
        schedule_id, response, client_dag = scheduler.register_dag(
            self.name, self.dag, self.schedule, response)
        if not response.errored():
            response.add_info(f"Registered schedule {schedule_id}")

        # TODO: FIX
        # if client_dag and output_path:
        #     with tempfile.NamedTemporaryFile("w", delete=False) as f:
        #         json = client_dag.to_json()
        #         response.add_info(f"Saving client dag to {output_path}")
        #         f.write(json)
        #         f.close()
        #         response = self.config.storage.client.save_to(f.name, output_path, response)

        if self.schedule:
            response.add_warning(
                f"Triggering workflow off schedule: {self.schedule.definition}"
            )

        # NOTE(review): the trigger is attempted even when registration
        # reported errors -- confirm this is intended.
        response.add_info(f"Triggering schedule: {schedule_id}")
        response = scheduler.trigger_schedule(schedule_id, response, env)
    else:
        response.add_error("Scheduler client not defined")

    return OperatorResponse(response)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Trigger the schedule associated with a table.

    Reads the required ``database_name`` and ``table_name`` parameters and
    passes them to the scheduler's ``trigger_schedule_for_table``.

    Returns:
        An ``OperatorResponse`` wrapping the response returned by the scheduler.
    """
    db: str = parameters.get_required("database_name")
    tbl: str = parameters.get_required("table_name")

    # TODO: break this up into 2 calls between scheduler and metastore, remove trigger_schedule_for_table from engine definition, table may not be defined for scheduler
    scheduler = config.scheduler()
    triggered = scheduler.trigger_schedule_for_table(tbl, db, response)
    return OperatorResponse(triggered)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Run a query against a metastore table through the execution client.

    Required parameters: ``query_string``, ``database_name``, ``table_name``.
    Optional parameter: ``output_path``; it is resolved to a storage path only
    when the configured storage is a ``StorageClient``.

    Returns:
        An ``OperatorResponse`` carrying the response and either the result of
        ``run_job`` or an ``InvalidJob`` built from the table lookup message.
    """
    query_string = parameters.get_required("query_string")
    database_name = parameters.get_required("database_name")
    table_name = parameters.get_required("table_name")
    output_path = parameters.get_optional("output_path")

    # TODO?: Sanitize the query string

    final: Union[ExecutedJob, InvalidJob]

    # Thread the incoming response through the lookup so messages recorded
    # before this operator ran are not dropped when `response` is rebound.
    table, response = config.metastore().get_table(
        database_name, table_name, response=response)

    outp: Optional[Path] = None
    if output_path:
        storage = config.storage()
        if isinstance(storage, StorageClient):
            outp = storage.path(output_path)

    if isinstance(table, Table):
        response.add_info(f'Running Query "{query_string}"')
        job = QueryJob(query_string, table, outp)
        final, response = config.execution().run_job(job, response)
    else:
        final = InvalidJob(table.message())

    return OperatorResponse(response, final)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Delete the schedule named by the required ``schedule_name`` parameter.

    Returns:
        An ``OperatorResponse`` wrapping the response returned by the
        scheduler's ``delete_schedule``.
    """
    name: str = parameters.get_required("schedule_name")
    scheduler = config.scheduler()
    return OperatorResponse(scheduler.delete_schedule(name, response))
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """List the tables of the database named by ``database_name``.

    Returns:
        An ``OperatorResponse`` carrying the metastore's response and the
        result of passing the listing through ``compute``.
    """
    db: str = parameters.get_required("database_name")
    metastore = config.metastore()
    listing, response = metastore.list_tables(db, response)
    # `compute` is defined outside this block; presumably it materializes the
    # listing -- verify against its definition.
    return OperatorResponse(response, compute(listing))
def dry_run(self, env: MasonEnvironment, response: "Optional[Response]" = None) -> OperatorResponse:
    """Describe the workflow DAG without registering or triggering anything.

    Adds the DAG's ``display()`` output as info messages and one warning per
    entry in ``self.dag.invalid_steps`` (using each step's ``reason``).

    Args:
        env: Unused by this method.
        response: Response that collects messages. A fresh ``Response`` is
            created per call when omitted. (The annotation is quoted so it is
            not evaluated when the function is defined.)

    Returns:
        An ``OperatorResponse`` wrapping the response.
    """
    # A `Response()` default in the signature would be shared across calls and
    # accumulate messages from earlier dry runs.
    if response is None:
        response = Response()

    response.add_info("Performing Dry Run for Workflow")
    response.add_info("")
    response.add_info("Valid Workflow DAG Definition:")
    response.add_info("-" * 80)
    response.add_info(f"\n{self.dag.display()}")
    response.add_info("Finished")

    for step in self.dag.invalid_steps:
        response.add_warning(step.reason)

    return OperatorResponse(response)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Merge the files under ``input_path`` when their schemas conflict.

    Required parameters: ``input_path``, ``output_path``.
    Optional parameter: ``parse_headers`` (forwarded as ``read_headers``).

    A merge job is run only when the inferred table has a schema conflict whose
    unique schemas are all of one supported type; every other outcome is
    reported as an ``InvalidJob``.

    Returns:
        An ``OperatorResponse`` carrying the response and the job outcome.
    """
    SUPPORTED_SCHEMAS = {"parquet", "csv", "json", "jsonl"}

    input_path = parameters.get_required("input_path")
    output_path = parameters.get_required("output_path")
    parse_headers = parameters.get_optional("parse_headers")

    table, response = config.storage().infer_table(
        input_path, "input_table", {"read_headers": parse_headers}, response)

    final: Union[ExecutedJob, InvalidJob]

    if isinstance(table, Table):
        # A cleanly inferred table means there is nothing to merge.
        final = InvalidJob("No conflicting schemas found. Merge Unecessary")
        return OperatorResponse(response, final)

    conflicting_table = table.conflicting_table()
    if not conflicting_table:
        final = InvalidJob(
            f"No conflicting schemas found at {input_path}. Merge unecessary. Invalid Schemas {table.message()}"
        )
        return OperatorResponse(response, final)

    schemas = conflicting_table.schema_conflict.unique_schemas
    schema_types: Set[str] = {schema.type for schema in schemas}

    if len(schemas) == 0 or not schema_types.issubset(SUPPORTED_SCHEMAS):
        unsupported = ', '.join(schema_types.difference(SUPPORTED_SCHEMAS))
        final = InvalidJob(
            f"Unsupported schemas for merge operator: {unsupported}")
    elif len(schema_types) != 1:
        final = InvalidJob("Mixed schemas not supported at this time.")
    else:
        # Build the job only here: taking next(iter(...)) of an empty set
        # raises StopIteration, so it must come after the emptiness check.
        storage = config.storage()
        job = MergeJob(storage.path(input_path), storage.path(output_path),
                       next(iter(schema_types)))
        executed, response = config.execution().run_job(job, response)
        if isinstance(executed, ExecutedJob):
            final = job.running()
        else:
            final = InvalidJob(f"Job {job.id} errored: {executed.reason}")

    return OperatorResponse(response, final)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, resp: Response) -> OperatorResponse:
    """Fetch a table from the metastore.

    Required parameters: ``database_name``, ``table_name``.
    Optional parameter: ``read_headers``; any string value enables it.

    Returns:
        An ``OperatorResponse`` carrying the metastore's response and the
        looked-up table object.
    """
    db: str = parameters.get_required("database_name")
    tbl: str = parameters.get_required("table_name")

    # The flag is true whenever the parameter is a string, whatever its text.
    header_param = parameters.get_optional("read_headers")
    lookup_options = {"read_headers": isinstance(header_param, str)}

    found, response = config.metastore().get_table(
        db, tbl, options=lookup_options, response=resp)
    return OperatorResponse(response, found)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Run a format job that writes a metastore table to ``output_path``.

    Required parameters: ``table_name``, ``database_name``, ``output_path``,
    ``format``.
    Optional parameters: ``sample_size`` (falls back to ``"3"``),
    ``partition_columns``, ``filter_columns``, ``partitions``.

    Returns:
        An ``OperatorResponse`` carrying the response and either the result of
        ``run_job`` or an ``InvalidJob`` explaining why nothing was run.
    """
    table_name: str = parameters.get_required("table_name")
    database_name: str = parameters.get_required("database_name")
    output_path: str = parameters.get_required("output_path")
    output_format: str = parameters.get_required("format")

    sample_size: str = parameters.get_optional("sample_size") or "3"
    partition_columns: Optional[str] = parameters.get_optional("partition_columns")
    filter_columns: Optional[str] = parameters.get_optional("filter_columns")
    partitions: Optional[str] = parameters.get_optional("partitions")

    destination = config.storage().path(output_path)
    metastore_options = {"sample_size": sample_size}
    table, response = config.metastore().get_table(
        database_name, table_name, options=metastore_options, response=response)
    credentials = config.metastore().credentials()

    # Credentials are checked before the table, so a credentials problem is
    # the one reported when both are invalid.
    if not isinstance(credentials, MetastoreCredentials):
        executed = InvalidJob(
            f"Invalid Metastore Credentials: {credentials.reason}")
    elif not isinstance(table, Table):
        executed = InvalidJob(
            f"Table not found: {table_name}, {database_name}. Messages: {table.message()}")
    else:
        job = FormatJob(table, destination, output_format, partition_columns,
                        filter_columns, partitions, credentials)
        executed, response = config.execution().run_job(job, response)

    return OperatorResponse(response, executed)
def run(self, env: MasonEnvironment, response: "Optional[Response]" = None) -> OperatorResponse:
    """Report that this operator is invalid; nothing is run.

    Args:
        env: Unused by this method.
        response: Response that receives the error. A fresh ``Response`` is
            created per call when omitted. (The annotation is quoted so it is
            not evaluated when the function is defined.)

    Returns:
        An ``OperatorResponse`` wrapping the response, which carries the
        error message and status 400.
    """
    # A `Response()` default in the signature would be shared across calls, so
    # errors from one invalid operator would leak into the next call.
    if response is None:
        response = Response()

    response.add_error(f"Invalid Operator. Reason: {self.reason}")
    response.set_status(400)
    return OperatorResponse(response)
def dry_run(self, env: MasonEnvironment, response: "Optional[Response]" = None) -> OperatorResponse:
    """Report that this resource is invalid; nothing is validated further.

    Args:
        env: Unused by this method.
        response: Response that receives the error. A fresh ``Response`` is
            created per call when omitted. (The annotation is quoted so it is
            not evaluated when the function is defined.)

    Returns:
        An ``OperatorResponse`` wrapping the response, which carries the
        error message and status 400.
    """
    # A `Response()` default in the signature would be shared across calls and
    # accumulate errors from earlier dry runs.
    if response is None:
        response = Response()

    # f-string rather than `+` so a non-string reason is still reported.
    response.add_error(f"Invalid Resource: {self.reason}")
    response.set_status(400)
    return OperatorResponse(response)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Record that operator2 ran and return the response.

    ``env``, ``config`` and ``parameters`` are accepted but not used.

    Returns:
        An ``OperatorResponse`` wrapping ``response``.
    """
    message = "Running operator2"
    response.add_info(message)
    result = OperatorResponse(response)
    return result
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Return the incoming response wrapped in an ``OperatorResponse``.

    ``env``, ``config`` and ``parameters`` are accepted but not used, and no
    messages are added to ``response``.
    """
    wrapped = OperatorResponse(response)
    return wrapped
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Look up a job by the required ``job_id`` parameter.

    Returns:
        An ``OperatorResponse`` carrying the response and the job object
        returned by the execution client's ``get_job``.
    """
    job_id: str = parameters.get_required("job_id")

    # Resolve the execution client once and reuse it for the lookup.
    execution = config.execution()
    job, response = execution.get_job(job_id, response)

    return OperatorResponse(response, job)
def run(self, env: MasonEnvironment, config: Config, parameters: ValidatedParameters, response: Response) -> OperatorResponse:
    """Record that operator1 ran and return a fixed table.

    ``env``, ``config`` and ``parameters`` are accepted but not used.

    Returns:
        An ``OperatorResponse`` carrying ``response`` and a ``Table`` named
        ``"test_table"`` built with an ``EmptySchema``.
    """
    # The table is built before the info message is recorded.
    fixed_table = Table("test_table", EmptySchema())
    response.add_info("Running operator1")
    return OperatorResponse(response, fixed_table)