def tests(env: MasonEnvironment, config: Config, op: Operator):
    # valid delete
    params = OperatorParameters(
        parameter_string="table_name:good_table,database_name:good_database")
    good = op.validate(config, params).run(env, Response())
    assert (good.with_status() == ({
        'Info': ['Table good_table successfully deleted.']
    }, 200))

    # database DNE
    params = OperatorParameters(
        parameter_string="table_name:bad_table,database_name:bad_database")
    bad = op.validate(config, params).run(env, Response())
    assert (bad.with_status() == ({
        'Errors': ['Database bad_database not found.']
    }, 400))

    # table DNE
    params = OperatorParameters(
        parameter_string="table_name:bad_table,database_name:good_database")
    bad = op.validate(config, params).run(env, Response())
    assert (bad.with_status() == ({
        'Errors': ['Table bad_table not found.']
    }, 400))
def tests(env: MasonEnvironment, config: Config, op: Operator):
    # valid refresh
    params = OperatorParameters(
        parameter_string="table_name:catalog_poc_data,database_name:crawler-poc")
    refresh = op.validate(config, params).run(env, Response())
    assert (refresh.with_status() == table.refresh(False))

    # already refreshing
    params = OperatorParameters(
        parameter_string="table_name:catalog_poc_data_refreshing,database_name:crawler-poc")
    refreshing = op.validate(config, params).run(env, Response())
    assert (refreshing.with_status() == table.refresh(True))
def run(self, env: MasonEnvironment, response: Response = Response()) -> OperatorResponse:
    scheduler = self.config.scheduler()
    if isinstance(scheduler, SchedulerClient):
        response.add_info(
            f"Registering workflow dag {self.name} with {scheduler.client.name()}.")
        schedule_id, response, client_dag = scheduler.register_dag(
            self.name, self.dag, self.schedule, response)
        if not response.errored():
            response.add_info(f"Registered schedule {schedule_id}")

            # TODO: FIX
            # if client_dag and output_path:
            #     with tempfile.NamedTemporaryFile("w", delete=False) as f:
            #         json = client_dag.to_json()
            #         response.add_info(f"Saving client dag to {output_path}")
            #         f.write(json)
            #         f.close()
            #         response = self.config.storage.client.save_to(f.name, output_path, response)

            if self.schedule:
                response.add_warning(
                    f"Triggering workflow off schedule: {self.schedule.definition}")
            response.add_info(f"Triggering schedule: {schedule_id}")
            response = scheduler.trigger_schedule(schedule_id, response, env)
    else:
        response.add_error("Scheduler client not defined")

    return OperatorResponse(response)
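# --- Hedged sketch (not part of the original module) ---------------------------
# A minimal version of the "TODO: FIX" block above, assuming `output_path` is
# threaded in as an argument and that storage.client.save_to has the signature
# shown in the commented-out code; both are taken from that comment, not verified.
def save_client_dag(client_dag, output_path, storage, response):
    import tempfile
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        f.write(client_dag.to_json())
    response.add_info(f"Saving client dag to {output_path}")
    return storage.client.save_to(f.name, output_path, response)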
def run(self, config: SparkConfig, job: Job, resp: Optional[Response] = None
        ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    # TODO: Replace with python kubernetes api
    # TODO: Set up kubernetes configuration, run on docker version
    response: Response = resp or Response()

    job.set_id("mason" + "-" + job.type + "-" + str(uuid4()))
    merged_config = merge_config(config, job)
    job_id = merged_config["metadata"]["name"]
    conf = dict(merged_config)

    final: Union[ExecutedJob, InvalidJob]

    # write the merged manifest to a temp file; leaving the `with` block closes
    # and flushes it before kubectl reads it
    with tempfile.NamedTemporaryFile(delete=False, mode='w') as yaml_file:
        yaml.dump(conf, yaml_file)
        command = ["kubectl", "apply", "-f", yaml_file.name]

    response.add_info(f"Executing Spark Kubernetes Operator. job_id: {job_id}")
    stdout, stderr = run_sys_call(command)

    if len(stdout) > 0:
        final = job.running(stdout)
    else:
        if len(stderr) > 0:
            final = job.errored(stderr)
        else:
            final = job.running()

    return final, response
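# --- Hedged sketch (not part of the original module) ---------------------------
# The TODO above says "Replace with python kubernetes api". One way that might
# look with the official `kubernetes` client, assuming the merged config is a
# spark-on-k8s-operator SparkApplication manifest; the group/version/plural below
# are assumptions and must match whatever CRD merge_config actually emits.
from kubernetes import client as k8s_client, config as k8s_config

def apply_spark_manifest(manifest: dict, namespace: str = "default") -> dict:
    k8s_config.load_kube_config()  # use load_incluster_config() when running in-cluster
    api = k8s_client.CustomObjectsApi()
    # rough equivalent of `kubectl apply -f <file>` for a new custom resource
    return api.create_namespaced_custom_object(
        group="sparkoperator.k8s.io",
        version="v1beta2",
        namespace=namespace,
        plural="sparkapplications",
        body=manifest,
    )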
def tests(env: MasonEnvironment, config: Config, op: Operator):
    params = OperatorParameters(
        parameter_string="database_name:mason-sample-data,table_name:tests/in/csv/,format:boogo,output_path:mason-sample-data/tests/out/csv/")
    good = op.validate(config, params).run(env, Response())
    invalid_job = good.object
    assert (isinstance(invalid_job, InvalidJob))

    params = OperatorParameters(
        parameter_string="database_name:mason-sample-data,table_name:tests/in/csv/,format:csv,output_path:good_output_path")
    good = op.validate(config, params).run(env, Response())
    executed_job = good.object
    assert (isinstance(executed_job, ExecutedJob))
def test_local_client(self):
    base.set_log_level()
    env = self.before()
    config = Resources(env).get_config("8")

    # DAG has cycle
    step_params = {"config_id": "8", "parameters": {"test_param": "test"}}
    params = {
        "step_1": step_params,
        "step_2": step_params,
        "step_3": step_params,
        "step_4": step_params,
        "step_5": step_params,
        "step_6": step_params,
    }
    wf = self.get_workflow(env, "workflow_local_scheduler")

    if isinstance(wf, MalformedResource):
        raise Exception(f"Workflow not found: {wf.get_message()}")

    if isinstance(config, MalformedResource):
        raise Exception(f"Config not found: {config.get_message()}")

    parameters = WorkflowParameters(parameter_dict=params)
    validated = wf.validate(env, config, parameters)
    assert (isinstance(validated, ValidWorkflow))
    operator_response = validated.run(env, Response())
    info = """
    Registering workflow dag test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5 with local.
    Registering DAG in local memory
    Registered schedule test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5
    Triggering schedule: test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5
    Running dag
    * step_1
    | * step_2
    | | * step_3
    | |/
    * | step_4
    |/
    * step_5
    * step_6
    Running step step_1
    Running operator1
    Running step step_2
    Running operator2
    Running step step_3
    Running operator3
    Running step step_4
    Running operator4
    Running step step_5
    Running operator5
    Running step step_6
    Running operator6
    """
    response = operator_response.response
    assert (len(response.errors) == 0)
    assert (clean_uuid(clean_string("\n".join(
        response.info))) == clean_uuid(clean_string(info)))
def tests(env: MasonEnvironment, config: Config, op: Operator):
    # Database Exists
    params = OperatorParameters(parameter_string="database_name:crawler-poc")
    valid = op.validate(config, params)
    exists = valid.run(env, Response())
    assert exists.with_status() == table.index(config.metastore().client.name())

    # Database DNE
    params = OperatorParameters(parameter_string="database_name:bad-database")
    dne = op.validate(config, params).run(env, Response())
    assert (dne.with_status() == table.index(
        config.metastore().client.name(), False))
def tests(env: MasonEnvironment, config: Config, op: Operator):
    # valid delete
    params = OperatorParameters(parameter_string="schedule_name:good_schedule")
    good = op.validate(config, params).run(env, Response())
    assert (good.with_status() == ({
        'Info': ['Schedule good_schedule successfully deleted.']
    }, 200))

    # dne
    params = OperatorParameters(parameter_string="schedule_name:bad_schedule")
    good = op.validate(config, params).run(env, Response())
    assert (good.with_status() == ({
        'Errors': ["Crawler entry with name bad_schedule does not exist"]
    }, 400))
def run(self, job: Job, resp: Optional[Response] = None, mode: str = "async"
        ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    final: Union[ExecutedJob, InvalidJob]
    response: Response = resp or Response()

    try:
        if self.scheduler:
            if isinstance(job, FormatJob):
                final = self.run_job(job.type, job.spec(), self.scheduler, mode) or ExecutedJob(
                    "format_job",
                    f"Job queued to format {job.table.schema.type} table as {job.format} and save to {job.output_path.path_str}")
            elif isinstance(job, QueryJob):
                final = self.run_job(job.type, job.spec(), self.scheduler)
            else:
                final = job.errored("Job type not supported for Dask")
        else:
            final = InvalidJob("Dask Scheduler not defined")
    except OSError as e:
        final = InvalidJob(message(e))

    return final, response
def execute_ddl(self, ddl: DDLStatement, database: Database,
                response: Optional[Response] = None
                ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    return InvalidJob("Client 'execute_ddl' not implemented"), response or Response()
def save(self, state_store: MasonStateStore, overwrite: bool = False,
         response: Response = Response()) -> Response:
    message = self.get_message()
    if message:
        response.add_error(message)
    return response
def apply(file: str, overwrite: bool = False, log_level: Optional[str] = None,
          env: Optional[MasonEnvironment] = None):
    environment: MasonEnvironment = env or MasonEnvironment().initialize()
    logger.set_level(log_level)
    response = Response()

    all = Resources(environment).get_all(file)
    for r in all:
        response = r.save(environment.state_store, overwrite, response)

    return response.with_status()
def tests(env: MasonEnvironment, config: Config, op: Operator):
    # valid query
    query = "SELECT * from $table limit 3"
    output_path = from_root("/.tmp/")
    params = OperatorParameters(
        parameter_string=f"query_string:{query},database_name:good_database,table_name:good_table,output_path:{output_path}")
    result = op.validate(config, params).run(env, Response())
    exp = {
        "1": [
            'Running Query "SELECT * from $table limit 3"',
            'Running Athena query. query_id: test',
            'Running job id=test'
        ],
        "4": [
            f'Table succesfully formatted as parquet and exported to {output_path}'
        ]
    }
    expect = {'Info': exp[config.id]}
    assert (result.with_status() == (expect, 200))

    # bad permissions
    query = "SELECT * from $table limit 3"
    params = OperatorParameters(
        parameter_string=f"query_string:{query},database_name:access_denied,table_name:good_table,output_path:{output_path}")
    result = op.validate(config, params).run(env, Response())
    exp_2 = {
        "1": ({
            'Errors': [
                'Job errored: Access denied for credentials. Ensure associated user or role has permission to CreateNamedQuery on athena'
            ],
            'Info': ['Running Query "SELECT * from $table limit 3"']
        }, 403),
        "4": ({
            'Info': [
                f'Table succesfully formatted as parquet and exported to {output_path}'
            ]
        }, 200)
    }
    assert (result.with_status() == exp_2[config.id])
def get_database(self, database_name: str, response: Optional[Response] = None
                 ) -> Tuple[Result[Database, InvalidDatabase], Response]:
    tables, response = self.list_tables(database_name, response or Response())
    database = tables.map(lambda a: Database("s3_table", a)).alt(
        lambda b: InvalidDatabase(b.error or b.message()))
    return database, response
def tests(env: MasonEnvironment, config: Config, op: Operator):
    # database DNE
    params = OperatorParameters(
        parameter_string="database_name:bad-database,storage_path:crawler-poc/catalog_poc_data")
    good = op.validate(config, params).run(env, Response())
    assert (good.with_status() == ({
        'Errors': ['Job errored: Metastore database bad-database not found'],
        'Info': ['Table inferred: catalog_poc_data']
    }, 404))

    # bad path
    params = OperatorParameters(
        parameter_string="database_name:crawler-poc,storage_path:crawler-poc/bad-table")
    good = op.validate(config, params).run(env, Response())
    assert (good.with_status() == ({
        'Errors': [
            'No keys at s3://crawler-poc/bad-table',
            'Job errored: Invalid Tables: No keys at s3://crawler-poc/bad-table'
        ]
    }, 404))

    # valid path
    params = OperatorParameters(
        parameter_string="database_name:crawler-poc,storage_path:crawler-poc/catalog_poc_data,output_path:crawler-poc/athena/")
    good = op.validate(config, params).run(env, Response())

    def clean(s: List[str]):
        return list(map(lambda i: clean_uuid(clean_string(i)), s))

    infos = clean(good.formatted()["Info"])
    expect = [
        'Tableinferred:catalog_poc_data',
        'RunningAthenaquery.query_id:test_id',
        'Runningjobid=test_id'
    ]
    assert (infos == expect)
def dry_run(self, env: MasonEnvironment, response: Response = Response()) -> OperatorResponse:
    response.add_info("Performing Dry Run for Workflow")
    response.add_info("")
    response.add_info("Valid Workflow DAG Definition:")
    response.add_info("-" * 80)
    response.add_info(f"\n{self.dag.display()}")
    response.add_info("Finished")

    for r in list(map(lambda s: s.reason, self.dag.invalid_steps)):
        response.add_warning(r)

    return OperatorResponse(response)
def list_keys(self, path: str, response: Optional[Response] = None
              ) -> Tuple[List[Path], Response]:
    resp: Response = response or Response()
    keys = self.client().find(path)
    resp.add_response({'keys': keys})

    if len(keys) > 0:
        paths = list(map(lambda k: self.get_path(k), keys))
    else:
        paths = []

    return paths, resp
def tests(env: MasonEnvironment, config: Config, op: Operator):
    # Database and table Exist
    params = OperatorParameters(
        parameter_string="database_name:crawler-poc,table_name:catalog_poc_data")
    exists = op.validate(config, params).run(env, Response())
    assert (exists.with_status() == table.get(config.metastore().client.name(), 1))

    # Database DNE
    params = OperatorParameters(
        parameter_string="database_name:bad-database,table_name:catalog_poc_data")
    dne = op.validate(config, params).run(env, Response())
    assert (dne.with_status() == table.get(config.metastore().client.name(), 2))

    # Table DNE
    params = OperatorParameters(
        parameter_string="database_name:crawler-poc,table_name:bad-table")
    dne2 = op.validate(config, params).run(env, Response())
    assert (dne2.with_status() == table.get(config.metastore().client.name(), 3))
def get_database(self, database_name: str, response: Optional[Response] = None
                 ) -> Tuple[Result[Database, InvalidDatabase], Response]:
    resp = response or Response()
    try:
        result = self.client().get_tables(DatabaseName=database_name)
    except ClientError as e:
        result = e.response

    error, status, message = self.parse_response(result)
    resp.add_response(result)

    if error == "EntityNotFoundException":
        resp.set_status(404)
        return Failure(InvalidDatabase(f"Database {database_name} not found")), resp
    elif 200 <= status < 300:
        table_list = result.get("TableList")
        if table_list:
            valid, invalid = sequence(
                list(
                    map(
                        lambda x: self.parse_table(
                            x, Path(database_name, "glue"), database_name),
                        table_list)), Table, InvalidTable)
            if len(invalid) > 0:
                invalid_messages = ", ".join(list(map(lambda i: i.reason, invalid)))
                resp.add_warning(f"Invalid Tables in glue response: {invalid_messages}")
            if len(valid) == 0:
                return Failure(InvalidDatabase("No valid tables")), resp
            else:
                return Success(Database(database_name, TableList(valid))), resp
        else:
            return Failure(InvalidDatabase("TableList not found in glue response")), resp
    else:
        resp.set_status(status)
        return Failure(
            InvalidDatabase(f"Invalid response from glue: {message}. Status: {status}")), resp
def save(self, state_store: MasonStateStore, overwrite: bool = False,
         response: Response = Response()):
    try:
        result = state_store.cp_source(self.source_path, "workflow",
                                       self.namespace, self.command, overwrite)
        if isinstance(result, FailedOperation):
            response.add_error(f"{result.message}")
        else:
            response.add_info(result)
    except Exception as e:
        response.add_error(f"Error copying source: {message(e)}")

    return response
def tests(env: MasonEnvironment, config: Config, wf: Workflow):
    # DNE
    params = WorkflowParameters(parameter_path=from_root(
        "/test/support/parameters/table_infer_parameters_1.yaml"))
    dne = wf.validate(env, config, params).run(env, Response())
    assert (dne.with_status() == expects.post(False))

    # Exists
    params = WorkflowParameters(parameter_path=from_root(
        "/test/support/parameters/table_infer_parameters_2.yaml"))
    exists = wf.validate(env, config, params).run(env, Response())
    assert (exists.with_status() == expects.post(True))

    # API
    response, status = run(
        "workflow",
        wf.namespace,
        wf.command,
        param_file=from_root("/test/support/parameters/table_infer_parameters_1.yaml"),
        config_id=config.id,
        env=env,
        log_level="fatal")
    assert ((response, status) == expects.post(False))
def test_workflow_basic_valid(self):
    env = get_env("/test/support/", "/test/support/validations/")
    step_params = {"config_id": "5", "parameters": {"test_param": "test"}}
    params = {
        "step_1": step_params,
        "step_2": step_params,
        "step_3": step_params
    }
    expects = [
        'Performing Dry Run for Workflow',
        '',
        'Valid Workflow DAG Definition:',
        '--------------------------------------------------------------------------------',
        '\n* step_1\n* step_2\n* step_3\n',
        'Finished'
    ]

    validated = self.validate_workflow(env, "workflow_basic", '3', params)
    run = validated.dry_run(env, Response())
    assert (run.response.info == expects)
def run(resource_type: str,
        namespace: str,
        command: str,
        parameter_string: Optional[str] = None,
        param_file: Optional[str] = None,
        config_id: Optional[str] = None,
        log_level: Optional[str] = None,
        env: Optional[MasonEnvironment] = None,
        dry_run: bool = False,
        parameters: Optional[dict] = None,
        printer=ApiPrinter()):
    response = Response()
    environment: MasonEnvironment = env or MasonEnvironment().initialize()
    logger.set_level(log_level)

    res = base.Resources(environment)
    resource: Union[Resource, MalformedResource] = res.get_resource(
        resource_type, namespace, command)
    config: Union[Config, MalformedResource] = res.get_best_config(config_id)
    params: Union[Parameters, MalformedResource] = res.get_parameters(
        resource_type, parameter_string, param_file, parameters)

    if isinstance(resource, Resource) and isinstance(config, Config) and isinstance(params, Parameters):
        if dry_run:
            response = validate_resource(resource, config, params, environment).dry_run(
                environment, response).to_response(response)
        else:
            response = validate_resource(resource, config, params, environment).run(
                environment, response).to_response(response)
    else:
        if isinstance(resource, MalformedResource):
            response.add_error(f"Malformed Resource: {resource.get_message()}")
        elif isinstance(config, MalformedResource):
            response.add_error(f"Bad Config: {config.get_message()}")
        elif isinstance(params, MalformedResource):
            response.add_error(f"Bad Parameters: {params.get_message()}")

    return printer.print_response(response)
def run(self, job: Job, resp: Optional[Response] = None, mode: str = "sync"
        ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    # TODO: Figure out how to get init variables through MagicMock so i can test scheduler with this test case:
    # InvalidJob Timed out trying to connect
    # OSError: Timed out trying to connect to 'tcp://dask-scheduler:8786'
    r = resp or Response()

    if isinstance(job, FormatJob):
        if job.format == "csv" and job.output_path.path_str == "good_output_path":
            return (ExecutedJob('Table succesfully formatted as csv'), r)
        else:
            return (InvalidJob('Invalid Dask Job: Invalid Schema'), r)
    elif isinstance(job, QueryJob):
        job.output_path.protocal = "file"
        return (KubernetesWorker({
            "scheduler": "local:8786"
        }).run_job(job.type, job.spec(), "local:8786", mode), r)
    else:
        raise Exception(f"Mock job not implemented: {job.type}")
def get(self, job_id: str, resp: Optional[Response] = None
        ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
    response: Response = resp or Response()
    command = ["kubectl", "logs", job_id + "-driver"]
    stdout, stderr = run_sys_call(command)

    job = Job("get")
    # default to a running job so `final` is always bound, even when the sys
    # call returns neither stdout nor stderr
    final: Union[ExecutedJob, InvalidJob] = job.running(past=True)

    if len(stdout) > 0:
        job.add_log(stdout)
        final = job.running(past=True)

    if len(stderr) > 0:
        job.add_log(stderr)
        final = job.errored("Kubernetes Error")

    return final, response
def run_job(self, job: Job, response: Optional[Response] = None
            ) -> Tuple[Union[InvalidJob, ExecutedJob], Response]:
    resp: Response = response or Response()

    # if isinstance(job, FormatJob):
    #     for path in job.paths:
    #         workbook = Workbook(path.path_str + '.xlsx')
    #         worksheet = workbook.add_worksheet()
    #         with fsspec.open(path.full_path(), mode='rt') as f:
    #             f.read()
    #             for col in split:
    #                 worksheet.writer(row, col, i)
    #                 i += 1
    #
    #     workbook.close()
    # else:

    final = InvalidJob(f"Job type {job.type} not supported")
    return final, resp
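# --- Hedged sketch (not part of the original module) ---------------------------
# The commented-out block above appears to be converting each path of a FormatJob
# to .xlsx. A minimal, self-contained version using xlsxwriter and fsspec; the
# assumption that the input is parseable CSV is mine, not the original author's.
import csv
import fsspec
from xlsxwriter import Workbook

def csv_to_xlsx(input_path: str, output_path: str) -> None:
    workbook = Workbook(output_path + ".xlsx")
    worksheet = workbook.add_worksheet()
    with fsspec.open(input_path, mode="rt") as f:
        # write each CSV cell into the corresponding worksheet cell
        for row_idx, row in enumerate(csv.reader(f)):
            for col_idx, value in enumerate(row):
                worksheet.write(row_idx, col_idx, value)
    workbook.close()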
def get_table(self, database_name: str, table_name: str,
              resp: Optional[Response] = None
              ) -> Tuple[Union[Table, InvalidTables], Response]:
    try:
        result = self.client().get_table(DatabaseName=database_name, Name=table_name)
    except ClientError as e:
        result = e.response

    response: Response = resp or Response()
    response.add_response(result)
    error, status, message = self.parse_response(result)

    table = self.parse_table(result.get("Table", {}),
                             Path(database_name + ":" + table_name, "glue"),
                             database_name=database_name)

    final: Union[Table, InvalidTables]
    if error == "EntityNotFoundException":
        final = InvalidTables([
            TableNotFound(f"Database {database_name} or table {table_name} not found")
        ])
    elif 200 <= status < 300:
        if isinstance(table, Table):
            final = table
        else:
            final = InvalidTables([table])
    else:
        final = InvalidTables([InvalidTable(f"Invalid Table: {message}")])

    response.set_status(status)
    return final, response
def print_resources(self,
                    resources: List[Union[Operator, Workflow, Config, MalformedResource]],
                    type: Optional[str] = None,
                    namespace: Optional[str] = None,
                    command: Optional[str] = None,
                    environment: Optional[MasonEnvironment] = None) -> Response:
    if len(resources) == 0:
        logger.error(self.none_message(type, namespace, command))
    else:
        operators, workflows, configs, bad = sequence_4(
            resources, Operator, Workflow, Config, MalformedResource)
        type_name = type or "all"
        # TODO: dry up with resources
        if type_name in ["all", "operator", "operators"]:
            self.print_operators(operators, namespace, command)
        if type_name in ["all", "workflow", "workflows"]:
            self.print_workflows(workflows, namespace, command)
        if type_name in ["all", "config", "configs"]:
            self.print_configs(configs, environment)

    return Response()
def delete_table(self, database_name: str, table_name: str,
                 resp: Optional[Response] = None) -> Response:
    response = resp or Response()
    try:
        glue_response = self.client().delete_table(DatabaseName=database_name,
                                                   Name=table_name)
    except ClientError as e:
        glue_response = e.response

    error, status, message = self.parse_response(glue_response)
    response.add_response(glue_response)

    if error != "":
        response.set_status(status)
        response.add_error(message)
    else:
        response.add_info(f"Table {table_name} successfully deleted.")

    return response
def config(config_id: Optional[str],
           set_current: bool = False,
           log_level: Optional[str] = None,
           env: Optional[MasonEnvironment] = None,
           printer: Printer = ApiPrinter()):
    environment = env or MasonEnvironment().initialize()
    logger.set_level(log_level)
    response = Response()

    if set_current and config_id:
        result = Resources(environment).set_session_config(config_id)
        if isinstance(result, str):
            response.add_error(result)
            response.set_status(404)
        else:
            response.add_info(f"Set session config to {config_id}")
        config_id = None

    res = Resources(environment)
    configs = res.get_resources("config", config_id)
    response = printer.print_resources(configs, "config", config_id,
                                       environment=environment)
    return response.with_status()