Example #1
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid delete
        params = OperatorParameters(
            parameter_string=
            f"table_name:good_table,database_name:good_database")
        good = op.validate(config, params).run(env, Response())
        assert (good.with_status() == ({
            'Info': ['Table good_table successfully deleted.']
        }, 200))

        # database DNE
        params = OperatorParameters(
            parameter_string=f"table_name:bad_table,database_name:bad_database"
        )
        bad = op.validate(config, params).run(env, Response())
        assert (bad.with_status() == ({
            'Errors': ['Database bad_database not found.']
        }, 400))

        # table DNE
        params = OperatorParameters(
            parameter_string=f"table_name:bad_table,database_name:good_database"
        )
        bad = op.validate(config, params).run(env, Response())
        assert (bad.with_status() == ({
            'Errors': ['Table bad_table not found.']
        }, 400))
Example #2
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid refresh
        params = OperatorParameters(
            parameter_string=
            "table_name:catalog_poc_data,database_name:crawler-poc")
        refresh = op.validate(config, params).run(env, Response())
        assert (refresh.with_status() == table.refresh(False))

        # already refreshing
        params = OperatorParameters(
            parameter_string=
            "table_name:catalog_poc_data_refreshing,database_name:crawler-poc")
        refreshing = op.validate(config, params).run(env, Response())
        assert (refreshing.with_status() == table.refresh(True))
Example #3
    def run(
        self, env: MasonEnvironment, response: Response = Response()
    ) -> OperatorResponse:
        scheduler = self.config.scheduler()
        if isinstance(scheduler, SchedulerClient):
            response.add_info(
                f"Registering workflow dag {self.name} with {scheduler.client.name()}."
            )
            schedule_id, response, client_dag = scheduler.register_dag(
                self.name, self.dag, self.schedule, response)
            if not response.errored():
                response.add_info(f"Registered schedule {schedule_id}")
            # TODO: FIX
            # if client_dag and output_path:
            #     with tempfile.NamedTemporaryFile("w", delete=False) as f:
            #         json = client_dag.to_json()
            #         response.add_info(f"Saving client dag to {output_path}")
            #         f.write(json)
            #         f.close()
            #         response = self.config.storage.client.save_to(f.name, output_path, response)
            if self.schedule:
                response.add_warning(
                    f"Triggering workflow off schedule: {self.schedule.definition}"
                )

            response.add_info(f"Triggering schedule: {schedule_id}")
            response = scheduler.trigger_schedule(schedule_id, response, env)
        else:
            response.add_error("Scheduler client not defined")

        return OperatorResponse(response)
Example #4
    def run(
        self,
        config: SparkConfig,
        job: Job,
        resp: Optional[Response] = None
    ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
        #  TODO: Replace with python kubernetes api
        #  TODO: Set up kubernetes configuration, run on docker version

        response: Response = resp or Response()

        job.set_id("mason" + "-" + job.type + "-" + str(uuid4()))
        merged_config = merge_config(config, job)
        job_id = merged_config["metadata"]["name"]
        conf = dict(merged_config)

        final: Union[ExecutedJob, InvalidJob]

        with tempfile.NamedTemporaryFile(delete=False, mode='w') as yaml_file:
            yaml.dump(conf, yaml_file)

            command = ["kubectl", "apply", "-f", yaml_file.name]
            response.add_info(
                f"Executing Spark Kubernetes Operator. job_id:  {job_id}")
            stdout, stderr = run_sys_call(command)

            if len(stdout) > 0:
                final = job.running(stdout)
            else:
                if len(stderr) > 0:
                    final = job.errored(stderr)
                else:
                    final = job.running()

        return final, response
Example #5
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        params = OperatorParameters(
            parameter_string=
            f"database_name:mason-sample-data,table_name:tests/in/csv/,format:boogo,output_path:mason-sample-data/tests/out/csv/"
        )
        good = op.validate(config, params).run(env, Response())
        invalid_job = good.object
        assert (isinstance(invalid_job, InvalidJob))

        params = OperatorParameters(
            parameter_string=
            f"database_name:mason-sample-data,table_name:tests/in/csv/,format:csv,output_path:good_output_path"
        )
        good = op.validate(config, params).run(env, Response())
        executed_job = good.object
        assert (isinstance(executed_job, ExecutedJob))
Example #6
    def test_local_client(self):
        base.set_log_level()
        env = self.before()
        config = Resources(env).get_config("8")

        # DAG has cycle
        step_params = {"config_id": "8", "parameters": {"test_param": "test"}}
        params = {
            "step_1": step_params,
            "step_2": step_params,
            "step_3": step_params,
            "step_4": step_params,
            "step_5": step_params,
            "step_6": step_params,
        }

        wf = self.get_workflow(env, "workflow_local_scheduler")

        if isinstance(wf, MalformedResource):
            raise Exception(f"Workflow not found: {wf.get_message()}")

        if isinstance(config, MalformedResource):
            raise Exception(f"Config not found: {config.get_message()}")

        parameters = WorkflowParameters(parameter_dict=params)
        validated = wf.validate(env, config, parameters)
        assert (isinstance(validated, ValidWorkflow))
        operator_response = validated.run(env, Response())
        info = """
        Registering workflow dag test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5 with local.
        Registering DAG in local memory
        Registered schedule test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5
        Triggering schedule: test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5
        Running dag
        * step_1
        | * step_2
        | | * step_3
        | |/  
        * | step_4
        |/  
        * step_5
        * step_6

        Running step step_1
        Running operator1
        Running step step_2
        Running operator2
        Running step step_3
        Running operator3
        Running step step_4
        Running operator4
        Running step step_5
        Running operator5
        Running step step_6
        Running operator6
        """
        response = operator_response.response
        assert (len(response.errors) == 0)
        assert (clean_uuid(clean_string("\n".join(
            response.info))) == clean_uuid(clean_string(info)))
Example #7
    def tests(env: MasonEnvironment, config: Config, op: Operator):

        # Database Exists
        params = OperatorParameters(
            parameter_string="database_name:crawler-poc")
        valid = op.validate(config, params)
        exists = valid.run(env, Response())
        assert exists.with_status() == table.index(
            config.metastore().client.name())

        # Database DNE
        params = OperatorParameters(
            parameter_string="database_name:bad-database")
        dne = op.validate(config, params).run(env, Response())
        assert (dne.with_status() == table.index(
            config.metastore().client.name(), False))
Example #8
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid delete
        params = OperatorParameters(
            parameter_string=f"schedule_name:good_schedule")
        good = op.validate(config, params).run(env, Response())
        assert (good.with_status() == ({
            'Info': ['Schedule good_schedule successfully deleted.']
        }, 200))

        # dne
        params = OperatorParameters(
            parameter_string=f"schedule_name:bad_schedule")
        good = op.validate(config, params).run(env, Response())
        assert (good.with_status() == ({
            'Errors': ["Crawler entry with name bad_schedule does not exist"]
        }, 400))
Example #9
    def run(self,
            job: Job,
            resp: Optional[Response] = None,
            mode: str = "async"
            ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
        final: Union[ExecutedJob, InvalidJob]
        response: Response = resp or Response()

        try:
            if self.scheduler:
                if isinstance(job, FormatJob):
                    final = self.run_job(
                        job.type, job.spec(), self.scheduler, mode
                    ) or ExecutedJob(
                        "format_job",
                        f"Job queued to format {job.table.schema.type} table as {job.format} and save to {job.output_path.path_str}"
                    )
                elif isinstance(job, QueryJob):
                    final = self.run_job(job.type, job.spec(), self.scheduler)
                else:
                    final = job.errored("Job type not supported for Dask")
            else:
                final = InvalidJob("Dask Scheduler not defined")
        except OSError as e:
            final = InvalidJob(message(e))

        return final, response
Example #10
    def execute_ddl(
        self,
        ddl: DDLStatement,
        database: Database,
        response: Optional[Response] = None
    ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
        return InvalidJob(
            "Client 'execute_ddl' not implemented"), response or Response()
Example #11
    def save(
        self,
        state_store: MasonStateStore,
        overwrite: bool = False,
        response: Response = Response()) -> Response:
        message = self.get_message()
        if message:
            response.add_error(message)
        return response
Example #12
def apply(file: str, overwrite: bool = False, log_level: Optional[str] = None, env: Optional[MasonEnvironment] = None):
    environment: MasonEnvironment = env or MasonEnvironment().initialize()
    logger.set_level(log_level)
    response = Response()
    
    all = Resources(environment).get_all(file)
    for r in all:
        response = r.save(environment.state_store, overwrite, response)
    return response.with_status()
Example #13
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid query
        query = "SELECT * from $table limit 3"
        output_path = from_root("/.tmp/")
        params = OperatorParameters(
            parameter_string=
            f"query_string:{query},database_name:good_database,table_name:good_table,output_path:{output_path}"
        )
        result = op.validate(config, params).run(env, Response())
        exp = {
            "1": [
                'Running Query "SELECT * from $table limit 3"',
                'Running Athena query.  query_id: test', 'Running job id=test'
            ],
            "4": [
                f'Table succesfully formatted as parquet and exported to {output_path}'
            ]
        }

        expect = {'Info': exp[config.id]}
        assert (result.with_status() == (expect, 200))

        # bad permissions
        query = "SELECT * from $table limit 3"
        params = OperatorParameters(
            parameter_string=
            f"query_string:{query},database_name:access_denied,table_name:good_table,output_path:{output_path}"
        )
        result = op.validate(config, params).run(env, Response())
        exp_2 = {
            "1": ({
                'Errors': [
                    'Job errored: Access denied for credentials.  Ensure associated user or role has permission to CreateNamedQuery on athena'
                ],
                'Info': ['Running Query "SELECT * from $table limit 3"']
            }, 403),
            "4": ({
                'Info': [
                    f'Table succesfully formatted as parquet and exported to {output_path}'
                ]
            }, 200)
        }

        assert (result.with_status() == exp_2[config.id])
Example #14
    def get_database(
        self,
        database_name: str,
        response: Optional[Response] = None
    ) -> Tuple[Result[Database, InvalidDatabase], Response]:
        tables, response = self.list_tables(database_name, response
                                            or Response())
        database = tables.map(lambda a: Database("s3_table", a)).alt(
            lambda b: InvalidDatabase(b.error or b.message()))
        return database, response
Example #15
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # database DNE
        params = OperatorParameters(
            parameter_string=
            f"database_name:bad-database,storage_path:crawler-poc/catalog_poc_data"
        )
        good = op.validate(config, params).run(env, Response())
        assert (good.with_status() == ({
            'Errors':
            ['Job errored: Metastore database bad-database not found'],
            'Info': ['Table inferred: catalog_poc_data']
        }, 404))

        # bad path
        params = OperatorParameters(
            parameter_string=
            f"database_name:crawler-poc,storage_path:crawler-poc/bad-table")
        good = op.validate(config, params).run(env, Response())
        assert (good.with_status() == ({
            'Errors': [
                'No keys at s3://crawler-poc/bad-table',
                'Job errored: Invalid Tables: No keys at s3://crawler-poc/bad-table'
            ]
        }, 404))

        # valid path
        params = OperatorParameters(
            parameter_string=
            f"database_name:crawler-poc,storage_path:crawler-poc/catalog_poc_data,output_path:crawler-poc/athena/"
        )
        good = op.validate(config, params).run(env, Response())

        def clean(s: List[str]):
            return list(map(lambda i: clean_uuid(clean_string(i)), s))

        infos = clean(good.formatted()["Info"])
        expect = [
            'Tableinferred:catalog_poc_data',
            'RunningAthenaquery.query_id:test_id', 'Runningjobid=test_id'
        ]
        assert (infos == expect)
Example #16
    def dry_run(
        self, env: MasonEnvironment, response: Response = Response()
    ) -> OperatorResponse:
        response.add_info("Performing Dry Run for Workflow")
        response.add_info("")
        response.add_info("Valid Workflow DAG Definition:")
        response.add_info("-" * 80)
        response.add_info(f"\n{self.dag.display()}")
        response.add_info("Finished")
        for r in list(map(lambda s: s.reason, self.dag.invalid_steps)):
            response.add_warning(r)
        return OperatorResponse(response)
Example #17
    def list_keys(
            self,
            path: str,
            response: Optional[Response] = None
    ) -> Tuple[List[Path], Response]:
        resp: Response = response or Response()
        keys = self.client().find(path)
        resp.add_response({'keys': keys})
        if len(keys) > 0:
            paths = list(map(lambda k: self.get_path(k), keys))
        else:
            paths = []

        return paths, resp
Example #18
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # Database and table Exist
        params = OperatorParameters(
            parameter_string=
            "database_name:crawler-poc,table_name:catalog_poc_data")
        exists = op.validate(config, params).run(env, Response())
        assert (exists.with_status() == table.get(
            config.metastore().client.name(), 1))

        # Database DNE
        params = OperatorParameters(
            parameter_string=
            "database_name:bad-database,table_name:catalog_poc_data")
        dne = op.validate(config, params).run(env, Response())
        assert (dne.with_status() == table.get(
            config.metastore().client.name(), 2))

        # Table DNE
        params = OperatorParameters(
            parameter_string="database_name:crawler-poc,table_name:bad-table")
        dne2 = op.validate(config, params).run(env, Response())
        assert (dne2.with_status() == table.get(
            config.metastore().client.name(), 3))
Example #19
    def get_database(
        self,
        database_name: str,
        response: Optional[Response] = None
    ) -> Tuple[Result[Database, InvalidDatabase], Response]:
        resp = response or Response()

        try:
            result = self.client().get_tables(DatabaseName=database_name)
        except ClientError as e:
            result = e.response

        error, status, message = self.parse_response(result)
        resp.add_response(result)

        if error == "EntityNotFoundException":
            resp.set_status(404)
            return Failure(
                InvalidDatabase(f"Database {database_name} not found")), resp
        elif 200 <= status < 300:

            table_list = result.get("TableList")
            if table_list:
                valid, invalid = sequence(
                    list(
                        map(
                            lambda x: self.parse_table(
                                x, Path(database_name, "glue"), database_name),
                            table_list)), Table, InvalidTable)
                if len(invalid) > 0:
                    invalid_messages = ", ".join(
                        list(map(lambda i: i.reason, invalid)))
                    resp.add_warning(
                        f"Invalid Tables in glue response: {invalid_messages}")
                if len(valid) == 0:
                    return Failure(InvalidDatabase(f"No valid tables")), resp
                else:
                    return Success(Database(database_name,
                                            TableList(valid))), resp
            else:
                return Failure(
                    InvalidDatabase(
                        "TableList not found in glue response")), resp
        else:
            resp.set_status(status)
            return Failure(
                InvalidDatabase(
                    f"Invalid response from glue: {message}.  Status: {status}"
                )), resp
Example #20
    def save(self,
             state_store: MasonStateStore,
             overwrite: bool = False,
             response: Response = Response()):
        try:
            result = state_store.cp_source(self.source_path, "workflow",
                                           self.namespace, self.command,
                                           overwrite)
            if isinstance(result, FailedOperation):
                response.add_error(f"{result.message}")
            else:
                response.add_info(result)
        except Exception as e:
            response.add_error(f"Error copying source: {message(e)}")
        return response
Example #21
    def tests(env: MasonEnvironment, config: Config, wf: Workflow):
        # DNE
        params = WorkflowParameters(parameter_path=from_root(
            "/test/support/parameters/table_infer_parameters_1.yaml"))
        dne = wf.validate(env, config, params).run(env, Response())
        assert (dne.with_status() == expects.post(False))

        # Exists
        params = WorkflowParameters(parameter_path=from_root(
            "/test/support/parameters/table_infer_parameters_2.yaml"))
        exists = wf.validate(env, config, params).run(env, Response())
        assert (exists.with_status() == expects.post(True))

        # API
        response, status = run(
            "workflow",
            wf.namespace,
            wf.command,
            param_file=from_root(
                "/test/support/parameters/table_infer_parameters_1.yaml"),
            config_id=config.id,
            env=env,
            log_level="fatal")
        assert ((response, status) == expects.post(False))
Example #22
    def test_workflow_basic_valid(self):
        env = get_env("/test/support/", "/test/support/validations/")
        step_params = {"config_id": "5", "parameters": {"test_param": "test"}}
        params = {
            "step_1": step_params,
            "step_2": step_params,
            "step_3": step_params
        }
        expects = [
            'Performing Dry Run for Workflow', '',
            'Valid Workflow DAG Definition:',
            '--------------------------------------------------------------------------------',
            '\n* step_1\n* step_2\n* step_3\n', 'Finished'
        ]

        validated = self.validate_workflow(env, "workflow_basic", '3', params)
        run = validated.dry_run(env, Response())
        assert (run.response.info == expects)
Example #23
def run(resource_type: str,
        namespace: str,
        command: str,
        parameter_string: Optional[str] = None,
        param_file: Optional[str] = None,
        config_id: Optional[str] = None,
        log_level: Optional[str] = None,
        env: Optional[MasonEnvironment] = None,
        dry_run: bool = False,
        parameters: Optional[dict] = None,
        printer=ApiPrinter()):
    response = Response()
    environment: MasonEnvironment = env or MasonEnvironment().initialize()
    logger.set_level(log_level)
    res = base.Resources(environment)

    resource: Union[Resource, MalformedResource] = res.get_resource(
        resource_type, namespace, command)
    config: Union[Config, MalformedResource] = res.get_best_config(config_id)
    params: Union[Parameters, MalformedResource] = res.get_parameters(
        resource_type, parameter_string, param_file, parameters)

    if isinstance(resource, Resource) and isinstance(
            config, Config) and isinstance(params, Parameters):
        if dry_run:
            response = validate_resource(resource, config, params,
                                         environment).dry_run(
                                             environment,
                                             response).to_response(response)
        else:
            response = validate_resource(resource, config, params,
                                         environment).run(
                                             environment,
                                             response).to_response(response)
    else:
        if isinstance(resource, MalformedResource):
            response.add_error(f"Malformed Resource: {resource.get_message()}")
        elif isinstance(config, MalformedResource):
            response.add_error(f"Bad Config: {config.get_message()}")
        elif isinstance(params, MalformedResource):
            response.add_error(f"Bad Parameters: {params.get_message()}")

    return printer.print_response(response)
Example #24
    def run(self,
            job: Job,
            resp: Optional[Response] = None,
            mode: str = "sync"
            ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
        # TODO: Figure out how to get init variables through MagicMock so I can test scheduler with this test case:
        # InvalidJob Timed out trying to connect # OSError: Timed out trying to connect to 'tcp://dask-scheduler:8786'
        r = resp or Response()
        if isinstance(job, FormatJob):
            if job.format == "csv" and job.output_path.path_str == "good_output_path":
                return (ExecutedJob('Table succesfully formatted as csv'), r)
            else:
                return (InvalidJob('Invalid Dask Job: Invalid Schema'), r)
        elif isinstance(job, QueryJob):
            job.output_path.protocal = "file"
            return (KubernetesWorker({
                "scheduler": "local:8786"
            }).run_job(job.type, job.spec(), "local:8786", mode), r)
        else:
            raise Exception(f"Mock job not implemented: {job.type}")
Example #25
    def get(
        self,
        job_id: str,
        resp: Optional[Response] = None
    ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
        response: Response = resp or Response()

        command = ["kubectl", "logs", job_id + "-driver"]
        stdout, stderr = run_sys_call(command)
        job = Job("get")
        # Default when kubectl returns neither stdout nor stderr
        final: Union[ExecutedJob, InvalidJob] = job.errored(
            "No output returned from kubectl")

        if len(stdout) > 0:
            job.add_log(stdout)
            final = job.running(past=True)
        if len(stderr) > 0:
            job.add_log(stderr)
            final = job.errored("Kubernetes Error")

        return final, response
Example #26
    def run_job(
        self,
        job: Job,
        response: Optional[Response] = None
    ) -> Tuple[Union[InvalidJob, ExecutedJob], Response]:

        resp: Response = response or Response()
        # if isinstance(job, FormatJob):
        # for path in job.paths:
        #     workbook = Workbook(path.path_str + '.xlsx')
        #     worksheet = workbook.add_worksheet()
        #     with fsspec.open(path.full_path(), mode='rt') as f:
        #         f.read()
        #     for col in split:
        #         worksheet.writer(row,col, i)
        #         i+=1
        #
        # workbook.close()
        # else:
        final = InvalidJob(f"Job type {job.type} not supported")

        return final, resp
Example #27
    def get_table(
        self,
        database_name: str,
        table_name: str,
        resp: Optional[Response] = None
    ) -> Tuple[Union[Table, InvalidTables], Response]:
        try:
            result = self.client().get_table(DatabaseName=database_name,
                                             Name=table_name)
        except ClientError as e:
            result = e.response

        response: Response = resp or Response()
        response.add_response(result)

        error, status, message = self.parse_response(result)
        table = self.parse_table(result.get("Table", {}),
                                 Path(database_name + ":" + table_name,
                                      "glue"),
                                 database_name=database_name)

        final: Union[Table, InvalidTables]
        if error == "EntityNotFoundException":
            final = InvalidTables([
                TableNotFound(
                    f"Database {database_name} or table {table_name} not found"
                )
            ])
        elif 200 <= status < 300:
            if isinstance(table, Table):
                final = table
            else:
                final = InvalidTables([table])
        else:
            final = InvalidTables([InvalidTable(f"Invalid Table: {message}")])
            response.set_status(status)

        return final, response
Example #28
    def print_resources(
            self,
            resources: List[Union[Operator, Workflow, Config,
                                  MalformedResource]],
            type: Optional[str] = None,
            namespace: Optional[str] = None,
            command: Optional[str] = None,
            environment: Optional[MasonEnvironment] = None) -> Response:
        if len(resources) == 0:
            logger.error(self.none_message(type, namespace, command))
        else:
            operators, workflows, configs, bad = sequence_4(
                resources, Operator, Workflow, Config, MalformedResource)
            type_name = type or "all"
            # TODO: dry up with resources
            if type_name in ["all", "operator", "operators"]:
                self.print_operators(operators, namespace, command)
            if type_name in ["all", "workflow", "workflows"]:
                self.print_workflows(workflows, namespace, command)
            if type_name in ["all", "config", "configs"]:
                self.print_configs(configs, environment)

        return Response()
Example #29
    def delete_table(self,
                     database_name: str,
                     table_name: str,
                     resp: Optional[Response] = None) -> Response:
        response = resp or Response()

        try:
            glue_response = self.client().delete_table(
                DatabaseName=database_name, Name=table_name)

        except ClientError as e:
            glue_response = e.response

        error, status, message = self.parse_response(glue_response)
        response.add_response(glue_response)

        if error != "":
            response.set_status(status)
            response.add_error(message)
        else:
            response.add_info(f"Table {table_name} successfully deleted.")

        return response
Example #30
def config(config_id: Optional[str],
           set_current: bool = False,
           log_level: Optional[str] = None,
           env: Optional[MasonEnvironment] = None,
           printer: Printer = ApiPrinter()):
    environment = env or MasonEnvironment().initialize()
    logger.set_level(log_level)
    response = Response()
    if set_current and config_id:
        result = Resources(environment).set_session_config(config_id)
        if isinstance(result, str):
            response.add_error(result)
            response.set_status(404)
        else:
            response.add_info(f"Set session config to {config_id}")
            config_id = None

    res = Resources(environment)
    configs = res.get_resources("config", config_id)
    response = printer.print_resources(configs,
                                       "config",
                                       config_id,
                                       environment=environment)
    return response.with_status()