コード例 #1
0
ファイル: local_client.py プロジェクト: kyprifog/mason
 def register_dag(
         self, schedule_name: str, valid_dag: ValidDag,
         schedule: Optional[Schedule],
         response: Response) -> Tuple[str, Response, Optional[ClientDag]]:
     """Store ``valid_dag`` on this client and note it on ``response``.

     ``schedule`` is accepted but not used by this method.

     Returns:
         ``(schedule_name, response, None)`` -- the third slot, typed as an
         optional ``ClientDag``, is always ``None`` here.
     """
     response.add_info("Registering DAG in local memory")
     self.dag = valid_dag
     registered: Tuple[str, Response, Optional[ClientDag]] = (
         schedule_name, response, None)
     return registered
コード例 #2
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Look up a metastore table and run a query job against it.

        Required parameters: ``query_string``, ``database_name``,
        ``table_name``.  Optional parameter: ``output_path`` (only resolved
        when the configured storage is a ``StorageClient``).

        Returns:
            An ``OperatorResponse`` wrapping the response and either the
            result of ``config.execution().run_job`` or an ``InvalidJob``
            built from the failed table lookup's message.
        """
        query_string = parameters.get_required("query_string")
        database_name = parameters.get_required("database_name")
        table_name = parameters.get_required("table_name")
        output_path = parameters.get_optional("output_path")

        final: Union[ExecutedJob, InvalidJob]

        # Hand the caller's response to the lookup.  Previously the lookup
        # was called without it and its return value replaced ``response``,
        # so anything already recorded by the caller was dropped.
        table, response = config.metastore().get_table(database_name,
                                                       table_name,
                                                       response=response)

        if output_path and isinstance(config.storage(), StorageClient):
            outp: Optional[Path] = config.storage().path(output_path)
        else:
            outp = None

        if isinstance(table, Table):
            # TODO?: Sanitize the query string
            response.add_info(f"Running Query \"{query_string}\"")
            job = QueryJob(query_string, table, outp)
            final, response = config.execution().run_job(job, response)
        else:
            final = InvalidJob(table.message())

        return OperatorResponse(response, final)
コード例 #3
0
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check table deletion for a valid table, a missing database and a
        missing table."""
        # Each case: (parameter string, expected ``with_status()`` result).
        cases = [
            # valid delete
            ("table_name:good_table,database_name:good_database",
             ({'Info': ['Table good_table successfully deleted.']}, 200)),
            # database DNE
            ("table_name:bad_table,database_name:bad_database",
             ({'Errors': ['Database bad_database not found.']}, 400)),
            # table DNE
            ("table_name:bad_table,database_name:good_database",
             ({'Errors': ['Table bad_table not found.']}, 400)),
        ]

        for parameter_string, expected in cases:
            params = OperatorParameters(parameter_string=parameter_string)
            outcome = op.validate(config, params).run(env, Response())
            assert outcome.with_status() == expected
コード例 #4
0
    def register_dag(self, schedule_name: str, valid_dag: ValidDag,
                     schedule: Optional[Schedule], response: Response):
        """Register a single-step TableInfer DAG as a schedule.

        Only a DAG with exactly one step whose operator type name is
        ``"TableInfer"`` is supported; anything else adds an error to
        ``response``.  For a supported DAG the schedule is registered only
        when the operator's storage is a ``StorageClient``; otherwise an
        error is added and no registration is attempted.

        Returns:
            ``(schedule_name, response, None)``.
        """
        steps = valid_dag.valid_steps

        #  Short-circuit for glue crawler definition since glue as a scheduler is only well defined for Table Infer Operator
        if len(steps) == 1 and steps[0].operator.type_name() == "TableInfer":
            op = steps[0].operator
            params = op.parameters
            db_name = params.get_required("database_name")

            storage_engine = op.config.storage()
            if isinstance(storage_engine, StorageClient):
                storage_path = storage_engine.path(
                    params.get_required("storage_path"))
                # Registration lives inside this branch: without a valid
                # storage client there is no storage_path to register, and
                # calling register_schedule would fail on the unbound name.
                response = self.register_schedule(db_name, storage_path,
                                                  schedule_name, schedule,
                                                  response)
            else:
                response.add_error(
                    f"Attempted to register_dag for invalid client: {storage_engine.reason}"
                )
        else:
            response.add_error(
                "Glue Scheduler only defined for TableInfer type which registers a glue crawler"
            )

        return (schedule_name, response, None)
コード例 #5
0
    def save_to(self, inpath: Path, outpath: Path, response: Response):
        """Upload ``inpath`` to ``outpath`` through the client.

        Any exception raised by the upload is turned into two error entries
        on ``response`` instead of propagating.  The response is returned in
        both the success and failure cases.
        """
        try:
            client = self.client()
            client.upload(inpath.path_str, outpath.path_str)
        except Exception as e:
            failure_summary = f"Error saving {inpath} to {outpath.path_str}"
            response.add_error(failure_summary)
            response.add_error(message(e))

        return response
コード例 #6
0
ファイル: invalid_table.py プロジェクト: kyprifog/mason
    def to_response(self, response: Response) -> Response:
        """Fold every invalid table, then this object's own error (if any),
        into ``response`` and return it."""
        folded = response
        for invalid_table in self.invalid_tables:
            folded = invalid_table.to_response(folded)

        own_error = self.error
        if own_error:
            folded.add_error(own_error)

        return folded
コード例 #7
0
ファイル: malformed.py プロジェクト: kyprifog/mason
 def save(
     self,
     state_store: MasonStateStore,
     overwrite: bool = False,
     response: "Optional[Response]" = None) -> Response:
     """Report this resource's message as an error instead of saving it.

     ``state_store`` and ``overwrite`` are accepted but not used here.

     Args:
         response: Response to add the error to.  When omitted, a fresh
             ``Response`` is created for this call.

     Returns:
         The response, with this resource's message added as an error when
         the message is non-empty.
     """
     # A ``Response()`` default in the signature is evaluated once, at
     # definition time, and would be shared (and keep accumulating errors)
     # across every call that omits ``response``.
     if response is None:
         response = Response()
     message = self.get_message()
     if message:
         response.add_error(message)
     return response
コード例 #8
0
ファイル: invalid_workflow.py プロジェクト: kyprifog/mason
 def execute(self,
             env: MasonEnvironment,
             response: Response,
             dry_run: bool = True,
             run_now: bool = False,
             schedule_name: Optional[str] = None) -> Response:
     """Record this object's ``reason`` as an error and mark the response 400.

     None of ``env``, ``dry_run``, ``run_now`` or ``schedule_name`` is
     consulted; the outcome is always the same error response.
     """
     failure_text = f"Invalid Operator.  Reason:  {self.reason}"
     response.add_error(failure_text)
     response.set_status(400)
     return response
コード例 #9
0
def apply(file: str, overwrite: bool = False, log_level: Optional[str] = None, env: Optional[MasonEnvironment] = None):
    """Save every resource found for ``file`` into the environment's state store.

    Uses ``env`` when given, otherwise an initialized default
    ``MasonEnvironment``.  One ``Response`` is threaded through each
    resource's ``save`` call, and ``response.with_status()`` is returned.
    """
    environment: MasonEnvironment = env or MasonEnvironment().initialize()
    logger.set_level(log_level)

    response = Response()
    store = environment.state_store
    for resource in Resources(environment).get_all(file):
        response = resource.save(store, overwrite, response)

    return response.with_status()
コード例 #10
0
ファイル: local_client.py プロジェクト: kyprifog/mason
    def trigger_schedule(self, schedule_name: str, response: Response,
                         env: MasonEnvironment) -> Response:
        """Run the DAG stored on this client, if there is one.

        ``schedule_name`` is not consulted.  When no DAG is stored, an error
        is added to ``response`` and nothing is run.
        """
        registered_dag = self.dag
        if not registered_dag:
            response.add_error("Dag not found.  Run 'register_dag' first.")
            return response

        return WorkflowRun(registered_dag).run(env, response)
コード例 #11
0
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check table refresh for a fresh table and one already refreshing."""
        # Each case: (parameter string, flag passed to ``table.refresh`` to
        # build the expected result).
        cases = [
            # valid refresh
            ("table_name:catalog_poc_data,database_name:crawler-poc", False),
            # already refreshing
            ("table_name:catalog_poc_data_refreshing,database_name:crawler-poc",
             True),
        ]

        for parameter_string, already_refreshing in cases:
            params = OperatorParameters(parameter_string=parameter_string)
            outcome = op.validate(config, params).run(env, Response())
            assert outcome.with_status() == table.refresh(already_refreshing)
コード例 #12
0
ファイル: glue_client.py プロジェクト: kyprifog/mason
    def list_tables(
        self, database_name: str, response: Response
    ) -> Tuple[Result[TableList, InvalidTables], Response]:
        """List the tables of ``database_name`` via the client's ``get_tables``.

        A ``ClientError`` is not propagated; its ``response`` payload is
        parsed like a normal result.  The raw result is always attached to
        ``response``.

        Returns:
            ``(Success(TableList), response)`` when at least one valid table
            was parsed, otherwise ``(Failure(InvalidTables), response)``.
            The status is set to 404 for ``EntityNotFoundException`` and to
            the parsed status otherwise, except for the "no valid tables"
            case, which leaves the status untouched.
        """
        try:
            result = self.client().get_tables(DatabaseName=database_name)
        except ClientError as e:
            result = e.response
        response.add_response(result)
        error, status, message = self.parse_response(result)

        if error == "EntityNotFoundException":
            final = Failure(
                InvalidTables([], f"Database {database_name} not found"))
            response.set_status(404)
            return final, response
        elif 200 <= status < 300:
            valid: List[Table]
            valid, invalid = self.parse_table_list_data(
                result, Path(database_name, "glue"), database_name)
            if len(valid) > 0:
                response.set_status(status)
                return Success(TableList(valid)), response
            else:
                return Failure(InvalidTables(
                    [], "No Valid tables found")), response
        else:
            response.set_status(status)
            # InvalidTables takes (invalid_tables, error) everywhere else;
            # passing only the message would put the string in the
            # invalid-tables slot.
            return Failure(InvalidTables([], message)), response
コード例 #13
0
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check the table index for an existing and a missing database."""

        # Database Exists
        existing_params = OperatorParameters(
            parameter_string="database_name:crawler-poc")
        existing_result = op.validate(config, existing_params).run(
            env, Response())
        expected_existing = table.index(config.metastore().client.name())
        assert existing_result.with_status() == expected_existing

        # Database DNE
        missing_params = OperatorParameters(
            parameter_string="database_name:bad-database")
        missing_result = op.validate(config, missing_params).run(
            env, Response())
        expected_missing = table.index(config.metastore().client.name(),
                                       False)
        assert missing_result.with_status() == expected_missing
コード例 #14
0
ファイル: kubernetes_operator.py プロジェクト: kyprifog/mason
    def run(
        self,
        config: SparkConfig,
        job: Job,
        resp: Optional[Response] = None
    ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
        """Submit ``job`` by applying a generated manifest with ``kubectl``.

        The job is given a fresh ``mason-<type>-<uuid>`` id, the merged
        config is dumped as YAML to a temporary file, and
        ``kubectl apply -f <file>`` is run on it.  The temporary file is
        removed once the command has returned.

        Returns:
            ``(final, response)`` where ``final`` is ``job.running(stdout)``
            when there is stdout, ``job.errored(stderr)`` when there is only
            stderr, and ``job.running()`` when both are empty.
        """
        #  TODO: Replace with python kubernetes api
        #  TODO: Set up kubernetes configuration, run on docker version
        import os  # local import: only needed to clean up the manifest

        response: Response = resp or Response()

        job.set_id("mason" + "-" + job.type + "-" + str(uuid4()))
        merged_config = merge_config(config, job)
        job_id = merged_config["metadata"]["name"]
        conf = dict(merged_config)

        final: Union[ExecutedJob, InvalidJob]

        # delete=False lets kubectl open the file by name; the explicit
        # unlink below is what stops one manifest leaking per submitted job.
        yaml_file = tempfile.NamedTemporaryFile(delete=False, mode='w')
        try:
            # Close the file before kubectl reads it.
            with yaml_file:
                yaml.dump(conf, yaml_file)

            command = ["kubectl", "apply", "-f", yaml_file.name]
            response.add_info(
                f"Executing Spark Kubernetes Operator. job_id:  {job_id}")
            stdout, stderr = run_sys_call(command)
        finally:
            try:
                os.unlink(yaml_file.name)
            except OSError:
                # Best-effort cleanup; never mask the real outcome.
                pass

        if len(stdout) > 0:
            final = job.running(stdout)
        elif len(stderr) > 0:
            final = job.errored(stderr)
        else:
            final = job.running()

        return final, response
コード例 #15
0
    def run(self,
            job: Job,
            resp: Optional[Response] = None,
            mode: str = "async"
            ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
        """Dispatch ``job`` to ``run_job`` according to its type.

        ``FormatJob`` is run with ``mode`` and falls back to a "queued"
        ``ExecutedJob`` when ``run_job`` returns a falsy value; ``QueryJob``
        is run without ``mode``; any other job type is marked errored.
        A missing scheduler or an ``OSError`` yields an ``InvalidJob``.

        Returns:
            ``(final, response)``; ``response`` is ``resp`` or a new
            ``Response`` and is not otherwise modified here.
        """
        response: Response = resp or Response()
        final: Union[ExecutedJob, InvalidJob]

        try:
            if not self.scheduler:
                final = InvalidJob("Dask Scheduler not defined")
            elif isinstance(job, FormatJob):
                submitted = self.run_job(job.type, job.spec(), self.scheduler,
                                         mode)
                if submitted:
                    final = submitted
                else:
                    final = ExecutedJob(
                        "format_job",
                        f"Job queued to format {job.table.schema.type} table as {job.format} and save to {job.output_path.path_str}"
                    )
            elif isinstance(job, QueryJob):
                final = self.run_job(job.type, job.spec(), self.scheduler)
            else:
                final = job.errored("Job type not supported for Dask")
        except OSError as e:
            final = InvalidJob(message(e))

        return final, response
コード例 #16
0
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check that the ``boogo`` format yields an ``InvalidJob`` and the
        ``csv`` format yields an ``ExecutedJob``."""
        # Each case: (parameter string, expected type of the result object).
        cases = [
            ("database_name:mason-sample-data,table_name:tests/in/csv/,format:boogo,output_path:mason-sample-data/tests/out/csv/",
             InvalidJob),
            ("database_name:mason-sample-data,table_name:tests/in/csv/,format:csv,output_path:good_output_path",
             ExecutedJob),
        ]

        for parameter_string, expected_type in cases:
            params = OperatorParameters(parameter_string=parameter_string)
            outcome = op.validate(config, params).run(env, Response())
            assert isinstance(outcome.object, expected_type)
コード例 #17
0
    def test_local_client(self):
        """Run the ``workflow_local_scheduler`` workflow end to end with
        config "8" and compare the collected info messages with the expected
        transcript (UUIDs and whitespace normalised by the clean helpers)."""
        base.set_log_level()
        env = self.before()
        config = Resources(env).get_config("8")

        # Every step reuses the same config id and parameters.
        # NOTE(review): this was labelled "DAG has cycle", but the assertions
        # below expect a run with no errors -- confirm the intended scenario.
        step_params = {"config_id": "8", "parameters": {"test_param": "test"}}
        params = {
            "step_1": step_params,
            "step_2": step_params,
            "step_3": step_params,
            "step_4": step_params,
            "step_5": step_params,
            "step_6": step_params,
        }

        wf = self.get_workflow(env, "workflow_local_scheduler")

        # Fail loudly if either fixture could not be loaded.
        if isinstance(wf, MalformedResource):
            raise Exception(f"Workflow not found: {wf.get_message()}")

        if isinstance(config, MalformedResource):
            raise Exception(f"Config not found: {config.get_message()}")

        parameters = WorkflowParameters(parameter_dict=params)
        validated = wf.validate(env, config, parameters)
        assert (isinstance(validated, ValidWorkflow))
        operator_response = validated.run(env, Response())
        # Expected info transcript: registration, DAG rendering, then the
        # per-step messages in order.
        info = """
        Registering workflow dag test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5 with local.
        Registering DAG in local memory
        Registered schedule test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5
        Triggering schedule: test_workflow_local_scheduler_ea5b602c-261c-4e06-af21-375ea912b6a5
        Running dag
        * step_1
        | * step_2
        | | * step_3
        | |/  
        * | step_4
        |/  
        * step_5
        * step_6

        Running step step_1
        Running operator1
        Running step step_2
        Running operator2
        Running step step_3
        Running operator3
        Running step step_4
        Running operator4
        Running step step_5
        Running operator5
        Running step step_6
        Running operator6
        """
        response = operator_response.response
        assert (len(response.errors) == 0)
        # Both sides go through the same clean_string/clean_uuid pipeline
        # before being compared.
        assert (clean_uuid(clean_string("\n".join(
            response.info))) == clean_uuid(clean_string(info)))
コード例 #18
0
ファイル: schedule_test.py プロジェクト: kyprifog/mason
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check schedule deletion for an existing and a missing schedule."""
        # Each case: (parameter string, expected ``with_status()`` result).
        cases = [
            # valid delete
            ("schedule_name:good_schedule",
             ({'Info': ['Schedule good_schedule successfully deleted.']},
              200)),
            # dne
            ("schedule_name:bad_schedule",
             ({'Errors':
               ["Crawler entry with name bad_schedule does not exist"]},
              400)),
        ]

        for parameter_string, expected in cases:
            params = OperatorParameters(parameter_string=parameter_string)
            outcome = op.validate(config, params).run(env, Response())
            assert outcome.with_status() == expected
コード例 #19
0
ファイル: metastore.py プロジェクト: kyprifog/mason
 def execute_ddl(
     self,
     ddl: DDLStatement,
     database: Database,
     response: Optional[Response] = None
 ) -> Tuple[Union[ExecutedJob, InvalidJob], Response]:
     """Always report that ``execute_ddl`` is not implemented.

     ``ddl`` and ``database`` are ignored.  Returns an ``InvalidJob``
     together with ``response`` (or a new ``Response`` when none is given).
     """
     not_implemented = InvalidJob("Client 'execute_ddl' not implemented")
     return not_implemented, response or Response()
コード例 #20
0
ファイル: cli_printer.py プロジェクト: kyprifog/mason
    def print_response(self, response: Response):
        """Log the response status, then its body as syntax-highlighted JSON."""
        def iso_dates(value):
            # json.dumps fallback: dates and datetimes become ISO strings;
            # anything else yields None, exactly as an implicit return would.
            if not isinstance(value, (datetime.date, datetime.datetime)):
                return None
            return value.isoformat()

        body, status_code = response.with_status()
        logger.info(f"Response status: {status_code}")
        rendered = json.dumps(body,
                              indent=4,
                              sort_keys=True,
                              default=iso_dates)
        logger.info(highlight(rendered, JsonLexer(), TerminalFormatter()))
コード例 #21
0
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check the query operator for a valid query and for a database
        with denied access; expectations are keyed by ``config.id``."""
        sql = "SELECT * from $table limit 3"
        output_path = from_root("/.tmp/")
        exported_message = f'Table succesfully formatted as parquet and exported to {output_path}'

        # valid query
        valid_params = OperatorParameters(
            parameter_string=
            f"query_string:{sql},database_name:good_database,table_name:good_table,output_path:{output_path}"
        )
        valid_result = op.validate(config, valid_params).run(env, Response())
        info_by_config = {
            "1": [
                'Running Query "SELECT * from $table limit 3"',
                'Running Athena query.  query_id: test', 'Running job id=test'
            ],
            "4": [exported_message],
        }
        expected_valid = ({'Info': info_by_config[config.id]}, 200)
        assert valid_result.with_status() == expected_valid

        # bad permissions
        denied_params = OperatorParameters(
            parameter_string=
            f"query_string:{sql},database_name:access_denied,table_name:good_table,output_path:{output_path}"
        )
        denied_result = op.validate(config, denied_params).run(
            env, Response())
        denied_by_config = {
            "1": ({
                'Errors': [
                    'Job errored: Access denied for credentials.  Ensure associated user or role has permission to CreateNamedQuery on athena'
                ],
                'Info': ['Running Query "SELECT * from $table limit 3"']
            }, 403),
            "4": ({
                'Info': [exported_message]
            }, 200),
        }
        assert denied_result.with_status() == denied_by_config[config.id]
コード例 #22
0
ファイル: metastore.py プロジェクト: kyprifog/mason
 def get_database(
     self,
     database_name: str,
     response: Optional[Response] = None
 ) -> Tuple[Result[Database, InvalidDatabase], Response]:
     """Build a ``Database`` from the table listing of ``database_name``.

     A successful listing is wrapped as ``Database("s3_table", tables)``; a
     failed one becomes ``InvalidDatabase`` carrying the failure's ``error``
     or, when that is falsy, its ``message()``.
     """
     def as_database(table_list):
         return Database("s3_table", table_list)

     def as_invalid(failure):
         return InvalidDatabase(failure.error or failure.message())

     listing, listing_response = self.list_tables(database_name, response
                                                  or Response())
     return listing.map(as_database).alt(as_invalid), listing_response
コード例 #23
0
ファイル: workflow_run.py プロジェクト: kyprifog/mason
    def run(self, env: MasonEnvironment, response: Response) -> Response:
        """Step the workflow until it is finished and collect the results.

        The DAG rendering is logged first.  The responses of the executed
        steps are merged in sorted step order.  If any step is invalid, a
        "Workflow failed" error plus each step's reason is added and the
        status is set to 400.
        """
        dag_rendering = self.dag.display()
        response.add_info(f"Running dag \n{dag_rendering}")

        while not self.finished():
            self.step(env)

        for executed in sorted(self.executed_steps):
            response = response.merge(executed.operator_response.response)

        failed_steps = self.invalid_steps
        if len(failed_steps) == 0:
            return response

        response.add_error("Workflow failed")
        for failed in failed_steps:
            response.add_error(failed.reason)
        response.set_status(400)
        return response
コード例 #24
0
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check table inference for a missing database, a bad storage path
        and a valid path."""
        # database DNE
        missing_db_params = OperatorParameters(
            parameter_string=
            "database_name:bad-database,storage_path:crawler-poc/catalog_poc_data"
        )
        missing_db = op.validate(config, missing_db_params).run(
            env, Response())
        expected_missing_db = ({
            'Errors':
            ['Job errored: Metastore database bad-database not found'],
            'Info': ['Table inferred: catalog_poc_data']
        }, 404)
        assert missing_db.with_status() == expected_missing_db

        # bad path
        bad_path_params = OperatorParameters(
            parameter_string=
            "database_name:crawler-poc,storage_path:crawler-poc/bad-table")
        bad_path = op.validate(config, bad_path_params).run(env, Response())
        expected_bad_path = ({
            'Errors': [
                'No keys at s3://crawler-poc/bad-table',
                'Job errored: Invalid Tables: No keys at s3://crawler-poc/bad-table'
            ]
        }, 404)
        assert bad_path.with_status() == expected_bad_path

        # valid path
        valid_params = OperatorParameters(
            parameter_string=
            "database_name:crawler-poc,storage_path:crawler-poc/catalog_poc_data,output_path:crawler-poc/athena/"
        )
        valid = op.validate(config, valid_params).run(env, Response())

        # Normalise whitespace and UUIDs before comparing.
        infos = [
            clean_uuid(clean_string(line))
            for line in valid.formatted()["Info"]
        ]
        expect = [
            'Tableinferred:catalog_poc_data',
            'RunningAthenaquery.query_id:test_id', 'Runningjobid=test_id'
        ]
        assert infos == expect
コード例 #25
0
 def list_objects(self, database_name: str,
                  response: Response) -> Tuple[Result[dict, str], Response]:
     """List objects for ``database_name``, read as ``<bucket>[/<prefix>]``.

     The part before the first ``/`` is the bucket; the remainder (or
     ``'/'`` when absent or empty) is the prefix.  On success the raw result
     is attached to ``response``.  A ``ClientError`` with code
     ``NoSuchBucket`` sets status 404; every other exception is returned as
     a ``Failure`` carrying its message.
     """
     try:
         parts = database_name.split("/", 1)
         bucket = parts[0]
         prefix = get(parts, 1) or '/'
         listing = self.client().s3.list_objects(Bucket=bucket,
                                                 Prefix=prefix,
                                                 Delimiter='/')
         response.add_response(listing)
         return Success(listing), response
     except ClientError as e:
         code = e.response.get("Error", {}).get("Code", "")
         if code == "NoSuchBucket":
             response.set_status(404)
             return Failure(
                 f"The specified bucket does not exist: {database_name}"
             ), response
         return Failure(message(e)), response
     except Exception as e:
         return Failure(message(e)), response
コード例 #26
0
ファイル: valid_operator.py プロジェクト: kyprifog/mason
    def execute(self,
                env: MasonEnvironment,
                response: Response,
                dry_run: bool = True) -> OperatorResponse:
        """Load this operator's module and run it, or only validate it.

        With ``dry_run`` set, a valid module only adds an info message.
        A module that is not an ``OperatorDefinition``, or a
        ``ModuleNotFoundError`` raised along the way, adds an error instead.
        """
        try:
            module = self.module(env)

            if not isinstance(module, OperatorDefinition):
                response.add_error(
                    f"Module does not contain a valid OperatorDefinition. See /examples for sample operator implementations. \nMessage: {module.reason}"
                )
                return OperatorResponse(response)

            if dry_run:
                response.add_info(
                    f"Valid Operator: {self.namespace}:{self.command} with specified parameters."
                )
                return OperatorResponse(response)

            outcome: OperatorResponse = module.run(env, self.config,
                                                   self.parameters, response)
            return outcome
        except ModuleNotFoundError as e:
            response.add_error(f"Module Not Found: {e}")
            return OperatorResponse(response)
コード例 #27
0
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        """Check table lookup for an existing table, a missing database and
        a missing table."""
        # Each case: (parameter string, index passed to ``table.get`` to
        # build the expected result).
        cases = [
            # Database and table Exist
            ("database_name:crawler-poc,table_name:catalog_poc_data", 1),
            # Database DNE
            ("database_name:bad-database,table_name:catalog_poc_data", 2),
            # Table DNE
            ("database_name:crawler-poc,table_name:bad-table", 3),
        ]

        for parameter_string, case_index in cases:
            params = OperatorParameters(parameter_string=parameter_string)
            outcome = op.validate(config, params).run(env, Response())
            expected = table.get(config.metastore().client.name(), case_index)
            assert outcome.with_status() == expected
コード例 #28
0
    def list_keys(
            self,
            path: str,
            response: Optional[Response] = None
    ) -> Tuple[List[Path], Response]:
        """Find the keys under ``path`` and convert each with ``get_path``.

        The raw keys are attached to the response under ``'keys'``.  Returns
        the converted paths (empty when nothing was found) and the response,
        which is a new ``Response`` when none was supplied.
        """
        resp: Response = response or Response()
        found = self.client().find(path)
        resp.add_response({'keys': found})

        paths: List[Path] = (
            [self.get_path(key) for key in found] if len(found) > 0 else [])
        return paths, resp
コード例 #29
0
ファイル: executed_job.py プロジェクト: kyprifog/mason
 def to_response(self, response: Response) -> Response:
     """Copy this job's message and logs onto ``response``.

     A non-empty message is added as info.  Each log entry that is a string
     is added as info; every other entry is added as data.
     """
     if self.message != "":
         response.add_info(self.message)

     for entry in self.logs:
         record = (response.add_info
                   if isinstance(entry, str) else response.add_data)
         record(entry)

     return response
コード例 #30
0
        def parse_response(
                result: dict,
                response: Response) -> Result[TableList, InvalidTables]:
            """Turn a listing result into a table list or a failure.

            Each entry under ``"Contents"`` with a ``"Key"`` is looked up
            with ``self.get_table``.  If any lookup gives a valid table the
            valid ones are returned; otherwise the collected invalid tables
            are returned as a failure.  With no contents, any
            ``"CommonPrefixes"`` are added to ``response`` as data and a
            failure suggesting a deeper key is returned.
            """
            contents: Optional[List[dict]] = result.get("Contents")
            prefixes: Optional[List[dict]] = result.get("CommonPrefixes")

            if not contents:
                if not prefixes:
                    return Failure(
                        InvalidTables([], "No Data returned from AWS"))

                for prefix in prefixes:
                    response.add_data(prefix)
                return Failure(
                    InvalidTables(
                        [],
                        f"No valid tables at {database_name}.  Try appending '/' or specify deeper key."
                    ))

            looked_up: List[Union[Table, InvalidTables]] = []
            for entry in contents:
                key: Optional[str] = entry.get("Key")
                if not key:
                    continue
                found, response = self.get_table(database_name.split("/")[0],
                                                 key,
                                                 response=response)
                looked_up.append(found)

            valid, invalid = sequence(looked_up, Table, InvalidTables)
            if len(valid) > 0:
                return Success(TableList(valid))

            invalid_tables: List[InvalidTable] = []
            for failure in invalid:
                invalid_tables.extend(failure.invalid_tables)

            return Failure(
                InvalidTables(invalid_tables,
                              f"No valid tables at {database_name}"))