Esempio n. 1
0
    def execute(self,
                env: MasonEnvironment,
                response: Response,
                dry_run: bool = True) -> OperatorResponse:
        """Validate the operator module and, unless dry-running, run it.

        Args:
            env: Environment passed to ``self.module`` and to the operator's
                ``run``.
            response: Response on which info and error messages are recorded.
            dry_run: When true (the default), only report that the operator
                is valid instead of running it.

        Returns:
            The ``OperatorResponse`` returned by the operator's ``run``, or
            one wrapping ``response`` on the dry-run and failure paths.
        """
        try:
            loaded = self.module(env)

            if not isinstance(loaded, OperatorDefinition):
                response.add_error(
                    f"Module does not contain a valid OperatorDefinition. See /examples for sample operator implementations. \nMessage: {loaded.reason}"
                )
                return OperatorResponse(response)

            if not dry_run:
                return loaded.run(env, self.config, self.parameters, response)

            response.add_info(
                f"Valid Operator: {self.namespace}:{self.command} with specified parameters."
            )
            return OperatorResponse(response)
        except ModuleNotFoundError as err:
            # A missing module is recorded on the response instead of raised.
            response.add_error(f"Module Not Found: {err}")
            return OperatorResponse(response)
Esempio n. 2
0
    def run(
        self, env: MasonEnvironment, response: "Optional[Response]" = None
    ) -> OperatorResponse:
        """Register this workflow's DAG with the scheduler, then trigger it.

        Args:
            env: Environment forwarded to the scheduler's ``trigger_schedule``.
            response: Response on which messages are recorded.  A fresh
                ``Response`` is created for the call when omitted.

        Returns:
            An ``OperatorResponse`` wrapping the final response.
        """
        # A ``Response()`` default would be built once, at definition time, and
        # then shared (messages included) by every call that omits the
        # argument.  The annotation is quoted so it is not evaluated at
        # definition time.
        if response is None:
            response = Response()

        scheduler = self.config.scheduler()
        if isinstance(scheduler, SchedulerClient):
            response.add_info(
                f"Registering workflow dag {self.name} with {scheduler.client.name()}."
            )
            schedule_id, response, client_dag = scheduler.register_dag(
                self.name, self.dag, self.schedule, response)
            if not response.errored():
                response.add_info(f"Registered schedule {schedule_id}")
            # TODO: FIX
            # if client_dag and output_path:
            #     with tempfile.NamedTemporaryFile("w", delete=False) as f:
            #         json = client_dag.to_json()
            #         response.add_info(f"Saving client dag to {output_path}")
            #         f.write(json)
            #         f.close()
            #         response = self.config.storage.client.save_to(f.name, output_path, response)
            if self.schedule:
                # The workflow is triggered immediately even though it
                # carries a schedule, so warn about the off-schedule run.
                response.add_warning(
                    f"Triggering workflow off schedule: {self.schedule.definition}"
                )

            response.add_info(f"Triggering schedule: {schedule_id}")
            response = scheduler.trigger_schedule(schedule_id, response, env)
        else:
            response.add_error("Scheduler client not defined")

        return OperatorResponse(response)
Esempio n. 3
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Trigger the schedule for a table through the configured scheduler.

        Reads the required ``database_name`` and ``table_name`` parameters and
        hands them, with ``response``, to the scheduler's
        ``trigger_schedule_for_table``.

        Returns:
            An ``OperatorResponse`` wrapping the response the scheduler
            returned.
        """
        database_name: str = parameters.get_required("database_name")
        table_name: str = parameters.get_required("table_name")

        # TODO: break this up into 2 calls between scheduler and metastore, remove trigger_schedule_for_table from engine definition, table may not be defined for scheduler
        scheduler = config.scheduler()
        triggered = scheduler.trigger_schedule_for_table(
            table_name, database_name, response)
        return OperatorResponse(triggered)
Esempio n. 4
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Run a query job against a table looked up in the metastore.

        Required parameters are ``query_string``, ``database_name`` and
        ``table_name``; ``output_path`` is optional and is only resolved to a
        path when the configured storage is a ``StorageClient``.

        Returns:
            An ``OperatorResponse`` carrying the response and either the
            result of ``run_job`` or an ``InvalidJob`` built from the lookup
            result's message when the lookup did not yield a ``Table``.
        """
        query_string = parameters.get_required("query_string")
        database_name = parameters.get_required("database_name")
        table_name = parameters.get_required("table_name")
        output_path = parameters.get_optional("output_path")

        final: Union[ExecutedJob, InvalidJob]

        # Hand the caller's response to get_table, as the other get_table
        # callers do; `response` is rebound to whatever get_table returns, so
        # omitting it would leave the caller's response out of the result.
        table, response = config.metastore().get_table(
            database_name, table_name, response=response)

        outp: Optional[Path] = None
        if output_path:
            storage = config.storage()
            if isinstance(storage, StorageClient):
                outp = storage.path(output_path)

        if isinstance(table, Table):
            # TODO?: Sanitize the query string
            response.add_info(f"Running Query \"{query_string}\"")
            job = QueryJob(query_string, table, outp)
            final, response = config.execution().run_job(job, response)
        else:
            final = InvalidJob(table.message())

        return OperatorResponse(response, final)
Esempio n. 5
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Delete a schedule through the configured scheduler.

        Reads the required ``schedule_name`` parameter and passes it, with
        ``response``, to the scheduler's ``delete_schedule``.

        Returns:
            An ``OperatorResponse`` wrapping the response the scheduler
            returned.
        """
        name: str = parameters.get_required("schedule_name")
        scheduler = config.scheduler()
        deleted = scheduler.delete_schedule(name, response)
        return OperatorResponse(deleted)
Esempio n. 6
0
 def run(self, env: MasonEnvironment, config: Config,
         parameters: ValidatedParameters,
         response: Response) -> OperatorResponse:
     """List the tables of a database via the configured metastore.

     Reads the required ``database_name`` parameter, calls the metastore's
     ``list_tables`` and passes the listing through ``compute``.

     Returns:
         An ``OperatorResponse`` carrying the response returned by
         ``list_tables`` and the value returned by ``compute``.
     """
     db: str = parameters.get_required("database_name")
     metastore = config.metastore()
     listing, response = metastore.list_tables(db, response)
     return OperatorResponse(response, compute(listing))
Esempio n. 7
0
 def dry_run(
     self, env: MasonEnvironment, response: "Optional[Response]" = None
 ) -> OperatorResponse:
     """Describe the workflow DAG without registering or triggering it.

     Args:
         env: Unused by this method.
         response: Response on which messages are recorded.  A fresh
             ``Response`` is created for the call when omitted.

     Returns:
         An ``OperatorResponse`` wrapping the response, which holds the DAG
         display as info messages and one warning per invalid step.
     """
     # A ``Response()`` default would be built once, at definition time, and
     # then shared (messages included) by every call that omits the argument.
     # The annotation is quoted so it is not evaluated at definition time.
     if response is None:
         response = Response()

     response.add_info("Performing Dry Run for Workflow")
     response.add_info("")
     response.add_info("Valid Workflow DAG Definition:")
     response.add_info("-" * 80)
     response.add_info(f"\n{self.dag.display()}")
     response.add_info("Finished")
     for step in self.dag.invalid_steps:
         response.add_warning(step.reason)
     return OperatorResponse(response)
Esempio n. 8
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Run a merge job when the input has one supported conflicting schema type.

        Infers a table from the required ``input_path`` parameter.  If the
        inference yields a ``Table`` there is nothing to merge.  Otherwise the
        conflicting table's unique schemas are inspected and a ``MergeJob``
        from ``input_path`` to ``output_path`` is run only when every schema
        type is supported and there is exactly one type.

        Returns:
            An ``OperatorResponse`` carrying the response and either the
            running job or an ``InvalidJob`` describing why no merge ran.
        """
        SUPPORTED_SCHEMAS = {"parquet", "csv", "json", "jsonl"}

        input_path = parameters.get_required("input_path")
        output_path = parameters.get_required("output_path")
        parse_headers = parameters.get_optional("parse_headers")

        table, response = config.storage().infer_table(
            input_path, "input_table", {"read_headers": parse_headers},
            response)
        final: Union[ExecutedJob, InvalidJob]

        if isinstance(table, Table):
            final = InvalidJob(
                "No conflicting schemas found. Merge Unecessary")
            return OperatorResponse(response, final)

        conflicting_table = table.conflicting_table()
        if not conflicting_table:
            final = InvalidJob(
                f"No conflicting schemas found at {input_path}. Merge unecessary. Invalid Schemas {table.message()}"
            )
            return OperatorResponse(response, final)

        schemas = conflicting_table.schema_conflict.unique_schemas
        schema_types: Set[str] = {schema.type for schema in schemas}

        if not (len(schemas) > 0
                and schema_types.issubset(SUPPORTED_SCHEMAS)):
            unsupported = ', '.join(
                schema_types.difference(SUPPORTED_SCHEMAS))
            final = InvalidJob(
                f"Unsupported schemas for merge operator: {unsupported}"
            )
        elif len(schema_types) != 1:
            final = InvalidJob(
                "Mixed schemas not supported at this time.")
        else:
            # Build the job only after the checks above: taking
            # next(iter(schema_types)) on an empty set raises StopIteration.
            job = MergeJob(config.storage().path(input_path),
                           config.storage().path(output_path),
                           next(iter(schema_types)))
            executed, response = config.execution().run_job(job, response)
            if isinstance(executed, ExecutedJob):
                final = job.running()
            else:
                final = InvalidJob(
                    f"Job {job.id} errored: {executed.reason}")

        return OperatorResponse(response, final)
Esempio n. 9
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            resp: Response) -> OperatorResponse:
        """Look up a table in the configured metastore.

        Reads the required ``database_name`` and ``table_name`` parameters.
        The ``read_headers`` option sent to the metastore is true exactly when
        the optional ``read_headers`` parameter is a string.

        Returns:
            An ``OperatorResponse`` carrying the response and the lookup
            result returned by ``get_table``.
        """
        database_name: str = parameters.get_required("database_name")
        table_name: str = parameters.get_required("table_name")

        # Any string value for the optional parameter switches the flag on.
        header_param = parameters.get_optional("read_headers")
        read_headers: bool = isinstance(header_param, str)

        metastore = config.metastore()
        lookup_options = {"read_headers": read_headers}
        table, response = metastore.get_table(
            database_name,
            table_name,
            options=lookup_options,
            response=resp)
        return OperatorResponse(response, table)
Esempio n. 10
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Run a format job for a metastore table.

        Required parameters are ``table_name``, ``database_name``,
        ``output_path`` and ``format``.  ``sample_size`` (falls back to
        ``"3"``), ``partition_columns``, ``filter_columns`` and ``partitions``
        are optional.

        Returns:
            An ``OperatorResponse`` carrying the response and either the
            result of ``run_job`` or an ``InvalidJob`` when the metastore
            credentials or the table lookup are not valid.
        """
        table_name: str = parameters.get_required("table_name")
        database_name: str = parameters.get_required("database_name")
        output_path: str = parameters.get_required("output_path")
        output_format: str = parameters.get_required("format")
        sample_size: str = parameters.get_optional("sample_size") or "3"

        partition_columns: Optional[str] = parameters.get_optional(
            "partition_columns")
        filter_columns: Optional[str] = parameters.get_optional(
            "filter_columns")
        partitions: Optional[str] = parameters.get_optional("partitions")

        destination = config.storage().path(output_path)
        table, response = config.metastore().get_table(
            database_name,
            table_name,
            options={"sample_size": sample_size},
            response=response)
        credentials = config.metastore().credentials()

        # Invalid credentials take precedence over a failed table lookup.
        if not isinstance(credentials, MetastoreCredentials):
            executed = InvalidJob(
                f"Invalid Metastore Credentials: {credentials.reason}")
        elif not isinstance(table, Table):
            executed = InvalidJob(
                f"Table not found: {table_name}, {database_name}. Messages:  {table.message()}")
        else:
            job = FormatJob(table, destination, output_format,
                            partition_columns, filter_columns, partitions,
                            credentials)
            executed, response = config.execution().run_job(job, response)

        return OperatorResponse(response, executed)
Esempio n. 11
0
 def run(
     self, env: MasonEnvironment, response: "Optional[Response]" = None
 ) -> OperatorResponse:
     """Report that this operator is invalid.

     Args:
         env: Unused by this method.
         response: Response on which the error is recorded.  A fresh
             ``Response`` is created for the call when omitted.

     Returns:
         An ``OperatorResponse`` wrapping the response, which carries the
         error message and status 400.
     """
     # A ``Response()`` default would be built once, at definition time, and
     # then shared by every call that omits the argument, so errors and the
     # 400 status would carry over between calls.  The annotation is quoted
     # so it is not evaluated at definition time.
     if response is None:
         response = Response()

     response.add_error(f"Invalid Operator.  Reason:  {self.reason}")
     response.set_status(400)
     return OperatorResponse(response)
Esempio n. 12
0
 def dry_run(
     self, env: MasonEnvironment, response: "Optional[Response]" = None
 ) -> OperatorResponse:
     """Report that this resource is invalid.

     Args:
         env: Unused by this method.
         response: Response on which the error is recorded.  A fresh
             ``Response`` is created for the call when omitted.

     Returns:
         An ``OperatorResponse`` wrapping the response, which carries the
         error message and status 400.
     """
     # A ``Response()`` default would be built once, at definition time, and
     # then shared by every call that omits the argument, so errors and the
     # 400 status would carry over between calls.  The annotation is quoted
     # so it is not evaluated at definition time.
     if response is None:
         response = Response()

     response.add_error("Invalid Resource: " + self.reason)
     response.set_status(400)
     return OperatorResponse(response)
Esempio n. 13
0
 def run(self, env: MasonEnvironment, config: Config,
         parameters: ValidatedParameters,
         response: Response) -> OperatorResponse:
     """Record an info message and return the wrapped response.

     ``env``, ``config`` and ``parameters`` are not used.

     Returns:
         An ``OperatorResponse`` wrapping ``response``.
     """
     message = "Running operator2"
     response.add_info(message)
     wrapped = OperatorResponse(response)
     return wrapped
Esempio n. 14
0
 def run(self, env: MasonEnvironment, config: Config,
         parameters: ValidatedParameters,
         response: Response) -> OperatorResponse:
     """Wrap the incoming response in an ``OperatorResponse``.

     ``env``, ``config`` and ``parameters`` are not used, and nothing is
     recorded on ``response``.
     """
     wrapped = OperatorResponse(response)
     return wrapped
Esempio n. 15
0
 def run(self, env: MasonEnvironment, config: Config,
         parameters: ValidatedParameters,
         response: Response) -> OperatorResponse:
     """Fetch a job by id from the configured execution client.

     Reads the required ``job_id`` parameter and passes it, with
     ``response``, to the execution client's ``get_job``.

     Returns:
         An ``OperatorResponse`` carrying the response and the job value
         returned by ``get_job``.
     """
     job_id: str = parameters.get_required("job_id")
     # Resolve the execution client once; no separate unused handle is kept.
     job, response = config.execution().get_job(job_id, response)
     return OperatorResponse(response, job)
Esempio n. 16
0
 def run(self, env: MasonEnvironment, config: Config,
         parameters: ValidatedParameters,
         response: Response) -> OperatorResponse:
     """Return a fixed ``test_table`` with an empty schema.

     Records an info message on ``response``; ``env``, ``config`` and
     ``parameters`` are not used.

     Returns:
         An ``OperatorResponse`` carrying ``response`` and the table.
     """
     empty_schema = EmptySchema()
     test_table = Table("test_table", empty_schema)
     response.add_info("Running operator1")
     return OperatorResponse(response, test_table)