Esempio n. 1
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Trigger the schedule associated with a table.

        Reads the required ``database_name`` and ``table_name`` parameters,
        hands them to the configured scheduler together with the current
        response, and wraps the response the scheduler returns.
        """
        database_name: str = parameters.get_required("database_name")
        table_name: str = parameters.get_required("table_name")

        # TODO: break this up into 2 calls between scheduler and metastore,
        # remove trigger_schedule_for_table from engine definition, table may
        # not be defined for scheduler
        scheduler = config.scheduler()
        triggered = scheduler.trigger_schedule_for_table(
            table_name, database_name, response)
        return OperatorResponse(triggered)
Esempio n. 2
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Run a query against a table looked up in the metastore.

        Required parameters: ``query_string``, ``database_name``,
        ``table_name``. Optional parameter: ``output_path``.

        The table is fetched from the metastore first. If the lookup yields a
        ``Table``, a ``QueryJob`` is submitted to the execution engine;
        otherwise the lookup's message is wrapped in an ``InvalidJob``.
        """
        query_string = parameters.get_required("query_string")
        database_name = parameters.get_required("database_name")
        table_name = parameters.get_required("table_name")
        output_path = parameters.get_optional("output_path")

        # TODO?: Sanitize the query string

        # NOTE(review): the incoming response is not forwarded to get_table;
        # it is replaced by whatever response get_table returns - confirm
        # this is intended.
        table, response = config.metastore().get_table(database_name,
                                                       table_name)

        # An output location is only resolved when a path was supplied and the
        # configured storage is a StorageClient.
        outp: Optional[Path] = None
        if output_path and isinstance(config.storage(), StorageClient):
            outp = config.storage().path(output_path)

        final: Union[ExecutedJob, InvalidJob]
        if not isinstance(table, Table):
            final = InvalidJob(table.message())
        else:
            response.add_info(f"Running Query \"{query_string}\"")
            job = QueryJob(query_string, table, outp)
            final, response = config.execution().run_job(job, response)

        return OperatorResponse(response, final)
Esempio n. 3
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Merge the data at ``input_path`` into ``output_path``.

        Required parameters: ``input_path``, ``output_path``. Optional
        parameter: ``parse_headers`` (passed to table inference as
        ``read_headers``).

        A merge job is only submitted when table inference reports a schema
        conflict whose unique schemas all share one type and that type is in
        ``SUPPORTED_SCHEMAS``. Every other outcome is returned as an
        ``InvalidJob`` describing why no merge was run.
        """
        SUPPORTED_SCHEMAS = {"parquet", "csv", "json", "jsonl"}

        input_path = parameters.get_required("input_path")
        output_path = parameters.get_required("output_path")
        parse_headers = parameters.get_optional("parse_headers")

        table, response = config.storage().infer_table(
            input_path, "input_table", {"read_headers": parse_headers},
            response)
        final: Union[ExecutedJob, InvalidJob]

        if isinstance(table, Table):
            # Inference produced a single table, so there is nothing to merge.
            final = InvalidJob(
                "No conflicting schemas found. Merge Unecessary")
        else:
            conflicting_table = table.conflicting_table()
            if conflicting_table:
                schemas = conflicting_table.schema_conflict.unique_schemas
                schema_types: Set[str] = {schema.type for schema in schemas}
                if len(schemas) > 0 and schema_types.issubset(
                        SUPPORTED_SCHEMAS):
                    if len(schema_types) == 1:
                        # Build the job only once exactly one schema type is
                        # known to exist; taking next(iter(...)) before the
                        # emptiness check raised StopIteration when there were
                        # no schemas.
                        job = MergeJob(config.storage().path(input_path),
                                       config.storage().path(output_path),
                                       next(iter(schema_types)))
                        executed, response = config.execution().run_job(
                            job, response)
                        if isinstance(executed, ExecutedJob):
                            final = job.running()
                        else:
                            final = InvalidJob(
                                f"Job {job.id} errored: {executed.reason}")
                    else:
                        final = InvalidJob(
                            "Mixed schemas not supported at this time.")
                else:
                    # Sorted so the message is stable across runs.
                    unsupported = sorted(
                        schema_types.difference(SUPPORTED_SCHEMAS))
                    final = InvalidJob(
                        f"Unsupported schemas for merge operator: {', '.join(unsupported)}"
                    )
            else:
                final = InvalidJob(
                    f"No conflicting schemas found at {input_path}. Merge unecessary. Invalid Schemas {table.message()}"
                )

        return OperatorResponse(response, final)
Esempio n. 4
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            resp: Response) -> OperatorResponse:
        """Fetch a table from the metastore and return it in the response.

        Required parameters: ``database_name``, ``table_name``. Optional
        parameter: ``read_headers``.
        """
        database_name: str = parameters.get_required("database_name")
        table_name: str = parameters.get_required("table_name")

        # read_headers acts as a flag: it is True whenever the optional
        # parameter comes back as a str, regardless of the string's content.
        header_value = parameters.get_optional("read_headers")
        read_headers: bool = isinstance(header_value, str)

        lookup_options = {"read_headers": read_headers}
        metastore = config.metastore()
        table, response = metastore.get_table(database_name,
                                              table_name,
                                              options=lookup_options,
                                              response=resp)
        return OperatorResponse(response, table)
Esempio n. 5
0
    def validate(self, required_keys: List[str],
                 optional_keys: List[str]) -> ValidatedParameters:
        """Sort the parsed parameters into validated, optional and invalid.

        Each entry of ``self.parameters`` is checked with its own
        ``validate`` method, which yields a ``(valid, optional, invalid)``
        triple; the first truthy member decides which group the entry joins.
        Any required key without a validated parameter is reported as an
        ``InvalidParameter``.

        Args:
            required_keys: Keys that must be present.
            optional_keys: Keys that may be present.

        Returns:
            A ``ValidatedParameters`` built from the parsed parameters and
            the three groups.
        """
        parsed_parameters = self.parameters
        validated_parameters: List[ValidatedParameter] = []
        optional_parameters: List[OptionalParameter] = []
        # Work on a copy: appending to self.invalid directly made every call
        # grow the instance's list, so repeated validation duplicated errors.
        invalid_parameters: List[InvalidParameter] = list(self.invalid)

        for parameter in parsed_parameters:
            valid, optional, invalid = parameter.validate(
                required_keys, optional_keys)
            if valid:
                validated_parameters.append(valid)
            elif optional:
                optional_parameters.append(optional)
            elif invalid:
                invalid_parameters.append(invalid)

        validated_keys = {v.key for v in validated_parameters}

        # dict.fromkeys de-duplicates while keeping the caller's order, so
        # missing-key errors come out in a deterministic order.
        for key in dict.fromkeys(required_keys):
            if key not in validated_keys:
                invalid_parameters.append(
                    InvalidParameter(
                        f"Required parameter not specified: {key}"))

        return ValidatedParameters(parsed_parameters, validated_parameters,
                                   optional_parameters, invalid_parameters)
Esempio n. 6
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Delete the schedule named by the required ``schedule_name`` parameter.

        The configured scheduler receives the name and the current response;
        the response it returns is wrapped and handed back.
        """
        schedule_name: str = parameters.get_required("schedule_name")
        scheduler = config.scheduler()
        updated = scheduler.delete_schedule(schedule_name, response)
        return OperatorResponse(updated)
Esempio n. 7
0
 def run(self, env: MasonEnvironment, config: Config,
         parameters: ValidatedParameters,
         response: Response) -> OperatorResponse:
     """List the tables of the database named by ``database_name``.

     The metastore listing is passed through ``compute`` before being
     attached to the operator response.
     """
     database_name: str = parameters.get_required("database_name")
     metastore = config.metastore()
     tables, response = metastore.list_tables(database_name, response)
     # NOTE(review): compute() is defined outside this block; its effect on
     # the listing is not visible here.
     return OperatorResponse(response, compute(tables))
Esempio n. 8
0
    def run(self, env: MasonEnvironment, config: Config,
            parameters: ValidatedParameters,
            response: Response) -> OperatorResponse:
        """Submit a job that writes a metastore table out in another format.

        Required parameters: ``table_name``, ``database_name``,
        ``output_path``, ``format``. Optional parameters: ``sample_size``
        (falls back to ``"3"``), ``partition_columns``, ``filter_columns``,
        ``partitions``.

        Returns an ``InvalidJob`` when the metastore credentials are not
        ``MetastoreCredentials`` or when the table lookup does not yield a
        ``Table``; otherwise returns the result of running a ``FormatJob``.
        """
        table_name: str = parameters.get_required("table_name")
        database_name: str = parameters.get_required("database_name")
        output_path: str = parameters.get_required("output_path")
        output_format: str = parameters.get_required("format")
        sample_size: str = parameters.get_optional("sample_size") or "3"

        partition_columns: Optional[str] = parameters.get_optional(
            "partition_columns")
        filter_columns: Optional[str] = parameters.get_optional(
            "filter_columns")
        partitions: Optional[str] = parameters.get_optional("partitions")

        destination = config.storage().path(output_path)
        lookup_options = {"sample_size": sample_size}
        table, response = config.metastore().get_table(
            database_name,
            table_name,
            options=lookup_options,
            response=response)
        credentials = config.metastore().credentials()

        # Credentials are checked before the table, so a credentials problem
        # takes precedence over a missing table.
        if not isinstance(credentials, MetastoreCredentials):
            return OperatorResponse(
                response,
                InvalidJob(
                    f"Invalid Metastore Credentials: {credentials.reason}"))

        if not isinstance(table, Table):
            return OperatorResponse(
                response,
                InvalidJob(
                    f"Table not found: {table_name}, {database_name}. Messages:  {table.message()}"
                ))

        job = FormatJob(table, destination, output_format, partition_columns,
                        filter_columns, partitions, credentials)
        executed, response = config.execution().run_job(job, response)
        return OperatorResponse(response, executed)
Esempio n. 9
0
 def run(self, env: MasonEnvironment, config: Config,
         parameters: ValidatedParameters,
         response: Response) -> OperatorResponse:
     """Look up a job by the required ``job_id`` parameter.

     The configured execution client is asked for the job; the job and the
     response it returns are wrapped in the operator response.
     """
     job_id: str = parameters.get_required("job_id")
     # Reuse the one execution client; previously it was fetched into an
     # unused local and then fetched a second time for the call.
     execution = config.execution()
     job, response = execution.get_job(job_id, response)
     return OperatorResponse(response, job)