def test_csv_equality(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
        schema1 = from_file(f, {"read_headers": True})
        assert(isinstance(schema1, TextSchema))
    with fs.open(from_root('/test/sample_data/csv_sample_2.csv')) as f:
        schema2 = from_file(f, {"read_headers": True})
        assert(isinstance(schema2, TextSchema))

    schema = find_conflicts([schema1, schema2])[0]
    assert(isinstance(schema, SchemaConflict))
    expect = {
        'CountDistinctSchemas': 2,
        'DistinctSchemas': [
            {'SchemaType': 'csv',
             'Columns': [{'Name': 'type', 'Type': 'object'},
                         {'Name': 'price', 'Type': 'float64'}]},
            {'SchemaType': 'csv',
             'Columns': [{'Name': 'type', 'Type': 'object'},
                         {'Name': 'price', 'Type': 'float64'},
                         {'Name': 'availabile', 'Type': 'bool'},
                         {'Name': 'date', 'Type': 'object'}]}
        ],
        'NonOverlappingColumns': [{'name': 'availabile', 'type': 'bool'},
                                  {'name': 'date', 'type': 'object'}]
    }
    assert(schema.to_dict() == {'SchemaConflicts': expect})
def parse_param_dict(
    self, param_dict: dict
) -> Tuple[List[WorkflowParameter], List[InvalidParameter], Optional[str], Optional[str], bool]:
    valid: List[WorkflowParameter] = []
    invalid: List[InvalidParameter] = []
    schedule: Optional[str] = None
    schedule_name: Optional[str] = None
    strict: bool = True

    if isinstance(param_dict, dict):
        validated = object_from_json_schema(
            param_dict, from_root("/parameters/workflow_schema.json"), dict)
        # TODO: Use typistry for this
        # parameters = validate_dict(TypedDict(param_dict, "workflow_parameters"))
        if isinstance(validated, dict):  # can now be confident it matches the schema definition
            schedule = validated.get("schedule")
            schedule_name = validated.get("schedule_name")
            strict_mode: Optional[Any] = validated.get("strict")
            strict = strict_mode if isinstance(strict_mode, bool) else True
            for key, value in validated.items():
                if key not in ("schedule", "schedule_name", "strict"):
                    config_id: str = str(value["config_id"])
                    parameters: Dict[str, Dict[str, Any]] = value["parameters"]
                    valid_step, invalid_step = parse_dict(
                        parameters, from_root("/parameters/schema.json"))
                    ip = OperatorParameters()
                    ip.parameters = valid_step
                    ip.invalid = invalid_step
                    valid.append(WorkflowParameter(key, config_id, ip))
        else:
            invalid.append(
                InvalidParameter(f"Invalid parameters: {validated.reason}"))
    else:
        invalid.append(
            InvalidParameter(
                f"Parameters do not conform to specified schema in parameters/workflow_schema.json. Must be of form step_id: key:value. {param_dict}"
            ))

    return valid, invalid, schedule, schedule_name, strict
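# A minimal sketch (not from the source) of the dict shape parse_param_dict
# accepts, inferred from the branches above. The step name, config_id, and
# parameter values are hypothetical; any key other than "schedule",
# "schedule_name", and "strict" is treated as a workflow step.
example_param_dict = {
    "schedule": "0 0 * * *",      # optional
    "schedule_name": "nightly",   # optional
    "strict": True,               # optional; defaults to True when absent or non-bool
    "step_1": {
        "config_id": "1",
        "parameters": {"database_name": "some_db", "table_name": "some_table"}
    }
}
# valid, invalid, schedule, schedule_name, strict = self.parse_param_dict(example_param_dict)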
def test_valid_csv(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
        schema = from_file(f, {"read_headers": True})
        assert(isinstance(schema, TextSchema))
        assert(list(map(lambda c: c.name, schema.columns)) == ["type", "price"])
        assert(list(map(lambda c: c.type, schema.columns)) == ["object", "float64"])
def test_csv_no_header(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/csv_no_header.csv')) as f:
        schema = from_file(f)
        assert(isinstance(schema, TextSchema))
        assert(list(map(lambda c: c.name, schema.columns)) == [0, 1])
        assert(list(map(lambda c: c.type, schema.columns)) == ["object", "float64"])
def test_file_not_supported(self):
    logger.set_level("error")
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/unsupported_file_type.usf')) as f:
        schema = from_file(f)
        assert(isinstance(schema, InvalidSchema))
        assert(schema.reason[0:32] == "File type not supported for file")
def test_invalid_json(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/bad_json.json')) as f:
        schema = from_file(f, {})
        assert(isinstance(schema, InvalidSchema))
        message = f"File type not supported for file {from_root('/test/sample_data/bad_json.json')}. Type: ASCII text, with no line terminators"
        assert(message in schema.reason)
def test_complex_json(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/complex_json.json')) as f:
        schema = from_file(f)
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'type': 'object',
            'properties': {
                'data': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'field1': {'type': 'string'},
                            'field2': {'type': ['integer', 'string']},
                            'field3': {'type': 'string'},
                            'field4': {'type': 'string'},
                            'field5': {
                                'type': 'object',
                                'properties': {'some_other_stuff': {'type': 'string'}},
                                'required': ['some_other_stuff']
                            }
                        }
                    }
                }
            },
            'required': ['data']
        }
        assert(schema.schema == expect)
def test_jsonl(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/json_lines.jsonl')) as f:
        schema = from_file(f)
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {
                'field': {'type': 'string'},
                'field2': {'type': 'string'},
                'field3': {'type': 'string'},
                'field4': {'type': 'string'},
                'field5': {'type': 'string'},
                'field6': {'type': 'string'},
                'field7': {'type': 'string'}
            },
            'type': 'object'
        }
        assert(schema.schema == expect)
def run_around_tests(self):
    examples = from_root("/examples/")
    os.environ["MASON_HOME"] = examples
    yield
    current_config = examples + "configs/CURRENT_CONFIG"
    if path.exists(current_config):
        os.remove(current_config)
def run_around_tests(self):
    tmp_folder = from_root("/.tmp/")
    if not path.exists(tmp_folder):
        mkdir(tmp_folder)
    yield
    if path.exists(tmp_folder):
        shutil.rmtree(tmp_folder)
def run_around_tests(self):
    logger.set_level("fatal")
    try:
        yield
    finally:
        cc = from_root("/test/support/configs/CURRENT_CONFIG")
        if path.exists(cc):
            remove(cc)
def __init__(self, mason_home: Optional[str] = None, validation_path: Optional[str] = None):
    self.mason_home = get_mason_home(mason_home)
    self.validation_path: str = validation_path or from_root("/validations/")
    self.state_store = LocalStateStore(self.mason_home, get_client_version())
    self.load_environment_variables()
def run_around_tests(self):
    os.environ["MASON_HOME"] = ".tmp/"
    if path.exists(".tmp/"):
        shutil.rmtree(".tmp/")
    os.mkdir(".tmp/")
    yield
    if path.exists(".tmp/"):
        shutil.rmtree(".tmp/")
    load_dotenv(from_root('/.env.example'))
def test_good_configs(self):
    env = base.get_env("/.tmp/", "/test/support/validations/")
    response, status = apply(from_root("/test/support/"), env=env, log_level="fatal")
    assert(len(response["Info"]) == 20)
    assert(len(response["Errors"]) == 8)
    assert(status == 200)

    response, status = get("config", env=env, log_level="fatal")
    assert(len(response["Configs"]) == 4)
def test_good_operators(self):
    env = base.get_env("/.tmp/", "/test/support/validations/")
    response, status = apply(from_root("/test/support/"), env=env, log_level="fatal")
    assert(len(response["Info"]) == 20)
    assert(len(response["Errors"]) == 8)
    assert(status == 200)

    response, status = get("operator", env=env, log_level="fatal")
    assert(len(response["Operators"]) == 6)
    operators = sorted(list(map(lambda o: o["command"], response["Operators"])))
    assert(operators == ["operator1", "operator2", "operator3",
                         "operator4", "operator5", "operator6"])
def test_valid_json(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/json_simple.json')) as f:
        schema = from_file(f)
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {
                'field': {'type': 'string'},
                'field2': {'type': 'string'},
                'field3': {'type': 'string'}
            },
            'required': ['field', 'field2', 'field3'],
            'type': 'object'
        }
        assert(schema.schema == expect)
        assert(schema.to_dict() == {'Columns': [], 'SchemaType': 'json'})
        assert(schema.to_pd_dict() == {})
def open(self, key: str):
    fs = LocalFileSystem()
    if key in ("s3://crawler-poc/catalog_poc_data/test1.csv",
               "s3://crawler-poc/catalog_poc_data/test2.csv"):
        return fs.open(from_root('/test/sample_data/sample.snappy.parquet'))
    elif key in ("s3://tests/in/csv/sample.csv",
                 "s3://tests/in/csv/sample2.csv"):
        return fs.open(from_root('/test/sample_data/csv_sample.csv'))
    elif key in ("s3://test-data/test-path/test1.usf",
                 "s3://test-data/test-path/test2.usf"):
        return fs.open(from_root('/test/sample_data/unsupported_file_type.usf'))
    elif key == "s3://test-data/test-path/sample.snappy.parquet":
        return fs.open(from_root('/test/sample_data/sample.snappy.parquet'))
    else:
        raise Exception(f"Unmocked S3 API endpoint: {key}")
def parse_path(self, param_path: str) -> Tuple[List[Parameter], List[InvalidParameter]]:
    parsed = parse_yaml_invalid(param_path)
    valid: List[Parameter] = []
    invalid: List[InvalidParameter] = []
    if isinstance(parsed, dict):
        valid, invalid = parse_dict(parsed, from_root("/parameters/schema.json"))
    else:
        invalid.append(InvalidParameter(parsed))
    return valid, invalid
def test_query():
    def tests(env: MasonEnvironment, config: Config, op: Operator):
        # valid query
        query = "SELECT * from $table limit 3"
        output_path = from_root("/.tmp/")
        params = OperatorParameters(
            parameter_string=f"query_string:{query},database_name:good_database,table_name:good_table,output_path:{output_path}"
        )
        result = op.validate(config, params).run(env, Response())
        exp = {
            "1": [
                'Running Query "SELECT * from $table limit 3"',
                'Running Athena query. query_id: test',
                'Running job id=test'
            ],
            "4": [
                f'Table succesfully formatted as parquet and exported to {output_path}'
            ]
        }
        expect = {'Info': exp[config.id]}
        assert(result.with_status() == (expect, 200))

        # bad permissions
        query = "SELECT * from $table limit 3"
        params = OperatorParameters(
            parameter_string=f"query_string:{query},database_name:access_denied,table_name:good_table,output_path:{output_path}"
        )
        result = op.validate(config, params).run(env, Response())
        exp_2 = {
            "1": ({
                'Errors': [
                    'Job errored: Access denied for credentials. Ensure associated user or role has permission to CreateNamedQuery on athena'
                ],
                'Info': ['Running Query "SELECT * from $table limit 3"']
            }, 403),
            "4": ({
                'Info': [
                    f'Table succesfully formatted as parquet and exported to {output_path}'
                ]
            }, 200)
        }
        assert(result.with_status() == exp_2[config.id])

    run_tests("table", "query", True, "fatal", ["1", "4"], tests)

    tmp_folder = from_root("/.tmp/")
    if path.exists(tmp_folder):
        shutil.rmtree(tmp_folder)
def run_tests(namespace: str, command: str, do_mock: bool, log_level: str,
              configs: List[str], callable, *args, **kwargs):
    logger.set_level(log_level)
    env = get_env()
    load_dotenv(from_root("/../.env.example"))
    workflow = kwargs.get("workflow") or False
    if do_mock:
        patches = get_patches()
        with contextlib.ExitStack() as stack:
            for p in patches:
                stack.enter_context(p)
            run_test(env, namespace, command, configs, workflow, callable)
    else:
        run_test(env, namespace, command, configs, workflow, callable)
def run(self):
    try:
        # banner("Importing all registered_operator modules for API")
        env = MasonEnvironment()
        # operators.import_all(env)

        base_swagger = from_root("/api/base_swagger.yml")
        # banner(f"Regenerating api yaml based on registered_operators to {base_swagger}")
        # operators.update_yaml(env, base_swagger)

        app = connexion.App(__name__, specification_dir='api')

        # Read the swagger.yml file to configure the endpoints
        swagger = from_root("/api/base_swagger.yml")
        app.add_api(swagger)

        # Create a URL route in our application for "/"
        @app.route('/')
        def home():
            """
            This function just responds to the browser URL localhost:5000/
            :return: the rendered template 'home.html'
            """
            with open("../README.md", "r") as readme_file:
                md_template_string = markdown.markdown(
                    readme_file.read(), extensions=["fenced_code"])
            return md_template_string

        # If we're running in stand-alone mode, run the application
        if __name__ == 'mason.server':
            app.run(host='0.0.0.0', port=5000, debug=True)
    except ModuleNotFoundError as e:
        logger.error(str(e))
def tests(env: MasonEnvironment, config: Config, wf: Workflow):
    # DNE
    params = WorkflowParameters(parameter_path=from_root(
        "/test/support/parameters/table_infer_parameters_1.yaml"))
    dne = wf.validate(env, config, params).run(env, Response())
    assert(dne.with_status() == expects.post(False))

    # Exists
    params = WorkflowParameters(parameter_path=from_root(
        "/test/support/parameters/table_infer_parameters_2.yaml"))
    exists = wf.validate(env, config, params).run(env, Response())
    assert(exists.with_status() == expects.post(True))

    # API
    response, status = run(
        "workflow", wf.namespace, wf.command,
        param_file=from_root("/test/support/parameters/table_infer_parameters_1.yaml"),
        config_id=config.id, env=env, log_level="fatal")
    assert((response, status) == expects.post(False))
def test_check_schemas(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/complex_json.json')) as f:
        schema1 = from_file(f)
        assert(isinstance(schema1, JsonSchema))
    with fs.open(from_root('/test/sample_data/complex_json_2.json')) as f:
        schema2 = from_file(f)
        assert(isinstance(schema2, JsonSchema))
    with fs.open(from_root('/test/sample_data/json_simple.json')) as f:
        schema3 = from_file(f)
        assert(isinstance(schema3, JsonSchema))
    with fs.open(from_root('/test/sample_data/unsupported_file_type.usf')) as f:
        schema4 = from_file(f)
        assert(isinstance(schema4, InvalidSchema))
    with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
        schema5 = from_file(f, {"read_headers": True})
        assert(isinstance(schema5, TextSchema))
    with fs.open(from_root('/test/sample_data/json_lines.jsonl')) as f:
        schema6 = from_file(f)
        assert(isinstance(schema6, JsonSchema))
    with fs.open(from_root('/test/sample_data/json_lines2.jsonl')) as f:
        schema7 = from_file(f)
        assert(isinstance(schema7, JsonSchema))

    schema = find_conflicts([schema1, schema2])[0]
    expect = {
        '$schema': 'http://json-schema.org/schema#',
        'properties': {
            'data': {
                'items': {
                    'properties': {
                        'field1': {'type': 'string'},
                        'field2': {'type': ['integer', 'string']},
                        'field3': {'type': 'string'},
                        'field4': {'type': 'string'},
                        'field5': {
                            'properties': {'some_other_stuff': {'type': 'string'}},
                            'required': ['some_other_stuff'],
                            'type': 'object'
                        },
                        'field6': {'type': 'string'}
                    },
                    'type': 'object'
                },
                'type': 'array'
            }
        },
        'required': ['data'],
        'type': 'object'
    }
    assert(isinstance(schema, JsonSchema))
    assert(schema.schema == expect)

    schema = find_conflicts([schema1, schema2, schema3])[0]
    assert(isinstance(schema, JsonSchema))
    expect = {
        '$schema': 'http://json-schema.org/schema#',
        'properties': {
            'data': {
                'items': {
                    'properties': {
                        'field1': {'type': 'string'},
                        'field2': {'type': ['integer', 'string']},
                        'field3': {'type': 'string'},
                        'field4': {'type': 'string'},
                        'field5': {
                            'properties': {'some_other_stuff': {'type': 'string'}},
                            'required': ['some_other_stuff'],
                            'type': 'object'
                        },
                        'field6': {'type': 'string'}
                    },
                    'type': 'object'
                },
                'type': 'array'
            },
            'field': {'type': 'string'},
            'field2': {'type': 'string'},
            'field3': {'type': 'string'}
        },
        'required': [],
        'type': 'object'
    }
    assert(schema.schema == expect)

    schema = find_conflicts([schema1, schema2, schema3, schema5])[0]
    assert(isinstance(schema, InvalidSchema))
    assert(schema.reason == "Mixed type schemas not supported at this time. Ensure that files are of one type: ['csv', 'json']")

    schema = find_conflicts([schema6, schema7])[0]
    assert(isinstance(schema, JsonSchema))
    expect = {
        '$schema': 'http://json-schema.org/schema#',
        'properties': {
            'field': {'type': 'string'},
            'field2': {'type': 'string'},
            'field3': {'type': 'string'},
            'field4': {'type': 'string'},
            'field5': {'type': 'string'},
            'field6': {'type': 'string'},
            'field7': {'type': 'string'},
            'other': {'type': 'string'},
            'other2': {'type': 'string'},
            'other3': {'type': 'string'}
        },
        'required': ['other'],
        'type': 'object'
    }
    assert(schema.schema == expect)
def __init__(self,
             parameter_string: Optional[str] = None,
             parameter_path: Optional[str] = None,
             parameter_dict: Optional[dict] = None):
    self.parameter_string = parameter_string
    self.parameter_path = parameter_path

    if parameter_string:
        parameters, invalid = self.parse_string(parameter_string)
    elif parameter_path:
        parameters, invalid = self.parse_path(parameter_path)
    elif parameter_dict:
        parameters, invalid = parse_dict(parameter_dict, from_root("/parameters/schema.json"))
    else:
        parameters, invalid = ([], [])

    self.parameters: List[Parameter] = dedupe(parameters)
    self.invalid: List[InvalidParameter] = invalid
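# A minimal usage sketch (not from the source), assuming this constructor
# belongs to OperatorParameters as used in the tests above; the parameter
# values are hypothetical. The three inputs are mutually exclusive and are
# checked in this order: string, then path, then dict.
# p1 = OperatorParameters(parameter_string="database_name:some_db,table_name:some_table")
# p2 = OperatorParameters(parameter_path=from_root("/test/support/parameters/some_params.yaml"))
# p3 = OperatorParameters(parameter_dict={"database_name": "some_db", "table_name": "some_table"})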
def merge_config(config: SparkConfig, job: Job):
    base_config_file = from_root("/clients/spark/runner/kubernetes_operator/base_config.yaml")
    parameters = job.parameters or {}
    parameters["job"] = job.type
    param_list = prep_parameters(parameters)

    merge_document = {
        'metadata': {
            'name': job.id
        },
        'spec': {
            'arguments': param_list,
            'image': config.docker_image,
            'mainClass': config.main_class,
            'mainApplicationFile': config.application_file,
            'sparkVersion': config.spark_version,
            'driver': {
                'cores': config.driver_cores,
                'memory': str(config.driver_memory_mbs) + 'm',
                'labels': {'version': config.spark_version}
            },
            'executor': {
                'cores': config.executor_cores,
                'instances': config.executors,
                'memory': str(config.executor_memory_mb) + 'm',
                'labels': {'version': config.spark_version}
            }
        }
    }

    arguments = yaml.dump(merge_document)
    conf = hload(base_config_file, arguments,
                 method=hiyapyco.METHOD_MERGE,
                 usedefaultyamlloader=True)
    return conf
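# A minimal usage sketch (not from the source); the SparkConfig and Job
# constructor arguments are hypothetical, but the attribute names are the
# ones read above. The job's id, type, and parameters are overlaid onto
# base_config.yaml via hiyapyco's deep merge, so job-specific values win
# over the base document.
# spark_config = SparkConfig(...)  # supplies docker_image, main_class, spark_version, etc.
# job = Job(...)                   # supplies id, type, and parameters
# merged = merge_config(spark_config, job)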
def update_yaml_file(base_swagger: str, directories: List[str]):
    swagger_file = from_root("/api/swagger.yml")
    parsed_swagger = parse_yaml(base_swagger) or {}
    paths: dict = parsed_swagger["paths"]

    for directory in directories:
        for r, d, f in os.walk(directory):
            for file in f:
                if '.yml' in file or '.yaml' in file:
                    file_path = os.path.join(r, file)
                    if file == "swagger.yml" or file == "swagger.yaml":
                        file_parsed = parse_yaml(file_path) or {}
                        parsed_paths = file_parsed.get('paths') or {}
                        if len(parsed_paths) > 0:
                            paths.update(parsed_paths)

    parsed_swagger['paths'] = paths
    with open(swagger_file, 'w+') as file:  # type: ignore
        yaml.dump(parsed_swagger, file)  # type: ignore
def test_format():
    load_dotenv(from_root("/../.env"), override=True)

    def tests(env: MasonEnvironment, config: Config, op: Operator):
        params = OperatorParameters(
            parameter_string="database_name:mason-sample-data,table_name:tests/in/csv/,format:boogo,output_path:mason-sample-data/tests/out/csv/"
        )
        good = op.validate(config, params).run(env, Response())
        invalid_job = good.object
        assert(isinstance(invalid_job, InvalidJob))

        params = OperatorParameters(
            parameter_string="database_name:mason-sample-data,table_name:tests/in/csv/,format:csv,output_path:good_output_path"
        )
        good = op.validate(config, params).run(env, Response())
        executed_job = good.object
        assert(isinstance(executed_job, ExecutedJob))

    run_tests("table", "format", True, "fatal", ["4"], tests)
def parse_schemas(directory: str, type: str, cls: Type[T]) -> Tuple[List[T], List[str]]:
    objects: List[T] = []
    errors: List[str] = []

    for r, d, f in os.walk(directory):
        for file in f:
            if '.yaml' in file or '.yml' in file:
                if not file.split(".")[0] == "swagger":
                    file_path = os.path.join(r, file)
                    config = parse_yaml_invalid(file_path)
                    if isinstance(config, dict):
                        if config.get("type") == type:
                            schema = from_root(f"/{type}s/schema.json")
                            config["source_path"] = file_path
                            object = object_from_json_schema(config, schema, cls)
                            if isinstance(object, InvalidSchemaDict):
                                errors.append(object.reason)
                            else:
                                objects.append(object)
                    else:
                        errors.append(f"Invalid Schema Specification: {config}")

    return objects, errors
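# A minimal usage sketch (not from the source); the directory, type string,
# and target class are hypothetical. Files named swagger.* are skipped, and
# each YAML whose "type" matches is validated against /{type}s/schema.json.
# operators, errors = parse_schemas(from_root("/examples/operators/"), "operator", Operator)
# for error in errors:
#     logger.error(error)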
def build(self):
    attributes = self.dict.attributes()
    id = attributes.get("id")
    mc = attributes.get("metastore_clients")
    sc = attributes.get("storage_clients")
    cc = attributes.get("scheduler_clients")
    ec = attributes.get("execution_clients")
    source_path = attributes.get("source")
    cl = attributes.get("clients")

    clients: List[Client] = []
    invalid: List[InvalidClient] = []

    if id:
        # TODO: Add nested object and union support to typistry
        # TODO: Clean this up
        if isinstance(cl, dict):
            for client_name, configuration in cl.items():
                configuration = configuration.get("configuration")
                if isinstance(configuration, dict):
                    configuration = safe_interpolate_environment(configuration)
                    client_class = self.supported_client(client_name)
                    if client_class is not None:
                        tdict = TypedDict(configuration, client_name)
                        valid: Union[ValidDict, InvalidObject] = validate_dict(tdict, from_root(self.client_path()))._inner_value
                        if isinstance(valid, ValidDict):
                            valid.typed_dict.type = valid.type() + "_client"
                            value: Union[Client, InvalidObject] = build_object(valid, to_class=client_class)._inner_value
                            if isinstance(value, InvalidObject):
                                invalid.append(InvalidClient(f"Invalid Client: {value.message}, {value.reference}"))
                            else:
                                clients.append(value)
                        else:
                            invalid.append(InvalidClient(f"{valid.message}"))
                    else:
                        invalid.append(InvalidClient(f"Client not supported: {client_name}"))
                else:
                    invalid.append(InvalidClient(f"Bad Configuration. Must be dict: {configuration}"))
        else:
            invalid.append(InvalidClient("Bad client configuration"))

        metastore_clients: List[Union[MetastoreClient, InvalidClient]] = MetastoreEngine().get_clients(mc, clients, self.client_module())
        execution_clients: List[Union[ExecutionClient, InvalidClient]] = ExecutionEngine().get_clients(ec, clients, self.client_module())
        scheduler_clients: List[Union[SchedulerClient, InvalidClient]] = SchedulerEngine().get_clients(cc, clients, self.client_module())
        storage_clients: List[Union[StorageClient, InvalidClient]] = StorageEngine().get_clients(sc, clients, self.client_module())

        return Config(id, clients, invalid, metastore_clients, execution_clients,
                      storage_clients, scheduler_clients, source_path)
    else:
        return InvalidObject("Id not provided for config object", attributes)
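# A minimal sketch (not from the source) of the config shape build() reads,
# written as YAML in comments; the client name and configuration keys are
# hypothetical. Each entry under "clients" must carry a "configuration" dict,
# which is environment-interpolated and validated before the client is built.
# id: config_1
# metastore_clients: [client_1]
# execution_clients: []
# storage_clients: []
# scheduler_clients: []
# clients:
#   client_1:
#     configuration:
#       some_key: some_value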
def test_snappy_parquet_schema_support(self):
    logger.set_level("info")
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/sample.snappy.parquet')) as f:
        schema = from_file(f)
        assert(isinstance(schema, ParquetSchema))