def test_parquet_schema_equality(self):
    columns1 = [
        ParquetElement("test_name", "test_type", "converted_type", "repetition_type"),
        ParquetElement("test_name_2", "test_type_2", "converted_type_2", "repetition_type_2")
    ]
    columns2 = [
        ParquetElement("test_name", "test_type", "converted_type", "repetition_type"),
        ParquetElement("test_name_3", "test_type_3", "converted_type_3", "repetition_type_2")
    ]

    schema1 = ParquetSchema(columns1, Path(""))
    schema2 = ParquetSchema(columns1, Path(""))
    schema3 = ParquetSchema(columns2, Path(""))

    # Schemas built from the same columns compare equal; differing columns do not
    assert schema1 == schema2
    assert schema1 != schema3
    assert schema2 != schema3

    # Hashing is consistent with equality, so the set collapses the duplicates
    s = set([schema1, schema2, schema3])
    assert len(s) == 2

    # Conflicting schemas report the columns that do not overlap
    schema = find_conflicts([schema1, schema3])[0]
    assert isinstance(schema, SchemaConflict)
    assert schema.to_dict()['SchemaConflicts']['NonOverlappingColumns'] == [
        {'name': 'test_name_2', 'type': 'test_type_2'},
        {'name': 'test_name_3', 'type': 'test_type_3'}
    ]
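# The set assertion above only holds because ParquetSchema defines __hash__
# consistently with __eq__. A minimal sketch of that pattern, assuming equality
# is derived from the column definitions (ColumnBasedSchema is a hypothetical
# illustration, not the repo's actual implementation):
class ColumnBasedSchema:
    # Equality and hashing both derive from the columns, so equal schemas
    # collapse to a single entry in a set.
    def __init__(self, columns):
        self.columns = tuple(columns)

    def __eq__(self, other):
        return isinstance(other, ColumnBasedSchema) and self.columns == other.columns

    def __hash__(self):
        return hash(self.columns)

a = ColumnBasedSchema([("test_name", "test_type")])
b = ColumnBasedSchema([("test_name", "test_type")])
assert a == b and len({a, b}) == 1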
def register_schedule(self, database_name: str, path: Path, schedule_name: str,
                      schedule: Optional[Schedule], response: Response):
    create_crawler_response = self.create_glue_crawler(
        database=database_name,
        name=schedule_name,
        role=self.aws_role_arn or "",
        path=path.clean_path_str(),
        schedule=schedule)
    response.add_response(create_crawler_response)
    error, status, message = self.parse_response(create_crawler_response)

    if error == "AlreadyExistsException":
        response.add_warning(
            f"Table crawler {schedule_name} already exists. Skipping creation.")
        response.set_status(201)
    elif error == "CrawlerRunningException":
        response.add_warning(
            f"Table crawler {schedule_name} is already refreshing.")
        response.set_status(202)
    elif 200 <= status < 300:
        response.add_info(f"Created table crawler {schedule_name}.")
        response.set_status(201)
    else:
        response.add_error(message)
        response.set_status(status)

    return response
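# register_schedule relies on parse_response to reduce a boto3-style Glue
# response to an (error, status, message) triple. A minimal sketch of that
# mapping, assuming the standard boto3 Error/ResponseMetadata layout
# (parse_glue_response below is a hypothetical stand-in, not the repo's
# actual parse_response):
from typing import Tuple

def parse_glue_response(result: dict) -> Tuple[str, int, str]:
    # Pull the error code, HTTP status, and message out of a boto3-style dict.
    error = result.get("Error", {}).get("Code", "")
    status = result.get("ResponseMetadata", {}).get("HTTPStatusCode", -1)
    message = result.get("Error", {}).get("Message", "")
    return error, status, message

# An AlreadyExistsException from the crawler-creation call would reduce like this:
already_exists = {
    "Error": {"Code": "AlreadyExistsException", "Message": "Crawler already exists"},
    "ResponseMetadata": {"HTTPStatusCode": 400},
}
assert parse_glue_response(already_exists) == ("AlreadyExistsException", 400, "Crawler already exists")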
def list_tables(
    self, database_name: str, response: Response
) -> Tuple[Result[TableList, InvalidTables], Response]:
    try:
        result = self.client().get_tables(DatabaseName=database_name)
    except ClientError as e:
        result = e.response

    response.add_response(result)
    error, status, message = self.parse_response(result)

    if error == "EntityNotFoundException":
        final = Failure(
            InvalidTables([], f"Database {database_name} not found"))
        response.set_status(404)
        return final, response
    elif 200 <= status < 300:
        valid: List[Table]
        valid, invalid = self.parse_table_list_data(
            result, Path(database_name, "glue"), database_name)
        if len(valid) > 0:
            response.set_status(status)
            return Success(TableList(valid)), response
        else:
            return Failure(InvalidTables([], "No valid tables found")), response
    else:
        response.set_status(status)
        return Failure(InvalidTables([], message)), response
def get_database(
    self, database_name: str, response: Optional[Response] = None
) -> Tuple[Result[Database, InvalidDatabase], Response]:
    resp = response or Response()
    try:
        result = self.client().get_tables(DatabaseName=database_name)
    except ClientError as e:
        result = e.response

    error, status, message = self.parse_response(result)
    resp.add_response(result)

    if error == "EntityNotFoundException":
        resp.set_status(404)
        return Failure(
            InvalidDatabase(f"Database {database_name} not found")), resp
    elif 200 <= status < 300:
        table_list = result.get("TableList")
        if table_list:
            valid, invalid = sequence(
                list(
                    map(
                        lambda x: self.parse_table(
                            x, Path(database_name, "glue"), database_name),
                        table_list)),
                Table, InvalidTable)
            if len(invalid) > 0:
                invalid_messages = ", ".join(
                    list(map(lambda i: i.reason, invalid)))
                resp.add_warning(
                    f"Invalid Tables in glue response: {invalid_messages}")
            if len(valid) == 0:
                return Failure(InvalidDatabase("No valid tables")), resp
            else:
                return Success(Database(database_name, TableList(valid))), resp
        else:
            return Failure(
                InvalidDatabase("TableList not found in glue response")), resp
    else:
        resp.set_status(status)
        return Failure(
            InvalidDatabase(
                f"Invalid response from glue: {message}. Status: {status}")), resp
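# get_database leans on sequence to split the parsed entries into valid Table
# and InvalidTable results. A minimal sketch of that partitioning step,
# assuming it simply filters by type (partition_by_type is a hypothetical
# stand-in, not the repo's actual sequence helper):
from typing import List, Tuple, Type, TypeVar

A = TypeVar("A")
B = TypeVar("B")

def partition_by_type(items: list, valid_type: Type[A], invalid_type: Type[B]) -> Tuple[List[A], List[B]]:
    # Split a mixed parse result into the items that parsed and the ones that did not.
    valid = [i for i in items if isinstance(i, valid_type)]
    invalid = [i for i in items if isinstance(i, invalid_type)]
    return valid, invalid

valid, invalid = partition_by_type(["t1", 404, "t2"], str, int)
assert valid == ["t1", "t2"] and invalid == [404]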
def from_file(path: Path):
    # TODO: This code does not scale for large single json file (not jsonl)
    try:
        with fsspec.open(path.full_path(), "r") as f:
            try:
                # Try the whole file as a single JSON document first
                data = [json.load(f)]
                json_type = "json"
            except JSONDecodeError as e:
                # Fall back to JSONL: rewind and sample the first 10 lines
                f.seek(0)
                jsonl_preview = list(islice(f, 10))
                data = [json.loads(jline) for jline in jsonl_preview]
                json_type = "jsonl"

            builder = SchemaBuilder()
            for d in data:
                builder.add_object(d)
            schema = builder.to_schema()

            return JsonSchema(schema, path, json_type)
    except Exception as e:
        return InvalidSchema(f"File not found {path.full_path()}")
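# The JSON/JSONL fallback above can be shown in isolation: parse the whole
# stream as one JSON document, and on a JSONDecodeError rewind and treat each
# line as its own record. A minimal sketch, assuming only the standard library
# and genson; the in-memory io.StringIO source and infer_json_schema name are
# illustrative, not part of from_file:
import io
import json
from itertools import islice
from json.decoder import JSONDecodeError

from genson import SchemaBuilder

def infer_json_schema(f) -> dict:
    # Whole-document JSON first, then fall back to newline-delimited JSON.
    try:
        data = [json.load(f)]
    except JSONDecodeError:
        f.seek(0)
        data = [json.loads(line) for line in islice(f, 10)]  # sample the first 10 lines
    builder = SchemaBuilder()
    for record in data:
        builder.add_object(record)
    return builder.to_schema()

# Two newline-delimited records are picked up via the JSONL fallback:
jsonl = io.StringIO('{"a": 1}\n{"a": 2, "b": "x"}\n')
print(sorted(infer_json_schema(jsonl)["properties"]))  # ['a', 'b']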
def get_table(
    self, database_name: str, table_name: str,
    resp: Optional[Response] = None
) -> Tuple[Union[Table, InvalidTables], Response]:
    try:
        result = self.client().get_table(DatabaseName=database_name,
                                         Name=table_name)
    except ClientError as e:
        result = e.response

    response: Response = resp or Response()
    response.add_response(result)
    error, status, message = self.parse_response(result)

    table = self.parse_table(result.get("Table", {}),
                             Path(database_name + ":" + table_name, "glue"),
                             database_name=database_name)

    final: Union[Table, InvalidTables]
    if error == "EntityNotFoundException":
        final = InvalidTables([
            TableNotFound(
                f"Database {database_name} or table {table_name} not found")
        ])
    elif 200 <= status < 300:
        if isinstance(table, Table):
            final = table
        else:
            final = InvalidTables([table])
    else:
        final = InvalidTables([InvalidTable(f"Invalid Table: {message}")])

    response.set_status(status)
    return final, response
def test_file_dne(self):
    schema = json_from_file(Path('test/sample_data/dne.json'))
    assert isinstance(schema, InvalidSchema)
    message = 'File not found test/sample_data/dne.json'
    assert schema.reason[0:97] == message
def path(self, path: str) -> Path:
    # Strip the s3:// scheme before constructing the storage Path
    return Path(path.replace("s3://", ""), "s3")
def get_path(self, path: str) -> Path:
    return Path(path, "s3")
def test_kubernetes_operator(self):
    config = SparkConfig({
        'script_type': 'scala-test',
        'spark_version': 'test.spark.version',
        'main_class': 'test.main.Class',
        'docker_image': 'docker/test-docker-image',
        'application_file': 'test/jar/file/location/assembly.jar',
        'driver_cores': 10,
        'driver_memory_mbs': 1024,
        'executors': 3,
        'executor_memory_mb': 1024,
        'executor_cores': 20
    })

    job = MergeJob(Path("test-input"), Path("test-output"), "parquet")
    job.set_id("mason-spark-test_job")
    merged = merge_config(config, job)
    dumped = hdump(merged)

    expects = """
apiVersion: sparkoperator.k8s.io/v1beta2
kind: SparkApplication
metadata:
  name: mason-spark-test_job
  namespace: default
spec:
  arguments:
  - --input_path
  - test-input
  - --output_path
  - test-output
  - --input_format
  - parquet
  - --job
  - merge
  driver:
    coreLimit: 1200m
    cores: 10
    labels:
      version: test.spark.version
    memory: 1024m
    serviceAccount: spark
    volumeMounts:
    - mountPath: /tmp
      name: test-volume
  executor:
    cores: 20
    instances: 3
    labels:
      version: test.spark.version
    memory: 1024m
    volumeMounts:
    - mountPath: /tmp
      name: test-volume
  image: docker/test-docker-image
  imagePullPolicy: Always
  mainApplicationFile: local://test/jar/file/location/assembly.jar
  mainClass: test.main.Class
  mode: cluster
  restartPolicy:
    type: Never
  sparkVersion: test.spark.version
  type: Scala
  volumes:
  - hostPath:
      path: /tmp
      type: Directory
    name: test-volume
"""

    assert clean_string(dumped) == clean_string(expects)
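# The comparison above only works because clean_string collapses whitespace
# before comparing the rendered manifest to the expected YAML. A minimal
# sketch of such a helper, assuming whitespace-insensitive comparison is all
# the test needs (clean_string_sketch is an illustrative stand-in, not the
# repo's actual clean_string):
import re

def clean_string_sketch(s: str) -> str:
    # Remove all whitespace so indentation and line breaks do not affect equality.
    return re.sub(r"\s+", "", s)

assert clean_string_sketch("driver:\n  cores: 10\n") == clean_string_sketch("driver:\n    cores: 10")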
def save_to(self, inpath: str, outpath: str, response: Response) -> Response:
    inp: Path = Path(inpath)  # TODO: allow saving between paths of different storage clients
    outp: Path = self.path(outpath)  # TODO:
    return self.client.save_to(inp, outp, response)
def infer_table(self, path: str, name: Optional[str] = None,
                options: Optional[dict] = None,
                response: Optional[Response] = None
                ) -> Tuple[Union[Table, InvalidTables], Response]:
    return self.client.infer_table(
        (self.path(path) or Path("", "s3")).path_str, name, options, response)
def get_path(file: Union[S3File, AbstractBufferedFile]) -> Path:
    if isinstance(file, S3File):
        path = Path(file.path, "s3")
    else:
        path = Path(file.name)
    return path