Example #1
    def test_parquet_schema_equality(self):
        columns1 = [
            ParquetElement("test_name", "test_type", "converted_type", "repetition_type"),
            ParquetElement("test_name_2", "test_type_2", "converted_type_2", "repetition_type_2")
        ]

        columns2 = [
            ParquetElement("test_name", "test_type", "converted_type", "repetition_type"),
            ParquetElement("test_name_3", "test_type_3", "converted_type_3", "repetition_type_2")
        ]

        schema1 = ParquetSchema(columns1, Path(""))
        schema2 = ParquetSchema(columns1, Path(""))
        schema3 = ParquetSchema(columns2, Path(""))

        # Equality is value-based: schemas built from the same columns compare equal
        assert schema1 == schema2
        assert schema1 != schema3
        assert schema2 != schema3

        # __hash__ must agree with __eq__ for set deduplication to work
        s = {schema1, schema2, schema3}
        assert len(s) == 2

        schema = find_conflicts([schema1, schema3])[0]
        assert isinstance(schema, SchemaConflict)

        assert schema.to_dict()['SchemaConflicts']['NonOverlappingColumns'] == [
            {'name': 'test_name_2', 'type': 'test_type_2'},
            {'name': 'test_name_3', 'type': 'test_type_3'}
        ]
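Note: the set-deduplication assertion above only holds if ParquetSchema defines __hash__ consistently with __eq__ (objects that compare equal must hash equal, or sets and dicts misbehave). A minimal sketch of such a pair, assuming value-based comparison over the column definitions; the attribute names here are illustrative, not necessarily the project's actual ones:

    class ParquetSchema:
        def __init__(self, columns, path):
            self.columns = columns
            self.path = path

        def __eq__(self, other):
            if not isinstance(other, ParquetSchema):
                return NotImplemented
            return [(c.name, c.type) for c in self.columns] == \
                   [(c.name, c.type) for c in other.columns]

        def __hash__(self):
            # Hash exactly the fields __eq__ compares
            return hash(tuple((c.name, c.type) for c in self.columns))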
Example #2
    def register_schedule(self, database_name: str, path: Path,
                          schedule_name: str, schedule: Optional[Schedule],
                          response: Response):
        create_crawler_response = self.create_glue_crawler(
            database=database_name,
            name=schedule_name,
            role=self.aws_role_arn or "",
            path=path.clean_path_str(),
            schedule=schedule)

        response.add_response(create_crawler_response)
        error, status, message = self.parse_response(create_crawler_response)

        if error == "AlreadyExistsException":
            response.add_warning(
                f"Table crawler {schedule_name} already exists. Skipping creation."
            )
            response.set_status(201)
        elif error == "CrawlerRunningException":
            response.add_warning(
                f"Table crawler {schedule_name} is already refreshing.")
            response.set_status(202)
        elif 200 <= status < 300:
            response.add_info(f"Created table crawler {schedule_name}.")
            response.set_status(201)
        else:
            response.add_error(message)
            response.set_status(status)

        return response
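The Glue helpers in these examples all funnel boto3 results through parse_response to get an (error, status, message) triple. A plausible sketch of that helper, assuming the standard boto3 response layout (successful calls and ClientError.response both carry ResponseMetadata; only errors carry an Error block) -- the project's actual implementation may differ:

    def parse_response(self, response: dict) -> Tuple[str, int, str]:
        # Error code/message are only present on failures; default to ""
        error = response.get("Error", {}).get("Code", "")
        message = response.get("Error", {}).get("Message", "")
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode", -1)
        return error, status, message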
Example #3
    def list_tables(
        self, database_name: str, response: Response
    ) -> Tuple[Result[TableList, InvalidTables], Response]:
        try:
            result = self.client().get_tables(DatabaseName=database_name)
        except ClientError as e:
            result = e.response
        response.add_response(result)
        error, status, message = self.parse_response(result)

        if error == "EntityNotFoundException":
            final = Failure(
                InvalidTables([], f"Database {database_name} not found"))
            response.set_status(404)
            return final, response
        elif 200 <= status < 300:
            valid: List[Table]
            valid, invalid = self.parse_table_list_data(
                result, Path(database_name, "glue"), database_name)
            if len(valid) > 0:
                response.set_status(status)
                return Success(TableList(valid)), response
            else:
                return Failure(InvalidTables(
                    [], "No valid tables found")), response
        else:
            response.set_status(status)
            return Failure(InvalidTables([], message)), response
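Returning a returns-style Result (Success/Failure) paired with the Response lets callers branch on outcome without try/except. A hypothetical consumer, assuming the returns library's Result API (client is a placeholder for whatever object exposes list_tables):

    tables_result, response = client.list_tables("my_database", Response())
    # value_or collapses Success(TableList) to its payload and Failure to the default
    table_list = tables_result.value_or(None)
    if table_list is None:
        print("listing failed")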
Example #4
    def get_database(
        self,
        database_name: str,
        response: Optional[Response] = None
    ) -> Tuple[Result[Database, InvalidDatabase], Response]:
        resp = response or Response()

        try:
            result = self.client().get_tables(DatabaseName=database_name)
        except ClientError as e:
            result = e.response

        error, status, message = self.parse_response(result)
        resp.add_response(result)

        if error == "EntityNotFoundException":
            resp.set_status(404)
            return Failure(
                InvalidDatabase(f"Database {database_name} not found")), resp
        elif 200 <= status < 300:
            table_list = result.get("TableList")
            if table_list:
                parsed = [
                    self.parse_table(t, Path(database_name, "glue"),
                                     database_name)
                    for t in table_list
                ]
                valid, invalid = sequence(parsed, Table, InvalidTable)
                if len(invalid) > 0:
                    invalid_messages = ", ".join(i.reason for i in invalid)
                    resp.add_warning(
                        f"Invalid Tables in glue response: {invalid_messages}")
                if len(valid) == 0:
                    return Failure(InvalidDatabase("No valid tables")), resp
                else:
                    return Success(Database(database_name,
                                            TableList(valid))), resp
            else:
                return Failure(
                    InvalidDatabase(
                        "TableList not found in glue response")), resp
        else:
            resp.set_status(status)
            return Failure(
                InvalidDatabase(
                    f"Invalid response from glue: {message}.  Status: {status}"
                )), resp
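get_database leans on a sequence helper to split the parsed tables into valid and invalid results by type. A minimal sketch of the assumed behavior (the project's real helper may be more general):

    def sequence(items: list, valid_type: type, invalid_type: type):
        # Partition items into (valid, invalid) by runtime type
        valid = [i for i in items if isinstance(i, valid_type)]
        invalid = [i for i in items if isinstance(i, invalid_type)]
        return valid, invalid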
Example #5
# Imports assumed for a runnable snippet; JsonSchema, InvalidSchema
# and Path are project-local classes.
import json
from itertools import islice
from json.decoder import JSONDecodeError

import fsspec
from genson import SchemaBuilder


def from_file(path: Path):
    # TODO: This code does not scale for large single json file (not jsonl)
    try:
        with fsspec.open(path.full_path(), "r") as f:
            try:
                # Try a whole-file parse first: a single JSON document
                data = [json.load(f)]
                json_type = "json"
            except JSONDecodeError:
                # Fall back to newline-delimited JSON, sampling 10 lines
                f.seek(0)
                jsonl_preview = list(islice(f, 10))
                data = [json.loads(jline) for jline in jsonl_preview]
                json_type = "jsonl"
            builder = SchemaBuilder()
            for d in data:
                builder.add_object(d)
            schema = builder.to_schema()
            return JsonSchema(schema, path, json_type)
    except Exception:
        return InvalidSchema(f"File not found {path.full_path()}")
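For reference, the jsonl fallback works because json.load refuses multi-document input: it parses the first object, then raises JSONDecodeError ("Extra data"). A standard-library-only illustration:

    import io
    import json

    buf = io.StringIO('{"a": 1}\n{"a": 2}\n')
    try:
        json.load(buf)  # raises: more than one top-level document
    except json.JSONDecodeError:
        buf.seek(0)
        rows = [json.loads(line) for line in buf]  # parses as jsonl
        assert rows == [{"a": 1}, {"a": 2}]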
Example #6
    def get_table(
        self,
        database_name: str,
        table_name: str,
        resp: Optional[Response] = None
    ) -> Tuple[Union[Table, InvalidTables], Response]:
        try:
            result = self.client().get_table(DatabaseName=database_name,
                                             Name=table_name)
        except ClientError as e:
            result = e.response

        response: Response = resp or Response()
        response.add_response(result)

        error, status, message = self.parse_response(result)
        table = self.parse_table(result.get("Table", {}),
                                 Path(database_name + ":" + table_name,
                                      "glue"),
                                 database_name=database_name)

        final: Union[Table, InvalidTables]
        if error == "EntityNotFoundException":
            final = InvalidTables([
                TableNotFound(
                    f"Database {database_name} or table {table_name} not found"
                )
            ])
        elif 200 <= status < 300:
            if isinstance(table, Table):
                final = table
            else:
                final = InvalidTables([table])
        else:
            final = InvalidTables([InvalidTable(f"Invalid Table: {message}")])
            response.set_status(status)

        return final, response
Example #7
    def test_file_dne(self):
        schema = json_from_file(Path('test/sample_data/dne.json'))
        assert isinstance(schema, InvalidSchema)
        message = 'File not found test/sample_data/dne.json'
        assert schema.reason.startswith(message)
Example #8
    def path(self, path: str) -> Path:
        return Path(path.replace("s3://", ""), "s3")
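Usage note: the replace call only strips the scheme, so bucket and key pass through into the Path (values here are hypothetical):

    client.path("s3://my-bucket/data/file.parquet")
    # -> Path("my-bucket/data/file.parquet", "s3")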
Example #9
    def get_path(self, path: str) -> Path:
        return Path(path, "s3")
Example #10
    def test_kubernetes_operator(self):

        config = SparkConfig({
            'script_type': 'scala-test',
            'spark_version': 'test.spark.version',
            'main_class': 'test.main.Class',
            'docker_image': 'docker/test-docker-image',
            'application_file': 'test/jar/file/location/assembly.jar',
            'driver_cores': 10,
            'driver_memory_mbs': 1024,
            'executors': 3,
            'executor_memory_mb': 1024,
            'executor_cores': 20
        })

        job = MergeJob(Path("test-input"), Path("test-output"), "parquet")
        job.set_id("mason-spark-test_job")

        merged = merge_config(config, job)

        dumped = hdump(merged)
        expects = """
            apiVersion: sparkoperator.k8s.io/v1beta2
            kind: SparkApplication
            metadata:
              name: mason-spark-test_job
              namespace: default
            spec:
              arguments:
              - --input_path
              - test-input
              - --output_path
              - test-output
              - --input_format
              - parquet
              - --job
              - merge
              driver:
                coreLimit: 1200m
                cores: 10
                labels:
                  version: test.spark.version
                memory: 1024m
                serviceAccount: spark
                volumeMounts:
                - mountPath: /tmp
                  name: test-volume
              executor:
                cores: 20
                instances: 3
                labels:
                  version: test.spark.version
                memory: 1024m
                volumeMounts:
                - mountPath: /tmp
                  name: test-volume
              image: docker/test-docker-image
              imagePullPolicy: Always
              mainApplicationFile: local://test/jar/file/location/assembly.jar
              mainClass: test.main.Class
              mode: cluster
              restartPolicy:
                type: Never
              sparkVersion: test.spark.version
              type: Scala
              volumes:
              - hostPath:
                  path: /tmp
                  type: Directory
                name: test-volume
        """
        assert clean_string(dumped) == clean_string(expects)
Example #11
    def save_to(self, inpath: str, outpath: str, response: Response) -> Response:
        # TODO: allow saving between paths of different storage clients
        inp: Path = Path(inpath)
        outp: Path = self.path(outpath)
        return self.client.save_to(inp, outp, response)
Example #12
    def infer_table(
        self,
        path: str,
        name: Optional[str] = None,
        options: Optional[dict] = None,
        response: Optional[Response] = None
    ) -> Tuple[Union[Table, InvalidTables], Response]:
        return self.client.infer_table(
            (self.path(path) or Path("", "s3")).path_str,
            name, options, response)
Example #13
def get_path(file: Union[S3File, AbstractBufferedFile]) -> Path:
    # fsspec's S3File knows its bucket-qualified key via .path;
    # generic buffered files only expose a local .name
    if isinstance(file, S3File):
        path = Path(file.path, "s3")
    else:
        path = Path(file.name)
    return path