Beispiel #1
0
    def parse_table(
            self,
            glue_response: dict,
            path: Path,
            database_name: Optional[str] = None) -> Union[Table, InvalidTable]:
        """Build a ``Table`` from a single table entry of a glue response.

        Reads the ``StorageDescriptor``, ``Parameters``, ``Name``,
        ``CreateTime`` and ``CreatedBy`` keys of ``glue_response``.

        Args:
            glue_response: Dict describing one table.
            path: Path handed to the ``Schema`` that is built for the table.
            database_name: Optional database name forwarded to ``Table``.

        Returns:
            A ``Table`` when the storage descriptor, its columns and the
            table name are all present and every column parses; otherwise
            an ``InvalidTable`` carrying the reason.
        """
        storage = glue_response.get("StorageDescriptor")
        crawler_name = glue_response.get("Parameters",
                                         {}).get("UPDATED_BY_CRAWLER")

        if not storage:
            return InvalidTable("StorageDescriptor not found in glue response")

        column_dicts = storage.get("Columns")
        if not column_dicts:
            return InvalidTable(
                f"Columns not found in glue response for {glue_response.get('Name', '')}"
            )

        parsed = [self.schema_from_dict(col) for col in column_dicts]
        good, bad = sequence(parsed, SchemaElement, InvalidSchemaElement)

        # A single unparsable column invalidates the whole table.
        if len(bad) > 0:
            invalid_messages = ", ".join([b.reason for b in bad])
            return InvalidTable(
                f"Invalid Schema, invalid elements: {invalid_messages}")
        if len(good) == 0:
            return InvalidTable("Invalid Schema, no valid elements.")

        table_name = glue_response.get("Name")
        if not table_name:
            return InvalidTable("No table Name found in glue response")

        created_at = glue_response.get("CreateTime")
        # Tables updated by a crawler are attributed to that crawler rather
        # than to the "CreatedBy" field.
        created_by = ("crawler:" + crawler_name
                      if crawler_name else glue_response.get("CreatedBy"))

        return Table(table_name,
                     Schema(good, "glue", path),
                     created_at,
                     created_by,
                     database_name=database_name)
Beispiel #2
0
    def get_database(
        self,
        database_name: str,
        response: Optional[Response] = None
    ) -> Tuple[Result[Database, InvalidDatabase], Response]:
        """Fetch the tables of ``database_name`` and wrap them in a ``Database``.

        Args:
            database_name: Name passed to the client's ``get_tables`` call.
            response: Optional ``Response`` to accumulate into; a new one is
                created when omitted.

        Returns:
            A ``(result, response)`` pair. ``result`` is ``Success(Database)``
            when at least one table parses, otherwise ``Failure`` with an
            ``InvalidDatabase`` explaining why. Unparsable tables are
            reported as a warning on the response.
        """
        resp = response or Response()

        try:
            result = self.client().get_tables(DatabaseName=database_name)
        except ClientError as err:
            # The error payload is parsed the same way as a normal result.
            result = err.response

        error, status, message = self.parse_response(result)
        resp.add_response(result)

        if error == "EntityNotFoundException":
            resp.set_status(404)
            return Failure(
                InvalidDatabase(f"Database {database_name} not found")), resp

        if not (200 <= status < 300):
            resp.set_status(status)
            return Failure(
                InvalidDatabase(
                    f"Invalid response from glue: {message}.  Status: {status}"
                )), resp

        raw_tables = result.get("TableList")
        if not raw_tables:
            return Failure(
                InvalidDatabase("TableList not found in glue response")), resp

        parsed = [
            self.parse_table(entry, Path(database_name, "glue"),
                             database_name) for entry in raw_tables
        ]
        valid, invalid = sequence(parsed, Table, InvalidTable)

        if len(invalid) > 0:
            invalid_messages = ", ".join([bad.reason for bad in invalid])
            resp.add_warning(
                f"Invalid Tables in glue response: {invalid_messages}")

        if len(valid) == 0:
            return Failure(InvalidDatabase("No valid tables")), resp

        return Success(Database(database_name, TableList(valid))), resp
Beispiel #3
0
 def parse_table_list_data(
     self,
     glue_response: dict,
     path: Path,
     database_name: Optional[str] = None
 ) -> Tuple[List[Table], List[InvalidTable]]:
     """Parse every entry under ``TableList`` of a glue response.

     Args:
         glue_response: Dict expected to hold a list under ``'TableList'``.
         path: Path forwarded to ``parse_table`` for each entry.
         database_name: Optional database name forwarded to ``parse_table``.

     Returns:
         ``(valid, invalid)`` lists. When ``'TableList'`` is not a list the
         valid list is empty and the invalid list holds a single
         ``InvalidTable`` describing the bad response.
     """
     raw_tables = glue_response.get('TableList')

     if not isinstance(raw_tables, List):
         no_tables: List[Table] = []
         bad_response: List[InvalidTable] = [
             InvalidTable(
                 "Bad TableList response from glue.  Expected list[dict]")
         ]
         return no_tables, bad_response

     results: List[Union[Table, InvalidTable]] = [
         self.parse_table(entry, path, database_name)
         for entry in raw_tables
     ]
     valid, invalid = sequence(results, Table, InvalidTable)
     return valid, invalid
Beispiel #4
0
    def validate(
            self, env: MasonEnvironment,
            all_parameters: WorkflowParameters) -> Union[ValidDag, InvalidDag]:
        """Validate every step, then the DAG they form.

        Args:
            env: Environment handed to each step's ``validate``.
            all_parameters: Workflow parameters handed to each step; its
                ``strict_mode`` flag decides whether invalid steps are fatal.

        Returns:
            ``InvalidDag`` when strict mode is on and any step is invalid;
            otherwise whatever ``validate_dag`` returns for the valid steps,
            invalid steps and root steps (valid steps with no dependencies).
        """
        steps = self.steps
        step_ids = [step.id for step in steps]

        per_step = [
            step.validate(env, all_parameters, step_ids) for step in steps
        ]
        validated: List[Union[ValidDagStep, InvalidDagStep]] = flatten(per_step)
        valid_steps, invalid_steps = sequence(validated, ValidDagStep,
                                              InvalidDagStep)

        roots: List[ValidDagStep] = list(
            filter(lambda step: len(step.dependencies) == 0, valid_steps))

        if all_parameters.strict_mode and len(invalid_steps) > 0:
            return InvalidDag(
                "Invalid DAG, contains invalid steps.  Specify strict:false to allow more permissive validation. ",
                valid_steps, invalid_steps)

        return self.validate_dag(valid_steps, invalid_steps, roots)
Beispiel #5
0
        def parse_response(
                result: dict,
                response: Response) -> Result[TableList, InvalidTables]:
            """Turn a listing result into a ``TableList`` or an ``InvalidTables``.

            Reads ``"Contents"`` and ``"CommonPrefixes"`` from ``result``.
            Each content entry with a ``"Key"`` is resolved via
            ``self.get_table``; ``database_name`` comes from the enclosing
            scope.

            Returns:
                ``Success(TableList)`` when at least one table is valid,
                otherwise a ``Failure`` wrapping ``InvalidTables``.
            """
            contents: Optional[List[dict]] = result.get("Contents")
            prefixes: Optional[List[dict]] = result.get("CommonPrefixes")

            if not contents:
                if prefixes:
                    # Only prefixes came back: expose them on the response
                    # so the caller can see which deeper keys exist.
                    for prefix in prefixes:
                        response.add_data(prefix)
                    return Failure(
                        InvalidTables(
                            [],
                            f"No valid tables at {database_name}.  Try appending '/' or specify deeper key."
                        ))
                return Failure(InvalidTables([], "No Data returned from AWS"))

            fetched: List[Union[Table, InvalidTables]] = []
            for entry in contents:
                key: Optional[str] = entry.get("Key")
                if not key:
                    continue
                # ``response`` is rebound locally to what get_table returns.
                table, response = self.get_table(database_name.split("/")[0],
                                                 key,
                                                 response=response)
                fetched.append(table)

            valid, invalid = sequence(fetched, Table, InvalidTables)
            if len(valid) > 0:
                return Success(TableList(valid))

            # Flatten the per-key failures into one list.
            invalid_tables: List[InvalidTable] = [
                bad for group in invalid for bad in group.invalid_tables
            ]
            return Failure(
                InvalidTables(invalid_tables,
                              f"No valid tables at {database_name}"))
Beispiel #6
0
    def infer_table(
        self,
        path: str,
        name: Optional[str],
        options: Optional[dict] = None,
        resp: Optional[Response] = None
    ) -> Tuple[Union[Table, InvalidTables], Response]:
        """Infer a table from the schemas of the files found at ``path``.

        Args:
            path: Location to list keys under; normalised through
                ``self.get_path(path).full_path()`` before use.
            name: Optional table name, resolved through ``self.get_name``.
            options: Optional dict. ``"sample_size"`` limits how many keys
                are read; the whole dict is also passed to
                ``schemas.from_file``.
            resp: Optional ``Response`` to accumulate into; a new one is
                created when omitted.

        Returns:
            ``(table_or_invalid, response)``. The first element is a
            ``Table`` when the schemas reconcile, otherwise an
            ``InvalidTables``. When no keys are found the response status
            is set to 404.
        """
        opt = options or {}
        logger.info(f"Fetching keys at {path}")
        response: Response = resp or Response()

        path = self.get_path(path).full_path()
        keys, response = self.list_keys(path, response)

        logger.debug(f"{len(keys)} keys at {path}")

        final: Union[Table, InvalidTables]

        sample_size = opt.get("sample_size")
        if sample_size:
            import random
            default_sample_size = 3
            ss: Optional[int]
            try:
                ss = int(sample_size)
            except (TypeError, ValueError, OverflowError):
                # int() raises ValueError for a non-numeric string and
                # TypeError for unsupported types; both mean "unusable".
                ss = None

            # A non-positive size would make random.sample raise (negative)
            # or silently drop every key (zero), so treat it as invalid too.
            if ss is None or ss <= 0:
                logger.warning(f"Invalid sample size (int): {sample_size}")
                ss = default_sample_size

            logger.warning(
                f"Sampling keys to determine schema. Sample size: {ss}.")
            if ss < len(keys):
                keys = random.sample(keys, ss)

        if len(keys) > 0:
            try:
                valid, invalid_schemas = sequence(
                    [
                        schemas.from_file(self.client().open(key.full_path()),
                                          opt) for key in keys
                    ], Schema, InvalidSchema)
                # Empty schemas carry no columns to reconcile.
                non_empty = [
                    v for v in valid if not isinstance(v, EmptySchema)
                ]
                validated, paths = CheckSchemas.find_conflicts(non_empty)
                table = CheckSchemas.get_table(self.get_name(name, path),
                                               validated, paths)
                invalid_tables = [
                    InvalidTable("Invalid Schema", invalid_schema=i)
                    for i in invalid_schemas
                ]
                if isinstance(table, Table):
                    final = table
                else:
                    invalid_tables.append(table)
                    final = InvalidTables(invalid_tables)
            except (ClientError, PermissionError) as e:
                final = InvalidTables(
                    [InvalidTable(f"Not able to infer table: {message(e)}")])
        else:
            response.set_status(404)
            final = InvalidTables([TableNotFound(f"No keys at {path}")])

        return final, response