Ejemplo n.º 1
0
def create_table_design_for_transformation(conn: Connection,
                                           kind: str,
                                           relation: RelationDescription,
                                           update=False) -> dict:
    """
    Build the table design for a transformation, which must be a CTAS or a VIEW.

    The relation's query is first wrapped in a temporary late-binding view so that
    dependency hints can be fetched for it. If any hint is truthy, the design is
    derived from that late-binding view (CTAS only). Otherwise a second, regular
    temporary view is created and the design is derived from that one.

    Args:
        conn: open database connection, passed through to all helpers.
        kind: either "CTAS" or "VIEW".
        relation: the transformation whose design should be bootstrapped.
        update: passed through unchanged to the design builders.

    Returns:
        Whatever create_table_design_for_ctas or create_table_design_for_view returns.

    Raises:
        ETLSystemError: if kind is neither "CTAS" nor "VIEW".
        RuntimeError: if the dependency hints could not be fetched, or if a VIEW
            turns out to have external dependencies.
    """
    if kind != "CTAS" and kind != "VIEW":
        raise ETLSystemError(f"unexpected source name: {kind}")
    wants_view = kind == "VIEW"

    # First pass: a late-binding view is used while checking for dependency hints.
    # NOTE(review): presumably the hints come from the query plan and flag external
    # schemas -- confirm against fetch_dependency_hints.
    with relation.matching_temporary_view(conn, as_late_binding_view=True) as late_binding_name:
        hints = fetch_dependency_hints(conn, relation.query_stmt)
        if hints is None:
            raise RuntimeError("failed to query for dependencies")
        if any(hints):
            logger.info("Looks like %s has external dependencies, proceeding with caution", relation.identifier)
            if wants_view:
                # NOTE(review): "extrenal" is a typo in the message; left untouched here.
                raise RuntimeError("VIEW not supported for transformations that depend on extrenal tables")
            # The design must be built while the late-binding view still exists.
            return create_table_design_for_ctas(conn, late_binding_name, relation, update)

    # Second pass: no hints were found, so a regular (not late-binding) view is used.
    with relation.matching_temporary_view(conn, as_late_binding_view=False) as regular_name:
        build_design = create_table_design_for_view if wants_view else create_table_design_for_ctas
        return build_design(conn, regular_name, relation, update)
Ejemplo n.º 2
0
 def stat(self, filename):
     """
     Return file size (in bytes) and timestamp of last modification for this file.

     For the "file" scheme the result is a tuple of the size reported by the local
     filesystem and the modification time rendered in UTC as an ISO 8601 string with
     a space separator (e.g. "2017-07-14 02:40:00"). For the "s3" scheme the lookup
     is delegated to etl.s3.object_stat using this file set's netloc.

     Raises:
         ETLSystemError: if the scheme is neither "s3" nor "file".
     """
     if self.scheme == "s3":
         return etl.s3.object_stat(self.netloc, filename)
     if self.scheme == "file":
         # datetime.utcfromtimestamp() is deprecated since Python 3.12. Build an aware
         # UTC datetime instead and drop the tzinfo so the rendered string is unchanged
         # (no "+00:00" suffix).
         from datetime import timezone

         size_in_bytes = os.path.getsize(filename)
         modified = datetime.fromtimestamp(os.path.getmtime(filename), timezone.utc)
         return size_in_bytes, modified.replace(tzinfo=None).isoformat(" ")
     raise ETLSystemError("illegal scheme in file set")
Ejemplo n.º 3
0
 def uri(self, filename):
     """
     Build the full URI for the given filename (either in S3 or local).

     Local files ("file" scheme) are returned unchanged. For the "s3" scheme the
     result has the form "<scheme>://<netloc>/<filename>".

     Raises:
         ETLSystemError: if the scheme is neither "s3" nor "file".
     """
     scheme = self.scheme
     if scheme == "file":
         return filename
     if scheme != "s3":
         raise ETLSystemError("illegal scheme in file set")
     return "{0.scheme}://{0.netloc}/{1}".format(self, filename)
Ejemplo n.º 4
0
 def external_schemas(self) -> frozenset:
     """
     List external schemas that are not managed by us and may not exist during validation.

     The value is read from the data warehouse configuration on first use and then
     kept on the instance, so later calls return the same frozenset.

     Raises:
         ETLSystemError: if looking up the configuration fails with an AttributeError.
     """
     already_loaded = self._external_schemas
     if already_loaded is not None:
         return already_loaded
     try:
         self._external_schemas = frozenset(etl.config.get_dw_config().external_schema_names)
     except AttributeError:
         raise ETLSystemError("dw_config has not been set!")
     return self._external_schemas
Ejemplo n.º 5
0
 def managed_schemas(self) -> frozenset:
     """
     Return the names of all schemas listed in the data warehouse configuration.

     The set is computed on first use and then kept on the instance.

     Raises:
         ETLSystemError: if looking up the configured schemas fails with an AttributeError.
     """
     if self._managed_schemas is not None:
         return self._managed_schemas
     try:
         configured = etl.config.get_dw_config().schemas
     except AttributeError:
         raise ETLSystemError("dw_config has not been set!")
     # Collecting the names happens outside the try so that only a missing
     # configuration is reported as ETLSystemError.
     self._managed_schemas = frozenset(entry.name for entry in configured)
     return self._managed_schemas
Ejemplo n.º 6
0
 def uri(self, filename):
     """
     Return the full URI for the filename, which probably should be one of the files from this set.

     With the "file" scheme the filename is handed back as is. With the "s3" scheme
     the scheme and netloc of this set are prepended: "<scheme>://<netloc>/<filename>".

     Raises:
         ETLSystemError: if the scheme is neither "s3" nor "file".
     """
     if self.scheme not in ("s3", "file"):
         raise ETLSystemError("illegal scheme in file set")
     if self.scheme == "file":
         return filename
     return "{0.scheme}://{0.netloc}/{1}".format(self, filename)
Ejemplo n.º 7
0
 def stat(self, filename):
     """
     Return file size (in bytes) and timestamp of last modification for the file.

     The file should be one from this set. Local files are looked up with
     local_file_stat; for the "s3" scheme the lookup is delegated to
     etl.s3.object_stat together with this set's netloc.

     Raises:
         ETLSystemError: if the scheme is neither "s3" nor "file".
     """
     scheme = self.scheme
     if scheme == "file":
         return local_file_stat(filename)
     if scheme == "s3":
         return etl.s3.object_stat(self.netloc, filename)
     raise ETLSystemError("illegal scheme in file set")
Ejemplo n.º 8
0
    def managed_schemas(self) -> frozenset:
        """
        Return the names of the schemas that are managed by Arthur.

        This covers every schema in the data warehouse configuration, not just the
        schema of this relation. The set is built on first use and kept on the
        instance for subsequent calls.

        Raises:
            ETLSystemError: if looking up the configured schemas fails with an AttributeError.
        """
        if self._managed_schemas is not None:
            return self._managed_schemas
        try:
            configured = etl.config.get_dw_config().schemas
        except AttributeError:
            raise ETLSystemError("dw_config has not been set!")
        self._managed_schemas = frozenset({entry.name for entry in configured})
        return self._managed_schemas
Ejemplo n.º 9
0
def determine_data_format_parameters(data_format, format_option,
                                     file_compression):
    """
    Assemble the data format parameter string for the given format settings.

    Args:
        data_format: None for the original delimited format, or one of "CSV",
            "AVRO", "JSON".
        format_option: for "CSV" the quote character (optional); for "AVRO" and
            "JSON" the value placed after AS, defaulting to "auto".
        file_compression: optional keyword appended after the format parameters.
            It is ignored when data_format is None since that format always
            uses GZIP.

    Returns:
        The parameter string, e.g. "CSV QUOTE AS '%' GZIP" or "JSON AS 'auto'".

    Raises:
        ETLSystemError: if data_format is not one of the supported values.
    """
    if data_format is None:
        # This is our original data format (which mirrors settings in unload).
        return "DELIMITER ',' ESCAPE REMOVEQUOTES GZIP"

    if data_format == "CSV":
        parameters = "CSV" if format_option is None else "CSV QUOTE AS '{}'".format(format_option)
    elif data_format in ["AVRO", "JSON"]:
        as_value = "auto" if format_option is None else format_option
        parameters = "{} AS '{}'".format(data_format, as_value)
    else:
        raise ETLSystemError("found unexpected data format: {}".format(data_format))

    if file_compression is None:
        return parameters
    return parameters + " {}".format(file_compression)