Example #1
def sync_with_s3(
    relations: Sequence[RelationDescription],
    config_paths: Iterable[str],
    bucket_name: str,
    prefix: str,
    deploy_config: bool,
    delete_schemas_pattern: Optional[TableSelector],
    delete_data_pattern: Optional[TableSelector],
    dry_run=False,
) -> None:
    """
    Copy (validated) table design and SQL files from local directory to S3 bucket.

    To make sure that we don't upload "bad" files or leave files in S3 in an incomplete state,
    we first load and validate all table design files to be uploaded.
    """
    logger.info("Validating %d table design(s) before upload", len(relations))
    RelationDescription.load_in_parallel(relations)

    # Collect list of (local file, remote file) tuples to send to the uploader.
    files: List[Tuple[str, str]] = []
    for relation in relations:
        if relation.is_transformation:
            if relation.sql_file_name is None:
                raise MissingQueryError("missing matching SQL file for '%s'" %
                                        relation.design_file_name)
            relation_files = [
                relation.design_file_name, relation.sql_file_name
            ]
        else:
            relation_files = [relation.design_file_name]

        for file_name in relation_files:
            filename = relation.norm_path(file_name)
            files.append((filename, filename))

    # OK. Things appear to have worked out so far. Now make actual uploads to S3.
    if deploy_config:
        upload_settings(config_paths, bucket_name, prefix, dry_run=dry_run)
    else:
        logger.info("As you wish: Configuration files are not changed in S3")

    if delete_data_pattern is not None:
        etl.file_sets.delete_data_files_in_s3(bucket_name,
                                              prefix,
                                              delete_data_pattern,
                                              dry_run=dry_run)

    if delete_schemas_pattern is not None:
        etl.file_sets.delete_schemas_files_in_s3(bucket_name,
                                                 prefix,
                                                 delete_schemas_pattern,
                                                 dry_run=dry_run)

    etl.s3.upload_files(files, bucket_name, prefix, dry_run)
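The file-pair collection above only needs a handful of attributes from each relation, so it can be exercised on its own. Below is a minimal, self-contained sketch with a made-up stand-in for RelationDescription (names and paths are invented for illustration):

from typing import List, NamedTuple, Optional, Tuple


class FakeRelation(NamedTuple):
    # Stand-in for RelationDescription: only the attributes the loop touches.
    is_transformation: bool
    design_file_name: str
    sql_file_name: Optional[str]

    def norm_path(self, file_name: str) -> str:
        return file_name.replace("\\", "/")


def collect_files(relations: List[FakeRelation]) -> List[Tuple[str, str]]:
    files: List[Tuple[str, str]] = []
    for relation in relations:
        relation_files = [relation.design_file_name]
        if relation.is_transformation:
            if relation.sql_file_name is None:
                raise ValueError("missing matching SQL file for '%s'" % relation.design_file_name)
            relation_files.append(relation.sql_file_name)
        for file_name in relation_files:
            filename = relation.norm_path(file_name)
            files.append((filename, filename))
    return files


print(collect_files([
    FakeRelation(False, "schemas/www/orders.yaml", None),
    FakeRelation(True, "schemas/analytics/daily_revenue.yaml", "schemas/analytics/daily_revenue.sql"),
]))
# Source relations contribute their design file; transformations contribute the SQL file as well.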
Example #2
def bootstrap_sources(selector,
                      table_design_dir,
                      local_files,
                      update=False,
                      replace=False,
                      dry_run=False):
    """
    Download schemas from database tables and write or update local design files.

    This will create new design files locally if they don't already exist for any relations tied
    to upstream database sources. If one already exists locally, the file is updated, replaced, or
    kept as-is depending on the "update" and "replace" arguments.

    This is a callback of a command.
    """
    dw_config = etl.config.get_dw_config()
    selected_database_schemas = [
        schema for schema in dw_config.schemas
        if schema.is_database_source and selector.match_schema(schema.name)
    ]
    database_schema_names = {
        schema.name
        for schema in selected_database_schemas
    }
    existing_relations = [
        RelationDescription(file_set) for file_set in local_files
        if file_set.source_name in database_schema_names
        and file_set.design_file_name
    ]
    if existing_relations:
        RelationDescription.load_in_parallel(existing_relations)

    total = 0
    for schema in selected_database_schemas:
        relations = [
            relation for relation in existing_relations
            if relation.source_name == schema.name
        ]
        total += create_table_designs_from_source(schema,
                                                  selector,
                                                  table_design_dir,
                                                  relations,
                                                  update,
                                                  replace,
                                                  dry_run=dry_run)
    if not total:
        logger.warning(
            "Found no matching tables in any upstream source for '%s'",
            selector)
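The selector only needs to answer match_schema for this step. As a rough stand-in (not the project's actual selector class), a glob-based matcher illustrates how the schema list gets narrowed:

import fnmatch


class GlobSchemaSelector:
    # Hypothetical stand-in: matches schema names against shell-style glob patterns.
    def __init__(self, patterns):
        self.patterns = patterns

    def match_schema(self, name: str) -> bool:
        return any(fnmatch.fnmatch(name, pattern) for pattern in self.patterns)


selector = GlobSchemaSelector(["www", "erp_*"])
print([name for name in ("www", "erp_orders", "analytics") if selector.match_schema(name)])
# ['www', 'erp_orders']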
Example #3
    def select_statement(self, relation: RelationDescription,
                         add_sampling_on_column: Optional[str]) -> str:
        """
        Create a "SELECT" statement with quoted identifiers and a base WHERE clause.

        Return something like
            "SELECT id, name FROM table WHERE TRUE" or
            "SELECT id, name FROM table WHERE ((id % 10) = 1)"
        where the actual statement uses delimited identifiers.
        Note the existence of the WHERE clause which allows appending more conditions.
        """
        selected_columns = relation.get_columns_with_casts()
        statement = """SELECT {} FROM {}""".format(", ".join(selected_columns),
                                                   relation.source_table_name)

        condition = relation.table_design.get("extract_settings",
                                              {}).get("condition", "TRUE")

        if add_sampling_on_column is None:
            statement += """ WHERE ({})""".format(condition)
        else:
            self.logger.info(
                "Adding sampling on column '%s' while extracting '%s.%s'",
                add_sampling_on_column,
                relation.source_name,
                relation.source_table_name.identifier,
            )
            statement += """ WHERE (({}) AND ("{}" % 10) = 1)""".format(
                condition, add_sampling_on_column)

        return statement
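Stripped of the class context, the statement building boils down to a couple of format calls. This stand-alone snippet (with invented column and table names) reproduces the two shapes described in the docstring:

def build_select(columns, table, condition="TRUE", sampling_column=None):
    statement = "SELECT {} FROM {}".format(", ".join(columns), table)
    if sampling_column is None:
        return statement + " WHERE ({})".format(condition)
    return statement + ' WHERE (({}) AND ("{}" % 10) = 1)'.format(condition, sampling_column)


print(build_select(["id", "name"], "www.orders"))
# SELECT id, name FROM www.orders WHERE (TRUE)
print(build_select(["id", "name"], "www.orders", sampling_column="id"))
# SELECT id, name FROM www.orders WHERE ((TRUE) AND ("id" % 10) = 1)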
Example #4
def bootstrap_transformations(dsn_etl, schemas, local_dir, local_files, as_view,
                              update=False, replace=False, dry_run=False):
    """
    Download design information for transformations by test-running them in the data warehouse.
    """
    transformation_schema = {schema.name for schema in schemas if schema.has_transformations}
    transforms = [file_set for file_set in local_files if file_set.source_name in transformation_schema]
    if not (update or replace):
        transforms = [file_set for file_set in transforms if not file_set.design_file_name]
    if not transforms:
        logger.warning("Found no new queries without matching design files")
        return
    relations = [RelationDescription(file_set) for file_set in transforms]
    if update:
        logger.info("Loading existing table design file(s)")
        # Unfortunately, this adds warnings about any of the upstream sources being unknown.
        relations = etl.relation.order_by_dependencies(relations)

    if as_view:
        create_func = create_table_design_for_view
    else:
        create_func = create_table_design_for_ctas

    with closing(etl.db.connection(dsn_etl, autocommit=True)) as conn:
        for relation in relations:
            with relation.matching_temporary_view(conn) as tmp_view_name:
                table_design = create_func(conn, tmp_view_name, relation, update)
                source_dir = os.path.join(local_dir, relation.source_name)
                save_table_design(source_dir, relation.target_table_name, relation.target_table_name, table_design,
                                  overwrite=update or replace, dry_run=dry_run)
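save_table_design itself is not part of this listing. Assuming it mostly serializes the design to YAML while honoring the overwrite and dry_run flags, a simplified helper might look like this (the name, signature, and log messages here are assumptions, not the project's implementation):

import logging
import os

import yaml

logger = logging.getLogger(__name__)


def save_design_file(table_design: dict, filename: str, overwrite: bool = False, dry_run: bool = False) -> None:
    # Hypothetical sketch: refuse to clobber an existing design file unless asked to.
    if os.path.exists(filename) and not overwrite:
        logger.warning("Skipping '%s': design file already exists", filename)
        return
    if dry_run:
        logger.info("Dry-run: Skipping write of table design file '%s'", filename)
        return
    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
    with open(filename, "w") as f:
        yaml.safe_dump(table_design, f, default_flow_style=False)
    logger.info("Wrote table design file '%s'", filename)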
Example #5
    def read_table_as_dataframe(self, source: DataWarehouseSchema, relation: RelationDescription):
        """
        Read dataframe (with partitions) by contacting upstream JDBC-reachable source.
        """
        partition_key = relation.find_partition_key()

        table_size = self.fetch_source_table_size(source.dsn, relation)
        num_partitions = self.maximize_partitions(table_size)

        if partition_key is None or num_partitions <= 1:
            predicates = None
        else:
            with closing(etl.db.connection(source.dsn, readonly=True)) as conn:
                predicates = self.determine_partitioning(conn, relation, partition_key, num_partitions)

        if self.use_sampling_with_table(table_size):
            inner_select = self.select_statement(relation, partition_key)
        else:
            inner_select = self.select_statement(relation, None)
        select_statement = """({}) AS t""".format(inner_select)
        self.logger.debug("Table query: SELECT * FROM %s", select_statement)

        jdbc_url, dsn_properties = etl.db.extract_dsn(source.dsn, read_only=True)
        df = self.sql_context.read.jdbc(
            url=jdbc_url, properties=dsn_properties, table=select_statement, predicates=predicates
        )
        return df
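determine_partitioning is not shown here; what matters for the JDBC read is that it returns a list of non-overlapping WHERE-clause fragments, one per Spark partition, which are passed verbatim as the predicates argument. As a simplified stand-in (the real code may derive value ranges instead), modulo-based predicates on the partition key would do:

def modulo_predicates(partition_key: str, num_partitions: int) -> list:
    # Every row satisfies exactly one predicate, so the partitions do not overlap.
    return ['("{}" % {}) = {}'.format(partition_key, num_partitions, i) for i in range(num_partitions)]


print(modulo_predicates("id", 4))
# ['("id" % 4) = 0', '("id" % 4) = 1', '("id" % 4) = 2', '("id" % 4) = 3']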
Example #6
def source_info(source: DataWarehouseSchema,
                relation: RelationDescription):
    return {
        "name": source.name,
        "bucket_name": source.s3_bucket,
        "object_prefix": relation.data_directory(source.s3_path_prefix),
    }
Example #7
def extract_table(self, source: DataWarehouseSchema,
                  relation: RelationDescription):
    """Read the table before writing it out to CSV using Spark's dataframe API."""
    with etl.db.log_error():
        df = self.read_table_as_dataframe(source, relation)
        self.write_dataframe_as_csv(df, relation)
        self.write_manifest_file(relation, relation.bucket_name,
                                 relation.data_directory())
Example #8
def sync_with_s3(relations: List[RelationDescription], bucket_name: str, prefix: str, dry_run: bool = False) -> None:
    """
    Copy (validated) table design and SQL files from local directory to S3 bucket.
    """
    logger.info("Validating %d table design(s) before upload", len(relations))
    RelationDescription.load_in_parallel(relations)

    files = []  # type: List[Tuple[str, str]]
    for relation in relations:
        relation_files = [relation.design_file_name]
        if relation.is_transformation:
            if relation.sql_file_name:
                relation_files.append(relation.sql_file_name)
            else:
                raise MissingQueryError("Missing matching SQL file for '%s'" % relation.design_file_name)
        for file_name in relation_files:
            local_filename = relation.norm_path(file_name)
            remote_filename = os.path.join(prefix, local_filename)
            files.append((local_filename, remote_filename))

    uploader = etl.s3.S3Uploader(bucket_name, dry_run=dry_run)
    with Timer() as timer:
        futures = []  # type: List[concurrent.futures.Future]
        # TODO With Python 3.6, we should pass in a thread_name_prefix
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            for local_filename, remote_filename in files:
                futures.append(executor.submit(uploader.__call__, local_filename, remote_filename))
        errors = 0
        for future in concurrent.futures.as_completed(futures):
            exception = future.exception()
            if exception is not None:
                logger.error("Failed to upload file: %s", exception)
                errors += 1
    if not dry_run:
        logger.info(
            "Uploaded %d of %d file(s) to 's3://%s/%s' (%s)", len(files) - errors, len(files), bucket_name, prefix, timer
        )
    if errors:
        raise ETLRuntimeError("There were {:d} error(s) during upload".format(errors))
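etl.s3.S3Uploader is only referenced above. A boto3-based callable with the same shape (local file in, remote key out, dry_run respected) might look roughly like this; the class name and logging are assumptions rather than the project's code:

import logging

import boto3

logger = logging.getLogger(__name__)


class SimpleS3Uploader:
    # Hypothetical uploader: callable so instances can be handed to executor.submit directly.
    def __init__(self, bucket_name: str, dry_run: bool = False) -> None:
        self.client = boto3.client("s3")
        self.bucket_name = bucket_name
        self.dry_run = dry_run

    def __call__(self, local_filename: str, remote_filename: str) -> None:
        if self.dry_run:
            logger.info("Dry-run: Skipping upload of '%s' to 's3://%s/%s'",
                        local_filename, self.bucket_name, remote_filename)
            return
        self.client.upload_file(local_filename, self.bucket_name, remote_filename)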
Example #9
def select_statement(self, relation: RelationDescription, add_sampling_on_column: Optional[str]) -> str:
    """
    Return something like
        "SELECT id, name FROM table WHERE TRUE" or
        "SELECT id, name FROM table WHERE ((id % 10) = 1)"
    where the actual statement uses delimited identifiers, but note the existence of the WHERE clause.
    """
    selected_columns = relation.get_columns_with_casts()
    statement = """SELECT {} FROM {}""".format(", ".join(selected_columns), relation.source_table_name)
    if add_sampling_on_column is None:
        statement += " WHERE TRUE"
    else:
        self.logger.info("Adding sampling on column '%s' while extracting '%s.%s'",
                         add_sampling_on_column, relation.source_name, relation.source_table_name.identifier)
        statement += """ WHERE (("{}" % 10) = 1)""".format(add_sampling_on_column)
    return statement
Example #10
def _delete_directory_before_write(self,
                                   relation: RelationDescription) -> None:
    """Need to first delete data directory since Sqoop won't overwrite (and can't delete)."""
    csv_prefix = relation.data_directory()
    deletable = sorted(
        etl.s3.list_objects_for_prefix(relation.bucket_name, csv_prefix))
    if not deletable:
        return
    if self.dry_run:
        self.logger.info(
            "Dry-run: Skipping deletion of %d existing CSV file(s) in 's3://%s/%s'",
            len(deletable),
            relation.bucket_name,
            csv_prefix,
        )
    else:
        etl.s3.delete_objects(relation.bucket_name, deletable, wait=True)
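The etl.s3 helpers used above are not included in this listing. Expressed in plain boto3, listing a prefix and deleting the keys in batches (the S3 API accepts at most 1,000 keys per delete call) looks roughly like this:

import boto3


def delete_prefix(bucket_name: str, prefix: str) -> int:
    # Sketch of list-then-delete; the project's helpers add logging, dry-run support, and waiting.
    s3 = boto3.client("s3")
    keys = []
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket_name, Prefix=prefix):
        keys.extend(obj["Key"] for obj in page.get("Contents", []))
    for i in range(0, len(keys), 1000):
        s3.delete_objects(Bucket=bucket_name,
                          Delete={"Objects": [{"Key": key} for key in keys[i:i + 1000]]})
    return len(keys)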
Example #11
def write_columns_file(relation: RelationDescription, bucket_name: str,
                       prefix: str, dry_run: bool) -> None:
    """Write out a YAML file into the same folder as the CSV files with a list of the columns."""
    data = {
        "columns": relation.unquoted_columns,
        "columns_with_types": relation.get_columns_with_types(),
    }
    object_key = f"{prefix}/columns.yaml"
    if dry_run:
        logger.info("Dry-run: Skipping writing columns file to 's3://%s/%s'",
                    bucket_name, object_key)
        return

    logger.info("Writing columns file to 's3://%s/%s'", bucket_name,
                object_key)
    etl.s3.upload_data_to_s3(data, bucket_name, object_key)
    # This waits for the file to show up so that we don't move on before all files are ready.
    etl.s3.wait_until_object_exists(bucket_name, object_key)
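In plain boto3 terms, uploading a small in-memory YAML document and then blocking until it is readable can be done with put_object and the built-in object_exists waiter; this is a sketch of what the etl.s3 helpers presumably wrap:

import boto3
import yaml


def upload_yaml(data: dict, bucket_name: str, object_key: str) -> None:
    s3 = boto3.client("s3")
    body = yaml.safe_dump(data, default_flow_style=False).encode("utf-8")
    s3.put_object(Bucket=bucket_name, Key=object_key, Body=body)
    # Block until the object is visible so downstream steps can rely on it.
    s3.get_waiter("object_exists").wait(Bucket=bucket_name, Key=object_key)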
Example #12
def write_dataframe_as_csv(self, df,
                           relation: RelationDescription) -> None:
    """Write (partitioned) dataframe to CSV file(s)."""
    s3_uri = "s3a://{}/{}".format(relation.bucket_name,
                                  relation.data_directory())
    if self.dry_run:
        self.logger.info("Dry-run: Skipping upload to '%s'", s3_uri)
    else:
        self.logger.info("Writing dataframe for '%s' to '%s'",
                         relation.source_path_name, s3_uri)
        # N.B. This must match the Sqoop (import) and Redshift (COPY) options
        # BROKEN Uses double quotes to escape double quotes ("Hello" becomes """Hello""")
        # BROKEN Does not escape newlines ('\n' does not become '\\n' so is read as 'n' in Redshift)
        write_options = {
            "header": "false",
            "nullValue": r"\N",
            "quoteAll": "true",
            "codec": "gzip"
        }
        df.write.mode("overwrite").options(**write_options).csv(s3_uri)
Example #13
    def extract_table(self, source: DataWarehouseSchema,
                      relation: RelationDescription) -> None:
        """Run Sqoop for one table; creates the sub-process and all the pretty args for Sqoop."""
        try:
            table_size = self.fetch_source_table_size(source.dsn, relation)
        except psycopg2.OperationalError as exc:
            raise DataExtractError("failed to fetch table size for '%s'" %
                                   relation.identifier) from exc

        connection_params_file_path = self.write_connection_params()
        password_file_path = self.write_password_file(source.dsn["password"])
        args = self.build_sqoop_options(source.dsn, relation, table_size,
                                        connection_params_file_path,
                                        password_file_path)
        options_file = self.write_options_file(args)
        # TODO(tom): Guard against failure in S3
        self._delete_directory_before_write(relation)

        self.run_sqoop(options_file)
        self.write_manifest_file(relation, relation.bucket_name,
                                 relation.data_directory())
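run_sqoop is not included in these examples. Since the args list already starts with the tool name ("import"), a minimal version could simply hand the options file to the sqoop binary; how the sub-process is actually launched here is an assumption:

import subprocess


def run_sqoop(options_file: str) -> None:
    # Sqoop reads the tool name and all remaining options from the file given to --options-file.
    subprocess.run(["sqoop", "--options-file", options_file], check=True)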
Example #14
def extract_table(self, source: DataWarehouseSchema,
                  relation: RelationDescription):
    """Build a manifest file for the given table and write it to S3."""
    self.write_manifest_file(relation, relation.bucket_name,
                             relation.data_directory())
Example #15
    def build_sqoop_options(self, source_dsn: Dict[str, str],
                            relation: RelationDescription, table_size: int,
                            connection_param_file_path: str,
                            password_file_path: str) -> List[str]:
        """
        Create set of Sqoop options.

        Starts with the command (import), then continues with generic options,
        tool specific options, and child-process options.
        """
        jdbc_url, dsn_properties = etl.db.extract_dsn(source_dsn)

        partition_key = relation.find_partition_key()
        select_statement = self.build_sqoop_select(relation, partition_key,
                                                   table_size)
        partition_options = self.build_sqoop_partition_options(
            relation, partition_key, table_size)

        # Only the paranoid survive ... quote arguments of options, except for the select statement (--query)
        def q(s):
            # E731 do not assign a lambda expression, use a def -- whatever happened to Python?
            return '"{}"'.format(s)

        args = [
            "import",
            "--connect",
            q(jdbc_url),
            "--driver",
            q(dsn_properties["driver"]),
            "--connection-param-file",
            q(connection_param_file_path),
            "--username",
            q(dsn_properties["user"]),
            "--password-file",
            '"file://{}"'.format(password_file_path),
            "--verbose",
            "--fields-terminated-by",
            q(","),
            "--lines-terminated-by",
            r"'\n'",
            "--enclosed-by",
            "'\"'",
            "--escaped-by",
            r"'\\'",
            "--null-string",
            r"'\\N'",
            "--null-non-string",
            r"'\\N'",
            # NOTE Does not work with s3n:  "--delete-target-dir",
            "--target-dir",
            '"s3n://{}/{}/{}"'.format(relation.bucket_name, relation.prefix,
                                      relation.csv_path_name),
            # NOTE Quoting the select statement (e.g. with shlex.quote) breaks the select in an unSQLy way.
            "--query",
            select_statement,
            # NOTE Embedded newlines are not escaped so we need to remove them.  WAT?
            "--hive-drop-import-delims",
            "--compress"
        ]  # The default compression codec is gzip.

        args.extend(partition_options)
        self.logger.debug("Sqoop options are:\n%s", " ".join(args))
        return args
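write_options_file, referenced in Example #13, is likewise not shown. Sqoop's options file expects one option or value per line, so a plausible sketch (a guess at the helper, not its actual code) is simply:

import tempfile
from typing import List


def write_options_file(args: List[str]) -> str:
    # One option or value per line, as Sqoop's --options-file format expects.
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        f.write("\n".join(args) + "\n")
        return f.name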
Example #16
def bootstrap_transformations(local_dir,
                              local_files,
                              source_name,
                              check_only=False,
                              update=False,
                              replace=False,
                              dry_run=False):
    """
    Download design information for transformations by test-running in the data warehouse.

    "source_name" should be "CTAS" or "VIEW" or None (in which case the relation type currently
    specified will continue to be used).

    This is a callback of a command.
    """
    dw_config = etl.config.get_dw_config()
    transformation_schema = {
        schema.name
        for schema in dw_config.schemas if schema.has_transformations
    }
    transforms = [
        file_set for file_set in local_files
        if file_set.source_name in transformation_schema
    ]
    if not (check_only or replace or update):
        # Filter down to new transformations: SQL files without matching YAML file
        transforms = [
            file_set for file_set in transforms
            if not file_set.design_file_name
        ]
    if not transforms:
        logger.warning("Found no new queries without matching design files")
        return

    relations = [RelationDescription(file_set) for file_set in transforms]
    if check_only or update or (replace and source_name is None):
        logger.info("Loading existing table design file(s)")
        try:
            RelationDescription.load_in_parallel(relations)
        except Exception:
            logger.warning(
                "Make sure that table design files exist and are valid before trying to update"
            )
            raise

    check_only_errors = 0
    with closing(etl.db.connection(dw_config.dsn_etl,
                                   autocommit=True)) as conn:
        for index, relation in enumerate(relations):
            logger.info("Working on transformation '%s' (%d/%d)",
                        relation.identifier, index + 1, len(relations))
            # Be careful to not trigger a load of an unknown design file by accessing "kind".
            actual_kind = source_name or (relation.kind if
                                          relation.design_file_name else None)
            try:
                table_design = create_table_design_for_transformation(
                    conn, actual_kind, relation, update or check_only)
            except RuntimeError as exc:
                if check_only:
                    print(
                        f"Failed to create table design for {relation:x}: {exc}"
                    )
                    check_only_errors += 1
                    continue
                else:
                    raise

            if check_only:
                if relation.table_design != table_design:
                    check_only_errors += 1
                    print(f"Change detected in table design for {relation:x}")
                    print(
                        diff_table_designs(relation.table_design, table_design,
                                           relation.design_file_name,
                                           "bootstrap"))
                continue

            if update and relation.table_design == table_design:
                logger.info(
                    f"No updates detected in table design for {relation:x}, skipping write"
                )
                continue

            source_dir = os.path.join(local_dir, relation.source_name)
            # Derive preferred name from the current design or SQL file.
            if relation.design_file_name is not None:
                filename = relation.design_file_name
            elif relation.sql_file_name is not None:
                filename = re.sub(r"\.sql$", ".yaml", relation.sql_file_name)
            else:
                filename = os.path.join(
                    source_dir,
                    f"{relation.target_table_name.schema}-{relation.target_table_name.table}.yaml",
                )
            save_table_design(relation.target_table_name,
                              table_design,
                              filename,
                              overwrite=update or replace,
                              dry_run=dry_run)

    if check_only_errors:
        raise TableDesignValidationError(
            f"found {check_only_errors} table design(s) that would be rewritten"
        )
    if check_only:
        print("Congratulations. There were no changes in table design files.")
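diff_table_designs is not reproduced here. Its observable job in the check-only branch is to render the difference between the stored design and the freshly bootstrapped one; a bare-bones equivalent using only the standard library and PyYAML would be:

import difflib

import yaml


def diff_designs(old: dict, new: dict, from_name: str, to_name: str) -> str:
    # Compare the YAML renderings line by line and return a unified diff.
    old_lines = yaml.safe_dump(old, default_flow_style=False).splitlines(keepends=True)
    new_lines = yaml.safe_dump(new, default_flow_style=False).splitlines(keepends=True)
    return "".join(difflib.unified_diff(old_lines, new_lines, fromfile=from_name, tofile=to_name))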
Example #17
def extract_table(self, source: DataWarehouseSchema,
                  relation: RelationDescription):
    """Write out manifest file for files from (rendered) path in S3."""
    prefix_for_table = relation.data_directory(source.s3_path_prefix)
    self.write_manifest_file(relation, source.s3_bucket, prefix_for_table)