Example #1
 def source_info(source: DataWarehouseSchema,
                 relation: RelationDescription):
     return {
         "name": source.name,
         "bucket_name": source.s3_bucket,
         "object_prefix": relation.data_directory(source.s3_path_prefix),
     }
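
For illustration, the dict above might look like the following for one relation; the schema name, bucket, and prefix shown are invented placeholders:

 # Hypothetical output; actual values come from the warehouse configuration.
 info = source_info(source, relation)
 # info == {
 #     "name": "www",
 #     "bucket_name": "example-etl-bucket",
 #     "object_prefix": "schemas/www/orders/csv",
 # }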
Example #2
 def extract_table(self, source: DataWarehouseSchema,
                   relation: RelationDescription):
     """Read the table before writing it out to CSV using Spark's dataframe API."""
     with etl.db.log_error():
         df = self.read_table_as_dataframe(source, relation)
         self.write_dataframe_as_csv(df, relation)
         self.write_manifest_file(relation, relation.bucket_name,
                                  relation.data_directory())
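
The etl.db.log_error context manager is internal to this codebase and not shown. A minimal sketch, assuming it simply logs database errors before re-raising, could look like this:

 import logging
 from contextlib import contextmanager

 import psycopg2


 @contextmanager
 def log_error():
     """Log database errors before letting them propagate (sketch, not the actual etl.db code)."""
     try:
         yield
     except psycopg2.Error as exc:
         logging.getLogger(__name__).exception("Database error: %s", exc)
         raise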
Example #3
 def _delete_directory_before_write(self,
                                    relation: RelationDescription) -> None:
     """Need to first delete data directory since Sqoop won't overwrite (and can't delete)."""
     csv_prefix = relation.data_directory()
     deletable = sorted(
         etl.s3.list_objects_for_prefix(relation.bucket_name, csv_prefix))
     if not deletable:
         return
     if self.dry_run:
         self.logger.info(
             "Dry-run: Skipping deletion of %d existing CSV file(s) in 's3://%s/%s'",
             len(deletable),
             relation.bucket_name,
             csv_prefix,
         )
     else:
         etl.s3.delete_objects(relation.bucket_name, deletable, wait=True)
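
The etl.s3 helpers are likewise internal. A minimal boto3-based sketch of list_objects_for_prefix, assuming it yields object keys under a prefix, might be:

 from typing import Iterator

 import boto3


 def list_objects_for_prefix(bucket_name: str, prefix: str) -> Iterator[str]:
     """Yield the keys of all objects in the bucket that start with the prefix (sketch)."""
     paginator = boto3.client("s3").get_paginator("list_objects_v2")
     for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
         for obj in page.get("Contents", []):
             yield obj["Key"]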
Example #4
 def write_dataframe_as_csv(self, df,
                            relation: RelationDescription) -> None:
     """Write (partitioned) dataframe to CSV file(s)."""
     s3_uri = "s3a://{}/{}".format(relation.bucket_name,
                                   relation.data_directory())
     if self.dry_run:
         self.logger.info("Dry-run: Skipping upload to '%s'", s3_uri)
     else:
         self.logger.info("Writing dataframe for '%s' to '%s'",
                          relation.source_path_name, s3_uri)
         # N.B. This must match the Sqoop (import) and Redshift (COPY) options
         # BROKEN Uses double quotes to escape double quotes ("Hello" becomes """Hello""")
         # BROKEN Does not escape newlines ('\n' does not become '\\n' so is read as 'n' in Redshift)
         write_options = {
             "header": "false",
             "nullValue": r"\N",
             "quoteAll": "true",
             "codec": "gzip"
         }
         df.write.mode("overwrite").options(**write_options).csv(s3_uri)
Example #5
    def extract_table(self, source: DataWarehouseSchema,
                      relation: RelationDescription) -> None:
        """Run Sqoop for one table; creates the sub-process and all the pretty args for Sqoop."""
        try:
            table_size = self.fetch_source_table_size(source.dsn, relation)
        except psycopg2.OperationalError as exc:
            raise DataExtractError("failed to fetch table size for '%s'" %
                                   relation.identifier) from exc

        connection_params_file_path = self.write_connection_params()
        password_file_path = self.write_password_file(source.dsn["password"])
        args = self.build_sqoop_options(source.dsn, relation, table_size,
                                        connection_params_file_path,
                                        password_file_path)
        options_file = self.write_options_file(args)
        # TODO(tom): Guard against failure in S3
        self._delete_directory_before_write(relation)

        self.run_sqoop(options_file)
        self.write_manifest_file(relation, relation.bucket_name,
                                 relation.data_directory())
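
write_password_file is not shown; the Sqoop docs recommend restricting the password file to owner-only read, so a sketch under that assumption (as a free function rather than a method) could be:

 import os
 import tempfile


 def write_password_file(password: str) -> str:
     """Write the password to a temp file readable only by the owner and return its path (sketch)."""
     fd, path = tempfile.mkstemp(prefix="sqoop_password_")
     try:
         os.fchmod(fd, 0o400)  # permissions are checked at open time, so writing still works
         os.write(fd, password.encode("utf-8"))
     finally:
         os.close(fd)
     return path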
Example #6
 def extract_table(self, source: DataWarehouseSchema,
                   relation: RelationDescription):
     """Build a manifest file for the given table and write it to S3."""
     self.write_manifest_file(relation, relation.bucket_name,
                              relation.data_directory())
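
write_manifest_file is shared by all of these extractors but not shown. Redshift's COPY manifest is a small JSON document, so a boto3-based sketch, assuming the data files keep a .gz suffix and the manifest lives next to them, might look like this:

 import json

 import boto3


 def write_manifest_file(relation, bucket_name, prefix):
     """Write a Redshift COPY manifest listing the data files under the prefix (sketch)."""
     # The relation argument is unused here; the real helper may derive names from it.
     client = boto3.client("s3")
     paginator = client.get_paginator("list_objects_v2")
     entries = [
         {"url": "s3://{}/{}".format(bucket_name, obj["Key"]), "mandatory": True}
         for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
         for obj in page.get("Contents", [])
         if obj["Key"].endswith(".gz")
     ]
     client.put_object(
         Bucket=bucket_name,
         Key="{}/manifest".format(prefix.rstrip("/")),
         Body=json.dumps({"entries": entries}).encode("utf-8"),
     )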
Example #7
 def extract_table(self, source: DataWarehouseSchema,
                   relation: RelationDescription):
     """Write out manifest file for files from (rendered) path in S3."""
     prefix_for_table = relation.data_directory(source.s3_path_prefix)
     self.write_manifest_file(relation, source.s3_bucket, prefix_for_table)
Example #8
    def build_sqoop_options(
        self,
        source_dsn: Dict[str, str],
        relation: RelationDescription,
        table_size: int,
        connection_param_file_path: str,
        password_file_path: str,
    ) -> List[str]:
        """
        Create set of Sqoop options.

        Starts with the command (import), then continues with generic options,
        tool specific options, and child-process options.
        """
        jdbc_url, dsn_properties = etl.db.extract_dsn(source_dsn)

        partition_key = relation.find_partition_key()
        select_statement = self.build_sqoop_select(relation, partition_key,
                                                   table_size)
        partition_options = self.build_sqoop_partition_options(
            relation, partition_key, table_size)

        # Only the paranoid survive ... quote arguments of options, except for the --query select statement
        def q(s):
            # E731 do not assign a lambda expression, use a def -- whatever happened to Python?
            return '"{}"'.format(s)

        args = [
            "import",
            "--connect",
            q(jdbc_url),
            "--driver",
            q(dsn_properties["driver"]),
            "--connection-param-file",
            q(connection_param_file_path),
            "--username",
            q(dsn_properties["user"]),
            "--password-file",
            '"file://{}"'.format(password_file_path),
            "--verbose",
            "--fields-terminated-by",
            q(","),
            "--lines-terminated-by",
            r"'\n'",
            "--enclosed-by",
            "'\"'",
            "--escaped-by",
            r"'\\'",
            "--null-string",
            r"'\\N'",
            "--null-non-string",
            r"'\\N'",
            # NOTE Does not work with s3n:  "--delete-target-dir",
            "--target-dir",
            '"s3n://{}/{}"'.format(relation.bucket_name,
                                   relation.data_directory()),
            # NOTE Quoting the select statement breaks the select in an unSQLy way.
            "--query",
            select_statement,
            # NOTE Embedded newlines are not escaped so we need to remove them.  WAT?
            "--hive-drop-import-delims",
            "--compress",
        ]  # The default compression codec is gzip.

        args.extend(partition_options)
        self.logger.debug("Sqoop options are:\n%s", " ".join(args))
        return args
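
write_options_file and run_sqoop are not shown either. Since Sqoop accepts its arguments one per line via --options-file, a sketch of both helpers, assuming a plain sqoop binary on the PATH, could be:

 import subprocess
 import tempfile
 from typing import List


 def write_options_file(args: List[str]) -> str:
     """Write the Sqoop arguments to an options file, one argument per line (sketch)."""
     with tempfile.NamedTemporaryFile(
             mode="w", prefix="sqoop_options_", suffix=".txt", delete=False) as f:
         f.write("\n".join(args) + "\n")
         return f.name


 def run_sqoop(options_file: str) -> None:
     """Invoke Sqoop with the previously written options file (sketch)."""
     subprocess.run(["sqoop", "--options-file", options_file], check=True)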