Ejemplo n.º 1
0
    def read_table_as_dataframe(self, source: DataWarehouseSchema, relation: RelationDescription):
        """
        Load the relation from its upstream source over JDBC and return the dataframe.

        Partitioning predicates are computed (using a read-only connection to the source)
        only when the relation has a partition key and more than one partition is called
        for; otherwise the read is issued without predicates.
        The partition key is handed to the select statement only when sampling applies
        to a table of this size.
        """
        key = relation.find_partition_key()
        size = self.fetch_source_table_size(source.dsn, relation)
        partition_count = self.maximize_partitions(size)

        # Default to an unpartitioned read; only look up predicates when they can be used.
        predicates = None
        if key is not None and partition_count > 1:
            with closing(etl.db.connection(source.dsn, readonly=True)) as conn:
                predicates = self.determine_partitioning(conn, relation, key, partition_count)

        sampling_key = key if self.use_sampling_with_table(size) else None
        inner_query = self.select_statement(relation, sampling_key)
        # The query is wrapped as an aliased sub-select since it is passed in place of a table name.
        table_expression = "({}) AS t".format(inner_query)
        self.logger.debug("Table query: SELECT * FROM %s", table_expression)

        jdbc_url, dsn_properties = etl.db.extract_dsn(source.dsn, read_only=True)
        return self.sql_context.read.jdbc(
            url=jdbc_url,
            properties=dsn_properties,
            table=table_expression,
            predicates=predicates,
        )
Ejemplo n.º 2
0
    def build_sqoop_options(self, source_dsn: Dict[str, str],
                            relation: RelationDescription, table_size: int,
                            connection_param_file_path: str,
                            password_file_path: str) -> List[str]:
        """
        Assemble the list of arguments for a Sqoop import of the relation.

        The list opens with the command ("import"), followed by connection settings,
        output formatting settings, the target directory and query, and finally
        whatever partition options apply to this relation and table size.
        The assembled options are also logged at debug level.
        """
        jdbc_url, dsn_properties = etl.db.extract_dsn(source_dsn)

        partition_key = relation.find_partition_key()
        query = self.build_sqoop_select(relation, partition_key, table_size)
        partition_options = self.build_sqoop_partition_options(relation, partition_key, table_size)

        def in_double_quotes(value):
            # Defensive quoting of option values; the --query value is deliberately left unquoted.
            return '"{}"'.format(value)

        connection_options = [
            "--connect", in_double_quotes(jdbc_url),
            "--driver", in_double_quotes(dsn_properties["driver"]),
            "--connection-param-file", in_double_quotes(connection_param_file_path),
            "--username", in_double_quotes(dsn_properties["user"]),
            "--password-file", in_double_quotes("file://{}".format(password_file_path)),
            "--verbose",
        ]
        format_options = [
            "--fields-terminated-by", in_double_quotes(","),
            "--lines-terminated-by", r"'\n'",
            "--enclosed-by", "'\"'",
            "--escaped-by", r"'\\'",
            "--null-string", r"'\\N'",
            "--null-non-string", r"'\\N'",
        ]
        target_dir = "s3n://{}/{}/{}".format(relation.bucket_name, relation.prefix, relation.csv_path_name)
        output_options = [
            # NOTE "--delete-target-dir" is left out because it does not work with s3n.
            "--target-dir", in_double_quotes(target_dir),
            # NOTE Quoting the select statement (e.g. with shlex.quote) breaks the select in an unSQLy way.
            "--query", query,
            # NOTE Embedded newlines are not escaped so they need to be removed.
            "--hive-drop-import-delims",
            # The default compression codec is gzip.
            "--compress",
        ]

        options = ["import"] + connection_options + format_options + output_options
        options.extend(partition_options)
        self.logger.debug("Sqoop options are:\n%s", " ".join(options))
        return options