Example #1
    def temp_s3_copy(self,
                     tbl,
                     aws_access_key_id=None,
                     aws_secret_access_key=None):

        if not self.s3_temp_bucket:
            raise KeyError((
                "Missing S3_TEMP_BUCKET, needed for transferring data to Redshift. "
                "Must be specified as env vars or kwargs"))

        # Coalesce S3 Key arguments
        aws_access_key_id = aws_access_key_id or self.aws_access_key_id
        aws_secret_access_key = aws_secret_access_key or self.aws_secret_access_key

        self.s3 = S3(aws_access_key_id=aws_access_key_id,
                     aws_secret_access_key=aws_secret_access_key)

        hashed_name = hash(time.time())
        key = f"{S3_TEMP_KEY_PREFIX}/{hashed_name}.csv.gz"

        # Convert table to compressed CSV file, to optimize the transfers to S3 and to
        # Redshift.
        local_path = tbl.to_csv(temp_file_compression='gzip')
        # Copy table to bucket
        self.s3.put_file(self.s3_temp_bucket, key, local_path)

        return key
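A minimal usage sketch, assuming temp_s3_copy is available on the Parsons Redshift connector (as the code above suggests) and that the REDSHIFT_* and S3_TEMP_BUCKET environment variables are set; the table contents are illustrative:

    from parsons import Redshift, Table

    rs = Redshift()
    tbl = Table([{'id': 1, 'name': 'Jane'}])

    # Writes the table to the temp bucket as a gzipped CSV and returns the generated S3 key
    temp_key = rs.temp_s3_copy(tbl)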
Example #2
    def setUp(self):

        self.s3 = S3()

        # Confirm that credentials can be resolved before running the tests
        self.s3.aws.session.get_credentials()

        # Create a new bucket
        self.test_bucket = os.environ['S3_TEMP_BUCKET']
        # Try to avoid intermittent errors caused by the bucket not being found yet
        self.s3.create_bucket(self.test_bucket)

        self.test_key = 'test.csv'
        self.tbl = Table([{'first': 'Bob', 'last': 'Smith'}])
        csv_path = self.tbl.to_csv()

        self.test_key_2 = 'test2.csv'
        self.tbl_2 = Table([{'first': 'Jack', 'last': 'Smith'}])
        csv_path_2 = self.tbl_2.to_csv()

        # Sometimes it runs into issues putting the file
        retry = 1

        while retry <= 5:
            try:
                # Put a test file in the bucket
                self.s3.put_file(self.test_bucket, self.test_key, csv_path)
                self.s3.put_file(self.test_bucket, self.test_key_2, csv_path_2)
                break
            except Exception:
                print('Retrying putting file in bucket...')
                retry += 1
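The retry loop above retries immediately and gives up silently after five failed attempts. A variant with a pause between attempts and a final re-raise (a sketch, not part of the library) could replace the loop body in setUp:

    import time

    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            # Put the test files in the bucket
            self.s3.put_file(self.test_bucket, self.test_key, csv_path)
            self.s3.put_file(self.test_bucket, self.test_key_2, csv_path_2)
            break
        except Exception:
            if attempt == max_attempts:
                raise
            print('Retrying putting file in bucket...')
            time.sleep(2 * attempt)  # simple linear backoff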
Example #3
    def get_creds(self, aws_access_key_id, aws_secret_access_key):

        if aws_access_key_id and aws_secret_access_key:
            # Credentials were passed in explicitly, so use them as-is
            pass

        elif self.iam_role:
            # Return early, since the string below is formatted for key-based credentials
            return f"credentials 'aws_iam_role={self.iam_role}'\n"

        elif self.aws_access_key_id and self.aws_secret_access_key:

            aws_access_key_id = self.aws_access_key_id
            aws_secret_access_key = self.aws_secret_access_key

        elif 'AWS_ACCESS_KEY_ID' in os.environ and 'AWS_SECRET_ACCESS_KEY' in os.environ:

            aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
            aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

        else:

            s3 = S3()
            creds = s3.aws.session.get_credentials()
            aws_access_key_id = creds.access_key
            aws_secret_access_key = creds.secret_key

        return "credentials 'aws_access_key_id={};aws_secret_access_key={}'\n".format(
            aws_access_key_id, aws_secret_access_key)
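For context, the returned string is the credentials clause that gets spliced into a Redshift COPY statement. An illustrative call (the key values below are placeholders, not real credentials):

    # Resolution order: explicit arguments > IAM role > instance attributes > env vars > boto session
    creds_clause = self.get_creds(None, None)
    # e.g. "credentials 'aws_access_key_id=AKIA...;aws_secret_access_key=...'\n"
    # or, when an IAM role is configured:
    # "credentials 'aws_iam_role=<role-arn>'\n"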
Example #4
    def get_creds(self, aws_access_key_id, aws_secret_access_key):

        if aws_access_key_id and aws_secret_access_key:
            # Credentials were passed in explicitly, so use them as-is
            pass

        elif 'AWS_ACCESS_KEY_ID' in os.environ and 'AWS_SECRET_ACCESS_KEY' in os.environ:

            aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
            aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

        else:

            s3 = S3()
            creds = s3.aws.session.get_credentials()
            aws_access_key_id = creds.access_key
            aws_secret_access_key = creds.secret_key

        return "credentials 'aws_access_key_id={};aws_secret_access_key={}'\n".format(
            aws_access_key_id, aws_secret_access_key)
Example #5
    def generate_manifest(self,
                          buckets,
                          aws_access_key_id=None,
                          aws_secret_access_key=None,
                          mandatory=True,
                          prefix=None,
                          manifest_bucket=None,
                          manifest_key=None,
                          path=None):
        """
        Given a list of S3 buckets, generate a manifest file (JSON format). A manifest file
        allows you to copy multiple files into a single table at once. Once the manifest is
        generated, you can pass it to the :func:`~parsons.redshift.Redshift.copy_s3` method.

        AWS keys are not required if ``AWS_ACCESS_KEY_ID`` and
        ``AWS_SECRET_ACCESS_KEY`` environment variables are set.

        `Args:`

            buckets: list or str
                A list of buckets, or a single bucket, from which to generate the manifest
            aws_access_key_id: str
                AWS access key id to access S3 bucket
            aws_secret_access_key: str
                AWS secret access key to access S3 bucket
            mandatory: boolean
                The mandatory flag indicates whether the Redshift COPY should
                terminate if the file does not exist.
            prefix: str
                Optional filter for key prefixes
            manifest_bucket: str
                Optional bucket to write the manifest file to.
            manifest_key: str
                Optional S3 key under which to write the manifest file

        `Returns:`
            ``dict`` of manifest
        """

        from parsons.aws import S3
        s3 = S3(aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key)

        # Deal with a single bucket being passed, rather than list.
        if isinstance(buckets, str):
            buckets = [buckets]

        # Generate manifest file
        manifest = {'entries': []}
        for bucket in buckets:

            # Retrieve list of files in bucket
            key_list = s3.list_keys(bucket, prefix=prefix)
            for key in key_list:
                manifest['entries'].append({
                    'url': '/'.join(['s3:/', bucket, key]),
                    'mandatory': mandatory
                })

        logger.info('Manifest generated.')

        # Save the file to s3 bucket if provided
        if manifest_key and manifest_bucket:
            # Dump the manifest to a temp JSON file
            manifest_path = files.create_temp_file()
            with open(manifest_path, 'w') as manifest_file_obj:
                json.dump(manifest,
                          manifest_file_obj,
                          sort_keys=True,
                          indent=4)

            # Upload the file to S3
            s3.put_file(manifest_bucket, manifest_key, manifest_path)

            logger.info(
                f'Manifest saved to s3://{manifest_bucket}/{manifest_key}')

        return manifest
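A usage sketch, assuming generate_manifest is exposed on the Parsons Redshift connector; the bucket names, prefix, and keys below are illustrative:

    from parsons import Redshift

    rs = Redshift()

    # Collect every key under the prefix into a manifest and store the manifest in S3
    manifest = rs.generate_manifest(
        'my-source-bucket',
        prefix='exports/2021-01/',
        manifest_bucket='my-source-bucket',
        manifest_key='manifests/2021-01.manifest')

    # The manifest can then be loaded in a single COPY via copy_s3(..., manifest=True)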
Example #6
    def copy_s3(self,
                table_name,
                bucket,
                key,
                manifest=False,
                data_type='csv',
                csv_delimiter=',',
                compression=None,
                if_exists='fail',
                max_errors=0,
                distkey=None,
                sortkey=None,
                padding=None,
                varchar_max=None,
                statupdate=True,
                compupdate=True,
                ignoreheader=1,
                acceptanydate=True,
                dateformat='auto',
                timeformat='auto',
                emptyasnull=True,
                blanksasnull=True,
                nullas=None,
                acceptinvchars=True,
                truncatecolumns=False,
                columntypes=None,
                specifycols=None,
                aws_access_key_id=None,
                aws_secret_access_key=None,
                bucket_region=None,
                strict_length=True,
                template_table=None):
        """
        Copy a file from s3 to Redshift.

        `Args:`
            table_name: str
                The schema and table name (``tmc.cool_table``) to copy the file into.
            bucket: str
                The s3 bucket where the file or manifest is located.
            key: str
                The key of the file or manifest in the s3 bucket.
            manifest: boolean
                If the ``key`` points to a manifest file rather than a single data file.
            data_type: str
                The data type of the file. Only ``csv`` is currently supported.
            csv_delimiter: str
                The delimiter of the ``csv``. Only relevant if data_type is ``csv``.
            compression: str
                If specified (``gzip``), will attempt to decompress the file.
            if_exists: str
                If the table already exists, either ``fail``, ``append``, ``drop``
                or ``truncate`` the table.
            max_errors: int
                The maximum number of rows that can error and be skipped before
                the job fails.
            distkey: str
                The column name of the distkey
            sortkey: str
                The column name of the sortkey
            padding: float
                A percentage padding to add to varchar columns if creating a new table. This is
                helpful to add a buffer for future copies in which the data might be wider.
            varchar_max: list
                A list of columns in which to set the width of the varchar column to 65,535
                characters.
            statupdate: boolean
                Governs automatic computation and refresh of optimizer statistics at the end
                of a successful COPY command.
            compupdate: boolean
                Controls whether compression encodings are automatically applied during a COPY.
            ignoreheader: int
                The number of header rows to skip. Ignored if data_type is ``json``.
            acceptanydate: boolean
                Allows any date format, including invalid formats such as 00/00/00 00:00:00, to be
                loaded without generating an error.
            emptyasnull: boolean
                Indicates that Amazon Redshift should load empty char and varchar fields
                as ``NULL``.
            blanksasnull: boolean
                Loads blank varchar fields, which consist of only white space characters,
                as ``NULL``.
            nullas: str
                Loads fields that match string as NULL
            acceptinvchars: boolean
                Enables loading of data into VARCHAR columns even if the data contains
                invalid UTF-8 characters.
            dateformat: str
                Set the date format. Defaults to ``auto``.
            timeformat: str
                Set the time format. Defaults to ``auto``.
            truncatecolumns: boolean
                If the table already exists, truncates data in columns to the appropriate number
                of characters so that it fits the column specification. Applies only to columns
                with a VARCHAR or CHAR data type, and rows 4 MB or less in size.
            columntypes: dict
                Optional map of column name to redshift column type, overriding the usual type
                inference. You only specify the columns you want to override, e.g.
                ``columntypes={'phone': 'varchar(12)', 'age': 'int'}``.
            specifycols: boolean
                Adds a column list to the Redshift `COPY` command, allowing for the source table
                in an append to have the columns out of order, and to have fewer columns with any
                leftover target table columns filled in with the `DEFAULT` value.

                This will fail if all of the source table's columns do not match a column in the
                target table. This will also fail if the target table has an `IDENTITY`
                column and that column name is among the source table's columns.
            aws_access_key_id: str
                An AWS access key granted to the bucket where the file is located. Not required
                if keys are stored as environment variables.
            aws_secret_access_key: str
                An AWS secret access key granted to the bucket where the file is located. Not
                required if keys are stored as environment variables.
            bucket_region: str
                The AWS region that the bucket is located in. This should be provided if the
                Redshift cluster is located in a different region from the temp bucket.
            strict_length: bool
                If the database table needs to be created, strict_length determines whether
                the created table's column sizes will be sized to exactly fit the current data,
                or if their size will be rounded up to account for future values being larger
                than those in the current dataset. Defaults to ``True``; this argument is ignored
                if ``padding`` is specified.
            template_table: str
                Instead of specifying columns and columntypes, or relying on type inference, you
                can provide the name of a pre-existing table that has the same columns/types; it
                will be used as the schema for the new table.

        `Returns:`
            Parsons Table or ``None``
                See :ref:`parsons-table` for output options.
        """

        with self.connection() as connection:

            if self._create_table_precheck(connection, table_name, if_exists):
                if template_table:
                    sql = f'CREATE TABLE {table_name} (LIKE {template_table})'
                else:
                    # Grab the object from s3
                    from parsons.aws.s3 import S3
                    s3 = S3(aws_access_key_id=aws_access_key_id,
                            aws_secret_access_key=aws_secret_access_key)

                    local_path = s3.get_file(bucket, key)
                    if data_type == 'csv':
                        tbl = Table.from_csv(local_path,
                                             delimiter=csv_delimiter)
                    else:
                        raise TypeError("Invalid data type provided")

                    # Create the table
                    sql = self.create_statement(tbl,
                                                table_name,
                                                padding=padding,
                                                distkey=distkey,
                                                sortkey=sortkey,
                                                varchar_max=varchar_max,
                                                columntypes=columntypes,
                                                strict_length=strict_length)

                self.query_with_connection(sql, connection, commit=False)
                logger.info(f'{table_name} created.')

            # Copy the table
            copy_sql = self.copy_statement(
                table_name,
                bucket,
                key,
                manifest=manifest,
                data_type=data_type,
                csv_delimiter=csv_delimiter,
                compression=compression,
                max_errors=max_errors,
                statupdate=statupdate,
                compupdate=compupdate,
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                ignoreheader=ignoreheader,
                acceptanydate=acceptanydate,
                emptyasnull=emptyasnull,
                blanksasnull=blanksasnull,
                nullas=nullas,
                acceptinvchars=acceptinvchars,
                truncatecolumns=truncatecolumns,
                specifycols=specifycols,
                dateformat=dateformat,
                timeformat=timeformat,
                bucket_region=bucket_region)

            self.query_with_connection(copy_sql, connection, commit=False)
            logger.info(f'Data copied to {table_name}.')
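A usage sketch continuing the manifest example above; the table and object names are illustrative:

    rs.copy_s3(
        'my_schema.my_table',
        'my-source-bucket',
        'manifests/2021-01.manifest',
        manifest=True,
        if_exists='drop')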
Example #7
    def __init__(self):
        self.s3 = S3()