def temp_s3_copy(self, tbl, aws_access_key_id=None, aws_secret_access_key=None):

    if not self.s3_temp_bucket:
        raise KeyError((
            "Missing S3_TEMP_BUCKET, needed for transferring data to Redshift. "
            "Must be specified as env vars or kwargs"))

    # Coalesce S3 key arguments
    aws_access_key_id = aws_access_key_id or self.aws_access_key_id
    aws_secret_access_key = aws_secret_access_key or self.aws_secret_access_key

    self.s3 = S3(aws_access_key_id=aws_access_key_id,
                 aws_secret_access_key=aws_secret_access_key)

    hashed_name = hash(time.time())
    key = f"{S3_TEMP_KEY_PREFIX}/{hashed_name}.csv.gz"

    # Convert the table to a compressed CSV file, to optimize the transfers to S3
    # and to Redshift.
    local_path = tbl.to_csv(temp_file_compression='gzip')

    # Copy the file to the temp bucket
    self.s3.put_file(self.s3_temp_bucket, key, local_path)

    return key
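
# A minimal sketch of how temp_s3_copy might be used inside a copy flow: stage the
# table in the temp bucket, run the COPY against the staged key, then clean up.
# `rs` is an illustrative connector instance; `temp_s3_delete` is an assumed cleanup
# helper and may differ in name in your connector.
def copy_via_temp_bucket_sketch(rs, tbl, table_name):
    key = rs.temp_s3_copy(tbl)  # returns the S3 key of the staged gzip CSV
    try:
        rs.copy_s3(table_name, rs.s3_temp_bucket, key,
                   compression='gzip', if_exists='append')
    finally:
        # Assumed cleanup helper; substitute whatever delete method your
        # S3 wrapper exposes.
        rs.temp_s3_delete(key)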
def setUp(self):

    self.s3 = S3()
    self.s3.aws.session.get_credentials()

    # Create a new bucket
    self.test_bucket = os.environ['S3_TEMP_BUCKET']
    # Try to avoid intermittent errors from the bucket not being found
    self.s3.create_bucket(self.test_bucket)

    self.test_key = 'test.csv'
    self.tbl = Table([{'first': 'Bob', 'last': 'Smith'}])
    csv_path = self.tbl.to_csv()

    self.test_key_2 = 'test2.csv'
    self.tbl_2 = Table([{'first': 'Jack', 'last': 'Smith'}])
    csv_path_2 = self.tbl_2.to_csv()

    # Putting the file sometimes fails transiently, so retry a few times
    retry = 1
    while retry <= 5:
        try:
            # Put the test files in the bucket
            self.s3.put_file(self.test_bucket, self.test_key, csv_path)
            self.s3.put_file(self.test_bucket, self.test_key_2, csv_path_2)
            break
        except Exception:
            print('Retrying putting file in bucket...')
            retry += 1
def get_creds(self, aws_access_key_id, aws_secret_access_key):

    if aws_access_key_id and aws_secret_access_key:
        # When we have credentials, then we don't need to set them again
        pass

    elif self.iam_role:
        # Bail early, since the return at the bottom is specifically formatted
        # with access keys
        return f"credentials 'aws_iam_role={self.iam_role}'\n"

    elif self.aws_access_key_id and self.aws_secret_access_key:
        aws_access_key_id = self.aws_access_key_id
        aws_secret_access_key = self.aws_secret_access_key

    elif 'AWS_ACCESS_KEY_ID' in os.environ and 'AWS_SECRET_ACCESS_KEY' in os.environ:
        aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

    else:
        s3 = S3()
        creds = s3.aws.session.get_credentials()
        aws_access_key_id = creds.access_key
        aws_secret_access_key = creds.secret_key

    return "credentials 'aws_access_key_id={};aws_secret_access_key={}'\n".format(
        aws_access_key_id, aws_secret_access_key)
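
# A minimal sketch of how the clause returned by get_creds can be spliced into a
# COPY statement. `rs` is an illustrative connector instance exposing get_creds;
# the function name, argument list, and the trailing CSV/IGNOREHEADER options are
# illustrative only, not the connector's real copy_statement logic.
def build_copy_sketch(rs, table_name, bucket, key,
                      aws_access_key_id=None, aws_secret_access_key=None):
    sql = f"COPY {table_name}\nFROM 's3://{bucket}/{key}'\n"
    # get_creds returns either an aws_iam_role clause or an access-key clause,
    # already terminated with a newline.
    sql += rs.get_creds(aws_access_key_id, aws_secret_access_key)
    sql += "CSV\nIGNOREHEADER 1;"
    return sql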
def get_creds(self, aws_access_key_id, aws_secret_access_key):

    if aws_access_key_id and aws_secret_access_key:
        # When we have credentials, then we don't need to set them again
        pass

    elif 'AWS_ACCESS_KEY_ID' in os.environ and 'AWS_SECRET_ACCESS_KEY' in os.environ:
        aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']

    else:
        s3 = S3()
        creds = s3.aws.session.get_credentials()
        aws_access_key_id = creds.access_key
        aws_secret_access_key = creds.secret_key

    return "credentials 'aws_access_key_id={};aws_secret_access_key={}'\n".format(
        aws_access_key_id, aws_secret_access_key)
def generate_manifest(self, buckets, aws_access_key_id=None, aws_secret_access_key=None,
                      mandatory=True, prefix=None, manifest_bucket=None, manifest_key=None,
                      path=None):
    """
    Given a list of S3 buckets, generate a manifest file (JSON format). A manifest file
    allows you to copy multiple files into a single table at once. Once the manifest is
    generated, you can pass it with the :func:`~parsons.redshift.Redshift.copy_s3` method.

    AWS keys are not required if the ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY``
    environment variables are set.

    `Args:`
        buckets: list or str
            A list of buckets, or a single bucket, from which to generate the manifest
        aws_access_key_id: str
            AWS access key id to access S3 bucket
        aws_secret_access_key: str
            AWS secret access key to access S3 bucket
        mandatory: boolean
            The mandatory flag indicates whether the Redshift COPY should terminate if
            the file does not exist.
        prefix: str
            Optional filter for key prefixes
        manifest_bucket: str
            Optional bucket to write manifest file.
        manifest_key: str
            Optional key name for S3 bucket to write file
    `Returns:`
        ``dict`` of manifest
    """

    from parsons.aws import S3
    s3 = S3(aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key)

    # Deal with a single bucket being passed, rather than a list.
    if isinstance(buckets, str):
        buckets = [buckets]

    # Generate the manifest entries
    manifest = {'entries': []}
    for bucket in buckets:

        # Retrieve list of files in bucket
        key_list = s3.list_keys(bucket, prefix=prefix)
        for key in key_list:
            manifest['entries'].append({
                'url': '/'.join(['s3:/', bucket, key]),
                'mandatory': mandatory
            })

    logger.info('Manifest generated.')

    # Save the manifest to an S3 bucket, if one was provided
    if manifest_key and manifest_bucket:
        # Dump the manifest to a temp JSON file
        manifest_path = files.create_temp_file()
        with open(manifest_path, 'w') as manifest_file_obj:
            json.dump(manifest, manifest_file_obj, sort_keys=True, indent=4)

        # Upload the file to S3
        s3.put_file(manifest_bucket, manifest_key, manifest_path)

        logger.info(f'Manifest saved to s3://{manifest_bucket}/{manifest_key}')

    return manifest
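
# A short usage sketch for generate_manifest, assuming a connector instance `rs` and
# illustrative bucket, prefix, key, and table names. The manifest is written to S3
# and then loaded in a single COPY via copy_s3 with manifest=True, as the docstring
# above describes.
def load_from_manifest_sketch(rs):
    rs.generate_manifest(['my-bucket-a', 'my-bucket-b'],
                         prefix='exports/2021-',
                         manifest_bucket='my-manifest-bucket',
                         manifest_key='manifests/exports.manifest')
    rs.copy_s3('schema.target_table',
               'my-manifest-bucket',
               'manifests/exports.manifest',
               manifest=True,
               if_exists='append')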
def copy_s3(self, table_name, bucket, key, manifest=False, data_type='csv',
            csv_delimiter=',', compression=None, if_exists='fail', max_errors=0,
            distkey=None, sortkey=None, padding=None, varchar_max=None,
            statupdate=True, compupdate=True, ignoreheader=1, acceptanydate=True,
            dateformat='auto', timeformat='auto', emptyasnull=True,
            blanksasnull=True, nullas=None, acceptinvchars=True,
            truncatecolumns=False, columntypes=None, specifycols=None,
            aws_access_key_id=None, aws_secret_access_key=None,
            bucket_region=None, strict_length=True, template_table=None):
    """
    Copy a file from s3 to Redshift.

    `Args:`
        table_name: str
            The table name and schema (``tmc.cool_table``) to copy the file into.
        bucket: str
            The s3 bucket where the file or manifest is located.
        key: str
            The key of the file or manifest in the s3 bucket.
        manifest: boolean
            If ``True``, treat ``key`` as a manifest file rather than a data file.
        data_type: str
            The data type of the file. Only ``csv`` supported currently.
        csv_delimiter: str
            The delimiter of the ``csv``. Only relevant if data_type is ``csv``.
        compression: str
            If specified (``gzip``), will attempt to decompress the file.
        if_exists: str
            If the table already exists, either ``fail``, ``append``, ``drop``
            or ``truncate`` the table.
        max_errors: int
            The maximum number of rows that can error and be skipped before the job fails.
        distkey: str
            The column name of the distkey
        sortkey: str
            The column name of the sortkey
        padding: float
            A percentage padding to add to varchar columns if creating a new table. This is
            helpful to add a buffer for future copies in which the data might be wider.
        varchar_max: list
            A list of columns in which to set the width of the varchar column to 65,535
            characters.
        statupdate: boolean
            Governs automatic computation and refresh of optimizer statistics at the end
            of a successful COPY command.
        compupdate: boolean
            Controls whether compression encodings are automatically applied during a COPY.
        ignoreheader: int
            The number of header rows to skip. Ignored if data_type is ``json``.
        acceptanydate: boolean
            Allows any date format, including invalid formats such as 00/00/00 00:00:00,
            to be loaded without generating an error.
        emptyasnull: boolean
            Indicates that Amazon Redshift should load empty char and varchar fields
            as ``NULL``.
        blanksasnull: boolean
            Loads blank varchar fields, which consist of only white space characters,
            as ``NULL``.
        nullas: str
            Loads fields that match string as NULL
        acceptinvchars: boolean
            Enables loading of data into VARCHAR columns even if the data contains
            invalid UTF-8 characters.
        dateformat: str
            Set the date format. Defaults to ``auto``.
        timeformat: str
            Set the time format. Defaults to ``auto``.
        truncatecolumns: boolean
            If the table already exists, truncates data in columns to the appropriate number
            of characters so that it fits the column specification. Applies only to columns
            with a VARCHAR or CHAR data type, and rows 4 MB or less in size.
        columntypes: dict
            Optional map of column name to redshift column type, overriding the usual type
            inference. You only specify the columns you want to override, eg.
            ``columntypes={'phone': 'varchar(12)', 'age': 'int'})``.
        specifycols: boolean
            Adds a column list to the Redshift `COPY` command, allowing for the source table
            in an append to have the columns out of order, and to have fewer columns with
            any leftover target table columns filled in with the `DEFAULT` value.

            This will fail if all of the source table's columns do not match a column in
            the target table. This will also fail if the target table has an `IDENTITY`
            column and that column name is among the source table's columns.
        aws_access_key_id:
            An AWS access key granted to the bucket where the file is located. Not required
            if keys are stored as environmental variables.
        aws_secret_access_key:
            An AWS secret access key granted to the bucket where the file is located. Not
            required if keys are stored as environmental variables.
        bucket_region: str
            The AWS region that the bucket is located in. This should be provided if the
            Redshift cluster is located in a different region from the temp bucket.
        strict_length: bool
            If the database table needs to be created, strict_length determines whether the
            created table's column sizes will be sized to exactly fit the current data, or
            if their size will be rounded up to account for future values being larger than
            the current dataset. Defaults to ``True``; this argument is ignored if
            ``padding`` is specified.
        template_table: str
            Instead of specifying columns, columntypes, and/or inference, if there is a
            pre-existing table that has the same columns/types, then use the template_table
            table name as the schema for the new table.

    `Returns`
        Parsons Table or ``None``
            See :ref:`parsons-table` for output options.
    """

    with self.connection() as connection:

        if self._create_table_precheck(connection, table_name, if_exists):

            if template_table:
                sql = f'CREATE TABLE {table_name} (LIKE {template_table})'
            else:
                # Grab the object from s3
                from parsons.aws.s3 import S3
                s3 = S3(aws_access_key_id=aws_access_key_id,
                        aws_secret_access_key=aws_secret_access_key)
                local_path = s3.get_file(bucket, key)

                if data_type == 'csv':
                    tbl = Table.from_csv(local_path, delimiter=csv_delimiter)
                else:
                    raise TypeError("Invalid data type provided")

                # Create the table
                sql = self.create_statement(tbl, table_name, padding=padding,
                                            distkey=distkey, sortkey=sortkey,
                                            varchar_max=varchar_max,
                                            columntypes=columntypes,
                                            strict_length=strict_length)

            self.query_with_connection(sql, connection, commit=False)
            logger.info(f'{table_name} created.')

        # Copy the table
        copy_sql = self.copy_statement(
            table_name, bucket, key, manifest=manifest, data_type=data_type,
            csv_delimiter=csv_delimiter, compression=compression, max_errors=max_errors,
            statupdate=statupdate, compupdate=compupdate,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            ignoreheader=ignoreheader, acceptanydate=acceptanydate,
            emptyasnull=emptyasnull, blanksasnull=blanksasnull, nullas=nullas,
            acceptinvchars=acceptinvchars, truncatecolumns=truncatecolumns,
            specifycols=specifycols, dateformat=dateformat, timeformat=timeformat,
            bucket_region=bucket_region)

        self.query_with_connection(copy_sql, connection, commit=False)
        logger.info(f'Data copied to {table_name}.')
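
# A minimal usage sketch for copy_s3, assuming a connector instance `rs` and
# illustrative bucket, key, table, and column names. Credentials fall back to the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables if not passed.
def copy_gzip_csv_sketch(rs):
    rs.copy_s3('analytics.contacts',
               'my-export-bucket',
               'exports/contacts.csv.gz',
               compression='gzip',
               if_exists='drop',
               distkey='contact_id',
               sortkey='created_at',
               columntypes={'phone': 'varchar(12)'})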
def __init__(self):

    self.s3 = S3()