def test_transfer_bucket(self):

    # Create a destination bucket
    # TODO maybe pull this from an env var as well
    destination_bucket = f"{self.test_bucket}-test"
    self.s3.create_bucket(destination_bucket)

    # Copy
    self.s3.transfer_bucket(self.test_bucket, self.test_key, destination_bucket)

    # Test that object made it
    path = self.s3.get_file(destination_bucket, self.test_key)
    result_tbl = Table.from_csv(path)
    assert_matching_tables(self.tbl, result_tbl)

    # Test that original still exists in original bucket
    self.assertTrue(self.s3.key_exists(self.test_bucket, self.test_key))

    # Transfer and delete original
    self.s3.transfer_bucket(
        self.test_bucket, self.test_key_2, destination_bucket,
        None, None, None, None, None, False, True)
    path_2 = self.s3.get_file(destination_bucket, self.test_key_2)
    result_tbl_2 = Table.from_csv(path_2)
    assert_matching_tables(self.tbl_2, result_tbl_2)
    self.assertFalse(self.s3.key_exists(self.test_bucket, self.test_key_2))
def test_get_url(self):

    # Test that you can download from URL
    url = self.s3.get_url(self.test_bucket, self.test_key)
    csv_table = Table.from_csv(url)
    assert_matching_tables(self.tbl, csv_table)

    # Test that the url expires
    url_short = self.s3.get_url(self.test_bucket, self.test_key, expires_in=1)
    time.sleep(2)
    with self.assertRaises(urllib.error.HTTPError) as cm:
        Table.from_csv(url_short)
    self.assertEqual(cm.exception.code, 403)
def test_to_csv_zip(self):

    try:
        # Test using the to_csv() method
        self.tbl.to_csv('myzip.zip')
        tmp = zip_archive.unzip_archive('myzip.zip')
        assert_matching_tables(self.tbl, Table.from_csv(tmp))

        # Test using the to_zip_csv() method
        self.tbl.to_zip_csv('myzip.zip')
        tmp = zip_archive.unzip_archive('myzip.zip')
        assert_matching_tables(self.tbl, Table.from_csv(tmp))

    finally:
        os.unlink('myzip.zip')
def test_put_and_get_file(self):

    # put_file is part of setup, so just testing getting it here
    path = self.s3.get_file(self.test_bucket, self.test_key)
    result_tbl = Table.from_csv(path)
    assert_matching_tables(self.tbl, result_tbl)
def get_table_by_file_id(self, file_id, format='csv') -> Table:
    """Get a table that has been saved to Box in csv or JSON format.

    `Args`:
        file_id: str
            The Box file_id of the table to be retrieved.
        format: str
            Format in which the Table has been saved; for now, only 'csv' or 'json'.

    `Returns`:
        Table
            A Parsons Table.
    """
    if format not in self.ALLOWED_FILE_FORMATS:
        raise ValueError(
            f'Format argument to get_table_by_file_id() must be one '
            f'of {self.ALLOWED_FILE_FORMATS}; found "{format}"')

    # Temp file will be around as long as enclosing process is running,
    # which we need, because the Table we return will continue to use it.
    output_file_name = create_temp_file()
    with open(output_file_name, 'wb') as output_file:
        self.client.file(file_id).download_to(output_file)

    if format == 'csv':
        return Table.from_csv(output_file_name)
    elif format == 'json':
        return Table.from_json(output_file_name)
    else:
        raise SystemError(f'Got (theoretically) impossible '
                          f'format option "{format}"')  # pragma: no cover
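# A minimal usage sketch, assuming a configured Box connector instance with
# credentials in the environment; the file id and variable names below are
# illustrative placeholders, not values from this codebase.
def example_fetch_box_table(box, file_id='1234567890'):
    # Download the stored table as a Parsons Table and report its size.
    tbl = box.get_table_by_file_id(file_id, format='csv')
    return tbl.num_rows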
def test_append_csv_compressed(self):

    path = self.tbl.to_csv(temp_file_compression='gzip')
    append_tbl = Table([{'first': 'Mary', 'last': 'Nichols'}])
    append_tbl.append_csv(path)

    result_tbl = Table.from_csv(path)
    # Combine tables, so we can check the resulting file
    self.tbl.concat(append_tbl)
    assert_matching_tables(self.tbl, result_tbl)
def get_changed_entities(self, resource_type, date_from, date_to=None,
                         include_inactive=False, requested_fields=None,
                         custom_fields=None):
    """
    Get modified records from VAN from up to 90 days in the past.

    `Args:`
        resource_type: str
            The type of resource to export. Use the
            :py:meth:`~parsons.ngpvan.changed_entities.ChangedEntities.get_changed_entity_resources`
            method to get a list of potential entities.
        date_from: str
            The start date of the search. Must be no more than 90 days in the past
            and must be an ``iso8601`` formatted date (``2021-10-11``).
        date_to: str
            The end date of the search. Must be no more than 90 days in the past
            and must be an ``iso8601`` formatted date (``2021-10-11``).
        include_inactive: boolean
            Include inactive records.
        requested_fields: list
            A list of optional requested fields to include. These options can be
            accessed through the
            :py:meth:`~parsons.ngpvan.changed_entities.ChangedEntities.get_changed_entity_resource_fields`
            method.
        custom_fields: list
            A list of ids of custom fields to include in the export.
    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """  # noqa: E501

    json = {
        "dateChangedFrom": date_from,
        "dateChangedTo": date_to,
        "resourceType": resource_type,
        "requestedFields": requested_fields,
        "requestedCustomFieldIds": custom_fields,
        "fileSizeKbLimit": 100000,
        "includeInactive": include_inactive
    }

    r = self.connection.post_request('changedEntityExportJobs', json=json)

    while True:
        status = self._get_changed_entity_job(r['exportJobId'])
        if status['jobStatus'] in ['Pending', 'InProcess']:
            logger.info('Waiting on export file.')
            time.sleep(RETRY_RATE)
        elif status['jobStatus'] == 'Complete':
            return Table.from_csv(status['files'][0]['downloadUrl'])
        else:
            raise ValueError(status['message'])
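# A minimal usage sketch, assuming a configured VAN connector with an API key in
# the environment; the resource type and date window below are illustrative
# placeholders, not values taken from this codebase.
def example_changed_contacts(van):
    # Pull records of the chosen resource type modified within the window.
    changed = van.get_changed_entities('Contacts', '2021-10-01', date_to='2021-10-11')
    return changed.num_rows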
def download_saved_list(self, saved_list_id):
    """
    Download the vanids associated with a saved list.

    `Args:`
        saved_list_id: int
            The saved list id.
    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    ej = ExportJobs(self.connection)
    job = ej.export_job_create(saved_list_id)

    if isinstance(job, tuple):
        return job
    else:
        return Table.from_csv(job['downloadUrl'])
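# A minimal usage sketch, assuming a configured VAN connector; the saved list id
# below is a placeholder.
def example_download_saved_list(van, saved_list_id=12345):
    # Fetch the vanids in the saved list as a Parsons Table.
    return van.download_saved_list(saved_list_id)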
def get_bulk_import_job_results(self, job_id):
    """
    Get the result file of a bulk import job. The results include one row per
    record processed, along with the status of each. If the job results have not
    been generated, whether because of an error in the process or because the job
    is still processing, this method returns ``None``.

    `Args:`
        job_id: int
            The bulk import job id.
    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    r = self.get_bulk_import_job(job_id)
    logger.info(f"Bulk Import Job Status: {r['status']}")
    if r['status'] == 'Completed':
        return Table.from_csv(r['resultFiles'][0]['url'])

    return None
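# A minimal usage sketch, assuming a configured VAN connector and a job id
# returned by an earlier bulk import call; the id is a placeholder. A ``None``
# return means the results are not ready yet (or the job errored), so callers
# may want to poll with a timeout.
def example_check_bulk_import(van, job_id=54321):
    results = van.get_bulk_import_job_results(job_id)
    if results is not None:
        return results.num_rows
    return None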
def copy_s3(self, table_name, bucket, key, manifest=False, data_type='csv',
            csv_delimiter=',', compression=None, if_exists='fail', max_errors=0,
            distkey=None, sortkey=None, padding=None, varchar_max=None,
            statupdate=True, compupdate=True, ignoreheader=1, acceptanydate=True,
            dateformat='auto', timeformat='auto', emptyasnull=True,
            blanksasnull=True, nullas=None, acceptinvchars=True,
            truncatecolumns=False, columntypes=None, specifycols=None,
            aws_access_key_id=None, aws_secret_access_key=None,
            bucket_region=None, strict_length=True, template_table=None):
    """
    Copy a file from s3 to Redshift.

    `Args:`
        table_name: str
            The schema and table name (``tmc.cool_table``) to copy the file into.
        bucket: str
            The s3 bucket where the file or manifest is located.
        key: str
            The key of the file or manifest in the s3 bucket.
        manifest: boolean
            If True, the key points to a manifest file rather than a single data file.
        data_type: str
            The data type of the file. Only ``csv`` is supported currently.
        csv_delimiter: str
            The delimiter of the ``csv``. Only relevant if data_type is ``csv``.
        compression: str
            If specified (``gzip``), will attempt to decompress the file.
        if_exists: str
            If the table already exists, either ``fail``, ``append``, ``drop``
            or ``truncate`` the table.
        max_errors: int
            The maximum number of rows that can error and be skipped before the
            job fails.
        distkey: str
            The column name of the distkey.
        sortkey: str
            The column name of the sortkey.
        padding: float
            A percentage padding to add to varchar columns if creating a new table.
            This is helpful to add a buffer for future copies in which the data
            might be wider.
        varchar_max: list
            A list of columns in which to set the width of the varchar column to
            65,535 characters.
        statupdate: boolean
            Governs automatic computation and refresh of optimizer statistics at
            the end of a successful COPY command.
        compupdate: boolean
            Controls whether compression encodings are automatically applied
            during a COPY.
        ignoreheader: int
            The number of header rows to skip. Ignored if data_type is ``json``.
        acceptanydate: boolean
            Allows any date format, including invalid formats such as
            00/00/00 00:00:00, to be loaded without generating an error.
        emptyasnull: boolean
            Indicates that Amazon Redshift should load empty char and varchar
            fields as ``NULL``.
        blanksasnull: boolean
            Loads blank varchar fields, which consist of only white space
            characters, as ``NULL``.
        nullas: str
            Loads fields that match this string as ``NULL``.
        acceptinvchars: boolean
            Enables loading of data into VARCHAR columns even if the data contains
            invalid UTF-8 characters.
        dateformat: str
            Set the date format. Defaults to ``auto``.
        timeformat: str
            Set the time format. Defaults to ``auto``.
        truncatecolumns: boolean
            If the table already exists, truncates data in columns to the
            appropriate number of characters so that it fits the column
            specification. Applies only to columns with a VARCHAR or CHAR data
            type, and rows 4 MB or less in size.
        columntypes: dict
            Optional map of column name to redshift column type, overriding the
            usual type inference. You only specify the columns you want to
            override, e.g. ``columntypes={'phone': 'varchar(12)', 'age': 'int'}``.
        specifycols: boolean
            Adds a column list to the Redshift `COPY` command, allowing the source
            table in an append to have its columns out of order, and to have fewer
            columns, with any leftover target table columns filled in with the
            `DEFAULT` value. This will fail if any of the source table's columns
            do not match a column in the target table. This will also fail if the
            target table has an `IDENTITY` column and that column name is among
            the source table's columns.
        aws_access_key_id:
            An AWS access key granted to the bucket where the file is located.
            Not required if keys are stored as environmental variables.
        aws_secret_access_key:
            An AWS secret access key granted to the bucket where the file is
            located. Not required if keys are stored as environmental variables.
        bucket_region: str
            The AWS region that the bucket is located in. This should be provided
            if the Redshift cluster is located in a different region from the
            temp bucket.
        strict_length: bool
            If the database table needs to be created, strict_length determines
            whether the created table's column sizes will be sized to exactly fit
            the current data, or if their size will be rounded up to account for
            future values being larger than the current dataset. Defaults to
            ``True``; this argument is ignored if ``padding`` is specified.
        template_table: str
            Instead of specifying columns, columntypes, and/or inference, if there
            is a pre-existing table that has the same columns/types, then use the
            template_table table name as the schema for the new table.
    `Returns:`
        Parsons Table or ``None``
            See :ref:`parsons-table` for output options.
    """

    with self.connection() as connection:

        if self._create_table_precheck(connection, table_name, if_exists):
            if template_table:
                sql = f'CREATE TABLE {table_name} (LIKE {template_table})'
            else:
                # Grab the object from s3
                from parsons.aws.s3 import S3
                s3 = S3(aws_access_key_id=aws_access_key_id,
                        aws_secret_access_key=aws_secret_access_key)
                local_path = s3.get_file(bucket, key)
                if data_type == 'csv':
                    tbl = Table.from_csv(local_path, delimiter=csv_delimiter)
                else:
                    raise TypeError("Invalid data type provided")

                # Create the table
                sql = self.create_statement(tbl, table_name, padding=padding,
                                            distkey=distkey, sortkey=sortkey,
                                            varchar_max=varchar_max,
                                            columntypes=columntypes,
                                            strict_length=strict_length)

            self.query_with_connection(sql, connection, commit=False)
            logger.info(f'{table_name} created.')

        # Copy the table
        copy_sql = self.copy_statement(
            table_name, bucket, key, manifest=manifest, data_type=data_type,
            csv_delimiter=csv_delimiter, compression=compression,
            max_errors=max_errors, statupdate=statupdate, compupdate=compupdate,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            ignoreheader=ignoreheader, acceptanydate=acceptanydate,
            emptyasnull=emptyasnull, blanksasnull=blanksasnull, nullas=nullas,
            acceptinvchars=acceptinvchars, truncatecolumns=truncatecolumns,
            specifycols=specifycols, dateformat=dateformat, timeformat=timeformat,
            bucket_region=bucket_region)

        self.query_with_connection(copy_sql, connection, commit=False)
        logger.info(f'Data copied to {table_name}.')
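# A minimal usage sketch, assuming a configured Redshift connector and an
# existing CSV in S3 that the connector's credentials can read; the schema,
# table, bucket, and key names below are placeholders.
def example_copy_s3(rs):
    # Load the S3 object into Redshift, recreating the table if it exists and
    # skipping the single header row.
    rs.copy_s3('tmc.cool_table', 'my-temp-bucket', 'exports/cool_table.csv',
               data_type='csv', if_exists='drop', ignoreheader=1)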
def _assert_expected_csv(self, path, orig_tbl):
    result_tbl = Table.from_csv(path)
    assert_matching_tables(orig_tbl, result_tbl)
def match(self, table, job_type, job_name=None, emails=None, call_back=None,
          remove_files=True):
    """
    Match a table to TargetSmart using their bulk matching service.

    .. warning::
        **Table Columns**
        The automation job validates the file by column index rather than by
        column name. If it expects 10 columns and you only provide 9, it will
        fail. However, if you provide 10 columns that are out of order, the job
        will succeed, but the records will not match.

    Args:
        table: Parsons Table Object
            A table object with the required columns. (Required columns are
            provided by TargetSmart.)
        job_type: str
            The match job type. **This is case sensitive.** (Match job names are
            provided by TargetSmart.)
        job_name: str
            Optional job name.
        emails: list
            A list of emails that will receive status notifications. This is
            useful for debugging failed jobs.
        call_back: str
            A callback url to which the status will be posted. See
            `TargetSmart documentation <https://docs.targetsmart.com/developers/automation/index.html#http-callback>`_
            for more details.
        remove_files: boolean
            Remove the configuration, the file to be matched, and the matched
            file from the TargetSmart FTP upon completion or failure of the match.
    """  # noqa: E501,E261

    # Generate a match job
    job_name = job_name or str(uuid.uuid1())

    try:
        # Upload table
        self.sftp.put_file(table.to_csv(), f'{self.sftp_dir}/{job_name}_input.csv')
        logger.info(f'Table with {table.num_rows} rows uploaded to TargetSmart.')

        # Create/upload XML configuration
        xml = self.create_job_xml(job_type, job_name, emails=emails,
                                  status_key=job_name, call_back=call_back)
        self.sftp.put_file(xml, f'{self.sftp_dir}/{job_name}.job.xml')
        logger.info('Match configuration uploaded to TargetSmart.')

        # Check xml configuration status
        self.poll_config_status(job_name)

        # Check the status of the match
        self.match_status(job_name)

        # Download the resulting file
        tbl = Table.from_csv(self.sftp.get_file(f'{self.sftp_dir}/{job_name}_output.csv'))

    finally:
        # Clean up files
        if remove_files:
            self.remove_files(job_name)

        # Log Stats
        # TO DO: Provide some stats on the match

    # Return file as a Table
    return tbl
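# A minimal usage sketch, assuming a configured TargetSmart automation connector
# and a Parsons Table whose columns are already in the order the chosen job type
# expects; the job type string and email address below are placeholders provided
# for illustration only.
def example_targetsmart_match(ts, tbl):
    # Run the bulk match and return the matched file as a Parsons Table.
    matched = ts.match(tbl, 'job_type_provided_by_targetsmart',
                       emails=['analyst@example.org'])
    return matched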