def to_csv(self, local_path=None, temp_file_compression=None, encoding=None, errors='strict',
           write_header=True, csv_name=None, **csvargs):
    """
    Outputs table to a CSV. Additional keyword arguments are passed to ``csv.writer()``. So,
    e.g., to override the delimiter from the default CSV dialect, provide the delimiter
    keyword argument.

    .. warning::
        If a file already exists at the given location, it will be overwritten.

    `Args:`
        local_path: str
            The path to write the csv locally. If it ends in ".gz" or ".zip", the file will
            be compressed. If not specified, a temporary file will be created and returned,
            and that file will be removed automatically when the script is done running.
        temp_file_compression: str
            If a temp file is requested (ie. no ``local_path`` is specified), the compression
            type for that file. Currently "None", "gzip" or "zip" are supported. If a
            ``local_path`` is specified, this argument is ignored.
        encoding: str
            The CSV encoding type for `csv.writer()
            <https://docs.python.org/2/library/csv.html#csv.writer/>`_
        errors: str
            How encoding errors are handled; defaults to ``'strict'``, which raises an error.
        write_header: boolean
            Include header in output
        csv_name: str
            If ``zip`` compression (either specified or inferred), the name of csv file
            within the archive.
        \**csvargs: kwargs
            ``csv_writer`` optional arguments

    `Returns:`
        str
            The path of the new file
    """  # noqa: W605

    # If a zip archive.
    if files.zip_check(local_path, temp_file_compression):
        return self.to_zip_csv(archive_path=local_path,
                               encoding=encoding,
                               errors=errors,
                               write_header=write_header,
                               csv_name=csv_name,
                               **csvargs)

    if not local_path:
        suffix = '.csv' + files.suffix_for_compression_type(temp_file_compression)
        local_path = files.create_temp_file(suffix=suffix)

    # Create normal csv/.gzip
    petl.tocsv(self.table,
               source=local_path,
               encoding=encoding,
               errors=errors,
               write_header=write_header,
               **csvargs)

    return local_path
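# A minimal usage sketch (not part of the source; sample data and paths are assumptions):
# writing a Parsons Table to CSV, with a custom delimiter forwarded to csv.writer().
from parsons import Table

tbl = Table([{'name': 'Ada', 'votes': 3}, {'name': 'Grace', 'votes': 5}])
path = tbl.to_csv('people.tsv', delimiter='\t')        # explicit path; delimiter passed through
temp_path = tbl.to_csv(temp_file_compression='gzip')   # no path: gzipped temp file is returned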
def get_table_by_file_id(self, file_id, format='csv') -> Table:
    """Get a table that has been saved to Box in csv or JSON format.

    `Args`:
        file_id: str
            The Box file_id of the table to be retrieved.
        format: str
            Format in which the Table has been saved; for now, only 'csv' or 'json'.

    `Returns`:
        Table
            A Parsons Table.
    """
    if format not in self.ALLOWED_FILE_FORMATS:
        raise ValueError(
            f'Format argument to get_table_by_file_id() must be one '
            f'of {self.ALLOWED_FILE_FORMATS}; found "{format}"')

    # Temp file will be around as long as enclosing process is running,
    # which we need, because the Table we return will continue to use it.
    output_file_name = create_temp_file()
    with open(output_file_name, 'wb') as output_file:
        self.client.file(file_id).download_to(output_file)

    if format == 'csv':
        return Table.from_csv(output_file_name)
    elif format == 'json':
        return Table.from_json(output_file_name)
    else:
        raise SystemError(f'Got (theoretically) impossible '
                          f'format option "{format}"')  # pragma: no cover
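# A hypothetical usage sketch: fetching a table previously saved to Box. The Box() constructor
# (credentials assumed to come from environment variables) and the file id are assumptions,
# not from the source.
from parsons import Box

box = Box()
tbl = box.get_table_by_file_id('1234567890', format='csv')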
def test_close_temp_file():
    temp = files.create_temp_file()
    files.close_temp_file(temp)

    # Verify the temp file no longer exists
    with pytest.raises(FileNotFoundError):
        open(temp, 'r')
def create_job_xml(self, job_type, job_name, emails=None, status_key=None, call_back=None):
    # Internal method to create a valid job xml

    job = ET.Element("job")

    # Generate base XML
    input_file = ET.SubElement(job, 'inputfile')
    input_file.text = job_name + '_input.csv'
    output_file = ET.SubElement(job, 'outputfile')
    output_file.text = job_name + '_output.csv'
    jobtype = ET.SubElement(job, 'jobtype')
    jobtype.text = job_type

    # Add status key
    args = ET.SubElement(job, "args")
    statuskey = ET.SubElement(args, "arg", name="__status_key")
    statuskey.text = status_key or job_name

    # Optional args
    if call_back:
        callback = ET.SubElement(args, "arg", name="__http_callback")
        callback.text = call_back

    if emails:
        emails_el = ET.SubElement(args, "arg", name="__emails")
        emails_el.text = ','.join(emails)

    # Write xml to file object
    local_path = create_temp_file(suffix='.xml')
    tree = ET.ElementTree(job)
    tree.write(local_path)

    return local_path
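# Illustrative sketch (the job type and name are assumptions): for create_job_xml('gender',
# 'voters') with no optional arguments, the temp file would contain XML shaped roughly like:
#
#   <job>
#     <inputfile>voters_input.csv</inputfile>
#     <outputfile>voters_output.csv</outputfile>
#     <jobtype>gender</jobtype>
#     <args>
#       <arg name="__status_key">voters</arg>
#     </args>
#   </job>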
def materialize_to_file(self, file_path=None):
    """
    "Materializes" a Table, meaning all pending transformations are applied.

    Unlike the original materialize function, this method does not bring the data into
    memory, but instead loads the data into a local temp file.

    This method updates the current table in place.

    `Args:`
        file_path: str
            The path to the file to materialize the table to; if not specified, a temp file
            will be created.
    `Returns:`
        str
            Path to the temp file that now contains the table
    """

    # Load the data in batches, and "pickle" the rows to a temp file.
    # (We pickle rather than writing to, say, a CSV, so that we maintain
    # all the type information for each field.)
    file_path = file_path or files.create_temp_file()
    with open(file_path, 'wb') as handle:
        for row in self.table:
            pickle.dump(list(row), handle)

    # Load a Table from the file
    self.table = petl.frompickle(file_path)

    return file_path
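# A minimal usage sketch (file name and column name are assumptions): running pending, lazily
# evaluated transformations once and caching the result on disk so later reads don't recompute.
tbl = Table.from_csv('big_file.csv')
tbl.remove_column('unused_column')       # a pending transformation
cache_path = tbl.materialize_to_file()   # transformation applied; rows pickled to a temp file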
def to_zip_csv(self, archive_path=None, csv_name=None, encoding=None, errors='strict',
               write_header=True, if_exists='replace', **csvargs):
    """
    Outputs table to a CSV in a zip archive. Additional keyword arguments are passed to
    ``csv.writer()``. So, e.g., to override the delimiter from the default CSV dialect,
    provide the delimiter keyword argument. Use this method if you would like to write
    multiple csv files to the same archive.

    .. warning::
        If a file already exists in the archive, it will be overwritten.

    `Args:`
        archive_path: str
            The path to the zip archive. If not specified, a temporary file will be created
            and returned, and that file will be removed automatically when the script is done
            running.
        csv_name: str
            The name of the csv file to be stored in the archive. If ``None``, will use the
            archive name.
        encoding: str
            The CSV encoding type for `csv.writer()
            <https://docs.python.org/2/library/csv.html#csv.writer/>`_
        errors: str
            How encoding errors are handled; defaults to ``'strict'``, which raises an error.
        write_header: boolean
            Include header in output
        if_exists: str
            If archive already exists, one of 'replace' or 'append'
        \**csvargs: kwargs
            ``csv_writer`` optional arguments

    `Returns:`
        str
            The path of the archive
    """  # noqa: W605

    if not archive_path:
        archive_path = files.create_temp_file(suffix='.zip')

    cf = self.to_csv(encoding=encoding, errors=errors, write_header=write_header, **csvargs)

    if not csv_name:
        csv_name = files.extract_file_name(archive_path, include_suffix=False) + '.csv'

    return zip_archive.create_archive(archive_path, cf, file_name=csv_name, if_exists=if_exists)
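# A minimal usage sketch (table variables and file names are assumptions): writing two tables
# into the same zip archive by appending the second csv instead of replacing the archive.
people_tbl.to_zip_csv(archive_path='export.zip', csv_name='people.csv')
events_tbl.to_zip_csv(archive_path='export.zip', csv_name='events.csv', if_exists='append')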
def to_json(self, local_path=None, temp_file_compression=None, line_delimited=False):
    """
    Outputs table to a JSON file

    .. warning::
        If a file already exists at the given location, it will be overwritten.

    `Args:`
        local_path: str
            The path to write the JSON locally. If it ends in ".gz", it will be compressed
            first. If not specified, a temporary file will be created and returned, and that
            file will be removed automatically when the script is done running.
        temp_file_compression: str
            If a temp file is requested (ie. no ``local_path`` is specified), the compression
            type for that file. Currently "None" and "gzip" are supported. If a ``local_path``
            is specified, this argument is ignored.
        line_delimited: bool
            Whether the file will be line-delimited JSON (with a row on each line), or a
            proper JSON file.

    `Returns:`
        str
            The path of the new file
    """

    if not local_path:
        suffix = '.json' + files.suffix_for_compression_type(temp_file_compression)
        local_path = files.create_temp_file(suffix=suffix)

    # Note we don't use the much simpler petl.tojson(), since that method reads the whole
    # table into memory before writing to file.

    if files.is_gzip_path(local_path):
        open_fn = gzip.open
        mode = 'w+t'
    else:
        open_fn = open
        mode = 'w'

    with open_fn(local_path, mode) as file:
        if not line_delimited:
            file.write('[')

        i = 0
        for row in self:
            if i:
                if not line_delimited:
                    file.write(',')
                file.write('\n')
            i += 1
            json.dump(row, file)

        if not line_delimited:
            file.write(']')

    return local_path
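# A minimal usage sketch (table variable and paths are assumptions): a standard JSON array,
# line-delimited JSON, and a gzipped temp file.
tbl.to_json('rows.json')                                # single JSON array
tbl.to_json('rows.jsonl', line_delimited=True)          # one JSON object per line
temp_path = tbl.to_json(temp_file_compression='gzip')   # gzipped temp file, path returned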
def to_html(self, local_path=None, encoding=None, errors='strict', index_header=False,
            caption=None, tr_style=None, td_styles=None, truncate=None):
    """
    Outputs table to html.

    .. warning::
        If a file already exists at the given location, it will be overwritten.

    `Args:`
        local_path: str
            The path to write the html locally. If not specified, a temporary file will be
            created and returned, and that file will be removed automatically when the script
            is done running.
        encoding: str
            The encoding type for `csv.writer()
            <https://docs.python.org/2/library/csv.html#csv.writer/>`_
        errors: str
            How encoding errors are handled; defaults to ``'strict'``, which raises an error.
        index_header: boolean
            Prepend index to column names; defaults to False.
        caption: str
            A caption to include with the html table.
        tr_style: str or callable
            Style to be applied to the table row.
        td_styles: str, dict or callable
            Styles to be applied to the table cells.
        truncate: int
            Maximum length of cell data; longer values are truncated.

    `Returns:`
        str
            The path of the new file
    """

    if not local_path:
        local_path = files.create_temp_file(suffix=".html")

    petl.tohtml(self.table,
                source=local_path,
                encoding=encoding,
                errors=errors,
                caption=caption,
                index_header=index_header,
                tr_style=tr_style,
                td_styles=td_styles,
                truncate=truncate)

    return local_path
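# A minimal usage sketch (table variable, path, and caption are assumptions): rendering the
# table to an HTML file with a caption and cells truncated to 40 characters.
tbl.to_html('report.html', caption='Weekly signups', truncate=40)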
def query(self, sql):
    """
    Run a BigQuery query and return the results as a Parsons table.

    `Args:`
        sql: str
            A valid BigQuery statement

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    # Run the query
    query_job = self.client.query(sql)

    # We will use a temp file to cache the results so that they are not all living
    # in memory. We'll use pickle to serialize the results to file in order to maintain
    # the proper data types (e.g. integer).
    temp_filename = create_temp_file()

    wrote_header = False
    with open(temp_filename, 'wb') as temp_file:
        results = query_job.result()

        # If there are no results, just return None
        if results.total_rows == 0:
            return None

        for row in results:
            # Make sure we write out the header once and only once
            if not wrote_header:
                wrote_header = True
                header = list(row.keys())
                pickle.dump(header, temp_file)

            row_data = list(row.values())
            pickle.dump(row_data, temp_file)

    ptable = petl.frompickle(temp_filename)
    final_table = Table(ptable)

    return final_table
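# A hypothetical usage sketch (connector instance, dataset, and SQL are assumptions): queries
# that return zero rows yield None, so check before using the table.
tbl = bigquery.query("SELECT name, votes FROM `my_project.my_dataset.people`")
if tbl is not None:
    tbl.to_csv('people.csv')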
def test_get_table(live_sftp, simple_table):
    local_path = files.create_temp_file()
    tbl = live_sftp.get_table(REMOTE_CSV_PATH)
    assert_matching_tables(tbl, simple_table)
def generate_manifest(self, buckets, aws_access_key_id=None, aws_secret_access_key=None,
                      mandatory=True, prefix=None, manifest_bucket=None, manifest_key=None,
                      path=None):
    """
    Given a list of S3 buckets, generate a manifest file (JSON format). A manifest file
    allows you to copy multiple files into a single table at once. Once the manifest is
    generated, you can pass it with the :func:`~parsons.redshift.Redshift.copy_s3` method.

    AWS keys are not required if the ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY``
    environment variables are set.

    `Args:`

        buckets: list or str
            A list of buckets or single bucket from which to generate manifest
        aws_access_key_id: str
            AWS access key id to access S3 bucket
        aws_secret_access_key: str
            AWS secret access key to access S3 bucket
        mandatory: boolean
            The mandatory flag indicates whether the Redshift COPY should terminate if
            the file does not exist.
        prefix: str
            Optional filter for key prefixes
        manifest_bucket: str
            Optional bucket to write manifest file.
        manifest_key: str
            Optional key name for S3 bucket to write file

    `Returns:`
        ``dict`` of manifest
    """

    from parsons.aws import S3
    s3 = S3(aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key)

    # Deal with a single bucket being passed, rather than list.
    if isinstance(buckets, str):
        buckets = [buckets]

    # Generate manifest file
    manifest = {'entries': []}
    for bucket in buckets:

        # Retrieve list of files in bucket
        key_list = s3.list_keys(bucket, prefix=prefix)

        for key in key_list:
            manifest['entries'].append({
                'url': '/'.join(['s3:/', bucket, key]),
                'mandatory': mandatory
            })

    logger.info('Manifest generated.')

    # Save the file to s3 bucket if provided
    if manifest_key and manifest_bucket:
        # Dump the manifest to a temp JSON file
        manifest_path = files.create_temp_file()
        with open(manifest_path, 'w') as manifest_file_obj:
            json.dump(manifest, manifest_file_obj, sort_keys=True, indent=4)

        # Upload the file to S3
        s3.put_file(manifest_bucket, manifest_key, manifest_path)

        logger.info(f'Manifest saved to s3://{manifest_bucket}/{manifest_key}')

    return manifest
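# An illustrative sketch (the Redshift instance `rs`, bucket, and key names are assumptions).
# The returned dict has the manifest shape Redshift COPY expects, e.g.:
#
#   {'entries': [{'url': 's3://my-bucket/export/part-000.csv', 'mandatory': True},
#                {'url': 's3://my-bucket/export/part-001.csv', 'mandatory': True}]}
manifest = rs.generate_manifest('my-bucket', prefix='export/',
                                manifest_bucket='my-bucket',
                                manifest_key='export/manifest.json')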
def test_get_table(live_sftp, simple_table):  # noqa F811
    file_util.create_temp_file()
    tbl = live_sftp.get_table(CSV_PATH)
    assert_matching_tables(tbl, simple_table)
def query_with_connection(self, sql, connection, parameters=None, commit=True):
    """
    Execute a query against the database, with an existing connection. Useful for batching
    queries together. Will return ``None`` if the query returns zero rows.

    `Args:`
        sql: str
            A valid SQL statement
        connection: obj
            A connection object obtained from ``mysql.connection()``
        parameters: list
            A list of python variables to be converted into SQL values in your query
        commit: boolean
            Whether to commit the transaction immediately. If ``False`` the transaction will
            be committed when the connection goes out of scope and is closed (or you can
            commit manually with ``connection.commit()``).

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    with self.cursor(connection) as cursor:

        # The python connector can only execute a single sql statement, so we will
        # break up each statement and execute them separately.
        for s in sql.strip().split(';'):
            if len(s) != 0:
                logger.debug(f'SQL Query: {sql}')
                cursor.execute(s, parameters)

        if commit:
            connection.commit()

        # If the SQL query provides no response, then return None
        if not cursor.description:
            logger.debug('Query returned 0 rows')
            return None

        else:
            # Fetch the data in batches, and "pickle" the rows to a temp file.
            # (We pickle rather than writing to, say, a CSV, so that we maintain
            # all the type information for each field.)
            temp_file = files.create_temp_file()

            with open(temp_file, 'wb') as f:
                # Grab the header
                pickle.dump(cursor.column_names, f)

                while True:
                    batch = cursor.fetchmany(QUERY_BATCH_SIZE)
                    if len(batch) == 0:
                        break

                    logger.debug(f'Fetched {len(batch)} rows.')
                    for row in batch:
                        pickle.dump(row, f)

            # Load a Table from the file
            final_tbl = Table(petl.frompickle(temp_file))

            logger.debug(f'Query returned {final_tbl.num_rows} rows.')
            return final_tbl
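# A hypothetical batching sketch (the connector instance, use of connection() as a context
# manager, table names, and SQL are assumptions): both statements share one connection and
# commit together when the connection closes.
with mysql.connection() as connection:
    mysql.query_with_connection(
        "INSERT INTO votes_staging SELECT * FROM votes_new", connection, commit=False)
    tbl = mysql.query_with_connection(
        "SELECT COUNT(*) AS n FROM votes_staging", connection, commit=False)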
def query(self, sql, parameters=None):
    """
    Run a BigQuery query and return the results as a Parsons table.

    To include python variables in your query, it is recommended to pass them as parameters,
    using the DB-API placeholder style shown below; the BigQuery client converts these into
    BigQuery's ``@``-prefixed query parameters. Using the ``parameters`` argument ensures
    that values are escaped properly, and avoids SQL injection attacks.

    **Parameter Examples**

    .. code-block:: python

        name = "Beatrice O'Brady"
        sql = 'SELECT * FROM my_table WHERE name = %s'
        bigquery.query(sql, parameters=[name])

    .. code-block:: python

        name = "Beatrice O'Brady"
        sql = "SELECT * FROM my_table WHERE name = %(name)s"
        bigquery.query(sql, parameters={'name': name})

    `Args:`
        sql: str
            A valid BigQuery statement
        parameters: dict or list
            Query parameters for BigQuery, as a dictionary or list matching the placeholders
            used in the statement.

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """
    # get our connection and cursor
    cursor = self._dbapi.connect(self.client).cursor()

    # Run the query
    cursor.execute(sql, parameters)

    # We will use a temp file to cache the results so that they are not all living
    # in memory. We'll use pickle to serialize the results to file in order to maintain
    # the proper data types (e.g. integer).
    temp_filename = create_temp_file()

    wrote_header = False
    with open(temp_filename, 'wb') as temp_file:
        # Track whether we got data, since if we don't get any results we need to return None
        got_results = False

        while True:
            batch = cursor.fetchmany(QUERY_BATCH_SIZE)
            if len(batch) == 0:
                break

            got_results = True

            for row in batch:
                # Make sure we write out the header once and only once
                if not wrote_header:
                    wrote_header = True
                    header = list(row.keys())
                    pickle.dump(header, temp_file)

                row_data = list(row.values())
                pickle.dump(row_data, temp_file)

    if not got_results:
        return None

    ptable = petl.frompickle(temp_filename)
    final_table = Table(ptable)

    return final_table
def query_with_connection(self, sql, connection, parameters=None, commit=True):
    """
    Execute a query against the Redshift database, with an existing connection. Useful for
    batching queries together. Will return ``None`` if the query returns zero rows.

    `Args:`
        sql: str
            A valid SQL statement
        connection: obj
            A connection object obtained from ``redshift.connection()``
        parameters: list
            A list of python variables to be converted into SQL values in your query
        commit: boolean
            Whether to commit the transaction immediately. If ``False`` the transaction will
            be committed when the connection goes out of scope and is closed (or you can
            commit manually with ``connection.commit()``).

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    # To Do: Have it return an ordered dict to return the
    # rows in the correct order

    with self.cursor(connection) as cursor:

        if 'credentials' not in sql:
            logger.debug(f'SQL Query: {sql}')
        cursor.execute(sql, parameters)

        if commit:
            connection.commit()

        # If the cursor is empty, don't cause an error
        if not cursor.description:
            logger.debug('Query returned 0 rows')
            return None

        else:
            # Fetch the data in batches, and "pickle" the rows to a temp file.
            # (We pickle rather than writing to, say, a CSV, so that we maintain
            # all the type information for each field.)
            temp_file = files.create_temp_file()

            with open(temp_file, 'wb') as f:
                # Grab the header
                header = [i[0] for i in cursor.description]
                pickle.dump(header, f)

                while True:
                    batch = cursor.fetchmany(QUERY_BATCH_SIZE)
                    if not batch:
                        break

                    logger.debug(f'Fetched {len(batch)} rows.')
                    for row in batch:
                        pickle.dump(list(row), f)

            # Load a Table from the file
            final_tbl = Table(petl.frompickle(temp_file))

            logger.debug(f'Query returned {final_tbl.num_rows} rows.')
            return final_tbl
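# A hypothetical batching sketch (the connector instance, use of connection() as a context
# manager, table names, and SQL are assumptions): the temp table created by the first call is
# still visible to the second because both calls share one connection.
with rs.connection() as connection:
    rs.query_with_connection(
        "CREATE TEMP TABLE recent_votes AS SELECT * FROM votes WHERE year = 2020",
        connection, commit=False)
    tbl = rs.query_with_connection("SELECT COUNT(*) FROM recent_votes", connection)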
def test_get_file(live_sftp, simple_table):
    local_path = files.create_temp_file()
    live_sftp.get_file(REMOTE_CSV_PATH, local_path=local_path)
    assert_file_matches_table(local_path, simple_table)
def test_get_file(live_sftp, simple_table):  # noqa F811
    local_path = file_util.create_temp_file()
    live_sftp.get_file(CSV_PATH, local_path=local_path)
    assert_file_matches_table(local_path, simple_table)