Example #1
    def to_csv(self, local_path=None, temp_file_compression=None, encoding=None, errors='strict',
               write_header=True, csv_name=None, **csvargs):
        """
        Outputs table to a CSV. Additional keyword arguments are passed to ``csv.writer()``;
        for example, to override the delimiter from the default CSV dialect, provide the
        ``delimiter`` keyword argument.

        .. warning::
                If a file already exists at the given location, it will be
                overwritten.

        `Args:`
            local_path: str
                The path to write the csv locally. If it ends in ".gz" or ".zip", the file will be
                compressed. If not specified, a temporary file will be created and returned,
                and that file will be removed automatically when the script is done running.
            temp_file_compression: str
                If a temp file is requested (i.e. no ``local_path`` is specified), the compression
                type for that file. Currently "None", "gzip" or "zip" are supported.
                If a ``local_path`` is specified, this argument is ignored.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                How encoding errors are handled; the default 'strict' raises an error
            write_header: boolean
                Include header in output
            csv_name: str
                If ``zip`` compression (either specified or inferred), the name of csv file
                within the archive.
            \**csvargs: kwargs
                ``csv_writer`` optional arguments

        `Returns:`
            str
                The path of the new file
        """  # noqa: W605

        # If a zip archive.
        if files.zip_check(local_path, temp_file_compression):
            return self.to_zip_csv(archive_path=local_path,
                                   encoding=encoding,
                                   errors=errors,
                                   write_header=write_header,
                                   csv_name=csv_name,
                                   **csvargs)

        if not local_path:
            suffix = '.csv' + files.suffix_for_compression_type(temp_file_compression)
            local_path = files.create_temp_file(suffix=suffix)

        # Create normal csv/.gzip
        petl.tocsv(self.table,
                   source=local_path,
                   encoding=encoding,
                   errors=errors,
                   write_header=write_header,
                   **csvargs)

        return local_path
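
A minimal usage sketch for the method above; the ``Table`` data and file names are placeholders, and ``delimiter`` is one of the ``**csvargs`` forwarded to ``csv.writer()``:

# Hypothetical usage; tbl, the file names, and the delimiter are illustrative only.
from parsons import Table

tbl = Table([{'id': 1, 'name': 'Ana'}, {'id': 2, 'name': 'Bo'}])
path = tbl.to_csv('people.csv', delimiter='|')             # explicit path, pipe-delimited
temp_path = tbl.to_csv()                                   # temp file, cleaned up at exit
zipped = tbl.to_csv('people.zip', csv_name='people.csv')   # zip compression inferred from suffix
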
Example #2
    def get_table_by_file_id(self, file_id, format='csv') -> Table:
        """Get a table that has been saved to Box in csv or JSON format.

        `Args:`
            file_id: str
                The Box file_id of the table to be retrieved.
            format: str
                 Format in which Table has been saved; for now, only 'csv' or 'json'.

        `Returns:` Table
            A Parsons Table.
        """
        if format not in self.ALLOWED_FILE_FORMATS:
            raise ValueError(
                f'Format argument to get_table_by_file_id() must be one '
                f'of {self.ALLOWED_FILE_FORMATS}; found "{format}"')

        # Temp file will be around as long as enclosing process is running,
        # which we need, because the Table we return will continue to use it.
        output_file_name = create_temp_file()
        with open(output_file_name, 'wb') as output_file:
            self.client.file(file_id).download_to(output_file)

        if format == 'csv':
            return Table.from_csv(output_file_name)
        elif format == 'json':
            return Table.from_json(output_file_name)
        else:
            raise SystemError(f'Got (theoretically) impossible '
                              f'format option "{format}"')  # pragma: no cover
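
A short usage sketch, assuming the method above lives on the Parsons ``Box`` connector with credentials picked up from the environment; the ``file_id`` value is a placeholder:

# Hypothetical usage; the file_id is a placeholder.
from parsons import Box

box = Box()
tbl = box.get_table_by_file_id('123456789012', format='csv')
print(tbl.num_rows)
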
Example #3
def test_close_temp_file():
    temp = files.create_temp_file()
    files.close_temp_file(temp)

    # Verify the temp file no longer exists
    with pytest.raises(FileNotFoundError):
        open(temp, 'r')
Example #4
    def create_job_xml(self, job_type, job_name, emails=None, status_key=None, call_back=None):
        # Internal method to create a valid job xml

        job = ET.Element("job")

        # Generate Base XML
        input_file = ET.SubElement(job, 'inputfile')
        input_file.text = job_name + '_input.csv'
        output_file = ET.SubElement(job, 'outputfile')
        output_file.text = job_name + '_output.csv'
        jobtype = ET.SubElement(job, 'jobtype')
        jobtype.text = job_type

        # Add status key
        args = ET.SubElement(job, "args")
        statuskey = ET.SubElement(args, "arg", name="__status_key")
        statuskey.text = status_key or job_name

        # Option args
        if call_back:
            callback = ET.SubElement(args, "arg", name="__http_callback")
            callback.text = call_back

        if emails:
            emails_el = ET.SubElement(args, "arg", name="__emails")
            emails_el.text = ','.join(emails)

        # Write xml to file object
        local_path = create_temp_file(suffix='.xml')
        tree = ET.ElementTree(job)
        tree.write(local_path)
        return local_path
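
For reference, a standalone sketch of the XML shape this helper produces; element and attribute names are taken from the code above, while the job name and type are placeholder values:

# Standalone sketch; 'export' and 'myjob' are placeholders.
import xml.etree.ElementTree as ET

job = ET.Element('job')
ET.SubElement(job, 'inputfile').text = 'myjob_input.csv'
ET.SubElement(job, 'outputfile').text = 'myjob_output.csv'
ET.SubElement(job, 'jobtype').text = 'export'
args = ET.SubElement(job, 'args')
ET.SubElement(args, 'arg', name='__status_key').text = 'myjob'
print(ET.tostring(job, encoding='unicode'))
# <job><inputfile>myjob_input.csv</inputfile><outputfile>myjob_output.csv</outputfile>
# <jobtype>export</jobtype><args><arg name="__status_key">myjob</arg></args></job>
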
Example #5
    def materialize_to_file(self, file_path=None):
        """
        "Materializes" a Table, meaning all pending transformations are applied.

        Unlike the original materialize function, this method does not bring the data into memory,
        but instead loads the data into a local temp file.

        This method updates the current table in place.

        `Args:`
            file_path: str
                The path to the file to materialize the table to; if not specified, a temp file
                will be created.
        `Returns:`
            str
                Path to the temp file that now contains the table
        """

        # Load the data in batches, and "pickle" the rows to a temp file.
        # (We pickle rather than writing to, say, a CSV, so that we maintain
        # all the type information for each field.)

        file_path = file_path or files.create_temp_file()

        with open(file_path, 'wb') as handle:
            for row in self.table:
                pickle.dump(list(row), handle)

        # Load a Table from the file
        self.table = petl.frompickle(file_path)

        return file_path
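
A hedged usage sketch; it assumes ``add_column`` accepts a callable (applied lazily by petl), which ``materialize_to_file`` then evaluates while spilling rows to the pickle-backed temp file:

# Hypothetical usage; the column name and data are placeholders.
from parsons import Table

tbl = Table([{'n': 1}, {'n': 2}, {'n': 3}])
tbl.add_column('double', lambda row: row['n'] * 2)   # pending (lazy) transformation
path = tbl.materialize_to_file()                     # applies transforms, writes the pickle file
print(path, tbl.num_rows)
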
Example #6
    def to_zip_csv(self,
                   archive_path=None,
                   csv_name=None,
                   encoding=None,
                   errors='strict',
                   write_header=True,
                   if_exists='replace',
                   **csvargs):
        """
        Outputs table to a CSV in a zip archive. Additional keyword arguments are passed to
        ``csv.writer()``; for example, to override the delimiter from the default CSV dialect,
        provide the ``delimiter`` keyword argument. Use this method if you would like to write
        multiple csv files to the same archive.

        .. warning::
                If a file already exists in the archive, it will be overwritten.

        `Args:`
            archive_path: str
                The path to the zip archive. If not specified, a temporary file will be created and
                returned, and that file will be removed automatically when the script is done
                running.
            csv_name: str
                The name of the csv file to be stored in the archive. If ``None``, the
                archive name will be used.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                How encoding errors are handled; the default 'strict' raises an error
            write_header: boolean
                Include header in output
            if_exists: str
                If archive already exists, one of 'replace' or 'append'
            \**csvargs: kwargs
                ``csv_writer`` optional arguments

        `Returns:`
            str
                The path of the archive
        """  # noqa: W605

        if not archive_path:
            archive_path = files.create_temp_file(suffix='.zip')

        cf = self.to_csv(encoding=encoding,
                         errors=errors,
                         write_header=write_header,
                         **csvargs)

        if not csv_name:
            csv_name = files.extract_file_name(archive_path,
                                               include_suffix=False) + '.csv'

        return zip_archive.create_archive(archive_path,
                                          cf,
                                          file_name=csv_name,
                                          if_exists=if_exists)
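
A usage sketch for writing two CSVs into one archive, as the docstring suggests; the table contents and file names are placeholders, and ``if_exists='append'`` keeps the first file in place:

# Hypothetical usage; data and file names are illustrative only.
from parsons import Table

people = Table([{'id': 1, 'name': 'Ana'}])
pets = Table([{'id': 1, 'species': 'cat'}])

archive = people.to_zip_csv('export.zip', csv_name='people.csv')
pets.to_zip_csv(archive, csv_name='pets.csv', if_exists='append')
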
Example #7
    def to_json(self, local_path=None, temp_file_compression=None, line_delimited=False):
        """
        Outputs table to a JSON file

        .. warning::
                If a file already exists at the given location, it will be
                overwritten.

        `Args:`
            local_path: str
                The path to write the JSON locally. If it ends in ".gz", it will be
                compressed first. If not specified, a temporary file will be created and returned,
                and that file will be removed automatically when the script is done running.
            temp_file_compression: str
                If a temp file is requested (i.e. no ``local_path`` is specified), the compression
                type for that file. Currently "None" and "gzip" are supported.
                If a ``local_path`` is specified, this argument is ignored.
            line_delimited: bool
                Whether the file will be line-delimited JSON (with a row on each line), or a proper
                JSON file.

        `Returns:`
            str
                The path of the new file
        """

        if not local_path:
            suffix = '.json' + files.suffix_for_compression_type(temp_file_compression)
            local_path = files.create_temp_file(suffix=suffix)

        # Note we don't use the much simpler petl.tojson(), since that method reads the whole
        # table into memory before writing to file.

        if files.is_gzip_path(local_path):
            open_fn = gzip.open
            mode = 'w+t'
        else:
            open_fn = open
            mode = 'w'

        with open_fn(local_path, mode) as file:
            if not line_delimited:
                file.write('[')

            i = 0
            for row in self:
                if i:
                    if not line_delimited:
                        file.write(',')
                    file.write('\n')
                i += 1
                json.dump(row, file)

            if not line_delimited:
                file.write(']')

        return local_path
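
A usage sketch; the ".gz" suffix on the second call exercises the gzip branch above, and the data and file names are placeholders:

# Hypothetical usage; file names are placeholders.
from parsons import Table

tbl = Table([{'id': 1, 'name': "O'Brady"}, {'id': 2, 'name': 'Ana'}])
tbl.to_json('rows.json')                            # a single JSON array
tbl.to_json('rows.json.gz', line_delimited=True)    # gzipped, one JSON object per line
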
Example #8
    def to_html(self,
                local_path=None,
                encoding=None,
                errors='strict',
                index_header=False,
                caption=None,
                tr_style=None,
                td_styles=None,
                truncate=None):
        """
        Outputs table to html.

        .. warning::
                If a file already exists at the given location, it will be
                overwritten.

        `Args:`
            local_path: str
                The path to write the html locally. If not specified, a temporary file will be
                created and returned, and that file will be removed automatically when the script
                is done running.
            encoding: str
                The character encoding used when writing the HTML file
            errors: str
                How encoding errors are handled; the default 'strict' raises an error
            index_header: boolean
                Prepend index to column names; Defaults to False.
            caption: str
                A caption to include with the html table.
            tr_style: str or callable
                Style to be applied to the table row.
            td_styles: str, dict or callable
                Styles to be applied to the table cells.
            truncate: int
                Maximum length of cell data; longer values are truncated.
        `Returns:`
            str
                The path of the new file
        """

        if not local_path:
            local_path = files.create_temp_file(suffix=".html")

        petl.tohtml(self.table,
                    source=local_path,
                    encoding=encoding,
                    errors=errors,
                    caption=caption,
                    index_header=index_header,
                    tr_style=tr_style,
                    td_styles=td_styles,
                    truncate=truncate)

        return local_path
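
A usage sketch; the styling arguments shown are forwarded to ``petl.tohtml()``, and the data is a placeholder:

# Hypothetical usage; data and caption are illustrative only.
from parsons import Table

tbl = Table([{'city': 'Oslo', 'population': 709037}])
path = tbl.to_html(caption='Population by city', truncate=30)
with open(path) as f:
    print(f.read()[:120])
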
Example #9
    def query(self, sql):
        """
        Run a BigQuery query and return the results as a Parsons table.

        `Args:`
            sql: str
                A valid BigQuery statement

        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """
        # Run the query
        query_job = self.client.query(sql)

        # We will use a temp file to cache the results so that they are not all living
        # in memory. We'll use pickle to serialize the results to file in order to maintain
        # the proper data types (e.g. integer).
        temp_filename = create_temp_file()

        wrote_header = False
        with open(temp_filename, 'wb') as temp_file:
            results = query_job.result()

            # If there are no results, just return None
            if results.total_rows == 0:
                return None

            for row in results:
                # Make sure we write out the header once and only once
                if not wrote_header:
                    wrote_header = True
                    header = list(row.keys())
                    pickle.dump(header, temp_file)

                row_data = list(row.values())
                pickle.dump(row_data, temp_file)

        ptable = petl.frompickle(temp_filename)
        final_table = Table(ptable)

        return final_table
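
A hedged usage sketch, assuming the surrounding class is the Parsons ``GoogleBigQuery`` connector and that Google credentials are configured in the environment; the query is a placeholder:

# Hypothetical usage; the connector name and query are assumptions for illustration.
from parsons import GoogleBigQuery

bigquery = GoogleBigQuery()
tbl = bigquery.query('SELECT 1 AS one')
if tbl is not None:              # the method returns None when there are no rows
    print(tbl.num_rows)
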
Example #10
def test_get_table(live_sftp, simple_table):
    local_path = files.create_temp_file()
    tbl = live_sftp.get_table(REMOTE_CSV_PATH)
    assert_matching_tables(tbl, simple_table)
Example #11
    def generate_manifest(self,
                          buckets,
                          aws_access_key_id=None,
                          aws_secret_access_key=None,
                          mandatory=True,
                          prefix=None,
                          manifest_bucket=None,
                          manifest_key=None,
                          path=None):
        """
        Given a list of S3 buckets, generate a manifest file (JSON format). A manifest file
        allows you to copy multiple files into a single table at once. Once the manifest is
        generated, you can pass it with the :func:`~parsons.redshift.Redshift.copy_s3` method.

        AWS keys are not required if the ``AWS_ACCESS_KEY_ID`` and
        ``AWS_SECRET_ACCESS_KEY`` environment variables are set.

        `Args:`

            buckets: list or str
                A list of buckets or a single bucket from which to generate the manifest
            aws_access_key_id: str
                AWS access key id to access S3 bucket
            aws_secret_access_key: str
                AWS secret access key to access S3 bucket
            mandatory: boolean
                The mandatory flag indicates whether the Redshift COPY should
                terminate if the file does not exist.
            prefix: str
                Optional filter for key prefixes
            manifest_bucket: str
                Optional bucket to write manifest file.
            manifest_key: str
                Optional S3 key at which to write the manifest file

        `Returns:`
            ``dict`` of manifest
        """

        from parsons.aws import S3
        s3 = S3(aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key)

        # Deal with a single bucket being passed, rather than list.
        if isinstance(buckets, str):
            buckets = [buckets]

        # Generate manifest file
        manifest = {'entries': []}
        for bucket in buckets:

            # Retrieve list of files in bucket
            key_list = s3.list_keys(bucket, prefix=prefix)
            for key in key_list:
                manifest['entries'].append({
                    'url': '/'.join(['s3:/', bucket, key]),
                    'mandatory': mandatory,
                })

        logger.info('Manifest generated.')

        # Save the file to s3 bucket if provided
        if manifest_key and manifest_bucket:
            # Dump the manifest to a temp JSON file
            manifest_path = files.create_temp_file()
            with open(manifest_path, 'w') as manifest_file_obj:
                json.dump(manifest,
                          manifest_file_obj,
                          sort_keys=True,
                          indent=4)

            # Upload the file to S3
            s3.put_file(manifest_bucket, manifest_key, manifest_path)

            logger.info(
                f'Manifest saved to s3://{manifest_bucket}/{manifest_key}')

        return manifest
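
A usage sketch; the bucket names, prefix, and key are placeholders, and the returned dict follows the ``{'entries': [{'url': ..., 'mandatory': ...}]}`` shape built above:

# Hypothetical usage; bucket and key names are placeholders.
from parsons import Redshift

rs = Redshift()
manifest = rs.generate_manifest(['my-bucket-a', 'my-bucket-b'],
                                prefix='exports/2021/',
                                manifest_bucket='my-bucket-a',
                                manifest_key='manifests/latest.json')
print(len(manifest['entries']))
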
Example #12
def test_get_table(live_sftp, simple_table):  # noqa F811
    file_util.create_temp_file()
    tbl = live_sftp.get_table(CSV_PATH)
    assert_matching_tables(tbl, simple_table)
Example #13
    def query_with_connection(self,
                              sql,
                              connection,
                              parameters=None,
                              commit=True):
        """
        Execute a query against the database, with an existing connection. Useful for batching
        queries together. Will return ``None`` if the query returns zero rows.

        `Args:`
            sql: str
                A valid SQL statement
            connection: obj
                A connection object obtained from ``mysql.connection()``
            parameters: list
                A list of python variables to be converted into SQL values in your query
            commit: boolean
                Whether to commit the transaction immediately. If ``False`` the transaction will
                be committed when the connection goes out of scope and is closed (or you can
                commit manually with ``connection.commit()``).

        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """
        with self.cursor(connection) as cursor:

            # The python connector can only execute a single sql statement, so we will
            # break up each statement and execute them separately.
            for s in sql.strip().split(';'):
                if len(s) != 0:
                    logger.debug(f'SQL Query: {s}')
                    cursor.execute(s, parameters)

            if commit:
                connection.commit()

            # If the SQL query provides no response, then return None
            if not cursor.description:
                logger.debug('Query returned 0 rows')
                return None

            else:
                # Fetch the data in batches, and "pickle" the rows to a temp file.
                # (We pickle rather than writing to, say, a CSV, so that we maintain
                # all the type information for each field.)
                temp_file = files.create_temp_file()

                with open(temp_file, 'wb') as f:
                    # Grab the header
                    pickle.dump(cursor.column_names, f)

                    while True:
                        batch = cursor.fetchmany(QUERY_BATCH_SIZE)
                        if len(batch) == 0:
                            break

                        logger.debug(f'Fetched {len(batch)} rows.')
                        for row in batch:
                            pickle.dump(row, f)

                # Load a Table from the file
                final_tbl = Table(petl.frompickle(temp_file))

                logger.debug(f'Query returned {final_tbl.num_rows} rows.')
                return final_tbl
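
A hedged usage sketch, assuming the surrounding class is the Parsons ``MySQL`` connector and that ``connection()`` yields a context-managed connection; the table name and data are placeholders:

# Hypothetical usage; assumes MySQL credentials come from environment variables
# and that connection() can be used as a context manager.
from parsons import MySQL

mysql = MySQL()
with mysql.connection() as connection:
    mysql.query_with_connection(
        "INSERT INTO people (name) VALUES ('Ana');", connection, commit=False)
    tbl = mysql.query_with_connection('SELECT * FROM people;', connection)  # commits here
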
Example #14
    def query(self, sql, parameters=None):
        """
        Run a BigQuery query and return the results as a Parsons table.

        To include python variables in your query, it is recommended to pass them as parameters,
        following the BigQuery style where parameters are prefixed with `@`s.
        Using the ``parameters`` argument ensures that values are escaped properly, and avoids SQL
        injection attacks.

        **Parameter Examples**

        .. code-block:: python

            name = "Beatrice O'Brady"
            sql = 'SELECT * FROM my_table WHERE name = %s'
            bigquery.query(sql, parameters=[name])

        .. code-block:: python

            name = "Beatrice O'Brady"
            sql = "SELECT * FROM my_table WHERE name = %(name)s"
            bigquery.query(sql, parameters={'name': name})

        `Args:`
            sql: str
                A valid BigQuery statement
            parameters: dict
                A dictionary of query parameters for BigQuery.

        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """
        # get our connection and cursor
        cursor = self._dbapi.connect(self.client).cursor()

        # Run the query
        cursor.execute(sql, parameters)

        # We will use a temp file to cache the results so that they are not all living
        # in memory. We'll use pickle to serialize the results to file in order to maintain
        # the proper data types (e.g. integer).
        temp_filename = create_temp_file()

        wrote_header = False
        with open(temp_filename, 'wb') as temp_file:
            # Track whether we got data, since if we don't get any results we need to return None
            got_results = False
            while True:
                batch = cursor.fetchmany(QUERY_BATCH_SIZE)
                if len(batch) == 0:
                    break

                got_results = True

                for row in batch:
                    # Make sure we write out the header once and only once
                    if not wrote_header:
                        wrote_header = True
                        header = list(row.keys())
                        pickle.dump(header, temp_file)

                    row_data = list(row.values())
                    pickle.dump(row_data, temp_file)

        if not got_results:
            return None

        ptable = petl.frompickle(temp_filename)
        final_table = Table(ptable)

        return final_table
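
A usage sketch of the parameterized form, mirroring the second docstring example above; the dataset and table names are placeholders:

# Hypothetical usage; dataset/table names are placeholders.
from parsons import GoogleBigQuery

bigquery = GoogleBigQuery()
name = "Beatrice O'Brady"
tbl = bigquery.query('SELECT * FROM my_dataset.my_table WHERE name = %(name)s',
                     parameters={'name': name})
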
Example #15
    def query_with_connection(self,
                              sql,
                              connection,
                              parameters=None,
                              commit=True):
        """
        Execute a query against the Redshift database, with an existing connection.
        Useful for batching queries together. Will return ``None`` if the query
        returns zero rows.

        `Args:`
            sql: str
                A valid SQL statement
            connection: obj
                A connection object obtained from ``redshift.connection()``
            parameters: list
                A list of python variables to be converted into SQL values in your query
            commit: boolean
                Whether to commit the transaction immediately. If ``False`` the transaction will
                be committed when the connection goes out of scope and is closed (or you can
                commit manually with ``connection.commit()``).

        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """

        # To Do: Have it return an ordered dict to return the
        #        rows in the correct order

        with self.cursor(connection) as cursor:

            if 'credentials' not in sql:
                logger.debug(f'SQL Query: {sql}')
            cursor.execute(sql, parameters)

            if commit:
                connection.commit()

            # If the cursor is empty, don't cause an error
            if not cursor.description:
                logger.debug('Query returned 0 rows')
                return None

            else:

                # Fetch the data in batches, and "pickle" the rows to a temp file.
                # (We pickle rather than writing to, say, a CSV, so that we maintain
                # all the type information for each field.)

                temp_file = files.create_temp_file()

                with open(temp_file, 'wb') as f:
                    # Grab the header
                    header = [i[0] for i in cursor.description]
                    pickle.dump(header, f)

                    while True:
                        batch = cursor.fetchmany(QUERY_BATCH_SIZE)
                        if not batch:
                            break

                        logger.debug(f'Fetched {len(batch)} rows.')
                        for row in batch:
                            pickle.dump(list(row), f)

                # Load a Table from the file
                final_tbl = Table(petl.frompickle(temp_file))

                logger.debug(f'Query returned {final_tbl.num_rows} rows.')
                return final_tbl
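
A hedged usage sketch for batching two statements over one connection, assuming the Parsons ``Redshift`` connector and a context-managed ``connection()``; the statements are placeholders:

# Hypothetical usage; assumes Redshift credentials come from environment variables.
from parsons import Redshift

rs = Redshift()
with rs.connection() as connection:
    rs.query_with_connection(
        'CREATE TEMP TABLE tmp AS SELECT 1 AS n;', connection, commit=False)
    tbl = rs.query_with_connection('SELECT * FROM tmp;', connection)
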
Example #16
def test_get_file(live_sftp, simple_table):
    local_path = files.create_temp_file()
    live_sftp.get_file(REMOTE_CSV_PATH, local_path=local_path)
    assert_file_matches_table(local_path, simple_table)
Example #17
def test_get_file(live_sftp, simple_table):  # noqa F811
    local_path = file_util.create_temp_file()
    live_sftp.get_file(CSV_PATH, local_path=local_path)
    assert_file_matches_table(local_path, simple_table)