def test_transfer_bucket(self):

    # Create a destination bucket
    # TODO maybe pull this from an env var as well
    destination_bucket = f"{self.test_bucket}-test"
    self.s3.create_bucket(destination_bucket)

    # Copy
    self.s3.transfer_bucket(self.test_bucket, self.test_key, destination_bucket)

    # Test that object made it
    path = self.s3.get_file(destination_bucket, self.test_key)
    result_tbl = Table.from_csv(path)
    assert_matching_tables(self.tbl, result_tbl)

    # Test that original still exists in original bucket
    self.assertTrue(self.s3.key_exists(self.test_bucket, self.test_key))

    # Transfer and delete original
    self.s3.transfer_bucket(
        self.test_bucket, self.test_key_2, destination_bucket,
        None, None, None, None, None, False, True)
    path_2 = self.s3.get_file(destination_bucket, self.test_key_2)
    result_tbl_2 = Table.from_csv(path_2)
    assert_matching_tables(self.tbl_2, result_tbl_2)
    self.assertFalse(self.s3.key_exists(self.test_bucket, self.test_key_2))
def test_get_url(self):

    # Test that you can download from URL
    url = self.s3.get_url(self.test_bucket, self.test_key)
    csv_table = Table.from_csv(url)
    assert_matching_tables(self.tbl, csv_table)

    # Test that the url expires
    url_short = self.s3.get_url(self.test_bucket, self.test_key, expires_in=1)
    time.sleep(2)
    with self.assertRaises(urllib.error.HTTPError) as cm:
        Table.from_csv(url_short)
    self.assertEqual(cm.exception.code, 403)
def test_to_csv_zip(self):

    try:
        # Test using the to_csv() method
        self.tbl.to_csv('myzip.zip')
        tmp = zip_archive.unzip_archive('myzip.zip')
        assert_matching_tables(self.tbl, Table.from_csv(tmp))

        # Test using the to_zip_csv() method
        self.tbl.to_zip_csv('myzip.zip')
        tmp = zip_archive.unzip_archive('myzip.zip')
        assert_matching_tables(self.tbl, Table.from_csv(tmp))

    finally:
        os.unlink('myzip.zip')
def test_put_and_get_file(self):

    # put_file is part of setup, so just testing getting it here
    path = self.s3.get_file(self.test_bucket, self.test_key)
    result_tbl = Table.from_csv(path)
    assert_matching_tables(self.tbl, result_tbl)
def get_table_by_file_id(self, file_id, format='csv') -> Table:
    """Get a table that has been saved to Box in csv or JSON format.

    `Args`:
        file_id: str
            The Box file_id of the table to be retrieved.
        format: str
            Format in which the Table has been saved; for now, only 'csv' or 'json'.

    `Returns`:
        Table
            A Parsons Table.
    """
    if format not in self.ALLOWED_FILE_FORMATS:
        raise ValueError(
            f'Format argument to get_table_by_file_id() must be one '
            f'of {self.ALLOWED_FILE_FORMATS}; found "{format}"')

    # Temp file will be around as long as enclosing process is running,
    # which we need, because the Table we return will continue to use it.
    output_file_name = create_temp_file()
    with open(output_file_name, 'wb') as output_file:
        self.client.file(file_id).download_to(output_file)

    if format == 'csv':
        return Table.from_csv(output_file_name)
    elif format == 'json':
        return Table.from_json(output_file_name)
    else:
        raise SystemError(f'Got (theoretically) impossible '
                          f'format option "{format}"')  # pragma: no cover
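# A minimal usage sketch, assuming a configured Box connector instance with
# credentials in the environment; the file id and variable names below are
# illustrative placeholders, not values from this codebase.
def example_fetch_box_table(box, file_id='1234567890'):
    # Download the stored table as a Parsons Table and report its size.
    tbl = box.get_table_by_file_id(file_id, format='csv')
    return tbl.num_rows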
def test_append_csv_compressed(self):

    path = self.tbl.to_csv(temp_file_compression='gzip')
    append_tbl = Table([{'first': 'Mary', 'last': 'Nichols'}])
    append_tbl.append_csv(path)

    result_tbl = Table.from_csv(path)
    # Combine tables, so we can check the resulting file
    self.tbl.concat(append_tbl)
    assert_matching_tables(self.tbl, result_tbl)
def get_changed_entities(self, resource_type, date_from, date_to=None,
                         include_inactive=False, requested_fields=None,
                         custom_fields=None):
    """
    Get modified records from VAN from up to 90 days in the past.

    `Args:`
        resource_type: str
            The type of resource to export. Use the
            :py:meth:`~parsons.ngpvan.changed_entities.ChangedEntities.get_changed_entity_resources`
            method to get a list of potential entities.
        date_from: str
            The start date of the search. Must be no more than 90 days in the past
            and must be an ``iso8601`` formatted date (``2021-10-11``).
        date_to: str
            The end date of the search. Must be no more than 90 days in the past
            and must be an ``iso8601`` formatted date (``2021-10-11``).
        include_inactive: boolean
            Include inactive records.
        requested_fields: list
            A list of optional requested fields to include. These options can be
            accessed through the
            :py:meth:`~parsons.ngpvan.changed_entities.ChangedEntities.get_changed_entity_resource_fields`
            method.
        custom_fields: list
            A list of ids of custom fields to include in the export.
    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """  # noqa: E501

    json = {
        "dateChangedFrom": date_from,
        "dateChangedTo": date_to,
        "resourceType": resource_type,
        "requestedFields": requested_fields,
        "requestedCustomFieldIds": custom_fields,
        "fileSizeKbLimit": 100000,
        "includeInactive": include_inactive
    }

    r = self.connection.post_request('changedEntityExportJobs', json=json)

    while True:
        status = self._get_changed_entity_job(r['exportJobId'])
        if status['jobStatus'] in ['Pending', 'InProcess']:
            logger.info('Waiting on export file.')
            time.sleep(RETRY_RATE)
        elif status['jobStatus'] == 'Complete':
            return Table.from_csv(status['files'][0]['downloadUrl'])
        else:
            raise ValueError(status['message'])
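# A minimal usage sketch, assuming a configured VAN connector with an API key in
# the environment; the resource type and date window below are illustrative
# placeholders, not values taken from this codebase.
def example_changed_contacts(van):
    # Pull records of the chosen resource type modified within the window.
    changed = van.get_changed_entities('Contacts', '2021-10-01', date_to='2021-10-11')
    return changed.num_rows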
def download_saved_list(self, saved_list_id):
    """
    Download the vanids associated with a saved list.

    `Args:`
        saved_list_id: int
            The saved list id.
    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    ej = ExportJobs(self.connection)
    job = ej.export_job_create(saved_list_id)

    if isinstance(job, tuple):
        return job
    else:
        return Table.from_csv(job['downloadUrl'])
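# A minimal usage sketch, assuming a configured VAN connector; the saved list id
# below is a placeholder.
def example_download_saved_list(van, saved_list_id=12345):
    # Fetch the vanids in the saved list as a Parsons Table.
    return van.download_saved_list(saved_list_id)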
def get_bulk_import_job_results(self, job_id):
    """
    Get the result file of a bulk import job. The results include one row per
    record processed, along with the status of each. If the job results have not
    been generated, whether because of an error in the process or because the job
    is still processing, this method returns ``None``.

    `Args:`
        job_id: int
            The bulk import job id.
    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    r = self.get_bulk_import_job(job_id)
    logger.info(f"Bulk Import Job Status: {r['status']}")
    if r['status'] == 'Completed':
        return Table.from_csv(r['resultFiles'][0]['url'])

    return None
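# A minimal usage sketch, assuming a configured VAN connector and a job id
# returned by an earlier bulk import call; the id is a placeholder. A ``None``
# return means the results are not ready yet (or the job errored), so callers
# may want to poll with a timeout.
def example_check_bulk_import(van, job_id=54321):
    results = van.get_bulk_import_job_results(job_id)
    if results is not None:
        return results.num_rows
    return None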
def copy_s3(self, table_name, bucket, key, manifest=False, data_type='csv',
            csv_delimiter=',', compression=None, if_exists='fail', max_errors=0,
            distkey=None, sortkey=None, padding=None, varchar_max=None,
            statupdate=True, compupdate=True, ignoreheader=1, acceptanydate=True,
            dateformat='auto', timeformat='auto', emptyasnull=True,
            blanksasnull=True, nullas=None, acceptinvchars=True,
            truncatecolumns=False, columntypes=None, specifycols=None,
            aws_access_key_id=None, aws_secret_access_key=None,
            bucket_region=None, strict_length=True, template_table=None):
    """
    Copy a file from s3 to Redshift.

    `Args:`
        table_name: str
            The schema and table name (``tmc.cool_table``) to copy the file into.
        bucket: str
            The s3 bucket where the file or manifest is located.
        key: str
            The key of the file or manifest in the s3 bucket.
        manifest: boolean
            If True, the key points to a manifest file rather than a single data file.
        data_type: str
            The data type of the file. Only ``csv`` is supported currently.
        csv_delimiter: str
            The delimiter of the ``csv``. Only relevant if data_type is ``csv``.
        compression: str
            If specified (``gzip``), will attempt to decompress the file.
        if_exists: str
            If the table already exists, either ``fail``, ``append``, ``drop``
            or ``truncate`` the table.
        max_errors: int
            The maximum number of rows that can error and be skipped before the
            job fails.
        distkey: str
            The column name of the distkey.
        sortkey: str
            The column name of the sortkey.
        padding: float
            A percentage padding to add to varchar columns if creating a new table.
            This is helpful to add a buffer for future copies in which the data
            might be wider.
        varchar_max: list
            A list of columns in which to set the width of the varchar column to
            65,535 characters.
        statupdate: boolean
            Governs automatic computation and refresh of optimizer statistics at
            the end of a successful COPY command.
        compupdate: boolean
            Controls whether compression encodings are automatically applied
            during a COPY.
        ignoreheader: int
            The number of header rows to skip. Ignored if data_type is ``json``.
        acceptanydate: boolean
            Allows any date format, including invalid formats such as
            00/00/00 00:00:00, to be loaded without generating an error.
        emptyasnull: boolean
            Indicates that Amazon Redshift should load empty char and varchar
            fields as ``NULL``.
        blanksasnull: boolean
            Loads blank varchar fields, which consist of only white space
            characters, as ``NULL``.
        nullas: str
            Loads fields that match this string as ``NULL``.
        acceptinvchars: boolean
            Enables loading of data into VARCHAR columns even if the data contains
            invalid UTF-8 characters.
        dateformat: str
            Set the date format. Defaults to ``auto``.
        timeformat: str
            Set the time format. Defaults to ``auto``.
        truncatecolumns: boolean
            If the table already exists, truncates data in columns to the
            appropriate number of characters so that it fits the column
            specification. Applies only to columns with a VARCHAR or CHAR data
            type, and rows 4 MB or less in size.
        columntypes: dict
            Optional map of column name to redshift column type, overriding the
            usual type inference. You only specify the columns you want to
            override, e.g. ``columntypes={'phone': 'varchar(12)', 'age': 'int'}``.
        specifycols: boolean
            Adds a column list to the Redshift `COPY` command, allowing the source
            table in an append to have its columns out of order, and to have fewer
            columns, with any leftover target table columns filled in with the
            `DEFAULT` value. This will fail if any of the source table's columns
            do not match a column in the target table. This will also fail if the
            target table has an `IDENTITY` column and that column name is among
            the source table's columns.
        aws_access_key_id:
            An AWS access key granted to the bucket where the file is located.
            Not required if keys are stored as environmental variables.
        aws_secret_access_key:
            An AWS secret access key granted to the bucket where the file is
            located. Not required if keys are stored as environmental variables.
        bucket_region: str
            The AWS region that the bucket is located in. This should be provided
            if the Redshift cluster is located in a different region from the
            temp bucket.
        strict_length: bool
            If the database table needs to be created, strict_length determines
            whether the created table's column sizes will be sized to exactly fit
            the current data, or if their size will be rounded up to account for
            future values being larger than the current dataset. Defaults to
            ``True``; this argument is ignored if ``padding`` is specified.
        template_table: str
            Instead of specifying columns, columntypes, and/or inference, if there
            is a pre-existing table that has the same columns/types, then use the
            template_table table name as the schema for the new table.
    `Returns:`
        Parsons Table or ``None``
            See :ref:`parsons-table` for output options.
    """

    with self.connection() as connection:

        if self._create_table_precheck(connection, table_name, if_exists):
            if template_table:
                sql = f'CREATE TABLE {table_name} (LIKE {template_table})'
            else:
                # Grab the object from s3
                from parsons.aws.s3 import S3
                s3 = S3(aws_access_key_id=aws_access_key_id,
                        aws_secret_access_key=aws_secret_access_key)
                local_path = s3.get_file(bucket, key)
                if data_type == 'csv':
                    tbl = Table.from_csv(local_path, delimiter=csv_delimiter)
                else:
                    raise TypeError("Invalid data type provided")

                # Create the table
                sql = self.create_statement(tbl, table_name, padding=padding,
                                            distkey=distkey, sortkey=sortkey,
                                            varchar_max=varchar_max,
                                            columntypes=columntypes,
                                            strict_length=strict_length)

            self.query_with_connection(sql, connection, commit=False)
            logger.info(f'{table_name} created.')

        # Copy the table
        copy_sql = self.copy_statement(
            table_name, bucket, key, manifest=manifest, data_type=data_type,
            csv_delimiter=csv_delimiter, compression=compression,
            max_errors=max_errors, statupdate=statupdate, compupdate=compupdate,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            ignoreheader=ignoreheader, acceptanydate=acceptanydate,
            emptyasnull=emptyasnull, blanksasnull=blanksasnull, nullas=nullas,
            acceptinvchars=acceptinvchars, truncatecolumns=truncatecolumns,
            specifycols=specifycols, dateformat=dateformat, timeformat=timeformat,
            bucket_region=bucket_region)

        self.query_with_connection(copy_sql, connection, commit=False)
        logger.info(f'Data copied to {table_name}.')
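# A minimal usage sketch, assuming a configured Redshift connector and an
# existing CSV in S3 that the connector's credentials can read; the schema,
# table, bucket, and key names below are placeholders.
def example_copy_s3(rs):
    # Load the S3 object into Redshift, recreating the table if it exists and
    # skipping the single header row.
    rs.copy_s3('tmc.cool_table', 'my-temp-bucket', 'exports/cool_table.csv',
               data_type='csv', if_exists='drop', ignoreheader=1)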
def _assert_expected_csv(self, path, orig_tbl):
    result_tbl = Table.from_csv(path)
    assert_matching_tables(orig_tbl, result_tbl)
def match(self, table, job_type, job_name=None, emails=None, call_back=None,
          remove_files=True):
    """
    Match a table to TargetSmart using their bulk matching service.

    .. warning::
        **Table Columns**
        The automation job validates the file by column index rather than by
        column name. If it expects 10 columns and you only provide 9, it will
        fail. However, if you provide 10 columns that are out of order, the job
        will succeed, but the records will not match.

    Args:
        table: Parsons Table Object
            A table object with the required columns. (Required columns are
            provided by TargetSmart.)
        job_type: str
            The match job type. **This is case sensitive.** (Match job names are
            provided by TargetSmart.)
        job_name: str
            Optional job name.
        emails: list
            A list of emails that will receive status notifications. This is
            useful for debugging failed jobs.
        call_back: str
            A callback url to which the status will be posted. See
            `TargetSmart documentation <https://docs.targetsmart.com/developers/automation/index.html#http-callback>`_
            for more details.
        remove_files: boolean
            Remove the configuration, the file to be matched, and the matched
            file from the TargetSmart FTP upon completion or failure of the match.
    """  # noqa: E501,E261

    # Generate a match job
    job_name = job_name or str(uuid.uuid1())

    try:
        # Upload table
        self.sftp.put_file(table.to_csv(), f'{self.sftp_dir}/{job_name}_input.csv')
        logger.info(f'Table with {table.num_rows} rows uploaded to TargetSmart.')

        # Create/upload XML configuration
        xml = self.create_job_xml(job_type, job_name, emails=emails,
                                  status_key=job_name, call_back=call_back)
        self.sftp.put_file(xml, f'{self.sftp_dir}/{job_name}.job.xml')
        logger.info('Match configuration uploaded to TargetSmart.')

        # Check xml configuration status
        self.poll_config_status(job_name)

        # Check the status of the match
        self.match_status(job_name)

        # Download the resulting file
        tbl = Table.from_csv(self.sftp.get_file(f'{self.sftp_dir}/{job_name}_output.csv'))

    finally:
        # Clean up files
        if remove_files:
            self.remove_files(job_name)

        # Log Stats
        # TO DO: Provide some stats on the match

    # Return file as a Table
    return tbl
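# A minimal usage sketch, assuming a configured TargetSmart automation connector
# and a Parsons Table whose columns are already in the order the chosen job type
# expects; the job type string and email address below are placeholders provided
# for illustration only.
def example_targetsmart_match(ts, tbl):
    # Run the bulk match and return the matched file as a Parsons Table.
    matched = ts.match(tbl, 'job_type_provided_by_targetsmart',
                       emails=['analyst@example.org'])
    return matched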