Example #1
    def register_extract_monitor(self, starttime, endtime):
        """Adds an entry into the extract monitor for a given extract task

        Parameters
        ----------
        starttime : datetime.datetime
            datetime object generated at the beginning of the data
            extraction
        endtime : datetime.datetime
            datetime object generated at the end of the data extraction
        """

        query = """
        INSERT INTO "public"."table_extract_detail" VALUES (
            %(task_id)s, %(class_name)s, %(task_date_params)s,
            %(task_other_params)s, %(start_dt)s, %(end_dt)s, %(run_time_sec)s,
            %(manifest_path)s, %(data_path)s, %(output_exists)s, %(row_count)s,
            %(upload_size)s, %(exception)s
        );
        """
        args = {
            "task_id": "{}(alias={}, database={}, table={})".format(
                self.__class__.__name__,
                self.database_alias,
                self.db_name,
                self.source_table_name,
            ),
            "class_name": self.__class__.__name__,
            "task_date_params": None,
            "task_other_params": None,
            "start_dt": starttime.replace(microsecond=0),
            "end_dt": endtime.replace(microsecond=0),
            "run_time_sec": (endtime - starttime).total_seconds(),
            "manifest_path": self.copy_target_url,
            "data_path": "s3://{}/{}".format(
                get_redshift().s3_config.bucket, self.s3_key_prefix
            ),
            "output_exists": self.row_count > 0,
            "row_count": self.row_count,
            "upload_size": self.upload_size,
            "exception": None,
        }

        self.logger.info("Inserting record into table_extract_detail")
        with get_redshift().cursor() as cur:
            cur.execute(query, args)
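
The monitor table itself is not among these examples. A hypothetical sketch of its DDL follows: the column names come from the INSERT placeholders above, but the types are guesses from the Python values bound to them.

# Hypothetical DDL for the extract monitor table; names come from the INSERT
# placeholders above, the types are assumptions.
CREATE_TABLE_EXTRACT_DETAIL = """
CREATE TABLE IF NOT EXISTS "public"."table_extract_detail" (
    task_id           VARCHAR(512),
    class_name        VARCHAR(256),
    task_date_params  VARCHAR(512),
    task_other_params VARCHAR(512),
    start_dt          TIMESTAMP,
    end_dt            TIMESTAMP,
    run_time_sec      DOUBLE PRECISION,
    manifest_path     VARCHAR(1024),
    data_path         VARCHAR(1024),
    output_exists     BOOLEAN,
    row_count         BIGINT,
    upload_size       BIGINT,
    exception         VARCHAR(4096)
);
"""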
Example #2
    def write_manifest_file(self):
        if not self.manifest_mode:
            raise TableStateError("Cannot write manifest when not in manifest mode")

        entries = [
            {"url": _s3_url(get_redshift().s3_config.bucket, key), "mandatory": True}
            for key in self.data_file_keys()
        ]
        manifest = {"entries": entries}

        with BytesIO() as f:
            f.write(json.dumps(manifest).encode())
            self._upload_s3(
                f, get_redshift().s3_config.bucket, self.manifest_s3_data_key()
            )
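
The dictionary built above follows the Redshift COPY manifest format. For two data files the uploaded JSON would look roughly like this (the bucket and key names are placeholders, since the data_file_keys() naming scheme is not shown in these examples):

{
  "entries": [
    {"url": "s3://example-bucket/prefix/alias.table.20200101/0000.avro", "mandatory": true},
    {"url": "s3://example-bucket/prefix/alias.table.20200101/0001.avro", "mandatory": true}
  ]
}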
Example #3
    def avro_to_s3(self, results_iter, results_schema):
        """Attempts to serialize a result set to an AVRO file

        Returns True if it completely writes the entire results_iter and
        False if there were records remaining when it hit the maximum file
        size.
        """
        with BytesIO() as f:
            complete, row_count = write_avro_file(
                f,
                results_iter,
                results_schema,
                self.destination_table_name,
                self.max_file_size,
            )

            if self.row_count is None:
                self.row_count = row_count
            else:
                self.row_count += row_count

            self.upload_size += f.tell()

            if not complete:
                self.manifest_mode = True

            if row_count > 0:
                self._upload_s3(
                    f, get_redshift().s3_config.bucket, self.next_s3_data_file_key()
                )
                self.num_data_files += 1

        return complete
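
The write_avro_file helper itself is not among these examples. Below is a minimal sketch of a compatible helper, assuming the fastavro library and a dict-based results_schema; the real implementation (including how it uses table_name) may differ.

# Sketch only: assumes fastavro; mirrors the call site in avro_to_s3.
from fastavro import parse_schema
from fastavro.write import Writer


def write_avro_file(fileobj, results_iter, results_schema, table_name, max_file_size):
    """Serialize rows from results_iter into fileobj as Avro records.

    Returns (complete, row_count). complete is False when the size cap was hit
    with rows still remaining, which is what flips the caller into manifest
    mode. table_name is accepted only to mirror the call site above.
    """
    writer = Writer(fileobj, parse_schema(results_schema))
    row_count = 0
    complete = True
    for row in results_iter:
        writer.write(row)
        row_count += 1
        if max_file_size and row_count % 1000 == 0:
            # Flush so fileobj.tell() reflects bytes actually written before
            # comparing against the cap.
            writer.flush()
            if fileobj.tell() >= max_file_size:
                complete = False
                break
    writer.flush()
    return complete, row_count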
Example #4
    def set_last_updated_index(self):
        """Adds a new index to the pipeline_table_index table for updated
        tables
        """
        if self.new_index_value is None:
            return

        query = f"""
        INSERT INTO "{self.index_schema}"."{self.index_table}" VALUES
        (%s, %s, %s, %s)
        """

        if isinstance(self.new_index_value, int):
            new_index_value = str(self.new_index_value)
        elif isinstance(self.new_index_value, datetime.datetime):
            new_index_value = self.new_index_value.strftime("%Y-%m-%d %H:%M:%S.%f")
        else:
            msg = "Don't know how to handle index {} of type {}".format(
                self.new_index_value, str(type(self.new_index_value))
            )
            raise TypeError(msg)

        self.logger.info("Updating index table")
        with get_redshift().cursor() as cur:
            args = (
                self.database_alias,
                self.db_name,
                self.source_table_name,
                new_index_value,
            )
            self.logger.debug(cur.mogrify(query, args))
            cur.execute(query, args)
Example #5
    @property
    def s3_key_prefix(self):
        # Accessed as self.s3_key_prefix (no call) in register_extract_monitor,
        # hence exposed as a property.
        return "{}/{}.{}.{}".format(
            get_redshift().s3_config.prefix,
            self.database_alias,
            self.source_table_name,
            self.date_key,
        )
Example #6
    def register_and_cleanup(self):
        # Register in index table.
        self.set_last_updated_index()

        # Register in monitor table
        # self.endtime = datetime.datetime.utcnow()
        # self.register_load_monitor()

        # Clean up S3
        for key in self.data_file_keys():
            self.s3.delete_object(Bucket=get_redshift().s3_config.bucket, Key=key)

        if self.manifest_mode:
            self.s3.delete_object(
                Bucket=get_redshift().s3_config.bucket, Key=self.manifest_s3_data_key()
            )
Example #7
    def _load_old_index_value(self):
        """Sets and gets the index_value property, retrieved from Redshift

        Returns
        -------
        index_value : variable
            Since index_value can vary from table to table, this can be many
            different types. Most common will be a datetime or int, but
            could also be a date or string. Returns None if no previous index
            value found
        """

        query = f"""
        SELECT index_value
          FROM "{self.index_schema}"."{self.index_table}"
         WHERE datastore_name = %s
           AND database_name = %s
           AND table_name = %s
        ORDER BY created_ts DESC
        LIMIT 1;
        """
        self.logger.debug("Querying Redshift for last updated index")
        with get_redshift().cursor() as cur:
            cur.execute(
                query,
                (self.database_alias, self.db_name, self.source_table_name))
            index_value = cur.fetchone()

        if index_value is None:
            self.logger.info("No index found. Dumping entire table: %s.",
                             self.source_table_name)
            return index_value
        else:
            self.logger.info("Index found: %s", index_value[0])
            return index_value[0]
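
Taken together with the four-value INSERT in set_last_updated_index, this query pins down the shape of the index table. A hypothetical sketch of its DDL (column types are assumptions; created_ts needs a default because the INSERT does not supply it):

CREATE_PIPELINE_TABLE_INDEX = """
CREATE TABLE IF NOT EXISTS "{index_schema}"."{index_table}" (
    datastore_name VARCHAR(256),
    database_name  VARCHAR(256),
    table_name     VARCHAR(256),
    index_value    VARCHAR(256),
    created_ts     TIMESTAMP DEFAULT GETDATE()
);
"""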
Example #8
    def register_load_monitor(self):
        """Adds an entry into the load monitor for a given load task

        The method takes no arguments; it reads the following instance
        attributes:

        starttime : datetime.datetime
            datetime object generated at the beginning of the load
        endtime : datetime.datetime
            datetime object generated at the end of the load
        rows_inserted : int
            Total number of rows generated by the Redshift COPY command
        rows_deleted : int
            Count of rows deleted by primary key in the destination table
        upload_size : int
            Size in bytes of the files in S3 used in the Redshift COPY
            command
        """

        query = """
            INSERT INTO "public"."table_load_detail" VALUES (
                %(task_id)s, %(class_name)s, %(task_date_params)s,
                %(task_other_params)s, %(target_table)s, %(start_dt)s,
                %(end_dt)s, %(run_time_sec)s, %(extract_task_update_id)s,
                %(data_path)s, %(manifest_cleaned)s, %(rows_inserted)s,
                %(rows_deleted)s, %(load_size)s, %(exception)s
            );
        """
        task_id = "{}(alias={}, database={}, table={})".format(
            self.__class__.__name__,
            self.database_alias,
            self.db_name,
            self.source_table_name,
        )
        target_table = "{}.{}".format(
            self.destination_schema_name, self.destination_table_name
        )
        args = {
            "task_id": task_id,
            "class_name": self.__class__.__name__,
            "task_date_params": None,
            "task_other_params": None,
            "target_table": target_table,
            "start_dt": self.starttime.replace(microsecond=0),
            "end_dt": self.endtime.replace(microsecond=0),
            "run_time_sec": (self.endtime - self.starttime).total_seconds(),
            "extract_task_update_id": task_id,
            "data_path": self.copy_target_url,
            "manifest_cleaned": False,
            "rows_inserted": self.rows_inserted,
            "rows_deleted": self.rows_deleted,
            "load_size": self.upload_size,
            "exception": None,
        }

        self.logger.info("Inserting record into table_load_detail")
        with get_redshift().cursor() as cur:
            cur.execute(query, args)
Example #9
    def get_destination_table_columns(self):
        query = """
        SELECT "column"
          FROM pg_table_def
         WHERE schemaname = %s
           AND tablename = %s;
        """

        with get_redshift().cursor() as cur:
            self.set_search_path(cur)
            cur.execute(
                query, (self.destination_schema_name, self.destination_table_name)
            )
            results = cur.fetchall()

        return [x[0] for x in results]
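
Note that Redshift's pg_table_def only returns rows for tables in schemas that are on the current search_path, which is presumably why set_search_path(cur) is called before the query runs.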
Example #10
    @property
    def copy_target_url(self):
        # Accessed as self.copy_target_url (no call) elsewhere, hence a property.
        return _s3_url(get_redshift().s3_config.bucket, self.copy_target_key)
Example #11
    def load(self):
        """The Load phase of the pipeline. Takes a file in S3 and issues a
        Redshift COPY command to import the data into a staging table. It
        then upserts the data by deleting rows in the destination table
        that match on PK, inserts all rows from the staging table into the
        destination table, and then deletes the staging table.
        """

        self.starttime = datetime.datetime.utcnow()

        # Initializing Data
        delete_clause = self.get_delete_sql()
        staging_table = self.staging_table_name
        destination_table = self.destination_table_name
        is_normal_load = self._destination_table_status == self.DESTINATION_TABLE_OK
        is_rebuild = self._destination_table_status == self.DESTINATION_TABLE_REBUILD
        is_dne = self._destination_table_status == self.DESTINATION_TABLE_DNE

        with get_redshift().cursor() as cur:
            self.set_search_path(cur)

            # If table does not exist, create it
            if is_dne:
                create_table = self.query_to_redshift_create_table(
                    self.get_query_sql(), self.destination_table_name
                )
                cur.execute(create_table)
            elif not is_normal_load and not is_rebuild:
                raise RuntimeError(
                    "Invalid table status in redshift_copy: {}".format(
                        self._destination_table_status
                    )
                )

            # If there are no rows to update, skip the copy and return
            if self.row_count == 0:
                return

            cur.execute("BEGIN TRANSACTION;")
            # Lock the table early to avoid deadlocks in many-to-one pipelines.
            query = generate_lock_query(destination_table)
            cur.execute(query)

            query = generate_drop_exists_query(staging_table)
            cur.execute(query)

            if is_rebuild:
                # Build staging table anew and grant it appropriate permissions
                self.logger.info(
                    "Creating staging table to rebuild %s", destination_table
                )
                create_staging_table = self.query_to_redshift_create_table(
                    self.get_query_sql(), staging_table
                )
                permissions_sql = self.get_grant_sql(cur)
                cur.execute(create_staging_table)
                if permissions_sql:
                    self.logger.info(
                        "Copying permissions onto %s:\n%s",
                        staging_table,
                        permissions_sql,
                    )
                    cur.execute(permissions_sql)
            else:
                # If not rebuilding, create staging with LIKE
                self.logger.info("Creating staging table %s", staging_table)
                query = generate_create_table_like_query(
                    staging_table, destination_table
                )
                cur.execute(query)

            # Issuing Copy Command
            self.logger.info("Issuing copy command")
            query = generate_copy_query(
                staging_table,
                self.copy_target_url,
                get_redshift().iam_copy_role,
                self.manifest_mode,
            )
            self.logger.debug(query)
            cur.execute(query)

            # Row delete and count logic
            if is_rebuild or (self.append_only and not self.full_refresh):
                self.rows_deleted = 0
            else:
                cur.execute(delete_clause)
                self.rows_deleted = cur.rowcount

            # Row insert and count logic
            if is_rebuild:
                self.logger.info("Swapping staging table into %s", destination_table)
                # DNE overrides rebuild, so we can assume the table exists
                query = generate_drop_query(destination_table)
                self.logger.debug(query)
                cur.execute(query)
                query = generate_rename_query(staging_table, destination_table)
                self.logger.debug(query)
                cur.execute(query)
                query = generate_count_query(destination_table)
                self.logger.debug(query)
                cur.execute(query)
                self.rows_inserted = cur.fetchone()[0]
            else:
                query = generate_insert_all_query(staging_table, destination_table)
                self.logger.debug(query)
                cur.execute(query)
                self.rows_inserted = cur.rowcount
                query = generate_drop_query(staging_table)
                self.logger.debug(query)
                cur.execute(query)
            cur.execute("END TRANSACTION;")
            self.register_and_cleanup()
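
The generate_* query builders used above are not included in these examples. Below are hypothetical sketches of a few of them, written purely from how they are called; the real builders may add options such as compression settings, COMPUPDATE, or dist/sort keys.

# Sketches only: inferred from the call sites in load(), not the real code.
def generate_lock_query(table):
    # Redshift LOCK takes an exclusive lock that is held until the
    # transaction ends, which is why load() locks the destination early.
    return 'LOCK TABLE "{}";'.format(table)


def generate_drop_exists_query(table):
    return 'DROP TABLE IF EXISTS "{}";'.format(table)


def generate_create_table_like_query(new_table, parent_table):
    return 'CREATE TABLE "{}" (LIKE "{}");'.format(new_table, parent_table)


def generate_copy_query(table, copy_target_url, iam_copy_role, manifest_mode):
    # Loads the Avro files written by avro_to_s3; MANIFEST is added when the
    # extract spilled across multiple data files.
    return "COPY \"{}\" FROM '{}' IAM_ROLE '{}' FORMAT AS AVRO 'auto'{};".format(
        table, copy_target_url, iam_copy_role, " MANIFEST" if manifest_mode else ""
    )


def generate_insert_all_query(staging_table, destination_table):
    return 'INSERT INTO "{}" SELECT * FROM "{}";'.format(
        destination_table, staging_table
    )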