class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the sql code to be executed. (templated)
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: str
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :type autocommit: bool
    :param parameters: (optional) the parameters to render the SQL query with.
    :type parameters: mapping or iterable
    :param database: name of the database which overrides the one defined in the connection
    :type database: str
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            parameters=None,
            database=None,
            *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
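A minimal usage sketch for the operator above (assuming Airflow 1.10-style imports and an existing `postgres_default` connection; table and statement names are illustrative):

from datetime import datetime

from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator

with DAG('postgres_operator_example', start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    # Inline statement; a list of statements or a path ending in '.sql'
    # (rendered as a Jinja template) also works for `sql`.
    create_pet_table = PostgresOperator(
        task_id='create_pet_table',
        postgres_conn_id='postgres_default',
        sql="CREATE TABLE IF NOT EXISTS pet (id SERIAL PRIMARY KEY, name TEXT)",
    )

    # Parameterized statement; `parameters` is passed through to psycopg2.
    insert_pet = PostgresOperator(
        task_id='insert_pet',
        postgres_conn_id='postgres_default',
        sql="INSERT INTO pet (name) VALUES (%(name)s)",
        parameters={'name': 'Fido'},
    )

    create_pet_table >> insert_pet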
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from S3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:
        - False: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - path/to/cert/bundle.pem: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            verify=None,
            copy_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(S3ToRedshiftTransfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.copy_options = copy_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Running sql query...")
        redshift.run(self.sql)
        self.log.info("...done.")
Example #5
 def execute(self, context):
     redshift_hook = PostgresHook(self.redshift_conn_id)
     redshift_hook.run(str(self.sql_query))
   def execute(self, context):
       self.log.info('LoadFactOperator implemented by Mohan Hegde')
 
       self.log.info("LoadFact", self.sql)
       postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
       postgres_hook.run(self.sql)
Example #7
def copy_table(tablename, filename):
    '''Copy data from source files (.csv files) into src stage tables in the local postgres database'''
    pghook = PostgresHook('postgres_local')
    pghook.run(COPY_SQL.format(tablename, filename))
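`COPY_SQL` is defined elsewhere in that project; a hypothetical shape for the template, assuming a server-side CSV load with Postgres `COPY`, could be:

# Hypothetical template; the real project defines its own format and options.
COPY_SQL = """
COPY {}
FROM '{}'
WITH (FORMAT csv, HEADER true, DELIMITER ',')
"""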
Example #8
def create_indexes(SQL):
    '''Create indexes on the tables to improve query performance'''
    pghook = PostgresHook('postgres_local')
    pghook.run(SQL)
def execute_postgres_hook(query):
    pg_hook = PostgresHook('redshift_lake')
    pg_hook.run(query)
Example #10
 def execute(self, context):
     postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     postgres_hook.run(f"Delete from {self.table};")
     postgres_hook.run(self.sql)
Example #11
 def execute(self, context):
     self.log.info(f"start loading fact table {self.target_table}")
     redshift = PostgresHook(postgres_conn_id=self.conn_id)
     str_sql_insert = f"INSERT INTO {self.target_table} {self.str_sql_select}"
     redshift.run(str_sql_insert)
     self.log.info(f"finished loading fact table {self.target_table}")
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers
    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param s3_conn_id: reference to a specific S3 connection
    :type s3_conn_id: string
    :param options: reference to a list of UNLOAD options
    :type options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            s3_conn_id='s3_default',
            unload_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(RedshiftToS3Transfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.s3_conn_id = s3_conn_id
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        a_key, s_key = self.s3.get_credentials()
        unload_options = ('\n\t\t\t').join(self.unload_options)

        _log.info("Retrieving headers from %s.%s..." % (self.schema, self.table))

        columns_query = """SELECT column_name
                            FROM information_schema.columns
                            WHERE table_schema = '{0}'
                            AND   table_name = '{1}'
                            ORDER BY ordinal_position
                        """.format(self.schema, self.table)

        cursor = self.hook.get_conn().cursor()
        cursor.execute(columns_query)
        rows = cursor.fetchall()
        columns = [row[0] for row in rows]
        column_names = (', ').join(map(lambda c: "\\'{0}\\'".format(c), columns))
        column_castings = (', ').join(map(lambda c: "CAST({0} AS text) AS {0}".format(c),
                                            columns))

        unload_query = """
                        UNLOAD ('SELECT {0}
                        UNION ALL
                        SELECT {1} FROM {2}.{3}')
                        TO 's3://{4}/{5}/{3}_'
                        with
                        credentials 'aws_access_key_id={6};aws_secret_access_key={7}'
                        {8};
                        """.format(column_names, column_castings, self.schema, self.table,
                                self.s3_bucket, self.s3_key, a_key, s_key, unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #13
    def execute(self, context):
        # Set AWS S3 and Redshift connections
        self.log.info("Setting up Redshift connection...")
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info("Redshift connection created.")

        self.log.info("Clearing data from Redshift target table...")
        redshift.run("DELETE FROM {}".format(self.target_table))

        # Prepare S3 paths
        self.log.info("Preparing Copying data from S3 to Redshift...")
        exec_date_rendered = self.execution_date.format(**context)
        self.log.info("Execution_date: {}".format(exec_date_rendered))
        exec_date_obj = datetime.datetime.strptime( exec_date_rendered, \
                                                    '%Y-%m-%d')
        self.log.info("Execution_year: {}".format(exec_date_obj.year))
        self.log.info("Execution_month: {}".format(exec_date_obj.month))
        rendered_key_raw = self.s3_key.format(**context)
        self.log.info("Rendered_key_raw: {}".format(rendered_key_raw))

        if self.use_partitioned_data == "True":
            self.log.info("Rendered_key_raw: {}".format(rendered_key_raw))
            rendered_key = rendered_key_raw.format(\
                                        exec_date_obj.year, \
                                        exec_date_obj.month)
        else:
            rendered_key = rendered_key_raw
        self.log.info("Rendered_key: {}".format(rendered_key))

        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        self.log.info("S3_path: ".format(s3_path))

        if self.json_paths == "":
            s3_json_path = "\'auto\'"
        else:
            s3_json_path = "\'s3://{}/{}\'".format( self.s3_bucket, \
                                                    self.json_paths)

        self.log.info("S3_PATH: {}".format(s3_path))
        self.log.info("S3_JSON_PATH: {}".format(s3_json_path))

        # Copy data from S3 to Redshift
        # Select operations based on the input file format (JSON or CSV)
        if self.file_format == "json":
            self.log.info("Preparing for JSON input data")
            formatted_sql = StageToRedshiftOperator.sql_template_json.format(
                self.target_table,
                s3_path,
                credentials.access_key,
                credentials.secret_key,
                s3_json_path
            )
        elif self.file_format == "csv":
            self.log.info("Preparing for CSV input data")
            formatted_sql = StageToRedshiftOperator.sql_template_csv.format(
                self.target_table,
                s3_path,
                credentials.access_key,
                credentials.secret_key,
                self.ignore_headers,
                self.delimiter
            )
        else:
            self.log.error('File format is neither JSON nor CSV. '
                           'Other formats are not supported.')
            raise ValueError('Unsupported file format: {}'.format(self.file_format))

        # Executing COPY operation
        self.log.info("Executing Redshift COPY operation...")
        redshift.run(formatted_sql)
        self.log.info("Redshift COPY operation DONE.")
Example #14
def push_to_master_table(*args, **kwargs):
    table = kwargs["params"]["table"]
    redshift_hook = PostgresHook("redshift")
    logging.info("Pushing {} data into master table".format(table))
    redshift_hook.run("INSERT INTO {}_master SELECT * FROM {};".format(
        table, table))
Example #15
 def execute(self, context):
     redshift_hook = PostgresHook(self.redshift_conn_id)
     if not self.append_data:
         redshift_hook.run('''DELETE FROM {}'''.format(self.table))
     redshift_hook.run(self.sql)
     self.log.info(f'Loading records into {self.table} completed')
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     self.log.info("Loading the fact table into Redshift")
     formatted_sql = LoadFactOperator.insert_sql.format(
         self.table, self.load_sql_stmt)
     redshift.run(formatted_sql)
Example #17
class RedshiftToDruidOperator(BaseOperator):
    """
    Unloads a Redshift table to S3 and load into apache druid

    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param sql: Custom SQL statement to execute
    :type sql: str
    :param druid_ingest_spec: druid ingestion json spec
    :type druid_ingest_spec: json
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    :param include_header: Should include headers in the final file?
    :type include_header: bool
    :param autocommit: if True perform autocommit
    :type autocommit: bool
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param druid_conn_id: reference to a specific Druid overlord connection
    :type druid_conn_id: str

    """

    template_fields = (
        "s3_bucket",
        "s3_key",
        "schema",
        "table",
        "sql",
        "druid_ingest_spec",
    )
    template_ext = (".sql",)
    ui_color = "#fff7e6"

    @apply_defaults
    def __init__(
        self,
        s3_bucket,
        s3_key,
        schema,
        table,
        sql=None,
        druid_ingest_spec=None,
        unload_options=tuple(),
        include_header=False,
        autocommit=False,
        aws_conn_id="aws_default",
        redshift_conn_id="postgres_default",
        druid_conn_id="druid_ingest_default",
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.redshift_conn_id = redshift_conn_id
        self.druid_conn_id = druid_conn_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.schema = schema
        self.table = table
        self.sql = sql
        self.druid_ingest_spec = druid_ingest_spec
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.include_header = include_header

        self.pg_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)

        if self.include_header and "PARALLEL OFF" not in [
            uo.upper().strip() for uo in unload_options
        ]:
            self.unload_options = list(unload_options) + ["PARALLEL OFF"]

    def execute(self, context):
        self.unload()
        self.druid_ingest()

    def unload(self):
        credentials = self.s3_hook.get_credentials()
        unload_options = "\n\t\t\t".join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...", self.schema, self.table)

            columns_query = """ SELECT column_name
                                    FROM information_schema.columns
                                    WHERE table_schema = '{schema}'
                                    AND   table_name = '{table}'
                                    ORDER BY ordinal_position
                            """.format(
                schema=self.schema, table=self.table
            )

            cursor = self.pg_hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]

            column_names = ", ".join("{0}".format(c) for c in columns)
            column_headers = ", ".join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ", ".join(
                "CAST({0} AS text) AS {0}".format(c) for c in columns
            )

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order, {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order""".format(
                column_names=column_names,
                column_castings=column_castings,
                column_headers=column_headers,
                schema=self.schema,
                table=self.table,
            )
        else:
            if self.sql:
                select_query = self.sql
            else:
                select_query = "SELECT * FROM {schema}.{table}".format(
                    schema=self.schema, table=self.table
                )

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(
            select_query=select_query,
            table=self.table,
            s3_bucket=self.s3_bucket,
            s3_key=self.s3_key,
            access_key=credentials.access_key,
            secret_key=credentials.secret_key,
            unload_options=unload_options,
        )

        self.log.info("Executing UNLOAD command...")
        self.pg_hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")

    def druid_ingest(self):
        druid_hook = DruidHook(druid_ingest_conn_id=self.druid_conn_id)

        self.log.info("Submitting druid task: %s", json.dumps(self.druid_ingest_spec))
        druid_hook.submit_indexing_job(self.druid_ingest_spec)
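A hedged wiring sketch for the operator above; the spec path, bucket, schema, and table are placeholders, and a `dag` object is assumed to exist:

import json

# Hypothetical path; a real ingestion spec must match the target Druid datasource.
with open('/path/to/events_ingest_spec.json') as f:
    events_spec = json.load(f)

events_to_druid = RedshiftToDruidOperator(
    task_id='events_to_druid',
    s3_bucket='my-bucket',              # placeholder bucket
    s3_key='druid/unload',              # placeholder key prefix
    schema='analytics',                 # placeholder schema
    table='events',                     # placeholder table
    include_header=True,                # adds PARALLEL OFF so the header row stays first
    unload_options=["DELIMITER ','", 'ALLOWOVERWRITE'],
    druid_ingest_spec=events_spec,
    dag=dag,                            # assumes a DAG object defined elsewhere
)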
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     
     self.log.info('Cleaning staging tables')
     redshift.run(self.sql_query)        
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     redshift.run(self.sql.insert_query)
     self.log.info('LoadFactOperator Successful')
def destroy_and_create_schema():
    logger = logging.getLogger(__name__)
    pg_hook = PostgresHook('redshift_lake')

    for command in table_commands:
        pg_hook.run(command)
Example #21
 def execute(self, context):
     redshift = PostgresHook(self.connection_id)
     self.log.info("Appending records to fact table {}".format(
         self.fact_table))
     redshift.run("INSERT INTO {} ({})".format(self.fact_table,
                                               self.sql_statement))
Example #22
def create_table(SQL):
    '''Create tables in the local postgres database'''
    pghook = PostgresHook('postgres_local')
    pghook.run(SQL)
def load_data_to_redshift(*args,**kwargs):
    aws_hook = AwsHook('aws_credentials')
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook('redshift')
    redshift_hook.run(sql_statements.COPY_ALL_TRIPS_SQL.format(credentials.access_key, credentials.secret_key))
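`sql_statements.COPY_ALL_TRIPS_SQL` comes from a helper module in that project; a hypothetical shape for such a Redshift COPY template, with the two credential placeholders it is formatted with, could be:

# Hypothetical template; bucket, key, and options are illustrative.
COPY_ALL_TRIPS_SQL = """
COPY trips
FROM 's3://my-bucket/data/trips.csv'
ACCESS_KEY_ID '{}'
SECRET_ACCESS_KEY '{}'
IGNOREHEADER 1
DELIMITER ','
"""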
Example #24
def insert_update_table(SQL):
    '''INSERT/UPDATE data into the table in local postgres database'''
    pghook = PostgresHook('postgres_local')
    pghook.run(SQL)
Example #25
    def execute(self, context):
        pg = PostgresHook(postgres_conn_id=self.pg_conn_id)

        sql_statement = 'REFRESH MATERIALIZED VIEW ' + self.materialized_view

        pg.run(sql_statement)
Example #26
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     self.log.info("Truncating dim table '{}'".format(self.table))
     redshift.run("TRUNCATE {}".format(self.table))
     self.log.info("loading dim table '{}'".format(self.table))
     redshift.run(self.sql_stmt)
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from S3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 schema: str,
                 table: str,
                 s3_bucket: str,
                 s3_key: str,
                 redshift_conn_id: str = 'redshift_default',
                 aws_conn_id: str = 'aws_default',
                 verify: Optional[Union[bool, str]] = None,
                 copy_options: Optional[List] = None,
                 autocommit: bool = False,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.copy_options = copy_options or []
        self.autocommit = autocommit

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
Example #28
    def execute(self, context):
        aws_hook = AwsHook(self.aws_creds)
        creds = aws_hook.get_credentials()
        rs_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Starting gathering CSV files from S3...")

        s3r = boto3.resource("s3",
                             region_name=self.s3_region,
                             aws_access_key_id=creds.access_key,
                             aws_secret_access_key=creds.secret_key)
        bucket = s3r.Bucket(self.s3_bucket)
        csv_obj = bucket.objects.filter(Prefix=self.s3_key)
        csvlist = []
        for c in csv_obj:
            csvlist.append(c.key)
        csvlist = [x for x in csvlist if x.find(".csv") > -1]

        client = boto3.client("s3",
                              region_name=self.s3_region,
                              aws_access_key_id=creds.access_key,
                              aws_secret_access_key=creds.secret_key)

        i = 1
        for c in csvlist:
            ind = str(i)
            csvo = client.get_object(Bucket=self.s3_bucket, Key=c)
            body = csvo["Body"]
            csv_string = body.read().decode("utf-8")
            df = pd.read_csv(StringIO(csv_string))
            cols = list(df.columns)

            # Quote column names so they form a valid SQL column list
            cols = ['"' + c + '"' for c in cols]
            cols = ', '.join(cols)

            self.log.info(f"Copying file {ind} from S3 to Redshift for " +
                          self.table)
            s3_path = "s3://" + self.s3_bucket + "/" + c

            if self.include_cols:
                formated_sql = S3CSVToRedshiftOperator.copy_sql.format(
                    self.table, cols, s3_path, creds.access_key,
                    creds.secret_key, "'" + self.s3_region + "'")

            else:
                formated_sql = S3CSVToRedshiftOperator.copy_sql_excl.format(
                    self.table, s3_path, creds.access_key, creds.secret_key,
                    "'" + self.s3_region + "'")

            rs_hook.run(formated_sql)
            self.log.info(f'StageToRedshiftOperator finished for file {ind}!')

            i += 1

        check_sql = f"SELECT COUNT(*) FROM {self.table}"

        records = rs_hook.get_records(check_sql)

        if not records or records[0][0] == 0:
            raise ValueError("No data transferred to Redshift!")

        self.log.info("Data Quality check passed successfully!")
Example #29
def attach_persona_tags_to_people(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {"Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")}

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(
        postgres_conn_id=pg_connection,
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5
    )

    person_entity_id = requests.get(
        f"{Variable.get(kwargs['client'] + '_rock_api')}/EntityTypes",
        params={"$filter": "Name eq 'Rock.Model.Person'"},
        headers=headers
    ).json()[0]['Id']

    persona_params = {
        "$filter": f"EntityTypeId eq {person_entity_id} and CategoryId eq {186}",
        "$select": "Id,Name,Guid",
        "$orderby": "ModifiedDateTime desc",
    }

    if not kwargs['do_backfill']:
        persona_params['$filter'] += f" and (ModifiedDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}' or ModifiedDateTime eq null or PersistedLastRefreshDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}')"


    personas = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/DataViews",
            params=persona_params,
            headers=headers).json()


    for persona in personas:
        fetched_all = False
        skip = 0
        top = 10000

        while not fetched_all:
            params = {
                "$top": top,
                "$skip": skip,
                "$select": "Id",
                "$orderby": "ModifiedDateTime desc",
            }

            r = requests.get(
                    f"{Variable.get(kwargs['client'] + '_rock_api')}/People/DataView/{persona['Id']}",
                    params=params,
                    headers=headers)
            rock_objects = r.json()

            if not isinstance(rock_objects, list):
                print(rock_objects)
                print("oh uh, we might have made a bad request")
                print("top: {top}")
                print("skip: {skip}")
                skip += top
                continue

            def generate_insert_sql(tag_id, person_id):
                return f'''
                    INSERT INTO people_tag ("tagId", "personId", "createdAt", "updatedAt")
                    SELECT t.id,
                           p.id,
                           NOW(),
                           NOW()
                    FROM people p,
                         tags t
                    WHERE t."originId" = '{tag_id}'
                      AND p."originId" = '{person_id}'
                    ON CONFLICT ("tagId", "personId") DO NOTHING
                '''

            sql_to_run = map(lambda p: generate_insert_sql(persona['Id'], p['Id']), rock_objects)

            pg_hook.run(sql_to_run)

            skip += top
            fetched_all = len(rock_objects) < top
Example #30
def create_table(create_table_sql):
    hook = PostgresHook()
    hook.run(create_table_sql, autocommit=True)
Example #31
def fetch_and_save_persona_tags(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {"Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")}

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(
        postgres_conn_id=pg_connection,
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5
    )

    person_entity_id = requests.get(
        f"{Variable.get(kwargs['client'] + '_rock_api')}/EntityTypes",
        params={"$filter": "Name eq 'Rock.Model.Person'"},
        headers=headers
    ).json()[0]['Id']

    while not fetched_all:

        params = {
            "$top": top,
            "$skip": skip,
            "$filter": f"EntityTypeId eq {person_entity_id} and CategoryId eq {186}",
            "$select": "Id,Name,Guid",
            "$orderby": "ModifiedDateTime desc",
        }

        if not kwargs['do_backfill']:
            params['$filter'] += f" and (ModifiedDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}' or ModifiedDateTime eq null or PersistedLastRefreshDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}')"

        print(params)

        r = requests.get(
                f"{Variable.get(kwargs['client'] + '_rock_api')}/DataViews",
                params=params,
                headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("oh uh, we might have made a bad request")
            print("top: {top}")
            print("skip: {skip}")
            skip += top
            continue


        skip += top
        fetched_all = len(rock_objects) < top

        # "createdAt", "updatedAt", "originId", "originType", "apollosType", "name", "data", "type"
        def update_tags(obj):
            return (
                kwargs['execution_date'],
                kwargs['execution_date'],
                obj['Id'],
                'rock',
                'Tag',
                obj['Name'],
                Json({ "guid": obj["Guid"] }),
                "Persona"
            )

        def fix_casing(col):
            return "\"{}\"".format(col)

        tags_to_insert = list(map(update_tags, rock_objects))
        columns = list(map(fix_casing, ("createdAt","updatedAt", "originId", "originType", "apollosType", "name", "data", "type")))

        pg_hook.insert_rows(
            'tags',
            tags_to_insert,
            columns,
            0,
            True,
            replace_index = ('"originId"', '"originType"')
        )

        add_apollos_ids = """
        UPDATE "tags"
        SET "apollosId" = "apollosType" || ':' || id::varchar
        WHERE "originType" = 'rock' and "apollosId" IS NULL
        """

        pg_hook.run(add_apollos_ids)
Example #32
 def execute(self, context):
     self.log.info('LoadFactOperator implemented ')
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     formatted_sql = "INSERT INTO {} ".format(self.table)+self.sql_query
     redshift.run(formatted_sql)
Example #33
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     self.log.info("loading fact table ...")
     redshift.run(self.sql_stmt)
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from S3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            copy_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(S3ToRedshiftTransfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.copy_options = copy_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
Example #35
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        query = LoadFactOperator.sql.format(self.table, self.load_query)

        redshift.run(query)
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers

    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            verify=None,
            unload_options=tuple(),
            autocommit=False,
            include_header=False,
            *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.include_header = include_header

        if self.include_header and \
           'PARALLEL OFF' not in [uo.upper().strip() for uo in unload_options]:
            self.unload_options = list(unload_options) + ['PARALLEL OFF', ]

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...",
                          self.schema, self.table)

            columns_query = """SELECT column_name
                                        FROM information_schema.columns
                                        WHERE table_schema = '{schema}'
                                        AND   table_name = '{table}'
                                        ORDER BY ordinal_position
                            """.format(schema=self.schema,
                                       table=self.table)

            cursor = self.hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ', '.join("{0}".format(c) for c in columns)
            column_headers = ', '.join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                        for c in columns)

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order, {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order"""\
                            .format(column_names=column_names,
                                    column_castings=column_castings,
                                    column_headers=column_headers,
                                    schema=self.schema,
                                    table=self.table)
        else:
            select_query = "SELECT * FROM {schema}.{table}"\
                .format(schema=self.schema,
                        table=self.table)

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               table=self.table,
                               s3_bucket=self.s3_bucket,
                               s3_key=self.s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            unload_options=tuple(),
            autocommit=False,
            parameters=None,
            include_header=False,
            *args, **kwargs):
        super(RedshiftToS3Transfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.parameters = parameters
        self.include_header = include_header

        if self.include_header and \
           'PARALLEL OFF' not in [uo.upper().strip() for uo in unload_options]:
            self.unload_options = list(unload_options) + ['PARALLEL OFF', ]

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...",
                          self.schema, self.table)

            columns_query = """SELECT column_name
                                        FROM information_schema.columns
                                        WHERE table_schema = '{schema}'
                                        AND   table_name = '{table}'
                                        ORDER BY ordinal_position
                            """.format(schema=self.schema,
                                       table=self.table)

            cursor = self.hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ', '.join("{0}".format(c) for c in columns)
            column_headers = ', '.join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                        for c in columns)

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order, {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order"""\
                            .format(column_names=column_names,
                                    column_castings=column_castings,
                                    column_headers=column_headers,
                                    schema=self.schema,
                                    table=self.table)
        else:
            select_query = "SELECT * FROM {schema}.{table}"\
                .format(schema=self.schema,
                        table=self.table)

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               table=self.table,
                               s3_bucket=self.s3_bucket,
                               s3_key=self.s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")