    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
Example #2
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        _log.info('Executing: ' + str(self.sql))
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
    def _query_postgres(self):
        """
        Queries Postgres and returns a cursor to the results.
        """
        postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        conn = postgres.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql, self.parameters)
        return cursor
    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...",
                          self.schema, self.table)

            columns_query = """SELECT column_name
                                        FROM information_schema.columns
                                        WHERE table_schema = '{schema}'
                                        AND   table_name = '{table}'
                                        ORDER BY ordinal_position
                            """.format(schema=self.schema,
                                       table=self.table)

            cursor = self.hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ', '.join("{0}".format(c) for c in columns)
            column_headers = ', '.join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                        for c in columns)

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order, {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order"""\
                            .format(column_names=column_names,
                                    column_castings=column_castings,
                                    column_headers=column_headers,
                                    schema=self.schema,
                                    table=self.table)
        else:
            select_query = "SELECT * FROM {schema}.{table}"\
                .format(schema=self.schema,
                        table=self.table)

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               table=self.table,
                               s3_bucket=self.s3_bucket,
                               s3_key=self.s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #5
    def test_bulk_dump(self):
        hook = PostgresHook()
        input_data = ["foo", "bar", "baz"]

        with hook.get_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
                values = ",".join("('{}')".format(data) for data in input_data)
                cur.execute("INSERT INTO {} VALUES {}".format(self.table, values))
                conn.commit()

                with NamedTemporaryFile() as f:
                    hook.bulk_dump(self.table, f.name)
                    f.seek(0)
                    results = [line.rstrip().decode("utf-8") for line in f.readlines()]

        self.assertEqual(sorted(input_data), sorted(results))
Example #6
    def test_bulk_load(self):
        hook = PostgresHook()
        input_data = ["foo", "bar", "baz"]

        with hook.get_conn() as conn:
            with conn.cursor() as cur:
                cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
                conn.commit()

                with NamedTemporaryFile() as f:
                    f.write("\n".join(input_data).encode("utf-8"))
                    f.flush()
                    hook.bulk_load(self.table, f.name)

                cur.execute("SELECT * FROM {}".format(self.table))
                results = [row[0] for row in cur.fetchall()]

        self.assertEqual(sorted(input_data), sorted(results))
Example #7
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the sql code to be executed. (templated)
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: str
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :type autocommit: bool
    :param parameters: (optional) the parameters to render the SQL query with.
    :type parameters: mapping or iterable
    :param database: name of the database which overrides the one defined in the connection
    :type database: str
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            parameters=None,
            database=None,
            *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
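
A minimal usage sketch for the PostgresOperator above, assuming an existing DAG object named dag and a configured 'postgres_default' connection; the task id, SQL and database name are illustrative placeholders only.

# Hypothetical wiring of PostgresOperator into a DAG; all names below are placeholders.
create_events_table = PostgresOperator(
    task_id='create_events_table',
    sql="CREATE TABLE IF NOT EXISTS events (id SERIAL PRIMARY KEY, payload JSONB)",
    postgres_conn_id='postgres_default',
    database='analytics',   # overrides the database defined on the connection
    autocommit=True,
    dag=dag,
)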
    def setUp(self):
        postgres = PostgresHook()
        with postgres.get_conn() as conn:
            with conn.cursor() as cur:
                for table in TABLES:
                    cur.execute("DROP TABLE IF EXISTS {} CASCADE;".format(table))
                    cur.execute("CREATE TABLE {}(some_str varchar, some_num integer);"
                                .format(table))

                cur.execute(
                    "INSERT INTO postgres_to_gcs_operator VALUES(%s, %s);",
                    ('mock_row_content_1', 42)
                )
                cur.execute(
                    "INSERT INTO postgres_to_gcs_operator VALUES(%s, %s);",
                    ('mock_row_content_2', 43)
                )
                cur.execute(
                    "INSERT INTO postgres_to_gcs_operator VALUES(%s, %s);",
                    ('mock_row_content_3', 44)
                )
Example #9
def waze_jams_to_db():
    """Waze jams feed to PostGIS."""
    pg_hook = PostgresHook(postgres_conn_id='waze')
    tempfile = conf['temp_data_dir'] + '/waze_temp.csv'

    temp_df = pd.read_csv(tempfile, header=None, encoding='utf-8')

    rows_db = temp_df.values

    cols = ['uuid', 'waze_timestamp', 'street', 'start_node',
            'end_node', 'city', 'length', 'delay', 'speed', 'level',
            'road_type', 'geom']


    logging.info('Pushing Waze data to Postgis.')
    pg_hook.insert_rows('public.waze_jams',
                        rows_db,
                        target_fields=cols,
                        commit_every=0)

    return 'Successfully pushed data to PostGIS.'
Example #10
    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        a_key, s_key = self.s3.get_credentials()
        unload_options = ('\n\t\t\t').join(self.unload_options)

        logging.info("Retrieving headers from %s.%s..." % (self.schema, self.table))

        columns_query = """SELECT column_name
                            FROM information_schema.columns
                            WHERE table_schema = '{0}'
                            AND   table_name = '{1}'
                            ORDER BY ordinal_position
                        """.format(self.schema, self.table)

        cursor = self.hook.get_conn().cursor()
        cursor.execute(columns_query)
        rows = cursor.fetchall()
        columns = [row[0] for row in rows]  # a list, since it is consumed by the two joins below
        column_names = (', ').join(map(lambda c: "\\'{0}\\'".format(c), columns))
        column_castings = (', ').join(map(lambda c: "CAST({0} AS text) AS {0}".format(c),
                                            columns))

        unload_query = """
                        UNLOAD ('SELECT {0}
                        UNION ALL
                        SELECT {1} FROM {2}.{3}
                        ORDER BY 1 DESC')
                        TO 's3://{4}/{5}/{3}_'
                        with
                        credentials 'aws_access_key_id={6};aws_secret_access_key={7}'
                        {8};
                        """.format(column_names, column_castings, self.schema, self.table,
                                self.s3_bucket, self.s3_key, a_key, s_key, unload_options)

        logging.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        logging.info("UNLOAD command complete...")
    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
class ExecuteCopyToRedshiftOperator(BaseOperator):
    @apply_defaults
    def __init__(self,
                 redshift_conn_id,
                 s3_bucket,
                 s3_key,
                 redshift_schema,
                 redshift_table,
                 iam_role,
                 mode,
                 where_condition_fn=None,
                 copy_params=[],
                 *args,
                 **kwargs):
        """
        Execute Redshift COPY command

        Modes:
            * append - just insert new rows to table
            * overwrite - truncate table and insert new rows
            * append_overwrite - remove selected rows using the condition returned by where_condition_fn and then insert new rows

        :param redshift_conn_id: the destination redshift connection id
        :param s3_bucket: name of source S3 bucket
        :param s3_key: path to source data in S3 bucket - can be string or function converting airflow context to path (e.g. to have different path depending on execution date)
        :param redshift_schema: name of destination Redshift schema
        :param redshift_table: name of destination Redshift table
        :param iam_role: name of IAM role for Redshift COPY command
        :param mode: append, overwrite, append_overwrite
        :param where_condition_fn: required for append_overwrite mode; a function returning the condition for the WHERE clause of the DELETE
        :param copy_params: additional COPY command parameters
        """

        super(ExecuteCopyToRedshiftOperator, self).__init__(*args, **kwargs)
        self.pg_hook = PostgresHook(redshift_conn_id)
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_schema = redshift_schema
        self.redshift_table = redshift_table
        self.full_table_name = self.redshift_schema + "." + self.redshift_table
        self.iam_role = iam_role
        self.mode = mode.upper()
        self.where_condition_fn = where_condition_fn
        self.copy_params = copy_params

    def execute(self, context):
        if self.mode == "OVERWRITE":
            self.__truncate_table()
            self.__execute_copy(context)
        elif self.mode == "APPEND":
            self.__execute_copy(context)
        elif self.mode == "APPEND_OVERWRITE":
            self.__delete_from_table(context)
            self.__execute_copy(context)
            self.__vacuum_table()

    def __execute_query(self, query):
        print("Executing query: " + query)
        self.pg_hook.run(query)

    def __vacuum_table(self):
        query = "VACUUM FULL {}".format(self.full_table_name)
        # Use a raw autocommit connection, because VACUUM can't be executed inside a transaction and pg_hook.run executes within one
        conn = self.pg_hook.get_conn()
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(query)

    def __truncate_table(self):
        query = "TRUNCATE TABLE {}".format(self.full_table_name)
        self.__execute_query(query)

    def __delete_from_table(self, context):
        condition = self.where_condition_fn(context)
        query = "DELETE FROM {} WHERE {}".format(self.full_table_name,
                                                 condition)
        self.__execute_query(query)

    def __execute_copy(self, context):
        copy_query = self.__construct_copy_query(context)
        self.__execute_query(copy_query)

    def __construct_copy_query(self, context):
        additional_params = '\n'.join(self.copy_params)
        s3_key = self.s3_key if type(
            self.s3_key) == str else self.s3_key(context)
        return """
        COPY {table}
        FROM 's3://{bucket}/{key}'
        CREDENTIALS 'aws_iam_role={iam_role}'
        {additional_params}
        """.format(table=self.full_table_name,
                   bucket=self.s3_bucket,
                   key=s3_key,
                   iam_role=self.iam_role,
                   additional_params=additional_params)
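
A hedged usage sketch for ExecuteCopyToRedshiftOperator, assuming an existing dag object and a 'redshift' connection; the bucket, key function, IAM role and WHERE condition below are placeholders.

# Illustrative only: append_overwrite reloads a single partition of the target table.
copy_events = ExecuteCopyToRedshiftOperator(
    task_id='copy_events_to_redshift',
    redshift_conn_id='redshift',
    s3_bucket='my-data-lake',
    # s3_key may be a plain string or a function of the Airflow context
    s3_key=lambda context: "events/{}/".format(context['ds']),
    redshift_schema='public',
    redshift_table='events',
    iam_role='arn:aws:iam::123456789012:role/redshift-copy',
    mode='append_overwrite',
    where_condition_fn=lambda context: "load_date = '{}'".format(context['ds']),
    copy_params=['CSV', 'IGNOREHEADER 1'],
    dag=dag,
)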
    def execute(self, context):
        pg_hook = PostgresHook(self.redshift_conn_id)
        query = self.query if type(self.query) == str else self.query(context)
        logging.info("Execute Redshift query {}".format(query))
        pg_hook.run(query)
Example #14
    def execute(self, context):
        self.log.info("Loading fact...")
        redshift_hook = PostgresHook(self.redshift_conn_id)
        redshift_hook.run(f"INSERT INTO {self.table} " + str(self.sql_query))
Example #15
    def get_db_hook(self):
        return PostgresHook(postgres_conn_id=self.redshift_conn_id)
    def execute(self, context):
        self.log.info(f'LoadFactOperator Executing Query : {self.sql_stmt}')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info(f'LoadFactOperator : Inserting data to {self.table}')
        insert_stmt = f'INSERT INTO {self.table} {self.sql_stmt}'
        redshift.run(insert_stmt)
Example #17
    def execute(self, context):

        self.log.info('LoadDimensionOperator running!')

        redshift_hook = PostgresHook(self.redshift_conn_id)

        if self.dimension_name == 'song_table':
            sql_stmt = sql_statements.song_table_create
            redshift_hook.run(sql_stmt)
            sql_stmt = sql_statements.song_table_insert
            redshift_hook.run(sql_stmt)

        elif self.dimension_name == 'time_table':
            sql_stmt = sql_statements.time_table_create
            redshift_hook.run(sql_stmt)
            sql_stmt = sql_statements.time_table_insert
            redshift_hook.run(sql_stmt)

        elif self.dimension_name == 'artist_table':
            sql_stmt = sql_statements.artist_table_create
            redshift_hook.run(sql_stmt)
            sql_stmt = sql_statements.artist_table_insert
            redshift_hook.run(sql_stmt)

        elif self.dimension_name == 'user_table':
            sql_stmt = sql_statements.user_table_create
            redshift_hook.run(sql_stmt)
            sql_stmt = sql_statements.user_table_insert
            redshift_hook.run(sql_stmt)

        self.log.info('{} completed!'.format(self.dimension_name))

        return True
Example #18
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from S3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 schema,
                 table,
                 s3_bucket,
                 s3_key,
                 redshift_conn_id='redshift_default',
                 aws_conn_id='aws_default',
                 verify=None,
                 copy_options=tuple(),
                 autocommit=False,
                 parameters=None,
                 *args,
                 **kwargs):
        super(S3ToRedshiftTransfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.copy_options = copy_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
def create_stl_table(redshift_conn_id,
                     table,
                     error_table_name,
                     table_id):
    """
    Creates a Redshift table containing all stl error rows associated with the input staging table. All columns within the error table 
    will be converted to VARCHAR given that the errors may be linked to data type issues.

    Keyword Arguments:
    redshift_conn_id -- Redshift connection ID (str)
    table -- Staging table name (str)
    error_table_name -- Name to be used to create the error table
    table_id -- The staging table's table_id defined in the stl_load_errors table
    """

    get_column_names = """
    SELECT 
        col_name 
    FROM
        (SELECT
            * 
        FROM
            pg_get_cols('{}')
        COLS(
            view_schema name, 
            view_name name, 
            col_name name, 
            col_type varchar,
            col_num int
            )
        )
    """
    
    create_error_table = """
        DROP TABLE IF EXISTS 
            {error_table_name};
        CREATE TABLE
            {error_table_name}
        (
            {cast}, 
            err_code INT,
            err_reason VARCHAR(72)
        );
        """

    insert_rows = """
        INSERT INTO 
            {error_table_name}
        SELECT 
            {split_part},
            err_code,
            err_reason
        FROM 
            stl_load_errors stl
        WHERE 
            stl.tbl = {id}
        """
            
    redshift = PostgresHook(redshift_conn_id)

    # load column names into pandas dataframe
    col_names_df = redshift.get_pandas_df(get_column_names.format(table))

    # put column names into list
    col_names_list = col_names_df['col_name'].values.tolist()

    cast_col = ""
    split_raw_line = ""
    # loop over table's column names
    for i,col in enumerate(col_names_list):
        # if this is the last column, don't append ',' to the end of the string
        if col == col_names_list[-1]:
            # adds CAST statement to cast_col string
            cast_col += "{} VARCHAR".format(col)
            # adds split_part function to split_raw_line string
            split_raw_line += "CAST(split_part(raw_line, ',', {}) AS VARCHAR(500))".format(i+1)
        else:
            cast_col += "{} VARCHAR, ".format(col)
            split_raw_line += "CAST(split_part(raw_line, ',', {}) AS VARCHAR(500)), ".format(i+1)

    format_dict = {
        'table': table, 
        'error_table_name': error_table_name,
        'cast': cast_col,
        'split_part':split_raw_line,
        'id': table_id
    }
    print(f'Creating error table: {error_table_name}')

    # creates an empty table with duplicate columns of looped table
    formatted_create_sql = create_error_table.format(**format_dict)
    redshift.run(formatted_create_sql)

    # inserts all stl_load_errors raw_line values as strings into the appropriate columns within the empty table
    formatted_insert_sql = insert_rows.format(**format_dict)
    redshift.run(formatted_insert_sql)
    
    error_table_count = redshift.get_records(f'SELECT COUNT(*) FROM {error_table_name}')[0][0]
    table_count = redshift.get_records(f'SELECT COUNT(*) FROM {table}')[0][0]

    print(f'{table} COUNT: {table_count}')
    print(f'{error_table_name} COUNT: {error_table_count}')
    
    return error_table_name
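
A hedged example of calling create_stl_table, e.g. from a PythonOperator callable; the connection id, table names and table_id below are assumptions, and in practice table_id would be looked up from stl_load_errors for the staging table that failed to COPY.

# Illustrative call only.
error_table = create_stl_table(
    redshift_conn_id='redshift',
    table='staging_events',
    error_table_name='staging_events_errors',
    table_id=106442,
)
print('Created error table: {}'.format(error_table))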
Example #20
    def execute(self, context):
       
        aws_hook = AwsHook(self.aws_credentials_id)
        
        
        session = botocore.session.get_session()
        #credentials = session.get_credentials()
        #credentials = credentials.get_frozen_credentials()
        #credentials = aws_hook.get_credentials()    
        
        credentials = session.get_credentials()
        
        self.log.info("Log this variable ...{}".format(credentials))
        
        
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        
        self.log.info("Create tables staging tables...")
        
        redshift.run("""CREATE TABLE IF NOT EXISTS staging_events (
                                    artist varchar(256),
                                    auth varchar(256),
                                    firstname varchar(256),
                                    gender varchar(256),
                                    iteminsession int4,
                                    lastname varchar(256),
                                    length numeric(18,0),
                                    "level" varchar(256),
                                    location varchar(256),
                                    "method" varchar(256),
                                    page varchar(256),
                                    registration numeric(18,0),
                                    sessionid int4,
                                    song varchar(256),
                                    status int4,
                                    ts int8,
                                    useragent varchar(256),
                                    userid int4)
                          """)
        
        redshift.run("""CREATE TABLE IF NOT EXISTS staging_songs (
                                    num_songs int4,
                                    artist_id varchar(256),
                                    artist_name varchar(256),
                                    artist_latitude numeric(18,0),
                                    artist_longitude numeric(18,0),
                                    artist_location varchar(256),
                                    song_id varchar(256),
                                    title varchar(256),
                                    duration numeric(18,0),
                                    "year" int4
)""")

        self.log.info("Clearing data from destination Redshift table")
        redshift.run("DELETE FROM {}".format(self.table))

        self.log.info("Copying data from S3 to Redshift")
        #rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}".format(self.s3_bucket) #, rendered_key)
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            'AKIAJ5QF3MAXCO53LIXQ',
            'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
            
        )
        redshift.run(formatted_sql)
        
        self.log.info(f"Success: Copying {self.table} from S3 to Redshift")
    def execute(self, context):
        log.info('Run Pandas over postgres')
        postgres_instance = PostgresHook(postgres_conn_id=self.connection_id)
        df = postgres_instance.get_pandas_df(self.sql_query)
        self.etl_function(df)
Example #22
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Loading data into fact table")
        redshift.run(self.sql)
Example #23
def create_compressed_csv(
    target_db: str, base_file_name: str, timestamp_output: bool, query: str, **kwargs,
):
    """
    Given a db, a base file name and a query, create a compressed csv file and upload it to S3.
    """
    if timestamp_output:
        file_name = (
            f'{base_file_name}-{kwargs["next_execution_date"].strftime("%Y-%m-%d")}.csv'
        )
    else:
        file_name = f'{base_file_name}.csv'

    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
        echo=config.DEBUG,
    )
    row_count = 0
    run_date = kwargs.get('run_date', kwargs.get('execution_date'))
    with engine.begin() as conn:
        result = conn.execution_options(stream_results=True).execute(
            sa.text(query), run_date=run_date.date()
        )

        def iter_results_as_csv_lines():
            nonlocal row_count
            s = StringIO()

            writer = csv.writer(s, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(result.keys())

            while True:
                chunk = result.fetchmany(1000)
                if not chunk:
                    break

                row_count += len(chunk)
                for row in chunk:
                    writer.writerow(row)

                yield s.getvalue().encode('utf8')
                s.truncate(0)
                s.seek(0)

        zip_streamer = zipstream.ZipFile(compression=zipstream.ZIP_DEFLATED)
        zip_streamer.write_iter(file_name, iter_results_as_csv_lines())

        with tempfile.NamedTemporaryFile("wb") as fh:
            logger.info(f"Compressing data to {fh.name}")

            for data in zip_streamer:
                fh.write(data)
            fh.flush()

            logger.info(f'Wrote {row_count} rows to file {file_name} in {fh.name}')

            s3_client = S3Hook('DATA_WORKSPACE_S3')
            s3_output_path = f's3://csv-pipelines/{base_file_name}/{file_name}.zip'
            s3_client.load_file(
                fh.name,
                s3_output_path,
                bucket_name=config.DATA_WORKSPACE_S3_BUCKET,
                replace=True,
            )

            logger.info(f"Uploaded {file_name} to {s3_output_path}")
Example #24
    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.conn_id)
        hook.run("DELETE FROM {};".format(self.table))
        self.log.info("Loading fact table {}".format(self.table))
        hook.run(self.insert_query)
Example #25
 def get_hook(self):
     try:
         if self.conn_type == 'mysql':
             from airflow.hooks.mysql_hook import MySqlHook
             return MySqlHook(mysql_conn_id=self.conn_id)
         elif self.conn_type == 'google_cloud_platform':
             from airflow.contrib.hooks.bigquery_hook import BigQueryHook
             return BigQueryHook(bigquery_conn_id=self.conn_id)
         elif self.conn_type == 'postgres':
             from airflow.hooks.postgres_hook import PostgresHook
             return PostgresHook(postgres_conn_id=self.conn_id)
         elif self.conn_type == 'hive_cli':
             from airflow.hooks.hive_hooks import HiveCliHook
             return HiveCliHook(hive_cli_conn_id=self.conn_id)
         elif self.conn_type == 'presto':
             from airflow.hooks.presto_hook import PrestoHook
             return PrestoHook(presto_conn_id=self.conn_id)
         elif self.conn_type == 'hiveserver2':
             from airflow.hooks.hive_hooks import HiveServer2Hook
             return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
         elif self.conn_type == 'sqlite':
             from airflow.hooks.sqlite_hook import SqliteHook
             return SqliteHook(sqlite_conn_id=self.conn_id)
         elif self.conn_type == 'jdbc':
             from airflow.hooks.jdbc_hook import JdbcHook
             return JdbcHook(jdbc_conn_id=self.conn_id)
         elif self.conn_type == 'mssql':
             from airflow.hooks.mssql_hook import MsSqlHook
             return MsSqlHook(mssql_conn_id=self.conn_id)
         elif self.conn_type == 'oracle':
             from airflow.hooks.oracle_hook import OracleHook
             return OracleHook(oracle_conn_id=self.conn_id)
         elif self.conn_type == 'vertica':
             from airflow.contrib.hooks.vertica_hook import VerticaHook
             return VerticaHook(vertica_conn_id=self.conn_id)
         elif self.conn_type == 'cloudant':
             from airflow.contrib.hooks.cloudant_hook import CloudantHook
             return CloudantHook(cloudant_conn_id=self.conn_id)
         elif self.conn_type == 'jira':
             from airflow.contrib.hooks.jira_hook import JiraHook
             return JiraHook(jira_conn_id=self.conn_id)
         elif self.conn_type == 'redis':
             from airflow.contrib.hooks.redis_hook import RedisHook
             return RedisHook(redis_conn_id=self.conn_id)
         elif self.conn_type == 'wasb':
             from airflow.contrib.hooks.wasb_hook import WasbHook
             return WasbHook(wasb_conn_id=self.conn_id)
         elif self.conn_type == 'docker':
             from airflow.hooks.docker_hook import DockerHook
             return DockerHook(docker_conn_id=self.conn_id)
         elif self.conn_type == 'azure_data_lake':
             from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
             return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
         elif self.conn_type == 'azure_cosmos':
             from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
             return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
         elif self.conn_type == 'cassandra':
             from airflow.contrib.hooks.cassandra_hook import CassandraHook
             return CassandraHook(cassandra_conn_id=self.conn_id)
         elif self.conn_type == 'mongo':
             from airflow.contrib.hooks.mongo_hook import MongoHook
             return MongoHook(conn_id=self.conn_id)
         elif self.conn_type == 'gcpcloudsql':
             from airflow.contrib.hooks.gcp_sql_hook import CloudSqlDatabaseHook
             return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     except Exception:
         pass
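
The dispatcher above is normally reached through a Connection object rather than called directly; a minimal sketch, assuming a configured connection with conn_id 'my_postgres' of type postgres (Airflow 1.x import path):

from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('my_postgres')
hook = conn.get_hook()          # dispatches on conn_type, here returning a PostgresHook
records = hook.get_records('SELECT 1')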
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers
    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param s3_conn_id: reference to a specific S3 connection
    :type s3_conn_id: string
    :param options: reference to a list of UNLOAD options
    :type options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 schema,
                 table,
                 s3_bucket,
                 s3_key,
                 redshift_conn_id='redshift_default',
                 s3_conn_id='s3_default',
                 unload_options=tuple(),
                 autocommit=False,
                 parameters=None,
                 *args,
                 **kwargs):
        super(RedshiftToS3Transfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.s3_conn_id = s3_conn_id
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        a_key, s_key = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        self.log.info("Retrieving headers from %s.%s...", self.schema,
                      self.table)

        columns_query = """SELECT column_name
                            FROM information_schema.columns
                            WHERE table_schema = '{0}'
                            AND   table_name = '{1}'
                            ORDER BY ordinal_position
                        """.format(self.schema, self.table)

        cursor = self.hook.get_conn().cursor()
        cursor.execute(columns_query)
        rows = cursor.fetchall()
        columns = [row[0] for row in rows]  # a list, since it is consumed by the two joins below
        column_names = ', '.join(map(lambda c: "\\'{0}\\'".format(c), columns))
        column_castings = ', '.join(
            map(lambda c: "CAST({0} AS text) AS {0}".format(c), columns))

        unload_query = """
                        UNLOAD ('SELECT {0}
                        UNION ALL
                        SELECT {1} FROM {2}.{3}
                        ORDER BY 1 DESC')
                        TO 's3://{4}/{5}/{3}_'
                        with
                        credentials 'aws_access_key_id={6};aws_secret_access_key={7}'
                        {8};
                        """.format(column_names, column_castings, self.schema,
                                   self.table, self.s3_bucket, self.s3_key,
                                   a_key, s_key, unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #27
def load_pg_data(**kwargs):
    """
    Load data from the Tableau server and upload it to Postgres.
    """
    # Sign in to the tableau server.
    TABLEAU_SERVER = "https://10az.online.tableau.com"
    TABLEAU_SITENAME = "echo"
    TABLEAU_VERSION = "2.7"
    TABLEAU_USER = Variable.get("BIKESHARE_TABLEAU_USER")
    TABLEAU_PASSWORD = Variable.get("BIKESHARE_TABLEAU_PASSWORD")
    TRIP_TABLE_VIEW_ID = "7530c937-887e-42da-aa50-2a11d279bf51"
    logging.info("Authenticating with Tableau")
    tableau_auth = tableauserverclient.TableauAuth(
        TABLEAU_USER,
        TABLEAU_PASSWORD,
        TABLEAU_SITENAME,
    )
    tableau_server = tableauserverclient.Server(TABLEAU_SERVER,
                                                TABLEAU_VERSION)
    tableau_server.auth.sign_in(tableau_auth)

    # Get the Trips table view. This is a view specifically created for
    # this DAG. Tableau server doesn't allow the download of underlying
    # workbook data via the API (though one can from the UI). This view
    # allows us to get around that.
    logging.info("Loading Trips view")
    all_views, _ = tableau_server.views.get()
    view = next((v for v in all_views if v.id == TRIP_TABLE_VIEW_ID), None)
    if not view:
        raise Exception("Cannot find the trips table!")
    tableau_server.views.populate_csv(view)
    df = pandas.read_csv(
        io.BytesIO(b"".join(view.csv)),
        parse_dates=["Start Datetime", "End Datetime"],
        thousands=",",
        dtype={
            "Visible ID": str,
            "End Station": str,
            "Start Station": str
        },
    )

    # The data has a weird structure where trip rows are duplicated, with variations
    # on a "Measure" column, containing trip length, duration, etc. We pivot on that
    # column to create a normalized table containing one row per trip.
    logging.info("Cleaning Data")
    df = pandas.merge(
        df.set_index("Trip ID").groupby(level=0).first().drop(
            columns=["Measure Names", "Measure Values"]),
        df.pivot(index="Trip ID",
                 columns="Measure Names",
                 values="Measure Values"),
        left_index=True,
        right_index=True,
    ).reset_index()
    df = df.rename(
        {
            n: n.lower().strip().replace(" ", "_").replace("(", "").replace(
                ")", "")
            for n in df.columns
        },
        axis="columns",
    )
    check_columns(bike_trips, df)

    # Upload the final dataframe to Postgres. Since pandas timestamps conform to the
    # datetime interface, psycopg can correctly handle the timestamps upon insert.
    logging.info("Uploading to PG")
    engine = PostgresHook.get_hook(POSTGRES_ID).get_sqlalchemy_engine()
    insert = sqlalchemy.dialects.postgresql.insert(
        bike_trips).on_conflict_do_nothing()
    conn = engine.connect()
    conn.execute(insert, *df.to_dict(orient="records"))
Example #28
    def execute(self, context):
        """This function has the definition necessary for creating the fact and dimension tables"""
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
Example #29
    def _write_cursor(self, cursor: int, pg_hook: PostgresHook):
        _cursor = int(cursor)
        pg_hook.run("""INSERT INTO cursor (cursor) VALUES (%s)""", autocommit=True, parameters=[_cursor])
Example #30
def log_bike_ride_frequency():
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT bikeid, count FROM lifetime_rides ORDER BY count DESC LIMIT 5
    """)
    logging.info(f"Bike ride frequency : \n{records}")
Example #31
    def execute(self, context):
        pg_hook = PostgresHook(postgres_conn_id=self._conn_id)
        df = pg_hook.get_pandas_df(sql='SELECT * FROM templates')
        task_instance = context['task_instance']            # type: TaskInstance
        task_instance.xcom_push('database_df', df)
Example #32
    return failed_alert.execute(context=context)

#to read the python script for pulling data from google sheet and putting it into tables in postgres
try:
    sys.path.append('/etc/airflow/data_scripts/vision_zero/')
    from schools import pull_from_sheet
except:
    raise ImportError("Cannot import functions to pull school safety zone list")

#to get credentials to access google sheets
vz_api_hook = GoogleCloudBaseHook('vz_api_google')
cred = vz_api_hook._get_credentials()
service = build('sheets', 'v4', credentials=cred, cache_discovery=False)

#To connect to pgadmin bot
vz_api_bot = PostgresHook("vz_api_bot")
con = vz_api_bot.get_conn()

DEFAULT_ARGS = {
    'owner': 'cnangini',
    'depends_on_past' : False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'start_date': datetime(2019, 9, 30),
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'on_failure_callback': task_fail_slack_alert
}

dag = DAG('vz_google_sheets', default_args=DEFAULT_ARGS, schedule_interval='@daily', catchup=False)
    def tearDown(self):
        postgres = PostgresHook()
        with postgres.get_conn() as conn:
            with conn.cursor() as cur:
                for table in TABLES:
                    cur.execute("DROP TABLE IF EXISTS {} CASCADE;".format(table))
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from S3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:
        - False: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - path/to/cert/bundle.pem: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            verify=None,
            copy_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(S3ToRedshiftTransfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.copy_options = copy_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
    def execute(self, context):
        pg_hook = PostgresHook(self.redshift_conn_id)
        pg_hook.run("DROP TABLE IF EXISTS " + self.full_table_name)
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from S3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            copy_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(S3ToRedshiftTransfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.copy_options = copy_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
Example #37
    def execute(self, context):
        redshift = PostgresHook(self.redshift_conn_id)

        for query in self.queries:
            self.log.info(f"Running {query}")
            redshift.run(query)
Example #38
def move_clean_to_conform(**kwargs):
    ti = kwargs['ti']
    file = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                        key='currently_processing')
    lote_id = ti.xcom_pull(task_ids='generate_unique_lote_id', key='lote_id')
    source = ti.xcom_pull(task_ids='move_csv_to_stagging_area', key='source')
    file_size = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                             key='file_size')
    table = ti.xcom_pull(task_ids='set_base_table', key='table')
    query = """SELECT * FROM {} WHERE loteId = '{}';""".format(table, lote_id)

    src_conn = PostgresHook(postgres_conn_id='etl_stage_clean').get_conn()
    dest_conn = PostgresHook(postgres_conn_id='etl_stage_conform').get_conn()

    src_cursor = src_conn.cursor()
    dest_cursor = dest_conn.cursor()

    src_cursor.execute(query)
    count = 0
    set_lote_id_begin(dest_cursor, lote_id, source, file_size)
    while True:
        records = src_cursor.fetchmany(size=1)
        if not records:
            break
        query_insert = """INSERT INTO {} VALUES %s""".format(table)
        execute_values(dest_cursor, query_insert, records)
        dest_conn.commit()
        count += 1
    set_lote_id_end(dest_cursor, lote_id, count, source, file_size)
    dest_conn.commit()
    src_cursor.close()
    dest_cursor.close()
    src_conn.close()
    dest_conn.close()
    return True
    def execute(self, context):
        pg_hook = PostgresHook(self.redshift_conn_id)
        pg_hook.run("TRUNCATE TABLE " + self.full_table_name)
Example #40
    def execute(self, context):
        _log.info('Executing: ' + str(self.sql))
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            unload_options=tuple(),
            autocommit=False,
            parameters=None,
            include_header=False,
            *args, **kwargs):
        super(RedshiftToS3Transfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.parameters = parameters
        self.include_header = include_header

        if self.include_header and \
           'PARALLEL OFF' not in [uo.upper().strip() for uo in unload_options]:
            self.unload_options = list(unload_options) + ['PARALLEL OFF', ]

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...",
                          self.schema, self.table)

            columns_query = """SELECT column_name
                                        FROM information_schema.columns
                                        WHERE table_schema = '{schema}'
                                        AND   table_name = '{table}'
                                        ORDER BY ordinal_position
                            """.format(schema=self.schema,
                                       table=self.table)

            cursor = self.hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ', '.join("{0}".format(c) for c in columns)
            column_headers = ', '.join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                        for c in columns)

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order, {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order"""\
                            .format(column_names=column_names,
                                    column_castings=column_castings,
                                    column_headers=column_headers,
                                    schema=self.schema,
                                    table=self.table)
        else:
            select_query = "SELECT * FROM {schema}.{table}"\
                .format(schema=self.schema,
                        table=self.table)

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               table=self.table,
                               s3_bucket=self.s3_bucket,
                               s3_key=self.s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #42
    def _fetch_highest_cursor(self, pg_hook: PostgresHook):
        latest_cursor, = pg_hook.get_first('SELECT cursor FROM cursor ORDER BY date DESC LIMIT 1', True) or (0,)
        return None if latest_cursor == 0 else latest_cursor
Example #43
def move_conform_to_final(**kwargs):
    ti = kwargs['ti']
    file = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                        key='currently_processing')
    lote_id = ti.xcom_pull(task_ids='generate_unique_lote_id', key='lote_id')
    source = ti.xcom_pull(task_ids='move_csv_to_stagging_area', key='source')
    file_size = ti.xcom_pull(task_ids='move_csv_to_stagging_area',
                             key='file_size')
    table = ti.xcom_pull(task_ids='set_base_table', key='table')
    query = """SELECT * FROM {} WHERE loteId = '{}';""".format(table, lote_id)

    src_conn = PostgresHook(postgres_conn_id='etl_stage_conform').get_conn()
    dest_conn = PostgresHook(postgres_conn_id='etl_stage_final').get_conn()

    src_cursor = src_conn.cursor()
    dest_cursor = dest_conn.cursor()

    query_header = """ select column_name from information_schema.columns where table_name = '{}';""".format(
        table)
    src_cursor.execute(query_header)
    headers_result = src_cursor.fetchmany(size=1000)
    headers = [str(s[0]).lower() for s in headers_result]

    src_cursor.execute(query)
    count = 0
    # 'lote' = batch; record the start of this batch in the destination database.
    set_lote_id_begin(dest_cursor, lote_id, source, file_size)
    # Copy the matching rows one at a time from the conform stage to the final stage.
    while True:
        records = src_cursor.fetchmany(size=1)
        if not records:
            break
        insert_final_table(dest_cursor, table, records, headers)
        count += 1
    set_lote_id_end(dest_cursor, lote_id, count, source, file_size)
    dest_conn.commit()
    src_cursor.close()
    dest_cursor.close()
    src_conn.close()
    dest_conn.close()
    return True
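
Because move_conform_to_final pulls everything it needs from XCom through kwargs['ti'], it is meant to run as the python_callable of a PythonOperator with the task context provided. A minimal wiring sketch, assuming Airflow 1.x; the DAG id, schedule and the upstream tasks (which must push the XCom keys read above and are not shown) are placeholders.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# Hypothetical DAG wiring; only the callable and the XCom-producing task ids
# ('move_csv_to_stagging_area', 'generate_unique_lote_id', 'set_base_table')
# come from the function above, and those upstream tasks are not shown here.
with DAG('etl_conform_to_final',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:

    move_to_final = PythonOperator(
        task_id='move_conform_to_final',
        python_callable=move_conform_to_final,
        provide_context=True,  # Airflow 1.x: injects ti and other context kwargs
    )
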
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers
    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param s3_conn_id: reference to a specific S3 connection
    :type s3_conn_id: string
    :param options: reference to a list of UNLOAD options
    :type options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            s3_conn_id='s3_default',
            unload_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(RedshiftToS3Transfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.s3_conn_id = s3_conn_id
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        a_key, s_key = self.s3.get_credentials()
        unload_options = ('\n\t\t\t').join(self.unload_options)

        _log.info("Retrieving headers from %s.%s..." % (self.schema, self.table))

        columns_query = """SELECT column_name
                            FROM information_schema.columns
                            WHERE table_schema = '{0}'
                            AND   table_name = '{1}'
                            ORDER BY ordinal_position
                        """.format(self.schema, self.table)

        cursor = self.hook.get_conn().cursor()
        cursor.execute(columns_query)
        rows = cursor.fetchall()
        # Materialize the column list; a map object would be exhausted by the
        # first join under Python 3, leaving column_castings empty.
        columns = [row[0] for row in rows]
        column_names = ', '.join("\\'{0}\\'".format(c) for c in columns)
        column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                    for c in columns)

        unload_query = """
                        UNLOAD ('SELECT {0}
                        UNION ALL
                        SELECT {1} FROM {2}.{3}')
                        TO 's3://{4}/{5}/{3}_'
                        with
                        credentials 'aws_access_key_id={6};aws_secret_access_key={7}'
                        {8};
                        """.format(column_names, column_castings, self.schema, self.table,
                                self.s3_bucket, self.s3_key, a_key, s_key, unload_options)

        _log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        _log.info("UNLOAD command complete...")
Ejemplo n.º 45
0
 def _clear_cursor(self, pg_hook: PostgresHook):
     # Empty the bookkeeping table; autocommit=True commits the DELETE immediately.
     pg_hook.run('DELETE FROM cursor', True)
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers

    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            verify=None,
            unload_options=tuple(),
            autocommit=False,
            include_header=False,
            *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.include_header = include_header

        if self.include_header and \
           'PARALLEL OFF' not in [uo.upper().strip() for uo in unload_options]:
            self.unload_options = list(unload_options) + ['PARALLEL OFF', ]

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...",
                          self.schema, self.table)

            columns_query = """SELECT column_name
                                        FROM information_schema.columns
                                        WHERE table_schema = '{schema}'
                                        AND   table_name = '{table}'
                                        ORDER BY ordinal_position
                            """.format(schema=self.schema,
                                       table=self.table)

            cursor = self.hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ', '.join("{0}".format(c) for c in columns)
            column_headers = ', '.join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                        for c in columns)

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order, {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order"""\
                            .format(column_names=column_names,
                                    column_castings=column_castings,
                                    column_headers=column_headers,
                                    schema=self.schema,
                                    table=self.table)
        else:
            select_query = "SELECT * FROM {schema}.{table}"\
                .format(schema=self.schema,
                        table=self.table)

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               table=self.table,
                               s3_bucket=self.s3_bucket,
                               s3_key=self.s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
 def execute(self, context):
     redshift_hook = PostgresHook(self.redshift_conn_id)
     redshift_hook.run(str(self.sql_query))
def load_data_to_redshift(*args, **kwargs):
    aws_hook = AwsHook("aws_credentials")
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook("redshift")
    redshift_hook.run(sql_statements.COPY_ALL_TRIPS_SQL.format(
        credentials.access_key, credentials.secret_key))
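
sql_statements.COPY_ALL_TRIPS_SQL itself is not shown in this example; since it is formatted with the access key and secret key positionally, it is presumably a Redshift COPY template roughly like the sketch below. The table name, S3 path and options are assumptions, not the original values.

# Assumed shape of sql_statements.COPY_ALL_TRIPS_SQL: two positional
# placeholders for the credentials; table name and S3 path are illustrative.
COPY_ALL_TRIPS_SQL = """
    COPY trips
    FROM 's3://my-bucket/trips/'
    CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}'
    IGNOREHEADER 1
    DELIMITER ','
"""
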
Ejemplo n.º 49
0
 def execute(self, context):
     logging.info('Executing: ' + str(self.sql))
     self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                              schema=self.database)
     self.hook.run(self.sql, self.autocommit, parameters=self.parameters)