def execute(self, context):
        """
            Copy data from S3 buckets to staging tables in redhsift cluster.
                - redshift_conn_id: redshift cluster connection
                - aws_credentials_id: AWS connection
                - table: redshift cluster table name
                - s3_bucket: S3 bucket where source data resides
                - s3_key: S3 prefix
                - file_format: source file format - options JSON, CSV
        """
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Deleting data from Redshift staging tables")
        redshift.run("DELETE FROM {}".format(self.table))

        self.log.info("Copying data from S3 to Redshift staging tables")
        s3_path = "s3://{}".format(self.s3_bucket)
        if self.execution_date:
            year = self.execution_date.strftime("%Y")
            month = self.execution_date.strftime("%m")
            day = self.execution_date.strftime("%d")
            s3_path = '/'.join([s3_path, str(year), str(month), str(day)])
        s3_path = s3_path + '/' + self.s3_key

        additional = ""
        if self.file_format == 'CSV':
            additional = " DELIMITER ',' IGNOREHEADER 1 "

        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table, s3_path, credentials.access_key,
            credentials.secret_key, self.region, self.file_format, additional)
        redshift.run(formatted_sql)

        self.log.info(f"Success: Copied {self.table} from S3 to Redshift")
Example #2
    def execute(self, context):
        '''
        Performs the data load when the operator is executed.
        Arguments:
            self: reference to the object
            context: context provided when the operator is called
        Returns: None
        '''
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Creating the fact table in redshift before insert")
        redshift.run(format(self.create_table_sql))

        self.log.info('Insert to the respective fact table')

        if self.mode == "append":
            insert_sql = f"INSERT INTO {self.target_table} {self.insert_table_sql}"
            self.log.info("Command is " + insert_sql)
        else:
            insert_sql = f"DELETE FROM {self.target_table}; INSERT INTO {self.target_table} {self.insert_table_sql}"
            self.log.info("Command is " + insert_sql)
        redshift.run(insert_sql)
Example #3
    def execute(self, context):

        # retrieve aws credentials
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()

        # connect to redshift
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        # clear redshift destination
        if self.clear_table:
            self.log.info(
                "Clearing data from destination Redshift table {}".format(
                    self.table))
            redshift.run("DELETE FROM {}".format(self.table))

        # copy data from S3 to redshift
        self.log.info("Copying data from S3 to Redshift table {}".format(
            self.table))

        # render s3 path
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        self.log.info(s3_path)

        # render jsonpath if not "auto"
        if self.json_path != "auto":
            json = "s3://{}/{}".format(self.s3_bucket, self.json_path)
        else:
            json = self.json_path

        # format sql using copy_sql
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table, s3_path, credentials.access_key,
            credentials.secret_key, json)
        redshift.run(formatted_sql)
Example #4
    def execute(self, context):
        # AWS Hook
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        # RedShift Hook
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        # Get number of records in the table
        records = redshift.get_records(
            f"SELECT COUNT(*) FROM {self.table_name}")
        # Fields and data
        df = redshift.get_pandas_df(self.sql)
        fields = list(df.columns.values)
        data_rows = redshift.get_records(self.sql)

        if self.load_mode == "clean":
            # Clear data
            self.log.info(f"Clearing data from {self.table_name} table")
            redshift.run("DELETE FROM {}".format(self.table_name))
            self.log.info(
                f"Deleted {records[0][0]} records from {self.table_name}")
        else:
            job_execution_ts = self.filter_key[0].format(**context)
            next_job_execution_ts = self.filter_key[1].format(**context)
            filtered_df = df[(df['start_time'] >= job_execution_ts)
                             & (df['start_time'] < next_job_execution_ts)]
            data_rows = [tuple(x) for x in filtered_df.values]

        # Populate table
        self.log.info("Populating data to {} table".format(self.table_name))
        redshift.insert_rows(table=self.table_name,
                             rows=data_rows,
                             target_fields=fields,
                             commit_every=1000,
                             replace=False)
        self.log.info("Inserted {} records to {}".format(
            len(data_rows), self.table_name))
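The `filter_key` pair rendered above is not defined in this snippet; a hypothetical value would be two templated bounds that `str.format(**context)` can resolve from the Airflow context, for example:

    # Hypothetical filter_key pair (an assumption, not shown in the snippet):
    # lower and upper bounds of the incremental window, rendered via format(**context).
    filter_key = ("{ts}", "{next_execution_date}")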
    def execute(self, context):
        """
        Loads the data from S3 to a staging table in Redshift.
        :param context: Airflow's context.
        """
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        s3_path = f"s3://{self.s3_bucket}/{self.s3_key}"
        self.log.info(f'The source path is: {s3_path}')
        if self.file_type.lower() == 'json':
            formatted_sql = StageToRedshiftOperator.copy_json.format(
                self.table, s3_path, credentials.access_key,
                credentials.secret_key, self.json_parameter)
        elif self.file_type.lower() == 'csv':
            formatted_sql = StageToRedshiftOperator.copy_csv.format(
                self.table, s3_path, credentials.access_key,
                credentials.secret_key, self.ignore_headers, self.delimiter)
        else:
            raise ValueError('file_type must be either json or csv')

        self.log.info(f'About to execute the command: {formatted_sql}')
        redshift.run(formatted_sql)
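The `copy_json` and `copy_csv` templates referenced above are not shown. A sketch of plausible shapes, matching the positional arguments passed to `format()` in each branch (an assumption, not the confirmed templates):

    # Assumed shapes of the two COPY templates used above.
    copy_json = """
        COPY {}
        FROM '{}'
        ACCESS_KEY_ID '{}'
        SECRET_ACCESS_KEY '{}'
        FORMAT AS JSON '{}';
    """

    copy_csv = """
        COPY {}
        FROM '{}'
        ACCESS_KEY_ID '{}'
        SECRET_ACCESS_KEY '{}'
        IGNOREHEADER {}
        DELIMITER '{}';
    """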
Example #6
    def execute(self, context):
        # Get AWS Credential
        aws_hook = AwsHook(self.emr_credential_id)
        credentials = aws_hook.get_credentials()

        # Get EMR Client Instance
        emr = boto3.client('emr',
                           'us-west-2',
                           aws_access_key_id=credentials.access_key,
                           aws_secret_access_key=credentials.secret_key)

        # Launching EMR Cluster
        try:
            response = emr.run_job_flow(**self.JOBFLOW_OVERWRITE)
        except Exception as e:
            self.log.error(e)
            raise ValueError('EMR LAUNCHING FAILED')

        jobFlowId = response['JobFlowId']
        state = emr.describe_cluster(
            ClusterId=jobFlowId)['Cluster']['Status']['State']

        self.log.info(f'Cluster State: {state}')

        # wait for emr to launch
        while state == 'STARTING':
            time.sleep(15)
            state = emr.describe_cluster(
                ClusterId=jobFlowId)['Cluster']['Status']['State']

            self.log.info(f'Cluster State:{state}')

        self.log.info(f'EMR STATE:{state}')
        context['ti'].xcom_push(key='EMR_ID', value=jobFlowId)

        return jobFlowId
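Because the job flow id is pushed to XCom under the key `EMR_ID`, a downstream task can pick it up with `xcom_pull`. A minimal sketch, assuming the task above runs under a hypothetical task id `create_emr_cluster`:

    # Sketch of a downstream operator's execute() pulling the EMR job flow id.
    # 'create_emr_cluster' is a hypothetical upstream task id.
    def execute(self, context):
        job_flow_id = context['ti'].xcom_pull(task_ids='create_emr_cluster', key='EMR_ID')
        self.log.info(f'Using EMR cluster {job_flow_id}')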
Example #7
    def execute(self, context):
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Clearing data from destination Redshift table")
        redshift.run(f"DELETE FROM {self.table}")

        #self.log.info("Key = ", credentials.access_key)
        #self.log.info("Secret = ", credentials.secret_key)

        self.log.info("Copying data from S3 to Redshift")
        #self.log.info("s3_key = ",self.s3_key)
        #self.log.info("context = ",context)

        #rendered_key = self.s3_key.format(**context)
        #self.log.info("rendered_key : s3_key : ",rendered_key)
        #s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
        formatted_sql = StageToRedshiftOperator.COPY_SQL.format(
            self.table, s3_path, credentials.access_key,
            credentials.secret_key, self.more_options)

        redshift.run(formatted_sql)
Example #8
    def execute(self, context, *args, **kwargs):
        logging.info("<<<<<<Starting Execution>>>>>")
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift_hook = PostgresHook("redshift")
        if (self.filetype == 'json'):
            fix = " json 'auto';"
        else:
            fix = " csv  IGNOREHEADER 1;"

        logging.info("File Type Detected")
        logging.info(fix)
        logging.info("Clearing data from destination Redshift table")
        redshift_hook.run("DELETE FROM {}".format(self.table))
        logging.info("Copying data from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table, s3_path, credentials.access_key,
            credentials.secret_key, fix)
        logging.info("SQL Generated")
        logging.info(formatted_sql)
        redshift_hook.run(formatted_sql)
        logging.info("Data Pushed to Staging Table")
Example #9
    def execute(self, context):
        """
        Copy data from S3 buckets to redshift cluster into staging tables.
        """

        self.log.info('Fetching Credentials')
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Clearing data from destination Redshift table")
        redshift.run("DELETE FROM {}".format(self.table))

        self.log.info("Copying data from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.json_params
        )
        redshift.run(formatted_sql)
Example #10
    def execute(self, context):
        '''
        This function copies JSON data from the S3 bucket into the target table in Redshift.
        This function runs automatically when operator is called. 
        '''
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info('Clearing data from destination Redshift table')
        redshift.run('DELETE FROM {}'.format(self.table))

        self.log.info('Copying data from S3 to Redshift')
        rendered_key = self.s3_key.format(**context)
        s3_path = 's3://{}/{}'.format(self.s3_bucket, rendered_key)

        copy_sql = """
        COPY {}
        FROM '{}'
        IAM_ROLE '{}'
        FORMAT AS PARQUET;
        """
        formatted_sql = copy_sql.format(self.table, s3_path, self.iam_role_arn)
        redshift.run(formatted_sql)
    def execute(self, context):
        self.log.info('StageToRedshiftOperator starting')
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        
        self.log.info("Copying data from S3 to Redshift")

        copy_to_redshift = ("""
                                copy {0}
                                from 's3://{1}/{2}'
                                access_key_id '{3}' secret_access_key '{4}'
                                compupdate off region '{5}'
                                FORMAT AS JSON '{6}';
                                """.format(
                                self.table, self.s3_bucket, self.s3_key, 
                                credentials.access_key, credentials.secret_key,
                                self.region,
                                self.json)
                            )

        redshift.run(copy_to_redshift)

        self.log.info("Copy command completed")
Example #12
    def execute(self, context):
        """
        LoadDimFileDataOperator execute

        Loads a data file in JSON or CSV format from an S3 bucket into a Redshift table. This
        operator will only load data into a table if it is empty.

        :param context:
        """
        self.log.info("Using data file {} from S3 bucket {}".format(self.s3_key, self.s3_bucket))

        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        # only copy data if dimension table is empty
        records = redshift.get_records("SELECT COUNT(*) FROM {}".format(self.table))
        num_records = records.pop(0)[0]

        if num_records == 0:
            self.log.info("Inserting into destination Redshift table {}".format(self.table))

            formatted_sql = LoadDimFileDataOperator.copy_sql.format(
                self.table,
                self.s3_bucket,
                self.s3_key,
                credentials.access_key,
                credentials.secret_key,
                self.copy_format
            )
            redshift.run(formatted_sql)
        else:
            self.log.info("Found {} records in {}".format(num_records, self.table))
            self.log.info("{} table data is already set".format(self.table))

        self.log.info("Data copy complete")
Example #13
    def execute(self, context):

        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()

        redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info(f"Copying {self.table} from S3 to Redshift")

        copy_query = """
                    COPY {table}
                    FROM '{s3_path}'
                    ACCESS_KEY_ID '{access_key}'
                    SECRET_ACCESS_KEY '{secret_key}'
                    REGION 'us-west-2'
                    FORMAT AS JSON '{json_path}'
                """.format(table=self.table,
                           s3_path=self.s3_path,
                           access_key=credentials.access_key,
                           secret_key=credentials.secret_key,
                           json_path=self.json_path)

        redshift_hook.run(copy_query)
        self.log.info(f"Copy {self.table} complete.")
    def execute(self, context):
        self.log.info("Connecting to AWS")
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        self.log.info("Connecting to Redshift using postgress hook")
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        
        self.log.info("Clearing data in destination table..")
        redshift.run("DELETE FROM {}".format(self.table_name))

        self.log.info("Copying data from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table_name,
            "s3://{}/{}".format(self.s3_bucket, rendered_key),
            credentials.access_key,
            credentials.secret_key,
            self.region,
            self.json_path
        )
        self.log.info(f'running query: {formatted_sql}')
        redshift.run(formatted_sql)
        
        self.log.info(f'Successful copy of {self.table_name} from S3 to Redshift')
Example #15
    def execute(self, context):
        self.log.info('Starting StageToRedshiftOperator')
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(self.redshift_conn_id)

        self.log.info('Drop stage_event table')
        redshift.run(StageToRedshiftOperator.table_drop.format(
            self.table_name))

        self.log.info('Create stage_event table')
        if self.table_name == 'staging_events':
            redshift.run(StageToRedshiftOperator.staging_event_table_create)
        else:
            redshift.run(StageToRedshiftOperator.staging_songs_table_create)

        self.log.info("Copy data from S3 to table")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}/".format(self.s3_bucket, rendered_key)
        redshift.run(
            StageToRedshiftOperator.copy_table.format(self.table_name, s3_path,
                                                      credentials.access_key,
                                                      credentials.secret_key,
                                                      self.copy_format))
Example #16
    def execute(self, context):
        self.log.info('--------------------------------------------------------------------')
        self.log.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ' Starting loading stage tables')
        self.log.info('Trying to collect AWS Credentials')
        aws_hook = AwsHook(self.aws_credentials)
        credentials = aws_hook.get_credentials()
        self.log.info('Successfully collected AWS Credentials')
        self.log.info('Trying to connect to Redshift')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Copying data from S3 to Redshift")
        key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, key)

        if self.file_type == "json":
            copy_json_sql = StageToRedshiftOperator.copy_json_statement.format(
                self.redshift_table,
                s3_path,
                credentials.access_key,
                credentials.secret_key,
                self.json_path
            )
            redshift.run(copy_json_sql)

        if self.file_type == "csv":
            copy_csv_sql = StageToRedshiftOperator.copy_csv_to_sql.format(
                self.redshift_table,
                s3_path,
                credentials.access_key,
                credentials.secret_key,
                self.ignore_headers,
                self.delimiter
            )
            redshift.run(copy_csv_sql)
        self.log.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ' Loading stage tables finished successfully')
        self.log.info('--------------------------------------------------------------------')
    def execute(self, context):
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Clearing data from destination Redshift table")
        redshift.run("DELETE FROM {}".format(self.table))

        self.log.info("Copying data from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        self.log.info("Rendered_key: {}".format(rendered_key))
        self.log.info("s3_path: {}".format(s3_path))
        self.log.info("access_key: {}".format(credentials.access_key))
        self.log.info("json: {}".format(self.json))

        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            self.json,
            credentials.access_key,
            credentials.secret_key,
        )
        redshift.run(formatted_sql)
    def execute(self, context):

        self.log.info('Loading staging table')
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.conn_id)
        
        #self.log.info('Deleting staging table')
        #redshift.run("DELETE FROM {}".format(self.table))
        
        self.log.info("Copying from S3 to redshift")
        rend_key = self.s3_key.format(**context)
        s3_path = "s3://{}{}".format(self.s3_bucket,rend_key)
       
        final_copy_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.region,
            self.jsonpath
        )
        redshift.run(final_copy_sql)
        self.log.info(f"Staging table {self.table} loaded successfully")
    def execute(self, context):
        # AWS Hook
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()

        # Postgres Hook
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        # Delete previous rows
        self.log.info("Deleting {} table's content.".format(self.table))
        redshift.run("DELETE FROM {}".format(self.table))

        s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.data_format
        )

        # Copy data from S3 to Redshift
        self.log.info("Copying data from S3 to Redshift. Table: {}.".format(self.table))
        redshift.run(formatted_sql)
    def execute(self, context):
        '''
        Create a connection to AWS using the aws_credentials and create a PostgresHook to connect to Amazon Redshift.
        Get the data from s3 bucket and load it to redshift tables

        '''
        aws_hook = AwsHook(self.aws_credentials)
        credentials = aws_hook.get_credentials()
        redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        #execution_date = kwargs["execution_date"]

        self.log.info("Clearing data from destination Redshift table")
        redshift_hook.run("DELETE FROM {}".format(self.table))

        self.log.info(f"Copying data to {self.table}")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        formatted_sql = ''
        if self.json == '':
            formatted_sql = StageToRedshiftOperator.copy_sql_csv.format(
                self.table, s3_path, credentials.access_key,
                credentials.secret_key, self.deli)
        else:
            formatted_sql = StageToRedshiftOperator.copy_sql_json.format(
                self.table, s3_path, credentials.access_key,
                credentials.secret_key, self.json)
        redshift_hook.run(formatted_sql)
        self.log.info(f"Copying data to {self.table} completed")


#
#
# COPY tablename
# FROM 'data_source'
# CREDENTIALS 'credentials-args'
# FORMAT AS { AVRO | JSON } 's3://jsonpaths_file';
    def execute(self, context):
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info('Checking for counts')
        for table in self.tables:
            records = redshift.get_records(f"Select count(*) from {table}")[0]
            if records[0] == 0:
                raise ValueError(
                    f"Data quality check failed. {table} contains zero rows")
            else:
                self.log.info(
                    f"Count checks passed for {table},{records[0]} records present in the table"
                )

        self.log.info('Checking for Null ids')
        for check in self.checks:
            records = redshift.get_records(check['check_sql'])[0]
            if records[0] != check['expected_result']:
                raise ValueError(
                    f"Data quality check failed. {check['table']} contains null in id column, , got {records[0]} records instead"
                )
            else:
                self.log.info(f"Null id checks passed for {check['table']}")
    def execute(self, context):
        # Fetch credentials from Airflow
        aws_hook = AwsHook(self.aws_credentials_id)
        redshift_hook = PostgresHook(self.redshift_conn_id)

        if aws_hook is not None and redshift_hook is not None:
            credentials = aws_hook.get_credentials()

            # Resolve the S3 key; expand to include the given year and month
            s3_key = self.s3_key.format(**context)
            s3_path = f"s3://{self.s3_bucket}/{s3_key}"

            # Clear the table if needed
            self.log.info(
                f"Clearing data from destination Redshift table ({self.table})"
            )
            redshift_hook.run(f"TRUNCATE {self.table}")

            self.log.info(f"Ingesting S3 ({s3_path}) to {self.table} [start]")

            cmd = f"COPY {self.table} FROM '{s3_path}' ACCESS_KEY_ID '{credentials.access_key}' SECRET_ACCESS_KEY '{credentials.secret_key}' JSON '{self.json_path}' COMPUPDATE OFF"
            redshift_hook.run(cmd)

            self.log.info(f"Ingesting S3 ({s3_path}) to {self.table} [end]")
Example #23
    def execute(self, context):
        self.log.info('StageToRedshiftOperator starting... bucket=%s, json_path=%s',
                      self.s3_bucket, self.json_path)
        redshift = PostgresHook(self.redshift_conn_id)
        path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
        if self.json_path != "auto":
            json_path = "s3://{}/{}".format(self.s3_bucket, self.json_path)
        else:
            json_path = self.json_path

        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()

        query = StageToRedshiftOperator.copy_sql.format(
            self.table,
            path,
            credentials.access_key,
            credentials.secret_key,
            self.region,
            json_path,
        )
        redshift.run(query)

        self.log.info(f'Copied {self.table} to Redshift successfully...')
 def execute(self, context):
     """
     This operatior takes data from S3 storage and put it into staging tables which from it's parsed to target tables.
     :param context: specified above
     :return: staging tables
     """
     self.log.info(f'Staging phase start to table: {self.table}')
     aws_hook = AwsHook(self.aws_conn_id)
     credentials = aws_hook.get_credentials()
     redshift_hook = PostgresHook(self.redshift_conn_id)
     redshift_hook.run(
         SqlQueries.truncate_table.format(self.table)
     )
     redshift_hook.run(
         SqlQueries.copy_tables_to_stage.format(
             self.table,
             self.s3_bucket,
             self.s3_key,
             credentials.access_key,
             credentials.secret_key,
             self.json_path
         )
     )
     self.log.info(f'Staging phase finished to table: {self.table}')
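The `SqlQueries.truncate_table` and `SqlQueries.copy_tables_to_stage` templates are not part of this snippet. A minimal sketch consistent with the argument order used above (an assumption, not the confirmed definitions):

# Assumed shapes of the SqlQueries templates referenced above.
class SqlQueries:
    truncate_table = "TRUNCATE TABLE {}"
    copy_tables_to_stage = """
        COPY {}
        FROM 's3://{}/{}'
        ACCESS_KEY_ID '{}'
        SECRET_ACCESS_KEY '{}'
        FORMAT AS JSON '{}';
    """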
Example #25
    def execute(self, context):
        # Connections
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        # Clear existing raw data
        self.log.info('Clearing data from destination Redshift table')
        redshift.run("DELETE FROM {}".format(self.table))

        # copy the new raw data
        self.log.info(f"In progress: Copying {self.table} from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)

        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            table_name=self.table,
            s3_path=s3_path,
            access_key=credentials.access_key,
            secret_key=credentials.secret_key,
            file_type=self.file_type)
        redshift.run(formatted_sql)

        self.log.info(f'Done: Copying {self.table} from S3 to Redshift')
Example #26
    def execute(self, context):
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Clearing data from Redshift destination table")
        redshift.run("DELETE FROM {}".format(self.table))

        self.log.info("Copying data from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        rendered_date = self.execution_date.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.ignore_headers,
            self.delimiter,
            self.json_path,
            self.data_format,
            year=rendered_date.year,
            month=rendered_date.month)
        redshift.run(formatted_sql)
Example #27
    def execute(self, context):
        """
        Executes the operator logic

        :param context:
        """

        self.log.info('StageCsvToRedshiftOperator execute')

        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        self.log.info("Clearing data from destination Redshift table")
        redshift.run("DELETE FROM {}".format(self.table))

        self.log.info("Copying data from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        self.log.info('StageCsvToRedshiftOperator s3_path: ' + s3_path)
        formatted_sql = StageJsonToRedshiftOperator.copy_sql.format(
            self.table, s3_path, credentials.access_key,
            credentials.secret_key)
        redshift.run(formatted_sql)
Example #28
 def execute(self, context):
     self.log.info('Inside StageToRedshiftOperator -->>>')
     s3_path = "s3://{}/{}"
     delete_sql = "Delete from {}".format(self.table_name)
     # Using AWS Hook to get aws credentials from Airflow
     aws_hook = AwsHook(self.aws_credential_id)
     aws_credentials = aws_hook.get_credentials()
     # Using PostgresHook to get Redshift connection details from Airflow
     redshift = PostgresHook(self.redshift_connection_id)
     # Replaces with the execution date and month using JINJA Templates
     updated_key = self.s3_key.format(**context)
     s3_location = s3_path.format(self.s3_bucket, updated_key)
     # Formatting the sql statements with actual table name, aws cred details
     stage_load_sql = StageToRedshiftOperator.stage_load_sql.format(
         self.table_name, s3_location, aws_credentials.access_key,
         aws_credentials.secret_key, self.json_path)
     # Executing delete sql statement
     redshift.run(delete_sql)
     self.log.info("Deleted data from staging table {}".format(
         self.table_name))
     # Executing load sql statement
     redshift.run(stage_load_sql)
     self.log.info("Data loaded successfully to staging table {}".format(
         self.table_name))
Example #29
 def execute(self, context):
     aws_hook = AwsHook(self.aws_creds)
     creds = aws_hook.get_credentials()
     rs_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     
     self.log.info("Dropping old stagings tables!")
     dropst = CreateTables.dropsql(self.table)
     rs_hook.run(dropst)
     
     self.log.info("createsql staging table")
     rs_hook.run(self.createsql)
     
     self.log.info("Copying data from S3 to Redshift for " + self.table)
     s3_path = "s3://" + self.s3_bucket + "/" + self.s3_key
     formatted_sql = StageToRedshiftOperator.copy_sql.format(
             self.table,
             s3_path,
             creds.access_key,
             creds.secret_key,
             self.s3_region,
             self.s3_jsondetails)
     rs_hook.run(formatted_sql)
     
     self.log.info('StageToRedshiftOperator finished!')
 def execute(self, context):
     #self.log.info('StageToRedshiftOperator not implemented yet')
     aws_hook = AwsHook(self.aws_credentials_id)
     credentials = aws_hook.get_credentials()
     redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     self.log.info("Clearing data from destination Redshift table (if it exists)")
     redshift_hook.run(f"DELETE FROM {self.destination_table}")
     self.log.info("Copying data from S3 to Redshift")
     # http://s3-us-west-2.amazonaws.com/udacity-dend/log-data/2018/11/2018-11-04-events.json
     #s3_path = f"s3://{self.s3_bucket}/{self.s3_key}/{ execution_date.strftime('%Y')}/{{ execution_date.strftime('%m') }}/{{ ds }}-events.json"
     if self.s3_key == 'song_data':
         s3_path = "s3://{bucket}/{key}".format(bucket=self.s3_bucket, key=self.s3_key)
     else:
         execution_date = context['execution_date']
         s3_key = '{key}/{y}/{m:02d}/{ds}-events.json'.format(key=self.s3_key, y=execution_date.year, m=execution_date.month, ds=context['ds'])
         s3_path = "s3://{bucket}/{key}".format(bucket=self.s3_bucket, key=s3_key)
     if self.input_file_type == 'csv':
         copy_sql = StageToRedshiftOperator.copy_sql_csv.format(self.destination_table,
                                                                s3_path,
                                                                credentials.access_key,
                                                                credentials.secret_key,
                                                                self.region,
                                                                self.delimiter,
                                                                self.ignore_headers)
     else: # json (default)
         if self.s3_key == 'log_data':
             jsonpath = 's3://{}/log_json_path.json'.format(self.s3_bucket)
         else:
             jsonpath = 'auto'
         copy_sql = StageToRedshiftOperator.copy_sql_json.format(self.destination_table,
                                                                 s3_path,
                                                                 credentials.access_key,
                                                                 credentials.secret_key,
                                                                 self.region,
                                                                 jsonpath
                                                                )
     redshift_hook.run(copy_sql)
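For context, a hedged sketch of how a staging operator like the one above is typically instantiated in a DAG; the task id, connection ids, bucket, and key below are hypothetical placeholders (the bucket name follows the commented URL in the snippet):

# Hypothetical DAG wiring; argument names mirror the attributes used in the snippet above.
stage_events = StageToRedshiftOperator(
    task_id='stage_events_to_redshift',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    destination_table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    region='us-west-2',
    input_file_type='json',
)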