Example #1
    def test_check_for_wildcard_key(self):
        hook = S3Hook(aws_conn_id=None)
        bucket = hook.get_bucket('bucket')
        bucket.create()
        bucket.put_object(Key='abc', Body=b'a')
        bucket.put_object(Key='a/b', Body=b'a')

        self.assertTrue(hook.check_for_wildcard_key('a*', 'bucket'))
        self.assertTrue(hook.check_for_wildcard_key('s3://bucket//a*'))
        self.assertTrue(hook.check_for_wildcard_key('abc', 'bucket'))
        self.assertTrue(hook.check_for_wildcard_key('s3://bucket//abc'))
        self.assertFalse(hook.check_for_wildcard_key('a', 'bucket'))
        self.assertFalse(hook.check_for_wildcard_key('s3://bucket//a'))
        self.assertFalse(hook.check_for_wildcard_key('b', 'bucket'))
        self.assertFalse(hook.check_for_wildcard_key('s3://bucket//b'))
Example #2
    def test_download_file(self, mock_temp_file):
        mock_temp_file.return_value.__enter__ = Mock(return_value=mock_temp_file)
        s3_hook = S3Hook(aws_conn_id='s3_test')
        s3_hook.check_for_key = Mock(return_value=True)
        s3_obj = Mock()
        s3_obj.download_fileobj = Mock(return_value=None)
        s3_hook.get_key = Mock(return_value=s3_obj)
        key = 'test_key'
        bucket = 'test_bucket'

        s3_hook.download_file(key=key, bucket_name=bucket)

        s3_hook.check_for_key.assert_called_once_with(key, bucket)
        s3_hook.get_key.assert_called_once_with(key, bucket)
        s3_obj.download_fileobj.assert_called_once_with(mock_temp_file)
Example #3
    def test_list_prefixes_paged(self):
        hook = S3Hook(aws_conn_id=None)
        bucket = hook.get_bucket('bucket')
        bucket.create()

        # we don't need to test the paginator;
        # that's covered by boto tests
        keys = ["%s/b" % i for i in range(2)]
        dirs = ["%s/" % i for i in range(2)]
        for key in keys:
            bucket.put_object(Key=key, Body=b'a')

        self.assertListEqual(sorted(dirs),
                             sorted(hook.list_prefixes('bucket', delimiter='/',
                                                       page_size=1)))
Example #4
    def execute(self, context) -> None:
        postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        credentials = s3_hook.get_credentials()
        credentials_block = build_credentials_block(credentials)
        unload_options = '\n\t\t\t'.join(self.unload_options)

        unload_query = self._build_unload_query(
            credentials_block, self._select_query, self.s3_key, unload_options
        )

        self.log.info('Executing UNLOAD command...')
        postgres_hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #5
    def test_load_file_acl(self, s3_bucket):
        hook = S3Hook()
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(b"Content")
            temp_file.seek(0)
            hook.load_file(temp_file.name,
                           "my_key",
                           s3_bucket,
                           gzip=True,
                           acl_policy='public-read')
            response = boto3.client('s3').get_object_acl(
                Bucket=s3_bucket, Key="my_key", RequestPayer='requester')  # pylint: disable=no-member
            assert ((response['Grants'][1]['Permission'] == 'READ') and
                    (response['Grants'][0]['Permission'] == 'FULL_CONTROL'))
            os.unlink(temp_file.name)
Example #6
    def test_check_for_wildcard_key(self, s3_bucket):
        hook = S3Hook()
        bucket = hook.get_bucket(s3_bucket)
        bucket.put_object(Key='abc', Body=b'a')
        bucket.put_object(Key='a/b', Body=b'a')

        assert hook.check_for_wildcard_key('a*', s3_bucket) is True
        assert hook.check_for_wildcard_key('abc', s3_bucket) is True
        assert hook.check_for_wildcard_key('s3://{}//a*'.format(s3_bucket)) is True
        assert hook.check_for_wildcard_key('s3://{}//abc'.format(s3_bucket)) is True

        assert hook.check_for_wildcard_key('a', s3_bucket) is False
        assert hook.check_for_wildcard_key('b', s3_bucket) is False
        assert hook.check_for_wildcard_key('s3://{}//a'.format(s3_bucket)) is False
        assert hook.check_for_wildcard_key('s3://{}//b'.format(s3_bucket)) is False
Example #7
def upload_to_s3(state, date):
    '''Grabs data from Covid endpoint and saves to flat file on S3
    '''
    # Connect to S3
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)

    # Get data from API
    url = 'https://covidtracking.com/api/v1/states/'
    res = requests.get(url + '{0}/{1}.csv'.format(state, date))

    # Save data to CSV on S3
    s3_hook.load_string(res.text,
                        '{0}_{1}.csv'.format(state, date),
                        bucket_name=bucket,
                        replace=True)
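In a DAG file, a callable like upload_to_s3 is usually wired into the pipeline through a PythonOperator. Below is a minimal sketch of that wiring; the DAG id, schedule, and the s3_conn_id/bucket module globals are illustrative assumptions rather than part of the example above.

# Hypothetical wiring for upload_to_s3; all names below are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

s3_conn_id = 'aws_default'      # assumed Airflow connection id
bucket = 'covid-data-bucket'    # assumed destination bucket

with DAG('covid_to_s3',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:
    upload = PythonOperator(
        task_id='upload_to_s3',
        python_callable=upload_to_s3,
        # op_kwargs is templated, so ds_nodash resolves per DAG run
        op_kwargs={'state': 'ny', 'date': '{{ ds_nodash }}'},
    )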
Example #8
    def execute(self, context: 'Context'):
        """
        Executes AWS Glue Job from Airflow

        :return: the id of the current glue job.
        """
        if not self.script_location.startswith(self.s3_protocol):
            s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
            script_name = os.path.basename(self.script_location)
            s3_hook.load_file(self.script_location,
                              self.s3_artifacts_prefix + script_name,
                              bucket_name=self.s3_bucket)
            s3_script_location = f"s3://{self.s3_bucket}/{self.s3_artifacts_prefix}{script_name}"
        else:
            s3_script_location = self.script_location
        glue_job = GlueJobHook(
            job_name=self.job_name,
            desc=self.job_desc,
            concurrent_run_limit=self.concurrent_run_limit,
            script_location=s3_script_location,
            retry_limit=self.retry_limit,
            num_of_dpus=self.num_of_dpus,
            aws_conn_id=self.aws_conn_id,
            region_name=self.region_name,
            s3_bucket=self.s3_bucket,
            iam_role_name=self.iam_role_name,
            create_job_kwargs=self.create_job_kwargs,
        )
        self.log.info(
            "Initializing AWS Glue Job: %s. Wait for completion: %s",
            self.job_name,
            self.wait_for_completion,
        )
        glue_job_run = glue_job.initialize_job(self.script_args,
                                               self.run_job_kwargs)
        if self.wait_for_completion:
            glue_job_run = glue_job.job_completion(self.job_name,
                                                   glue_job_run['JobRunId'])
            self.log.info(
                "AWS Glue Job: %s status: %s. Run Id: %s",
                self.job_name,
                glue_job_run['JobRunState'],
                glue_job_run['JobRunId'],
            )
        else:
            self.log.info("AWS Glue Job: %s. Run Id: %s", self.job_name,
                          glue_job_run['JobRunId'])
        return glue_job_run['JobRunId']
Example #9
    def execute(self, context) -> None:
        mysql_hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        s3_conn = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        data_df = mysql_hook.get_pandas_df(self.query)
        self.log.info("Data from MySQL obtained")

        self._fix_int_dtypes(data_df)
        with NamedTemporaryFile(mode='r+', suffix='.csv') as tmp_csv:
            data_df.to_csv(tmp_csv.name, **self.pd_csv_kwargs)
            s3_conn.load_file(filename=tmp_csv.name,
                              key=self.s3_key,
                              bucket_name=self.s3_bucket)

        if s3_conn.check_for_key(self.s3_key, bucket_name=self.s3_bucket):
            file_location = os.path.join(self.s3_bucket, self.s3_key)
            self.log.info("File saved correctly in %s", file_location)
Example #10
    def test_put_bucket_tagging_when_tags_exist_overwrites(self):
        hook = S3Hook()
        hook.create_bucket(bucket_name='new_bucket')
        initial_tag_set = [{'Key': 'Color', 'Value': 'Green'}]
        hook.put_bucket_tagging(bucket_name='new_bucket',
                                tag_set=initial_tag_set)
        assert len(hook.get_bucket_tagging(bucket_name='new_bucket')) == 1
        assert hook.get_bucket_tagging(
            bucket_name='new_bucket') == initial_tag_set

        new_tag_set = [{'Key': 'Fruit', 'Value': 'Apple'}]
        hook.put_bucket_tagging(bucket_name='new_bucket', tag_set=new_tag_set)

        result = hook.get_bucket_tagging(bucket_name='new_bucket')
        assert len(result) == 1
        assert result == new_tag_set
Example #11
    def execute(self, context):

        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)

        self.log.info('Checking files in S3 and in local folder ...')
        s_local_filespath = f"{self.files_path}/{context['ds_nodash']}*.csv"
        l_local_files = glob.glob(s_local_filespath)
        l_files_in_s3 = s3_hook.list_keys(self.dest_bucket_name)

        if l_local_files and not all(elem in l_files_in_s3 for elem in l_local_files):
            raise ValueError(f"Not all elements in the path "
                             f"{s_local_filespath} were found in S3"
                             f" {self.dest_bucket_name} bucket")
        self.log.info(f"All {len(l_local_files)} files in local folder are in"
                      f" {self.dest_bucket_name} bucket")
Example #12
    def execute(self, context):
        self.s3_key = self.get_s3_key(self.s3_key)
        ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
        s3_hook = S3Hook(self.s3_conn_id)

        sftp_client = ssh_hook.get_conn().open_sftp()

        with NamedTemporaryFile("w") as f:
            sftp_client.get(self.sftp_path, f.name)

            s3_hook.load_file(
                filename=f.name,
                key=self.s3_key,
                bucket_name=self.s3_bucket,
                replace=True
            )
Example #13
    def hook(self):
        """Returns S3Hook."""
        remote_conn_id = conf.get('logging', 'REMOTE_LOG_CONN_ID')
        try:
            from airflow.providers.amazon.aws.hooks.s3 import S3Hook

            return S3Hook(remote_conn_id, transfer_config_args={"use_threads": False})
        except Exception as e:  # pylint: disable=broad-except
            self.log.exception(
                'Could not create an S3Hook with connection id "%s". '
                'Please make sure that airflow[aws] is installed and '
                'the S3 connection exists. Exception : "%s"',
                remote_conn_id,
                e,
            )
            return None
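The connection id above is read from remote_log_conn_id in the [logging] section of the Airflow configuration. As a minimal sketch, the same value can be supplied through Airflow's standard environment-variable mapping; the connection name used here is a placeholder:

import os

# Equivalent to setting remote_log_conn_id under [logging] in airflow.cfg;
# 'my_s3_logging_conn' is an assumed connection id that must exist in Airflow.
os.environ["AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID"] = "my_s3_logging_conn"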
Example #14
    def test_generate_presigned_url(self, s3_bucket):
        hook = S3Hook()
        presigned_url = hook.generate_presigned_url(client_method="get_object",
                                                    params={
                                                        'Bucket': s3_bucket,
                                                        'Key': "my_key"
                                                    })

        url = presigned_url.split("?")[1]
        params = {
            x[0]: x[1]
            for x in [x.split("=") for x in url[0:].split("&")]
        }

        assert {"AWSAccessKeyId", "Signature",
                "Expires"}.issubset(set(params.keys()))
Example #15
    def execute(self, context) -> None:
        postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        credentials = s3_hook.get_credentials()
        credentials_block = build_credentials_block(credentials)
        unload_options = '\n\t\t\t'.join(self.unload_options)
        s3_key = f"{self.s3_key}/{self.table}_" if self.table_as_file_name else self.s3_key
        select_query = f"SELECT * FROM {self.schema}.{self.table}"

        unload_query = self._build_unload_query(credentials_block,
                                                select_query, s3_key,
                                                unload_options)

        self.log.info('Executing UNLOAD command...')
        postgres_hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #16
    def execute(self, context: 'Context'):
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        if isinstance(self.data, str):
            s3_hook.load_string(
                self.data,
                self.s3_key,
                self.s3_bucket,
                self.replace,
                self.encrypt,
                self.encoding,
                self.acl_policy,
                self.compression,
            )
        else:
            s3_hook.load_bytes(self.data, self.s3_key, self.s3_bucket,
                               self.replace, self.encrypt, self.acl_policy)
Example #17
def upload_data(aws_conn, local_file, file_key, bucket):
    '''

    :param aws_conn: the AWS connection ID
    :param local_file: location of the file to be uploaded in your local computer
    :param file_key: destination within the desired bucket where the file will be uploaded
    :param bucket: name of the bucket where the file will be uploaded
    :return:
    '''
    try:
        s3 = S3Hook(aws_conn)
        logging.info(f"{log_start} uploading data onto S3 bucket")
        s3.load_file(filename=local_file, key=file_key, bucket_name=bucket, replace=True)
        logging.info(f"{log_finish} uploading data onto S3 bucket")
    except Exception as e:
        logging.info(e)
        print("Unable to upload data onto S3. Please review logs")
Example #18
    def execute(self, context):
        hook = S3Hook(self.aws_credentials_id)

        pathlist = list(Path(self.dataset_dir).glob(self.file_glob))
        for path in pathlist:
            bucket_key = str(path)[len(self.dataset_dir) + 1:]
            if hook.check_for_key(key=bucket_key,
                                  bucket_name=self.bucket_name):
                self.log.info(
                    f"File '{bucket_key}' is already present as s3://{self.bucket_name}/{bucket_key}. Skip upload."
                )
            else:
                self.log.info(
                    f"Upload file '{bucket_key}' to s3://{self.bucket_name}/{bucket_key}. This might take a while."
                )
                hook.load_file(filename=str(path),
                               key=bucket_key,
                               bucket_name=self.bucket_name)
Example #19
def download_dataset(year_month: str):
    url = (
        f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year_month}.csv'
    )
    response = requests.get(url, stream=True)
    response.raise_for_status()

    s3 = S3Hook('aws_connection_id')

    s3_path = f's3://nyc-yellow-taxi-raw-data/yellow_tripdata_{year_month}.csv.gz'
    bucket, key = s3.parse_s3_url(s3_path)

    with NamedTemporaryFile('w', encoding='utf-8', delete=False) as f:
        for chunk in response.iter_lines():
            f.write('{}\n'.format(chunk.decode('utf-8')))
    s3.load_file(f.name, key, bucket, replace=True, gzip=True)

    return s3_path
Example #20
    def execute(self, context):
        s3_hook = S3Hook(self.aws_conn_id)
        ftp_hook = FTPHook(ftp_conn_id=self.ftp_conn_id)

        with NamedTemporaryFile() as local_tmp_file:
            ftp_hook.retrieve_file(
                remote_full_path=self.ftp_path,
                local_full_path_or_buffer=local_tmp_file.name)

            s3_hook.load_file(
                filename=local_tmp_file.name,
                key=self.s3_key,
                bucket_name=self.s3_bucket,
                replace=self.replace,
                encrypt=self.encrypt,
                gzip=self.gzip,
                acl_policy=self.acl_policy,
            )
Example #21
    def execute(self, context: 'Context') -> None:
        sql_hook = self._get_hook()
        s3_conn = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        data_df = sql_hook.get_pandas_df(sql=self.query, parameters=self.parameters)
        self.log.info("Data from SQL obtained")

        self._fix_dtypes(data_df)
        file_options = FILE_OPTIONS_MAP[self.file_format]

        with NamedTemporaryFile(mode=file_options.mode, suffix=file_options.suffix) as tmp_file:

            self.log.info("Writing data to temp file")
            getattr(data_df, file_options.function)(tmp_file.name, **self.pd_kwargs)

            self.log.info("Uploading data to S3")
            s3_conn.load_file(
                filename=tmp_file.name, key=self.s3_key, bucket_name=self.s3_bucket, replace=self.replace
            )
Example #22
    def get_list_from_s3(*args, **context):
        """
        Get list of data files from S3 bucket
        """
        updated_prefix = prefix.format(frequency="monthly",
                                       symbol=symbol,
                                       granularity=granularity)

        filepaths = []
        bucket_obj = S3Hook("aws_default").get_bucket(bucket)

        for obj in bucket_obj.objects.filter(Prefix=updated_prefix):
            path, filename = os.path.split(obj.key)
            if filename.endswith("zip"):
                filepaths.append(obj.key)

        context["ti"].xcom_push(key="file_list", value=filepaths)
        logging.info("Found {:d} files".format(len(filepaths)))
Example #23
def upload_to_s3(endpoint, date):

    # Instantiate the S3 hook
    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)
    print("Created Connection")
    print(s3_hook.get_session())
    print(s3_hook)

    # Base URL
    url = 'https://covidtracking.com/api/v1/states/'

    res = requests.get(url + '{0}/{1}.csv'.format(endpoint, date))

    # Take string, upload to S3 using predefined method
    s3_hook.load_string(res.text,
                        '{0}_{1}.csv'.format(endpoint, date),
                        bucket_name=BUCKET,
                        replace=True)
Example #24
    def execute(self, context: 'Context') -> None:
        sql_hook = self._get_hook()
        s3_conn = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        data_df = sql_hook.get_pandas_df(sql=self.query, parameters=self.parameters)
        self.log.info("Data from SQL obtained")

        self._fix_int_dtypes(data_df)
        file_options = FILE_OPTIONS_MAP[self.file_format]

        with NamedTemporaryFile(mode=file_options.mode, suffix=file_options.suffix) as tmp_file:

            if self.file_format == FILE_FORMAT.CSV:
                data_df.to_csv(tmp_file.name, **self.pd_kwargs)
            else:
                data_df.to_parquet(tmp_file.name, **self.pd_kwargs)

            s3_conn.load_file(
                filename=tmp_file.name, key=self.s3_key, bucket_name=self.s3_bucket, replace=self.replace
            )
Example #25
    def execute_json(self, context, f_get_data):
        hook = S3Hook(self.source_conn.conn_id)
        suffix = self.suffix

        if self.key_name:
            data = f_get_data(
                hook=hook,
                key=self.key_name,
                bucket=self.bucket_name,
            )
            self.upload_data(data=data)
        else:
            data = []
            objects = self._iterate_through_bucket(
                s3hook=hook,
                bucket=self.bucket_name,
                prefix=self.prefix,
            )
            for obj_iter in objects:
                obj = hook.get_key(obj_iter["Key"], self.bucket_name)
                if self.load_data_from and obj.last_modified < self.load_data_from:
                    continue
                if self.load_data_until and obj.last_modified >= self.load_data_until:
                    continue
                if suffix and not suffix == obj.key[-len(suffix):]:
                    continue

                self.log.info("Loading data from file {0}".format(obj.key, ))
                self._metadata.update({
                    "bucket_name":
                    self.bucket_name,
                    "file_name":
                    obj.key,
                    "file_last_modified":
                    str(obj.last_modified),
                })
                data = f_get_data(
                    hook=hook,
                    key=obj.key,
                    bucket=self.bucket_name,
                )
                self.upload_data(data=data)
Example #26
    def _upload_ratings(s3_conn_id, s3_bucket, **context):
        year = context["execution_date"].year
        month = context["execution_date"].month

        # Fetch ratings from our 'API'.
        ratings = fetch_ratings(year=year, month=month)

        # Write ratings to temp file.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = path.join(tmp_dir, "ratings.csv")
            ratings.to_csv(tmp_path, index=False)

            # Upload file to S3.
            hook = S3Hook(s3_conn_id)
            hook.load_file(
                tmp_path,
                key=f"ratings/{year}/{month}.csv",
                bucket_name=s3_bucket,
                replace=True,
            )
Example #27
def pivot_data(**kwargs):
    #Make connection to Snowflake
    hook = SnowflakeHook(snowflake_conn_id='snowflake')
    conn = hook.get_conn()

    #Define SQL query
    query = 'SELECT DATE, STATE, POSITIVE FROM STATE_DATA;'

    #Read data into pandas dataframe
    df = pd.read_sql(query, conn)

    #Pivot dataframe into new format
    pivot_df = df.pivot(index='DATE', columns='STATE', values='POSITIVE').reset_index()

    #Save dataframe to S3
    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)
    s3_hook.load_string(pivot_df.to_csv(index=False), 
                        '{0}.csv'.format(filename), 
                        bucket_name=BUCKET, 
                        replace=True)
Example #28
def process_data(state, date):
    '''Reads data from S3, processes, and saves to new S3 file
    '''
    # Connect to S3
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)

    # Read data
    data = StringIO(
        s3_hook.read_key(key='{0}_{1}.csv'.format(state, date),
                         bucket_name=bucket))
    df = pd.read_csv(data, sep=',')

    # Process data
    processed_data = df[['date', 'state', 'positive', 'negative']]

    # Save processed data to CSV on S3
    s3_hook.load_string(processed_data.to_csv(index=False),
                        '{0}_{1}_processed.csv'.format(state, date),
                        bucket_name=bucket,
                        replace=True)
Example #29
    def test_get_wildcard_key(self, s3_bucket):
        hook = S3Hook()
        bucket = hook.get_bucket(s3_bucket)
        bucket.put_object(Key='abc', Body=b'a')
        bucket.put_object(Key='a/b', Body=b'a')

        # The boto3 Class API is _odd_, and we can't do an isinstance check as
        # each instance is a different class, so let's just check one property
        # on S3.Object. Not great but...
        assert hook.get_wildcard_key('a*', s3_bucket).key == 'a/b'
        assert hook.get_wildcard_key('a*', s3_bucket, delimiter='/').key == 'abc'
        assert hook.get_wildcard_key('abc', s3_bucket, delimiter='/').key == 'abc'
        assert hook.get_wildcard_key('s3://{}/a*'.format(s3_bucket)).key == 'a/b'
        assert hook.get_wildcard_key('s3://{}/a*'.format(s3_bucket), delimiter='/').key == 'abc'
        assert hook.get_wildcard_key('s3://{}/abc'.format(s3_bucket), delimiter='/').key == 'abc'

        assert hook.get_wildcard_key('a', s3_bucket) is None
        assert hook.get_wildcard_key('b', s3_bucket) is None
        assert hook.get_wildcard_key('s3://{}/a'.format(s3_bucket)) is None
        assert hook.get_wildcard_key('s3://{}/b'.format(s3_bucket)) is None
Example #30
    def get_files(self, s3_hook: S3Hook, delimiter: Optional[str] = '/') -> List:
        """Gets a list of files in the bucket"""
        prefix = self.bucket_key
        config = {
            'PageSize': None,
            'MaxItems': None,
        }
        if self.wildcard_match:
            prefix = re.split(r'[\[\*\?]', self.bucket_key, 1)[0]

        paginator = s3_hook.get_conn().get_paginator('list_objects_v2')
        response = paginator.paginate(
            Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter, PaginationConfig=config
        )
        keys: List = []
        for page in response:
            if 'Contents' in page:
                _temp = [k for k in page['Contents'] if isinstance(k.get('Size', None), (int, float))]
                keys = keys + _temp
        return keys
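The re.split call above derives the longest literal prefix before the first wildcard metacharacter, so only a narrow key range is listed. A standalone sketch of just that step, using a made-up key:

import re

# Splitting on the first wildcard character ([, * or ?) keeps the literal part
# of the key, which is safe to pass as the Prefix to list_objects_v2.
bucket_key = 'logs/2021-*/part'                   # hypothetical wildcard key
prefix = re.split(r'[\[\*\?]', bucket_key, 1)[0]
print(prefix)                                     # -> 'logs/2021-'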