def test_get_resource_type_returns_a_boto3_resource_of_the_requested_type(self):
    hook = AwsHook(aws_conn_id='aws_default')
    resource_from_hook = hook.get_resource_type('dynamodb')

    # this table needs to be created in production
    table = resource_from_hook.create_table(
        TableName='test_airflow',
        KeySchema=[
            {
                'AttributeName': 'id',
                'KeyType': 'HASH'
            },
        ],
        AttributeDefinitions=[
            {
                # every key-schema attribute must be declared here,
                # so this defines 'id' rather than an unused attribute
                'AttributeName': 'id',
                'AttributeType': 'S'
            }
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 10,
            'WriteCapacityUnits': 10
        }
    )

    table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow')

    self.assertEqual(table.item_count, 0)

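# The test above talks to whatever endpoint the 'aws_default' connection
# resolves to. To keep it hermetic, it can be wrapped in moto's mock
# decorator -- a minimal sketch, assuming moto>=5 is installed (older moto
# releases exposed a mock_dynamodb2 decorator instead); the class and test
# names here are hypothetical:
import unittest

from moto import mock_aws


class TestAwsHookMocked(unittest.TestCase):
    @mock_aws
    def test_get_resource_type_with_mocked_dynamodb(self):
        # Same body as the test above: every boto3 call is intercepted
        # by moto, so no real AWS resources are created.
        ...
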
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data_left = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in_left)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_left} into dataframe.")

    data_right = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in_right)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_right} into dataframe.")

    joined_data = DataHelper.get_joined_data_from_dfs(
        data_left, data_right, self.left_on_column, self.right_on_column,
        self.suffix_name, self.output_columns)

    DataHelper.write_df_to_tsv_in_s3(
        s3_resource, joined_data, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote tsv file with joined columns {self.output_columns} dropped "
        f"to s3://{self.s3_bucket}/{self.s3_key_out}.")

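# All of the operators in this section lean on the same DataHelper contract:
# read a TSV object from S3 into a pandas DataFrame via a boto3 client, and
# write a DataFrame back via a boto3 resource. A minimal sketch of what such
# helpers could look like, assuming pandas and boto3 -- the real DataHelper
# implementation is not shown here:
import io

import pandas as pd


class DataHelper:
    @staticmethod
    def read_tsv_from_s3_to_df(s3_client, bucket, key):
        # GetObject returns a StreamingBody, which pandas can parse directly.
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        return pd.read_csv(obj['Body'], sep='\t')

    @staticmethod
    def write_df_to_tsv_in_s3(s3_resource, df, bucket, key):
        # Serialise to an in-memory buffer, then upload via the resource API.
        buf = io.StringIO()
        df.to_csv(buf, sep='\t', index=False)
        s3_resource.Object(bucket, key).put(Body=buf.getvalue().encode('utf-8'))
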
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe.")

    unstacked_data = DataHelper.unstack_df_column(
        data, self.id_column, self.unstack_column)

    DataHelper.write_df_to_tsv_in_s3(
        s3_resource, unstacked_data, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote tsv file with unstacked {self.unstack_column} "
        f"to s3://{self.s3_bucket}/{self.s3_key_out}.")

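# unstack_df_column is not defined in this section. One plausible reading,
# given the id/unstack column pair, is a long-to-wide pivot: one row per id,
# one indicator column per distinct value. A hypothetical pandas sketch, not
# the verified implementation:
import pandas as pd


def unstack_df_column(df, id_column, unstack_column):
    # crosstab yields a count matrix: rows keyed by id_column,
    # columns keyed by the distinct values of unstack_column.
    wide = pd.crosstab(df[id_column], df[unstack_column])
    return wide.reset_index()
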
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    engagement_dfs = []
    for key in DataHelper.generate_all_keys_from_s3_with_prefix(
            s3_client, self.s3_bucket, f"{self.engagement_type}/"):
        df = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket, key)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{key} into dataframe.")
        engagement_dfs.append(df)

    all_engagement_df = DataHelper.combine_engagement_dfs(
        engagement_dfs, ['user_id', 'engaged_with_id'], lambda x: 1)

    DataHelper.write_df_to_tsv_in_s3(
        s3_resource, all_engagement_df, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote combined engagement tsv file "
        f"to s3://{self.s3_bucket}/{self.s3_key_out}.")

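# generate_all_keys_from_s3_with_prefix presumably enumerates every object
# key under a prefix. A minimal sketch using boto3's list_objects_v2
# paginator, which transparently handles prefixes holding more than 1000
# keys per page:
def generate_all_keys_from_s3_with_prefix(s3_client, bucket, prefix):
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            yield obj['Key']
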
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    self.log.info(
        f"Parsing {self.activity} events from s3://{self.s3_bucket}/{self.s3_key_in}.")
    with DataHelper.buffer_s3_object_as_file(
            s3_client, self.s3_bucket, self.s3_key_in) as json_file:
        data = DataHelper.parse_activity_json_to_df(json_file, self.activity)

    DataHelper.write_df_to_tsv_in_s3(
        s3_resource, data, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote {self.activity} events to s3://{self.s3_bucket}/{self.s3_key_out}.")

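# buffer_s3_object_as_file suggests the object is staged into a local
# file-like buffer before parsing. A hypothetical sketch using a context
# manager around boto3's download_fileobj:
import contextlib
import io


@contextlib.contextmanager
def buffer_s3_object_as_file(s3_client, bucket, key):
    buf = io.BytesIO()
    s3_client.download_fileobj(bucket, key, buf)
    buf.seek(0)  # rewind so the caller reads from the start
    yield buf
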
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe.")

    n_strongest = RecommendationHelper.get_top_n_closest(data, self.n_strongest)

    DataHelper.write_df_to_tsv_in_s3(
        s3_resource, n_strongest, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote strongest connections tsv file "
        f"to s3://{self.s3_bucket}/{self.s3_key_out}.")

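# get_top_n_closest belongs to the unshown RecommendationHelper. A common
# pandas shape for "top n strongest connections per user" -- offered purely
# as an illustrative guess at the semantics, with hypothetical 'user_id'
# and 'strength' column names:
def get_top_n_closest(df, n):
    # sort once by strength, then keep the first n rows of each user group
    return (df.sort_values('strength', ascending=False)
              .groupby('user_id')
              .head(n))
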
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe.")

    recs = RecommendationHelper.get_top_n_recommendations(data_sec_deg, self.n_recs)

    DataHelper.write_df_to_tsv_in_s3(
        s3_resource, recs, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote recommendations tsv file to s3://{self.s3_bucket}/{self.s3_key_out}.")

def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data = DataHelper.read_csv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in)
    self.log.info(
        f"Read csv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe.")

    data_with_dummies = DataHelper.get_dummy_colums(
        data, self.indicator_column, sep=self.sep)
    self.log.info(f"Created dummy fields for column {self.indicator_column}.")

    # write to s3_key_out, matching the log line below
    DataHelper.write_df_to_csv_in_s3(
        s3_resource, data_with_dummies, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote updated data back to s3://{self.s3_bucket}/{self.s3_key_out}.")

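# get_dummy_colums (sic -- the name is kept exactly as referenced above)
# takes an indicator column and a separator, which matches pandas' built-in
# Series.str.get_dummies for multi-valued delimited fields. A plausible
# sketch, not the verified implementation:
import pandas as pd


def get_dummy_colums(df, indicator_column, sep='|'):
    # a cell like 'a|b' becomes columns a=1, b=1; all other rows get 0
    dummies = df[indicator_column].str.get_dummies(sep=sep)
    return pd.concat([df.drop(columns=[indicator_column]), dummies], axis=1)
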
def _clear_destination_bucket(*, aws_conn_id: str, bucket_name: str,
                              bucket_data_prefix: str, table_name: str,
                              ds_nodash: str, task: BaseOperator, **_) -> str:
    """Python callable for the `ClearDestinationBucketOperator`."""
    log = task.log
    aws = AwsHook(aws_conn_id)
    s3 = aws.get_resource_type('s3')  # pylint: disable=invalid-name
    prefix = f'{bucket_data_prefix}{table_name}/date={ds_nodash}/'

    log.info("erasing any existing data in s3://%s/%s", bucket_name, prefix)
    # delete() batches keys, so the response is a list of per-batch results
    resp = s3.Bucket(bucket_name).objects.filter(Prefix=prefix).delete()
    log.info("got response: %s", resp)

    errors = [item for single_resp in resp
              for item in single_resp.get('Errors', [])]
    if errors:
        raise AirflowException(
            f"Unable to fully erase existing data in s3://{bucket_name}/{prefix}")

    deleted = [item for single_resp in resp
               for item in single_resp.get('Deleted', [])]
    return f"erased {len(deleted)} files"

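# A sketch of how such a callable could be wired into a DAG with Airflow
# 1.10's PythonOperator; the task id, op_kwargs values, and `dag` object
# are hypothetical. provide_context=True is what injects `task` and
# `ds_nodash` into the callable's keyword arguments:
clear_destination = PythonOperator(
    task_id='clear_destination_bucket',
    python_callable=_clear_destination_bucket,
    op_kwargs={
        'aws_conn_id': 'aws_default',
        'bucket_name': 'example-destination-bucket',
        'bucket_data_prefix': 'data/',
        'table_name': 'example_table',
    },
    provide_context=True,
    dag=dag,
)
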
def _load_extracted_mappings(
        *, aws_conn_id: str, bucket_name: str, bucket_data_prefix: str,
        partner: str, ds_nodash: str, log: Logger
) -> Sequence[Mapping[str, int]]:
    """Load extracted mappings from S3."""
    full_prefix = f'{bucket_data_prefix}{partner}/date={ds_nodash}/'
    log.info("loading extracted mappings from s3://%s/%s",
             bucket_name, full_prefix)

    aws = AwsHook(aws_conn_id)
    bucket = aws.get_resource_type('s3').Bucket(bucket_name)
    # each object holds JSON lines; flatten every line of every object
    extracted_mappings = [
        json.loads(line)
        for file_obj in bucket.objects.filter(Prefix=full_prefix)
        for line in file_obj.get()['Body'].iter_lines()
    ]
    log.info("loaded mappings: %s", extracted_mappings)
    return extracted_mappings

def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe.")

    data_existing_conn = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_existing_conn)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_existing_conn} into dataframe.")

    sec_deg_conn_valid = RecommendationHelper.remove_invalid_recommendations(
        data_sec_deg, data_existing_conn, self.conn_type)

    DataHelper.write_df_to_tsv_in_s3(
        s3_resource, sec_deg_conn_valid, self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote valid second degree connections tsv file "
        f"to s3://{self.s3_bucket}/{self.s3_key_out}.")

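# remove_invalid_recommendations presumably drops candidate pairs that are
# already connected. A common pandas anti-join sketch, with hypothetical
# join-key column names; the real column names and the role of conn_type
# are not shown in this section:
def remove_invalid_recommendations(candidates, existing, on=('user_id', 'rec_id')):
    merged = candidates.merge(existing[list(on)], on=list(on),
                              how='left', indicator=True)
    # keep only pairs that do NOT appear in the existing-connections frame
    return merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])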