def get_iam_token(self, conn):
    """
    Uses AwsHook to retrieve a temporary password to connect to Postgres
    or Redshift. Port is required. If none is provided, the default is
    used for each service.
    """
    from airflow.contrib.hooks.aws_hook import AwsHook

    redshift = conn.extra_dejson.get('redshift', False)
    aws_conn_id = conn.extra_dejson.get('aws_conn_id', 'aws_default')
    aws_hook = AwsHook(aws_conn_id)
    login = conn.login
    if conn.port is None:
        port = 5439 if redshift else 5432
    else:
        port = conn.port
    if redshift:
        # Pull the cluster-identifier from the beginning of the Redshift URL
        # ex. my-cluster.ccdre4hpd39h.us-east-1.redshift.amazonaws.com returns my-cluster
        cluster_identifier = conn.extra_dejson.get('cluster-identifier',
                                                   conn.host.split('.')[0])
        client = aws_hook.get_client_type('redshift')
        cluster_creds = client.get_cluster_credentials(
            DbUser=conn.login,
            DbName=self.schema or conn.schema,
            ClusterIdentifier=cluster_identifier,
            AutoCreate=False)
        token = cluster_creds['DbPassword']
        login = cluster_creds['DbUser']
    else:
        client = aws_hook.get_client_type('rds')
        token = client.generate_db_auth_token(conn.host, port, conn.login)
    return login, token, port
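# A minimal usage sketch (not from the source): assuming a hook that exposes
# get_iam_token and a psycopg2 client, the temporary IAM credentials could be
# used roughly like this. The connect_with_iam helper and its arguments are
# hypothetical, not part of the original code.
import psycopg2

def connect_with_iam(hook, conn):
    # conn is an airflow.models.Connection with IAM auth enabled in its extras
    login, token, port = hook.get_iam_token(conn)
    return psycopg2.connect(
        host=conn.host,
        user=login,
        password=token,     # temporary IAM-issued password
        port=port,
        dbname=hook.schema or conn.schema,
        sslmode='require',  # IAM auth requires SSL
    )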
def _wait_for_task_ended(self):
    hook = AwsHook()
    ecs_client = hook.get_client_type('ecs', region_name='eu-west-1')
    task = ecs_client.describe_tasks(cluster='cluster', tasks=[self.arn])['tasks'][0]
    logging.warning(f"TASK: {task}")
    container_name = task['containers'][0]['name']
    log_client = hook.get_client_type('logs', region_name='eu-west-1')
    log_stream_name = f"{self.log_prefix}/{container_name}/{self.arn.split('/')[1]}"

    # hackish waiting pattern - use exception for control flow:
    # - let waiter poll for task stop
    # - return on success, repeat when wait fails
    #
    # while looping:
    # - log events on each try and after waiter succeeds
    #
    # this should give us all events that have been logged by the task
    token = None
    while True:
        token = self._log_events(log_client, log_stream_name, token)
        waiter = self.client.get_waiter('tasks_stopped')
        waiter.config.max_attempts = 2
        try:
            waiter.wait(
                cluster=self.cluster,
                tasks=[self.arn]
            )
            self._log_events(log_client, log_stream_name, token)
            return
        except WaiterError:  # requires `from botocore.exceptions import WaiterError`
            continue
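# A possible shape for the _log_events helper referenced above (a sketch, not the
# original implementation): page through the CloudWatch Logs stream from the last
# seen token and return the new token. The self.log_group attribute is assumed.
def _log_events(self, log_client, log_stream_name, token):
    kwargs = dict(
        logGroupName=self.log_group,   # assumed attribute
        logStreamName=log_stream_name,
        startFromHead=True,
    )
    if token is not None:
        kwargs['nextToken'] = token
    response = log_client.get_log_events(**kwargs)
    for event in response['events']:
        logging.info(event['message'])
    # returning the forward token lets the caller resume where it left off
    return response['nextForwardToken']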
def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
    self._create_clusters()
    hook = AwsHook(aws_conn_id='aws_default')
    client_from_hook = hook.get_client_type('redshift')
    clusters = client_from_hook.describe_clusters()['Clusters']
    self.assertEqual(len(clusters), 2)
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data_left = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket,
                                                  self.s3_key_in_left)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_left} into dataframe."
    )
    data_right = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket,
                                                   self.s3_key_in_right)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_right} into dataframe."
    )

    joined_data = DataHelper.get_joined_data_from_dfs(
        data_left, data_right, self.left_on_column, self.right_on_column,
        self.suffix_name, self.output_columns)

    DataHelper.write_df_to_tsv_in_s3(s3_resource, joined_data, self.s3_bucket,
                                     self.s3_key_out)
    self.log.info(
        f"Wrote tsv file with joined columns {self.output_columns} dropped to "
        f"s3://{self.s3_bucket}/{self.s3_key_out}."
    )
def execute(self, context): logging.info("Executing ExtendedEmrCreateJobFlowOperator") aws = AwsHook(aws_conn_id=self.aws_conn_id) emr = aws.get_client_type('emr') response = emr.run_job_flow( Name=self.api_params.get('Name'), LogUri=self.api_params.get('LogUri'), ReleaseLabel=self.api_params.get('ReleaseLabel'), Instances=self.api_params.get('Instances'), Steps=self.api_params.get('Steps', []), BootstrapActions=self.api_params.get('BootstrapActions', []), Applications=self.api_params.get('Applications'), Configurations=self.api_params.get('Configurations', []), VisibleToAllUsers=self.api_params.get('VisibleToAllUsers'), JobFlowRole=self.api_params.get('JobFlowRole'), ServiceRole=self.api_params.get('ServiceRole'), Tags=self.api_params.get('Tags'), ) if not response['ResponseMetadata']['HTTPStatusCode'] == 200: raise AirflowException('JobFlow creation failed: %s' % response) else: logging.info('JobFlow with id %s created', response['JobFlowId']) job_flow_id = response['JobFlowId'] if self.wait_for_status is not None: status = emr.describe_cluster( ClusterId=job_flow_id)['Cluster']['Status']['State'] while status != self.wait_for_status and status not in self.TERMINATE_STATES: logging.info("Waiting for status %s. Current status is %s", self.wait_for_status, status) time.sleep(30) status = emr.describe_cluster( ClusterId=job_flow_id)['Cluster']['Status']['State'] if status in self.TERMINATE_STATES: raise AirflowException( 'Cluster was terminated [%s] before it got to status %s' % (status, self.wait_for_status)) if self.save_livy_connection_name is not None: instances_response = emr.list_instances( ClusterId=job_flow_id, InstanceGroupTypes=['MASTER']) if self.use_public_ip_for_connections: master_ip = instances_response['Instances'][0][ 'PublicIpAddress'] else: master_ip = instances_response['Instances'][0][ 'PrivateIpAddress'] ExtendedEmrCreateJobFlowOperator.create_or_replace_connection( connection_id=self.save_livy_connection_name, connection_type='Livy', ip="http://" + master_ip, port=8998, login='', password='', schema='', extra='') return job_flow_id
def read_from_aws_sm_fn(**kwargs):
    ### set up Secrets Manager
    hook = AwsHook()
    client = hook.get_client_type('secretsmanager')
    # sm_secret_name is expected to be defined elsewhere in the module
    response = client.get_secret_value(SecretId=sm_secret_name)
    secret_string = response["SecretString"]
    return secret_string
def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
    client = boto3.client("emr", region_name="us-east-1")
    if len(client.list_clusters()["Clusters"]):
        raise ValueError("AWS not properly mocked")

    hook = AwsHook(aws_conn_id="aws_default")
    client_from_hook = hook.get_client_type("emr")

    self.assertEqual(client_from_hook.list_clusters()["Clusters"], [])
def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
    client = boto3.client('emr', region_name='us-east-1')
    if client.list_clusters()['Clusters']:
        raise ValueError('AWS not properly mocked')

    hook = AwsHook(aws_conn_id='aws_default')
    client_from_hook = hook.get_client_type('emr')

    self.assertEqual(client_from_hook.list_clusters()['Clusters'], [])
def test(**kwargs):
    ### set up DMS
    hook = AwsHook()
    client = hook.get_client_type('dms')
    client.start_replication_task(
        ReplicationTaskArn={HERE},
        StartReplicationTaskType={HERE}
    )['ReplicationTask']
    return client
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket,
                                             self.s3_key_in)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
    )

    unstacked_data = DataHelper.unstack_df_column(data, self.id_column,
                                                  self.unstack_column)

    DataHelper.write_df_to_tsv_in_s3(s3_resource, unstacked_data,
                                     self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote tsv file with unstacked {self.unstack_column} to "
        f"s3://{self.s3_bucket}/{self.s3_key_out}."
    )
def copy_results_s3(self):
    results = self.query_mysql()
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3 = aws.get_client_type('s3')

    # build a CSV payload in memory from the query results
    concat = StringIO()
    for row in results:
        concat.write(",".join(map(str, row)) + '\n')

    s3_location = self.prefix + '/' + self.s3_filename
    s3.put_object(Body=concat.getvalue(), Bucket=self.bucket_name,
                  Key=s3_location)
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    engagement_dfs = []
    for key in DataHelper.generate_all_keys_from_s3_with_prefix(
            s3_client, self.s3_bucket, f"{self.engagement_type}/"):
        df = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket, key)
        self.log.info(f"Read tsv file s3://{self.s3_bucket}/{key} into dataframe.")
        engagement_dfs.append(df)

    all_engagement_df = DataHelper.combine_engagement_dfs(
        engagement_dfs, ['user_id', 'engaged_with_id'], lambda x: 1)

    DataHelper.write_df_to_tsv_in_s3(s3_resource, all_engagement_df,
                                     self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote combined engagement tsv file to "
        f"s3://{self.s3_bucket}/{self.engagement_type}/{self.s3_key_out}."
    )
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

    self.log.info(
        f"Loading file s3://{self.s3_bucket}/{self.s3_key} into table {self.table}."
    )
    with DataHelper.buffer_s3_object_as_file(s3_client, self.s3_bucket,
                                             self.s3_key) as f:
        postgres.bulk_load(self.table, f)
    self.log.info(
        f"s3://{self.s3_bucket}/{self.s3_key} loaded into table {self.table} successfully."
    )
def get_iam_token(self, conn):
    """
    Uses AwsHook to retrieve a temporary password to connect to MySQL.
    Port is required. If none is provided, default 3306 is used.
    """
    from airflow.contrib.hooks.aws_hook import AwsHook

    aws_conn_id = conn.extra_dejson.get('aws_conn_id', 'aws_default')
    aws_hook = AwsHook(aws_conn_id)
    if conn.port is None:
        port = 3306
    else:
        port = conn.port
    client = aws_hook.get_client_type('rds')
    token = client.generate_db_auth_token(conn.host, port, conn.login)
    return token, port
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    self.log.info(
        f"Parsing {self.activity} events from s3://{self.s3_bucket}/{self.s3_key_in}."
    )
    with DataHelper.buffer_s3_object_as_file(s3_client, self.s3_bucket,
                                             self.s3_key_in) as json_file:
        data = DataHelper.parse_activity_json_to_df(json_file, self.activity)

    DataHelper.write_df_to_tsv_in_s3(s3_resource, data, self.s3_bucket,
                                     self.s3_key_out)
    self.log.info(
        f"Wrote {self.activity} events to s3://{self.s3_bucket}/{self.s3_key_out}."
    )
def execute(self, context): logging.info("Executing EmrOperator") aws = AwsHook(aws_conn_id=self.aws_conn_id) emr = aws.get_client_type('emr') # create the cluster response = emr.run_job_flow( Name=self.cluster_name, LogUri=self.log_s3_uri, ReleaseLabel=self.release_label, Instances={ 'MasterInstanceType': self.instance_type, 'SlaveInstanceType': self.instance_type, 'InstanceCount': self.instance_count, 'KeepJobFlowAliveWhenNoSteps': False, 'Ec2KeyName': self.ec2_key_name }, Applications=self.applications, VisibleToAllUsers=True, JobFlowRole=self.job_flow_role, ServiceRole=self.service_role, Configurations=self.configurations, ) # add steps cluster_id = response['JobFlowId'] response_step = emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=self.steps) start_time = time.time() while True: if self.step_timeout_minutes: if time.time() > start_time + self.step_timeout_minutes * 60: raise AirflowException('EMR step(s) time out!') step_statuses = [ i['Status']['State'] for i in emr.list_steps(ClusterId=cluster_id)['Steps'] ] if not any( map(lambda x: x in ('PENDING', 'RUNNING'), step_statuses)): if not all(map(lambda x: x == 'COMPLETED', step_statuses)): raise AirflowException('EMR step(s) failed!') break else: logging.info("Job(s) still running/pending") time.sleep(60)
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe."
    )

    recs = RecommendationHelper.get_top_n_recommendations(data_sec_deg, self.n_recs)

    DataHelper.write_df_to_tsv_in_s3(s3_resource, recs, self.s3_bucket,
                                     self.s3_key_out)
    self.log.info(
        f"Wrote recommendations tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
    )
def execute(self, context):
    hook = AwsHook(aws_conn_id=self.aws_credentials_id)
    client = hook.get_client_type("emr", region_name=self.region)

    cluster_id = context['task_instance'].xcom_pull(task_ids=self.cluster_task_id)
    steps_str = context['task_instance'].xcom_pull(task_ids=self.step_task_id)
    # the step ids arrive via XCom as a quoted, whitespace-separated string
    step_ids = re.sub("\"|'", "", steps_str).split()

    step_info = client.list_steps(ClusterId=cluster_id, StepIds=step_ids)
    step_status = step_info['Steps'][-1]['Status']['State']
    # poll until the last step leaves PENDING/RUNNING, or the timeout is hit
    while step_status in ['PENDING', 'RUNNING']:
        step_info = client.list_steps(ClusterId=cluster_id, StepIds=step_ids)
        step_status = step_info['Steps'][-1]['Status']['State']
        if self.is_timeout():
            raise StepTimeoutError
        time.sleep(60)
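# is_timeout and StepTimeoutError are referenced above but not shown. A plausible
# sketch of both, assuming the operator tracks a start time and a timeout budget;
# the attribute names (self.start_time, self.timeout_seconds) are assumptions.
class StepTimeoutError(Exception):
    """Raised when an EMR step does not finish within the allowed time."""

def is_timeout(self):
    # True once the elapsed wall-clock time exceeds the configured budget
    return time.time() - self.start_time > self.timeout_seconds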
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket,
                                             self.s3_key_in)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
    )

    n_strongest = RecommendationHelper.get_top_n_closest(data, self.n_strongest)

    DataHelper.write_df_to_tsv_in_s3(s3_resource, n_strongest, self.s3_bucket,
                                     self.s3_key_out)
    self.log.info(
        f"Wrote strongest connections tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
    )
def fetch_definitions():
    definitions = {}
    hook = AwsHook()
    ecs = hook.get_client_type('ecs', region_name='eu-west-1')

    first = True
    next_token = None
    while first or next_token is not None:
        if first:
            first = False
            ecs_tasks = ecs.list_task_definitions()
        else:
            ecs_tasks = ecs.list_task_definitions(nextToken=next_token)
        next_token = ecs_tasks['nextToken'] if 'nextToken' in ecs_tasks else None

        for arn in ecs_tasks['taskDefinitionArns']:
            ecs_task = ecs.describe_task_definition(taskDefinition=arn)
            containers = ecs_task['taskDefinition']['containerDefinitions']
            if len(containers) != 1:
                continue
            container = containers[0]
            if 'dockerLabels' not in container:
                continue
            labels = parse_labels(container['dockerLabels'])
            if 'airflow' not in labels:
                continue
            try:
                name = labels['airflow']['dag']['name']
                definitions[name] = {
                    'airflow': labels['airflow'],
                    'arn': arn,
                    'container': container
                }
            except KeyError as e:
                logging.warning(f"Invalid configuration: {labels}", exc_info=e)

    return definitions
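# parse_labels is referenced above but not shown. One plausible sketch (an
# assumption, not the original): expand dotted dockerLabels keys such as
# {'airflow.dag.name': 'my_dag'} into a nested dict
# {'airflow': {'dag': {'name': 'my_dag'}}} so the lookup above works.
def parse_labels(docker_labels):
    parsed = {}
    for dotted_key, value in docker_labels.items():
        node = parsed
        *parents, leaf = dotted_key.split('.')
        for part in parents:
            # descend, creating intermediate dicts as needed
            node = node.setdefault(part, {})
        node[leaf] = value
    return parsed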
def execute(self, context): logging.info("Executing S3DeletePrefixOperator") aws = AwsHook(aws_conn_id=self.aws_conn_id) s3 = aws.get_client_type('s3') objects_to_delete = s3.list_objects(Bucket=self.bucket_name, Prefix=self.prefix) delete_keys = {'Objects': []} delete_keys['Objects'] = [ { 'Key': k } for k in [obj['Key'] for obj in objects_to_delete.get('Contents', [])] ] try: response = s3.delete_objects(Bucket=self.bucket_name, Delete=delete_keys) logging.info(response) except Exception as e: # TODO import botocode client exception for missing delete logging.info('delete error {}'.format(e)) pass
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data = DataHelper.read_csv_from_s3_to_df(s3_client, self.s3_bucket,
                                             self.s3_key_in)
    self.log.info(
        f"Read csv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
    )

    data_with_dummies = DataHelper.get_dummy_colums(data, self.indicator_column,
                                                    sep=self.sep)
    self.log.info(f"Created dummy fields for column {self.indicator_column}.")

    DataHelper.write_df_to_csv_in_s3(s3_resource, data_with_dummies,
                                     self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote updated data back to s3://{self.s3_bucket}/{self.s3_key_out}."
    )
def execute(self, context): logging.info("Executing {}".format(self.__class__.__name__)) aws = AwsHook(aws_conn_id=self.aws_conn_id) s3 = aws.get_client_type('s3') s3_location = '{0}/{1}'.format(self.destination_prefix, self.s3_filename) local_file = '/tmp/temp_adwords_data.csv' s3_location = '{0}/{1}'.format(self.destination_prefix, self.s3_filename) # initiliaze adwords client client = adwords.AdWordsClient.LoadFromStorage(self.yaml_file_location) report_downloader = client.GetReportDownloader(version='v201806') # build awql report report_query = adwords.ReportQueryBuilder().\ Select(*self.fields).\ From(self.report_name).\ During(start_date=self.query_start_date, end_date=self.query_end_date) for condition in self.conditions: report_query = report_query.Where(condition['name'])\ .In(*condition['values']) report_query = report_query.Build() # Download report locally (temp) filepath = self.temp_localfile with open(filepath, 'wb') as handler: report_downloader.DownloadReportWithAwql( report_query, 'CSV', output=handler, skip_report_header=True, skip_column_header=False, skip_report_summary=True, include_zero_impressions=False) #Upload to S3 s3.upload_file(filepath, self.destination_bucket, s3_location)
def write_all_to_aws_sm_fn(**kwargs):
    ### determine secrets manager prefixes
    connections_prefix = 'airflow/connections'
    variables_prefix = 'airflow/variables'
    backend_kwargs = kwargs['conf'].get(section='secrets', key='backend_kwargs')
    if backend_kwargs:
        x = json.loads(backend_kwargs)
        connections_prefix = x['connections_prefix'].strip().rstrip('/')
        variables_prefix = x['variables_prefix'].strip().rstrip('/')
        print("using connections_prefix=", connections_prefix,
              ",variables_prefix=", variables_prefix, "...")
    else:
        print("backend_kwargs undefined--using defaults connections_prefix=",
              connections_prefix, ",variables_prefix=", variables_prefix)

    ### set up SQL and AWSSM
    session = settings.Session()
    hook = AwsHook()
    client = hook.get_client_type('secretsmanager')

    ### transfer connections
    query = session.query(Connection)
    print(query.count(), " connections: ")
    for curr_entry in query:
        curr_id = connections_prefix + '/' + curr_entry.conn_id
        curr_val = curr_entry.get_uri()
        write_to_sm_fn(name=curr_id, value=curr_val, client=client)

    ### transfer variables
    query = session.query(Variable)
    print(query.count(), " variables: ")
    for curr_entry in query:
        curr_id = variables_prefix + '/' + curr_entry.key
        curr_val = curr_entry.get_val()
        write_to_sm_fn(name=curr_id, value=curr_val, client=client)

    return "OK"
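# write_to_sm_fn is called above but not shown. A minimal sketch of what it might
# do, assuming a create-then-update strategy against Secrets Manager:
def write_to_sm_fn(name, value, client):
    print("writing", name, "...")
    try:
        # create the secret if it does not exist yet
        client.create_secret(Name=name, SecretString=value)
    except client.exceptions.ResourceExistsException:
        # otherwise store the value as a new version of the existing secret
        client.put_secret_value(SecretId=name, SecretString=value)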
def execute(self, context): logging.info("Executing CloudwatchToS3Operator") logging.info(', '.join("%s: %s" % item for item in vars(self).items())) aws = AwsHook(aws_conn_id=self.aws_conn_id) cloudwatch = aws.get_client_type('logs') response = cloudwatch.create_export_task( taskName=self.task_name, logGroupName=self.log_group_name, logStreamNamePrefix=self.log_stream_name_prefix, fromTime=int(self.from_utc_timestamp), to=int(self.to_utc_timestamp), destination=self.destination_bucket, destinationPrefix=self.destination_prefix) task_id = response['taskId'] status_code = "RUNNING" while status_code == "RUNNING": time.sleep(2) status_code = cloudwatch.describe_export_tasks( taskId=task_id)['exportTasks'][0]['status']['code'] if status_code != "COMPLETED": raise AirflowException( 'Cloudwatch export task failed -{}'.format(status_code))
def execute(self, context):
    aws = AwsHook(aws_conn_id=self.aws_conn_id)
    s3_client = aws.get_client_type('s3', region_name=self.s3_region)
    s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

    data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe."
    )

    data_existing_conn = DataHelper.read_tsv_from_s3_to_df(
        s3_client, self.s3_bucket, self.s3_key_existing_conn)
    self.log.info(
        f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_existing_conn} into dataframe."
    )

    # conn_type was undefined in the original; assuming it is an operator attribute
    sec_deg_conn_valid = RecommendationHelper.remove_invalid_recommendations(
        data_sec_deg, data_existing_conn, self.conn_type)

    DataHelper.write_df_to_tsv_in_s3(s3_resource, sec_deg_conn_valid,
                                     self.s3_bucket, self.s3_key_out)
    self.log.info(
        f"Wrote valid second degree connections tsv file to "
        f"s3://{self.s3_bucket}/{self.s3_key_out}."
    )
def execute(self, context): logging.info("Executing GoogleAnalyticsToS3Operator") logging.info(', '.join("%s: %s" % item for item in vars(self).items())) aws = AwsHook(aws_conn_id=self.aws_conn_id) s3 = aws.get_client_type('s3') s3_location = '{0}/{1}'.format(self.destination_prefix, self.s3_filename) SCOPES = ['https://www.googleapis.com/auth/analytics.readonly'] credentials = ServiceAccountCredentials.from_json_keyfile_name( self.key_file_location, SCOPES) analytics = build('analyticsreporting', 'v4', credentials=credentials) df = pd.DataFrame() PAYLOAD_SIZE = 100000 names = None pagination = 0 while True: #for page in range(MAX_RECORXDS//PAYLOAD_SIZE): payload = { 'viewId': self.view_id, 'dateRanges': [{ 'startDate': self.start, 'endDate': self.end }], 'metrics': [{ 'expression': 'ga:sessions' }, { 'expression': 'ga:pageviews' }], 'dimensions': [{ 'name': 'ga:campaign' }, { 'name': 'ga:medium' }, { 'name': 'ga:source' }, { 'name': 'ga:region' }, { 'name': 'ga:city' }, { 'name': 'ga:dateHourMinute' }, { 'name': 'ga:deviceCategory' }], "orderBys": [{ "fieldName": "ga:sessions", "sortOrder": "DESCENDING" }], "filtersExpression": 'ga:country=~Canada', "pageToken": "{}".format(pagination), 'pageSize': PAYLOAD_SIZE } report = analytics.reports().batchGet(body={ 'reportRequests': [payload] }).execute() if not names: headers = report['reports'][0]['columnHeader'] dims = headers['dimensions'] metrics = [ i['name'] for i in headers['metricHeader']['metricHeaderEntries'] ] names = [i.replace('ga:', '') for i in dims + metrics] if 'rows' in report['reports'][0]['data'].keys(): data = pd.DataFrame([ i['dimensions'] + i['metrics'][0]['values'] for i in report['reports'][0]['data']['rows'] ], columns=names) df = df.append(data) else: break df['dateHourMinute'] = pd.to_datetime( df['dateHourMinute'], format='%Y%m%d%H%M').dt.tz_localize( 'UTC', ambiguous='infer').dt.tz_convert('UTC').dt.strftime( '%Y-%m-%d %H:%M:00') local_file = '/tmp/temp_ga_data.csv' df.to_csv(local_file, index=False) s3.upload_file(local_file, self.destination_bucket, s3_location) pagination += PAYLOAD_SIZE
def _create_destination_paritions(
    *,
    aws_conn_id: str,
    database_name: str,
    table_name: str,
    by_hour: bool,
    ds_nodash: str,
    task: BaseOperator,
    **_
) -> str:
    """Python callable for the `CreateDestinationPartitionsOperator`."""
    aws = AwsHook(aws_conn_id)
    glue = aws.get_client_type('glue')
    log = task.log

    log.info("getting Glue table information for %s.%s", database_name, table_name)
    storage_desc = glue.get_table(
        DatabaseName=database_name, Name=table_name
    )['Table']['StorageDescriptor']
    table_location = storage_desc['Location']

    log.info("getting existing partitions")
    existing_partitions = [
        part['Values']
        for part in glue.get_partitions(
            DatabaseName=database_name,
            TableName=table_name,
            Expression=f"estdate = '{ds_nodash}'"
        )['Partitions']
    ]
    log.info("found existing partitions: %s", existing_partitions)

    num_parts_created = 0
    if by_hour:
        existing_datehours = set(
            f'{val[0]}:{val[1]}' for val in existing_partitions
        )
        for hour in range(24):
            hour_str = str(hour).zfill(2)
            if f'{ds_nodash}:{hour_str}' not in existing_datehours:
                log.info("creating partition date=%s,hour=%s", ds_nodash, hour_str)
                storage_desc['Location'] = f'{table_location}date={ds_nodash}/hour={hour_str}'
                try:
                    glue.create_partition(
                        DatabaseName=database_name,
                        TableName=table_name,
                        PartitionInput={
                            'StorageDescriptor': storage_desc,
                            'Values': [ds_nodash, hour_str]
                        }
                    )
                    num_parts_created += 1
                except glue.exceptions.AlreadyExistsException:
                    log.info("partition already exists, skipping")
    else:
        if existing_partitions:
            log.info("partition already exists, skipping")
        else:
            log.info("creating partition date=%s", ds_nodash)
            storage_desc['Location'] = f'{table_location}date={ds_nodash}'
            try:
                glue.create_partition(
                    DatabaseName=database_name,
                    TableName=table_name,
                    PartitionInput={
                        'StorageDescriptor': storage_desc,
                        'Values': [ds_nodash]
                    }
                )
                num_parts_created += 1
            except glue.exceptions.AlreadyExistsException:
                log.info("partition already exists, skipping")

    return f"created {num_parts_created} new partitions"
def execute(self, context):
    self.log.info('Triggering crawler.')
    glue_hook = AwsHook()
    client = glue_hook.get_client_type(client_type='glue')
    client.start_crawler(Name='s3 crawler')
def execute(self, context): logging.info("Executing UploadFileToS3Operator") aws = AwsHook(aws_conn_id=self.aws_conn_id) s3 = aws.get_client_type('s3') s3_location = '{0}/{1}'.format(self.prefix, self.s3_filename) s3.upload_file(self.local_filename, self.bucket_name, s3_location)