Example #1
    def get_iam_token(self, conn):
        """
        Uses AWSHook to retrieve a temporary password to connect to Postgres
        or Redshift. Port is required. If none is provided, default is used for
        each service
        """
        from airflow.contrib.hooks.aws_hook import AwsHook

        redshift = conn.extra_dejson.get('redshift', False)
        aws_conn_id = conn.extra_dejson.get('aws_conn_id', 'aws_default')
        aws_hook = AwsHook(aws_conn_id)
        login = conn.login
        if conn.port is None:
            port = 5439 if redshift else 5432
        else:
            port = conn.port
        if redshift:
            # Pull the cluster-identifier from the beginning of the Redshift URL
            # ex. my-cluster.ccdre4hpd39h.us-east-1.redshift.amazonaws.com returns my-cluster
            cluster_identifier = conn.extra_dejson.get('cluster-identifier',
                                                       conn.host.split('.')[0])
            client = aws_hook.get_client_type('redshift')
            cluster_creds = client.get_cluster_credentials(
                DbUser=conn.login,
                DbName=self.schema or conn.schema,
                ClusterIdentifier=cluster_identifier,
                AutoCreate=False)
            token = cluster_creds['DbPassword']
            login = cluster_creds['DbUser']
        else:
            client = aws_hook.get_client_type('rds')
            token = client.generate_db_auth_token(conn.host, port, conn.login)
        return login, token, port
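A minimal usage sketch (not from the source): the triple returned above is consumed like an ordinary username/password pair. hook and conn are assumed to already exist, and psycopg2 is an assumed driver choice.

# Sketch only: connect to Postgres/Redshift with the temporary IAM credential.
import psycopg2

login, token, port = hook.get_iam_token(conn)   # hypothetical hook/conn objects
connection = psycopg2.connect(
    host=conn.host,
    port=port,
    user=login,
    password=token,        # short-lived token, roughly 15 minutes by default
    dbname=conn.schema,
    sslmode='require',     # IAM authentication requires SSL
)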
Example #2
    def _wait_for_task_ended(self):
        hook = AwsHook()
        ecs_client = hook.get_client_type('ecs', region_name='eu-west-1')
        task = ecs_client.describe_tasks(cluster='cluster', tasks=[self.arn])['tasks'][0]
        logging.warning(f"TASK: {task}")
        container_name = task['containers'][0]['name']

        log_client = hook.get_client_type('logs', region_name='eu-west-1')
        log_stream_name = f"{self.log_prefix}/{container_name}/{self.arn.split('/')[1]}"

        # hackish waiting pattern - use exception for control flow:
        # - let waiter poll for task stop
        # - return on success, repeat when wait fails
        #
        # while looping:
        # - log events on each try and after waiter succeeds
        #
        # this should give us all events that have been logged by the task
        token = None
        while True:
            token = self._log_events(log_client, log_stream_name, token)

            waiter = self.client.get_waiter('tasks_stopped')
            waiter.config.max_attempts = 2
            try:
                waiter.wait(
                    cluster=self.cluster,
                    tasks=[self.arn]
                )
                self._log_events(log_client, log_stream_name, token)
                return
            except WaiterError:
                continue
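The _log_events helper called above is not shown. A hypothetical sketch that fits the token-passing pattern, built on the CloudWatch Logs get_log_events API (self.log_group is an assumed attribute):

    def _log_events(self, log_client, log_stream_name, token):
        # Hypothetical helper: print any new events from the task's log stream
        # and return the forward token so the next call resumes where this one
        # stopped. Omitting nextToken starts from the head of the stream.
        kwargs = {
            'logGroupName': self.log_group,       # assumed attribute
            'logStreamName': log_stream_name,
            'startFromHead': True,
        }
        if token is not None:
            kwargs['nextToken'] = token
        response = log_client.get_log_events(**kwargs)
        for event in response['events']:
            logging.info(event['message'])
        return response['nextForwardToken']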
Example #3
    def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
        self._create_clusters()
        hook = AwsHook(aws_conn_id='aws_default')
        client_from_hook = hook.get_client_type('redshift')

        clusters = client_from_hook.describe_clusters()['Clusters']
        self.assertEqual(len(clusters), 2)
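The _create_clusters setup helper is not shown. A hypothetical sketch, assuming the test runs under moto's mock_redshift (applied to the test method) and creates the clusters with boto3:

    def _create_clusters(self):
        # Hypothetical setup helper: create two mocked Redshift clusters so that
        # describe_clusters() in the test above returns exactly two entries.
        client = boto3.client('redshift', region_name='us-east-1')
        for ident in ('test-cluster-1', 'test-cluster-2'):
            client.create_cluster(
                ClusterIdentifier=ident,
                NodeType='dc2.large',
                MasterUsername='admin',
                MasterUserPassword='MockPassword1',
            )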
Example #4
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data_left = DataHelper.read_tsv_from_s3_to_df(s3_client,
                                                      self.s3_bucket,
                                                      self.s3_key_in_left)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )
        data_right = DataHelper.read_tsv_from_s3_to_df(s3_client,
                                                       self.s3_bucket,
                                                       self.s3_key_in_right)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )

        joined_data = DataHelper.get_joined_data_from_dfs(
            data_left, data_right, self.left_on_column, self.right_on_column,
            self.suffix_name, self.output_columns)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, joined_data,
                                         self.s3_bucket, self.s3_key_out)
        self.log.info(
            f"Wrote tsv file with joined columns {self.output_columns} dropped to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
    def execute(self, context):
        logging.info("Executing ExtendedEmrCreateJobFlowOperator")
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        emr = aws.get_client_type('emr')

        response = emr.run_job_flow(
            Name=self.api_params.get('Name'),
            LogUri=self.api_params.get('LogUri'),
            ReleaseLabel=self.api_params.get('ReleaseLabel'),
            Instances=self.api_params.get('Instances'),
            Steps=self.api_params.get('Steps', []),
            BootstrapActions=self.api_params.get('BootstrapActions', []),
            Applications=self.api_params.get('Applications'),
            Configurations=self.api_params.get('Configurations', []),
            VisibleToAllUsers=self.api_params.get('VisibleToAllUsers'),
            JobFlowRole=self.api_params.get('JobFlowRole'),
            ServiceRole=self.api_params.get('ServiceRole'),
            Tags=self.api_params.get('Tags'),
        )
        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException('JobFlow creation failed: %s' % response)
        else:
            logging.info('JobFlow with id %s created', response['JobFlowId'])
            job_flow_id = response['JobFlowId']

        if self.wait_for_status is not None:
            status = emr.describe_cluster(
                ClusterId=job_flow_id)['Cluster']['Status']['State']
            while status != self.wait_for_status and status not in self.TERMINATE_STATES:
                logging.info("Waiting for status %s. Current status is %s",
                             self.wait_for_status, status)
                time.sleep(30)
                status = emr.describe_cluster(
                    ClusterId=job_flow_id)['Cluster']['Status']['State']
            if status in self.TERMINATE_STATES:
                raise AirflowException(
                    'Cluster was terminated [%s] before it got to status %s' %
                    (status, self.wait_for_status))

        if self.save_livy_connection_name is not None:
            instances_response = emr.list_instances(
                ClusterId=job_flow_id, InstanceGroupTypes=['MASTER'])
            if self.use_public_ip_for_connections:
                master_ip = instances_response['Instances'][0][
                    'PublicIpAddress']
            else:
                master_ip = instances_response['Instances'][0][
                    'PrivateIpAddress']
            ExtendedEmrCreateJobFlowOperator.create_or_replace_connection(
                connection_id=self.save_livy_connection_name,
                connection_type='Livy',
                ip="http://" + master_ip,
                port=8998,
                login='',
                password='',
                schema='',
                extra='')

        return job_flow_id
Example #6
def read_from_aws_sm_fn(**kwargs):
    ### set up Secrets Manager
    hook = AwsHook()
    client = hook.get_client_type('secretsmanager')
    response = client.get_secret_value(SecretId=sm_secret_name)
    secret_string = response["SecretString"]

    return secret_string
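A small usage sketch (not from the source), assuming the secret value is a JSON document and sm_secret_name is defined elsewhere (e.g. as an Airflow Variable):

# Sketch only: parse the SecretString returned above into a dict.
import json

secret = json.loads(read_from_aws_sm_fn())
username = secret['username']   # hypothetical keys inside the secret
password = secret['password']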
    def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
        client = boto3.client("emr", region_name="us-east-1")
        if len(client.list_clusters()["Clusters"]):
            raise ValueError("AWS not properly mocked")

        hook = AwsHook(aws_conn_id="aws_default")
        client_from_hook = hook.get_client_type("emr")

        self.assertEqual(client_from_hook.list_clusters()["Clusters"], [])
    def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
        client = boto3.client('emr', region_name='us-east-1')
        if len(client.list_clusters()['Clusters']):
            raise ValueError('AWS not properly mocked')

        hook = AwsHook(aws_conn_id='aws_default')
        client_from_hook = hook.get_client_type('emr')

        self.assertEqual(client_from_hook.list_clusters()['Clusters'], [])
Example #9
    def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
        client = boto3.client('emr', region_name='us-east-1')
        if client.list_clusters()['Clusters']:
            raise ValueError('AWS not properly mocked')

        hook = AwsHook(aws_conn_id='aws_default')
        client_from_hook = hook.get_client_type('emr')

        self.assertEqual(client_from_hook.list_clusters()['Clusters'], [])
Example #10
def test(**kwargs):
    ### set up DMS
    hook = AwsHook()
    client = hook.get_client_type('dms')
    client.start_replication_task(
        ReplicationTaskArn={HERE},
        StartReplicationTaskType={HERE}
    )['ReplicationTask']
    return client
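The {HERE} placeholders above are left as provided. A filled-in sketch with invented values; StartReplicationTaskType accepts 'start-replication', 'resume-processing', or 'reload-target':

def start_dms_task(**kwargs):
    # Sketch only: same call as above with hypothetical arguments.
    hook = AwsHook()
    client = hook.get_client_type('dms')
    task = client.start_replication_task(
        # invented example ARN; substitute your own replication task
        ReplicationTaskArn='arn:aws:dms:us-east-1:123456789012:task:EXAMPLETASK',
        StartReplicationTaskType='start-replication',
    )['ReplicationTask']
    return task['Status']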
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket, self.s3_key_in)
        self.log.info(f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe.")
        unstacked_data = DataHelper.unstack_df_column(data, self.id_column, self.unstack_column)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, unstacked_data, self.s3_bucket, self.s3_key_out)
        self.log.info(f"Wrote tsv file with unstacked {self.unstack_column} to s3://{self.s3_bucket}/{self.s3_key_out}.")
    def copy_results_s3(self):
        results = self.query_mysql()
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3 = aws.get_client_type('s3')
        concat = StringIO()
        for row in results:
            concat.write(",".join(map(str, row)) + '\n')
        s3_location = self.prefix + '/' + self.s3_filename
        s3.put_object(Body=concat.getvalue(),
                      Bucket=self.bucket_name,
                      Key=s3_location)
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        engagement_dfs = []
        for key in DataHelper.generate_all_keys_from_s3_with_prefix(s3_client, self.s3_bucket, f"{self.engagement_type}/"):
            df = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket, key)
            self.log.info(f"Read tsv file s3://{self.s3_bucket}/{key} into dataframe.")
            engagement_dfs.append(df)
        
        all_engagement_df = DataHelper.combine_engagement_dfs(engagement_dfs, ['user_id', 'engaged_with_id'], lambda x: 1)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, all_engagement_df, self.s3_bucket, self.s3_key_out)
        self.log.info(f"Wrote combined engagement tsv file to s3://{self.s3_bucket}/{self.engagement_type}/{self.s3_key_out}.")
Example #14
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

        self.log.info(
            f"Loading file s3://{self.s3_bucket}/{self.s3_key} into table {self.table}."
        )
        with DataHelper.buffer_s3_object_as_file(s3_client, self.s3_bucket,
                                                 self.s3_key) as f:
            postgres.bulk_load(self.table, f)
        self.log.info(
            f"s3://{self.s3_bucket}/{self.s3_key} loaded into table {self.table} sucesfully."
        )
Example #15
    def get_iam_token(self, conn):
        """
        Uses AWSHook to retrieve a temporary password to connect to MySQL
        Port is required. If none is provided, default 3306 is used
        """
        from airflow.contrib.hooks.aws_hook import AwsHook

        aws_conn_id = conn.extra_dejson.get('aws_conn_id', 'aws_default')
        aws_hook = AwsHook(aws_conn_id)
        if conn.port is None:
            port = 3306
        else:
            port = conn.port
        client = aws_hook.get_client_type('rds')
        token = client.generate_db_auth_token(conn.host, port, conn.login)
        return token, port
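As in Example #1, the token is then supplied as the password. A minimal sketch with PyMySQL as an assumed driver (hook and conn are hypothetical; IAM authentication requires SSL, and the CA bundle path is an assumption):

# Sketch only: connect to MySQL on RDS with the temporary IAM token.
import pymysql

token, port = hook.get_iam_token(conn)          # hypothetical hook/conn objects
connection = pymysql.connect(
    host=conn.host,
    port=port,
    user=conn.login,
    password=token,                              # valid for roughly 15 minutes
    database=conn.schema,
    ssl={'ca': '/path/to/rds-combined-ca-bundle.pem'},  # assumed certificate path
)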
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        self.log.info(
            f"Parsing {self.activity} events from s3://{self.s3_bucket}/{self.s3_key_in}."
        )
        with DataHelper.buffer_s3_object_as_file(s3_client, self.s3_bucket,
                                                 self.s3_key_in) as f:
            data = DataHelper.parse_activity_json_to_df(
                f, self.activity)
            DataHelper.write_df_to_tsv_in_s3(s3_resource, data, self.s3_bucket,
                                             self.s3_key_out)
        self.log.info(
            f"Wrote {self.activity} eveents to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
    def execute(self, context):
        logging.info("Executing EmrOperator")
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        emr = aws.get_client_type('emr')
        # create the cluster
        response = emr.run_job_flow(
            Name=self.cluster_name,
            LogUri=self.log_s3_uri,
            ReleaseLabel=self.release_label,
            Instances={
                'MasterInstanceType': self.instance_type,
                'SlaveInstanceType': self.instance_type,
                'InstanceCount': self.instance_count,
                'KeepJobFlowAliveWhenNoSteps': False,
                'Ec2KeyName': self.ec2_key_name
            },
            Applications=self.applications,
            VisibleToAllUsers=True,
            JobFlowRole=self.job_flow_role,
            ServiceRole=self.service_role,
            Configurations=self.configurations,
        )
        # add steps
        cluster_id = response['JobFlowId']
        response_step = emr.add_job_flow_steps(JobFlowId=cluster_id,
                                               Steps=self.steps)

        start_time = time.time()

        while True:
            if self.step_timeout_minutes:
                if time.time() > start_time + self.step_timeout_minutes * 60:
                    raise AirflowException('EMR step(s) time out!')
            step_statuses = [
                i['Status']['State']
                for i in emr.list_steps(ClusterId=cluster_id)['Steps']
            ]
            if not any(
                    map(lambda x: x in ('PENDING', 'RUNNING'), step_statuses)):
                if not all(map(lambda x: x == 'COMPLETED', step_statuses)):
                    raise AirflowException('EMR step(s) failed!')
                break
            else:
                logging.info("Job(s) still running/pending")
                time.sleep(60)
Example #18
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
            s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe."
        )

        recs = RecommendationHelper.get_top_n_recommendations(
            data_sec_deg, self.n_recs)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, recs, self.s3_bucket,
                                         self.s3_key_out)
        self.log.info(
            f"Wrote recommendations tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
Example #19
    def execute(self, context):
        hook = AwsHook(aws_conn_id=self.aws_credentials_id)
        client = hook.get_client_type("emr", region_name=self.region)
        cluster_id = context['task_instance'].xcom_pull(
            task_ids=self.cluster_task_id)
        steps_str = context['task_instance'].xcom_pull(
            task_ids=self.step_task_id)
        step_ids = re.sub("\"|'", "", steps_str).split()
        step_info = client.list_steps(ClusterId=cluster_id, StepIds=step_ids)
        step_status = step_info['Steps'][-1]['Status']['State']

        while step_status in ['PENDING', 'RUNNING']:
            step_info = client.list_steps(ClusterId=cluster_id,
                                          StepIds=step_ids)
            step_status = step_info['Steps'][-1]['Status']['State']
            if self.is_timeout():
                raise StepTimeoutError
            time.sleep(60)
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket,
                                                 self.s3_key_in)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )

        n_strongest = RecommendationHelper.get_top_n_closest(
            data, self.n_strongest)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, n_strongest,
                                         self.s3_bucket, self.s3_key_out)
        self.log.info(
            f"Wrote strongest connections tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
Example #21
def fetch_definitions():
    definitions = {}

    hook = AwsHook()
    ecs = hook.get_client_type('ecs', region_name='eu-west-1')
    first = True
    next_token = None
    while first or next_token is not None:
        if first:
            first = False
            ecs_tasks = ecs.list_task_definitions()
        else:
            ecs_tasks = ecs.list_task_definitions(nextToken=next_token)
        next_token = ecs_tasks.get('nextToken')

        for arn in ecs_tasks['taskDefinitionArns']:
            ecs_task = ecs.describe_task_definition(taskDefinition=arn)
            containers = ecs_task['taskDefinition']['containerDefinitions']
            if len(containers) != 1:
                continue

            container = containers[0]
            if 'dockerLabels' not in container:
                continue

            labels = parse_labels(container['dockerLabels'])

            if 'airflow' not in labels:
                continue

            try:
                name = labels['airflow']['dag']['name']

                definitions[name] = {
                    'airflow': labels['airflow'],
                    'arn': arn,
                    'container': container
                }

            except KeyError as e:
                logging.warning(f"Invalid configuration: {labels}", exc_info=e)

    return definitions
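The parse_labels helper referenced above is not shown. A hypothetical sketch that turns flat Docker labels such as "airflow.dag.name" into the nested dict the loop expects:

def parse_labels(docker_labels):
    # Hypothetical helper: convert dot-separated label keys, e.g.
    # {"airflow.dag.name": "my_dag"}, into nested dicts, e.g.
    # {"airflow": {"dag": {"name": "my_dag"}}}.
    nested = {}
    for key, value in docker_labels.items():
        node = nested
        parts = key.split('.')
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return nested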
    def execute(self, context):
        logging.info("Executing S3DeletePrefixOperator")
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3 = aws.get_client_type('s3')
        objects_to_delete = s3.list_objects(Bucket=self.bucket_name,
                                            Prefix=self.prefix)
        delete_keys = {
            'Objects': [{'Key': obj['Key']}
                        for obj in objects_to_delete.get('Contents', [])]
        }
        try:
            response = s3.delete_objects(Bucket=self.bucket_name,
                                         Delete=delete_keys)
            logging.info(response)
        except Exception as e:  # TODO: import botocore client exception for missing delete
            logging.info('delete error {}'.format(e))
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data = DataHelper.read_csv_from_s3_to_df(s3_client, self.s3_bucket,
                                                 self.s3_key_in)
        self.log.info(
            f"Read csv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )
        data_with_dummies = DataHelper.get_dummy_colums(data,
                                                        self.indicator_column,
                                                        sep=self.sep)
        self.log.info(
            f"Created dummy fields for column {self.indicator_column}.")
        DataHelper.write_df_to_csv_in_s3(s3_resource, data_with_dummies,
                                         self.s3_bucket, self.s3_key_in)
        self.log.info(
            f"Wrote updated data back to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
Example #24
    def execute(self, context):
        logging.info("Executing {}".format(self.__class__.__name__))
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3 = aws.get_client_type('s3')
        s3_location = '{0}/{1}'.format(self.destination_prefix,
                                       self.s3_filename)

        # initialize adwords client
        client = adwords.AdWordsClient.LoadFromStorage(self.yaml_file_location)
        report_downloader = client.GetReportDownloader(version='v201806')

        # build awql report
        report_query = adwords.ReportQueryBuilder().\
                       Select(*self.fields).\
                       From(self.report_name).\
                       During(start_date=self.query_start_date,
                              end_date=self.query_end_date)

        for condition in self.conditions:
            report_query = report_query.Where(condition['name'])\
                                       .In(*condition['values'])

        report_query = report_query.Build()

        # Download report locally (temp)
        filepath = self.temp_localfile
        with open(filepath, 'wb') as handler:
            report_downloader.DownloadReportWithAwql(
                report_query,
                'CSV',
                output=handler,
                skip_report_header=True,
                skip_column_header=False,
                skip_report_summary=True,
                include_zero_impressions=False)
        # Upload to S3
        s3.upload_file(filepath, self.destination_bucket, s3_location)
def write_all_to_aws_sm_fn(**kwargs):
    ### determine secrets manager prefixes
    connections_prefix = 'airflow/connections'
    variables_prefix = 'airflow/variables'
    backend_kwargs = kwargs['conf'].get(section='secrets',
                                        key='backend_kwargs')
    if backend_kwargs:
        x = json.loads(backend_kwargs)
        connections_prefix = x['connections_prefix'].strip().rstrip('/')
        variables_prefix = x['variables_prefix'].strip().rstrip('/')
        print("using connections_prefix=", connections_prefix,
              ",variables_prefix=", variables_prefix, "...")
    else:
        print("backend_kwargs undefined--using defaults connections_prefix=",
              connections_prefix, ",variables_prefix=", variables_prefix)

    ### set up SQL and AWSSM
    session = settings.Session()
    hook = AwsHook()
    client = hook.get_client_type('secretsmanager')

    ### transfer connections
    query = session.query(Connection)
    print(query.count(), " connections: ")
    for curr_entry in query:
        curr_id = connections_prefix + '/' + curr_entry.conn_id
        curr_val = curr_entry.get_uri()
        write_to_sm_fn(name=curr_id, value=curr_val, client=client)

    ### transfer variables
    query = session.query(Variable)
    print(query.count(), " variables: ")
    for curr_entry in query:
        curr_id = variables_prefix + '/' + curr_entry.key
        curr_val = curr_entry.get_val()
        write_to_sm_fn(name=curr_id, value=curr_val, client=client)

    return "OK"
    def execute(self, context):
        logging.info("Executing CloudwatchToS3Operator")
        logging.info(', '.join("%s: %s" % item for item in vars(self).items()))
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        cloudwatch = aws.get_client_type('logs')
        response = cloudwatch.create_export_task(
            taskName=self.task_name,
            logGroupName=self.log_group_name,
            logStreamNamePrefix=self.log_stream_name_prefix,
            fromTime=int(self.from_utc_timestamp),
            to=int(self.to_utc_timestamp),
            destination=self.destination_bucket,
            destinationPrefix=self.destination_prefix)
        task_id = response['taskId']
        status_code = "RUNNING"
        while status_code == "RUNNING":
            time.sleep(2)
            status_code = cloudwatch.describe_export_tasks(
                taskId=task_id)['exportTasks'][0]['status']['code']

        if status_code != "COMPLETED":
            raise AirflowException(
                'Cloudwatch export task failed -{}'.format(status_code))
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
            s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe."
        )
        data_existing_conn = DataHelper.read_tsv_from_s3_to_df(
            s3_client, self.s3_bucket, self.s3_key_existing_conn)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_existing_conn} into dataframe."
        )

        sec_deg_conn_valid = RecommendationHelper.remove_invalid_recommendations(
            data_sec_deg, data_existing_conn, self.conn_type)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, sec_deg_conn_valid,
                                         self.s3_bucket, self.s3_key_out)
        self.log.info(
            f"Wrote valid second degree connections tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
Example #28
    def execute(self, context):
        logging.info("Executing GoogleAnalyticsToS3Operator")
        logging.info(', '.join("%s: %s" % item for item in vars(self).items()))
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3 = aws.get_client_type('s3')
        s3_location = '{0}/{1}'.format(self.destination_prefix,
                                       self.s3_filename)

        SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
        credentials = ServiceAccountCredentials.from_json_keyfile_name(
            self.key_file_location, SCOPES)
        analytics = build('analyticsreporting', 'v4', credentials=credentials)

        df = pd.DataFrame()
        PAYLOAD_SIZE = 100000

        names = None
        pagination = 0
        while True:
            #for page in range(MAX_RECORXDS//PAYLOAD_SIZE):
            payload = {
                'viewId': self.view_id,
                'dateRanges': [{'startDate': self.start, 'endDate': self.end}],
                'metrics': [{'expression': 'ga:sessions'},
                            {'expression': 'ga:pageviews'}],
                'dimensions': [{'name': 'ga:campaign'},
                               {'name': 'ga:medium'},
                               {'name': 'ga:source'},
                               {'name': 'ga:region'},
                               {'name': 'ga:city'},
                               {'name': 'ga:dateHourMinute'},
                               {'name': 'ga:deviceCategory'}],
                'orderBys': [{'fieldName': 'ga:sessions',
                              'sortOrder': 'DESCENDING'}],
                'filtersExpression': 'ga:country=~Canada',
                'pageToken': '{}'.format(pagination),
                'pageSize': PAYLOAD_SIZE
            }
            report = analytics.reports().batchGet(body={
                'reportRequests': [payload]
            }).execute()
            if not names:
                headers = report['reports'][0]['columnHeader']
                dims = headers['dimensions']
                metrics = [
                    i['name']
                    for i in headers['metricHeader']['metricHeaderEntries']
                ]
                names = [i.replace('ga:', '') for i in dims + metrics]

            if 'rows' in report['reports'][0]['data'].keys():
                data = pd.DataFrame([
                    i['dimensions'] + i['metrics'][0]['values']
                    for i in report['reports'][0]['data']['rows']
                ],
                                    columns=names)
                df = df.append(data)
            else:
                break
            df['dateHourMinute'] = pd.to_datetime(
                df['dateHourMinute'], format='%Y%m%d%H%M').dt.tz_localize(
                    'UTC', ambiguous='infer').dt.tz_convert('UTC').dt.strftime(
                        '%Y-%m-%d %H:%M:00')
            local_file = '/tmp/temp_ga_data.csv'
            df.to_csv(local_file, index=False)
            s3.upload_file(local_file, self.destination_bucket, s3_location)
            pagination += PAYLOAD_SIZE
def _create_destination_partitions(
        *,
        aws_conn_id: str,
        database_name: str,
        table_name: str,
        by_hour: bool,
        ds_nodash: str,
        task: BaseOperator,
        **_
) -> str:
    """Python callable for the `CreateDestinationPartitionsOperator`."""

    aws = AwsHook(aws_conn_id)
    glue = aws.get_client_type('glue')
    log = task.log

    log.info("getting Glue table information for %s.%s", database_name, table_name)
    storage_desc = glue.get_table(
        DatabaseName=database_name,
        Name=table_name
    )['Table']['StorageDescriptor']
    table_location = storage_desc['Location']

    log.info("getting existing partitions")
    existing_partitions = [
        part['Values'] for part in glue.get_partitions(
            DatabaseName=database_name,
            TableName=table_name,
            Expression=f"estdate = '{ds_nodash}'"
        )['Partitions']
    ]
    log.info("found existing partitions: %s", existing_partitions)

    num_parts_created = 0
    if by_hour:
        existing_datehours = set(
            f'{val[0]}:{val[1]}' for val in existing_partitions
        )
        for hour in range(24):
            hour_str = str(hour).zfill(2)
            if f'{ds_nodash}:{hour_str}' not in existing_datehours:
                log.info("creating partition date=%s,hour=%s", ds_nodash, hour_str)
                storage_desc['Location'] = f'{table_location}date={ds_nodash}/hour={hour_str}'
                try:
                    glue.create_partition(
                        DatabaseName=database_name,
                        TableName=table_name,
                        PartitionInput={
                            'StorageDescriptor': storage_desc,
                            'Values': [ds_nodash, hour_str]
                        }
                    )
                    num_parts_created += 1
                except glue.exceptions.AlreadyExistsException:
                    log.info("partition already exists, skipping")
    else:
        if existing_partitions:
            log.info("partition already exists, skipping")
        else:
            log.info("creating partition date=%s", ds_nodash)
            storage_desc['Location'] = f'{table_location}date={ds_nodash}'
            try:
                glue.create_partition(
                    DatabaseName=database_name,
                    TableName=table_name,
                    PartitionInput={
                        'StorageDescriptor': storage_desc,
                        'Values': [ds_nodash]
                    }
                )
                num_parts_created += 1
            except glue.exceptions.AlreadyExistsException:
                log.info("partition already exists, skipping")

    return f"created {num_parts_created} new partitions"
Example #30
    def execute(self, context):
        self.log.info('Triggering crawler.')
        glue_hook = AwsHook()
        client = glue_hook.get_client_type(client_type='glue')
        client.start_crawler(Name='s3 crawler')
    def execute(self, context):
        logging.info("Executing UploadFileToS3Operator")
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3 = aws.get_client_type('s3')
        s3_location = '{0}/{1}'.format(self.prefix, self.s3_filename)
        s3.upload_file(self.local_filename, self.bucket_name, s3_location)