Example #1
    def test_get_resource_type_returns_a_boto3_resource_of_the_requested_type(
            self):
        hook = AwsHook(aws_conn_id='aws_default')
        resource_from_hook = hook.get_resource_type('dynamodb')

        # this table needs to be created in production
        table = resource_from_hook.create_table(TableName='test_airflow',
                                                KeySchema=[
                                                    {
                                                        'AttributeName': 'id',
                                                        'KeyType': 'HASH'
                                                    },
                                                ],
                                                AttributeDefinitions=[{
                                                    'AttributeName': 'id',
                                                    'AttributeType': 'S'
                                                }],
                                                ProvisionedThroughput={
                                                    'ReadCapacityUnits': 10,
                                                    'WriteCapacityUnits': 10
                                                })

        table.meta.client.get_waiter('table_exists').wait(
            TableName='test_airflow')

        self.assertEqual(table.item_count, 0)
Example #2
    def test_get_resource_type_returns_a_boto3_resource_of_the_requested_type(self):

        hook = AwsHook(aws_conn_id='aws_default')
        resource_from_hook = hook.get_resource_type('dynamodb')

        # this table needs to be created in production
        table = resource_from_hook.create_table(
            TableName='test_airflow',
            KeySchema=[
                {
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'id',
                    'AttributeType': 'S'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )

        table.meta.client.get_waiter(
            'table_exists').wait(TableName='test_airflow')

        self.assertEqual(table.item_count, 0)
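Both examples above come from Airflow's own test suite and assume a DynamoDB endpoint reachable through the `aws_default` connection. As a rough, hypothetical sketch (not part of the original tests) of how the same call sequence can be exercised locally, the hook can be swapped for a plain boto3 resource under moto's mock; this assumes moto >= 5 and boto3 are installed, and the function name below is illustrative:

import boto3
from moto import mock_aws  # moto >= 5; older releases expose mock_dynamodb2 instead


@mock_aws
def create_test_table():
    # A plain boto3 resource stands in for hook.get_resource_type('dynamodb').
    dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
    table = dynamodb.create_table(
        TableName='test_airflow',
        KeySchema=[{'AttributeName': 'id', 'KeyType': 'HASH'}],
        AttributeDefinitions=[{'AttributeName': 'id', 'AttributeType': 'S'}],
        ProvisionedThroughput={'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10},
    )
    table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow')
    return table.item_count  # 0 for a freshly created table


print(create_test_table())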
Example #3
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data_left = DataHelper.read_tsv_from_s3_to_df(s3_client,
                                                      self.s3_bucket,
                                                      self.s3_key_in_left)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )
        data_right = DataHelper.read_tsv_from_s3_to_df(s3_client,
                                                       self.s3_bucket,
                                                       self.s3_key_in_right)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )

        joined_data = DataHelper.get_joined_data_from_dfs(
            data_left, data_right, self.left_on_column, self.right_on_column,
            self.suffix_name, self.output_columns)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, joined_data,
                                         self.s3_bucket, self.s3_key_out)
        self.log.info(
            f"Wrote tsv file with joined columns {self.output_columns} dropped to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
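DataHelper is project-specific and its implementation is not part of this listing. A minimal, hypothetical sketch of the two helpers the operators above lean on (read_tsv_from_s3_to_df and write_df_to_tsv_in_s3), assuming pandas plus a boto3 S3 client and resource, could look like this:

import io

import pandas as pd


def read_tsv_from_s3_to_df(s3_client, bucket, key):
    # get_object returns a StreamingBody, which pandas can read directly.
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(obj['Body'], sep='\t')


def write_df_to_tsv_in_s3(s3_resource, df, bucket, key):
    # Serialize the frame to a TSV string in memory, then upload it as the object body.
    buffer = io.StringIO()
    df.to_csv(buffer, sep='\t', index=False)
    s3_resource.Object(bucket, key).put(Body=buffer.getvalue().encode('utf-8'))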
Example #4
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket, self.s3_key_in)
        self.log.info(f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe.")
        unstacked_data = DataHelper.unstack_df_column(data, self.id_column, self.unstack_column)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, unstacked_data, self.s3_bucket, self.s3_key_out)
        self.log.info(f"Wrote tsv file with unstacked {self.unstack_column} to s3://{self.s3_bucket}/{self.s3_key_out}.")
Example #5
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        engagement_dfs = []
        for key in DataHelper.generate_all_keys_from_s3_with_prefix(s3_client, self.s3_bucket, f"{self.engagement_type}/"):
            df = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket, key)
            self.log.info(f"Read tsv file s3://{self.s3_bucket}/{key} into dataframe.")
            engagement_dfs.append(df)
        
        all_engagement_df = DataHelper.combine_engagement_dfs(engagement_dfs, ['user_id', 'engaged_with_id'], lambda x: 1)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, all_engagement_df, self.s3_bucket, self.s3_key_out)
        self.log.info(f"Wrote combined engagement tsv file to s3://{self.s3_bucket}/{self.engagement_type}/{self.s3_key_out}.")
Example #6
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        self.log.info(
            f"Parsing {self.activity} events from s3://{self.s3_bucket}/{self.s3_key_in}."
        )
        with DataHelper.buffer_s3_object_as_file(s3_client, self.s3_bucket,
                                                 self.s3_key_in) as json_file:
            data = DataHelper.parse_activity_json_to_df(
                json_file, self.activity)
            DataHelper.write_df_to_tsv_in_s3(s3_resource, data, self.s3_bucket,
                                             self.s3_key_out)
        self.log.info(
            f"Wrote {self.activity} events to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
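buffer_s3_object_as_file is likewise project-specific. One hypothetical way to provide it, assuming a boto3 S3 client, is to download the object into a temporary file and hand it back as a context-managed file object:

import contextlib
import tempfile


@contextlib.contextmanager
def buffer_s3_object_as_file(s3_client, bucket, key):
    # Download into a temporary file, rewind, and yield it for reading.
    with tempfile.TemporaryFile() as tmp:
        s3_client.download_fileobj(bucket, key, tmp)
        tmp.seek(0)
        yield tmp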
Example #7
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data = DataHelper.read_tsv_from_s3_to_df(s3_client, self.s3_bucket,
                                                 self.s3_key_in)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )

        n_strongest = RecommendationHelper.get_top_n_closest(
            data, self.n_strongest)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, n_strongest,
                                         self.s3_bucket, self.s3_key_out)
        self.log.info(
            f"Wrote strongest connections tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
Example #8
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
            s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe."
        )

        recs = RecommendationHelper.get_top_n_recommendations(
            data_sec_deg, self.n_recs)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, recs, self.s3_bucket,
                                         self.s3_key_out)
        self.log.info(
            f"Wrote recommendations tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
Example #9
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data = DataHelper.read_csv_from_s3_to_df(s3_client, self.s3_bucket,
                                                 self.s3_key_in)
        self.log.info(
            f"Read csv file s3://{self.s3_bucket}/{self.s3_key_in} into dataframe."
        )
        data_with_dummies = DataHelper.get_dummy_colums(data,
                                                        self.indicator_column,
                                                        sep=self.sep)
        self.log.info(
            f"Created dummy fields for column {self.indicator_column}.")
        DataHelper.write_df_to_csv_in_s3(s3_resource, data_with_dummies,
                                         self.s3_bucket, self.s3_key_out)
        self.log.info(
            f"Wrote updated data back to s3://{self.s3_bucket}/{self.s3_key_out}."
        )
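get_dummy_colums (name kept as it appears in the listing) is another project helper. A hypothetical pandas-based sketch that splits a delimited indicator column into one 0/1 column per distinct value and appends those columns to the frame:

import pandas as pd


def get_dummy_colums(df, indicator_column, sep=','):
    # Series.str.get_dummies splits on `sep` and yields one 0/1 column per value.
    dummies = df[indicator_column].str.get_dummies(sep=sep)
    return pd.concat([df, dummies], axis=1)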
Example #10
def _clear_destination_bucket(*, aws_conn_id: str, bucket_name: str,
                              bucket_data_prefix: str, table_name: str,
                              ds_nodash: str, task: BaseOperator, **_) -> str:
    """Python callable for the `ClearDestinationBucketOperator`."""

    log = task.log

    aws = AwsHook(aws_conn_id)
    s3 = aws.get_resource_type('s3')  # pylint: disable=invalid-name

    prefix = f'{bucket_data_prefix}{table_name}/date={ds_nodash}/'

    log.info("erasing any existing data in s3://%s/%s", bucket_name, prefix)
    resp = s3.Bucket(bucket_name).objects.filter(Prefix=prefix).delete()
    log.info("got response: %s", resp)

    errors = [item for single_resp in resp
              for item in single_resp.get('Errors', [])]
    if errors:
        raise AirflowException(
            f"Unable to fully erase existing data in s3://{bucket_name}/{prefix}"
        )

    deleted = [item for single_resp in resp
               for item in single_resp.get('Deleted', [])]
    return f"erased {len(deleted)} files"
Example #11
def _load_extracted_mappings(
        *,
        aws_conn_id: str,
        bucket_name: str,
        bucket_data_prefix: str,
        partner: str,
        ds_nodash: str,
        log: Logger
) -> Sequence[Mapping[str, int]]:
    """Load extracted mappings from S3."""

    full_prefix = f'{bucket_data_prefix}{partner}/date={ds_nodash}/'
    log.info("loading extracted mappings from s3://%s/%s", bucket_name, full_prefix)
    aws = AwsHook(aws_conn_id)
    bucket = aws.get_resource_type('s3').Bucket(bucket_name)
    extracted_mappings = [
        json.loads(line) for file_obj in bucket.objects.filter(
            Prefix=full_prefix
        ) for line in file_obj.get()['Body'].iter_lines()
    ]
    log.info("loaded mappings: %s", extracted_mappings)

    return extracted_mappings
Example #12
    def execute(self, context):
        aws = AwsHook(aws_conn_id=self.aws_conn_id)
        s3_client = aws.get_client_type('s3', region_name=self.s3_region)
        s3_resource = aws.get_resource_type('s3', region_name=self.s3_region)

        data_sec_deg = DataHelper.read_tsv_from_s3_to_df(
            s3_client, self.s3_bucket, self.s3_key_in_sec_deg)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_in_sec_deg} into dataframe."
        )
        data_existing_conn = DataHelper.read_tsv_from_s3_to_df(
            s3_client, self.s3_bucket, self.s3_key_existing_conn)
        self.log.info(
            f"Read tsv file s3://{self.s3_bucket}/{self.s3_key_existing_conn} into dataframe."
        )

        sec_deg_conn_valid = RecommendationHelper.remove_invalid_recommendations(
            data_sec_deg, data_existing_conn, self.conn_type)
        DataHelper.write_df_to_tsv_in_s3(s3_resource, sec_deg_conn_valid,
                                         self.s3_bucket, self.s3_key_out)
        self.log.info(
            f"Wrote valid second degree connections tsv file to s3://{self.s3_bucket}/{self.s3_key_out}."
        )