Exemple #1
0
def delete_store_products(dynamodb, store_product_urls):
    product_table = dynamodb.Table(get_table_name('product'))
    tag_table = dynamodb.Table(get_table_name('product_tag'))

    with product_table.batch_writer() as batch:
        for sp_url in store_product_urls:
            batch.delete_item(Key={'store_product_url': sp_url})

    with tag_table.batch_writer() as batch:
        for tag in fetch_product_tags(dynamodb, store_product_urls):
            batch.delete_item(Key=dict(
                store_product_url=tag['store_product_url'],
                tag=tag['tag'],
            ))
Exemple #2
0
def fetch_product_tags(dynamodb, store_product_urls, product_tag=None):
    tag_table = dynamodb.Table(get_table_name('product_tag'))

    index_name = None
    if product_tag is not None:
        index_name = f'{product_tag.name}_idx'

    for sp_url in store_product_urls:
        sp_url = clean_product_url(sp_url)
        start_key = None
        while True:
            key_expr = Key('store_product_url').eq(sp_url)

            query_kwargs = {}
            if start_key is not None:
                query_kwargs['ExclusiveStartKey'] = start_key
            if index_name is not None:
                query_kwargs['IndexName'] = index_name

            results = tag_table.query(KeyConditionExpression=key_expr,
                                      **query_kwargs)
            for item in results['Items']:
                yield item

            start_key = results.get('LastEvaluatedKey')
            if start_key is None:
                break
Exemple #3
0
def _fetch_products(dynamodb,
                    index_name,
                    key_expr,
                    limit=None,
                    only_attributes=None,
                    consistent_read=False):
    product_table = dynamodb.Table(get_table_name('product'))

    projection_expression = None
    if only_attributes is not None:
        # Always retrieve "product_uuid" so that blacklisted product UUIDs can
        # be filtered from results
        if 'product_uuid' not in only_attributes:
            fetch_attributes = list(only_attributes) + ['product_uuid']
        else:
            fetch_attributes = only_attributes
        projection_expression = ','.join(fetch_attributes)

    start_key = None
    while True:
        query_kwargs = {}
        query_kwargs['ConsistentRead'] = consistent_read
        if start_key is not None:
            query_kwargs['ExclusiveStartKey'] = start_key
        if projection_expression is not None:
            query_kwargs['ProjectionExpression'] = projection_expression
        if limit is not None:
            # Note, if we start using Filter Expressions on the results of
            # queries, "limit" will be applied before filtering results.
            # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Query.html#Query.Limit
            query_kwargs['Limit'] = limit

        results = product_table.query(IndexName=index_name,
                                      KeyConditionExpression=key_expr,
                                      **query_kwargs)
        for item in results['Items']:
            # stored as a "number" in DynamoDB
            # (required to allow indexing)
            if 'is_available' in item:
                item['is_available'] = bool(item['is_available'])

            if item['product_uuid'] not in PRODUCT_UUID_BLACKLIST:
                if (only_attributes is not None
                        and 'product_uuid' not in only_attributes):
                    # Do not include product UUID in results if not requested
                    del item['product_uuid']
                yield item

        if limit is not None:
            limit -= len(results['Items'])
            if limit <= 0:
                break

        start_key = results.get('LastEvaluatedKey')
        if start_key is None:
            break
Exemple #4
0
def delete_product_tags(dynamodb, product_tag, store_product_urls):
    tag_table = dynamodb.Table(get_table_name('product_tag'))

    with tag_table.batch_writer() as batch:
        for sp_url in store_product_urls:
            sp_url = clean_product_url(sp_url)
            batch.delete_item(Key=dict(
                store_product_url=sp_url,
                tag=product_tag.name,
            ))
Exemple #5
0
def set_product_tag(dynamodb, store_product_url, product_tag, **attrs):
    tag_table = dynamodb.Table(get_table_name('product_tag'))
    store_product_url = clean_product_url(store_product_url)
    item = {
        'store_product_url': store_product_url,
        'tag': product_tag.name,
        product_tag.name: 1,
    }
    item.update(attrs)
    tag_table.put_item(Item=item)
Exemple #6
0
def add_store_product(dynamodb,
                      product_url,
                      store_domain,
                      is_available=True,
                      **attrs):
    product_table = dynamodb.Table(get_table_name('product'))

    store_product_url = clean_product_url(product_url)
    item_data = dict(
        store_product_url=store_product_url,
        full_store_product_url=product_url,
        store_domain=store_domain,
        # "is_available" is stored as a "number" in DynamoDB
        # (required to allow indexing)
        is_available=int(is_available),
        **attrs)

    item_data = parse_store_product_data(item_data)

    # Set "brand domain" if available for new products
    # (for existing products, this is set according to "product_uuid" by bulk
    # data processing job)
    if attrs.get('store_product_brand_domain'):
        item_data['brand_domain'] = attrs.get('store_product_brand_domain')

    try:
        # product does not yet exist in DB, assign a new product ID
        item_data['product_uuid'] = uuid.uuid4().hex

        product_table.put_item(
            Item=item_data,
            ConditionExpression='attribute_not_exists(store_product_url)')
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
            raise
        else:
            raise ValueError(
                f'Product with url "{store_product_url}" already exists')

    primary_image_url = None
    if item_data.get('image_urls'):
        primary_image_url = item_data['image_urls'][0]

    # Tag this store product for requiring indexing if it has an image
    if primary_image_url:
        set_product_tag(dynamodb,
                        store_product_url,
                        ProductTag.image_not_indexed,
                        image_url=primary_image_url)
    # Tag this store product for requiring metadata update
    set_product_tag(dynamodb, store_product_url,
                    ProductTag.update_product_meta)
def update_product_attribute(dynamodb, product_url, attr, value):
    """
    Define separate function to update product table data without using the API.
    The API does not support updating certain attributes that are managed
    internally and by separate by batch processes.
    """
    product_table = dynamodb.Table(get_table_name('product'))
    store_product_url = clean_product_url(product_url)
    update_expression = f'SET {attr} = {value}'
    product_table.update_item(
        Key={'store_product_url': store_product_url},
        UpdateExpression=update_expression,
    )
Exemple #8
0
def update_store_product(dynamodb, product_url, **attrs):
    product_table = dynamodb.Table(get_table_name('product'))

    store_product_url = clean_product_url(product_url)

    item_data = dict(full_store_product_url=product_url, **attrs)

    image_not_indexed = False
    update_product_meta = False

    old_item_data = product_table.get_item(Key={
        'store_product_url':
        store_product_url
    }, ).get('Item')

    if not old_item_data:
        raise ValueError(
            f'Product with url "{store_product_url}" does not yet exist')

    old_primary_image_url = None
    if old_item_data.get('image_urls'):
        old_primary_image_url = old_item_data['image_urls'][0]
    new_primary_image_url = None
    if item_data.get('image_urls'):
        new_primary_image_url = item_data['image_urls'][0]

    if (new_primary_image_url is not None and
            # Assume query string does not affect image contents and compare image
            # URLs without query string component
            clean_product_url(new_primary_image_url) !=
            clean_product_url(old_primary_image_url)):
        image_not_indexed = True

    if (old_item_data.get('store_product_brand_domain') !=
            item_data.get('store_product_brand_domain')):
        update_product_meta = True

    item_data = parse_store_product_data(item_data, new_item=False)

    update_expression = 'SET {}'.format(', '.join(
        [f'{attr} = :{attr}' for attr in item_data]))
    expression_attribute_values = {
        f':{attr}': value
        for attr, value in item_data.items()
    }
    product_table.update_item(
        Key={'store_product_url': store_product_url},
        UpdateExpression=update_expression,
        ExpressionAttributeValues=expression_attribute_values)

    # flag image for feature extraction and indexing
    if image_not_indexed:
        set_product_tag(dynamodb,
                        store_product_url,
                        ProductTag.image_not_indexed,
                        image_url=new_primary_image_url)

    # flag product metadata to be updated (re-evaluate product "brand domain")
    if update_product_meta:
        set_product_tag(dynamodb, store_product_url,
                        ProductTag.update_product_meta)
Exemple #9
0
def get_store_product(dynamodb, product_url):
    product_table = dynamodb.Table(get_table_name('product'))
    store_product_url = clean_product_url(product_url)
    return product_table.get_item(Key={
        'store_product_url': store_product_url
    }).get('Item')
def migrate():
    client = boto3.client('dynamodb')

    client.create_table(
        TableName=get_table_name('product'),
        AttributeDefinitions=[
            # unique reference to a product
            # (hash of product URL for product)
            {
                'AttributeName': 'store_product_url',
                'AttributeType': 'S'
            },
            # UUID
            # - auto-generate when adding new products
            # - merge with matching "store product" UUIDs when running
            #   "mega products" pipeline
            {
                'AttributeName': 'product_uuid',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'brand_domain',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'store_domain',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'vendor_name',
                'AttributeType': 'S'
            },
            {
                # Set to 1 if product is available, otherwise do not set this key
                # (use a sparse index to reduce index size)
                'AttributeName': 'is_available',
                'AttributeType': 'N'
            },
        ],
        KeySchema=[{
            'AttributeName': 'store_product_url',
            'KeyType': 'HASH'
        }],
        GlobalSecondaryIndexes=[
            # "Mega product" queries
            {
                'IndexName':
                'product_uuid_idx',
                'KeySchema': [
                    {
                        'AttributeName': 'product_uuid',
                        'KeyType': 'HASH'
                    },
                    {
                        'AttributeName': 'is_available',
                        'KeyType': 'RANGE'
                    },
                ],
                'Projection': {
                    'ProjectionType': 'ALL',
                },
            },
            # Brand product queries
            {
                'IndexName':
                'brand_domain_idx',
                'KeySchema': [
                    {
                        'AttributeName': 'brand_domain',
                        'KeyType': 'HASH'
                    },
                    {
                        'AttributeName': 'is_available',
                        'KeyType': 'RANGE'
                    },
                ],
                'Projection': {
                    'ProjectionType': 'ALL',
                },
            },
            # Store product queries
            {
                'IndexName':
                'store_domain_idx',
                'KeySchema': [
                    {
                        'AttributeName': 'store_domain',
                        'KeyType': 'HASH'
                    },
                    {
                        'AttributeName': 'is_available',
                        'KeyType': 'RANGE'
                    },
                ],
                'Projection': {
                    'ProjectionType': 'ALL',
                },
            },
            # Look up products by store/vendor
            {
                'IndexName':
                'store_vendor_idx',
                'KeySchema': [
                    {
                        'AttributeName': 'store_domain',
                        'KeyType': 'HASH'
                    },
                    {
                        'AttributeName': 'vendor_name',
                        'KeyType': 'RANGE'
                    },
                ],
                'Projection': {
                    'ProjectionType': 'KEYS_ONLY',
                },
            },
        ],
        BillingMode='PAY_PER_REQUEST',
    )

    client.create_table(
        TableName=get_table_name('product_visual_features'),
        AttributeDefinitions=[
            {
                'AttributeName': 'store_product_url',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'image_url',
                'AttributeType': 'S'
            },
        ],
        KeySchema=[
            {
                'AttributeName': 'store_product_url',
                'KeyType': 'HASH'
            },
            {
                'AttributeName': 'image_url',
                'KeyType': 'RANGE'
            },
        ],
        BillingMode='PAY_PER_REQUEST',
    )

    client.create_table(
        TableName=get_table_name('product_tag'),
        # tags include:
        # - "image_not_indexed"
        # - "run_megaproduct_pipeline"
        AttributeDefinitions=[
            {
                'AttributeName': 'store_product_url',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'tag',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'image_not_indexed',
                'AttributeType': 'N'
            },
            {
                'AttributeName': 'update_product_meta',
                'AttributeType': 'N'
            },
        ],
        KeySchema=[
            {
                'AttributeName': 'store_product_url',
                'KeyType': 'HASH'
            },
            {
                'AttributeName': 'tag',
                'KeyType': 'RANGE'
            },
        ],
        GlobalSecondaryIndexes=[
            {
                'IndexName':
                'image_not_indexed_idx',
                'KeySchema': [
                    {
                        'AttributeName': 'store_product_url',
                        'KeyType': 'HASH'
                    },
                    {
                        'AttributeName': 'image_not_indexed',
                        'KeyType': 'RANGE'
                    },
                ],
                'Projection': {
                    'ProjectionType': 'KEYS_ONLY',
                },
            },
            {
                'IndexName':
                'update_product_meta_idx',
                'KeySchema': [
                    {
                        'AttributeName': 'store_product_url',
                        'KeyType': 'HASH'
                    },
                    {
                        'AttributeName': 'update_product_meta',
                        'KeyType': 'RANGE'
                    },
                ],
                'Projection': {
                    'ProjectionType': 'KEYS_ONLY',
                },
            },
        ],
        BillingMode='PAY_PER_REQUEST',
    )
def migrate():
    client = boto3.client('dynamodb')

    def boto_do_retry(f):
        """
        Boto Client will throw errors while resources are updating.
        Use this function to catch and retry those errors until the operation
        succeeds.
        """
        for i in range(RETRY_ATTEMPTS):
            try:
                f()
                return
            except (client.exceptions.LimitExceededException,
                    client.exceptions.ResourceInUseException):
                if i == (RETRY_ATTEMPTS - 1):
                    raise
                time.sleep(RETRY_DELAY)

    client.update_table(
        TableName=get_table_name('product_tag'),
        GlobalSecondaryIndexUpdates=[{
            'Delete': {
                'IndexName': 'update_product_meta_idx'
            }
        }],
    )
    boto_do_retry(lambda: client.update_table(
        TableName=get_table_name('product_tag'),
        AttributeDefinitions=[
            {
                'AttributeName': 'store_product_url',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'update_product_meta',
                'AttributeType': 'N'
            },
        ],
        GlobalSecondaryIndexUpdates=[
            {
                'Create': {
                    'IndexName':
                    'update_product_meta_idx',
                    'KeySchema': [
                        {
                            'AttributeName': 'store_product_url',
                            'KeyType': 'HASH'
                        },
                        {
                            'AttributeName': 'update_product_meta',
                            'KeyType': 'RANGE'
                        },
                    ],
                    'Projection': {
                        'ProjectionType': 'ALL',
                    },
                },
            },
        ],
    ))
def migrate():
    client = boto3.client('dynamodb')

    client.delete_table(TableName=get_table_name('product_visual_features'), )