def fetch_product_tags(dynamodb, store_product_urls, product_tag=None):
    """Yield every tag item stored for the given store product URLs.

    Each URL is normalised with ``clean_product_url`` and used as the
    ``store_product_url`` key condition of a query against the
    ``product_tag`` table. All result pages are followed before moving on to
    the next URL.

    Args:
        dynamodb: Resource object exposing ``Table(name)``.
        store_product_urls: Iterable of product URLs to look up.
        product_tag: Optional tag object; when given, the query is run
            against the index named ``<product_tag.name>_idx`` instead of
            the base table.

    Yields:
        The raw items returned by each query page.
    """
    table = dynamodb.Table(get_table_name('product_tag'))
    index_name = None if product_tag is None else f'{product_tag.name}_idx'

    for raw_url in store_product_urls:
        cleaned_url = clean_product_url(raw_url)

        # Arguments shared by every page request for this URL
        params = {
            'KeyConditionExpression': Key('store_product_url').eq(cleaned_url),
        }
        if index_name is not None:
            params['IndexName'] = index_name

        while True:
            page = table.query(**params)
            yield from page['Items']

            # A missing "LastEvaluatedKey" means this was the final page
            next_key = page.get('LastEvaluatedKey')
            if next_key is None:
                break
            params['ExclusiveStartKey'] = next_key
def test_clean_product_url():
    """Check which URL variations ``clean_product_url`` treats as equal."""
    canonical = 'store.com/products/fork'

    # Scheme, "www." prefix, query string and fragment must all be stripped
    equivalent_forms = (
        canonical,
        f'www.{canonical}',
        f'http://{canonical}',
        f'https://www.{canonical}',
        f'//www.{canonical}',
        f'http://{canonical}?arg=1',
        f'http://{canonical}#fragment',
    )
    for url_form in equivalent_forms:
        assert clean_product_url(url_form) == canonical

    # Any other subdomain must not be collapsed into the canonical URL
    assert clean_product_url(f'xyz.{canonical}') != canonical
def delete_product_tags(dynamodb, product_tag, store_product_urls):
    """Remove one tag from each of the given store products.

    Deletes are issued through the ``product_tag`` table's batch writer, one
    per URL, keyed on the cleaned URL and the tag's name.

    Args:
        dynamodb: Resource object exposing ``Table(name)``.
        product_tag: Tag object whose ``name`` identifies the tag to delete.
        store_product_urls: Iterable of product URLs to untag.
    """
    table = dynamodb.Table(get_table_name('product_tag'))
    with table.batch_writer() as writer:
        for raw_url in store_product_urls:
            writer.delete_item(Key={
                'store_product_url': clean_product_url(raw_url),
                'tag': product_tag.name,
            })
def set_product_tag(dynamodb, store_product_url, product_tag, **attrs):
    """Write a tag item for a store product into the ``product_tag`` table.

    The item carries the cleaned product URL, the tag name under ``tag`` and
    an attribute named after the tag itself with value ``1``. Any extra
    keyword arguments are merged in last, so they override the built-in
    attributes on a name clash.

    Args:
        dynamodb: Resource object exposing ``Table(name)``.
        store_product_url: URL of the product to tag.
        product_tag: Tag object whose ``name`` is stored.
        **attrs: Additional attributes to store on the tag item.
    """
    table = dynamodb.Table(get_table_name('product_tag'))
    cleaned_url = clean_product_url(store_product_url)
    tag_name = product_tag.name
    table.put_item(Item={
        'store_product_url': cleaned_url,
        'tag': tag_name,
        tag_name: 1,
        **attrs,
    })
def add_store_product(dynamodb, product_url, store_domain, is_available=True,
                      **attrs):
    """Insert a new store product and tag it for follow-up processing.

    The product is keyed on the cleaned form of ``product_url`` and written
    with a condition that no item with that key exists yet. After a
    successful insert the product is tagged ``update_product_meta`` and, if
    it has at least one image URL, ``image_not_indexed``.

    Args:
        dynamodb: Resource object exposing ``Table(name)``.
        product_url: Full product URL as supplied by the caller.
        store_domain: Domain of the store selling the product.
        is_available: Availability flag; stored as an integer.
        **attrs: Additional product attributes to store.

    Raises:
        ValueError: If a product with the same cleaned URL already exists.
        botocore.exceptions.ClientError: For any other failure of the write.
    """
    table = dynamodb.Table(get_table_name('product'))
    store_product_url = clean_product_url(product_url)

    record = parse_store_product_data(dict(
        store_product_url=store_product_url,
        full_store_product_url=product_url,
        store_domain=store_domain,
        # DynamoDB needs "is_available" as a number to be able to index it
        is_available=int(is_available),
        **attrs))

    # New products take their "brand domain" from the store-provided value
    # when there is one (for existing products it is set according to
    # "product_uuid" by the bulk data processing job)
    store_brand_domain = attrs.get('store_product_brand_domain')
    if store_brand_domain:
        record['brand_domain'] = store_brand_domain

    # The product is expected not to exist yet, so give it a new product ID
    record['product_uuid'] = uuid.uuid4().hex
    try:
        table.put_item(
            Item=record,
            ConditionExpression='attribute_not_exists(store_product_url)')
    except botocore.exceptions.ClientError as err:
        if err.response['Error']['Code'] == 'ConditionalCheckFailedException':
            raise ValueError(
                f'Product with url "{store_product_url}" already exists')
        raise

    image_urls = record.get('image_urls')
    primary_image_url = image_urls[0] if image_urls else None

    # A product with an image has to go through image indexing
    if primary_image_url:
        set_product_tag(dynamodb,
                        store_product_url,
                        ProductTag.image_not_indexed,
                        image_url=primary_image_url)

    # Every new product needs its metadata updated
    set_product_tag(dynamodb, store_product_url,
                    ProductTag.update_product_meta)
def update_store_product(dynamodb, product_url, **attrs):
    """Update attributes of an existing store product.

    The product is looked up by the cleaned form of ``product_url``. The
    given attributes (plus ``full_store_product_url``) are passed through
    ``parse_store_product_data(..., new_item=False)`` and written with a
    single ``SET`` update. Afterwards the product is tagged:

    * ``image_not_indexed`` when a primary image is supplied and it differs
      from the stored primary image (compared via ``clean_product_url``), or
      when the stored product had no image at all;
    * ``update_product_meta`` when ``store_product_brand_domain`` differs
      between the stored item and the supplied attributes.

    Args:
        dynamodb: Resource object exposing ``Table(name)``.
        product_url: Full product URL as supplied by the caller.
        **attrs: Attributes to set on the product.

    Raises:
        ValueError: If no product with the cleaned URL exists yet.
    """
    product_table = dynamodb.Table(get_table_name('product'))
    store_product_url = clean_product_url(product_url)
    item_data = dict(full_store_product_url=product_url, **attrs)

    old_item_data = product_table.get_item(
        Key={'store_product_url': store_product_url}).get('Item')
    if not old_item_data:
        raise ValueError(
            f'Product with url "{store_product_url}" does not yet exist')

    old_primary_image_url = None
    if old_item_data.get('image_urls'):
        old_primary_image_url = old_item_data['image_urls'][0]

    new_primary_image_url = None
    if item_data.get('image_urls'):
        new_primary_image_url = item_data['image_urls'][0]

    image_not_indexed = False
    if new_primary_image_url is not None:
        if old_primary_image_url is None:
            # The product had no image before, so the new one always needs
            # indexing; never hand None to clean_product_url.
            image_not_indexed = True
        else:
            # Assume query string does not affect image contents and compare
            # image URLs without query string component
            image_not_indexed = (
                clean_product_url(new_primary_image_url) !=
                clean_product_url(old_primary_image_url))

    # NOTE(review): when "store_product_brand_domain" is not passed in, the
    # right-hand side is None, so a product with a stored value is flagged
    # even though the update leaves that value untouched - confirm this is
    # intended.
    update_product_meta = (
        old_item_data.get('store_product_brand_domain') !=
        item_data.get('store_product_brand_domain'))

    item_data = parse_store_product_data(item_data, new_item=False)

    # Refer to attributes through placeholders instead of interpolating their
    # names: DynamoDB rejects expressions that use a reserved word (e.g.
    # "name", "status", "url") or special characters as a bare attribute
    # name.
    attribute_names = {}
    attribute_values = {}
    assignments = []
    for position, (attr, value) in enumerate(item_data.items()):
        name_placeholder = f'#n{position}'
        value_placeholder = f':v{position}'
        attribute_names[name_placeholder] = attr
        attribute_values[value_placeholder] = value
        assignments.append(f'{name_placeholder} = {value_placeholder}')

    product_table.update_item(
        Key={'store_product_url': store_product_url},
        UpdateExpression='SET {}'.format(', '.join(assignments)),
        ExpressionAttributeNames=attribute_names,
        ExpressionAttributeValues=attribute_values)

    # flag image for feature extraction and indexing
    if image_not_indexed:
        set_product_tag(dynamodb,
                        store_product_url,
                        ProductTag.image_not_indexed,
                        image_url=new_primary_image_url)

    # flag product metadata to be updated (re-evaluate product "brand domain")
    if update_product_meta:
        set_product_tag(dynamodb, store_product_url,
                        ProductTag.update_product_meta)
def get_store_product(dynamodb, product_url):
    """Fetch the stored record for a product URL.

    Args:
        dynamodb: Resource object exposing ``Table(name)``.
        product_url: Product URL; cleaned before being used as the key.

    Returns:
        The value under ``'Item'`` in the ``get_item`` response, or ``None``
        when the response has no such entry.
    """
    table = dynamodb.Table(get_table_name('product'))
    lookup_key = {'store_product_url': clean_product_url(product_url)}
    response = table.get_item(Key=lookup_key)
    return response.get('Item')