Example #1
    def process_item(self, item, spider):

        # Check if the Product already exists
        product = (self.session.query(Product).filter_by(
            store=item["store"], sku=item["sku"]).first())

        if product is None:
            product = Product(store=item["store"], sku=item["sku"])

        product.barcodes = item["barcodes"]
        product.brand = item["brand"]
        product.name = item["name"]
        product.description = item["description"]
        product.image_url = item["image_url"]

        self.session.add(product)
        self.session.commit()

        # Check if the BranchProduct already exists
        branch_product = (self.session.query(BranchProduct).filter_by(
            product=product, branch=item["branch"]).first())

        if branch_product is None:
            branch_product = BranchProduct(product=product,
                                           branch=item["branch"])

        branch_product.stock = item["stock"]
        branch_product.price = item["price"]

        self.session.add(branch_product)
        self.session.commit()

        return item
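
The pipeline above assumes SQLAlchemy models named Product and BranchProduct and a session already bound in self.session. A minimal sketch of what that setup might look like; the field names follow the item keys used above, while the table names, column types and connection URL are assumptions:

# Sketch of the assumed SQLAlchemy setup; the real models may differ.
from sqlalchemy import create_engine, Column, Integer, String, Float, ForeignKey
from sqlalchemy.orm import declarative_base, relationship, sessionmaker

Base = declarative_base()

class Product(Base):
    __tablename__ = 'products'
    id = Column(Integer, primary_key=True)
    store = Column(String)        # store the item was scraped from
    sku = Column(String)          # store-specific product code
    barcodes = Column(String)
    brand = Column(String)
    name = Column(String)
    description = Column(String)
    image_url = Column(String)

class BranchProduct(Base):
    __tablename__ = 'branch_products'
    id = Column(Integer, primary_key=True)
    product_id = Column(Integer, ForeignKey('products.id'))
    product = relationship(Product)
    branch = Column(String)       # branch / store location identifier
    stock = Column(Integer)
    price = Column(Float)

engine = create_engine('sqlite:///products.db')  # assumed connection string
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)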
Example #2
def fetch_category(search_index, amazon_node_id):
    api = caching.ResponseCachingAPI(settings.AMAZON_AWS_KEY,
                                     settings.AMAZON_SECRET_KEY,
                                     settings.AMAZON_API_LOCALE,
                                     settings.AMAZON_ASSOCIATE_TAG,
                                     cachedir='cache',
                                     cachetime=86400)

    try:
        for root in api.item_search(
                search_index,
                BrowseNode=str(amazon_node_id),
                ResponseGroup=settings.AMAZON_RESPONSE_GROUP):

            for item in root.Items.Item:
                product = Product()
                product.category = Category.objects.get(
                    amazon_node_id=amazon_node_id)
                product.asin = item.ASIN
                product.title = unicode(item.ItemAttributes.Title)
                product.detailpageurl = unicode(item.DetailPageURL)
                product.manufacturer = unicode(
                    getattr(item.ItemAttributes, 'Manufacturer', None))
                product.publisher = unicode(
                    getattr(item.ItemAttributes, 'Publisher', None))
                product.brand = unicode(
                    getattr(item.ItemAttributes, 'Brand', None))
                product.popularity = getattr(item, 'SalesRank', 1000)
                if hasattr(item, 'MediumImage'):
                    product.medium_image = getattr(item.MediumImage, 'URL',
                                                   None)
                if hasattr(item, 'LargeImage'):
                    product.large_image = getattr(item.LargeImage, 'URL', None)
                if hasattr(item, 'EditorialReviews'):
                    product.description = unicode(
                        getattr(item.EditorialReviews.EditorialReview,
                                'Content', None))
                if hasattr(item.Offers, 'Offer'):
                    product.price = item.Offers.Offer.OfferListing.Price.FormattedPrice.pyval
                elif hasattr(item.ItemAttributes, 'ListPrice'):
                    product.price = item.ItemAttributes.ListPrice.FormattedPrice.pyval
                elif hasattr(item.OfferSummary, 'LowestUsedPrice'):
                    product.price = u'used from %s' % item.OfferSummary.LowestUsedPrice.FormattedPrice.pyval
                else:
                    product.price = None
                product.save()

    except AWSError, e:
        if e.code == 'AWS.ParameterOutOfRange':
            pass  # reached the api limit of 10 pages
        else:
            raise ValidationError(message=e.msg)
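
fetch_category reads its credentials and search options from a Django-style settings module. A sketch of the names it expects, with placeholder values only (the real keys are secrets and are not part of the example):

# settings.py (placeholder values, not real credentials)
AMAZON_AWS_KEY = 'YOUR-AWS-ACCESS-KEY'
AMAZON_SECRET_KEY = 'YOUR-AWS-SECRET-KEY'
AMAZON_API_LOCALE = 'us'
AMAZON_ASSOCIATE_TAG = 'yourtag-20'
AMAZON_RESPONSE_GROUP = 'Large'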
Example #3
def fetch_category(search_index, amazon_node_id):
    api = caching.ResponseCachingAPI(
        settings.AMAZON_AWS_KEY,
        settings.AMAZON_SECRET_KEY,
        settings.AMAZON_API_LOCALE,
        settings.AMAZON_ASSOCIATE_TAG,
        cachedir='cache',
        cachetime=86400)

    try:
        for root in api.item_search(search_index, BrowseNode=str(amazon_node_id),
            ResponseGroup=settings.AMAZON_RESPONSE_GROUP):

            for item in root.Items.Item:
                product = Product()
                product.category = Category.objects.get(amazon_node_id=amazon_node_id)
                product.asin = item.ASIN
                product.title = unicode(item.ItemAttributes.Title)
                product.detailpageurl = unicode(item.DetailPageURL)
                product.manufacturer = unicode(getattr(item.ItemAttributes, 'Manufacturer', None))
                product.publisher = unicode(getattr(item.ItemAttributes, 'Publisher', None))
                product.brand = unicode(getattr(item.ItemAttributes, 'Brand', None))
                product.popularity = getattr(item, 'SalesRank', 1000)
                if hasattr(item, 'MediumImage'):
                    product.medium_image = getattr(item.MediumImage, 'URL', None)
                if hasattr(item, 'LargeImage'):
                    product.large_image = getattr(item.LargeImage, 'URL', None)
                if hasattr(item, 'EditorialReviews'):
                    product.description = unicode(getattr(item.EditorialReviews.EditorialReview, 'Content', None))
                if hasattr(item.Offers, 'Offer'):
                    product.price = item.Offers.Offer.OfferListing.Price.FormattedPrice.pyval
                elif hasattr(item.ItemAttributes, 'ListPrice'):
                    product.price = item.ItemAttributes.ListPrice.FormattedPrice.pyval
                elif hasattr(item.OfferSummary, 'LowestUsedPrice'):
                    product.price =  u'used from %s' % item.OfferSummary.LowestUsedPrice.FormattedPrice.pyval
                else:
                    product.price = None
                product.save()

    except AWSError, e:
        if e.code == 'AWS.ParameterOutOfRange':
            pass # reached the api limit of 10 pages
        else:
            raise ValidationError(message=e.msg)
Example #4
def products_to_db(products):
    """
    It saves the products in the database
    :param products: dictionary with the desired information
    """
    session = load_session()
    for key, item in products.items():
        print('\n>>> Processing:', key, item['NAME'])
        product = (session.query(Product).filter_by(store="Richart's",
                                                    sku=item["SKU"]).first())

        if product is None:
            product = Product(store="Richart's", sku=item["SKU"])

        product.barcodes = item["BARCODES"]
        product.brand = item["BRAND"].capitalize()
        product.name = item["NAME"].capitalize()
        description = remove_html_tags(item["DESCRIPTION"])
        product.description = description.capitalize()
        product.image_url = item["IMAGE_URL"]
        product.category = item["FULL_CATEGORY"]
        product.package = product.description.replace(product.name, '')

        session.add(product)
        session.commit()

        # Check if the BranchProduct already exists
        branch_product = (session.query(BranchProduct).filter_by(
            product=product, branch=item["BRANCH"]).first())

        if branch_product is None:
            branch_product = BranchProduct(product=product,
                                           branch=item["BRANCH"])

        branch_product.stock = item["STOCK"]
        branch_product.price = item["PRICE"]

        session.add(branch_product)
        session.commit()

    session.close()
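
products_to_db depends on load_session and remove_html_tags, which are not shown. A minimal sketch of what those helpers might look like, assuming a plain regex strip and a session factory bound to an assumed engine:

import re
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///products.db')  # assumed connection string

def remove_html_tags(text):
    # Strip HTML tags from a scraped description (regex-based sketch).
    return re.sub(r'<[^>]+>', '', text or '')

def load_session():
    # Return a new SQLAlchemy session bound to the engine above.
    return sessionmaker(bind=engine)()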
Example #5
    def saveProduct(self, product):
        try:
            record = Product()
            if Product.objects.filter(title=str(product['title'])).exists():
                record = Product.objects.filter(title=str(product['title']))[0]
            record.title = str(product['title'])
            record.description = str(product['description'])
            record.current_price = formatPrice(
                product['price']['current_price'] if 'current_price' in
                product['price'] else 0)
            record.old_price = formatPrice(
                product['price']['old_price'] if 'old_price' in
                product['price'] else 0)
            record.you_save = formatPrice(
                product['price']['you_save'] if 'you_save' in
                product['price'] else 0)
            record.url = str(product['url'])
            record.images = product['images']
            if product['connections']:
                record.connection_value = str(product['connections'])

            record.manufacturer_en = str(product['manufacturer_en'])
            record.brand = str(product['manufacturer_en']
                               ) if product['manufacturer_en'] else str(
                                   product['seller']['name'])
            record.tags = product['tags']
            record.other_specs = str(product['specs'])
            record.original_json = json.dumps(product, ensure_ascii=False)

            record.save()
            return record
        except Exception as e:

            print('Error while saving product {}, cause: {}'.format(
                product['title'], str(e)))
            return None
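
saveProduct relies on a formatPrice helper that is not included with the example. A minimal sketch, assuming prices arrive as strings such as '$1,299.00' or as numbers and should fall back to 0 on failure:

def formatPrice(value):
    # Normalize a scraped price (string or number) to a float; 0.0 on failure.
    try:
        return float(str(value).replace('$', '').replace(',', '').strip())
    except (ValueError, TypeError):
        return 0.0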
Example #6
def process_item(item):

    Session = sessionmaker(bind=engine)
    session = Session()
    # Check if the Product already exists
    product = (session.query(Product).filter_by(store=item["store"],
                                                sku=item["sku"]).first())

    if product is None:
        product = Product(store=item["store"], sku=item["sku"])

    product.barcodes = item["barcodes"]
    product.brand = item["brand"]
    product.name = item["name"]
    product.description = item["description"]
    product.image_url = item["image_url"]
    product.category = item["category"]
    product.package = item["package"]

    session.add(product)
    session.commit()

    # Check if the BranchProduct already exists
    branch_product = (session.query(BranchProduct).filter_by(
        product=product, branch=item["branch"]).first())

    if branch_product is None:
        branch_product = BranchProduct(product=product, branch=item["branch"])

    branch_product.stock = item["stock"]
    branch_product.price = item["price"]

    session.add(branch_product)
    session.commit()

    return item
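
A short usage sketch for the standalone process_item above; the keys mirror the ones read inside the function, and the values are illustrative only:

sample_item = {
    "store": "Richart's",
    "sku": "123456",
    "barcodes": "7800000000001",
    "brand": "Acme",
    "name": "Whole milk 1 L",
    "description": "Whole milk, 1 litre carton",
    "image_url": "https://example.com/img/123456.jpg",
    "category": "Dairy",
    "package": "1 L",
    "branch": "Downtown",
    "stock": 12,
    "price": 1990,
}
process_item(sample_item)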
Example #7
    def crawl_listing(self, url, ctx='', **kwargs):
        res = requests.get(url)
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
            return

        product_nodes = tree.cssselect('div#searchResults a')
        for product_node in product_nodes:
            price = None; listprice = None
            price = product_node.cssselect('.price-6pm')[0].text
            listprice_node = product_node.cssselect('.discount')
            listprice = ''.join(listprice_node[0].xpath('text()')) if listprice_node else None

            # eliminate products with no discount
            if price is None or listprice is None:
                # common_failed.send(sender=ctx, url=url, \
                #     reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
                continue

            key = product_node.get('data-product-id')
            if not key:
                common_failed.send(sender=ctx, url=url, reason='listing product has no key')
                continue

            combine_url = product_node.get('href')
            key = '%s_%s' % (key, combine_url.split('/')[-1])
            match = re.search(r'https?://.+', combine_url)
            if not match:
                combine_url = '%s%s' % (HOST, combine_url)

            brand = product_node.cssselect('.brandName')[0].text.strip()
            title = product_node.cssselect('.productName')[0].text.strip()

            is_new = False; is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if title and title != product.title:
                product.title = title
                is_updated = True

            if brand and brand != product.brand:
                product.brand = brand
                is_updated = True

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            if category.cats and set(category.cats).difference(product.dept):
                product.dept = list(set(category.cats) | set(product.dept or []))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()
            
            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            selected = Picker(site='6pm').pick(product)
            if not selected:
                continue

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
                is_new=is_new, is_updated=((not is_new) and is_updated) )


            print product.key
            print product.brand
            print product.title
            print product.price, ' / ', product.listprice
            print product.combine_url
            print product.dept
            print

        # Go to the next page to keep on crawling.
        next_page = None
        page_node = tree.cssselect('div.pagination')
        if not page_node:
            return

        last_node = page_node[0].cssselect('.last')
        if last_node:
            next_page = page_node[0].cssselect('a')[-1].get('href')

        if next_page:
            match = re.search(r'https?://.+', next_page)
            if not match:
                next_page = '%s%s' % (HOST, next_page)
            print next_page
            self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
Example #8
    def crawl_product(self, url, casin, ctx='', **kwargs):
        r = req.get(url)
        data = re.compile(r'parse_asin_\w+\((.*)\);$').search(r.text).group(1)
        data = json.loads(data)

        image_urls = []
        for i in data['detailJSON']['main']['altviews']:
            if i['zoomImage'] not in image_urls:
                image_urls.append(i['zoomImage'])

        if not image_urls:
            for i in data['detailJSON']['asins']:
                if i['asin'] == casin:
                    for j in i['altviews']:
                        if j['zoomImage'] not in image_urls:
                            image_urls.append(j['zoomImage'])
                    break

        asin = data['detailJSON']['asin']
        summary = data['productDescription']['shortProdDesc']
        if data['productDescription']['bullets']:
            list_info = [i.replace('&quot;', '"').replace('&#039;', "'") for i in data['productDescription']['bullets'][0]['bulletsList']]
        else:
            list_info = []
        brand = data['detailJSON']['brand']
        returned = data['detailJSON']['returnPolicy']
#        if 'intlShippable' in data['detailJSON']:
#            shipping = 'international shipping' if data['detailJSON']['intlShippable'] == 1 else 'no international shipping'
#        elif 'choices' in data['detailJSON']:
#            for i in data['detailJSON']['choices']:
#                if i['asin'] == casin:
#                    shipping = 'international shipping' if i['intlShippable'] == 1 else 'no international shipping'
#                    break
#        shipping = shipping if shipping else ''

        video = ''
        for p in data['detailJSON']['asins']:
            if p['asin'] == casin:
                video = p['videos'][0]['url'] if p['videos'] else ''
                break

        is_new, is_updated = False, False
        product = Product.objects(key=casin).first()
        if not product:
            is_new = True
            product = Product(key=casin)
        product.summary = summary
        product.list_info = list_info
        product.brand = brand
        product.shipping = 'FAST, FREE SHIPPING, FREE RETURN SHIPPING in the U.S.'
        product.returned = returned
        product.video = video
        product.image_urls = image_urls
        product.full_update_time = datetime.utcnow()

        if product.updated == False:
            product.updated = True
            ready = True
        else:
            ready = False
        product.save()
        common_saved.send(sender=ctx, obj_type='Product', key=casin, url=url, is_new=is_new, is_updated=is_updated, ready=ready)
Example #9
def update_product(self, product_id):
    # -------------------------
    # Update data of product
    # -------------------------
    if request.form.get('_method') != 'PUT':
        app.logger.info(
            'Cannot perform this action. Please contact administrator')
        abort(405)

    product = Product(id=product_id)

    try:
        product = product.list_one_or_none_product()

        if product is None:
            app.logger.info(
                f'No data with Product ID = {product_id} could be found!')
            abort(422)

        product.id = product_id
        product.name = request.form.get('name', product.name)
        product.price_per_cost_unit = request.form.get(
            'price_per_cost_unit', product.price_per_cost_unit)
        product.cost_unit = request.form.get('cost_unit', product.cost_unit)
        product.quantity_in_stock = request.form.get(
            'quantity_in_stock', product.quantity_in_stock)
        product.brand = request.form.get('brand', product.brand)

        product.production_date = request.form.get(
            'production_date', product.production_date)

        product.best_before_date = request.form.get(
            'best_before_date', product.best_before_date)

        product.plu = request.form.get('plu', product.plu)
        product.upc = request.form.get('upc', product.upc)
        form_organic = request.form.get('organic', 'off')

        product.organic = 0

        if form_organic == 'on':
            product.organic = 1

        product.cut = request.form.get('cut', product.cut)
        product.animal = request.form.get('animal', product.animal)

        department = request.form.get('department_name')
        product.department_id = department.split(' - ', 2)[0]

        # Need to update aisle_number in AisleContains table as well
        aisle = request.form.get('aisle_name')

        if aisle is not None:
            aisle_number = int(aisle.split(' - ', 2)[0])
            aisle_contains = AisleContains(
                aisle_number=aisle_number,
                product_id=product_id
            )

            aisle_contains = \
                aisle_contains.list_one_or_none_aisle_contains(product)

            if aisle_contains is not None:
                aisle_contains.aisle_number = aisle_number
            else:
                aisle_contains = AisleContains(
                    aisle_number=aisle_number,
                    product_id=product_id
                )

                # If the product is associated with any aisle, this code
                # should never have been reached. The other option here
                # would be to add the association to the AisleContains
                # table.
                try:
                    aisle_contains = \
                        aisle_contains.add_aisle_contains_to_database()
                except BaseException:
                    app.logger.info(
                        f'An error occurred. Product {product_id} failed to be '
                        f'associated with Aisle {aisle_number}.')
                    abort(422)

        try:
            product.update_product_in_database()
            flash(
                f'Product {product_id} was successfully updated!',
                'success')
        except BaseException:
            app.logger.info(
                f'An error occurred. Product {product_id} '
                f'could not be updated!')
            abort(422)
    except BaseException:
        app.logger.info(
            f'An error occurred. No data with Product ID '
            f'= {product_id} could be found!')
        abort(422)

    return redirect('/products')
Example #10
    def crawl_listing(self, url, ctx='', **kwargs):
        res = requests.get(url)
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = kwargs['category'] if kwargs.get('category') else Category.objects(key=kwargs.get('key')).first()
        if not category:
            common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
            return
        
        product_nodes = tree.cssselect('div#atg_store_prodList ul li')
        for product_node in product_nodes:
            info_node = product_node.cssselect('div.thumbnailInfo')[0]

            price = None; listprice = None
            price_node = info_node.cssselect('div.our_price')[0]
            weekly_price_node = price_node.cssselect('.newPrice_value')
            sale_price_node = price_node.cssselect('#salePrice')
            if weekly_price_node:
                price = weekly_price_node[0].text.strip()
            elif sale_price_node:
                price = sale_price_node[0].text.strip()
            else:
                price = ''.join(price_node.xpath('.//text()')).strip()
            listprice = info_node.cssselect('div.retail_price')[0].text.strip()
            listprice = re.sub('\n', '', listprice)

            # eliminate products with no discount
            if price is None or listprice is None:
                # common_failed.send(sender=ctx, url=url, \
                #     reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
                continue

            key = info_node.cssselect('div.product_id')[0].text.strip()
            brand = info_node.cssselect('a.sameBrandProduct')[0].text.strip()
            title_node = info_node.cssselect('a.product_gender_name')[0]
            # title = title_node.get('title')
            combine_url = title_node.get('href')
            match = re.search(r'https?://.+', combine_url)
            if not match:
                combine_url = '%s%s' % (HOST, combine_url)

            #
            is_new = False; is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if brand and brand != product.brand:
                product.brand = brand
                is_updated = True

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            # if category.cats and set(category.cats).difference(product.dept):
            #     product.dept = list(set(category.cats) | set(product.dept or []))
            #     is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            try:
                selected = Picker(site='ashford').pick(product) if product.updated \
                    else self.crawl_detail(ctx, is_new, is_updated, product)
            except:
                common_failed.send(sender=ctx, url=product.combine_url, reason=traceback.format_exc())
                continue
            if not selected:
                continue

            if is_updated:
                product.list_update_time = datetime.utcnow()

            product.hit_time = datetime.utcnow()
            product.save()
            
            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
                is_new=is_new, is_updated=((not is_new) and is_updated), ready=(product.ready if hasattr(product, 'ready') else False))
Example #11
    def crawl_listing(self, url, ctx='', **kwargs):
        res = requests.get(url, params={'Ns': 'P_sale_flag|1'})
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            print 'Category does not exist'
            common_failed.send(sender=ctx, url=url, reason='Category does not exist -> {0} .'.format(kwargs))
            return

        product_nodes = tree.cssselect('div#product-container div')
        no_discount_num = 0  # sometimes a non-discounted product appears between the discounted ones ordered by sale

        for product_node in product_nodes:
            if not product_node.get('id') or 'product' not in product_node.get('id').lower():
                continue

            key = product_node.get('id')
            info_node = product_node.cssselect('div.product-text a')[0]
            price = None; listprice = None
            listprice_node = info_node.cssselect('span.product-price')
            price_node = info_node.cssselect('span.product-sale-price')
            if listprice_node:
                listprice = ''.join(listprice_node[0].xpath('.//text()')).strip()
            if price_node:
                price = ''.join(price_node[0].xpath('.//text()')).strip()

            if price is None or listprice is None:
                no_discount_num += 1
                if no_discount_num < 3:
                    continue
                return
            no_discount_num = 0

            brand = info_node.cssselect('p span.product-designer-name')[0].text
            if brand:
                brand = brand.strip()
            title = info_node.cssselect('p.product-description')[0].text.strip()
            combine_url = info_node.get('href')

            is_new = False; is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if title and title != product.title:
                product.title = title
                is_updated = True
                product.update_history['title'] = datetime.utcnow()

            if brand and brand != product.brand:
                product.brand = brand
                is_updated = True

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True
                product.update_history['combine_url'] = datetime.utcnow()

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            if category.cats and set(category.cats).difference(product.dept):
                product.dept = list(set(category.cats) | set(product.dept or []))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()
            
            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            selected = Picker(site='saksfifthavenue').pick(product)
            if not selected:
                continue

            product.hit_time = datetime.utcnow()
            product.save()
            
            # print product.brand; print product.title; print product.combine_url; print product.listprice, ' / ', product.price; print is_new; print is_updated
            # print

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
                is_new=is_new, is_updated=((not is_new) and is_updated) )

        # Go to the next page to keep on crawling.
        next_page = None
        page_nodes = tree.cssselect('div.pagination-container ol.pa-page-number li a')
        for page_node in page_nodes:
            if page_node.get('class') == 'next':
                href = page_node.get('href')
                match = re.search(r'https?://.+', href)
                next_page = href if match else '{0}/{1}'.format(HOST, href)
                break

        if next_page:
            print next_page
            self.crawl_listing(url=next_page, ctx=ctx, **kwargs)