def parse_product(self, response):
    """Parse a product page and queue entries onto ``self.collect_products``.

    A product with options queues one entry per option tuple
    (option = (option_ids, name_suffix, price)); a product without
    options queues a single entry.  Each entry pairs the loaded Product
    item with the POST form data needed to re-request it with the
    chosen attributes.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    brand = hxs.select(
        "//*[contains(text(),'Designer:')]/text()").extract()
    # The node text looks like "Designer: <brand>".
    brand = brand[0].split(':')[1].strip() if brand else ''
    name = hxs.select(
        '//td[@class="cont_heading_td"]//h1/text()').extract()[0]
    identifier = hxs.select(
        '//input[@name="products_id"]/@value').extract()
    if identifier:
        identifier = identifier[0]
    else:
        # Fall back to the numeric id embedded in URLs like ".../foo-p-123.html".
        # Raw string + escaped dot: the previous pattern 'p-(\d+).html' let the
        # dot match any character.
        identifier = re.search(r'p-(\d+)\.html', response.url)
        if identifier:
            identifier = identifier.group(1)
        else:
            # Typo fix: "WIHTOUT" -> "WITHOUT".
            log.msg('PRODUCT WITHOUT IDENTIFIER: ' + response.url)
            return
    image_url = hxs.select('//a[@rel="fotografias"]/img/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
    category = hxs.select(
        '//td[@class="cont_heading_td"]/span/text()').extract()
    sku = hxs.select('//tr/td[contains(text(), "Ref: ")]/text()').re(
        'Ref: (.*)')
    price = hxs.select('//td[@class="preu"]/text()').extract()
    price = price[0] if price else '0'
    price = extract_price(price)
    # Loop-invariant: the out-of-stock marker does not depend on the option,
    # so evaluate the XPath once instead of once per option.
    out_of_stock = hxs.select(
        '//form[contains(@id, "cart_quantity_")]/img[contains(@alt, "OUT_STOCK")]'
    )
    options = self.get_options(response, price)
    if options:
        for option in options:
            loader = ProductLoader(item=Product(), response=response)
            # Strip the osCsid session id so URLs stay stable across sessions.
            loader.add_value('url', response.url.split('?osCsid=')[0])
            loader.add_value('name', name + option[1])
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier + '-'.join(option[0]))
            loader.add_value('category', category)
            loader.add_value('sku', sku)
            loader.add_value('price', option[2])
            if out_of_stock:
                loader.add_value('stock', 0)
            formdata = {'products_id': identifier}
            # Map each selected option id to its attribute_<id> form field.
            for option_id in option[0]:
                attr_id = hxs.select('//select[option[@value="' + option_id
                                     + '"]]/@id').re(r'(\d+)')[0]
                formdata['attribute_' + attr_id] = option_id
            product = {'product': loader.load_item(), 'formdata': formdata}
            self.collect_products.append(product)
    else:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('identifier', identifier)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('price', price)
        if out_of_stock:
            loader.add_value('stock', 0)
        formdata = {'products_id': identifier}
        product = {'product': loader.load_item(), 'formdata': formdata}
        self.collect_products.append(product)
def parse_category_products(self, response):
    """Record the product links found on a category page, keyed by category."""
    selector = HtmlXPathSelector(response)
    links_xpath = ('//div[contains(@class, "list-product")]'
                   '/a[contains(@class, "link")]/@href')
    product_links = selector.select(links_xpath).extract()
    self.category_products[response.meta['category']] = product_links
def parse(self, response):
    """Parse a proxy-list page, de-obfuscating the IP address column.

    The site hides parts of the IP address with CSS: an inline <style>
    block marks some class names display:none and others display:inline,
    and the address is split across <span>/<div>/text fragments.  Only
    the visible fragments are joined into the real IP.
    Yields one item per table row (plus anything the parent parse emits).
    """
    self.log('No item received for %s' % response.url)
    # Delegate to the base spider's parse first and forward its output.
    for elem in super(HideMyAssSpider, self).parse(response):
        yield elem
    hxs = HtmlXPathSelector(response)
    links = hxs.select('//tr[@class="altshade"]')
    for link in links:
        ipaddress_parts = link.select('td[2]/span')
        # The per-row <style> block defines which class names are hidden.
        style_text = ipaddress_parts.select('style/text()').extract()
        style_text = style_text[0].split('\n')
        # Class names between the leading '.' and the '{' of each CSS rule.
        display_none = [style[1:style.index('{')] for style in style_text
                        if 'none' in style]
        display_inline = [style[1:style.index('{')] for style in style_text
                          if 'inline' in style]
        display_none = set(display_none)
        display_inline = set(display_inline)
        ipaddress = []
        # Walk every fragment (element or bare text) of the IP cell.
        for ipaddress_part in ipaddress_parts.select('span|div|text()'):
            tag_class = tag_style = tag_name = None
            try:
                tag_class = ipaddress_part.select('@class').extract()
            except TypeError:
                # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                pass
            try:
                tag_style = ipaddress_part.select('@style').extract()
            except TypeError:
                # Workaround bug in lxml.etree (same as above) for text nodes.
                pass
            try:
                tag_name = ipaddress_part.select("name()")
            except TypeError:
                # Workaround bug in lxml.etree (same as above) for text nodes.
                pass
            # Element nodes expose text() children; bare text nodes are
            # extracted directly.
            if tag_name:
                tag_text = ipaddress_part.select('text()').extract()
            else:
                tag_text = ipaddress_part.extract()
            # Skip fragments hidden via inline style or a display:none class.
            if tag_style and 'none' in tag_style[0]:
                continue
            if tag_class and tag_class[0] in display_none:
                continue
            if isinstance(tag_text, list):
                tag_text = ''.join(tag_text)
            # Keep only the numeric octet pieces of the visible text.
            tag_texts = tag_text.split('.')
            for tag_text in tag_texts:
                tag_text = tag_text.strip()
                if not tag_text.isdigit():
                    continue
                ipaddress.append(tag_text)
        ipaddress = '.'.join(ipaddress)
        loader = WebsiteLoader(selector=link)
        loader.add_value('ipaddress', ipaddress)
        loader.add_xpath('port', 'td[3]/text()')
        loader.add_xpath('country', 'td[4]/span/text()')
        loader.add_xpath('_type', 'td[7]/text()')
        loader.add_xpath('anonimity', 'td[8]/text()')
        loader.add_value('url', response.url)
        item = loader.load_item()
        yield item
def parse_product(self, response):
    """Parse an Amazon product page and emit it only if it matches the CSV row.

    The page is accepted when its model number matches the CSV sku (in
    either direction via self.match_skus), at least one CSV name token
    appears in the page title, and the price passes valid_price.
    Otherwise the spider advances through the queues carried in
    response.meta: next search results, then the next results page,
    then the next search URL template.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # XPath found no image; retry with BeautifulSoup, which tolerates
        # the malformed markup on some product pages.
        soup = BeautifulSoup(response.body)
        image_url = soup.find(lambda tag: tag.name == u'img' and tag.
                              findParent(u'tr', id=u'prodImageContainer'))
        if image_url:
            loader.add_value('image_url', image_url.get(u'src'))
    loader.add_xpath(
        'brand',
        u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()'
    )
    # Price appears under several different layouts; try them in order.
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    sku = hxs.select(
        u'//li/b[contains(text(),"Item model number")]/../text()').extract(
        )
    if sku:
        sku = sku[0].strip()
    else:
        log.msg('No sku.')
    csv_sku = response.meta['sku'].strip()
    log.msg('SKU: [%s == %s]' % (sku.lower() if sku else u'None', csv_sku))
    csv_name = response.meta['name'].lower().split(u' ')
    site_name = loader.get_output_value('name').lower().split(u' ')
    log.msg(u'NAME: [%s == %s]' % (csv_name, site_name))
    # Loose name match: any CSV name token present in the page title.
    name_match = any(map(lambda elem: elem in site_name, csv_name))
    if sku and (self.match_skus(sku, csv_sku) or self.match_skus(csv_sku,
                                                                 sku)) and name_match:
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'].lower())
            # if loader.get_output_value('price'):
            yield loader.load_item()
    else:
        # No match: pop the next candidate from the queues in meta.
        meta = response.meta
        next_result = meta['next_results']
        if next_result:
            next_result = next_result[0]
            meta['next_results'] = meta['next_results'][1:]
            yield Request(next_result, callback=self.parse_product,
                          meta=response.meta)
        elif meta.get('next_page'):
            # NOTE(review): no explicit callback here, so this falls back to
            # the spider's default parse — confirm that is intended.
            next_page = meta['next_page']
            yield Request(next_page, meta=response.meta)
        elif meta.get('search_urls'):
            meta = response.meta
            search_url = meta['search_urls'][0]
            meta['search_urls'] = meta['search_urls'][1:]
            yield Request(search_url % {'q': meta['sku']}, meta=meta)
def parse_question_page(self, response):
    """Parse a Yahoo Answers question page.

    Yields one YahooAnswer item for the best answer (when present), one
    per regular answer, and finally the YahooQuestion item itself.
    The XPath roots (self.question_xpath, self.answer_xpath, etc.) are
    spider attributes defined elsewhere.
    """
    hxs = HtmlXPathSelector(response)
    question_loader = XPathItemLoader(item=YahooQuestion(), selector=hxs)
    answers_loader = XPathItemLoader(item=YahooAnswer(), selector=hxs)
    # get question id from the qid query parameter of the request URL
    question_loader.add_value('question_id', ''.join(
        parse_qs(urlparse(response.request.url).query)['qid']))
    # print question_loader.get_output_value('question_id')
    # get question title
    question_loader.add_xpath('question_title', self.question_xpath +
                              '//h1[contains(@class, "subject")]/text()')
    # get question content
    question_loader.add_xpath('question_content', self.question_xpath +
                              '//div[contains(@class, "content")]/text()')
    # get question status
    question_loader.add_xpath('status', self.question_xpath +
                              '//div[@class="hd"]//h2/text()')
    # get question url (rebuilt canonically from the question id)
    question_loader.add_value('question_url', ''.join([
        'http://answers.yahoo.com/question/index?qid=',
        question_loader.get_output_value('question_id')
    ]))
    # get question date
    question_loader.add_xpath('asking_date', ''.join([
        self.question_xpath,
        '//div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title'
    ]))
    # import date (timestamp of this crawl, local time)
    question_loader.add_value('import_date',
                              time.strftime("%Y-%m-%d %A %X %Z",
                                            time.localtime()))
    # asking user
    question_loader.add_value('asker', self.get_user(hxs.select(''.join([
        self.question_xpath,
    ]))))
    # interesting ("star") marks
    question_loader.add_xpath('number_of_interesting_marks', ''.join([
        '//ul[@id="yan-question-tools"]',
        '//li[@id="yan-starthis"]',
        '//span[contains(@class,"star-count")]/text()'
    ]))
    # number of answers
    question_loader.add_xpath('number_of_answers', ''.join([
        self.answer_xpath,
        '/div[@class="hd"]',
        '/h3/text()'
    ]))
    # begin to parse answers
    # category of the question item
    question_loader.add_xpath('category', ''.join([self.category_xpath,
                                                   '//li//a//text()']))
    # best answer
    best_answer_selector = hxs.select(self.best_answer_xpath)
    if best_answer_selector:
        yield self.get_answer(best_answer_selector, question_loader)
    # other answers
    for ans_selector in hxs.select(self.answer_xpath).select(
            './/li/div[@class="answer"]'):
        # self.get_answer(ans_selector, question_loader)
        yield self.get_answer(ans_selector, question_loader)
    yield question_loader.load_item()
def parse_detail(self, response):
    """Parse a freight-line detail page into a WuTongCarLineItem.

    Every field falls back to '' when the corresponding table cell is
    missing or unparseable, so a partially filled page never raises.
    Returns a single-element list of items.
    """
    items = []
    item = WuTongCarLineItem()
    hxs = HtmlXPathSelector(response)
    item['url'] = response.url
    #item['ruku_time'] = int(time.time())
    company_name = hxs.select(
        './/*[@id="line_info"]/table/tr[1]/td/text()').extract()
    item['company_name'] = company_name[0].strip() if company_name else ''
    contact_name = hxs.select(
        './/*[@id="line_info"]/table/tr[4]/td/text()').extract()
    item['contact_name'] = contact_name[0].strip() if contact_name else ''
    # Route cell looks like "<origin>---><destination>".
    # Bug fix: previously .extract()[0] was called unguarded (IndexError on
    # a missing cell) and m.group() unguarded (AttributeError on no match).
    from_to = hxs.select(
        './/*[@id="line_info"]/table/tr[2]/td[1]/text()').extract()
    m = re.match(r'(.*)--->(.*)', from_to[0].strip()) if from_to else None
    if m:
        item['start_place'] = m.group(1)
        item['to_place'] = m.group(2)
    else:
        item['start_place'] = ''
        item['to_place'] = ''
    tel = hxs.select(
        './/*[@id="line_info"]/table/tr[5]/td[1]/text()').extract()
    item['tel'] = tel[0].strip() if tel else ''
    phone_contact = hxs.select(
        './/*[@id="line_info"]/table/tr[5]/td[2]/text()').extract()
    if phone_contact:
        # Cell looks like "<Chinese label>:<number>".
        m = re.match(u"([\u4e00-\u9fa5]+):(.*)",
                     phone_contact[0].decode("utf-8"))
        # Guard: a cell that does not match the pattern yields '' instead
        # of raising AttributeError on m.group.
        item['phone_contact'] = m.group(2) if m else ''
    else:
        item['phone_contact'] = ''
    addr = hxs.select(
        './/*[@id="line_info"]/table/tr[6]/td/text()').extract()
    item['addr'] = addr[0].strip() if addr else ''
    trans_type = hxs.select(
        './/*[@id="line_info"]/table/tr[3]/td/text()').extract()
    item['trans_type'] = trans_type[0].strip() if trans_type else ''
    remark = hxs.select(
        './/*[@id="line_info"]/table/tr[7]/td//text()').extract()
    item['remark'] = "".join(remark).strip() if remark else ''
    item['specia_lines'] = 1
    items.append(item)
    return items
def parse(self, response):
    """Yield a page request for every top-level nav category with sub-menus."""
    nav_xpath = '//div[@class="nav"]/ul/li[count(div)>0]/a/@href'
    for category_url in HtmlXPathSelector(response).select(nav_xpath).extract():
        yield Request(category_url, callback=self.parse_page)
def parse_product(self, response):
    """Parse a northseaworkwear.com product page.

    First forces the tax-excluded price view (re-requesting the page if
    the current customerTaxType is not "Excl"), then loads the base
    product and issues one add-to-cart FormRequest per attribute
    combination so parse_stock can determine availability.
    """
    # inspect_response(response, self)
    # return
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Ensure we are on the tax-excluded view; otherwise switch and re-parse.
    if not hxs.select('//select[@id="customerTaxType"]/option[@selected="selected"]').re('Excl'):
        url = hxs.select('//select[@id="customerTaxType"]/option[not (@selected)]/@value').extract()
        yield Request(urljoin(base_url, url[0]), callback=self.parse_product,
                      dont_filter=True, meta=response.meta)
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('category', response.meta['category'])
    name = ''
    tmp = hxs.select('//h1[@itemprop="name"]/text()').extract()
    if tmp:
        name = tmp[0].strip()
    loader.add_value('name', name)
    tmp = hxs.select('//div[@class="gallery"]//a[1]/@href').extract()
    if tmp:
        loader.add_value('image_url', tmp[0])
    # Find brand by matching known brand names against the product name.
    for brand in self.brands:
        if brand.lower() in name.lower():
            loader.add_value('brand', brand)
            break
    # p = loader.load_item()
    tmp = hxs.select('//input[contains(@id,"add-to-cart-button-")]/@data-productid').extract()
    if tmp:
        # identifier = product['identifier']
        loader.add_value('identifier', tmp[0])
    tmp = hxs.select('//p/span[strong="Product Code:"]/text()').extract()
    if tmp:
        loader.add_value('sku', tmp[0].strip())
    tmp = hxs.select('//span[@itemprop="price"]/text()').extract()
    if tmp:
        price = extract_price(tmp[0].strip().split()[0])
        loader.add_value('price', price)
    product = loader.load_item()
    # Add-to-cart endpoint used to probe stock in parse_stock.
    url_post = 'http://www.northseaworkwear.com/addproducttocart/details/%s/1' % product['identifier']
    qty = '1'
    tmp = hxs.select('//input[contains(@class,"qty-input")]/@value').extract()
    if tmp:
        qty = tmp[0]
    selections = hxs.select('//div[@class="attributes"]//select')
    if not selections:
        # No attribute dropdowns: probe stock for the base product only.
        # loader.add_value('stock', 0)
        # yield loader.load_item()
        formdata = {'addtocart_%s.EnteredQuantity' % product['identifier']: qty}
        # Fresh cookiejar per request so cart contents don't leak between probes.
        self.cookie_num += 1
        yield FormRequest(url_post, formdata=formdata,
                          meta={'item': product, 'cookiejar': self.cookie_num},
                          dont_filter=True, callback=self.parse_stock)
        return
    # Collect (field_name, value, label) for every non-empty option of
    # every attribute dropdown.
    attrs = []
    for sel in selections:
        attr_name = ''
        tmp = sel.select('@name').extract()
        if tmp:
            attr_name = tmp[0]
        attr_values = []
        for option in sel.select('option'):
            value = ''
            tmp = option.select('@value').extract()
            if tmp:
                value = tmp[0]
            txt = ''
            tmp = option.select('text()').extract()
            if tmp:
                txt = tmp[0].strip()
            if value != '' and value != '0':
                attr_values.append((attr_name, value, txt))
        attrs.append(attr_values)
    # print '### Selections:', attrs
    # One variant (and one stock probe) per cartesian combination of options.
    for option in itertools.product(*attrs):
        # print '### option:', o
        item = copy.deepcopy(product)
        item['name'] += ' - ' + '-'.join([attr[2] for attr in option])
        item['identifier'] += '-' + '-'.join([attr[1] for attr in option])
        # yield item
        formdata = {'addtocart_%s.EnteredQuantity' % product['identifier']: qty}
        for attr in option:
            formdata[attr[0]] = attr[1]
        # print 'formdata:', formdata
        self.cookie_num += 1
        yield FormRequest(url_post, formdata=formdata,
                          meta={'item': item, 'cookiejar': self.cookie_num},
                          dont_filter=True, callback=self.parse_stock)
def parse(self, response):
    """Crawl category links, product listings, and pagination on a listing page.

    Follows "Audio, vision & technology" nav links and several category
    containers, builds one Product per PSPProductList tile (following its
    URL to parse_product with the partially built item in meta), then
    follows pagination links.
    """
    hxs = HtmlXPathSelector(response)
    categories = response.xpath(
        '//li[div[contains(text(), "Audio, vision & technology")]]//a/@href'
    ).extract()
    for category in categories:
        yield Request(response.urljoin(category))
    categories = response.xpath(
        '//div[@id="subCategorycategories"]/ul/li/a/@href').extract()
    categories += response.xpath(
        '//li[@id="categories"]/ul/li/a/@href').extract()
    categories += response.xpath(
        '//div[@class="cat_detail"]/div/a/@href').extract()
    for category in categories:
        url = urljoin_rfc(get_base_url(response), category)
        yield Request(url)
    # products new parse method
    products = response.xpath('//div[contains(@id, "PSPProductList")]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = "".join(
            product.xpath(
                ".//div[contains(@class, 'product_name')]//text()").
            extract()).strip()
        brand = product.xpath(
            'div/a/div[@class="brand_name"]/text()').extract()[0].strip()
        url = product.xpath(".//a/@href").extract()
        url = urljoin_rfc(get_base_url(response), url[0])
        # Sku embedded in the tile's element id, e.g. "psp_<sku>".
        sku = product.xpath(".//div[contains(@id, 'psp')]/@id").re(
            "psp_(.+)")[0]
        # Prefer the "Now £..." promotional price; fall back to the
        # itemprop price.  No price at all is treated as out of stock.
        price = product.xpath(".//span[@class='price_now']/text()").re(
            u'Now\xa0\xa3(.*)')
        if not price:
            price = product.xpath(
                ".//span[@class='price-actual' and @itemprop='price']/text()"
            ).extract()
            if price:
                price = price[0]
            else:
                price = ''
                loader.add_value('stock', 0)
        category = response.xpath(
            '//div[@id="box_productSelectionPage"]/div/h1/text()').extract(
            )
        category = category[0].strip() if category else ''
        loader.add_value('name', name)
        loader.add_value('brand', brand)
        # loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_xpath('image_url', 'div//img[@class="proImg"]/@src')
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('price', price)
        item = loader.load_item()
        metadata = DemoRMeta()
        metadata['reviews'] = []
        metadata['promotion'] = ''.join(
            product.xpath(
                './/span[@class="discount_savings"]/text()').extract())
        item = loader.load_item()
        item['metadata'] = metadata
        # Follow the product page to finish the item in parse_product.
        yield Request(item['url'], meta={'item': item},
                      callback=self.parse_product)
    for page in response.xpath(
            '//div[@id="pagination"]/a/@href').extract():
        url = urljoin_rfc(get_base_url(response), page)
        yield Request(url)
def parse_product(self, response):
    """Parse a bedworld.net product page, expanding size/option variants.

    Pages with a JS ``spConfig`` block yield one item per child product
    (following an AJAX options endpoint when the child has its own
    identifier); plain pages combine any custom-option dropdowns into
    variant items.  Net price (price / 1.2, i.e. ex-VAT at 20%) is
    attached via metadata.
    """
    hxs = HtmlXPathSelector(response)
    # Follow sibling size pages; each parses as its own product.
    for url in hxs.select(
            '//a[contains(@class,"size-boxes")]/@href').extract():
        yield Request(urljoin_rfc(get_base_url(response), url),
                      callback=self.parse_product)
    product_category = hxs.select(
        '//div[contains(@class,"breadcrumbs")]/ul/li/a/text()').extract(
        )[-1].strip()
    product_name = hxs.select('//h1/text()').extract()[0]
    product_image = hxs.select('//a[@id="zoom-btn"]/@href').extract()
    if product_image:
        product_image = urljoin_rfc(get_base_url(response),
                                    product_image[0])
    product_brand = hxs.select('//img[@class="man-logo"]/@alt').extract()
    product_brand = product_brand[0] if product_brand else ''
    product_sku = hxs.select('//tr[th/text()="SKU"]/td/text()').extract()
    product_sku = product_sku[0] if product_sku else ''
    # Magento-style variant config embedded in page JavaScript.
    product_config_reg = re.search(
        'var spConfig = new Product.Config\((\{.*\})\);', response.body)
    product_identifier = hxs.select(
        '//input[@name="product"]/@value').extract()[0]
    if product_config_reg:
        products = json.loads(product_config_reg.group(1))
        for identifier, product in products['childProducts'].items():
            product_loader = ProductLoader(item=Product(),
                                           response=response)
            if identifier:
                product_loader.add_value(
                    'identifier', product_identifier + '-' + identifier)
            else:
                product_loader.add_value('identifier', product_sku)
            product_loader.add_value('price', product[u'finalPrice'])
            # Append each attribute option label that applies to this child.
            option_name = product_name
            for attr_id, attribute in products[u'attributes'].items():
                for option in attribute['options']:
                    if identifier in option['products']:
                        option_name += ' ' + option['label']
            # Strip any parenthesised suffix from the display name.
            product_loader.add_value(
                'name', re.sub(r' \((.+?)\)', r'', option_name))
            product_loader.add_value('sku', product_sku)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', product_brand)
            product_loader.add_value('category', product_category)
            product_loader.add_value('image_url', product_image)
            if identifier:
                # Child products need an extra AJAX call for their options.
                yield Request('http://www.bedworld.net/oi/ajax/co/?id=' +
                              identifier + '&pid=' + product_identifier,
                              meta={'item': product_loader.load_item()},
                              callback=self.parse_options)
            else:
                price = product_loader.get_output_value('price')
                # Ex-VAT price at the 20% UK rate.
                net_price = price / Decimal('1.2')
                p = product_loader.load_item()
                meta_ = Meta()
                meta_['net_price'] = str(net_price)
                p['metadata'] = meta_
                yield p
    else:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', re.sub(r' \((.+?)\)', r'',
                                                product_name))
        product_loader.add_value('sku', product_sku)
        product_loader.add_value('brand', product_brand)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', product_category)
        product_loader.add_value('image_url', product_image)
        price = hxs.select('//span[@id="product-price-' +
                           product_identifier + '"]//text()').re(r'([\d.,]+)')
        price = price[0] if price else 0
        product_loader.add_value('price', price)
        # Collect every custom-option dropdown's non-empty options.
        option_elements = []
        dropdown_elements = hxs.select(
            '//select[contains(@class, "product-custom-options")]')
        for dropdown_options in dropdown_elements:
            options = []
            for dropdown_option in dropdown_options.select(
                    'option[@value!=""]'):
                option = {}
                option['identifier'] = dropdown_option.select(
                    '@value').extract()[0]
                # Drop any "+£x" price suffix from the option label.
                option['desc'] = dropdown_option.select(
                    './/text()').extract()[0].split('+')[0]
                option['price'] = dropdown_option.select(
                    '@price').extract()[0]
                options.append(option)
            option_elements.append(options)
        # Combine one option from each dropdown into a single variant,
        # concatenating descriptions/identifiers and summing prices.
        final_options = []
        if option_elements:
            combined_options = list(itertools.product(*option_elements))
            for combined_option in combined_options:
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + option['desc']
                    final_option['price'] = final_option.get(
                        'price', Decimal(0)) + extract_price(
                            option['price'])
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + '-' + option['identifier']
                final_options.append(final_option)
        if final_options:
            for opt in final_options:
                opt_product = product_loader.load_item()
                opt_product['name'] += ' ' + normalize_space(opt['desc'])
                opt_product['price'] += opt['price']
                opt_product['identifier'] += opt['identifier']
                price = Decimal(opt_product['price'])
                # Ex-VAT price at the 20% UK rate.
                net_price = price / Decimal('1.2')
                meta_ = Meta()
                meta_['net_price'] = str(net_price)
                opt_product['metadata'] = meta_
                yield opt_product
        else:
            price = product_loader.get_output_value('price')
            # Ex-VAT price at the 20% UK rate.
            net_price = price / Decimal('1.2')
            p = product_loader.load_item()
            meta_ = Meta()
            meta_['net_price'] = str(net_price)
            p['metadata'] = meta_
            yield p
def parse_detail(self, response):
    """Parse a zhaopin.com job-detail page into an Item and follow the company page.

    Fields are extracted via self.detail_xpath_dict (missing nodes become
    ""), relative publication dates are normalised to YYYY-MM-DD, MD5
    digests are attached for both URLs, and a follow-up Request is issued
    for recognised company-page hosts with host-specific headers.

    Changes vs. previous version: removed leftover debug ``print``
    statements, evaluate each field XPath once instead of twice, and
    merged the duplicated company-Request branches via a host->headers map.
    """
    item = Item()
    item["url"] = response.url
    item["url_md5sum"] = hashlib.md5(item["url"]).hexdigest()
    item["source"] = "智联招聘"
    item["sub_url"] = response.meta["sub_url"]
    response_selector = HtmlXPathSelector(response)
    # Assemble the item: fill every configured field, defaulting to "".
    for field_name in self.detail_xpath_dict:
        values = response_selector.select(
            self.detail_xpath_dict[field_name]).extract()
        item[field_name] = values[0] if values else ""
    # MD5 of the company URL.
    item["company_url_md5sum"] = hashlib.md5(item["company_url"]).hexdigest()
    # Normalise relative publication dates to absolute YYYY-MM-DD.
    pub_at = item['time']
    today = datetime.date.today()
    # "15天前" = 15 days ago, "前天" = day before yesterday, "昨天" = yesterday.
    days_ago = {'15天前': 15, '前天': 2, '昨天': 1}
    if pub_at in days_ago:
        item['time'] = (today - datetime.timedelta(
            days=days_ago[pub_at])).strftime('%Y-%m-%d')
    elif pub_at.find('刚') != -1 or pub_at.find("小时") != -1:
        # "just now" / "N hours ago" -> today.
        item['time'] = today.strftime('%Y-%m-%d')
    # Item assembled; hand it to the pipeline.
    yield item
    company_url = item['company_url']
    # Nothing to follow when there is no company link.
    if len(company_url) == 0:
        return
    host = company_url.split("/")[2]
    headers_by_host = {
        "special.zhaopin.com": SPECIAL_REQUEST_HEADERS,
        "company.zhaopin.com": COMPANY_REQUEST_HEADERS,
    }
    if host in headers_by_host:
        yield Request(url=company_url,
                      callback=self.parse_company,
                      meta={
                          "sub_url": response.url,
                          "company_name": item["company_name"]
                      },
                      headers=headers_by_host[host],
                      dont_filter=False)
def parse_anntaylor(self, response):
    """Scrape an Ann Taylor product page into a ProductModel.

    Returns (matched, product): (False, None) for invalid/duplicate
    pages, (True, product) after creating the item — but see the review
    note on the unconditional return below.
    """
    self.check_shelfit_validity(response)
    # NOTE(review): this unconditional return makes everything below
    # unreachable — the method always returns (False, None) after the
    # validity check. Looks like a temporary disable; confirm whether it
    # should be removed or made conditional.
    return (False, None)
    hxs = HtmlXPathSelector(response)
    # find name of item
    item_name_path = hxs.select('//div[@class="hd-info"]//h1/text()')
    if len(item_name_path) == 0:
        self.invalid_links += 1
        print "Invalid link: " + str(response.url)
        return (False, None)
    item_name = item_name_path.extract()[0]
    logging.critical("Name: " + str(item_name))
    self.count_scraped += 1
    ''' PLAYING NICE: sleeping for 1min after crawling every 100 pages '''
    if self.count_scraped % 100 == 0:
        print "Sleeping for 60 secs..."
        sleep(60)  # sleep for 1 mins
    # canonical product URL from the og:url meta tag
    meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
    prod_url = meta_tag_url.extract()[0]
    logging.critical("PRODUCT URL:" + str(prod_url) + " ITEM_NAME " +
                     str(item_name) + " TOTAL SO FAR " +
                     str(self.count_scraped))
    # Ann Taylor is for women only
    gender = 'F'
    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)
    if item_id_ in self.items_scraped:
        logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
        # store the category for this itemid
        print "Appending categories for product " + str(item_id_)
        categories_path = hxs.select(
            '//div[@id="cat-pro-pagnation"]//a/text()').extract()
        num_categories = len(categories_path)
        categories = []
        for i in range(0, num_categories):
            category = str(categories_path[i]).strip('\n').strip()
            categories.append(category)
            logging.critical("Categories: " + category)
        product = ProductModel.objects.filter(idx=item_id_).filter(
            insert_date=insert_date)
        self._create_category(product, categories)
        return (False, None)
    else:
        self.items_scraped.append(item_id_)
    logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
                     " SALE PRICE " + str(sale_price_))
    if price_ > sale_price_:
        logging.critical("SALE on ITEM_ID " + str(item_id_) + " PRICE " +
                         str(price_) + " SALE PRICE " + str(sale_price_))
    # extract image URL
    prod_img_path = hxs.select('//img[@id="productImage"]/@src')
    prod_img_url = str(prod_img_path.extract()[0])
    logging.critical("Image URL: " + str(prod_img_url))
    # find description and keywords: these will be useful in categorization
    desc = hxs.select(
        '//div[@class="gu gu-first description"]/p/text()').extract()
    prod_desc = ''.join(desc)
    logging.critical("Description: " + prod_desc)
    # promo text
    # DIDN'T FIND ANY
    #promo_path = hxs.select('//span[@class="cat-pro-promo-text"]//font/text()').extract()
    #promo_str = str(promo_path)
    #logging.critical("Promotion: ")
    #logging.critical(promo_str)
    promo_str = ""
    product, created_new = self._create_product_item(item_name, int(item_id_), str(prod_url), price_, \
        sale_price_, gender, str(prod_img_url), promo_str, prod_desc)
    product = None
    #self._store_in_file(response, item_id_)
    #raise CloseSpider('Blah')
    logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
        str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))
    #raise SystemExit
    return (True, product)
def extract_all_page_data(self, response):
    """Delegate full-page extraction to the module-level process helper."""
    return process(HtmlXPathSelector(response), response)
def parseItem(self, spiderName=None, itemCollectionName=None,
              response=None, responseBody='', pageid=''):
    '''
    Parse the page and build the item dict handed to the pipeline
    (the pipeline configured in "settings" stores the data).

    Extraction strategies, tried in this order depending on config:
    main-text extraction (extractMainTxt), a BBS-specific regex path
    for Article pages, plain XPath fields, regex-over-XPath fields,
    and finally fields taken from the response itself.  Returns the
    item dict, or None when a required field is missing.
    '''
    # bbsSpider is handled separately (shared 'BBsSpider' config).
    isbbsSpider = False
    if spiderName in self.bbsSpiderName:
        config = extractorConfig['BBsSpider']
        isbbsSpider = True
    else:
        config = extractorConfig[spiderName]
    if not config:
        raise NotConfigured('解析配置信息没有找到,请检查extracotrConfig是否有爬虫%s的配置! ' % spiderName)
    hxs = HtmlXPathSelector(response)
    if not itemCollectionName or not itemCollectionName in config:
        raise NotConfigured('%s下载网页的类型%s没有找到,请检查解析配置文件' % (spiderName, itemCollectionName))
    item = {}
    item['collectionName'] = itemCollectionName
    if itemCollectionName in self.collectionNameMap:
        item['collectionName'] = self.collectionNameMap[itemCollectionName]
    item['url'] = response.url
    item['status'] = 100
    item['spiderName'] = spiderName
    item['optDateTime'] = datetime.datetime.now()
    xpathItem = config[itemCollectionName]
    # Main-text extraction mode: only title, publishdate, content, imgList.
    if 'mainext' in xpathItem and xpathItem['mainext']:
        return self.extractMainTxt(
            item, responseBody,
            config['threshold'] if 'threshold' in config else None,
            spiderName, pageid)
    # BBS spiders handle their Article pages specially.
    if isbbsSpider and re.match('.*(Article).*',
                                itemCollectionName) is not None:
        # Title, author, publish date etc. are captured via regexes.
        regexs = config['printpage']
        for k, v in regexs.items():
            if k.endswith('Regex'):
                continue
            regex = regexs[k + 'Regex']
            value = hxs.select(v).re(regex)
            if (value is None or len(value) < 1) and k in self.requiredField:
                self.parseLog(
                    'bbs解析发现item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s'
                    % (k, itemCollectionName, spiderName, pageid),
                    level=LogLevel.INFO)
                return None
            if type(value) == list and len(value) > 0:
                item[k] = value[0]
            elif value is not None:
                item[k] = value
        # Content: extract text from the whole body.
        bodys = hxs.select('//body').extract()[0]
        content = self.ext.getText(bodys)
        # Filter out spam replies, titles and author lines.
        contents = content.splitlines()
        newcontent = ''
        block = ''
        isblock = 0
        for p in contents:
            p_strip = p.strip()
            # Detect post blocks: an "author ... time ..." header line opens
            # a block; long non-boilerplate lines accumulate into it.
            if re.match('.*作者.*时间.*\d+.*\d+.*', str(p_strip)) is not None:
                isblock += 1
            elif len(p_strip) >= 20 and re.match('.*\[打印本页\].*', str(p_strip)) is None:
                block += p_strip
                block += '\n'
            if isblock >= 1:
                # Flush a block once it is long enough or contains an image.
                if len(block) > 100 or re.match('.*img.*src.*=.*>.*', block) is not None:
                    newcontent += '---------------------------------------------------------------------------------\n'
                    newcontent += block
                    block = ''
                    isblock = 0
        if newcontent is None or len(newcontent.strip()) < 10:
            self.parseLog(
                'bbs解析发现item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s'
                % ('content', itemCollectionName, spiderName, pageid),
                level=LogLevel.INFO)
            return None
        item['content'] = newcontent
        images = self.ext.getImg(bodys)
        if images is not None:
            item['images'] = images
        return item
    # Plain XPath extraction.
    for k, v in xpathItem.items():
        values = hxs.select(v).extract()
        if (not values or len(values) < 1 or
                (" ".join("%s" % p for p in values)).strip() == "") and k in self.requiredField:
            self.parseLog(
                'xpath解析发现item缺失属性:%s,类型: %s,spiderName:%s,xpath=%s, pageid:%s 。改用正文抽取尝试'
                % (k, itemCollectionName, spiderName, v, pageid),
                level=LogLevel.INFO)
            # For Article pages, fall back to main-text extraction when
            # XPath found nothing.
            if item['collectionName'] == 'Article':
                return self.extractMainTxt(
                    item, responseBody,
                    config['threshold'] if 'threshold' in config else None,
                    spiderName, pageid)
        if k in self.listFields:
            item[k] = values
        else:
            value = self.parseSpecialField(k, values)
            if value is not None:
                item[k] = value
            # Images embedded in the content field.
            if k == 'content':
                imgs = self.ext.getImg(value)
                if imgs is not None:
                    item['images'] = imgs
    # regex + XPath extraction.
    regexItem = {}
    regexName = itemCollectionName + 'Regex'
    if regexName in config:
        regexItem = config[regexName]
    for k, v in regexItem.items():
        if k.endswith('Regex'):
            continue
        regex = k + 'Regex'
        if not regex in regexItem:
            raise NotConfigured('找不到匹配的正则表达式,配置文件的%s配置缺少相应的%s' % (k, regex))
        else:
            regex = regexItem[regex]
        values = hxs.select(v).re(regex)
        if (not values or len(values) < 1 or
                (" ".join("%s" % p for p in values)).strip() == "") and k in self.requiredField:
            self.parseLog(
                'regex+xpath解析item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s 。改用正文抽取尝试'
                % (k, itemCollectionName, spiderName, pageid),
                level=LogLevel.INFO)
            # For Article pages, fall back to main-text extraction when
            # the regex matched nothing.
            if item['collectionName'] == 'Article':
                return self.extractMainTxt(
                    item, responseBody,
                    config['threshold'] if 'threshold' in config else None,
                    spiderName, pageid)
        if k in self.listFields:
            item[k] = values
        else:
            value = self.parseSpecialField(k, values)
            if value is not None:
                item[k] = value
    # Fields taken directly from the response (url / headers / status).
    respItem = {}
    respName = itemCollectionName + 'Resp'
    if respName in config:
        respItem = config[respName]
    for k, v in respItem.items():
        value = None
        if v == 'url':
            value = response.url
        elif v == 'header':
            # NOTE(review): v is the string 'header' here, so v.items()
            # would raise AttributeError — this branch looks broken as
            # written (probably meant a dict mapping); confirm the config
            # shape before relying on it.
            if v.items():
                header = response.headers
                for hk, hv in v.items():
                    value = header[hv]
                    if not value:
                        self.parseLog('response.headers中没有该属性:%s,类型: %s' % (hk, itemCollectionName),
                                      level=LogLevel.WARNING)
                        continue
                    # NOTE(review): unreachable — the `continue` above fires
                    # whenever value is falsy, so this check never triggers.
                    if not value and hk in self.requiredField:
                        self.parseLog(
                            '非item页,因为缺失属性:%s,类型: %s, pageid:%s'
                            % (hk, itemCollectionName, pageid),
                            level=LogLevel.WARNING)
                        # For Article pages, fall back to main-text extraction.
                        if item['collectionName'] == 'Article':
                            return self.extractMainTxt(
                                item, responseBody,
                                config['threshold'] if 'threshold' in config else None,
                                spiderName, pageid)
                    if hk in self.specialField:
                        value = self.parseSpecialField(hk, value)
                    item[hk] = value.strip()
                continue
            value = response.headers
        elif k == 'status':
            value = response.status
        if not value and k in self.requiredField:
            self.parseLog('item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s'
                          % (k, itemCollectionName, spiderName, pageid),
                          level=LogLevel.INFO)
            return None
        elif not value:
            continue
        value = self.parseSpecialField(k, value)
        item[k] = value.strip()
    return item
def parse_product(self, response):
    """Parse an eglobaldigitalstore.co.uk product page.

    Loads the base product fields, then branches on the option widgets:
    * exactly one option  -> POST the options form once so the price can be
      reloaded (callback: reload_price) and stop;
    * otherwise           -> yield the base item (marking stock 0 when an
      out-of-stock badge is present), then yield one FormRequest per option
      value so parse_identifier can finish each variant item.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select(u'//h1[@class="mainbox-title"]/text()')[0].extract()
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    # Primary price layout; fall back to itemprop="price" microdata markup.
    price = hxs.select(
        u'//div[@id="product_info"]//span[@class="price"]/span[@class="price" and @id]/text()'
    )
    if not price:
        price = hxs.select(
            u'//*[@itemprop="price"]/span[@class="price" and @id]/text()')
    # NOTE(review): raises IndexError when neither xpath matches -- assumes
    # every product page exposes a price; confirm against live pages.
    price = price[0].extract().replace(',', '')
    loader.add_value('price', price)
    image_url = hxs.select(
        u'//a[contains(text(),"View larger image")]/@href')
    if image_url:
        image_url = urljoin_rfc(get_base_url(response),
                                image_url[0].extract())
        loader.add_value('image_url', image_url)
    # Second breadcrumb anchor (first is presumably the site root).
    category = hxs.select(
        u'//div[@class="breadcrumbs"]/a[1]/following-sibling::a[1]/text()'
    ).extract()
    if category:
        loader.add_value('category', category[0])
    sku = hxs.select(
        u'//div[@class="product-main-info" or @id="product_info"]//p[@class="sku"]//span[starts-with(@id,"product_code")]/text()'
    )
    # Skip the placeholder 'n/a' product code.
    if sku and sku[0].extract().lower() != 'n/a':
        sku = sku[0].extract().lower()
        loader.add_value('sku', sku)
    loader.add_xpath('identifier',
                     '//input[contains(@name, "product_id")]/@value')
    options = hxs.select(u'//div[starts-with(@id,"opt_")]//select/option')
    select_name = hxs.select(
        u'//div[starts-with(@id,"opt_")]//select/@name').extract()
    if len(options) == 1:
        # Single option: the displayed price may change once the option is
        # applied, so post the store's AJAX options form and re-read it.
        formdata = {
            'additional_info[get_detailed]': '1',
            'additional_info[get_discounts]': '1',
            'additional_info[get_features]': '',
            'additional_info[get_icon]': '1',
            'additional_info[get_options]': '1',
            'additional_info[info_type]': 'D',
            'appearance[but_role]': 'action',
            'appearance[capture_options_vs_qty]': '',
            'appearance[details_page]': '1',
            'appearance[separate_buttons]': '',
            'appearance[show_add_to_cart]': '1',
            'appearance[show_list_buttons]': '1',
            'appearance[show_price]': '1',
            'appearance[show_price_values]': '1',
            'appearance[show_product_amount]': '1',
            'appearance[show_product_options]': '1',
            'appearance[show_qty]': '1',
            'appearance[show_sku]': '1',
            'dispatch': 'products.options',
            select_name[0]: options[0].select(u'./@value').extract()[0]
        }
        yield FormRequest('http://www.eglobaldigitalstore.co.uk/index.php',
                          formdata=formdata,
                          meta={'loader': loader},
                          callback=self.reload_price,
                          dont_filter=True)
        return
    else:
        # Base (option-less) item is emitted here; per-option variants
        # follow in the loop below.
        out_stock = hxs.select('//span[contains(@class, "out-of-stock")]')
        if out_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
    for option in options:
        option_text = option.select(u'./text()')[0].extract()
        opt_value = option.select(u'./@value').extract()[0]
        # Empty value is the "please select" placeholder entry.
        if not opt_value:
            continue
        loader = ProductLoader(item=Product(), selector=hxs)
        # Option text shaped like "Name (+<pound>12.34)"; \xa3 is the pound sign.
        res = re.search('(.*?) \(\+\xa3([\d\.,]+)\)', option_text)
        if res:
            option_name, option_price = res.groups()
        else:
            option_name = re.search('(.*)', option_text).groups()[0]
            option_price = u'0.00'
        loader.add_value('name', u'%s %s' % (name, option_name))
        loader.add_value('url', response.url)
        if category:
            loader.add_value('category', category[0])
        # Variant price = base price + option surcharge.
        loader.add_value('price', str(Decimal(price) + Decimal(option_price)))
        if image_url:
            loader.add_value('image_url', image_url)
        # Same AJAX form as above, once per option value; parse_identifier
        # completes the item from the response.
        formdata = {
            'additional_info[get_detailed]': '1',
            'additional_info[get_discounts]': '1',
            'additional_info[get_features]': '',
            'additional_info[get_icon]': '1',
            'additional_info[get_options]': '1',
            'additional_info[info_type]': 'D',
            'appearance[but_role]': 'action',
            'appearance[capture_options_vs_qty]': '',
            'appearance[details_page]': '1',
            'appearance[separate_buttons]': '',
            'appearance[show_add_to_cart]': '1',
            'appearance[show_list_buttons]': '1',
            'appearance[show_price]': '1',
            'appearance[show_price_values]': '1',
            'appearance[show_product_amount]': '1',
            'appearance[show_product_options]': '1',
            'appearance[show_qty]': '1',
            'appearance[show_sku]': '1',
            'dispatch': 'products.options',
            select_name[0]: opt_value
        }
        yield FormRequest('http://www.eglobaldigitalstore.co.uk/index.php',
                          formdata=formdata,
                          meta={
                              'loader': loader,
                              'opt_value': opt_value
                          },
                          callback=self.parse_identifier,
                          dont_filter=True)
def extract_product_data(self, response, item):
    """Populate *item* with product data scraped from a Walmart product page.

    Supports both page designs: the new one ('product-name' heading,
    'specs-table' features table) and the old one ('productTitle',
    'SpecTable'). Fills product_name, product_model, product_upc,
    product_brand, product_target_price, product_category_tree and
    product_keywords where available, and returns the item.
    """
    hxs = HtmlXPathSelector(response)
    # assume new design of walmart product page
    product_name_node = hxs.select(
        "//h1[contains(@class, 'product-name')]//text()").extract()
    if not product_name_node:
        # assume old design
        product_name_node = hxs.select(
            "//h1[contains(@class, 'productTitle')]//text()").extract()
    if product_name_node:
        item['product_name'] = "".join(product_name_node).strip()
    else:
        # BUGFIX: the original referenced an undefined name `origin_url`
        # here, so this branch raised NameError instead of logging.
        self.log("Error: No product name: " + str(response.url) +
                 " for source product " + str(item.get('origin_url', '')),
                 level=log.ERROR)

    # extract product model number from the features/spec table
    # new page version:
    table_node = hxs.select("//div[@class='specs-table']/table")
    if not table_node:
        # old page version:
        table_node = hxs.select("//table[@class='SpecTable']")
    if table_node:
        # BUGFIX: the original called .extract() on the table selector first,
        # leaving a list of strings, so table_node.select(...) always raised
        # AttributeError (silenced by a bare except) and product_model was
        # never set. Keep it a selector and only guard the missing-row case.
        try:
            item['product_model'] = table_node.select(
                ".//td[contains(text(),'Model')]/following-sibling::*/text()"
            ).extract()[0]
        except IndexError:
            # no "Model" row in the spec table -- leave the field unset
            pass

    upc_node = hxs.select("//meta[@itemprop='productID']/@content")
    if upc_node:
        item['product_upc'] = [upc_node.extract()[0]]

    brand_holder = hxs.select("//meta[@itemprop='brand']/@content").extract()
    if brand_holder:
        item['product_brand'] = brand_holder[0]

    # extract price: prefer the meta itemprop, fall back to the visible
    # price div when the meta tag is absent.
    price_holder = hxs.select("//meta[@itemprop='price']/@content").extract()
    if price_holder:
        product_target_price = price_holder[0].strip()
    else:
        product_target_price = "".join(
            hxs.select("//div[@itemprop='price']//text()").extract()).strip()
    if product_target_price:
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",", "", product_target_price)
        m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
        if m:
            item['product_target_price'] = float(m.group(1))
        else:
            self.log("Didn't match product price: " + product_target_price +
                     " " + response.url + "\n",
                     level=log.WARNING)
    else:
        self.log("Didn't find product price: " + response.url + "\n",
                 level=log.INFO)

    # breadcrumb categories, dropping the leading root entry
    try:
        item['product_category_tree'] = hxs.select(
            "//li[@class='breadcrumb']/a/span[@itemprop='name']/text()"
        ).extract()[1:]
    except Exception:
        pass
    try:
        item['product_keywords'] = hxs.select(
            "//meta[@name='keywords']/@content").extract()[0]
    except IndexError:
        pass
    return item
def parse(self, response):
    """Crawl a category/listing page: discover subcategories, paginate,
    split oversized result sets by product-type filter, and extract the
    products listed on the current page.

    Meta flags used:
    * 'filter_active'   -- set once a product-type filter has been applied;
      pagination then goes through the PageForm POST instead of plain GETs.
    * 'next_page_retry' -- retry counter for the empty-listing workaround.
    """
    if not isinstance(response, HtmlResponse):
        self.log('ERROR: BAD HtmlResponse!!! URL:{}'.format(response.url))
        return
    hxs = HtmlXPathSelector(response)
    # logic to find categories
    # find subcats for Outilage Jardin (garden tools)
    categories = hxs.select(
        '//div[contains(@class,"bg_U15 menugroup") and contains(@alt,"Jardin") and contains(@alt,"Outillage")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    # find subcats for Aspirateurs (vacuum cleaners / home care)
    categories += hxs.select(
        '//div[contains(@class,"bg_U4 menugroup") and contains(@alt,"Entretien") and contains(@alt,"maison")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    for url in categories:
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url)
    # Total result count shown on the page; used to decide between plain
    # pagination and filter-based splitting below.
    totalproducts = hxs.select('//span[@class="SearchBig"]/text()').re(
        r'(\d+)')
    # pagination ("Suivant" = next)
    next_page = hxs.select(
        u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href'
    ).extract()
    if next_page and int(totalproducts[0]) <= 100000:
        if not 'filter_active' in response.meta:
            # Plain GET pagination; dont_redirect guards against the site
            # bouncing us back to page 1.
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            yield Request(next_page, meta={
                'next_page_retry': 1,
                'dont_redirect': True
            })
        else:
            # Filtered listing: the next-page link is a JS postback, so
            # submit the PageForm with the button's "<id>.OnClick" field set.
            next_page = hxs.select(
                u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]'
            )
            next_page_onclick_id = next_page.select(
                '@id').extract()[-1] + '.OnClick'
            req = FormRequest.from_response(
                response,
                formname='PageForm',
                formdata={next_page_onclick_id: u'1'},
                meta={'filter_active': True})
            req.dont_filter = True
            yield req
    # Listing too large to paginate through directly: re-request it once
    # per "Type de produit" filter so each slice is manageable.
    if totalproducts and int(
            totalproducts[0]) > 100000 and not response.meta.get(
                'filter_active'):
        filters = hxs.select(
            '//div[@class="blocFilter" and contains(strong/text(), "Type de produit")]//input/@name'
        ).extract()
        req_base = FormRequest.from_response(response,
                                             formname='PageForm',
                                             meta={'filter_active': True},
                                             dont_click=True)
        for filter in filters:
            req = replace_formdata(req_base, formdata={filter: u'1'})
            req.dont_filter = True
            yield req
    products = hxs.select(
        u'//div[@id="productList"]//div[contains(@class,"plProductView")]')
    if products:
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath(
                'url', './/a[contains(@class,"plPrName")]/@href')
            product_loader.add_xpath(
                'name', './/a[contains(@class,"plPrName")]/text()')
            product_loader.add_xpath(
                'category', '//div[@class="productListTitle"]/h1/text()')
            product_loader.add_xpath(
                'image_url',
                './/div[contains(@class, "plProductImg")]//img/@data-src')
            product_loader.add_xpath('sku', './@data-sku')
            product_loader.add_xpath(
                'identifier',
                './/input[contains(@name, "ProductPostedForm.ProductId")]/@value'
            )
            # Whole pounds/euros and decimals are rendered in separate
            # nodes; stitch them into "units.decimals".
            price = product.select(
                u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/text()'
            ).extract()
            if price:
                decimals = product.select(
                    u'//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/sup/text()'
                ).re(u'(\d+)')
                if decimals:
                    price = price[0] + '.' + decimals[0]
                product_loader.add_value('price', price)
                product_loader.add_value('stock', 1)
            # Only emit items that have both a name and a price, and a
            # non-blank identifier.
            if product_loader.get_output_value(
                    'name') and product_loader.get_output_value('price'):
                identifier = product_loader.get_output_value('identifier')
                if identifier and identifier.strip():
                    yield product_loader.load_item()
                else:
                    self.log('PRODUCT WITH NO IDENTIFIER => %s' %
                             response.url)
    else:
        # this site is buggy (it returns no products when we traverse thru the pages at random rate)
        # so this is a kind of retry code
        if 'next_page_retry' in response.meta:
            self.log('ERROR - NO PRODUCTS FOUND, retrying...')
            count = response.meta['next_page_retry']
            if count < self.RETRY_TIMES:
                self.log(
                    'ERROR - NO PRODUCTS FOUND, retry #{} url: {}'.format(
                        count, response.url))
                if not 'filter_active' in response.meta:
                    yield Request(response.url,
                                  meta={
                                      'next_page_retry': count + 1,
                                      'dont_redirect': True
                                  },
                                  dont_filter=True)
                else:
                    # TODO: FormRequest? (no retry path exists for the
                    # filtered/POST pagination case yet)
                    pass
            else:
                self.log(
                    'ERROR - NO PRODUCTS FOUND, retry limit reached, giving up, url: {}'
                    .format(response.url))
def parse(self, response):
    """Scrape a USC residential-dining menu page and insert one document
    per meal section into the MongoDB 'dininghalls' collection.

    Each document: {'name': hall, 'date': ..., 'mealtype': ...,
    'stations': [{'name': ..., 'options': [{'name': ..., 'tags': [...]}]}]}.
    Python 2 code (print statements, str.encode on unicode).
    """
    # storing in the mongo database
    client = MongoClient(os.environ['MONGODB_URI'])
    db = client.heroku_5s156rtt
    dininghalls = db.dininghalls
    # The date is whatever remains of the URL after stripping the known
    # venue prefixes (507/514/518) and decoding %2F back to '/'.
    currUrl = response.request.url
    currDate = currUrl.replace(
        "http://hospitality.usc.edu/residential-dining-menus/?menu_venue=venue-507&menu_date=",
        "").replace("%2F", "/")
    currDate = currDate.replace(
        "http://hospitality.usc.edu/residential-dining-menus/?menu_venue=venue-514&menu_date=",
        "").replace("%2F", "/")
    currDate = currDate.replace(
        "http://hospitality.usc.edu/residential-dining-menus/?menu_venue=venue-518&menu_date=",
        "").replace("%2F", "/")
    # Both of these work
    hxs = HtmlXPathSelector(response)
    # Accumulator document for the current meal section.
    dininghall = {'stations': []}
    cafeTitle = hxs.xpath(
        "//h2[contains(@class, 'fw-accordion-title ui-state-active')]/text()"
    ).extract()
    print cafeTitle[0].encode('utf-8')
    differentSections = hxs.xpath(
        "//div[contains(@class, 'col-sm-6 col-md-4')]")
    # NOTE: the loop variable deliberately (if confusingly) shadows the
    # selector list name; each iteration rebinds it to one section node.
    for differentSections in differentSections:
        mealTimes = differentSections.xpath("h3/text()").extract()
        stations = differentSections.xpath("h4/text()").extract()
        print(mealTimes[0].encode('utf-8')
              ).strip("[]").strip('u\'').strip('\'')
        dininghall.update({
            'mealtype': (mealTimes[0].encode('utf-8')
                         ).strip("[]").strip('u\'').strip('\'')
        })
        dininghall.update({'date': currDate})
        # if datetime.datetime.strftime(datetime.date.today(), '%d') in cafeTitle[0]:
        #     dininghall.update({'date': datetime.datetime.strftime(datetime.date.today(), '%x')})
        # if datetime.datetime.strftime(datetime.date.today() + datetime.timedelta(days=1), '%d') in cafeTitle[0]:
        #     dininghall.update({'date': datetime.datetime.strftime(datetime.date.today() + datetime.timedelta(days=1), '%x')})
        # Map the page heading to a short hall name.
        if "Kitchen" in cafeTitle[0]:
            dininghall.update({'name': 'EVK'})
        if "Parkside" in cafeTitle[0]:
            dininghall.update({'name': 'Parkside'})
        if "84" in cafeTitle[0]:
            dininghall.update({'name': 'Cafe 84'})
        foodItemSections = differentSections.xpath(
            "ul[contains(@class, 'menu-item-list')]")
        # i indexes the parallel stations[] headings list.
        i = 0
        # Same shadowing idiom as above: rebinds to one <ul> per iteration.
        for foodItemSections in foodItemSections:
            foodItems = foodItemSections.xpath("li/text()").extract()
            print stations[i].encode('utf-8')
            stationMiniJSON = {
                'name': (stations[i].encode('utf-8')
                         ).strip("[]").strip('u\'').strip('\''),
                'options': []
            }
            for foodItems in foodItems:
                print foodItems.encode('utf-8')
                # Re-locate the <li> for this item so its tag spans can be
                # read; pick the XPath string delimiter that cannot clash
                # with a quote character inside the item text.
                if "\"" in foodItems:
                    individualFoodItemsWrapper = foodItemSections.xpath(
                        "li[contains(., '" + foodItems + "')]")
                else:
                    individualFoodItemsWrapper = foodItemSections.xpath(
                        "li[contains(., \"" + foodItems + "\")]")
                foodItemsTags = individualFoodItemsWrapper.xpath(
                    "span/i/span/text()").extract()
                foodMiniJSON = {
                    'name': foodItems.encode('utf-8'),
                    'tags': []
                }
                for foodItemsTags in foodItemsTags:
                    foodMiniJSON['tags'].append(
                        (foodItemsTags.encode('utf-8')
                         ).strip("[]").strip('u\'').strip('\''))
                stationMiniJSON['options'].append(foodMiniJSON)
            dininghall['stations'].append(stationMiniJSON)
            i += 1
        print dininghall
        # One document per meal section; reset the accumulator after insert.
        dininghalls.insert(dininghall)
        dininghall = {'stations': []}
def load_html(url):
    """Fetch *url*, decode the body as GBK, unescape HTML entities and
    return the page wrapped in an HtmlXPathSelector."""
    response = requests.get(url)
    response.raise_for_status()
    # The target site serves GBK-encoded pages; force that decoding
    # before reading .text.
    response.encoding = "gbk"
    unescaped = HTMLParser.HTMLParser().unescape(response.text)
    return HtmlXPathSelector(text=unescaped)
def parse_product(self, response):
    """Parse a product page whose data lives in inline JS blobs.

    Pulls the base product record out of the dataLayer.push(...) call,
    follows any alternate-colour links once (parse_options guard in meta
    prevents recursion), then yields either one item per colour-swatch
    variant (ColorswatchConfig JSON) or a single base item.
    """
    hxs = HtmlXPathSelector(response)
    if response.meta['parse_options']:
        # Queue each alternate-colour product page; those requests carry
        # parse_options=False so they don't re-queue their own colours.
        color_options = hxs.select(
            "//div[@class='more-colors']//a/@href").extract()
        for color in color_options:
            url = self.base_url + color
            yield Request(url,
                          meta={'parse_options': False},
                          callback=self.parse_product)
    # Base product record from the Google Tag Manager dataLayer blob.
    item_data = re.search('dataLayer.push\((.*)\);', response.body)
    item_data = json.loads(item_data.group(
        1))['ecommerce']['detail']['products'][0] if item_data else None
    # NOTE(review): raises AttributeError if the blob is missing
    # (item_data is None) -- assumes every product page embeds it.
    name = item_data.get('name')
    sku = item_data.get('id')
    price = str(item_data.get('price'))
    price = float(extract_price(price))
    brand = item_data.get('brand')
    stock = 1 if price else 0  # NOTE(review): computed but never used below
    brand = '' if brand == False else brand
    image_url = ''.join(
        hxs.select("//img[@itemprop='image']/@src").extract())
    # Breadcrumbs minus the root entry and German navigation labels.
    categories = [
        category.strip() for category in hxs.select(
            "//div[@class='breadcrumbs']//a/text()").extract()[1:]
    ]
    categories = [
        category for category in categories
        if not category.lower() in ['mehr', 'designer']
    ]
    shipping = 0
    # Colour-swatch variants from the ColorswatchConfig JS blob.
    color_swatches = re.search('ColorswatchConfig\((.*)\) ,', response.body)
    color_swatches = json.loads(
        color_swatches.group(1))['swatch'] if color_swatches else None
    if color_swatches:
        for color_swatch, colors in color_swatches.iteritems():
            for color_id, values in colors.iteritems():
                #== If the next part throws an error, swatch is not available and we should skip it ==#
                try:
                    option_name = name + ' ' + values['option_values'][
                        'store_label']
                except:
                    continue
                option_id = sku + values['option_values']['value_index']
                option_price = values['option_values']['pricing_value']
                # pricing_value is a surcharge on top of the base price;
                # treat null-ish markers as zero.
                if option_price:
                    option_price = 0 if option_price in [
                        'null', 'None', None
                    ] else option_price
                    option_price = price + float(option_price)
                else:
                    option_price = price
                product_loader = ProductLoader(item=Product(), selector=hxs)
                product_loader.add_value('image_url', image_url)
                product_loader.add_value('shipping_cost', shipping)
                product_loader.add_value('sku', option_id)
                product_loader.add_value('url', response.url)
                product_loader.add_value('name', option_name)
                product_loader.add_value('brand', brand)
                product_loader.add_value('identifier', option_id)
                product_loader.add_value('price', option_price)
                for category in categories:
                    if not category.lower() == 'more':
                        product_loader.add_value('category', category.strip())
                yield product_loader.load_item()
    else:
        # No swatches: emit the single base product.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('image_url', image_url)
        product_loader.add_value('shipping_cost', shipping)
        product_loader.add_value('sku', sku)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', name)
        product_loader.add_value('brand', brand)
        product_loader.add_value('identifier', sku)
        product_loader.add_value('price', price)
        for category in categories:
            if not category.lower() == 'more':
                product_loader.add_value('category', category.strip())
        yield product_loader.load_item()
def extract_product_data(self, response, item):
    """Best-effort extraction of product fields into *item*.

    Tries three name selectors in order (new title, pdp_title, any h1);
    if none match, returns None (unless the page is just an out-of-stock
    404, which is silently ignored). All remaining fields -- price, model
    number, brand, UPC -- are optional and failures are swallowed.
    """
    hxs = HtmlXPathSelector(response)
    try:
        item['product_name'] = hxs.xpath(
            "//h1[starts-with(@class,'title')]//text()").extract(
            )[0].strip()
    except:
        try:
            item['product_name'] = hxs.xpath(
                "//div[@class='pdp_title']//text()[normalize-space()!='']"
            ).extract()[0].strip()
        except:
            try:
                item['product_name'] = hxs.xpath(
                    "//h1//text()").extract()[0].strip()
            except:
                # out of stock products return 404s with this text, not the actual product page
                out_of_stock = hxs.xpath(
                    "//strong[contains(text(),'out of stock')]").extract()
                if not out_of_stock:
                    self.log("Error: No product name: " + str(response.url) +
                             " from product: " + item['origin_url'],
                             level=log.ERROR)
                # ignore products with no name
                return None
    # Price meta content is "<currency-symbol><amount>", e.g. "$12.34"
    # or "\xa312.34" (pound sign); split symbol from digits by position.
    price_node = hxs.select("//meta[@itemprop='price']/@content").extract()
    if price_node:
        try:
            price_currency = price_node[0][0]
            price_amount = "".join(price_node[0][1:])
            price_amount = re.sub(",", "", price_amount)
            m1 = re.match("[0-9]+\.?[0-9]*", price_amount)
            m2 = re.match("(\xa3)|(\$)", price_currency)
            if not m1 or not m2:
                self.log("Didn't match product price: " + price_amount +
                         price_currency + " " + response.url + "\n",
                         level=log.WARNING)
            else:
                # Normalise to dollars so prices are comparable.
                price = Utils.convert_to_dollars(float(price_amount),
                                                 price_currency)
                item['product_target_price'] = price
        except Exception:
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.INFO)
    # Model number from a "Style No. XXXX" line in the description list.
    try:
        product_model_node = hxs.select(
            "//div[@class='prod_description1']//li[contains(text(), 'Style')]/text()"
        ).re("[sS]tyle +[nN]o\.? +[a-zA-Z0-9]+")
        item['product_model'] = re.match(
            "[sS]tyle +[nN]o\.? +([a-zA-Z0-9]+)",
            product_model_node[0]).group(1)
    except Exception:
        pass
    try:
        item['product_brand'] = hxs.select(
            "//meta[@itemprop='brand']/@content").extract()[0]
    except Exception:
        pass
    # UPC is buried in an inline script as "skuUpcCode":"...".
    try:
        js_body = hxs.select(
            "//script[contains(text(),'Upc')]/text()").extract()[0]
        item['product_upc'] = re.match('.*"skuUpcCode":"([0-9a-zA-Z]+)".*',
                                       js_body,
                                       re.DOTALL | re.MULTILINE).group(1)
    except Exception:
        pass
    return item
def parse_product(self, response):
    """Parse a product page (Magento-style), emitting one item per option.

    Three cases:
    * <select> options in the HTML      -> one item per option, surcharge
      taken from the option's @price attribute;
    * a Product.Config(...) JS blob     -> one item per configurable
      option, surcharge from the JSON;
    * neither                            -> a single base item.
    Variant identifiers are '<base-id>.<option-id>'.
    """
    hxs = HtmlXPathSelector(response)
    image_url = hxs.select(
        '//p[contains(@class, "product-image")]/a/@href').extract()
    # Regular price, falling back to the special (sale) price block.
    price = extract_price("".join(
        hxs.select(
            '//div/span/span[@class="price"]/text()').extract()).strip())
    if not price:
        price = extract_price("".join(
            hxs.select(
                '//p[@class="special-price"]//span[@class="price"]/text()'
            ).extract()).strip())
    if image_url:
        image_url = image_url[
            0]  # urljoin_rfc(get_base_url(response), image_url[0])
    category = hxs.select(
        '//div[contains(@class, "breadcrumbs")]/ul/li/a/text()').extract()
    # hxs.select(u'//div[@id="Breadcrumb"]//a/text()').extract()
    # Deepest breadcrumb entry only.
    category = category[-1] if category else ''
    options = hxs.select('//select/option[@value!=""]')
    # NOTE(review): raises IndexError when the hidden "product" input is
    # missing -- assumes it exists on every product page.
    identifier = hxs.select(
        '//input[@name="product" and @value!=""]/@value'
    ).extract(
    )[0]  # re.search(u'poingdestres\.co\.uk/(.*)/', response.url).group(1)
    name = hxs.select(
        '//div[@class="product-name"]/h1/text()').extract()[0]
    brand = ''.join(
        hxs.select('//div[contains(@class, "brand-name")]/text()').extract(
        )).strip()
    if options:
        # options rendered as plain <select> elements
        url = response.url
        for option in options:
            # Option text looks like "Name +<pound>x.xx"; keep the name part.
            try:
                name2 = option.select('text()').extract()[0].split(
                    u' +£')[0]
            except:
                name2 = ''
            option_price = extract_price(
                option.select('@price').extract()[0])
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', url)
            loader.add_value('name', brand + u' ' + name + u' ' + name2)
            loader.add_value('price', price + option_price)
            loader.add_value(
                'identifier',
                identifier + '.%s' % option.select('@value').extract()[0])
            loader.add_value('category', category)
            loader.add_value('brand', brand)
            if image_url:
                loader.add_value('image_url', image_url)
            yield loader.load_item()
    elif re.search('Product.Config\((.*)\);', response.body):
        # Options defined in the Magento Product.Config JS blob instead.
        options = re.search('Product.Config\((.*)\);', response.body)
        options = json.loads(options.group(1))
        url = response.url
        for attribute in options['attributes'].values():
            for i, option in enumerate(attribute['options'], 1):
                name2 = option['label']
                option_price = Decimal(option['price'])
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                loader.add_value('name',
                                 brand + u' ' + name + u' ' + name2)
                loader.add_value('price', price + option_price)
                loader.add_value(
                    'identifier',
                    identifier + '.%s' % option['products'][0])
                loader.add_value('category', category)
                loader.add_value('brand', brand)
                if image_url:
                    loader.add_value('image_url', image_url)
                yield loader.load_item()
    else:
        # hxs.select("//div[@class='ProductDetails']/h1/text()")[0].extract().strip()
        # No options: single base product.
        url = response.url
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', url)
        loader.add_value('name', brand + ' ' + name)
        loader.add_value('price', price)
        loader.add_value('identifier', identifier)
        loader.add_value('category', category)
        loader.add_value('brand', brand)
        if image_url:
            loader.add_value('image_url', image_url)
        yield loader.load_item()
def parse(self, response):
    """Follow every link in the site's main menu, handing each target
    page to parse_product_list."""
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    menu_links = selector.select('//ul[@id="MainMenu"]//a/@href').extract()
    for href in menu_links:
        absolute_url = urljoin_rfc(base_url, href)
        yield Request(absolute_url, callback=self.parse_product_list)
def parse(self, response):
    """Match one product option on a multi-option page.

    The request meta carries the target product's sku, name and section
    number ('notes'); each radio-button option row is parsed as
    "name, number[, extra]" and the row whose number equals the target
    is emitted as an item. If no row matches, all candidates are appended
    to a hard-coded debug log file instead.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    sku = response.meta['sku']
    name = response.meta['name'].encode('ascii', 'ignore')
    sec_number = response.meta['notes']
    prod_name = hxs.select(
        "//h1[contains(@class, 'categoryname')]/text()").extract()
    if not prod_name:
        logging.error('ERROR!! NO NAME!! %s "%s"' % (sku, url))
        return
    prod_name = prod_name[0].strip()
    # One table row per selectable option (radio button).
    options = hxs.select("//tr[td/input[@type='radio']]")
    found_products = []
    for option in options:
        text = option.select("td[2]/div[1]/span/text()").extract()
        if not text:
            logging.error("OPTIONS TEXT NOT FOUND! '%s'" % url)
            continue
        text = "".join(text)
        # "name, number" with an optional third comma-separated part.
        m = re.search("(.*),([^,]*)(,([^,]*))?", text)
        if not m:
            logging.error("CAN'T PARSE OPTIONS TEXT! '%s', '%s'" %
                          (text, url))
            continue
        add_name = m.group(1).strip()
        add_number = m.group(2).strip()
        # Price lives in one of three places depending on page layout;
        # try them in order.
        price = option.select(
            './/span[@class="productSave"]/text()').extract()
        if not price:
            price = option.select("td[2]/div[2]/span/text()").extract()
        if not price:
            price = option.select("td[2]/div[1]/span[2]/text()").extract()
        if not price:
            logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' %
                          (sku, prod_name, url))
            return
        price = price[0].strip()
        found_products.append(
            ("%s %s" % (prod_name.encode('ascii', 'ignore'), add_name),
             add_number, price))
        if add_number == sec_number:
            # Found the requested option: emit it and stop.
            product = Product()
            loader = ProductLoader(item=product,
                                   response=response,
                                   selector=hxs)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            yield loader.load_item()
            return
    # No option matched: append the candidates to a debug file.
    # NOTE(review): hard-coded developer-machine path.
    with open("/home/juraseg/src/drsfostersmith_products.txt",
              'a+') as handle:
        handle.write("======================================\n")
        handle.write("Product not found\n")
        handle.write("SKU: %s, Name: %s\n" % (sku, name))
        for prod in found_products:
            handle.write("Found: %s, %s, %s\n" % prod)
        handle.write("======================================\n\n")
def parse_product(self, response):
    """Parse a Harveys product page and collect one item per variant.

    Product data is embedded in 'Harveys.DATA.CDP.Products' JS assignments
    (one line per product). For each variant the category is resolved in
    priority order: previously-crawled old_data, the category_products
    url map, then a long URL/name keyword heuristic. Items are appended
    to self.products_collected rather than yielded.
    """
    hxs = HtmlXPathSelector(response)
    product_titles = hxs.select('//div[@class="product-header"]/h2/text()').extract()
    # Per-product URL fragments, parallel to product_titles by index.
    product_urls = hxs.select('//div[@data-product-id]/@class').re(r'js-product-([\w-]+)')
    products = []
    for l in response.body.split('\n'):
        if 'Harveys.DATA.CDP.Products' in l:
            products.append(l.strip())
    for i, product in enumerate(products):
        # Line is "<lhs> = <json>;" -- take the JSON, dropping the ';'.
        data = json.loads(product.split(' = ')[1][:-1])
        product_id = data['product_id']
        product_url = response.url
        for value in data['variants'].values():
            product_name = product_titles[i] + ' - ' + ' - '.join(value['attributes'].values())
            product_price = value['prices']['price']['value']
            variant_id = value[u'variant_id']
            product_identifier = '%s:%s' % (product_id, variant_id)
            product_url = urljoin_rfc(product_url, '#/%s' % product_urls[i])
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', product_url)
            loader.add_value('name', product_name)
            loader.add_value('identifier', product_identifier)
            loader.add_value('price', product_price)
            loader.add_value('shipping_cost', '59')
            # 1st choice: metadata remembered from a previous crawl.
            if product_url in self.old_data:
                loader.add_value('category', self.old_data[product_url]['category'])
                loader.add_value('brand', self.old_data[product_url]['brand'])
                loader.add_value('sku', self.old_data[product_url]['sku'])
            category_found = bool(loader.get_output_value('category'))
            # 2nd choice: category -> url listing map.
            if not category_found:
                for category, urls in self.category_products.items():
                    if product_url in urls or product_url + '/' in urls:
                        loader.add_value('category', category.split(','))
                        category_found = True
                        break
            # 3rd choice: keyword heuristics over the URL / product name.
            # Order matters: more specific patterns must come first.
            if not category_found:
                if 'lily-loveseat' in product_url:
                    loader.add_value('category', ['Sofa', 'Fabric', 'armchair'])
                elif 'lean-to-shelf' in product_url:
                    loader.add_value('category', ['Cabinets', 'Bookcases'])
                elif 'bench' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Tables'])
                elif 'console-table' in product_url:
                    loader.add_value('category', ['Cabinets', 'Console Tables'])
                elif 'coffee-table' in product_url:
                    loader.add_value('category', ['Living', 'Coffee Tables'])
                elif 'nest-of-table' in product_url:
                    loader.add_value('category', ['Living', 'Nest of Tables'])
                elif '-sofa' in product_url or 'sofa' in product_name.lower():
                    if 'leather' in product_url or 'leather' in product_name.lower():
                        category = ['Sofa', 'Leather']
                    else:
                        category = ['Sofa', 'Fabric']
                    if '2-seater' in product_url:
                        category.append('2 seater')
                    elif '2.5 seater' in product_name.lower():
                        category.append('2.5 seater')
                    elif '3-seater' in product_url:
                        category.append('3 seater')
                    elif '4-seater' in product_url:
                        category.append('4 seater')
                    elif 'corner' in product_url:
                        category.append('Corner sofas')
                    elif 'recliner' in product_url:
                        category.append('Recliner sofas')
                    # Only assign when a seat-count/shape level was added.
                    if len(category) == 3:
                        loader.add_value('category', category)
                elif '-corner' in product_url:
                    if 'leather' in product_url or 'leather' in product_name.lower():
                        category = ['Sofa', 'Leather', 'Corner sofas']
                    else:
                        category = ['Sofa', 'Fabric', 'Corner sofas']
                    loader.add_value('category', category)
                elif '-recliner-chair' in product_url or (('chair' in product_name.lower() or 'seat' in product_name.lower()) and ('recliner' in product_name.lower() or ' no recline' in product_name.lower())) or 'relaxer-chair' in product_url or 'hand-facing' in product_url:
                    if 'leather' in product_url or 'leather' in product_name.lower() or 'reid-hedgemoor' in product_url:
                        category = ['Sofa', 'Leather', 'armchair']
                    else:
                        category = ['Sofa', 'Fabric', 'armchair']
                    loader.add_value('category', category)
                elif '-footstool' in product_url and not ('chair' in product_url):
                    if 'millan-' in product_url or 'leather' in product_url or 'leather' in product_name.lower():
                        loader.add_value('category', ['Sofa', 'Leather', 'Footstools'])
                    else:
                        loader.add_value('category', ['Sofa', 'Fabric', 'Footstools'])
                elif '-table' in product_url and '-chairs' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Sets'])
                elif '-dining-table' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Tables'])
                elif '-bookcase' in product_url:
                    loader.add_value('category', ['Cabinets', 'Bookcases'])
                elif '-lamp-table' in product_url:
                    loader.add_value('category', ['Living', 'Lamp Tables'])
                elif '-sideboard' in product_url:
                    loader.add_value('category', ['Cabinets', 'Sideboards'])
                elif '-display-unit' in product_url:
                    loader.add_value('category', ['Cabinets', 'Display Units'])
                elif 'tv unit' in product_name.lower():
                    loader.add_value('category', ['Cabinets', 'Entertainment units'])
                elif '-shelving-unit' in product_url:
                    loader.add_value('category', ['Cabinets', 'Display Units'])
                elif '-wine-storage' in product_url:
                    loader.add_value('category', ['Cabinets', 'Display Units'])
            self.products_collected.append(set_product_type(loader.load_item()))
def _process_product_info_product_details(self, response, product_info):
    """Enrich ``product_info`` from the details page and route it onward.

    This needs to be in separate function because used by two methods:
    parse_product_details and parse_ajax_price.

    Generator that, depending on spider configuration flags
    (``amazon_direct``, ``only_buybox``, ``all_sellers`` /
    ``lowest_product_and_seller``), either collects the product directly or
    yields follow-up ``Request``s for product options, reviews, or the
    marketplace ("MBC") seller lists.

    :param response: product details page response (meta may carry
        ``seller_identifier``, ``check_match``, ``parse_options``,
        ``collect_reviews``, ``collect_mbc``, ``search_string``).
    :param product_info: dict of data already scraped for this product;
        mutated in place (category, sku, seller_identifier, name).
    """
    hxs = HtmlXPathSelector(response)
    # First entry of the left-hand "bucket" breadcrumb list becomes the category.
    categories = hxs.select(
        '//div[@class="bucket"]/div[@class="content"]/ul/li[1]/a/text()'
    ).extract()
    product_info['category'] = categories
    # ISBN-13 doubles as the SKU (book-oriented extraction); empty when absent.
    sku = hxs.select(
        '//li[b[contains(text(), "ISBN-13")]]/text()').extract()
    product_info['sku'] = sku[0].strip() if sku else ''
    # Carry a seller identifier forward from the request meta, but never
    # overwrite one already present in product_info.
    if response.meta.get(
            'seller_identifier',
            None) and not product_info.get('seller_identifier', None):
        product_info['seller_identifier'] = response.meta[
            'seller_identifier']
    check_match = response.meta.get('check_match', True)
    match = self.match(response.meta, self.current_search_item, product_info)
    if check_match and not match:
        # Scraped product does not match the current search item -> drop it.
        self.log("[AMAZON] WARNING: product does not match: %s" %
                 response.url)
        return
    if self.parse_options:
        if product_info['options'] and response.meta.get(
                'parse_options', True):
            # Fan out one request per option; 'parse_options': False in the
            # child meta prevents infinite recursion on option pages.
            self.log('[AMAZON] OPTIONS FOUND => %s' % response.url)
            for option in product_info['options']:
                new_meta = response.meta.copy()
                new_meta.update({
                    'parse_options': False,
                    'search_string': self.current_search,
                    'search_item': self.current_search_item,
                    'check_match': check_match
                })
                yield Request(option['url'],
                              self.parse_product,
                              meta=new_meta,
                              dont_filter=True)
            return
        else:
            # No option fan-out: fold option information into the name.
            if product_info['name_with_options']:
                product_info['name'] = product_info['name_with_options']
            elif product_info['option_texts']:
                product_info['name'] += ' [' + ', '.join(
                    product_info['option_texts']) + ']'
    if self.type == 'asins':
        # Sanity check: ASIN scraped from the page must match the URL's ASIN.
        url_asin = AmazonUrlCreator.get_product_asin_from_url(
            product_info['url'])
        if product_info['asin'].lower() != url_asin.lower():
            self.log(
                "[AMAZON] product ASIN '%s' does not match url ASIN '%s'. Page: %s"
                % (product_info['asin'], url_asin, response.url))
            return
    # Amazon Direct
    if self.amazon_direct:
        if self.collect_reviews and product_info.get(
                'reviews_url') and response.meta.get(
                    'collect_reviews', True):
            # Detour through the reviews page; the product travels in meta.
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['reviews_url'],
                          callback=self.parse_reviews,
                          meta=new_meta)
        else:
            product = self.construct_product(product_info,
                                             meta=response.meta)
            self.log("[AMAZON] collect parse product: %s" %
                     product['identifier'])
            if self.type == 'category':
                yield product
            else:
                self._collect_amazon_direct(product, response.meta)
    # Buy Box
    elif self.only_buybox:
        if (product_info['price'] and product_info['vendor'] and
                self._seller_ok(product_info['vendor'])) or \
                self.collect_products_with_no_dealer:
            if self.collect_reviews and product_info.get(
                    'reviews_url') and response.meta.get(
                        'collect_reviews', True):
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string': response.meta['search_string'],
                        'search_item': self.current_search_item,
                    })
                yield Request(product_info['reviews_url'],
                              callback=self.parse_reviews,
                              meta=new_meta)
            else:
                product = self.construct_product(product_info,
                                                 meta=response.meta)
                self.log("[AMAZON] collect parse product: %s" %
                         product['identifier'])
                if self.type == 'category':
                    yield product
                else:
                    self._collect_buybox(product, response.meta)
        elif not product_info['vendor'] or not product_info['price']:
            # Missing buy-box vendor/price: fall back to the new-items MBC list.
            # NOTE(review): unlike the sibling branches this update is not
            # guarded by `self.type == 'search'` — confirm meta always has
            # 'search_string' here.
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            new_meta.update({
                'search_string': response.meta['search_string'],
                'search_item': self.current_search_item,
            })
            yield Request(product_info['mbc_list_url_new'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
            #self.log("[AMAZON] WARNING: product with no vendor: %s" % response.url)
        else:
            self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                     response.url)
    # all sellers / lowest price
    elif self.all_sellers or self.lowest_product_and_seller:
        # Go to MBC lists to get dealers prices
        collect_mbc = response.meta.get('collect_mbc', True)
        if collect_mbc and product_info.get(
                'mbc_list_url_new') and self.collect_new_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_new'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        elif collect_mbc and product_info.get(
                'mbc_list_url_used') and self.collect_used_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_used'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        else:
            if (product_info['vendor'] and
                    self._seller_ok(product_info['vendor'])) or \
                    self.collect_products_with_no_dealer:
                if self.collect_reviews and product_info.get(
                        'reviews_url') and response.meta.get(
                            'collect_reviews', True):
                    new_meta = response.meta.copy()
                    new_meta['found_item'] = product_info
                    if self.type == 'search':
                        new_meta.update({
                            'search_string': response.meta['search_string'],
                            'search_item': self.current_search_item,
                        })
                    yield Request(product_info['reviews_url'],
                                  callback=self.parse_reviews,
                                  meta=new_meta)
                else:
                    # In lowest-seller mode the seller id is optionally kept
                    # out of the identifier so prices compare across sellers.
                    use_seller_id_in_identifier = False \
                        if self.lowest_product_and_seller and not self.lowest_seller_collect_dealer_identifier else True
                    product = self.construct_product(
                        product_info,
                        meta=response.meta,
                        use_seller_id_in_identifier=
                        use_seller_id_in_identifier)
                    self.log("[AMAZON] collect parse product: %s" %
                             product['identifier'])
                    if self.type == 'category':
                        yield product
                    else:
                        self._collect(product)
            elif not product_info['vendor']:
                # TODO: collect vendor from vendor details page
                self.log(
                    "[AMAZON] WARNING: Could not scrape vendor from product details: %s"
                    % response.url)
                self.errors.append(
                    "Could not scrape vendor from product details: %s" %
                    response.url)
            else:
                self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                         response.url)
def parse_product(self, response):
    """Parse a Cell Bikes product page into a Product item.

    Extracts identifier/sku, name, price, image, brand (first match from
    ``self.brands`` found in the name), up to three breadcrumb categories
    and stock status.  Products priced at 99 or less get a flat shipping
    cost.  If the page exposes an internal item URL via
    ``<meta itemprop="url">``, a follow-up request to the items API is
    yielded so ``parse_options`` can enumerate the product's options;
    otherwise the product itself is yielded.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    sku = hxs.select('//p[@class="sku-number"]/span/text()').extract()
    if sku:
        # SKU number doubles as the unique identifier.
        loader.add_value('identifier', sku[0])
        loader.add_value('sku', sku[0])
    else:
        log.msg('### No product ID at ' + response.url, level=log.INFO)

    name = ''
    tmp = hxs.select('//h1[@itemprop="name"]/text()').extract()
    if tmp:
        name = tmp[0].strip()
        loader.add_value('name', name)
    else:
        log.msg('### No name at ' + response.url, level=log.INFO)

    # price
    price = 0
    tmp = hxs.select('//strong[@itemprop="price"]/text()').extract()
    if tmp:
        price = extract_price(tmp[0].strip())
        loader.add_value('price', price)

    # image_url
    tmp = hxs.select('//img[@itemprop="image"]/@src').extract()
    if tmp:
        loader.add_value('image_url', tmp[0])

    # brand: first known brand mentioned in the product name wins
    for brand in self.brands:
        if brand.lower() in name.lower():
            loader.add_value('brand', brand)
            break

    # category: drop the root breadcrumb, keep at most the last 3 levels
    crumbs = hxs.select('//ul[@itemprop="breadcrumb"]/li/a/text()').extract()
    if crumbs:
        crumbs = crumbs[1:]
        if len(crumbs) > 3:
            crumbs = crumbs[-3:]
        for crumb in crumbs:
            loader.add_value('category', crumb)

    # shipping_cost (flat rate, applied below for cheap items only)
    shipping_cost = '9.90'

    # stock: no price means unavailable; otherwise trust the status label
    if not price:
        loader.add_value('stock', 0)
    else:
        tmp = hxs.select(
            '//span[contains(@class,"stock-status")]/text()').extract()
        if tmp and 'Out' in tmp[0]:
            loader.add_value('stock', 0)
        else:
            loader.add_value('stock', 1)

    product = loader.load_item()

    tmp = hxs.select('//meta[@itemprop="url"]/@content').extract()
    # Use .get(): 'price' is absent from the item when no price node existed.
    if product.get('price', 0) <= 99:
        product['shipping_cost'] = shipping_cost
    if not tmp:
        yield product
        return

    # process options via the site's items API
    path = tmp[0].lstrip('/')
    # Fixed mojibake: the query previously contained '¤cy=AUD' — an
    # HTML-entity corruption of '&currency=AUD' — which broke the API call.
    api = ('http://www.cellbikes.com.au/api/items?include=facets'
           '&fieldset=details&language=en&country=AU&currency=AUD'
           '&pricelevel=5&c=980629&n=3')
    if path.startswith('product/'):
        # 'product/<id>' paths are queried by id, everything else by url.
        url = api + '&id=%s' % path[8:]
    else:
        url = api + '&url=%s' % path
    yield Request(url, meta={'product': product}, callback=self.parse_options)
def parse(self, response):
    """Entry point: harvest category links from the product menu and
    schedule each one for :meth:`parse_products`."""
    selector = HtmlXPathSelector(response)
    base = get_base_url(response)
    menu_links = selector.select(
        '//td[@class="produkt_menu"]/div/table/tr/td/a/@href').extract()
    for href in menu_links:
        yield Request(urljoin_rfc(base, href), callback=self.parse_products)
def parse_product(self, response):
    """Parse a product page, yielding one Product per <select id="variant">
    option, or a single Product when the page has no variant selector.

    Returns early (yields nothing) when no price is present on the page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    name = hxs.select('//div[@class="catBanner"]/h2/text()').extract()[0]
    price = hxs.select(
        '//span[@id="variant-price-header"]/text()').extract()
    if price:
        price = extract_price(price[0])
    else:
        # No price header at all -> skip this product entirely.
        return
    sku = hxs.select('//div[@class="prod"]/p[@class="code"]').re(
        "Code: ([0-9]+)")[0]
    brand = hxs.select(
        '//td[@class="attrib" and text()="Manufacturer"]/following-sibling::td/text()'
    ).extract()
    product_loader.add_value('sku', sku)
    # Breadcrumb trail minus site root and current page; the final [2:]
    # strips a 2-char prefix from the joined string (separator artefact).
    category = " ".join(
        hxs.select('//div[@id="breadcrumb"]/ul/li/a/text()').extract()
        [2:-1])[2:]
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand)
    image_url = hxs.select(
        '//div[@id="primary_image"]/a/img/@src').extract()
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
    identifier = hxs.select(
        '//input[@name="productCodePost"]/@value').extract()
    # Base item; per-variant copies are derived from it below.
    product = product_loader.load_item()
    variants = hxs.select('//select[@id="variant"]/option')
    if variants:
        for option in variants:
            value = option.select('./@value').extract()
            if value:
                # parse_variant decodes the option value into a dict with
                # 'price' and 'code' keys (helper defined elsewhere).
                variant = parse_variant(value[0])
                title = option.select('./text()').extract()[0]
                price = extract_price(variant.get('price', "0"))
                subid = variant.get('code')
                if subid:
                    prod = Product(product)
                    prod['identifier'] = "%s_%s" % (identifier[0], subid)
                    prod['price'] = price
                    # Option text is "<variant name> £<price>"; keep only
                    # the name part, normalising nbsp and trailing comma.
                    subname = title.split(u"£")
                    if subname:
                        subname = subname[0].strip().replace(u"\xa0", " ")
                        if subname.endswith(","):
                            subname = subname[:-1]
                        prod['name'] = "%s %s" % (name, subname)
                    # NOTE(review): yield placement reconstructed as
                    # per-variant (inside the subid guard) — options
                    # without a code yield nothing; confirm intent.
                    yield prod
    else:
        # one option product
        prod = Product(product)
        prod['name'] = name
        o = hxs.select(
            '//div[@class="options_not_available"]/text()').extract()
        if o:
            # Append availability note (e.g. discontinued option) to name.
            prod['name'] += ' ' + o[0].strip()
        prod['identifier'] = identifier[0]
        prod['price'] = price
        yield prod
def parse_cat(self, response):
    """Crawl a category page: follow server-side search redirects and
    subcategories (plus their AJAX product-list endpoints), paginate via
    link or AJAX POST, and schedule every listed product for
    :meth:`parse_product`.
    """
    base_url = get_base_url(response)
    # NOTE(review): hxs is assigned but never used in this method.
    hxs = HtmlXPathSelector(response)
    # Some categories embed a server-side search URL in an inline script.
    search_url = response.xpath('//script/text()').re_first(
        'var refine_filters_server_search_script.*?"(.+)"')
    if search_url:
        yield Request(response.urljoin(search_url), self.parse_cat,
                      meta=response.meta)
    subcats = response.xpath(
        '//div[@class="no_child_subcats_list"]//a/@href').extract()
    subcats += response.css('div.subcats_list a::attr(href)').extract()
    if subcats:
        for subcat in subcats:
            yield Request(
                url=urljoin_rfc(base_url, subcat),
                meta=response.meta,
                callback=self.parse_cat)
            # AJAX?  Heuristic: relative slug-style links also expose a
            # '<slug>_search.php' endpoint answering get_products POSTs.
            if (
                    '-' in subcat and
                    'http' not in subcat and
                    '?' not in subcat and
                    '=' not in subcat
            ):
                url = urljoin_rfc(base_url, subcat.replace('-', '_')
                                  .replace('/', '') + '_search.php')
                request = FormRequest(url=url,
                                      formdata={u'mode': u'get_products',
                                                u'objects_per_page': u'45',
                                                u'page': u'1'},
                                      meta=response.meta,
                                      callback=self.parse_cat)
                yield request
    next_page = response.css('a.right-arrow::attr(href)').extract()
    if not next_page or (next_page and not next_page[0]):
        # No usable href: the arrow paginates via an onclick AJAX call.
        try:
            next_page = int(response.css(
                'a.right-arrow::attr(onclick)').re(r"\('(\d+)'")[0])
            request = FormRequest(url=response.url,
                                  formdata={u'mode': u'get_products',
                                            u'objects_per_page': u'45',
                                            u'page': unicode(next_page)},
                                  meta=response.meta,
                                  callback=self.parse_cat,
                                  dont_filter=True)
            yield request
        except:
            # Arrow present but no parsable page number -> no pagination.
            pass
        else:
            # NOTE(review): try/else reconstructed from collapsed source —
            # clears next_page (now an int) so the href branch below is
            # skipped after a successful AJAX pagination; confirm.
            next_page = None
    if next_page:
        url1 = urljoin_rfc(base_url, next_page[0])
        yield Request(
            url=url1,
            meta=response.meta,
            callback=self.parse_cat)
    products = response.css('div#pr_list a::attr(href)').extract()
    for product in products:
        # self.cookie_jar += 1
        meta = response.meta.copy()
        # Isolate each product request from the shared cookie session.
        meta['dont_merge_cookies'] = True
        # meta['cookiejar'] = self.cookie_jar
        yield Request(
            url=urljoin_rfc(base_url, product),
            meta=meta,
            callback=self.parse_product)