Example #1
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        links = xxs.select(
            "//item/*[local-name()='origLink']/text()").extract()

        return [Request(x, callback=self.parse_item) for x in links]
    def parse(self, response):
        xxs = XmlXPathSelector(response)

        for product in xxs.select('//product'):
            category = product.select('./Category/text()').extract()
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('identifier', './product-id/text()')
            loader.add_xpath('sku', './product-id/text()')
            loader.add_xpath('url', './product-url/text()')
            loader.add_xpath('name', './product-name/text()')
            loader.add_xpath('brand', './brand/text()')
            loader.add_value(
                'price',
                extract_price_eu(' '.join(
                    product.select('./price/text()').extract())))
            if category:
                loader.add_value('category',
                                 category[0].split('/')[-1].strip())
            loader.add_xpath('image_url', './image-url/text()')
            loader.add_xpath('stock', './stock/text()')
            if loader.get_output_value('price') > 499:
                loader.add_value('shipping_cost', '0')
            else:
                loader.add_value('shipping_cost', '25')
            yield loader.load_item()
    def parse(self, response):
        # inspect_response(response, self)
        # return
        # hxs = HtmlXPathSelector(response)
        # file_path = "d:/work/GoogleFeed.xml"
        # f = open(file_path)
        # xxs = XmlXPathSelector(text=f.read())
        xxs = XmlXPathSelector(response)
        for sel in xxs.select('//channel/item'):  # ##
            loader = ProductLoader(item=Product(), response=response)
            tmp = sel.select('link/text()').extract()
            if tmp:
                loader.add_value('url', tmp[0])
            # ID
            tmp = sel.select('*[name()="g:id"]/text()').extract()
            if tmp:
                loader.add_value('identifier', tmp[0])
            # Sku
            tmp = sel.select('*[name()="g:id"]/text()').extract()
            if tmp:
                loader.add_value('sku', tmp[0])
            # Name
            tmp = sel.select('title/text()').extract()
            if tmp:
                loader.add_value('name', tmp[0])
            # price
            tmp = sel.select('*[name()="g:sale_price"]/text()').extract()
            if not tmp:
                tmp = sel.select('*[name()="g:price"]/text()').extract()
            if tmp:
                price = round(extract_price(tmp[0]) / Decimal('1.20'), 2)
                loader.add_value('price', price)
            # image_url
            tmp = sel.select('*[name()="g:image_link"]/text()').extract()
            if tmp:
                loader.add_value('image_url', tmp[0])
            # Brand
            tmp = sel.select('*[name()="g:brand"]/text()').extract()
            if tmp and tmp[0] != 'Alliance':
                loader.add_value('brand', tmp[0])
            # category
            tmp = sel.select('*[name()="g:product_type"]/text()').extract()
            if tmp:
                try:
                    loader.add_value('category', tmp[0].split('>')[1].strip())
                except IndexError:
                    loader.add_value('category', tmp[0].strip())
            # shipping_cost
            # use get_output_value to avoid a KeyError when no price was found
            price = loader.get_output_value('price')
            if price and price < 50.00:
                loader.add_value('shipping_cost', 5.90)
            # stock
            tmp = sel.select('*[name()="g:availability"]/text()').extract()
            if tmp and tmp[0] == 'in stock':
                loader.add_value('stock', 1)
            else:
                loader.add_value('stock', 0)

            yield loader.load_item()
Example #4
def xmliter_lxml(obj, nodename):
    from lxml import etree
    reader = _StreamReader(obj)
    iterable = etree.iterparse(reader, tag=nodename, encoding=reader.encoding)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
Example #5
    def parse(self, response):
        if self.scraper.content_type == 'H':
            xs = HtmlXPathSelector(response)
        else:
            xs = XmlXPathSelector(response)
        base_elem = self.scraper.get_base_elem()
        url_elem = self.scraper.get_detail_page_url_elem()
        base_objects = xs.select(base_elem.x_path)
        if (len(base_objects) == 0):
            self.log("No base objects found!", log.ERROR)

        if (self.conf['MAX_ITEMS_READ']):
            items_left = min(
                len(base_objects),
                self.conf['MAX_ITEMS_READ'] - self.items_read_count)
            base_objects = base_objects[0:items_left]

        for obj in base_objects:
            item_num = self.items_read_count + 1
            self.log("Starting to crawl item %s." % str(item_num), log.INFO)
            item = self.parse_item(response, obj)
            #print item
            url_name = url_elem.scraped_obj_attr.name
            if (item and url_name in item):
                url = item[url_name]
                cnt = self.scraped_obj_class.objects.filter(
                    url=item[url_name]).count()
                cnt1 = self.scraper.get_standard_update_elems_from_detail_page(
                ).count()
                cnt2 = self.scraper.get_from_detail_page_scrape_elems().count()
                # Mark item as DOUBLE item
                if cnt > 0:
                    item[url_name] = 'DOUBLE' + item[url_name]
                # (DOUBLE item with no standard update elements to be scraped from detail page) or
                # generally no attributes scraped from detail page
                if (cnt > 0 and cnt1 == 0) or cnt2 == 0:
                    #loader = XPathItemLoader(item=Article(), response=response)
                    #loader.add_xpath('description', '//p[not(@align="center")]/text()')
                    #l.add_value('last_updated', 'today')
                    yield item
                else:
                    yield Request(url,
                                  callback=self.parse_item,
                                  meta={'item': item})
            else:
                self.log("Detail page url elem could not be read!", log.ERROR)
Example #6
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        xxs.register_namespace('soapenv',
                               'http://schemas.xmlsoap.org/soap/envelope/')
        xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
        xxs.register_namespace('xsi',
                               'http://www.w3.org/2001/XMLSchema-instance')
        xxs.register_namespace(
            'CurrentsAndMetadata',
            'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl'
        )

        timelist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()'
        ).extract()
        cspdlist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()'
        ).extract()
        cdirlist = xxs.select(
            '//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()'
        ).extract()

        print len(timelist)

        for i in range(0, len(cdirlist)):
            sql_str = self.SQL_INSERT_STUB.format(
                self.get_current_station().lower(),
                str(timelist[i])[0:-2], str(cspdlist[i]), str(cdirlist[i]),
                'datafactory_currentdata')
            #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
            d_time_unware = datetime.datetime.strptime(
                str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
            d_time1 = pytz.utc.localize(d_time_unware)
            d_time = d_time1.astimezone(pytz.utc)
            if self.needStore(d_time):
                self.db.query(sql_str)

        self.db.commit()

        if timelist:
            sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
                DB_SETTINGS['DATABASE_TIME_TABLE'], self.get_current_station(),
                self.startDate.astimezone(
                    pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
                self.endDate.astimezone(
                    pytz.utc).strftime("%Y-%m-%d %H:%M:%S"))

            self.db.query(sql_str)
            self.db.commit()

        self.station_slot = self.station_slot + 1

        if (self.station_slot < len(self.start_urls)):
            # parse() must yield Request objects, not bare URL strings
            yield Request(self.start_urls[self.station_slot])
Example #7
def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = XmlXPathSelector(text=nodetext)
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.select(selxpath)[0]
Example #8
    def parse(self, response):

        hxs = XmlXPathSelector(response)
        name = hxs.select('//name').extract()

        if self.task_id is not None:
            self.log('Processing item %s' % self.task_id, log.INFO)
            self.alert_context = 'task_id=%s' % self.task_id
            for item in self.process_item(self.bot_task_params(self.task_id)):
                yield item
        else:
            for item in self.process_items():
                yield item
Example #9
    def parse(self, response):
        """
        We define a custom parser here because we need to get the link from
        the feed item and then follow it to get the recipe data.

        Getting the data from <content:encoded> seems overly complex, as we
        would have to decode all the encoded characters and then build a DOM
        from that.
        """
        xxs = XmlXPathSelector(response)
        links = xxs.select(
            "//item/*[local-name()='origLink']/text()").extract()
        # self.parse_item comes from OnehundredonecookbooksMixin
        return [Request(x, callback=self.parse_item) for x in links]
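For contrast, the <content:encoded> route that the docstring above rules out would look roughly like the following illustrative sketch (not part of the original spider; the element path and variable names are hypothetical, using the same legacy selector API):

        # Hypothetical alternative: pull the escaped HTML out of
        # <content:encoded> and build a second selector from it.
        encoded = xxs.select(
            "//item/*[local-name()='encoded']/text()").extract()
        for html in encoded:
            hxs = HtmlXPathSelector(text=html)
            # ...the recipe data would then have to be scraped from hxs...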
Example #10
 def parse(self, response):
     x = XmlXPathSelector(response)
     zp_nodes = x.xpath("//stats")
     source = response.meta.get("source", "")
     for zp_node in zp_nodes:
         name = zp_node.xpath("//stats/stat/name/text()").extract()
         xy = zp_node.xpath("//stats/stat/xy/text()").extract()
         for i in range(len(name)):
             gz_item = GJZDItem()
             gz_item["name"] = name[i]
             gz_item["source"] = source
             gz_item["lng"] = xy[i].split(",")[0]
             gz_item["lat"] = xy[i].split(",")[1]
             yield gz_item
 def parsePart(self, response):
     item = response.meta['item']
     xxs = XmlXPathSelector(response)
     if len(xxs.select("//ERRORSEGMENT")) == 0:
         part_num = response.meta['part_num']
         end_range = response.meta['end_range']
         part_prefix = response.meta['part_prefix']
         item['parts'].append(self.part_format % (part_prefix, part_num))
         if part_num < end_range:
             yield self.makePartRequest(part_prefix, part_num + 1, item,
                                        end_range)
         else:
             yield item
     else:
         yield item
Example #12
 def detect_feed(self, response):
     """Just detects the feed in the links and returns an Item"""
     xxs = XmlXPathSelector(response)
     # TODO: tweak the feedparser lib to reuse the headers/body of this
     # response instead of downloading the feed page a second time.

     if any(xxs.select("/%s" % feed_type) for feed_type in ['rss', 'feed', 'xml', 'rdf']):
         try:
             rssFeed = feedparser.parse(response.url)
             return self.extract_feed(rssFeed)
         except Exception:
             raise Exception('Exception while parsing/extracting the feed')
         
     return None
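One way to avoid the second download that the comment in detect_feed mentions would be to hand feedparser the body Scrapy already fetched, since feedparser.parse() also accepts the document as a raw string (a minimal sketch of that variant):

             # Sketch: reuse the already-downloaded body instead of the URL.
             rssFeed = feedparser.parse(response.body)
             return self.extract_feed(rssFeed)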
Example #13
    def parse_rss(self, response):
        item = response.request.meta['item']

        if response.status != 500:
            xxs = XmlXPathSelector(response)
            xxs.remove_namespaces()

            item['date'] = xxs.select('.//channel/date/text()').extract()
            description = xxs.select('.//channel/description/text()').extract()
            if (len(item.get('description', '')) < 10) and description:
                item['description'] = ''.join(description).strip()

        del (item['subpage_urls'])

        return item
Example #14
    def parse(self, response):
        xxs = XmlXPathSelector(response)
        hxs = HtmlXPathSelector(response)
        links = xxs.select('//link/text()').extract()

        log.msg('Link length: %s' % len(links), level=log.ERROR)

        if len(links) <= 0:
            log.msg('no links found, using regular parser', level=log.ERROR)
            links = hxs.select('//a/@href').extract()

        msg = 'Links: %s' % links
        log.msg(msg, level=log.ERROR)

        return [Request(x, callback=self.parse_item) for x in links]
Example #15
 def parse(self, response):
     base_url = get_base_url(response)
     xxs = XmlXPathSelector(response)
     xxs.register_namespace("g", "http://base.google.com/ns/1.0")
     products = xxs.select('//channel/item')
     for product in products:
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('url', 'link/text()')
         loader.add_xpath('name', 'title/text()')
         loader.add_xpath('image_url', 'g:image_link/text()')
         loader.add_xpath('price', 'g:price/text()')
         loader.add_xpath('brand', 'g:brand/text()')
         loader.add_xpath('category', 'g:brand/text()')
         loader.add_xpath('sku', 'g:id/text()')
         loader.add_xpath('identifier', 'g:id/text()')
         yield loader.load_item()
Example #16
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        products = xxs.select('//item')
        for product in products:
            mpn = product.xpath('mpn/text()')
            if mpn:
                mpn = mpn[0].extract().upper().strip()
            else:
                mpn = None
            row = self.monitored_products.get(mpn) if mpn else None
            if row is None or (row and row['Discontinued'].lower().strip()
                               == 'yes'):
                continue
            loader = ProductLoader(selector=product, item=Product())
            loader.add_xpath('identifier', 'id/text()')
            loader.add_xpath('sku', 'mpn/text()')
            loader.add_xpath('brand', 'brand/text()')
            loader.add_xpath('image_url', 'image_link/text()')
            loader.add_xpath('url', 'link/text()')
            loader.add_xpath('name', 'title/text()')
            price = product.select('sale_price/text()').extract()
            if not price:
                price = product.select('price/text()').extract()

            loader.add_value('price', extract_price(price[0]))

            categories = product.select(
                'product_type/text()').extract()[-1].split('>')
            categories = map(lambda x: x.strip(), categories)
            loader.add_value('category', categories)

            shipping_cost = product.select('shipping/price/text()').extract()
            shipping_cost = extract_price(
                shipping_cost[0]) if shipping_cost else ''
            loader.add_value('shipping_cost', shipping_cost)

            in_stock = product.select(
                'availability[contains(text(), "in stock")]').extract()
            if not in_stock:
                loader.add_value('price', 0)

            item = loader.load_item()
            item['metadata'] = RHSMeta()
            item['metadata']['cost_price'] = row['Cost Price']
            yield item
Example #17
 def parse(self, response):
     x = XmlXPathSelector(response)
     zp_nodes = x.xpath("//lines")
     count = 0
     for zp_node in zp_nodes:
         road = zp_node.xpath("//lines/line/name/text()").extract()
         stats = zp_node.xpath("//lines/line/stats/text()").extract()
         for i in range(len(road)):
             s = stats[i].split(";")
             for j in range(len(s)):
                 count += 1
                 zd_item = ZDCXItem()
                 zd_item["road"] = road[i]
                 zd_item["station_name"] = s[j]
                 zd_item["station_num"] = count
                 yield zd_item
             count = 0
 def parse(self, response):
     item = ArxivOrgItem()
     xxs = XmlXPathSelector(response)
     xxs.remove_namespaces()
     # the selector object needs to be formatted as a str first
     xml_data = str(xxs.xpath('//link'))
     #logging.log(logging.INFO, xml_data)
     url_list = re.findall(r'http://arxiv\.org/abs/\d+\.\d+', xml_data)
     #logging.log(logging.INFO, url_list)
     for url in url_list:
         logging.log(
             logging.INFO,
             f'**************** crawling link: {url} ***************** ')
         yield Request(url=url,
                       callback=self.parse_single_page,
                       meta={'item': item},
                       dont_filter=True)
Example #19
 def populate_vars(self, response=None, request=None, spider=None):
     self.vars['item'] = self.item_class()
     self.vars['settings'] = self.crawler.settings
     self.vars['spider'] = spider
     self.vars['request'] = request
     self.vars['response'] = response
     self.vars['xxs'] = XmlXPathSelector(response) \
         if isinstance(response, XmlResponse) else None
     self.vars['hxs'] = HtmlXPathSelector(response) \
         if isinstance(response, HtmlResponse) else None
     if self.inthread:
         self.vars['fetch'] = self.fetch
     self.vars['view'] = open_in_browser
     self.vars['shelp'] = self.print_help
     self.update_vars(self.vars)
     if not self.code:
         self.print_help()
Example #20
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     base_url = get_base_url(response)
     xxs.register_namespace("f", "http://www.w3.org/2005/Atom")
     products = xxs.select('//f:entry')
     for product in products:
         product.register_namespace("g", "http://base.google.com/ns/1.0")
         product.register_namespace("p", "http://www.w3.org/2005/Atom")
         product_loader = ProductLoader(item=Product(), selector=product)
         name = product.select('./p:title/text()').extract()[0]
         if 'B-STOCK' in name.upper():
             continue
         product_loader.add_value('name', name)
         url = product.select('./p:link/@href').extract()[0]
         product_loader.add_value('url', urljoin_rfc(base_url, url))
         image_url = product.select('./g:image_link/text()').extract()
         if image_url:
             product_loader.add_value('image_url',
                                      urljoin_rfc(base_url, image_url[0]))
         category = product.select('./g:product_type/text()').extract()
         if category:
             product_loader.add_value('category', category[0])
         brand = product.select('./g:brand/text()').extract()
         if brand:
             product_loader.add_value('brand', brand[0])
         price = product.select('./g:sale_price/text()').extract()
         if price:
             product_loader.add_value('price', extract_price(price[0]))
         else:
             price = product.select('./g:price/text()').extract()
             product_loader.add_value('price', extract_price(price[0]))
         # sku = product.select('./g:gtin/text()').extract()
         # if sku:
         #     product_loader.add_value('sku', sku[0])
         identifier = product.select('./g:id/text()').extract()[0]
         product_loader.add_value('identifier', identifier)
         product_loader.add_value('sku', identifier)
         shipping_cost = product.select(
             './g:shipping/g:price/text()').extract()
         if shipping_cost:
             product_loader.add_value('shipping_cost',
                                      extract_price(shipping_cost[0]))
         product = product_loader.load_item()
         yield product
Example #21
    def get_products(self, meta, response, colors, colors_ids):
        hxs = XmlXPathSelector(response)
        names, ids = self.get_names(meta['base_name'], meta['product_id'],
                                    meta['current_data'], colors, colors_ids)

        for i, name in enumerate(names):
            p = ProductLoader(item=Product(), response=response)
            p.add_value('identifier', ids[i])
            p.add_value('name', name)
            p.add_value('brand', meta['brand'])
            p.add_value('url', meta['url'])
            p.add_value('image_url', meta['image_url'])
            price = hxs.select('//cmd[@t="discounted_price"]/text()').extract()
            if price:
                price = price[0].replace('.', '').replace(',', '.')
                price = extract_price(price)
            if not price or price == Decimal(1):
                if not price:
                    self.log('Price not found %s' % meta['url'])
                else:
                    self.log('Price is one %s' % meta['url'])

                if not self.retries.get(
                        meta['url']) or self.retries.get(meta['url']) < 3:
                    self.log('Retrying %s' % meta['url'])
                    self.retries[meta['url']] = self.retries.get(
                        meta['url'], 0) + 1
                    p = meta['url']
                    yield Request(p,
                                  meta={
                                      'category':
                                      response.meta.get('category', ''),
                                      'cookiejar':
                                      p + str(self.retries.get(meta['url']))
                                  },
                                  callback=self.parse_product,
                                  dont_filter=True)
                else:
                    self.log('Max retries reached %s' % meta['url'])
                return
            p.add_value('price', price)
            p.add_value('shipping_cost', '0')
            p.add_value('category', response.meta.get('category'))
            yield p.load_item()
    def parse(self, response):
        if self.scraper.content_type == 'H':
            xs = HtmlXPathSelector(response)
        else:
            xs = XmlXPathSelector(response)
        base_elem = self.scraper.get_base_elem()
        url_elem = self.scraper.get_detail_page_url_elem()
        base_objects = xs.select(base_elem.x_path)
        if (len(base_objects) == 0):
            self.log("No base objects found!", log.ERROR)

        if (self.conf['MAX_ITEMS_READ']):
            items_left = min(
                len(base_objects),
                self.conf['MAX_ITEMS_READ'] - self.items_read_count)
            base_objects = base_objects[0:items_left]

        for obj in base_objects:
            item_num = self.items_read_count + 1
            self.log("Starting to crawl item %s." % str(item_num), log.INFO)
            item = self.parse_item(response, obj)
            #print item
            url_name = url_elem.scraped_obj_attr.name
            if (item and url_name in item):
                url = item[url_name]
                cnt = self.scraped_obj_class.objects.filter(
                    url=item[url_name]).count()
                cnt1 = self.scraper.get_standard_update_elems_from_detail_page(
                ).count()
                cnt2 = self.scraper.get_from_detail_page_scrape_elems().count()
                # Mark item as DOUBLE item
                if cnt > 0:
                    item[url_name] = 'DOUBLE' + item[url_name]
                # (DOUBLE item with no standard update elements to be scraped from detail page) or
                # generally no attributes scraped from detail page
                if (cnt > 0 and cnt1 == 0) or cnt2 == 0:
                    yield item
                else:
                    yield Request(url,
                                  callback=self.parse_item,
                                  meta={'item': item})
            else:
                self.log("Detail page url elem could not be read!", log.ERROR)
    def parse(self, response):
        xxs = XmlXPathSelector(response)

        for productxs in xxs.select(
                '//product[attribute_set/text()!="spares-accessories"]'):
            loader = ProductLoader(item=Product(), selector=productxs)
            loader.add_xpath('sku', './product_id/text()')
            loader.add_xpath('identifier', './product_id/text()')
            loader.add_xpath('price', './product_price/text()')
            loader.add_xpath('name', './product_name/text()')
            loader.add_xpath('url', './product_url/text()')
            loader.add_xpath('category', './attribute_set/text()')
            loader.add_xpath('brand', './manufacturer/text()')
            brand = loader.get_output_value('brand').strip().upper()

            if brand in self.ignore_brands:
                log.msg('Ignoring product %s because of brand %s' %
                        (loader.get_output_value('identifier'), brand))
                continue

            loader.add_value('stock', '1')

            item = loader.load_item()
            item['identifier'] = item['identifier'].upper()

            cost_price = productxs.select('./cost/text()').extract()
            metadata = CSCateringMeta()
            cost_price = cost_price[0].strip() if cost_price else '0.00'
            metadata['cost_price'] = cost_price
            item['metadata'] = metadata

            category = loader.get_output_value('category').strip().lower()

            if category in ignore_categories and not self.has_sku(
                    item.get('sku', '')):
                log.msg('Ignoring product %s because of category %s' %
                        (loader.get_output_value('identifier'), category))
                continue

            yield Request(item['url'],
                          callback=self.parse_img,
                          meta={'item': item})
Example #24
    def parse(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured(
                'You must define parse_node method in order to scrape this XML feed'
            )

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = XmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = HtmlXPathSelector(response)
            self._register_namespaces(selector)
            nodes = selector.select('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
Example #25
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     xxs.register_namespace("g", "http://base.google.com/ns/1.0")
     products = xxs.select('//channel/item')
     for product in products:
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('url', 'link/text()')
         loader.add_xpath('name', 'title/text()')
         loader.add_xpath('image_url', 'g:image_link/text()')
         loader.add_xpath('price', 'g:price/text()')
         loader.add_xpath('brand', 'g:brand/text()')
         categories = product.select(
             'g:product_type/text()').extract()[0].split(' &gt; ')
         loader.add_value('category', categories)
         loader.add_xpath('sku', 'g:id/text()')
         loader.add_xpath('identifier', 'g:id/text()')
         stock = product.select(
             'g:availability/text()').extract()[0].lower()
         if stock != 'in stock':
             loader.add_value('stock', 0)
         yield loader.load_item()
Example #26
def scrape_rss(response):
    log.msg("inside scrape rss")
    xxs = XmlXPathSelector(response)
    items = []
    requests = []
    for item_tag in xxs.select('//item'):
        items.append(ArticleItem())
        if len(item_tag.select("title")) > 0:
            items[-1]["title"] = item_tag.select("title/text()")[0].extract()
        if len(item_tag.select("pubDate")) > 0:
            items[-1]["time_published"] = [
                item_tag.select("pubDate/text()")[0].extract()
            ]
        if len(item_tag.select("link")) > 0:
            items[-1]["url"] = item_tag.select("link/text()")[0].extract()
        if len(item_tag.select("description")) > 0:
            items[-1]["summary"] = item_tag.select(
                "description/text()")[0].extract()

        request = Request(items[-1]["url"], callback=extract_author_from_link)
        request.meta["item"] = items[-1]
        yield request
Example #27
def xmliter(obj, nodename):
    """Return a iterator of XPathSelector's over all nodes of a XML document,
       given tha name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S)
    text = body_or_str(obj)

    header_start = re.search(HEADER_START_RE, text)
    header_start = header_start.group(1).strip() if header_start else ''
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
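A minimal usage sketch for xmliter, assuming it is fed a plain XML string as the docstring allows (the feed content below is made up for illustration):

feed = ("<rss><channel>"
        "<item><title>First</title></item>"
        "<item><title>Second</title></item>"
        "</channel></rss>")

# each yielded node is an XPathSelector positioned on one <item>
titles = [node.select('title/text()').extract()[0]
          for node in xmliter(feed, 'item')]
# titles == [u'First', u'Second']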
Example #28
    def parse(self, response):
        hxs = XmlXPathSelector(response)
        shows = hxs.select('//show')
        date_from = datetime.now()
        date_to = date_from + timedelta(days=7 * 6)

        for show in shows:
            name = show.select('./name/text()').extract()[0]
            url = show.select('./@href').extract()[0]
            show_id = url.split('/')[-1]
            show_data = SHOWS_DATA % (show_id, date_from.strftime('%Y-%m-%d'),
                                      date_to.strftime('%Y-%m-%d'))
            r = Request(
                'https://api.entstix.com/api/v1/xlive/booking/book/availability/show',
                method='POST',
                body=show_data,
                callback=self.parse_products,
                meta={
                    'name': name,
                    'id': show_id
                })
            yield r
Example #29
    def parse(self, response):

        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        urls = xxs.select('//loc/text()').extract()
        for url in urls:
            if 'brands-sitemap.xml' in url:
                continue

            if 'productbrand' in url:
                prod_id = re.findall(r'productbrand_(\d+)\.html', url)
                prod_id = prod_id[0] if prod_id else ''
                if prod_id:
                    if prod_id in self.product_ids:
                        continue
                    else:
                        self.product_ids.append(prod_id)
                yield Request(url,
                              callback=self.parse_product,
                              meta={"dont_merge_cookies": True})
            else:
                yield Request(url, meta={"dont_merge_cookies": True})
Example #30
 def parse(self, response):
     xxs = XmlXPathSelector(response)
     for title in xxs.select("//item/title/text()").extract():
         log.msg(title)