def parse_item(self, response):
    """Extract one property listing from a 58.com results page.

    @url https://nj.58.com/ershoufang/pn3/?PGTID=0d30000c-000a-c568-cd81-f02b4ffbea21&ClickID=1
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=PropertiesItem(), response=response)

    # Listing fields pulled with XPath expressions.
    loader.add_xpath('title',
                     '//div[@class="list-info"][1]/h2[@class="title"]/a/text()')
    loader.add_xpath('price', '//p[@class="sum"][1]/b/text()')
    loader.add_xpath('description',
                     '//div[@class="list-info"][1]/p[@class="baseinfo"][1]//text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('address',
                     '//div[@class="list-info"][1]/p[@class="baseinfo"][2]/span//text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('image_urls', '//div[@class = "pic"][1]/a/img/@src')

    # Housekeeping fields describing the crawl itself.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse_item(self, selector, response):
    """Build one property item from a single listing *selector*.

    Relative ('.//') XPaths scope every field to the given selector so this
    can be invoked once per listing container on an index page; *response*
    supplies the base URL for resolving relative links.
    """
    l = ItemLoader(item=PropertiesItem(), selector=selector)

    # Fix: the original piped 'title' through MapCompose(str.strip, str.strip);
    # the second strip is a no-op and has been removed.
    l.add_xpath('title', './/*[@itemprop="name"][1]/text()',
                MapCompose(str.strip))
    l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float),
                re='[,.0-9]+')
    l.add_xpath('description', './/*[@itemprop="description"][1]/text()',
                MapCompose(str.strip), Join())
    l.add_xpath('address',
                './/*[@itemtype="http://schema.org/Place"][1]/*/text()',
                MapCompose(str.strip))

    # Image and detail URLs are page-relative; anchor them on the page URL.
    make_url = lambda i: urlparse.urljoin(response.url, i)
    l.add_xpath('image_urls', './/*[@itemprop="image"][1]/@src',
                MapCompose(make_url))

    # Housekeeping fields.
    l.add_xpath('url', './/*[@itemprop="url"][1]/@href', MapCompose(make_url))
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
def parse(self, response):
    """Parse the Hupu NBA voice index page into one item.

    @url https://voice.hupu.com/nba
    @returns items 1
    @scrapes title toptitle topnews news
    @scrapes url project spider server date
    """
    l = ItemLoader(item=PropertiesItem(), response=response)

    # Page fields.
    l.add_xpath('title', '/html/body/div[3]/div[1]/div[1]/h2/text()')
    l.add_xpath('news',
                '/html/body/div[3]/div[1]/div[2]/ul/li/div[1]/h4/a/text()',
                MapCompose(unicode.strip, unicode.title))
    l.add_xpath('toptitle', '//*[@class="hd"]/h2/text()')
    l.add_xpath('topnews', '//*[@class="bd"]//a/text()')

    # Housekeeping fields.  Fix: the original called
    # socket.gethostbyname('nba.hupu.com') here, which performs a blocking
    # DNS lookup for every item and records the *target* site's IP rather
    # than the crawling machine; use the local hostname like the rest of
    # the project.
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
    # Fix: the original ended with an unterminated triple-quoted string
    # after the return ("""学习代码 — "study code"), a syntax error; removed.
def parse_pageItem(self, response):
    """Yield one item per joke container and follow pagination links.

    Fixes over the original:
    - a single item was created once and yielded once after the loop, so
      only the last container's data survived; now one item per container.
    - the field XPaths started with '//', selecting page-wide instead of
      within the container; now relative ('.//').
    - pagination Requests were constructed but never yielded (dead code);
      now yielded so the crawl actually advances.
    """
    sel = Selector(response)
    for joke in sel.xpath('//div[@class="joke-list-item-main"]'):
        item = PropertiesItem()
        item['description'] = joke.xpath(
            './/div[@class="joke-main-content clearfix"]/p/text()').extract()
        item['image_urls'] = joke.xpath(
            './/a/img[@class="joke-main-img"]/@src').extract()
        # "g" = good/upvote counter on the voting widget.
        item['vot'] = joke.xpath(
            './/div/div/div/a[@data="g"]/text()').extract()
        yield item

    # Follow every "next page" link.
    for href in response.xpath(
            '//a[@class="pagination-link pagination-next"]/@href').extract():
        yield Request('https://www.haha.mx' + href,
                      callback=self.parse_pageItem, method='GET')
def parse_item(self, response):
    """ This function parses a property page
    @url http://localhost:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    ldr = ItemLoader(item=PropertiesItem(), response=response)

    # 'title' arrives via request meta (set by the index-page callback).
    ldr.add_value('title', response.meta['title'],
                  MapCompose(str.strip, str.title))

    # Page-scraped fields.
    ldr.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                  MapCompose(lambda i: i.replace(',', ''), float),
                  re='[,.0-9]+')
    ldr.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                  MapCompose(str.strip, lambda i: i.replace('\r\n', ' ')))
    ldr.add_xpath('address',
                  '//*[@itemtype="http://schema.org/Place"][1]/text()',
                  MapCompose(str.strip))
    ldr.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                  MapCompose(lambda i: parse.urljoin(response.url, i)))

    # Housekeeping fields.
    ldr.add_value('url', response.url)
    ldr.add_value('project', self.settings.get('BOT_NAME'))
    ldr.add_value('spider', self.name)
    ldr.add_value('server', socket.gethostname())
    ldr.add_value('date', datetime.datetime.now())
    yield ldr.load_item()
def parse_item(self, response):
    """ This function parses a property page.
    @url http://pdrfinessetools.com/index.php?route=extension/list/latest
    @returns items 1
    @scrapes product_title price description product_code image_urls
    @scrapes url project spider server date
    """
    to_abs = lambda i: urllib.parse.urljoin(response.url, i)
    loader = ItemLoader(item=PropertiesItem(), response=response)

    # Product fields.
    loader.add_xpath('product_title', '//*[@id="content"]//h1/text()',
                     MapCompose(str.strip, str.title))
    loader.add_xpath('price', '//*[@class="list-unstyled"]//h2/text()')
    loader.add_xpath('description', '//*[@id="tab-description"]//text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('product_code',
                     '//*[@id="content"]//*[@class="list-unstyled"]//li[2]/text()',
                     MapCompose(str.strip))
    loader.add_xpath('image_urls',
                     '//*[@id="content"]//a[@class="thumbnail"]//img/@src',
                     MapCompose(to_abs))

    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse(self, response):
    """Grab the listing title and the numeric price from the page."""
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_xpath("title", '//*[@class="title"]/text()')
    loader.add_xpath("price", '//*[@class="trl-item sty1"]/i/text()',
                     MapCompose(float))
    return loader.load_item()
def parse(self, response):
    """Scrape one schema.org-annotated property page into an item."""
    loader = ItemLoader(item=PropertiesItem(), response=response)

    # Listing fields.
    loader.add_xpath('title', '//*[@itemprop="name"]/text()',
                     MapCompose(str.strip, str.title))
    loader.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//*[@itemprop="description"]/text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('address',
                     '//*[@itemtype="http://schema.org/Place"]/span/text()',
                     MapCompose(str.strip))
    loader.add_xpath('image_urls', '//*[@itemprop="image"]/@src',
                     MapCompose(lambda i: urllib.parse.urljoin(response.url, i)))

    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse_item(self, response):
    """
    @url http://localhost:9312/properties/property_000001.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    loader = ItemLoader(item=PropertiesItem(), response=response)

    # Page fields (Python 2 codebase: unicode.* processors, urlparse).
    loader.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                     MapCompose(unicode.strip, unicode.title))
    loader.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                     MapCompose(unicode.strip), Join())
    loader.add_xpath('address',
                     '//*[@itemtype="http://schema.org/Place"][1]/text()',
                     MapCompose(unicode.strip))
    loader.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                     MapCompose(lambda i: urlparse.urljoin(response.url, i)))

    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', gethostname())
    loader.add_value('date', datetime.now())
    return loader.load_item()
def parse_item(self, response):
    """ This function parses a property page.
    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    # Defensive log: a falsy response would mean the download failed.
    if not response:
        self.log("RESPONSE IS NONE")

    loader = ItemLoader(item=PropertiesItem(), response=response)

    # Page fields.
    loader.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                     MapCompose(str.strip, str.title))
    loader.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                     MapCompose(str.strip), Join())
    loader.add_xpath('address',
                     '//*[@itemtype="http://schema.org/Place"][1]/text()',
                     MapCompose(str.strip))
    loader.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                     MapCompose(lambda i: urljoin(response.url, i)))

    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse_item(self, response):
    """Extract title, price, and the first big image from a detail page."""
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_xpath("title", '//*[@class="tab-cont clearfix"]/h1/text()')
    loader.add_xpath("price", '//*[@class="trl-item sty1"]/i/text()',
                     MapCompose(float))
    loader.add_xpath("image_url", '//*[@class="bigImg"]/img[1]/@src')
    return loader.load_item()
def parse(self, response):
    """ This function parses a property page.
    @url https://www.gumtree.com/p/property-to-rent/one-bedroom-property-near-chiswick-park-tube-station./1405437559
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project server spider date
    """
    ldr = ItemLoader(item=PropertiesItem(), response=response)

    # Advert fields.
    ldr.add_xpath('title', '//h1[@class="css-1uk1gs8 e1pt9h6u3"]/text()',
                  MapCompose(str.strip))
    ldr.add_xpath('price', '//h2[@itemprop="price"]/text()',
                  MapCompose(lambda i: i.replace(',', ''), float),
                  re='[,.0-9]+')
    ldr.add_xpath('description', '//p[@itemprop="description"]/text()',
                  MapCompose(str.strip), Join())
    ldr.add_xpath('address', '//h4[@itemprop="addressLocality"]/text()',
                  MapCompose(str.strip))
    ldr.add_xpath('image_urls', '//*[@class="carousel-item"]/img/@src',
                  MapCompose(lambda i: urljoin(response.url, i)))

    # Housekeeping fields.
    ldr.add_value('url', response.url)
    ldr.add_value('project', self.settings.get('BOT_NAME'))
    ldr.add_value('spider', self.name)
    ldr.add_value('server', socket.gethostname())
    ldr.add_value('date', datetime.datetime.now())
    return ldr.load_item()
def parse_item(self, response):
    """Extract title/price and pass through the next-page URL from meta."""
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_xpath("title", '//*[@class="tab-cont clearfix"]/h1/text()')
    loader.add_xpath("price", '//*[@class="trl-item sty1"]/i/text()',
                     MapCompose(float))
    # The index-page callback stashed the follow-up URL in request meta.
    loader.add_value("nextpage_url2", response.meta["urlll"])
    return loader.load_item()
class BasicSpider(scrapy.Spider):
    """Minimal spider scraping a title paragraph from one Facebook blog post."""

    name = "basictest"
    allowed_domains = ["web"]
    start_urls = (
        'https://developers.facebook.com/blog/post/2021/01/26/introducing-instagram-content-publishing-api/?utm_source=email&utm_medium=fb4d-newsletter-february21&utm_campaign=organic&utm_offering=business-tools&utm_product=instagram&utm_content=body-button-instagram-graph-API&utm_location=2',
    )

    def parse(self, response):
        """
        @url https://developers.facebook.com/blog/post/2021/01/26/introducing-instagram-content-publishing-api/?utm_source=email&utm_medium=fb4d-newsletter-february21&utm_campaign=organic&utm_offering=business-tools&utm_product=instagram&utm_content=body-button-instagram-graph-API&utm_location=2
        @returns items 1
        @scrapes title
        @scrapes url project
        """
        # Fixes: the contract line read '@return item 1', which Scrapy's
        # contract parser does not recognise ('@returns items 1' is the
        # valid form), and '@scrapes title price' claimed 'price' although
        # the price loader call was commented out and never ran.
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath(
            'title',
            '/html/body/div[1]/div[5]/div[2]/div/div/div/div[2]/div[2]/div[2]/div[1]/div/div/div[2]/div/div/p[1]/text()',
            MapCompose(unicode.strip, unicode.title))

        # Housekeeping fields.
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()
def parse_item(self, response):
    """Build a property item from a classified-ad page (Python 2 idioms)."""
    absolutize = lambda href: urlparse.urljoin(response.url, href)
    loader = ItemLoader(item=PropertiesItem(), response=response)

    # Ad fields.
    loader.add_xpath('title', '//h1[@id="ad-title"]/text()',
                     MapCompose(unicode.strip, unicode.title))
    loader.add_xpath('price', '//strong[contains(@class, "ad-price")]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[,.0-9]+')
    loader.add_xpath('description', '//p[@class="ad-description"][1]/text()',
                     MapCompose(unicode.strip), Join())
    loader.add_xpath('address', '//span[@itemprop="address"]/text()',
                     MapCompose(unicode.strip))
    loader.add_xpath('image_urls',
                     '/descendant::img[@itemprop="image"][1]/@src',
                     MapCompose(absolutize))

    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse(self, response):
    """Collect star ids and gender codes from the hot-star listing page.

    Returns a list of items, one per profile link found on the page.
    Gender encoding: 1 = male (男), 0 = female (女), -1 = unknown.
    """
    hrefs = response.xpath(
        '//div[@class="star_hotstar"]//a[@class="name_hotstar"]/@href'
    ).extract()
    return_items = []
    for star_href in hrefs:  # renamed from `id`, which shadowed the builtin
        try:
            item = PropertiesItem()
            # The numeric id is embedded in the profile URL.
            star_id = re.findall(r"\d+", star_href)[0]
            # Look up the sibling link right after this star's name link,
            # which carries the gender text.
            sex = response.xpath(
                '//div[@class="star_hotstar"]//a[@class="name_hotstar" and @href="http://www.happyjuzi.com/star-{}/"]/following-sibling::a[1]/text()'
                .format(star_id)).extract_first()
            item['id'] = star_id
            if sex == '男':
                item['sex'] = 1
            elif sex == '女':
                item['sex'] = 0
            else:
                item['sex'] = -1
            print('id:{}'.format(star_id))
            return_items.append(item)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; a malformed href just skips this entry.
            print('error')
    return return_items
def parse(self, response):
    """Extract price/title/address with an ItemLoader (CSS + XPath selectors).

    Yields a single item per page, plus crawl housekeeping values.
    """
    # An ItemLoader lets us attach input/output processors instead of
    # post-processing raw .extract() results by hand.
    dl = ItemLoader(item=PropertiesItem(), response=response)

    # CSS selector keyed on the 'price' class (equivalent to targeting
    # '#baseinfo_top_layout strong::text' by id).
    dl.add_css('price', '.price strong::text')

    # XPath selectors for the remaining page fields.
    dl.add_xpath('title', '//*[@id="baseinfo_top_layout"]/div[3]/div[2]/div/div[1]/h1/text()')
    dl.add_xpath('address', '//*[@id="baseinfo_top_layout"]/div[3]/div[2]/div/ul[2]/li[1]/text()')

    # add_value() records computed Python values rather than selector output.
    dl.add_value('url', response.url)
    dl.add_value('spider', self.name)
    dl.add_value('server', socket.gethostname())
    dl.add_value('h_date', datetime.datetime.now())

    # Fix: the original called load_item() twice (once for print, once for
    # yield), running the output processors twice; build the item once.
    item = dl.load_item()
    print(item)
    yield item
def parse(self, response):
    """Scrape a product page into an item (Python 2 processors)."""
    loader = ItemLoader(item=PropertiesItem(), response=response)

    # Product fields.
    loader.add_xpath('title', '//*[@id="productTitle"][1]/text()',
                     MapCompose(unicode.strip, unicode.title))
    loader.add_xpath('price', '//*[@id="priceblock_ourprice"][1]/text()',
                     MapCompose(lambda i: i.replace(',', ''), float),
                     re='[.0-9]+')
    loader.add_xpath('description', '//*[@id="productDescription"]//p/text()',
                     MapCompose(unicode.strip), Join())
    loader.add_xpath('availability', '//*[@id="availability"]//span/text()',
                     MapCompose(unicode.strip))
    loader.add_xpath('image_urls', '//*[@id="imgTagWrapperId"][1]/@src',
                     MapCompose(lambda i: urlparse.urljoin(response.url, i)))

    # Housekeeping fields.
    loader.add_value('url', response.url)
    loader.add_value('project', self.settings.get('BOT_NAME'))
    loader.add_value('spider', self.name)
    loader.add_value('server', socket.gethostname())
    loader.add_value('date', datetime.datetime.now())
    return loader.load_item()
def parse(self, response):
    """Turn each table row into an item, then follow the next-index link."""
    for row in response.xpath('//tbody/tr[@id]'):
        ldr = ItemLoader(item=PropertiesItem(), selector=row,
                         response=response)
        ldr.add_xpath('build', './td[3]/text()')
        ldr.add_xpath('city', './td/text()[following-sibling::br]')
        ldr.add_xpath('house_number', './td/a/strong/text()', re='[0-9]+.')
        ldr.add_xpath('living_space', './td[4]/text()')
        ldr.add_xpath('plot_space', './td[5]/text()')
        ldr.add_xpath('postal_code', './td/text()[following-sibling::br]')
        ldr.add_xpath('price', './td/strong/text()')
        ldr.add_xpath('street', './td/a/strong/text()',
                      re=r'\D+\S[A-Za-z][^- ]')
        yield ldr.load_item()

    # Pagination: follow the first "volgende" (next) link when present.
    next_hrefs = response.xpath(
        '//a[contains(@class, "volgende")]//@href').extract()
    if next_hrefs:
        yield response.follow(next_hrefs[0], callback=self.parse)
def parse_item(self, response):
    """Yield a minimal item holding the page URL and its <title> text.

    Fix: the original indexed extract()[0], which raises IndexError on any
    page without a <title>; extract_first() yields None instead.
    """
    item = PropertiesItem()
    item['url'] = response.url
    item['title'] = response.xpath('/html/head/title/text()').extract_first()
    yield item
def parse(self, response):
    """Parse a property page into one item (contract-tested).

    @url http://web:3912/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    # Horizontal (next-page) and vertical (detail-page) URL extraction,
    # made absolute against the current page URL.
    next_urls = response.xpath('//*[contains(@class,"next")]//@href').extract()
    abs_next_urls = [urlparse.urljoin(response.url, u) for u in next_urls]
    item_urls = response.xpath('//*[@itemprop="url"]/@href').extract()
    abs_item_urls = [urlparse.urljoin(response.url, u) for u in item_urls]

    # Log the primary fields.  Fixes: the price XPath was malformed
    # ('//*[@itemprop="price"[1]...' — missing ']'), and the address used
    # @itemprop where the schema.org URL belongs on @itemtype.
    self.log("title: %s" % response.xpath('//*[@itemprop="name"][1]/text()').extract())
    self.log("price: %s" % response.xpath('//*[@itemprop="price"][1]/text()').re('[.0-9]+'))
    self.log("description: %s" % response.xpath('//*[@itemprop="description"][1]/text()').extract())
    self.log("address: %s" % response.xpath('//*[@itemtype="http://schema.org/Place"][1]/text()').extract())
    self.log("image_urls: %s" % response.xpath('//*[@itemprop="image"][1]/@src').extract())

    # Populate the item through a loader with processors.  Fix: the original
    # added every field twice (once bare, once processed), which duplicates
    # values in the item; only the processed set is kept.
    L = ItemLoader(item=PropertiesItem(), response=response)
    L.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                MapCompose(unicode.strip, unicode.title))
    L.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float),
                re='[.0-9]+')
    L.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                MapCompose(unicode.strip), Join())
    L.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()',
                MapCompose(unicode.strip))
    L.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                MapCompose(lambda i: urlparse.urljoin(response.url, i)))

    # Housekeeping fields.  Fixes: 'reponse.url' NameError typo, and
    # socket.gethostname was referenced without being called.
    L.add_value('url', response.url)
    L.add_value('project', self.settings.get('BOT_NAME'))
    L.add_value('spider', self.name)
    L.add_value('server', socket.gethostname())
    L.add_value('date', datetime.datetime.now())
    return L.load_item()
def parse(self, response):
    """Collect page titles and woshipm-hosted image URLs."""
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_xpath("title", "//*[@title]/@title")
    loader.add_xpath("image_urls",
                     "//*[starts-with(@src,'http://image.woshipm.com')]/@src",
                     MapCompose(lambda i: i.strip()))
    return loader.load_item()
def parse_item(self, selector, response):
    """Load one listing from *selector*; links resolve against *response*."""
    to_abs = lambda href: urllib.parse.urljoin(response.url, href)
    ldr = ItemLoader(item=PropertiesItem(), selector=selector)
    ldr.add_xpath("title", './/*[@class="title"]/a/text()')
    ldr.add_xpath("price", './/*[@class="price"]/text()', MapCompose(float))
    ldr.add_xpath("item_url", './/*[@class="title"]/a/@href',
                  MapCompose(to_abs))
    # Remember which index page this listing came from.
    ldr.add_value("nextpage_url", response.url)
    return ldr.load_item()
def parse_item(self, response):
    """Pull the lead-image URL (made absolute) from an article page."""
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_xpath('image_urls',
                     '//*[@class="artical-importantPic"][1]/img/@src',
                     MapCompose(lambda i: urlparse.urljoin(response.url, i)))
    return loader.load_item()
def parse2(self, response):
    """Print every text box found, then emit one fixed placeholder item."""
    for box in response.xpath('//*[@class="txt-box"]').extract():
        print('hi', box)
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_value("title", '题目')
    loader.add_value("describe", '介绍')
    return loader.load_item()
def parse_item(self, response):
    """Load 'link' and 'text' fields into a PropertiesItem.

    NOTE(review): the 'text' XPath is still the literal placeholder string
    'xpath表达式' ("xpath expression") — it must be replaced with a real
    selector before this spider can extract anything.
    """
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_value('link', response.url)
    # Fix: the original passed unicode.titile (typo), which raises
    # AttributeError as soon as a value flows through the processor chain.
    loader.add_xpath('text', 'xpath表达式',
                     MapCompose(unicode.strip, unicode.title), Join())
    return loader.load_item()
def parse(self, response):
    """Emit one item carrying only crawl housekeeping metadata."""
    self.log("I am running")
    item = PropertiesItem()
    housekeeping = {
        'url': response.url,
        'project': self.settings.get('BOT_NAME'),
        'spider': self.name,
        'server': socket.gethostname(),
        'date': datetime.datetime.now(),
    }
    for field, value in housekeeping.items():
        item[field] = value
    yield item
def parse(self, response):
    """Extract a star's key image URL plus the numeric id from the URL.

    On success the item has 'id' and 'key_src'; if the image cannot be
    found the item carries only 'id' (matching the original fallback).
    """
    # The id extraction was duplicated in both the try and except branches;
    # compute it once up front (raw-string regex fixes the \d escape warning).
    star_id = re.findall(r"\d+", response.request.url)[0]
    item = PropertiesItem()
    item['id'] = star_id
    try:
        img_src = response.xpath(
            '//img[@class="i_starimg_starindex"]/@src').extract_first()
        # Drop any query string so the stored source URL is canonical.
        if '?' in img_src:
            img_src = img_src[:img_src.index('?')]
        item['key_src'] = img_src
        print('success:{}'.format(item['id']))
    except Exception:
        # Narrowed from a bare `except:`; e.g. img_src is None when the
        # image element is missing — fall back to an id-only item.
        pass
    return item
def parse(self, response):
    """ This function parses a property page.
    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title
    """
    item = PropertiesItem()
    item['title'] = response.xpath('//title/text()').extract()
    print(item['title'])
    # Fix: the original never returned the item, so the spider produced no
    # output and the '@returns items 1' contract could never pass; the
    # '@scrapes' list also claimed fields (price, address, ...) that were
    # never populated and has been trimmed to the one actually scraped.
    return item
def parse_item(self, response):
    """Collect the page URL and the vulnerability-card text."""
    print(response.url)
    loader = ItemLoader(item=PropertiesItem(), response=response)
    loader.add_value('link', response.url)
    loader.add_xpath('text', '//div[@class="vulners-card-text"]//text()',
                     MapCompose(str.strip), Join())
    return loader.load_item()