def parse(self, response):
    """Yield an ``AdItem`` for each ``<li>`` of the ``product-layout`` list.

    Fields are extracted best-effort: whenever a lookup or conversion
    raises, the affected field(s) fall back to ``""`` (``0`` for
    ``postal_code``) and the item is still yielded.
    """
    blank = ""

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//ol[@class="product-layout"]/li'):
        ad = AdItem()
        ad['source'] = "eloue"
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(node, "@name"))
        # The image address is the piece of the inline style located before
        # the first ')' and after the last ':' of that piece.
        ad['media'] = attempt(
            lambda: "https:" + first(
                node, 'div/div/a/img/@style').split(')')[0].split(':')[-1])
        ad['url'] = attempt(
            lambda: self.allowed_domains[0] + first(node, 'div/div/a/@href'))
        ad['description'] = attempt(lambda: first(
            node,
            'div/div[@class="info"]/p[@class="full_description"]/text()'))

        # Location and postal code stand or fall together: if the postal
        # code cannot be parsed, the location is blanked as well.
        try:
            place = first(node, 'div/div[@class="info"]/p/text()')
            zip_code = int(place.split(', ')[1])
        except:
            place, zip_code = blank, 0
        ad['location'] = place
        ad['postal_code'] = zip_code

        ad['latitude'] = attempt(lambda: first(node, "@locationx"))
        ad['longitude'] = attempt(lambda: first(node, "@locationy"))

        # The badge text is split on '/': amount first, period second.
        try:
            parts = first(
                node, 'div/div/span[@class="badge price"]/text()').split('/')
            amount = parts[0].strip(' ').encode('utf-8').strip('€')
            span = parts[1]
            unit = "€"
        except:
            amount = span = unit = blank
        ad['price'] = amount
        ad['period'] = span
        ad['currency'] = unit

        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<item>`` element carrying an ``id``.

    Fields are read best-effort: a lookup that raises leaves the field as
    ``""``.  ``period`` is always ``"jour"``.
    """
    blank = ""

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    def own_or_search(scope, own_query, search_query):
        # Use the item's own coordinate when it is longer than one
        # character, otherwise the one found under ``/search``.
        own = first(scope, own_query)
        if len(own) > 1:
            return own
        return first(scope, search_query)

    for node in response.xpath('//item[@id]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(node, 'title/text()'))
        ad['media'] = attempt(lambda: first(node, 'image/palm/@url'))
        ad['url'] = attempt(lambda: first(node, 'link/text()'))
        ad['description'] = attempt(lambda: first(node, 'subtitle/text()'))
        ad['location'] = attempt(
            lambda: first(node, 'location/locality/text()'))
        ad['postal_code'] = attempt(
            lambda: first(node, 'location/postal_code/text()'))
        ad['latitude'] = attempt(lambda: own_or_search(
            node, 'location/lat/text()', '/search/lat/text()'))
        ad['longitude'] = attempt(lambda: own_or_search(
            node, 'location/lng/text()', '/search/lng/text()'))
        ad['price'] = attempt(lambda: first(node, 'price/text()'))
        ad['currency'] = attempt(lambda: first(node, 'price/@currency'))
        ad['evaluations'] = attempt(
            lambda: first(node, 'evaluation_number/text()'))
        ad['period'] = "jour"
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<li>`` of the ``results`` list.

    Fields are read best-effort: a lookup that raises leaves the field(s)
    concerned as ``''``.  Latitude and longitude are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath("//ul[@id='results']/li"):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node, "div/div[2]/div[@class='offer__details']/h3/a/text()"))
        ad['media'] = attempt(
            lambda: first(node, 'div/div/a/img[2]/@data-src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0] + first(node, 'div/div/a/@href'))
        ad['description'] = attempt(lambda: first(
            node,
            "div/div[2]/div[@class='offer__details']/div[@class='offer__description']/text()"))
        ad['location'] = attempt(lambda: first(
            node,
            "div/div[2]/div[@class='offer__details']/div[@class='offer__subtitle']/text()"))
        ad['latitude'] = blank
        ad['longitude'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node,
                "div/div[2]/div[@class='price price--mini js-price-per-night']/div/text()[2]"
            ).strip('\n').encode('utf-8').strip('€')
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(
            node,
            "div/div[2]/div[@class='price price--mini js-price-per-night']/div[2]/text()"))
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<li>`` of the ``users`` list.

    Every query is evaluated relative to the current ``<li>``.  Fields are
    read best-effort: a lookup that raises leaves the field(s) concerned as
    ``''``.  Latitude and longitude are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Any ordinary failure means "use the blank fallback"; interpreter
        # exits and keyboard interrupts are left to propagate.
        try:
            return getter()
        except Exception:
            return blank

    for node in response.xpath('//ul[@class="users"]/li'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node, 'div/div[@class="content"]/h3[@class="orange"]/a/text()'))
        ad['media'] = attempt(lambda: first(node, 'div/a/img/@src'))
        ad['url'] = attempt(lambda: first(
            node, 'div/div[@class="content"]/h3[@class="orange"]/a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'div/div[@class="content"]/p[@class="description"]/text()'))
        ad['location'] = attempt(lambda: first(
            node,
            'div/div[@class="content"]/div[@class="map_information"]/p/text()[2]'))
        ad['latitude'] = blank
        ad['longitude'] = blank

        # Relative paths must run against the current node: evaluated on
        # the response they start from the document root and cannot reach
        # this listing's own price/period.
        try:
            amount = first(
                node, 'div/p[@class="price orange"]/text()'
            ).split('/')[0].encode('utf-8').strip('€')
            unit = "€"
        except Exception:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(
            node, 'div/div[@class="content"]/p[@class="meta"]/text()[3]'))
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div>`` that has a ``data-id``.

    Most fields are read best-effort and fall back to ``''`` (``0`` for
    ``evaluations``).  The ``@data-name`` description fallback and the
    ``@data-lat`` / ``@data-lng`` lookups are not guarded and raise when
    the attribute is missing.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@data-id]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['title'] = attempt(lambda: first(node, '@data-name'))
        ad['media'] = attempt(lambda: first(node, 'div/a/div/img/@src'))
        # The query string is dropped from the listing URL.
        ad['url'] = attempt(
            lambda: self.allowed_domains[0]
            + first(node, '@data-url').split('?')[0])

        # Without a description link, the listing name is used instead.
        try:
            summary = first(
                node, 'div[2]/div/div[@itemprop="description"]/a/text()')
        except:
            summary = first(node, '@data-name')
        ad['description'] = summary

        if "Chambre" in summary:
            ad['subcategory'] = "room"
        else:
            ad['subcategory'] = "apartment"

        # ``pattern`` comes from outside this method; the first match in
        # the description, if any, is stored as an integer.
        try:
            hit = re.search(pattern, summary)
            rating = int(hit.group()) if hit else 0
        except:
            rating = 0
        ad['evaluations'] = rating

        ad['latitude'] = first(node, '@data-lat')
        ad['longitude'] = first(node, '@data-lng')
        # Location is the URL-decoded tail of the request path after 's/'.
        ad['location'] = attempt(lambda: urllib.unquote(
            response.url.split('?')[0].split('s/')[-1]))
        ad['postal_code'] = 0

        # Price and currency are set together or blanked together.
        try:
            amount = first(node, 'div/a[2]/div/span/text()')
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = "nuit"
        yield ad
def parse(self, response):
    """Print the response URL, then yield an ``AdItem`` per ``loginbox`` div.

    Fields are read best-effort and fall back to ``''``; ``media`` falls
    back to a fixed icon path under the first allowed domain.  The location
    is derived from the last path segment of the response URL and is used
    as the key into ``self.geo_cities`` for the coordinates.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    # Parenthesised single argument: prints the same text whether ``print``
    # is a statement or a function.
    print(response.url)
    for node in response.xpath("//div[@id='loginbox']"):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(
            lambda: first(node, 'div/div/div/div/p/a/text()'))
        # Last URL segment, without whatever follows its first '.'.
        ad['location'] = attempt(
            lambda: response.url.split('/')[-1].split('.')[0])
        ad['postal_code'] = 0

        try:
            picture = first(node, 'div[@class="detail"]/img/@src')
        except:
            picture = (self.allowed_domains[0]
                       + "/images/parking-orange-26x26.png")
        ad['media'] = picture

        ad['url'] = attempt(
            lambda: self.allowed_domains[0]
            + first(node, "div/div/div/div/p/a/@href"))
        ad['description'] = attempt(
            lambda: first(node, 'div/div/div/div/span/text()')
            + ", "
            + first(node, 'div/div/div/div/span[2]/text()'))
        ad['latitude'] = attempt(
            lambda: float(self.geo_cities[ad['location']]['lat']))
        ad['longitude'] = attempt(
            lambda: float(self.geo_cities[ad['location']]['lon']))

        # The third span holds "<amount>€/<period>"-style text: the amount
        # is whatever precedes '€' in the part before the first '/'.
        try:
            amount = first(
                node, "div/div/div/div/span[3]/text()"
            ).split('/')[0].encode('utf-8').split('€')[0]
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(
            node, "div/div/div/div/span[3]/text()").split('/')[1])
        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div class="card">``.

    Fields are read best-effort: a lookup or conversion that raises leaves
    the field(s) concerned as ``''`` (``0`` for ``postal_code``).  Latitude,
    longitude and evaluations are always ``''``; ``period`` is always
    ``"jour"``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter, fallback=blank):
        # Any ordinary failure means "use the fallback"; interpreter exits
        # and keyboard interrupts are left to propagate.
        try:
            return getter()
        except Exception:
            return fallback

    for node in response.xpath('//div[@class="card"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node, 'div[@class="content"]/div[@class="vehicle-info"]/p/text()'))
        # A missing picture blanks ``media`` only; the title extracted just
        # above must survive.
        ad['media'] = attempt(
            lambda: first(node, 'a/div[@class="image"]/img/@src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0] + first(node, 'a/@href'))
        ad['description'] = attempt(lambda: first(
            node,
            'div[@class="content"]/div[@class="vehicle-info"]/p[@class="description"]/text()'
        ).strip("\n "))
        ad['location'] = attempt(lambda: first(
            node,
            'div[@class="content"]/div[@class="vehicle-info"]/p[@class="city"]/strong/@title'))
        # The postal code is the integer found between '(' and ')'.
        ad['postal_code'] = attempt(
            lambda: int(first(
                node,
                'div[@class="content"]/div[@class="vehicle-info"]/p[@class="city"]/span/text()'
            ).split(')')[0].split('(')[1]),
            0)
        ad['latitude'] = blank
        ad['longitude'] = blank
        ad['evaluations'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node,
                'a/div[@class="image"]/span[@class="price"]/strong/text()'
            ).encode('utf-8').strip('€')
            unit = "€"
        except Exception:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = "jour"
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div class="block">``.

    Fields are read best-effort and fall back to ``''``.  Coordinates are
    looked up in ``self.geo`` using the extracted location as the key.
    ``postal_code`` is always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@class="block"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(
            lambda: first(node, 'div/h2/a/text()').strip("\n "))
        ad['media'] = attempt(
            lambda: self.allowed_domains[0] + first(node, 'a/img/@src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0] + first(node, 'a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'div/div[@class="boat-info"]/text()').strip("\n "))
        ad['evaluations'] = attempt(lambda: first(
            node,
            'div/div[@class="boat-skipper"]/div[@class="nb-commentaires"]/span[@class="nb-com"]/text()'
        ).strip("\n "))
        ad['location'] = attempt(
            lambda: first(node, 'div/div/h4/strong/text()').strip(' -'))
        ad['latitude'] = attempt(lambda: self.geo[ad['location']]['lat'])
        ad['longitude'] = attempt(lambda: self.geo[ad['location']]['lon'])

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node, 'div[@class="hosting-meta"]/div/span/strong/text()'
            ).encode('utf-8').strip('€')
            unit = '€'
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(node, 'div[3]/span/text()'))
        ad['postal_code'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div class="box-parking-dispo">``.

    Fields are read best-effort and fall back to ``""`` (``0`` for
    ``postal_code``).  The postal code is obtained by passing the title to
    ``searchZip``; coordinates come from ``self.geo``, keyed by the
    second-to-last comma-separated part of the location.
    """
    blank = ""

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@class="box-parking-dispo"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory

        # Title and postal code stand or fall together.
        try:
            heading = first(node, 'div/span[@class="title-parking"]/text()')
            zip_code = searchZip(heading)
        except:
            heading, zip_code = blank, 0
        ad['title'] = heading
        ad['postal_code'] = zip_code

        ad['media'] = attempt(
            lambda: self.allowed_domains[0] + first(
                node,
                'div/div/div[@class="detail-parking-left"]/div/img/@src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0] + first(
                node,
                'div/div/div[@class="detail-parking-right"]/div[2]/a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'div/div/div[@class="detail-parking-left"]/div/img/@alt'))
        ad['location'] = attempt(
            lambda: first(node, 'div/div/div/div/h1/span/text()'))
        ad['latitude'] = attempt(lambda: self.geo[
            ad['location'].split(',')[-2].strip(' ')]['lat'])
        ad['longitude'] = attempt(lambda: self.geo[
            ad['location'].split(',')[-2].strip(' ')]['lon'])

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node,
                'div/div/div[@class="detail-parking-right"]/div/span/span/text()'
            ).encode('utf-8').strip('€')
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(
            node,
            'div/div/div[@class="detail-parking-right"]/div/span/text()'
        ).strip('/'))
        # First match of ``self.pattern`` inside the span text.
        ad['evaluations'] = attempt(lambda: re.search(
            self.pattern,
            first(
                node,
                'div/div/div[@class="detail-parking-left"]/div/div/span/text()')
        ).group())
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``rentResult ad-list-item`` div.

    Every query is evaluated relative to the current result div.  Fields
    are read best-effort: a lookup that raises leaves the field(s)
    concerned as ``''``, so one incomplete ad does not abort the rest of
    the page.  ``postal_code`` is always ``0`` and ``evaluations`` ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Any ordinary failure means "use the blank fallback"; interpreter
        # exits and keyboard interrupts are left to propagate.
        try:
            return getter()
        except Exception:
            return blank

    for node in response.xpath('//div[@class="rentResult ad-list-item"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(
            lambda: first(node, 'div[@class="detail"]/img/@alt'))
        ad['media'] = attempt(
            lambda: first(node, 'div[@class="detail"]/img/@src'))
        ad['url'] = attempt(
            lambda: first(node, 'div[@class="detail"]/meta/@content'))
        ad['description'] = attempt(
            lambda: first(node, 'div[@class="detail"]/div/p/span/text()'))
        # Relative path: a query starting with '//' would search the whole
        # document and return the first ad's location for every item.
        ad['location'] = attempt(lambda: first(
            node,
            'div[@class="detail"]/div/div[@itemprop="address"]/span[@class="location"]/span/text()'))
        ad['postal_code'] = 0
        ad['latitude'] = attempt(lambda: first(
            node,
            'div[@class="detail"]/div/div[@itemprop="geo"]/meta[@itemprop="latitude"]/@content'))
        ad['longitude'] = attempt(lambda: first(
            node,
            'div[@class="detail"]/div/div[@itemprop="geo"]/meta[@itemprop="longitude"]/@content'))

        def cell(row, column):
            # Text of one cell of the pricing table (1-based column).
            return first(node, 'table/%s/td[%d]/text()' % (row, column))

        # Second table row: three amounts, joined into one string.
        try:
            amounts = [
                cell('tr[2]', column).encode('utf-8').strip('€')
                for column in (1, 2, 3)
            ]
            price = amounts[0] + ", " + amounts[1] + ", " + amounts[2]
            unit = "€"
        except Exception:
            price = unit = blank
        ad['price'] = price
        ad['currency'] = unit

        # The period labels come from the same three columns, queried
        # without a row index.
        ad['period'] = attempt(
            lambda: cell('tr', 1) + ", " + cell('tr', 2) + ", "
            + cell('tr', 3))
        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for every ``<tr>`` in the response.

    Fields are read best-effort and fall back to ``''``.  Title and
    location are space-stripped and title-cased; coordinates are looked up
    in ``self.geo`` by location.  Media, price, currency, postal code and
    evaluations are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//tr'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node, 'td[@id="colonne4"]/div[@id="title_ad"]/a/text()'
        ).strip(' ').title())
        ad['media'] = blank
        ad['url'] = attempt(lambda: first(
            node, 'td[@id="colonne4"]/div[@id="title_ad"]/a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'td[@id="colonne4"]/div[@id="text_ad"]/a/text()'))
        ad['location'] = attempt(lambda: first(
            node, 'td[@id="colonne3"]/text()').strip(' ').title())
        ad['latitude'] = attempt(lambda: self.geo[ad['location']]['lat'])
        ad['longitude'] = attempt(lambda: self.geo[ad['location']]['lon'])
        ad['price'] = blank
        ad['currency'] = blank
        ad['period'] = attempt(
            lambda: first(node, 'td[@id="colonne5"]/div/text()'))
        ad['postal_code'] = blank
        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each entry of the JSON body's ``"ads"`` list.

    ``category`` and ``subcategory`` are copied without a guard and raise
    if absent.  The remaining fields are copied best-effort and fall back
    to ``""``; ``currency`` is ``"EUR"`` whenever a price is present.
    """
    blank = ""
    # Fields copied verbatim from the record under the same key.
    mirrored = ('title', 'media', 'url', 'description', 'location',
                'latitude', 'longitude')

    payload = json.loads(response.body_as_unicode())
    for record in payload["ads"]:
        ad = AdItem()
        ad['category'] = record['category']
        ad['subcategory'] = record['subcategory']

        for field in mirrored:
            try:
                ad[field] = record[field]
            except:
                ad[field] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = record["price"]
            unit = "EUR"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        try:
            ad['period'] = record["period"]
        except:
            ad['period'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``itemInside event-box p15`` div.

    Fields are read best-effort and fall back to ``''``.  ``description``
    and ``period`` are read from the same ``dateHeureEvent`` link text.
    Latitude and longitude are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@class="itemInside event-box p15"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(node, 'a/img/@title'))
        ad['media'] = attempt(lambda: first(node, 'a/img/@src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0]
            + first(node, 'div/div/h2/a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'div[@class="dateHeureEvent"]/a/text()'))
        ad['location'] = attempt(lambda: first(
            node,
            'div[@class="author"]/div[@class="authorRight"]/a[2]/text()'))
        ad['latitude'] = blank
        ad['longitude'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(node, "div/div[2]/div/text()").strip(
                '\n').encode('utf-8').strip('€')
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(
            node, 'div[@class="dateHeureEvent"]/a/text()'))
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div class="row">``.

    Location and postal code are taken from the ``terms=`` part of the
    response URL without a guard, so a URL lacking it raises as soon as a
    row is processed.  The other fields are read best-effort and fall back
    to ``""``; coordinates come from ``self.geo`` keyed by location.
    """
    blank = ""

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@class="row"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(node, "div/div/div/h4/text()"))
        ad['media'] = attempt(lambda: first(node, 'div/div/div/img/@src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0]
            + first(node, 'div/div/div/a/@href'))
        ad['description'] = attempt(
            lambda: first(node, 'div/div/div/div/div/h5/text()'))

        # The search terms are '+'-separated: location first, then the
        # postal code wrapped in parentheses.
        search_terms = response.url.split('terms=')[1].split('+')
        ad['location'] = search_terms[0]
        ad['postal_code'] = search_terms[1].split(')')[0].strip('(')
        ad['evaluations'] = blank

        ad['latitude'] = attempt(
            lambda: float(self.geo[ad['location']]['lat']))
        ad['longitude'] = attempt(
            lambda: float(self.geo[ad['location']]['lon']))

        # The heading text is split on '/': amount first, period second.
        try:
            parts = first(
                node, 'div/div/div/div/div[3]/h5/text()').split('/')
            amount = parts[0].strip(' ').encode('utf-8').strip('€')
            span = parts[1]
            unit = "€"
        except:
            amount = span = unit = blank
        ad['price'] = amount
        ad['period'] = span
        ad['currency'] = unit
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each entry of the JSON body's ``"hits"`` list.

    Fields are copied best-effort and fall back to ``""``.  When the
    monthly price key is present, ``currency`` is ``"€"`` and ``period`` is
    ``"par mois"``.  ``evaluations`` is always ``""``.
    """
    blank = ""

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    payload = json.loads(response.body_as_unicode())
    for hit in payload["hits"]:
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: hit['title'])
        ad['media'] = attempt(lambda: hit["pictures"][0])
        ad['url'] = attempt(lambda: hit['url'])
        ad['description'] = attempt(lambda: hit['comment'])
        ad['location'] = attempt(lambda: hit['address'])
        ad['postal_code'] = attempt(lambda: int(hit['postal_code']))
        ad['latitude'] = attempt(lambda: hit['lat'])
        ad['longitude'] = attempt(lambda: hit['lng'])

        # Price, currency and period are set together or blanked together.
        try:
            amount = hit["unit_month_price_with_fee"]
            unit = "€"
            span = "par mois"
        except:
            amount = unit = span = blank
        ad['price'] = amount
        ad['currency'] = unit
        ad['period'] = span

        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<li>`` of the ``results`` list.

    Fields are read best-effort and fall back to ``''``.  ``price`` and
    ``period`` are two differently-stripped views of the same ``<i>`` text;
    ``location`` is the last space-separated word of the ``resultUsefull``
    span.  Latitude and longitude are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//ul[@id="results"]/li'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node,
            'div[@class="resultContainer"]/div[@class="resultInfos"]/h3[@class="resultUserName"]/text()'
        ).strip('\n '))
        ad['media'] = attempt(lambda: "https:" + first(
            node,
            'div[@class="resultContainer"]/div[@class="resultImgContainer"]/img/@src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0] + first(
                node, 'div[@class="resultContainer"]/a/@href'))
        ad['description'] = attempt(
            lambda: first(
                node,
                'div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultType"]/text()'
            ).strip('\n ')
            + " "
            + first(
                node,
                'div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultUsefull"]/text()'
            ).strip('\n '))
        ad['location'] = attempt(lambda: first(
            node,
            'div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultUsefull"]/text()'
        ).strip('\n ').split(' ')[-1])
        ad['latitude'] = blank
        ad['longitude'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node,
                'div[@class="resultContainer"]/div[@class="priceSpan"]/div[@class="innerSpan"]/i/text()'
            ).encode('utf-8').strip('\n €')
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(
            node,
            'div[@class="resultContainer"]/div[@class="priceSpan"]/div[@class="innerSpan"]/i/text()'
        ).strip("\n' /"))
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for every ``<article>`` in the response.

    Fields are read best-effort and fall back to ``''``.  Postal code,
    evaluations, price, currency and period are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//article'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(
            lambda: first(node, 'a/div[2]/header/h1/text()'))
        ad['media'] = attempt(
            lambda: first(node, 'a/aside/figure/img/@src'))
        ad['url'] = attempt(lambda: first(node, 'a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'a/div[2]/p[@class="description"]/text()').strip('\n'))
        ad['latitude'] = attempt(lambda: first(node, 'a/@data-latitude'))
        ad['longitude'] = attempt(lambda: first(node, 'a/@data-longitude'))
        ad['location'] = attempt(lambda: first(
            node, 'a/aside/div[@class="user-city"]/text()'))
        for unused in ('postal_code', 'evaluations', 'price', 'currency',
                       'period'):
            ad[unused] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div>`` that has an ``itemtype``.

    Category and subcategory are looked up in ``self.categories`` using the
    last path segment of the response URL (unguarded: an unknown segment
    raises).  Other fields are read best-effort and fall back to ``""``
    (``0`` for ``postal_code``).  Location and postal code stand or fall
    together: when no integer precedes ``' - '`` in the location text,
    both are reset.  ``period`` is always ``"jour"``.
    """
    blank = ""

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Any ordinary failure means "use the blank fallback"; interpreter
        # exits and keyboard interrupts are left to propagate.
        try:
            return getter()
        except Exception:
            return blank

    for node in response.xpath('//div[@itemtype]'):
        ad = AdItem()
        ad['source'] = self.name
        slug = response.url.split('?')[0].split('/')[-1]
        ad['category'] = self.categories[slug]["category"]
        ad['subcategory'] = self.categories[slug]["subcategory"]
        ad['title'] = attempt(
            lambda: first(node, 'div/div[@class="nsadtitle"]/text()'))
        ad['media'] = attempt(
            lambda: "https:" + first(node, 'div/div/img/@src'))
        ad['url'] = attempt(
            lambda: self.allowed_domains[0] + first(
                node, 'div[@class="nsadprice"]/div/a/@href'))
        ad['description'] = attempt(
            lambda: first(node, 'div/div[@class="nsadsub"]/text()'))

        # The postal code is the integer before ' - ' in the location
        # text; split the extracted string directly.
        try:
            place = first(node, 'div[2]/div[3]/span[2]/text()')
            zip_code = int(place.split(' - ')[0])
        except Exception:
            place, zip_code = blank, 0
        ad['location'] = place
        ad['postal_code'] = zip_code

        ad['latitude'] = blank
        ad['longitude'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node,
                'div[@class="nsadprice"]/div[@class="nsofferamount"]/text()'
            ).encode('utf-8').strip('€')
            unit = "€"
        except Exception:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = "jour"
        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div>`` with a ``data-element-id``.

    Fields are read best-effort and fall back to ``''``.  The description
    is three list-item texts joined by single spaces; the media address is
    the text between the first '(' and the following ')' of an inline
    style.  Latitude and longitude are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@data-element-id]'):
        ad = AdItem()
        ad['source'] = "zilok"
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(node, 'div[2]/h3/a/text()'))
        ad['media'] = attempt(lambda: first(
            node, 'div[1]/@style').split('(')[1].split(')')[0])
        ad['url'] = attempt(lambda: first(node, 'div[2]/h3/a/@href'))
        ad['description'] = attempt(
            lambda: first(node, 'div[2]/div/ul[1]/li[1]/text()')
            + " "
            + first(node, 'div[2]/div/ul[1]/li[2]/text()')
            + " "
            + first(node, 'div[2]/div/ul[2]/li/text()'))
        ad['location'] = attempt(lambda: first(node, 'div[2]/h4/text()'))
        ad['latitude'] = blank
        ad['longitude'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(node, 'div[3]/div/p/text()').strip(
                '\n').encode('utf-8').strip('€')
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(
            lambda: first(node, 'div[3]/div/p[2]/text()'))
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<li>`` of the ``items`` list.

    Fields are read best-effort and fall back to ``""``.  The location is
    the last word of the title text preceding ``' ('``; it is the key used
    to fetch both coordinates from ``self.geo``.  The postal code is the
    first match of ``self.pattern`` in the title.  Price, currency and
    period are always ``""``.
    """
    blank = ""

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Any ordinary failure means "use the blank fallback"; interpreter
        # exits and keyboard interrupts are left to propagate.
        try:
            return getter()
        except Exception:
            return blank

    for node in response.xpath('//ul[@class="items"]/li'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(
            lambda: first(node, 'div[@class="description"]/h3/a/text()'))
        ad['location'] = attempt(
            lambda: ad['title'].split(' (')[0].split(' ')[-1])
        ad['media'] = attempt(
            lambda: first(node, 'div[@class="thumbnail"]/img/@src'))
        ad['evaluations'] = attempt(lambda: first(
            node,
            'div[@class="thumbnail"]/div[@class="comment-count"]/text()'))
        ad['url'] = attempt(
            lambda: first(node, 'div[@class="description"]/h3/a/@href'))
        ad['description'] = attempt(
            lambda: first(node, 'div[@class="description"]/text()[3]'))
        # Both coordinates keep whatever the geo table returns; neither is
        # reset after a successful lookup.
        ad['latitude'] = attempt(lambda: self.geo[ad['location']]['lat'])
        ad['longitude'] = attempt(lambda: self.geo[ad['location']]['lon'])
        ad['price'] = blank
        ad['currency'] = blank
        ad['period'] = blank
        ad['postal_code'] = attempt(
            lambda: re.search(self.pattern, ad['title']).group())
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each schema.org Organization card ``<li>``.

    Fields are read best-effort and fall back to ``''``.  The location is
    the URL-decoded second-to-last segment of the response URL; when it is
    non-empty it is used as the key into ``self.geo`` for the coordinates,
    which fall back to ``''`` if the lookup fails.  Postal code, price,
    currency, period and evaluations are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Any ordinary failure means "use the blank fallback"; interpreter
        # exits and keyboard interrupts are left to propagate.
        try:
            return getter()
        except Exception:
            return blank

    for node in response.xpath(
            '//ul[@data-view="card"]/li[@itemtype="http://schema.org/Organization"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(node, '@data-name'))
        ad['location'] = attempt(
            lambda: urllib2.unquote(response.url.split('/')[-2]))
        # The image address is the text after the last '(' of the inline
        # style, up to the first ');'.
        ad['media'] = attempt(lambda: first(
            node, 'div/a/@style').split('(')[-1].split(');')[0])
        ad['url'] = attempt(lambda: first(node, 'div/a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'div/a[2]/div[2]/p/text()').strip('\t'))
        ad['postal_code'] = blank

        # Compare by value: identity checks on strings depend on interning
        # and treat an empty unicode string as "not empty".
        if ad['location'] != blank:
            # A city missing from the geo table yields blank coordinates
            # instead of aborting the page.
            ad['latitude'] = attempt(lambda: self.geo[ad['location']]['lat'])
            ad['longitude'] = attempt(
                lambda: self.geo[ad['location']]['lon'])
        else:
            ad['latitude'] = blank
            ad['longitude'] = blank

        ad['price'] = blank
        ad['currency'] = blank
        ad['period'] = blank
        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div class="ligne_simple">``.

    Fields are read best-effort and fall back to ``''``.  The postal code
    is whatever ``searchZip`` returns for the second ``<span>`` of the
    description link; ``period`` is the part of the description text before
    its first ``'-'``.  Coordinates, price, currency and evaluations are
    always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@class="ligne_simple"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node, 'div[@class="annonce_detail"]/p/a/@title'))
        ad['media'] = attempt(lambda: first(
            node, 'div[@class="annonce_img"]/a/img/@src'))
        ad['url'] = attempt(lambda: first(
            node, 'div[@class="annonce_img"]/a/@href'))
        ad['description'] = attempt(lambda: first(
            node,
            'div[@class="annonce_detail"]/span[@class="desc"]/a/text()[2]'))
        ad['location'] = attempt(lambda: first(
            node,
            'div[@class="annonce_detail"]/span[@class="desc"]/a/span/text()'))
        ad['postal_code'] = attempt(lambda: searchZip(first(
            node,
            'div[@class="annonce_detail"]/span[@class="desc"]/a/span[2]/text()')))
        ad['latitude'] = blank
        ad['longitude'] = blank
        ad['price'] = blank
        ad['currency'] = blank
        ad['period'] = attempt(lambda: first(
            node,
            'div[@class="annonce_detail"]/span[@class="desc"]/a/text()[2]'
        ).split('-')[0])
        ad['evaluations'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div class="annonces">``.

    Fields are read best-effort and fall back to ``''``.  The price is the
    text after the last ``':'`` of the eighth text node of the
    ``coordonnees`` block, cut at the ``'€'`` sign.  Coordinates and period
    are always ``''``.
    """
    blank = ''

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@class="annonces"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node, 'div[@class="cadre-in"]/div[@class="coordonnees"]/b/text()'))
        ad['media'] = attempt(lambda: first(node, 'img/@src'))
        ad['url'] = attempt(lambda: first(
            node, 'div[@class="cadre-in"]/div[@class="coordonnees"]/a/@href'))
        ad['description'] = attempt(lambda: first(
            node, 'div[@class="cadre-in"]/div[@class="descriptif"]/text()'))
        ad['location'] = attempt(lambda: first(
            node,
            'div[@class="cadre-in"]/div[@class="coordonnees"]/text()[3]'
        ).strip('\n \t'))
        ad['latitude'] = blank
        ad['longitude'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node,
                'div[@class="cadre-in"]/div[@class="coordonnees"]/text()[8]'
            ).split(':')[-1].strip('\n').encode('utf-8').split('€')[0]
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = blank
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``<div>`` that has a ``data-car-id``.

    Fields are read best-effort and fall back to ``""``.  Latitude and
    longitude are always ``""``.
    """
    blank = ""

    def first(scope, query):
        # First value extracted for ``query`` relative to ``scope``.
        return scope.xpath(query).extract()[0]

    def attempt(getter):
        # Broad catch: any failure simply means "use the blank fallback".
        try:
            return getter()
        except:
            return blank

    for node in response.xpath('//div[@data-car-id]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory
        ad['title'] = attempt(lambda: first(
            node,
            "div[@class='search_card_content car_content']/a[@class='car_title']/@title"))
        ad['media'] = attempt(lambda: first(
            node, 'div[@class="search_card_aside car_photo"]/img/@src'))
        ad['url'] = attempt(lambda: first(
            node,
            'div[@class="search_card_content car_content"]/a[@class="car_title"]/@href'))
        ad['description'] = attempt(lambda: first(
            node,
            'div[@class="search_card_content car_content"]/div[@class="car_subtitle"]/text()'))
        ad['location'] = attempt(lambda: first(
            node,
            'div[@class="search_card_content car_content"]/div[@class="car_location"]/text()[2]'))
        ad['latitude'] = blank
        ad['longitude'] = blank

        # Price and currency are set together or blanked together.
        try:
            amount = first(
                node,
                'div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/strong/text()'
            ).encode('utf-8').strip('€')
            unit = "€"
        except:
            amount = unit = blank
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = attempt(lambda: first(
            node,
            'div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/text()'))
        yield ad
def parse(self, response):
    """Build an ``AdItem`` from every ``div`` of class ``home-list-item``.

    The location is derived from the stored title through
    ``self.France.city_from_title``; title and location fall back to an
    empty string together if either step fails.  The remaining fields fall
    back individually.  ``latitude`` and ``longitude`` are always empty and
    ``period`` is always ``"day"``.
    """
    nothing = ""
    for entry in response.xpath("//div[@class='home-list-item']"):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory

        try:
            ad['title'] = entry.xpath("div[2]/h2/a/text()").extract()[0]
            ad['location'] = self.France.city_from_title(ad['title'])
        except:  # deliberately broad: a bad title also voids the location
            ad['title'] = ad['location'] = nothing

        try:
            picture = entry.xpath('a/img/@src').extract()[0]
        except:
            picture = nothing
        ad['media'] = picture

        try:
            # The href is appended to the first allowed domain.
            link = self.allowed_domains[0] + entry.xpath(
                'a/@href').extract()[0]
        except:
            link = nothing
        ad['url'] = link

        try:
            summary = entry.xpath('div[3]/div[2]/a/text()').extract()[0]
        except:
            summary = nothing
        ad['description'] = summary

        ad['latitude'] = nothing
        ad['longitude'] = nothing

        try:
            # NOTE(review): on Python 3 ``bytes.strip`` rejects a ``str``
            # argument, so the price would always come out empty there;
            # confirm the target interpreter.
            shown = entry.xpath('div/div/div/text()').extract()[0]
            amount = shown.encode('utf-8').strip('€')
        except:
            amount, unit = nothing, nothing
        else:
            unit = "€"
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = "day"
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` for each ``div`` of class ``row`` in *response*.

    Unlike an empty string, the fallback used here is the literal
    ``"unknown"``: it is stored whenever a lookup fails, and always for
    ``latitude``, ``longitude`` and ``period``.  ``media`` and ``url`` are
    prefixed with the first entry of ``self.allowed_domains``.
    """
    placeholder = "unknown"

    def first(node, query):
        # First string matched by *query*; raises when nothing matches.
        return node.xpath(query).extract()[0]

    def guarded(compute):
        # Run *compute*; any failure at all yields the placeholder.
        try:
            return compute()
        except:
            return placeholder

    for row in response.xpath('//div[@class="row"]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory

        # Each lambda is invoked immediately, so it sees the current row.
        ad['title'] = guarded(lambda: first(row, "div[2]/h2/a/text()"))
        ad['media'] = guarded(
            lambda: self.allowed_domains[0] + first(row, 'div/a/img/@src'))
        ad['url'] = guarded(
            lambda: self.allowed_domains[0] + first(row, 'div[2]/p/a/@href'))
        ad['description'] = guarded(lambda: first(row, 'div[2]/p/text()'))
        ad['location'] = guarded(lambda: first(row, 'div[2]/text()[3]'))
        ad['latitude'] = placeholder
        ad['longitude'] = placeholder

        try:
            # NOTE(review): on Python 3 ``bytes.strip`` rejects a ``str``
            # argument, so the price would always be the placeholder there;
            # confirm the target interpreter.
            shown = first(row, 'div[3]/div/span/text()')
            amount = shown.encode('utf-8').strip('€')
        except:
            amount, unit = placeholder, placeholder
        else:
            unit = "€"
        ad['price'] = amount
        ad['currency'] = unit

        ad['period'] = placeholder
        yield ad
def parse(self, response):
    """Yield an ``AdItem`` per ``li`` element that has a ``data-id_product``.

    Failed lookups store an empty string.  ``latitude``, ``longitude``,
    ``location`` and ``period`` are always empty, the price text is stored
    as extracted, and this parser does not set ``currency`` at all.
    """
    blank = ''
    # (item key, XPath relative to the product node, prefix with the domain?)
    lookups = (
        ('title', 'a/p/strong/text()', False),
        ('media', 'a/figure/span/img/@src', False),
        ('url', 'a/@href', True),
        ('description', 'a/figure/span/img/@alt', False),
    )

    for product in response.xpath('//li[@data-id_product]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory

        for key, query, with_domain in lookups:
            try:
                if with_domain:
                    found = self.allowed_domains[0] + product.xpath(
                        query).extract()[0]
                else:
                    found = product.xpath(query).extract()[0]
            except:  # deliberately broad: any failure means "no value"
                found = blank
            ad[key] = found

        ad['latitude'] = blank
        ad['longitude'] = blank
        ad['location'] = blank

        try:
            cost = product.xpath('a/p/span[2]/span/text()').extract()[0]
        except:
            cost = blank
        ad['price'] = cost

        ad['period'] = blank
        yield ad
def parse(self, response):
    """Yield one ``AdItem`` per row of the table of class ``annonces``.

    The price cell is split on ``/``: the part before it becomes the price
    (euro sign stripped) and the part after it the period.  If anything in
    that step fails, price, currency and period are all stored empty.
    ``media``, ``latitude`` and ``longitude`` are always empty.
    """
    empty_value = ""
    for row in response.xpath('//table[@class="annonces"]/tr'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory

        try:
            name = row.xpath('td/a/@title').extract()[0]
        except:  # deliberately broad: any failure means "no value"
            name = empty_value
        ad['title'] = name

        ad['media'] = empty_value

        try:
            link = self.allowed_domains[0] + row.xpath(
                'td/a/@href').extract()[0]
        except:
            link = empty_value
        ad['url'] = link

        try:
            summary = row.xpath('td[3]/text()').extract()[0]
        except:
            summary = empty_value
        ad['description'] = summary

        try:
            place = row.xpath('td[2]/span/span/text()').extract()[0]
        except:
            place = empty_value
        ad['location'] = place

        ad['latitude'] = empty_value
        ad['longitude'] = empty_value

        try:
            # NOTE(review): on Python 3 ``bytes.strip`` rejects a ``str``
            # argument, so this branch would always fall back to empty;
            # confirm the target interpreter.
            pieces = row.xpath(
                'td[@style="text-align: right;"]/text()'
            ).extract()[0].split('/')
            amount = pieces[0].encode('utf-8').strip('€')
            span = pieces[1]
        except:
            ad['price'] = empty_value
            ad['currency'] = empty_value
            ad['period'] = empty_value
        else:
            ad['price'] = amount
            ad['period'] = span
            ad['currency'] = "€"

        yield ad
def parse(self, response):
    """Yield one ``AdItem`` per ``div`` of class ``community-events-results-item``.

    Each field is taken from the first match of its XPath and falls back to
    an empty string when extraction fails.  The description is built as
    ``"De <first eventTime text> a <second eventTime text>"`` and is also
    stored as the period.  ``latitude``, ``longitude``, ``price`` and
    ``currency`` are always empty.

    Failures are caught with ``except Exception`` so that extraction stays
    best effort without swallowing ``KeyboardInterrupt`` or ``SystemExit``.
    """
    empty = ''
    # XPath prefixes for the two columns of a result entry.
    left = 'div[@class="community-events-results-left"]'
    right = 'div[@class="community-events-results-right"]'

    for sel in response.xpath(
            '//div[@class="community-events-results-item"]'):
        item = AdItem()
        item['source'] = self.name
        item['category'] = self.category
        item['subcategory'] = self.subcategory

        try:
            item['title'] = sel.xpath(
                right + '/h3/a/strong/text()').extract()[0]
        except Exception:
            item['title'] = empty

        try:
            item['media'] = sel.xpath(left + '/a/img/@src').extract()[0]
        except Exception:
            item['media'] = empty

        try:
            item['url'] = self.allowed_domains[0] + sel.xpath(
                left + '/a/@href').extract()[0]
        except Exception:
            item['url'] = empty

        try:
            start = "De " + sel.xpath(
                right + '/div[@class="eventTime"]/text()').extract()[0]
            end = "a " + sel.xpath(
                right + '/div[@class="eventTime"]/text()[2]').extract()[0]
            item['description'] = start + ' ' + end
            item['period'] = item['description']
        except Exception:
            item['description'], item['period'] = empty, empty

        try:
            # The query must run on the selector: there is no module-level
            # ``xpath`` function, so a bare call raised NameError and every
            # location was silently stored empty.
            item['location'] = sel.xpath(
                right + '/div[@class="eventLocation"]/text()').extract()[0]
        except Exception:
            item['location'] = empty

        item['latitude'] = empty
        item['longitude'] = empty
        item['price'] = empty
        item['currency'] = empty
        yield item
def parse(self, response):
    """Yield an ``AdItem`` per ``li`` element that has a ``data-element-id``.

    Failed lookups store an empty string.  The media URL is cut out of the
    parentheses of the first child's ``style`` attribute.  Coordinates are
    looked up in ``self.geo`` under the last path segment of the response
    URL (query string removed).  ``postal_code`` and ``evaluations`` are
    always empty.
    """
    void = ''

    def first(node, query):
        # First string matched by *query*; raises when nothing matches.
        return node.xpath(query).extract()[0]

    for entry in response.xpath('//li[@data-element-id]'):
        ad = AdItem()
        ad['source'] = self.name
        ad['category'] = self.category
        ad['subcategory'] = self.subcategory

        try:
            ad['title'] = first(entry, 'div[2]/div[1]/h3/a/text()')
        except:  # deliberately broad: any failure means "no value"
            ad['title'] = void

        try:
            # Take the text between the first '(' and the next ')', then
            # drop surrounding single quotes.
            style = first(entry, 'div[1]/@style')
            inside = style.split('(')[1].split(')')[0]
            ad['media'] = inside.strip("'")
        except:
            ad['media'] = void

        try:
            ad['url'] = self.allowed_domains[0] + first(
                entry, 'div[2]/div[1]/h3/a/@href')
        except:
            ad['url'] = void

        try:
            lead = first(entry, 'div[2]/div/ul[1]/li[1]/text()')
            follow = first(entry, 'div[2]/div/ul[1]/li[2]/text()')
            # Joining with a final empty piece keeps the trailing space.
            ad['description'] = " ".join((lead, follow, ""))
        except:
            ad['description'] = void

        try:
            ad['location'] = first(entry, 'div[2]/div[1]/h4/text()')
        except:
            ad['location'] = void

        ad['postal_code'] = void
        ad['evaluations'] = void

        # Last path segment of the URL, ignoring any query string.
        city_slug = response.url.partition('?')[0].rsplit('/', 1)[-1]
        for field, axis in (('latitude', 'lat'), ('longitude', 'lon')):
            try:
                ad[field] = float(self.geo[city_slug][axis])
            except:
                ad[field] = void

        try:
            # NOTE(review): on Python 3 ``bytes.strip`` rejects a ``str``
            # argument, so the price would always come out empty there;
            # confirm the target interpreter.
            shown = first(entry, 'div[2]/div[3]/p/text()')
            ad['price'] = shown.strip('\n').encode('utf-8').strip('€')
            ad['currency'] = "€"
        except:
            ad['price'] = void
            ad['currency'] = void

        try:
            ad['period'] = first(entry, 'div[2]/div[3]/p[2]/text()')
        except:
            ad['period'] = void

        yield ad