# --- Example 1 (separator from the original code-sharing page) ---
class DrivySpider(scrapy.Spider):
    """Scrape washing-machine rental ads from lamachineduvoisin.fr.

    One search page is requested per French city; each ad card on a
    results page is turned into one AdItem.
    """

    name = "lamachineduvoisin"
    category = "daily"
    subcategory = "washing"
    allowed_domains = ["http://www.lamachineduvoisin.fr"]
    # Scrape lamachineduvoisin city by city.
    France = France()
    cities = France.cities

    start_urls = ["http://www.lamachineduvoisin.fr/fr/find/" + str(city)
                  for city in cities]

    def parse(self, response):
        """Yield one AdItem per ad card on a city search page."""
        for sel in response.xpath('//div[@data-car-id]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            # Every field is optional on the page: extract()[0] raises
            # IndexError when the XPath matches nothing, in which case
            # the field is stored as an empty string.
            try:
                item['title'] = sel.xpath("div[@class='search_card_content car_content']/a[@class='car_title']/@title").extract()[0]
            except IndexError:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('div[@class="search_card_aside car_photo"]/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('div[@class="search_card_content car_content"]/a[@class="car_title"]/@href').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[@class="search_card_content car_content"]/div[@class="car_subtitle"]/text()').extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[@class="search_card_content car_content"]/div[@class="car_location"]/text()[2]').extract()[0]
            except IndexError:
                item['location'] = empty

            # No coordinates are available on the listing page.
            item['latitude'] = empty
            item['longitude'] = empty

            try:
                # NOTE(review): encode(...).strip('€') is Python-2 specific;
                # under Python 3, bytes.strip() rejects a str argument
                # (TypeError), which is also caught below.
                item['price'] = sel.xpath('div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/strong/text()').extract()[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath('div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/text()').extract()[0]
            except IndexError:
                item['period'] = empty

            yield item
# --- Example 2 (separator from the original code-sharing page) ---
class EloueBricoSpider(scrapy.Spider):
    """Scrape tour/visit offers from zealguide.com (France, list view).

    NOTE(review): the class name is reused by another spider later in
    this file; Scrapy dispatches on ``name`` so both still run, but a
    rename would avoid shadowing at import time.
    """

    name = "zealguide"
    category = "leisure"
    subcategory = "visiting"
    France = France()
    allowed_domains = ["https://www.zealguide.com"]
    start_urls = [
        "https://www.zealguide.com/fr?q=france&transaction_type=offering&view=list"
    ]

    def parse(self, response):
        """Yield one AdItem per listing on the result page."""
        for sel in response.xpath("//div[@class='home-list-item']"):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath("div[2]/h2/a/text()").extract()[0]
                # France.city_from_title is a project helper whose failure
                # modes are unknown here; any failure means "no title /
                # no location".
                item['location'] = self.France.city_from_title(item['title'])
            except Exception:
                item['title'] = empty
                item['location'] = empty

            try:
                item['media'] = sel.xpath('a/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty

            try:
                # Listing links are relative; prefix with the site root.
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'a/@href').extract()[0]
            except IndexError:
                item['url'] = empty

            try:
                item['description'] = sel.xpath(
                    'div[3]/div[2]/a/text()').extract()[0]
            except IndexError:
                item['description'] = empty

            # No coordinates on the listing page.
            item['latitude'] = empty
            item['longitude'] = empty

            try:
                # Python-2 specific: bytes.strip('€') raises TypeError on Py3.
                item['price'] = sel.xpath('div/div/div/text()').extract(
                )[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty

            item['period'] = "day"
            yield item
class SharedparkingSpider(scrapy.Spider):
    """Scrape parking-spot ads from sharedparking.fr, per city and page."""

    name = "sharedparking"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.sharedparking.fr/"]
    France = France()
    cities = France.cities
    # One search URL per city, paginated over result pages 1..3.
    urls = ["http://www.sharedparking.fr/search?sc-cat=2&w=" + str(city)
            for city in cities]
    start_urls = [url + "&page=" + str(i) for url in urls for i in range(1, 4)]

    def parse(self, response):
        """Yield one AdItem per row of the ad table."""
        for sel in response.xpath('//table[@class="annonces"]/tr'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('td/a/@title').extract()[0]
            except IndexError:
                item['title'] = empty

            # The table rows carry no image.
            item['media'] = empty

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('td/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('td[3]/text()').extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('td[2]/span/span/text()').extract()[0]
            except IndexError:
                item['location'] = empty

            item['latitude'] = empty
            item['longitude'] = empty

            try:
                # The cell holds "<price>/<period>"; missing '/' leaves
                # price[1] out of range (IndexError).  bytes.strip('€') is
                # Python-2 specific (TypeError on Py3).
                price = sel.xpath('td[@style="text-align: right;"]/text()').extract()[0].split('/')
                item['price'] = price[0].encode('utf-8').strip('€')
                item['period'] = price[1]
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty
                item['period'] = empty

            yield item
class BricolibSpider(scrapy.Spider):
    """Scrape DIY-tool rental ads from bricolib.net (paginated listing)."""

    name = "bricolib"
    category = "daily"
    subcategory = "brico"
    allowed_domains = ["http://www.bricolib.net"]
    # Walk the global ad listing page by page.
    start_urls = ["http://www.bricolib.net/annonces/page/" + str(page)
                  for page in range(1, 200)]
    France = France()
    geo = France.geo

    def parse(self, response):
        """Yield one AdItem per post block on a listing page."""
        for sel in response.xpath('//div[@class="post-block"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    'div[@class="post-left"]/a/@title').extract()[0]
            except IndexError:
                item['title'] = empty
            try:
                # @data-rel holds a protocol-relative image URL.
                item['media'] = "https:" + sel.xpath(
                    'div[@class="post-left"]/a/@data-rel').extract()[0]
            except IndexError:
                item['media'] = empty
            try:
                item['url'] = sel.xpath(
                    'div[@class="post-left"]/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath(
                    'div[@class="post-right"]/p[@class="post-desc"]/text()'
                ).extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                item['location'] = sel.xpath(
                    'div[@class="post-right"]/p[@class="post-meta"]/span[@class="cp_city"]/text()'
                ).extract()[0]
            except IndexError:
                item['location'] = empty
            try:
                item['postal_code'] = sel.xpath(
                    'div[@class="post-right"]/p[@class="post-meta"]/span[@class="cp_zipcode"]/text()'
                ).extract()[0]
            except IndexError:
                item['postal_code'] = 0

            # Coordinates come from the city geo table; unknown cities
            # (KeyError) or malformed values (ValueError/TypeError) yield "".
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except (KeyError, ValueError, TypeError):
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except (KeyError, ValueError, TypeError):
                item['longitude'] = empty

            try:
                # "<price>/<period>"; bytes.strip('€') is Python-2 specific
                # (TypeError on Py3).
                price = sel.xpath(
                    'div[@class="post-right"]/div[@class="price-wrap"]/p[@class="post-price"]/text()'
                ).extract()[0].split('/')
                item['price'] = price[0].strip(' ').encode('utf-8').strip('€')
                item['period'] = price[1]
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['period'] = empty
                item['currency'] = empty

            item['evaluations'] = empty

            yield item
class EloueBricoSpider(scrapy.Spider):
    """Scrape tool-rental ads from e-loue.com, per city and result page.

    NOTE(review): this class name collides with the zealguide spider
    defined earlier in the file; Scrapy dispatches on ``name`` so both
    still run, but a rename would avoid the shadowing.
    """

    name = "eloue"
    category = "daily"
    subcategory = "brico"
    allowed_domains = ["https://www.e-loue.com"]
    # One URL per (result page, city) combination.
    start_urls0 = ["https://www.e-loue.com/location/page/%s/?r=9" % str(x)
                   for x in range(1, 20)]
    France = France()
    cities = France.cities
    start_urls = [url + '&l=' + city for url in start_urls0 for city in cities]

    def parse(self, response):
        """Yield one AdItem per listing on a search result page."""
        for sel in response.xpath('//ol[@class="product-layout"]/li'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath("@name").extract()[0]
            except IndexError:
                item['title'] = empty
            try:
                # The image URL is embedded in an inline CSS style attribute:
                # pull the url(...) value out of it.
                item['media'] = "https:" + sel.xpath('div/div/a/img/@style').extract()[0].split(')')[0].split(':')[-1]
            except IndexError:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div/div/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div/div[@class="info"]/p/[@class="full_description"]/text()'.replace('/[', '/p[')).extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div/div[@class="info"]/p/text()').extract()[0]
                item['postal_code'] = int(item['location'].split(', ')[1])
            except (IndexError, ValueError):
                # Fixed from the original, which left this assignment
                # dedented out of the except block (an indentation error).
                item['location'] = empty
                item['postal_code'] = 0

            try:
                item['latitude'] = sel.xpath("@locationx").extract()[0]
            except IndexError:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath("@locationy").extract()[0]
            except IndexError:
                item['longitude'] = empty
            try:
                # "<price>/<period>"; bytes.strip('€') is Python-2 specific
                # (TypeError on Py3).
                price = sel.xpath('div/div/span[@class="badge price"]/text()').extract()[0].split('/')
                item['price'] = price[0].strip(' ').encode('utf-8').strip('€')
                item['period'] = price[1]
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['period'] = empty
                item['currency'] = empty

            item['evaluations'] = empty
            yield item
# --- Example 6 (separator from the original code-sharing page) ---
class HousetripSpider(scrapy.Spider):
    """Scrape holiday-apartment ads from housetrip.fr, per city and page."""

    name = "housetrip"
    category = "housing"
    subcategory = "apartment"
    allowed_domains = ["http://www.housetrip.fr"]
    France = France()
    geo = France.geo
    cities = geo.keys()
    # One search URL per city, paginated over pages 0..99.
    start_urls_0 = ["http://www.housetrip.fr/fr/chercher-appartements-vacances/" + str(city)
                    for city in cities]
    start_urls = [
        url + "?page=" + str(x) for url in start_urls_0 for x in range(100)
    ]

    def parse(self, response):
        """Yield one AdItem per result card on a search page."""
        for sel in response.xpath('//li[@data-element-id]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    'div[2]/div[1]/h3/a/text()').extract()[0]
            except IndexError:
                item['title'] = empty

            try:
                # Image URL is embedded in an inline style: url('...').
                item['media'] = sel.xpath('div[1]/@style').extract()[0].split(
                    '(')[1].split(')')[0].strip("'")
            except IndexError:
                item['media'] = empty

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div[2]/div[1]/h3/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty

            try:
                # Description is stitched from the first two feature bullets.
                desc0 = sel.xpath('div[2]/div/ul[1]/li[1]/text()').extract()[0]
                desc1 = sel.xpath('div[2]/div/ul[1]/li[2]/text()').extract()[0]
                item['description'] = desc0 + " " + desc1 + " "
            except IndexError:
                item['description'] = empty

            try:
                item['location'] = sel.xpath(
                    'div[2]/div[1]/h4/text()').extract()[0]
            except IndexError:
                item['location'] = empty

            item['postal_code'] = empty
            item['evaluations'] = empty

            # The searched city is the last path segment of the request URL;
            # its coordinates come from the geo table.
            url_city = response.url.split('?')[0].split('/')[-1]

            try:
                item['latitude'] = float(self.geo[url_city]['lat'])
            except (KeyError, ValueError, TypeError):
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo[url_city]['lon'])
            except (KeyError, ValueError, TypeError):
                item['longitude'] = empty

            try:
                # Python-2 specific: bytes.strip('€') raises TypeError on Py3.
                item['price'] = sel.xpath('div[2]/div[3]/p/text()').extract(
                )[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath(
                    'div[2]/div[3]/p[2]/text()').extract()[0]
            except IndexError:
                item['period'] = empty

            yield item
class ParkadomSpider(scrapy.Spider):
    """Scrape parking-spot rental ads from parkadom.com."""

    name = "parkadom"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.parkadom.com"]
    # A single request returns the whole result set (group=100).
    start_urls = ["http://www.parkadom.com/location-parking/resultat-de-recherche?group=100"]
    # Raw string for the regex; matches the first run of digits.
    pattern = re.compile(r"\d{1,}")
    France = France()
    geo = France.geo

    def parse(self, response):
        """Yield one AdItem per available parking box on the page."""
        for sel in response.xpath('//div[@class="box-parking-dispo"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('div/span[@class="title-parking"]/text()').extract()[0]
                # searchZip is a project helper with unknown failure modes;
                # any failure means "no title / no zip code".
                item['postal_code'] = searchZip(item['title'])
            except Exception:
                item['title'] = empty
                item['postal_code'] = 0
            try:
                item['media'] = self.allowed_domains[0] + sel.xpath('div/div/div[@class="detail-parking-left"]/div/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div/div/div[@class="detail-parking-right"]/div[2]/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div/div/div[@class="detail-parking-left"]/div/img/@alt').extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div/div/div/div/h1/span/text()').extract()[0]
            except IndexError:
                item['location'] = empty

            # City geo lookup: the city is the second-to-last comma field of
            # the location string; unknown cities raise KeyError.  (Original
            # had these blocks tab-indented — a syntax error — now fixed.)
            try:
                item['latitude'] = self.geo[item['location'].split(',')[-2].strip(' ')]['lat']
            except (IndexError, KeyError):
                item['latitude'] = empty
            try:
                item['longitude'] = self.geo[item['location'].split(',')[-2].strip(' ')]['lon']
            except (IndexError, KeyError):
                item['longitude'] = empty

            try:
                # Python-2 specific: bytes.strip('€') raises TypeError on Py3.
                item['price'] = sel.xpath('div/div/div[@class="detail-parking-right"]/div/span/span/text()').extract()[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath('div/div/div[@class="detail-parking-right"]/div/span/text()').extract()[0].strip('/')
            except IndexError:
                item['period'] = empty
            try:
                # re.search returns None when no digits are present, which
                # makes .group() raise AttributeError.
                item['evaluations'] = re.search(self.pattern, sel.xpath('div/div/div[@class="detail-parking-left"]/div/div/span/text()').extract()[0]).group()
            except (IndexError, AttributeError):
                item['evaluations'] = empty

            yield item
# --- Example 8 (separator from the original code-sharing page) ---
class SailsharingSpider(scrapy.Spider):
    """Scrape boat rental ads from sailsharing.com (paginated search)."""

    name = "sailsharing"
    category = "leisure"
    subcategory = "boat"
    allowed_domains = ["http://www.sailsharing.com"]
    # Walk the search result pages 1..35.
    start_urls = ["http://www.sailsharing.com/fr/location-bateau/search?page=" + str(page)
                  for page in range(1, 36)]

    France = France()
    geo = France.geo

    def parse(self, response):
        """Yield one AdItem per boat block on a result page."""
        for sel in response.xpath('//div[@class="block"]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('div/h2/a/text()').extract()[0].strip("\n ")
            except IndexError:
                item['title'] = empty
            try:
                item['media'] = self.allowed_domains[0] + sel.xpath('a/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('a/@href').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div/div[@class="boat-info"]/text()').extract()[0].strip("\n ")
            except IndexError:
                item['description'] = empty
            try:
                item['evaluations'] = sel.xpath('div/div[@class="boat-skipper"]/div[@class="nb-commentaires"]/span[@class="nb-com"]/text()').extract()[0].strip("\n ")
            except IndexError:
                item['evaluations'] = empty

            try:
                item['location'] = sel.xpath('div/div/h4/strong/text()').extract()[0].strip(' -')
            except IndexError:
                item['location'] = empty
            # Geo lookup by harbour/city name; unknown names raise KeyError.
            # (Original had these blocks tab-indented — a syntax error —
            # now fixed.)
            try:
                item['latitude'] = self.geo[item['location']]['lat']
            except KeyError:
                item['latitude'] = empty
            try:
                item['longitude'] = self.geo[item['location']]['lon']
            except KeyError:
                item['longitude'] = empty

            try:
                # Python-2 specific: bytes.strip('€') raises TypeError on Py3.
                item['price'] = sel.xpath('div[@class="hosting-meta"]/div/span/strong/text()').extract()[0].encode('utf-8').strip('€')
                item['currency'] = '€'
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath('div[3]/span/text()').extract()[0]
            except IndexError:
                item['period'] = empty
            item['postal_code'] = empty
            yield item
# --- Example 9 (separator from the original code-sharing page) ---
class ZilokManutentionSpider(scrapy.Spider):
    """Scrape handling-equipment rental ads (source labelled "zilok").

    NOTE(review): despite the class/spider name, the start URLs point at
    housetrip.fr and the emitted ``source`` is hard-coded to "zilok" —
    confirm which site this spider is really meant to crawl.
    """

    name = "zilokmanutention"
    category = "daily"
    subcategory = "brico"
    allowed_domains = ["http://www.housetrip.fr"]
    France = France()
    cities = France.cities
    # One search URL per city, paginated over pages 0..99.
    start_urls_0 = ["http://www.housetrip.fr/fr/rechercher/" + str(city)
                    for city in cities]
    start_urls = [url + "?page=" + str(x) for url in start_urls_0 for x in range(100)]

    def parse(self, response):
        """Yield one AdItem per result card on a search page."""
        for sel in response.xpath('//div[@data-element-id]'):
            item = AdItem()
            empty = ''
            # Hard-coded in the original (not self.name); preserved as-is.
            item['source'] = "zilok"
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('div[2]/h3/a/text()').extract()[0]
            except IndexError:
                item['title'] = empty

            try:
                # Image URL is embedded in an inline style: url(...).
                item['media'] = sel.xpath('div[1]/@style').extract()[0].split('(')[1].split(')')[0]
            except IndexError:
                item['media'] = empty

            try:
                item['url'] = sel.xpath('div[2]/h3/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty

            try:
                # Description is stitched from three feature bullets.
                desc0 = sel.xpath('div[2]/div/ul[1]/li[1]/text()').extract()[0]
                desc1 = sel.xpath('div[2]/div/ul[1]/li[2]/text()').extract()[0]
                desc2 = sel.xpath('div[2]/div/ul[2]/li/text()').extract()[0]
                item['description'] = desc0 + " " + desc1 + " " + desc2
            except IndexError:
                item['description'] = empty

            try:
                item['location'] = sel.xpath('div[2]/h4/text()').extract()[0]
            except IndexError:
                item['location'] = empty

            item['latitude'] = empty
            item['longitude'] = empty

            try:
                # Python-2 specific: bytes.strip('€') raises TypeError on Py3.
                item['price'] = sel.xpath('div[3]/div/p/text()').extract()[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath('div[3]/div/p[2]/text()').extract()[0]
            except IndexError:
                item['period'] = empty

            yield item
class ClickandboatSpider(scrapy.Spider):
    """Scrape boat rental ads from clickandboat.com, per city and page."""

    name = "clickandboat"
    category = "leisure"
    subcategory = "boat"
    allowed_domains = ["https://www.clickandboat.com"]
    France = France()
    cities = France.cities
    # One search URL per city, paginated over pages 0..29.
    urls = ["https://www.clickandboat.com/location-bateau/search?where=" + str(city)
            for city in cities]
    start_urls = [url + "&_page=" + str(i) for i in range(30) for url in urls]

    def parse(self, response):
        """Yield one AdItem per result entry on a search page."""
        for sel in response.xpath('//ul[@id="results"]/li'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    'div/div[2]/a/h2[@class="titre-annonce"]/text()').extract(
                    )[0]
            except IndexError:
                item['title'] = empty

            try:
                item['media'] = self.allowed_domains[0] + sel.xpath(
                    'div/div/a/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div/div/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty

            try:
                item['description'] = "capacite " + sel.xpath(
                    'div/div[2]/div/div/div[2]/div[2]/p/span/text()').extract(
                    )[0] + " personnes"
            except IndexError:
                item['description'] = empty

            try:
                item['location'] = sel.xpath(
                    'div/div[2]/div/div/div[1]/div[2]/p/span/text()').extract(
                    )[0]
            except IndexError:
                item['location'] = empty
            item['postal_code'] = 0

            # Coordinates are provided as hidden form inputs on each entry.
            try:
                item['latitude'] = sel.xpath(
                    'div/input[@class="annonce_lat"]/@value').extract()[0]
            except IndexError:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath(
                    'div/input[@class="annonce_ltd"]/@value').extract()[0]
            except IndexError:
                item['longitude'] = empty

            try:
                item['price'] = sel.xpath(
                    'div/div[3]/h2/b/span[@class="prix"]/text()').extract()[0]
                item['currency'] = '€'
            except IndexError:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['evaluations'] = sel.xpath(
                    'div/div[2]/div/div/div[4]/div[2]/p/span[1]').extract()[0]
            except IndexError:
                item['evaluations'] = empty
            try:
                item['period'] = sel.xpath('div/div[3]/h2/small[2]/sup/text()'
                                           ).extract()[0].strip('/')
            except IndexError:
                item['period'] = empty

            yield item
# --- Example 11 (separator from the original code-sharing page) ---
class MonsieurParkingSpider(scrapy.Spider):
    """Scrape parking rental ads from monsieurparking.com, city by city."""

    name = "monsieurparking"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.monsieurparking.com"]
    # One listing page per known city.
    France = France()
    geo_cities = France.geo
    cities = geo_cities.keys()
    start_urls = ["http://www.monsieurparking.com/location/" + str(city) + ".html"
                  for city in cities]

    def parse(self, response):
        """Yield one AdItem per listing box on a city page."""
        # Function-call form works on both Python 2 and 3.
        print(response.url)
        for sel in response.xpath("//div[@id='loginbox']"):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('div/div/div/div/p/a/text()').extract()[0]
            except IndexError:
                item['title'] = empty

            # The city is the last path segment of the request URL
            # ("/location/<city>.html").
            try:
                item['location'] = response.url.split('/')[-1].split('.')[0]
            except IndexError:
                item['location'] = empty
            item['postal_code'] = 0
            try:
                item['media'] = sel.xpath('div[@class="detail"]/img/@src').extract()[0]
            except IndexError:
                # Fall back to the site's generic parking icon.
                item['media'] = self.allowed_domains[0] + "/images/parking-orange-26x26.png"

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath("div/div/div/div/p/a/@href").extract()[0]
            except IndexError:
                item['url'] = empty

            try:
                desc0 = sel.xpath('div/div/div/div/span/text()').extract()[0]
                desc1 = sel.xpath('div/div/div/div/span[2]/text()').extract()[0]
                item['description'] = desc0 + ", " + desc1
            except IndexError:
                item['description'] = empty

            # Coordinates come from the city geo table; unknown cities
            # (KeyError) or malformed values (ValueError/TypeError) yield "".
            try:
                item['latitude'] = float(self.geo_cities[item['location']]['lat'])
            except (KeyError, ValueError, TypeError):
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo_cities[item['location']]['lon'])
            except (KeyError, ValueError, TypeError):
                item['longitude'] = empty

            try:
                # "<price>€/<period>"; bytes.split('€') is Python-2 specific
                # (TypeError on Py3).
                item['price'] = sel.xpath("div/div/div/div/span[3]/text()").extract()[0].split('/')[0].encode('utf-8').split('€')[0]
                item['currency'] = "€"
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath("div/div/div/div/span[3]/text()").extract()[0].split('/')[1]
            except IndexError:
                item['period'] = empty
            item['evaluations'] = empty
            yield item
class HousetripSpider(scrapy.Spider):
    """Scrape garden-sharing ads from pretersonjardin.com.

    NOTE(review): the class name collides with the housetrip.fr spider
    earlier in the file; Scrapy dispatches on ``name`` so both still
    run, but a rename would avoid the shadowing.
    """

    name = "pretersonjardin"
    category = "eating"
    subcategory = "gardens"
    allowed_domains = ["http://www.pretersonjardin.com"]
    # NOTE(review): unused attribute, kept for compatibility; Python-2
    # only (list * int) — under Python 3 `18 * range(...)` raises.
    pages = 18 * range(1, 1000)
    # Walk the global ad listing, pages 1..99.
    start_urls = ["http://www.pretersonjardin.com/annonces/toutes-les-annonces/Page-%s.html" % str(page)
                  for page in range(1, 100)]
    France = France()
    geo = France.geo

    def parse(self, response):
        """Yield one AdItem per table row on a listing page."""
        for sel in response.xpath('//tr'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    'td[@id="colonne4"]/div[@id="title_ad"]/a/text()').extract(
                    )[0].strip(' ').title()
            except IndexError:
                item['title'] = empty

            # The listing table has no image column.
            item['media'] = empty

            try:
                item['url'] = sel.xpath(
                    'td[@id="colonne4"]/div[@id="title_ad"]/a/@href').extract(
                    )[0]
            except IndexError:
                item['url'] = empty

            try:
                item['description'] = sel.xpath(
                    'td[@id="colonne4"]/div[@id="text_ad"]/a/text()').extract(
                    )[0]
            except IndexError:
                item['description'] = empty

            try:
                item['location'] = sel.xpath('td[@id="colonne3"]/text()'
                                             ).extract()[0].strip(' ').title()
            except IndexError:
                item['location'] = empty

            # Coordinates from the city geo table; unknown cities yield "".
            try:
                item['latitude'] = self.geo[item['location']]['lat']
            except KeyError:
                item['latitude'] = empty
            try:
                item['longitude'] = self.geo[item['location']]['lon']
            except KeyError:
                item['longitude'] = empty

            # The site lists no prices.
            item['price'] = empty
            item['currency'] = empty

            try:
                item['period'] = sel.xpath(
                    'td[@id="colonne5"]/div/text()').extract()[0]
            except IndexError:
                item['period'] = empty
            item['postal_code'] = empty
            item['evaluations'] = empty
            yield item
# --- Example 13 (separator from the original code-sharing page) ---
class OuicarSpider(scrapy.Spider):
    """Scrape peer-to-peer car rental ads from ouicar.fr, per city and page."""

    name = 'ouicar'
    category = 'moving'
    subcategory = "car"
    allowed_domains = ["http://www.ouicar.fr"]
    France = France()
    cities = France.cities

    # One search URL per city, paginated over pages 0..99.
    start_urls_0 = ["http://www.ouicar.fr/car/search?where=" + str(city)
                    for city in cities]
    start_urls = [
        url + "&page=" + str(x) for url in start_urls_0 for x in range(100)
    ]

    def parse(self, response):
        """Yield one AdItem per car row on a search result page."""
        for sel in response.xpath('//tr[@data-dpt]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                # Title is the car model plus its sub-heading.
                item['title'] = sel.xpath('td/div/a/h3/text()').extract(
                )[0] + sel.xpath('td/div/a/h3/small/text()').extract()[0]
            except IndexError:
                item['title'] = empty

            try:
                item['media'] = "https:" + sel.xpath(
                    'td/span/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty

            try:
                item['url'] = sel.xpath('td/div/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty

            try:
                desc0 = sel.xpath(
                    'td/div/p[@class="ZAuto_content"]/text()').extract()[0]
                desc1 = sel.xpath(
                    'td/div/div[@class="z-car-search-livraison"]/text()'
                ).extract()[0]
                item['description'] = desc0 + "\n" + desc1
            except IndexError:
                item['description'] = empty

            # Location and coordinates come as data-* attributes on the row.
            try:
                item['location'] = sel.xpath('@data-city').extract()[0]
            except IndexError:
                item['location'] = empty
            try:
                item['latitude'] = sel.xpath('@data-lat').extract()[0]
            except IndexError:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('@data-lng').extract()[0]
            except IndexError:
                item['longitude'] = empty

            try:
                # Python-2 specific: bytes.split('€') raises TypeError on Py3.
                item['price'] = sel.xpath('td[2]/p/text()').extract(
                )[0].encode('utf-8').split('€')[0].strip('\n\t')
                item['currency'] = '€'
            except (IndexError, TypeError):
                item['price'] = empty
                item['currency'] = empty

            item['period'] = "jour"

            try:
                res = sel.xpath(
                    'td/div/p[@class="ZAuto_location"]/text()').extract()[0]
                # searchZip is a project helper with unknown failure modes.
                item['postal_code'] = searchZip(res)
            except Exception:
                item['postal_code'] = empty
            try:
                item['evaluations'] = sel.xpath(
                    'td/div/a/h3/small[@class="ZAuto_title_ratings"]/text()'
                ).extract()[0].strip('( )')
            except IndexError:
                item['evaluations'] = empty
            yield item
class ZilokSpider(scrapy.Spider):
    """Scrape rental ads from the Zilok search API, one query per French city."""
    name = "zilok"
    category = "daily"
    subcategory = "brico"
    allowed_domains = ["http://www.zilok.fr"]
    France = France()
    cities = France.geo  # mapping: city name -> {"lat": ..., "lon": ...}
    start_urls = []
    # One API search URL per city, centred on the city's coordinates.
    for k, v in cities.items():
        url = "http://fr.zilok.com/apiv2/index.php/item/search/api/?action=item.search&api_key=akaka12JHKLAs455saasasa54sJLJLA&distance=15000&language=2&lat=" + str(
            v["lat"]) + "&limit=1000&lng=" + str(
                v["lon"]) + "&real_search=1&where=" + k
        start_urls.append(url)
    #start_urls = list(map(lambda x:"http://fr.zilok.com/apiv2/index.php/item/search/api/?action=item.search&api_key=akaka12JHKLAs455saasasa54sJLJLA&distance=15000&language=2&lat=%s&limit=30&lng=%s&real_search=1&where=%s"%(_geo[x]["lat"], _geo[x]["lon"], x), cities))
    # Parenthesised form parses under both Python 2 and Python 3.
    print(start_urls)

    def parse(self, response):
        """Yield one AdItem per <item> element of the XML API response.

        Extraction is best-effort: a missing node leaves the field set to
        the empty string instead of discarding the whole item.  Only
        ``Exception`` is caught so Ctrl-C / shutdown signals still work.
        """
        for sel in response.xpath('//item[@id]'):
            item = AdItem()
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            empty = ""
            try:
                item['title'] = sel.xpath('title/text()').extract()[0]
            except Exception:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('image/palm/@url').extract()[0]
            except Exception:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('link/text()').extract()[0]
            except Exception:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('subtitle/text()').extract()[0]
            except Exception:
                item['description'] = empty
            try:
                item['location'] = sel.xpath(
                    'location/locality/text()').extract()[0]
            except Exception:
                item['location'] = empty
            try:
                item['postal_code'] = sel.xpath(
                    'location/postal_code/text()').extract()[0]
            except Exception:
                item['postal_code'] = empty
            # Fall back to the search-level coordinates when the per-item
            # value is missing or degenerate (a single character).
            try:
                item['latitude'] = sel.xpath('location/lat/text()').extract(
                )[0] if len(sel.xpath('location/lat/text()').extract(
                )[0]) > 1 else sel.xpath('/search/lat/text()').extract()[0]
            except Exception:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('location/lng/text()').extract(
                )[0] if len(sel.xpath('location/lng/text()').extract(
                )[0]) > 1 else sel.xpath('/search/lng/text()').extract()[0]
            except Exception:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath('price/text()').extract()[0]
            except Exception:
                item['price'] = empty
            try:
                item['currency'] = sel.xpath('price/@currency').extract()[0]
            except Exception:
                item['currency'] = empty
            try:
                item['evaluations'] = sel.xpath(
                    'evaluation_number/text()').extract()[0]
            except Exception:
                item['evaluations'] = empty
            item['period'] = "jour"
            yield item
class EzilizeSpider(scrapy.Spider):
    """Scrape rental ads from ezilize.fr, one listing page per category."""
    name = "ezilize"
    # Site category slug -> internal (category, subcategory) pair.
    categories = {
        "bricolage": {
            "category": "daily",
            "subcategory": "brico"
        },
        "evenements": {
            "category": "meet",
            "subcategory": "events"
        },
        "mode-vetements": {
            "category": "daily",
            "subcategory": "dressing"
        },
        "sports-loisirs": {
            "category": "leisure",
            "subcategory": "sport"
        },
        "vehicules": {
            "category": "moving",
            "subcategory": "car"
        }
    }
    allowed_domains = ["https://ezilize.fr"]

    France = France()
    cities = France.cities
    start_urls_0 = list(
        map(lambda x: "https://ezilize.fr/location/" + str(x), categories))

    # First 10 result pages of every category listing.
    start_urls = [
        url + "?p=" + str(x) for url in start_urls_0 for x in range(10)
    ]

    def parse(self, response):
        """Yield one AdItem per ad card; missing fields default to ''."""
        for sel in response.xpath('//div[@itemtype]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            # The category slug is the last path segment of the page URL.
            category = response.url.split('?')[0].split('/')[-1]

            item['category'] = self.categories[category]["category"]
            item['subcategory'] = self.categories[category]["subcategory"]

            try:
                item['title'] = sel.xpath(
                    'div/div[@class="nsadtitle"]/text()').extract()[0]
            except Exception:
                item['title'] = empty
            try:
                item['media'] = "https:" + sel.xpath(
                    'div/div/img/@src').extract()[0]
            except Exception:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div[@class="nsadprice"]/div/a/@href').extract()[0]
            except Exception:
                item['url'] = empty
            try:
                item['description'] = sel.xpath(
                    'div/div[@class="nsadsub"]/text()').extract()[0]
            except Exception:
                item['description'] = empty
            try:
                item['location'] = sel.xpath(
                    'div[2]/div[3]/span[2]/text()').extract()[0]
            except Exception:
                item['location'] = empty
            # BUGFIX: the original read ``item['location'].test`` — a
            # nonexistent attribute — so the AttributeError always fired
            # and wiped the location as well.  Parse the postal code from
            # the "<zip> - <city>" location string in its own try block so
            # a parse failure no longer discards the location.
            try:
                item['postal_code'] = int(item['location'].split(' - ')[0])
            except Exception:
                item['postal_code'] = 0

            item['latitude'] = empty
            item['longitude'] = empty
            try:
                item['price'] = sel.xpath(
                    'div[@class="nsadprice"]/div[@class="nsofferamount"]/text()'
                ).extract()[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['currency'] = empty
            item['period'] = "jour"
            item['evaluations'] = empty
            yield item
Beispiel #16
0
class WimduSpider(scrapy.Spider):
    """Scrape apartment listings from wimdu.fr, one search page per city."""
    name = "wimdu"
    category = "housing"
    subcategory = "apartment"
    allowed_domains = ["http://www.wimdu.fr"]
    # scrap by cities
    France = France()
    cities = France.cities
    start_urls_0 = list(map(lambda x: "http://www.wimdu.fr/" + str(x), cities))
    # First 10 result pages of every city search.
    start_urls = [
        url + "?page=" + str(x) for url in start_urls_0 for x in range(10)
    ]

    def parse(self, response):
        """Yield one AdItem per result row; missing fields default to ''."""
        for sel in response.xpath("//ul[@id='results']/li"):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    "div/div[2]/div[@class='offer__details']/h3/a/text()"
                ).extract()[0]
            except Exception:
                item['title'] = empty

            try:
                item['media'] = sel.xpath(
                    'div/div/a/img[2]/@data-src').extract()[0]
            except Exception:
                item['media'] = empty

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div/div/a/@href').extract()[0]
            except Exception:
                item['url'] = empty

            try:
                item['description'] = sel.xpath(
                    "div/div[2]/div[@class='offer__details']/div[@class='offer__description']/text()"
                ).extract()[0]
            except Exception:
                item['description'] = empty

            try:
                item['location'] = sel.xpath(
                    "div/div[2]/div[@class='offer__details']/div[@class='offer__subtitle']/text()"
                ).extract()[0]
            except Exception:
                item['location'] = empty

            item['latitude'] = empty
            item['longitude'] = empty
            # Always populate the full AdItem schema, matching the other
            # spiders in this file (these fields were previously unset).
            item['postal_code'] = empty
            item['evaluations'] = empty

            try:
                item['price'] = sel.xpath(
                    "div/div[2]/div[@class='price price--mini js-price-per-night']/div/text()[2]"
                ).extract()[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath(
                    "div/div[2]/div[@class='price price--mini js-price-per-night']/div[2]/text()"
                ).extract()[0]
            except Exception:
                item['period'] = empty

            yield item
Beispiel #17
0
class OwlcampSpider(scrapy.Spider):
    """Scrape garden-camping offers from owlcamp.com (paginated listing)."""
    name = "owlcamp"
    category = "housing"
    subcategory = "camping"
    allowed_domains = ["http://owlcamp.com"]
    France = France()
    geo = France.geo  # mapping: city name -> {"lat": ..., "lon": ...}
    start_urls = list(
        map(lambda x: "http://owlcamp.com/fre/gardens/all/page:%s" % str(x),
            range(2, 15)))
    start_urls.append("http://owlcamp.com/fre/gardens/all")

    def parse(self, response):
        """Yield one AdItem per garden card; missing fields default to ''."""
        for sel in response.xpath('//div[@class="garden-card"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    'div[@class="garden-card__location"]/text()').extract(
                    )[0].strip(' \n')
            except Exception:
                item['title'] = empty
            try:
                item['media'] = self.allowed_domains[0] + sel.xpath(
                    'a[@rel]/img/@src').extract()[0]
            except Exception:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'a[@rel]/@href').extract()[0]
            except Exception:
                item['url'] = empty
            try:
                item['description'] = sel.xpath(
                    'div[@class="garden-card__location"]/text()').extract(
                    )[0].strip(' \n')
            except Exception:
                item['description'] = empty
            try:
                item['location'] = sel.xpath(
                    'div[@class="garden-card__location"]/text()').extract(
                    )[0].strip(' \n')
            except Exception:
                item['location'] = empty
            # Coordinates come from the local geo table, keyed by location.
            try:
                item['latitude'] = str(self.geo[item['location']]['lat'])
            except Exception:
                item['latitude'] = empty
            try:
                item['longitude'] = str(self.geo[item['location']]['lon'])
            except Exception:
                item['longitude'] = empty

            try:
                price = sel.xpath('div[@class="garden-card__price"]/div/text()'
                                  ).extract()[0].strip(' ')
                if price == "gratuit":
                    item['price'] = 0
                    # BUGFIX: the free-of-charge branch previously left
                    # 'period' unset, producing items with a missing key.
                    item['period'] = empty
                else:
                    item['price'] = price.split('/')[0]
                    item['period'] = price.split('/')[-1]
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['period'] = empty
                item['currency'] = empty
            item['postal_code'] = empty
            item['evaluations'] = empty
            yield item
Beispiel #18
0
class CookeningSpider(scrapy.Spider):
    """Scrape shared-meal offers from cookening.com, one page per city."""
    name = "cookening"
    category = "eating"
    subcategory = "meals"
    allowed_domains = ["https://www.cookening.com"]
    # scrap by cities
    France = France()
    cities = France.cities
    start_urls = list(
        map(lambda x: "https://www.cookening.com/fr/explore/" + str(x),
            cities))
    geo = France.geo  # mapping: city name -> {"lat": ..., "lon": ...}

    def parse(self, response):
        """Yield one AdItem per meal card; missing fields default to ''."""
        for sel in response.xpath("//ul[@id='MealCards']/li"):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    "a/div[@id='myCarouselGroup']/div[@class='Title myCarousel']/div[@class='Info']/h3/text()"
                ).extract()[0]
            except Exception:
                item['title'] = empty

            try:
                item['media'] = sel.xpath('a/div/img/@src').extract()[0]
            except Exception:
                item['media'] = empty

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'a/@href').extract()[0]
            except Exception:
                item['url'] = empty

            # Description = host name + short bio, joined with a space.
            try:
                desc0 = sel.xpath(
                    "a/div[@class='Host']/span[@class='Name']/text()").extract(
                    )[0]
                desc1 = sel.xpath(
                    "a/div[@class='Host']/span[@class='Bio']/text()").extract(
                    )[0]
                item['description'] = desc0 + " " + desc1
            except Exception:
                item['description'] = empty

            try:
                item['location'] = sel.xpath(
                    "a/div[2]/div[2]/div/span[@class='Place']/text()").extract(
                    )[0]
            except Exception:
                item['location'] = empty
            item['postal_code'] = 0

            # Coordinates come from the local geo table, keyed by location.
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except Exception:
                item['latitude'] = empty

            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except Exception:
                item['longitude'] = empty

            try:
                item['price'] = sel.xpath(
                    "a/div[2]/div[2]/div/span[@class='Contribution']/strong/text()"
                ).extract()[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath(
                    "a/div[2]/div[2]/div/span[@class='Contribution']/span/text()"
                ).extract()[0]
            except Exception:
                item['period'] = empty
            item['evaluations'] = empty
            yield item
class SailsharingSpider(scrapy.Spider):
    """Scrape camping-car rentals from wikicampers.fr, one page per city."""
    name = "wikicampers"
    category = "moving"
    subcategory = "camping car"
    allowed_domains = ["http://www.wikicampers.fr"]
    # scrap zilok by categories
    France = France()
    geo = France.geo  # mapping: city name -> {"lat": ..., "lon": ...}
    cities = geo.keys()
    start_urls = list(
        map(
            lambda x:
            "http://www.wikicampers.fr/annonces-location-camping-car/" + str(
                x), cities))

    def parse(self, response):
        """Yield one AdItem per ad card; missing fields default to ''."""
        for sel in response.xpath('//div[@class="annonces"]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    'div/h3/a/text()').extract()[0].strip("\n ")
            except Exception:
                item['title'] = empty
            try:
                item['media'] = self.allowed_domains[0] + sel.xpath(
                    'div/a/img/@src').extract()[0].split('..')[-1]
            except Exception:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div/a/@href').extract()[0]
            except Exception:
                item['url'] = empty
            try:
                item['evaluations'] = int(
                    sel.xpath('div/h3/a[2]/text()[2]').extract()[0].strip(' '))
            except Exception:
                item['evaluations'] = empty
            try:
                item['description'] = sel.xpath(
                    'div[@class="grid_inner annonce"]/p/text()').extract(
                    )[0].strip("\n ")
            except Exception:
                item['description'] = empty
            try:
                item['location'] = sel.xpath(
                    'div/div[@class="city"]/text()').extract()[0].strip("\n ")
            except Exception:
                item['location'] = empty

            # Coordinates come from the local geo table, keyed by location.
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except Exception:
                item['latitude'] = empty

            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except Exception:
                item['longitude'] = empty
            # Price text looks like "<amount> € / <period>"; keep the amount.
            try:
                item['price'] = sel.xpath('div/span/text()').extract(
                )[0].strip("\n ").split(' ')[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath(
                    'div/span/text()').extract()[0].strip("\n ").split(' ')[-1]
            except Exception:
                item['period'] = empty
            item['postal_code'] = empty
            yield item
class MobyparkSpider(scrapy.Spider):
    """Scrape parking-space offers from the Mobypark JSON API, per city."""
    name = "mobypark"
    category = "storing"
    subcategory = "space"
    allowed_domains = ["http://www.mobypark.fr"]
    France = France()
    cities = France.cities
    start_urls = list(
        map(
            lambda x:
            "https://www.mobypark.fr/api/offers?format=json?distance=15&radius=15&q="
            + str(x), cities))

    def parse(self, response):
        """Yield one AdItem per offer in the JSON payload.

        Missing fields default to '' (0 for the postal code), and only
        ``Exception`` is caught so shutdown signals still propagate.
        """
        jsonresponse = json.loads(response.body_as_unicode())
        result = jsonresponse["result"]
        # ``in`` replaces the Python-2-only dict.has_key().
        if 'offers' in result:
            results = result["offers"]

            for sel in results:
                item = AdItem()
                empty = ""
                item['source'] = self.name
                item['category'] = self.category
                item['subcategory'] = self.subcategory

                try:
                    item['title'] = sel['car_park']['location'][
                        'formatted_address']
                except Exception:
                    item['title'] = empty
                try:
                    item['media'] = sel["car_park"]["first_picture"]["url"]
                    # Parenthesised debug print also parses under Python 3.
                    print(item['media'])
                except Exception:
                    item['media'] = empty
                try:
                    url_id = sel["car_park"]["id"]
                    item['url'] = self.allowed_domains[0] + "/carpark/" + str(
                        url_id) + "/show"
                except Exception:
                    item['url'] = empty
                try:
                    item['description'] = sel["car_park"]["description"]
                except Exception:
                    item['description'] = empty
                try:
                    item['location'] = sel['car_park']['location'][
                        'formatted_address']
                    item['postal_code'] = searchZip(item['location'])
                except Exception:
                    item['location'] = empty
                    item['postal_code'] = 0
                try:
                    item['latitude'] = sel['car_park']['location']['latitude']
                except Exception:
                    item['latitude'] = empty

                try:
                    item['longitude'] = sel['car_park']['location'][
                        'longitude']
                except Exception:
                    item['longitude'] = empty

                try:
                    item['price'] = sel["car_park"]["day_rate"]
                    item['currency'] = "€"
                except Exception:
                    item['price'] = empty
                    item['currency'] = empty

                try:
                    item['period'] = sel["car_park"]["minimal_duration"]
                except Exception:
                    item['period'] = empty

                item['evaluations'] = empty
                yield item
Beispiel #21
0
class yoopiesSpider(scrapy.Spider):
    """Scrape babysitting profiles from yoopies.fr, one search per city."""
    name = "yoopies"
    category = "daily"
    subcategory = "babysitting"
    allowed_domains = ["https://yoopies.fr"]
    # scrap by cities
    France = France()
    cities = France.cities

    start_urls = list(
        map(
            lambda x: "https://yoopies.fr/recherche-baby-sitting/results?c=" +
            str(x), cities))

    def parse(self, response):
        """Yield one AdItem per <article> result; missing fields -> ''."""
        for sel in response.xpath('//article'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath(
                    'a/div[2]/header/h1/text()').extract()[0]
            except Exception:
                item['title'] = empty

            try:
                item['media'] = sel.xpath(
                    'a/aside/figure/img/@src').extract()[0]
            except Exception:
                item['media'] = empty

            try:
                item['url'] = sel.xpath('a/@href').extract()[0]
            except Exception:
                item['url'] = empty

            try:
                item['description'] = sel.xpath(
                    'a/div[2]/p[@class="description"]/text()').extract(
                    )[0].strip('\n')
            except Exception:
                item['description'] = empty

            # Coordinates are exposed as data attributes on the anchor.
            try:
                item['latitude'] = sel.xpath('a/@data-latitude').extract()[0]
            except Exception:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('a/@data-longitude').extract()[0]
            except Exception:
                item['longitude'] = empty

            try:
                item['location'] = sel.xpath(
                    'a/aside/div[@class="user-city"]/text()').extract()[0]
            except Exception:
                item['location'] = empty

            # Pricing is not listed on the results page.
            item['postal_code'] = empty
            item['evaluations'] = empty
            item['price'] = empty
            item['currency'] = empty
            item['period'] = empty

            yield item
Beispiel #22
0
class HousetripSpider(scrapy.Spider):
    """Scrape garden-produce classifieds from cavientdujardin.com."""
    name = "cavientdujardin"
    category = "eating"
    subcategory = "vegetables"
    allowed_domains = ["http://www.cavientdujardin.com"]
    # Listing pages 1-9 of the classifieds index.
    start_urls = list(
        map(
            lambda x:
            "http://www.cavientdujardin.com/petites-annonces/0-0-0-0-%s.html" %
            str(x), range(1, 10)))
    France = France()
    geo = France.geo  # mapping: city name -> {"lat": ..., "lon": ...}

    def parse(self, response):
        """Yield one AdItem per classified row; missing fields -> ''."""
        for sel in response.xpath('//div[@class="LigneAnnonce"]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    'div[@class="ListDet"]/a[@class="ListTitre1"]/text()'
                ).extract()[0]
            except Exception:
                item['title'] = empty
            try:
                item['media'] = sel.xpath(
                    'div[@class="ListImg"]/img/@src').extract()[0]
            except Exception:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div[@class="ListDet"]/a/@href').extract()[0]
            except Exception:
                item['url'] = empty

            try:
                item['description'] = sel.xpath(
                    'div[@class="ListDet"]/a[@class="ListTitre"]/text()'
                ).extract()[0]
            except Exception:
                item['description'] = empty

            try:
                item['location'] = sel.xpath(
                    'div[@class="ListDet"]/span[@class="ville"]/text()'
                ).extract()[0]
            except Exception:
                item['location'] = empty
            item['postal_code'] = empty
            # Coordinates come from the local geo table, keyed by location.
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except Exception:
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except Exception:
                item['longitude'] = empty

            try:
                item['price'] = sel.xpath(
                    'div[@class="ListDet"]/span[@class="ListPrix"]/text()'
                ).extract()[0]
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath(
                    'div[@class="ListCol1"]/text()').extract()[0]
            except Exception:
                item['period'] = empty
            item['evaluations'] = empty
            yield item
Beispiel #23
0
class HousetripSpider(scrapy.Spider):
    """Scrape room rentals from fr.chambrealouer.com, one page per city.

    NOTE(review): this class shares its name with the cavientdujardin
    spider above; scrapy identifies spiders by the ``name`` attribute, so
    this works, but renaming the class would avoid shadowing.
    """
    name = "chambrealouer"
    category = "housing"
    subcategory = "room"
    allowed_domains = ["http://fr.chambrealouer.com"]
    France = France()
    cities = France.cities
    start_urls = list(map(lambda x: "http://fr.chambrealouer.com/location/FR-France/"+str(x), cities))

    def parse(self, response):
        """Yield one AdItem per result card; missing fields default to ''.

        Rewritten with uniform 4-space indentation: the original mixed
        tabs with a space-indented line, which silently moved the
        postal-code assignment out of its except block and breaks under
        Python 3's stricter tab/space rules.
        """
        for sel in response.xpath('//div[@class="rentResult ad-list-item"]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('div[@class="detail"]/img/@alt').extract()[0]
            except Exception:
                item['title'] = empty

            try:
                item['media'] = sel.xpath('div[@class="detail"]/img/@src').extract()[0]
            except Exception:
                item['media'] = empty

            try:
                item['url'] = sel.xpath('div[@class="detail"]/meta/@content').extract()[0]
            except Exception:
                item['url'] = empty

            try:
                item['description'] = sel.xpath('div[@class="detail"]/div/p/span/text()').extract()[0]
            except Exception:
                item['description'] = empty

            try:
                item['location'] = sel.xpath('//div[@class="rentResult ad-list-item"]/div[@class="detail"]/div/div[@itemprop="address"]/span[@class="location"]/span/text()').extract()[0]
            except Exception:
                item['location'] = empty
            item['postal_code'] = 0

            # BUGFIX: the coordinate lookups were unguarded, so a card
            # without geo metadata aborted the whole item with IndexError.
            try:
                item['latitude'] = sel.xpath('div[@class="detail"]/div/div[@itemprop="geo"]/meta[@itemprop="latitude"]/@content').extract()[0]
            except Exception:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('div[@class="detail"]/div/div[@itemprop="geo"]/meta[@itemprop="longitude"]/@content').extract()[0]
            except Exception:
                item['longitude'] = empty

            # Three price columns (e.g. night/week/month), joined as text.
            try:
                price0 = sel.xpath('table/tr[2]/td[1]/text()').extract()[0].encode('utf-8').strip('€')
                price1 = sel.xpath('table/tr[2]/td[2]/text()').extract()[0].encode('utf-8').strip('€')
                price2 = sel.xpath('table/tr[2]/td[3]/text()').extract()[0].encode('utf-8').strip('€')
                item['price'] = price0 + ", " + price1 + ", " + price2
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['currency'] = empty

            try:
                period0 = sel.xpath('table/tr/td[1]/text()').extract()[0]
                period1 = sel.xpath('table/tr/td[2]/text()').extract()[0]
                period2 = sel.xpath('table/tr/td[3]/text()').extract()[0]
                item['period'] = period0 + ", " + period1 + ", " + period2
            except Exception:
                item['period'] = empty
            item['evaluations'] = empty
            yield item
Beispiel #24
0
class BandbikeSpider(scrapy.Spider):
    """Scrape bike rentals from bandbike.com.

    The start URLs are resolved at import time: for every known city the
    bandbike city API is queried (via ``requests``) to obtain the search
    terms, then the first 9 result pages of each search are enqueued.
    """
    name = "bandbike"
    category = "moving"
    subcategory = "velo"
    allowed_domains = ["http://bandbike.com"]
    start_urls = []
    France = France()
    geo = France.geo  # mapping: city name -> {"lat": ..., "lon": ...}
    cities = geo.keys()
    for city in cities:
        url = "http://bandbike.com/ref/city/" + city
        # ROBUSTNESS: one unreachable city / bad payload previously raised
        # at import time and killed the whole module; skip it instead.
        try:
            req = requests.get(url=url)
            res = json.loads(req.text)
        except Exception:
            continue
        for r in res:
            url = "http://bandbike.com/ad/search?terms=%s+(%s)&searchCityId=%s" % (
                r['name'], r['zipcode'], r['id'])
            # range() replaces the Python-2-only xrange().
            urls = [url + "&currentPage=" + str(x) for x in range(1, 10)]
            start_urls += urls

    def parse(self, response):
        """Yield one AdItem per result row; missing fields default to ''."""
        for sel in response.xpath('//div[@class="row"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath("div/div/div/h4/text()").extract()[0]
            except Exception:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('div/div/div/img/@src').extract()[0]
            except Exception:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div/div/div/a/@href').extract()[0]
            except Exception:
                item['url'] = empty
            try:
                item['description'] = sel.xpath(
                    'div/div/div/div/div/h5/text()').extract()[0]
            except Exception:
                item['description'] = empty

            # Location and zip code are recovered from the request URL,
            # which has the shape ...?terms=<city>+(<zip>)&...
            item['location'] = response.url.split('terms=')[1].split('+')[0]
            item['postal_code'] = response.url.split('terms=')[1].split(
                '+')[1].split(')')[0].strip('(')
            item['evaluations'] = empty

            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except Exception:
                item['latitude'] = empty

            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except Exception:
                item['longitude'] = empty

            # Price text looks like "<amount> € / <period>".
            try:
                price = sel.xpath(
                    'div/div/div/div/div[3]/h5/text()').extract()[0].split('/')
                item['price'] = price[0].strip(' ').encode('utf-8').strip('€')
                item['period'] = price[1]
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['period'] = empty
                item['currency'] = empty
            yield item
class VizeatSpider(scrapy.Spider):
    """Scrape shared-meal events from fr.vizeat.com, one search per city."""
    name = "vizeat"
    category = "eating"
    subcategory = "meals"
    allowed_domains = ["https://fr.vizeat.com"]
    # scrap by cities
    France = France()
    cities = France.cities
    start_urls = list(
        map(lambda x: "https://fr.vizeat.com/events/search?q=" + str(x),
            cities))

    def parse(self, response):
        """Yield one AdItem per event box; missing fields default to ''."""
        for sel in response.xpath('//div[@class="itemInside event-box p15"]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('a/img/@title').extract()[0]
            except Exception:
                item['title'] = empty

            try:
                item['media'] = sel.xpath('a/img/@src').extract()[0]
            except Exception:
                item['media'] = empty

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div/div/h2/a/@href').extract()[0]
            except Exception:
                item['url'] = empty

            try:
                desc0 = sel.xpath(
                    'div[@class="dateHeureEvent"]/a/text()').extract()[0]
                item['description'] = desc0
            except Exception:
                item['description'] = empty

            try:
                item['location'] = sel.xpath(
                    'div[@class="author"]/div[@class="authorRight"]/a[2]/text()'
                ).extract()[0]
            except Exception:
                item['location'] = empty

            item['latitude'] = empty
            item['longitude'] = empty
            # Always populate the full AdItem schema, matching the other
            # spiders in this file (these fields were previously unset).
            item['postal_code'] = empty
            item['evaluations'] = empty

            try:
                item['price'] = sel.xpath("div/div[2]/div/text()").extract(
                )[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except Exception:
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath(
                    'div[@class="dateHeureEvent"]/a/text()').extract()[0]
            except Exception:
                item['period'] = empty

            yield item
class DrivySpider(scrapy.Spider):
    """Scrape Drivy car-rental listings.

    One search URL is built per city in the shared France.geo table
    (latitude/longitude come from that table) and 50 paginated result
    pages are requested per city.
    """
    name = "drivy"
    category = "moving"
    subcategory = "car"
    allowed_domains = ["https://www.drivy.com"]
    # scrape drivy search results city by city using the France geo table
    # (the original comment mentioned zilok — a copy/paste leftover)
    France = France()
    geo = France.geo
    urls = []
    for k, v in geo.items():
        url = "https://www.drivy.com/search?latitude=" + str(
            v["lat"]) + "&longitude=" + str(
                v["lon"]) + "&city_display_name=" + k + "&area_type=city"
        urls.append(url)
    # 50 result pages per city (Python 2: xrange)
    start_urls = [
        url + "&page=" + str(i) for url in urls for i in xrange(1, 51)
    ]

    def parse(self, response):
        """Yield one AdItem per car card (``div[@data-car-id]``).

        Each ``except`` lists only the exceptions its ``try`` body can
        actually raise (IndexError from ``extract()[0]``, KeyError from the
        geo-table lookup, ValueError/TypeError from ``float``), so genuine
        bugs are no longer silently swallowed by bare ``except:``.
        """
        for sel in response.xpath('//div[@data-car-id]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath(
                    "div[@class='search_card_content car_content']/a[@class='car_title']/@title"
                ).extract()[0]
            except IndexError:
                item['title'] = empty
            try:
                item['media'] = sel.xpath(
                    'div[@class="search_card_aside car_photo"]/img/@src'
                ).extract()[0]
            except IndexError:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath(
                    'div[@class="search_card_content car_content"]/a[@class="car_title"]/@href'
                ).extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath(
                    'div[@class="search_card_content car_content"]/div[@class="car_subtitle"]/text()'
                ).extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                item['location'] = sel.xpath(
                    'div[@class="search_card_content car_content"]/div[@class="car_location"]/text()[2]'
                ).extract()[0].strip('\n')
            except IndexError:
                # fall back to the city name embedded in the request URL
                item['location'] = response.url.split(
                    'city_display_name=')[1].split('&')[0]

            try:
                item['evaluations'] = float(
                    sel.xpath('div[2]/div[3]/div/span/text()').extract()[0])
            except (IndexError, ValueError):
                item['evaluations'] = empty

            item['postal_code'] = empty
            url_city = response.url.split('city_display_name=')[1].split(
                '&')[0]
            try:
                item['latitude'] = float(self.geo[url_city]['lat'])
            except (KeyError, ValueError, TypeError):
                item['latitude'] = empty

            try:
                item['longitude'] = float(self.geo[url_city]['lon'])
            except (KeyError, ValueError, TypeError):
                item['longitude'] = empty

            try:
                # Python 2: encode to utf-8 bytes so the multi-byte euro
                # sign can be stripped off the price text
                item['price'] = sel.xpath(
                    'div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/strong/text()'
                ).extract()[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except IndexError:
                item['price'] = empty
                item['currency'] = empty
            # prices on drivy are always per day; a constant assignment
            # cannot raise, so the original try/except around it was dead code
            item['period'] = "jour"

            yield item
class OuistockSpider(scrapy.Spider):
    """Scrape ouistock.fr storage-space listings.

    One search URL per city in the shared France.cities list, with up to
    100 paginated result pages requested per city.
    """
    name = "ouistock"
    category = "storing"
    subcategory = "space"
    allowed_domains = ["https://www.ouistock.fr"]
    # scrape by cities, 100 result pages each
    France = France()
    cities = France.cities

    start_urls_0 = list(map(lambda x: "https://www.ouistock.fr/s/" + str(x), cities))
    start_urls = [url + "?page=" + str(x) for url in start_urls_0 for x in range(100)]

    def parse(self, response):
        """Yield one AdItem per result card (``//ul[@id="results"]/li``).

        ``extract()[0]`` raises IndexError when the node is missing; only
        that exception is caught so other failures stay visible (the
        original bare ``except:`` clauses hid them).
        """
        for sel in response.xpath('//ul[@id="results"]/li'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/h3[@class="resultUserName"]/text()').extract()[0].strip('\n ')
            except IndexError:
                item['title'] = empty

            try:
                # src is protocol-relative ("//..."): prepend the scheme
                item['media'] = "https:" + sel.xpath('div[@class="resultContainer"]/div[@class="resultImgContainer"]/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty

            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div[@class="resultContainer"]/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty

            try:
                # description = storage type + usable-surface line
                desc0 = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultType"]/text()').extract()[0].strip('\n ')
                desc1 = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultUsefull"]/text()').extract()[0].strip('\n ')
                item['description'] = desc0 + " " + desc1
            except IndexError:
                item['description'] = empty

            try:
                # the city is the last word of the "useful surface" line
                item['location'] = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultUsefull"]/text()').extract()[0].strip('\n ').split(' ')[-1]
            except IndexError:
                item['location'] = empty

            # coordinates are not available on the search page
            item['latitude'] = empty
            item['longitude'] = empty

            try:
                # Python 2: encode to utf-8 bytes so the multi-byte euro
                # sign can be stripped along with whitespace
                item['price'] = sel.xpath('div[@class="resultContainer"]/div[@class="priceSpan"]/div[@class="innerSpan"]/i/text()').extract()[0].encode('utf-8').strip('\n €')
                item['currency'] = "€"
            except IndexError:
                item['price'] = empty
                item['currency'] = empty

            try:
                # same node as the price; strip leaves the billing period
                item['period'] = sel.xpath('div[@class="resultContainer"]/div[@class="priceSpan"]/div[@class="innerSpan"]/i/text()').extract()[0].strip("\n' /")
            except IndexError:
                item['period'] = empty

            yield item
class CostockageSpider(scrapy.Spider):
    """Scrape costockage.fr self-storage listings.

    One search URL per city in the shared France.cities list, 10 paginated
    result pages per city; latitude/longitude are resolved through the
    shared France.geo table keyed by the scraped location string.
    """
    name = "costockage"
    category = "storing"
    subcategory = "space"
    allowed_domains = ["https://www.costockage.fr"]
    France = France()
    cities = France.cities
    geo = France.geo
    start_urls_0 = list(
        map(
            lambda x:
            "https://www.costockage.fr/garde-meuble/%s-5&plus-proche=10" % str(
                x), cities))
    start_urls = [
        url + "&" + "page=" + str(x) for url in start_urls_0 for x in range(10)
    ]

    def parse(self, response):
        """Yield one AdItem per product card (schema.org/Product divs).

        Each ``except`` now names only the exceptions its body can raise
        (IndexError from ``extract()[0]``, ValueError from ``int``/``float``,
        KeyError from the geo lookup) instead of a bug-hiding bare except.
        """
        for sel in response.xpath(
                '//div[@itemtype="http://schema.org/Product"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('@title').extract()[0]
            except IndexError:
                item['title'] = empty
            try:
                item['media'] = sel.xpath(
                    'div[1]/div[@class="customer_name_search"]/p/img/@src'
                ).extract()[0]
            except IndexError:
                item['media'] = empty
            try:
                # the card's @id attribute carries the listing URL on this site
                item['url'] = sel.xpath('@id').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath(
                    'div[1]/div[@class="address"]/text()[2]').extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                # location text looks like "City - 75001": the postal code
                # follows the dash; if either step fails both fields reset
                item['location'] = sel.xpath(
                    'div[1]/div[@class="address"]/a/text()').extract()[0]
                item['postal_code'] = int(item['location'].split('- ')[1])
            except (IndexError, ValueError):
                item['location'] = empty
                item['postal_code'] = 0
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except (KeyError, ValueError, TypeError):
                item['latitude'] = empty

            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except (KeyError, ValueError, TypeError):
                item['longitude'] = empty

            try:
                # Python 2: encode to utf-8 bytes, then keep what precedes
                # the euro sign
                item['price'] = sel.xpath(
                    'div[3]/div[@class="price_div"]/div[@class="new_price"]/b/text()'
                ).extract()[0].encode('utf-8').split('€')[0]
                item['currency'] = "€"
            except IndexError:
                item['price'] = empty
                item['currency'] = empty

            try:
                item['period'] = sel.xpath(
                    'div[3]/div[@class="price_div"]/div[@class="new_price"]/text()[2]'
                ).extract()[0].strip('/')
            except IndexError:
                item['period'] = empty
            item['evaluations'] = empty
            yield item
class PrendsmaplaceSpider(scrapy.Spider):
    """Scrape prendsmaplace.fr parking-spot listings from the paginated
    directory search (pages 1-24).

    Fixes over the original: the block mixed tabs and spaces (a TabError
    under Python 3 and fragile under Python 2), and an unconditional
    ``item['longitude'] = empty`` clobbered the geo-table longitude that
    had just been looked up.
    """
    name = "prendsmaplace"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.prendsmaplace.fr"]
    # the first two digits of the postal code (the French departement number);
    # raw string so the regex escape survives intact
    pattern = re.compile(r'\d{2}')
    France = France()
    cities = France.cities
    geo = France.geo
    start_urls = list(map(lambda x: "http://www.prendsmaplace.fr/page/%s/?s&geo-radius=100&geo-lat&geo-lng&categories=0&locations=0&dir-search=yes" % str(x), range(1, 25)))

    def parse(self, response):
        """Yield one AdItem per listing (``//ul[@class="items"]/li``).

        Bare excepts are narrowed to what each body can raise: IndexError
        from ``extract()[0]``/``split``, KeyError from the geo lookup,
        AttributeError when the postal-code regex finds no match.
        """
        for sel in response.xpath('//ul[@class="items"]/li'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory

            try:
                item['title'] = sel.xpath('div[@class="description"]/h3/a/text()').extract()[0]
            except IndexError:
                item['title'] = empty
            try:
                # the city name is the last word before " (" in the title
                item['location'] = item['title'].split(' (')[0].split(' ')[-1]
            except IndexError:
                item['location'] = empty
            try:
                item['media'] = sel.xpath('div[@class="thumbnail"]/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty
            try:
                item['evaluations'] = sel.xpath('div[@class="thumbnail"]/div[@class="comment-count"]/text()').extract()[0]
            except IndexError:
                item['evaluations'] = empty

            try:
                item['url'] = sel.xpath('div[@class="description"]/h3/a/@href').extract()[0]
            except IndexError:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[@class="description"]/text()[3]').extract()[0]
            except IndexError:
                item['description'] = empty
            try:
                item['latitude'] = self.geo[item['location']]['lat']
            except KeyError:
                item['latitude'] = empty
            try:
                item['longitude'] = self.geo[item['location']]['lon']
            except KeyError:
                item['longitude'] = empty

            # price information is not exposed on the directory page
            item['price'] = empty
            item['currency'] = empty
            item['period'] = empty
            try:
                # re.search returns None on no match -> .group() raises
                # AttributeError
                item['postal_code'] = re.search(self.pattern, item['title']).group()
            except AttributeError:
                item['postal_code'] = empty
            yield item
Beispiel #30
0
class AirbnbSpider(scrapy.Spider):
    """Scrape Airbnb France listings, one search URL per city in the
    shared France.cities list with 10 paginated result pages each.

    ``subcategory`` is decided per item ("room" vs "apartment") from the
    listing description, so there is no class-level subcategory.

    Fixes over the original: the evaluations regex referenced an
    undefined module-level name ``pattern`` (the resulting NameError was
    silently eaten by a bare except, so evaluations was always 0); the
    block mixed tabs and spaces; and the unguarded latitude/longitude
    extraction could crash the whole parse on a malformed card.
    """
    name = "airbnb"
    category = "housing"
    allowed_domains = ["https://www.airbnb.com"]
    # scrape by cities, 10 result pages each
    France = France()
    cities = France.cities
    # first run of digits in the description is taken as the review count
    # NOTE(review): the original referenced an undefined ``pattern``; this
    # is the presumed intent — confirm against the description format
    pattern = re.compile(r'\d+')
    start_urls_0 = list(map(lambda x: "https://www.airbnb.fr/s/" + str(x), cities))
    start_urls = [url + "?page=" + str(x) for url in start_urls_0 for x in range(10)]

    def parse(self, response):
        """Yield one AdItem per listing card (``//div[@data-id]``)."""
        for sel in response.xpath('//div[@data-id]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category

            try:
                item['title'] = sel.xpath('@data-name').extract()[0]
            except IndexError:
                item['title'] = empty

            try:
                item['media'] = sel.xpath('div/a/div/img/@src').extract()[0]
            except IndexError:
                item['media'] = empty

            try:
                # drop the query string from the listing URL
                item['url'] = self.allowed_domains[0] + sel.xpath('@data-url').extract()[0].split('?')[0]
            except IndexError:
                item['url'] = empty

            try:
                item['description'] = sel.xpath('div[2]/div/div[@itemprop="description"]/a/text()').extract()[0]
            except IndexError:
                # fall back to the card title when no description node exists
                try:
                    item['description'] = sel.xpath('@data-name').extract()[0]
                except IndexError:
                    item['description'] = empty

            # "Chambre" in the text marks a private room, otherwise a whole
            # apartment
            if "Chambre" in item['description']:
                item['subcategory'] = "room"
            else:
                item['subcategory'] = "apartment"

            item['evaluations'] = 0
            find = re.search(self.pattern, item['description'])
            if find:
                item['evaluations'] = int(find.group())

            try:
                item['latitude'] = sel.xpath('@data-lat').extract()[0]
            except IndexError:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('@data-lng').extract()[0]
            except IndexError:
                item['longitude'] = empty

            try:
                # the city slug follows ".../s/" in the request URL
                # (Python 2: urllib.unquote)
                item['location'] = urllib.unquote(response.url.split('?')[0].split('s/')[-1])
            except IndexError:
                item['location'] = empty
            item['postal_code'] = 0

            try:
                item['price'] = sel.xpath('div/a[2]/div/span/text()').extract()[0]
                item['currency'] = "€"
            except IndexError:
                item['price'] = empty
                item['currency'] = empty

            # airbnb prices are always per night
            item['period'] = "nuit"

            yield item