def parse_detail(self, response):
    """Build one ``RealestateItem`` from a place-profile detail page.

    Most fields are read by position from a few repeated page widgets.
    Each shared XPath is evaluated once and the resulting list is indexed
    for every field that depends on it.

    Raises:
        IndexError: if the page exposes fewer entries than one of the
            positional lookups needs; no item is yielded in that case.
    """
    # Texts of the "scalar__value" widgets (direct <span> children only).
    scalar_values = response.xpath(
        '//div[@class="scalar__value"]/span/text()').extract()
    # Texts of every <div> below the "ordered__list__bucket" lists.
    bucket_texts = response.xpath(
        '//ol[@class="ordered__list__bucket"]//div/text()').extract()
    # Texts of the national breakdown list.
    breakdown_texts = response.xpath(
        '//ul[@class="breakdown-facts breakdown-facts--national"]//div/text()'
    ).extract()
    review_spans = '//div[@class="review__stars review__stars--white"]//span'

    item = RealestateItem()
    item['State'] = response.xpath(
        '//div[@class="profile__bucket--3"]//span/text()').extract()[0]
    item['City'] = response.xpath(
        '//div[@class="blank__bucket"]//a/text()').extract()[0]
    item['Median_Home_Value'] = scalar_values[1]
    item['Median_Rent'] = scalar_values[2]
    # Population deliberately keeps its own query: it matches any
    # descendant <span>, not just direct children.
    item['Population'] = response.xpath(
        '//div[@class="scalar__value"]//span/text()').extract()[0]
    item['Area_Feel'] = scalar_values[3]
    item['Crime_Safty'] = bucket_texts[3]
    item['Diversity'] = bucket_texts[11]
    item['PublicSchool_level'] = bucket_texts[1]
    item['Children_percent'] = scalar_values[5]
    item['Master_Degree'] = breakdown_texts[1]
    item['Bachelor_Degree'] = breakdown_texts[4]
    item['Associate_Degree'] = breakdown_texts[7]
    item['Jobs_Level'] = bucket_texts[13]
    item['Median_salary'] = scalar_values[4]
    item['Cost_of_Living'] = bucket_texts[17]
    item['Weather'] = bucket_texts[15]
    item['NightLife'] = bucket_texts[7]
    item['Outdoor_Activity'] = bucket_texts[21]
    # The two review fields are stored as full lists, not single values.
    item['Numberof_reviews'] = response.xpath(
        review_spans + '/text()').extract()
    item['Review_ratings'] = response.xpath(
        review_spans + '/@class').extract()
    yield item
def parse(self, response):
    """Parse one page of the Nexity JSON search API and follow pagination.

    Yields one ``RealestateItem`` per entry of ``results``. While the
    reported ``current`` page is below ``pageCount``, it also yields a
    ``Request`` for the next page, handled by this same callback.

    A listing that fails part-way (for example on a missing key) is still
    yielded with the fields filled in before the failure, and the error is
    printed.
    """
    json_data = json.loads(response.body)
    for data in json_data['results']:
        item = RealestateItem()
        try:
            item['online'] = 1
            item['website'] = self.name
            item['website_logo'] = (
                'https://media2.nexity.fr/nfr2016/picto/nexity-logo.svg')
            listing = data['0']
            item['url'] = 'https://www.nexity.fr/' + listing['url']
            item['pieces'] = listing['nb_piece']
            item['description'] = listing['description'].replace('<br>', '')
            item['title'] = listing['visuel_alt']
            if 'surface' in listing:
                item['size'] = listing['surface']
            item['rent_buy'] = 'rent' if 'location' in item['url'] else 'buy'
            item['type'] = 'appartment'
            item['city'] = listing['ville'].lower()
            item['district'] = listing['code_postal']
            item['price'] = data['c_prix_min']
            item['images'] = ','.join(
                photo['direct'] for photo in listing['photos'])
            item['deposit'] = listing['depot_garantie']
            if 'etage' in listing:
                item['floor'] = listing['etage']
        except Exception as e:
            # Best effort: keep the partially filled item. str() guards
            # against exceptions whose first argument is not a string, or
            # that carry no arguments at all.
            print("err: " + str(e.args[0] if e.args else e))
        self.count += 1
        print("Total Count: " + str(self.count))
        yield item

    total = int(json_data['pagination']['pageCount'])
    current = int(json_data['pagination']['current'])
    if current < total:
        next_page = current + 1
        next_page_url = 'https://www.nexity.fr/ws-rest/offre/biens/moteur.json?1=1&types_bien=Appartement,Maison/Villa&type_commercialisation=Location&pageNumber={}&pageSize=12&anciennete=0&locations=104&sortField=prix&sortOrder=asc&withPartners=1'.format(
            next_page)
        yield Request(next_page_url, self.parse)
def final_parse(self, response):
    """Build a rental ``RealestateItem`` from a JSON listing payload.

    The response body is decoded as JSON and its fields are copied onto
    the item. Optional values (description, room count, images) are only
    set when the payload provides them.
    """
    json_data = json.loads(response.body)
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://dc0r5opm7495b.cloudfront.net/assets/logos/logo_white.fr-d5e56db342eda1a81e02b633d1a339a708e5ed1d823ffa8bdd16db6eab5cc405.png'
    item['url'] = json_data['full_url']

    desc = json_data['description']
    if desc:
        item['description'] = desc.replace('<br/>', '\n')

    item['title'] = json_data['listing_title_string']
    item['price'] = json_data['cost_total_rent']
    item['size'] = json_data['lodging_surface']
    item['type'] = json_data['lodging_type_string']
    item['deposit'] = json_data['cost_caution']
    item['other_charges'] = json_data['cost_charges']
    item['city'] = json_data['address_city']

    address_list = [json_data['address_city']]
    if json_data['address_street']:
        address_list.append(json_data['address_street'])
    item['address'] = ' '.join(address_list)

    # First numeric token of the size string is used as the piece count.
    # ``or ''`` keeps a null size string from raising in re.findall.
    pieces = re.findall(r'[\d.,]+', json_data['lodging_size_string'] or '')
    if pieces:
        item['pieces'] = pieces[0]

    # Prefer the large rendition of each picture, then the medium one.
    images = []
    for img in json_data['pictures']:
        if 'image_large' in img:
            images.append(img['image_large'])
        elif 'image_medium' in img:
            images.append(img['image_medium'])
    if images:
        item['images'] = ','.join(images)

    item['rent_buy'] = 'rent'
    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a ``RealestateItem`` from a location-etudiant.fr detail page.

    Description, title, price, address and images come from the HTML.
    Postal code and city are cut out of the raw page text following the
    ``"postalCode":`` / ``"addressLocality":`` markers, and are only set
    when the marker is present.
    """
    # Decoded text rather than str(response.body): on Python 3 the latter
    # is the bytes repr, which escapes every non-ASCII character.
    page = response.text

    def value_after(marker):
        # Text between the last `marker` and the next comma, quotes removed.
        if marker not in page:
            return None
        return page.split(marker)[-1].split(',')[0].replace('"', '')

    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://en.location-etudiant.fr/images/logo.png'
    item['url'] = response.url
    item['description'] = response.xpath(
        '//p[@itemprop="description"]/text()').extract_first()
    item['title'] = response.xpath(
        '//h1[@itemprop="name"]/text()').extract_first()

    price = response.xpath('//div[@class="aPartirDe"]/span/text()').re(
        r'[\d.,]+')
    if price:
        item['price'] = ''.join(price).replace(',', '.')

    district = value_after('"postalCode":')
    if district is not None:
        item['district'] = district
    city = value_after('"addressLocality":')
    if city is not None:
        # Keep only the first word of the locality.
        item['city'] = city.strip().split(' ')[0]

    item['address'] = response.xpath(
        '//span[@itemprop="addressLocality"]/text()').extract_first()

    # Thumbnails are site-relative; request the larger rendition instead.
    thumbnails = response.xpath(
        '//div[@class="photoVignette"]/img/@src').extract()
    item['images'] = ','.join(
        'https://www.location-etudiant.fr'
        + src.replace('h=81&w=81', 'h=410&w=525')
        for src in thumbnails)

    item['rent_buy'] = 'rent' if 'location' in response.url else 'buy'
    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def parse(self, response):
    """Yield one item holding the listing name, price text and page URL.

    Both text fields are the concatenation of every matching text node,
    so they are empty strings when nothing matches.
    """
    sel = Selector(response)
    detail = '//section[@id="_mg_listing_detail"]'

    item = RealestateItem()
    item['name'] = ''.join(sel.xpath(detail + '//h1/text()').extract())
    # The original expression began with '///', which is not valid XPath
    # ('//' must be followed by a step); '//' is what was intended.
    item['price'] = ''.join(
        sel.xpath(detail + '/div/div[2]/div[2]/p[3]/text()').extract())
    item['url'] = response.url
    yield item
def parse_items(self, response):
    """Yield one item per result card (``article`` with a ``resultBody`` class).

    Kept as the default parse callback; per the original note, the crawl
    rule is not useful at the moment.
    """
    self.logger.info('Item Page %s', response.url)

    # (item field, XPath relative to the result card), in assignment order.
    field_paths = (
        ('url', './/a[contains(@rel,"listingName")]/@href'),
        ('address', './/a[contains(@rel,"listingName")]/text()'),
        ('priceText', './/p[@class="priceText"]/text()'),
        ('bedrooms', './/dd[1]/text()'),
        ('bathrooms', './/dd[2]/text()'),
        ('cars', './/dd[3]/text()'),
    )

    cards = response.xpath('.//article[contains(@class,"resultBody")]')
    for card in cards:
        item = RealestateItem()
        for field, path in field_paths:
            item[field] = card.xpath(path).extract_first()
        yield item
def final_parse(self, response):
    """Build a ``RealestateItem`` from a paruvendu.fr listing detail page.

    Title, price, location, size and piece count come from the page
    heading. Charges, deposit and fees come from the price detail rows.
    Agency data, the furnished flag and the floor number come from their
    own blocks. Optional fields are only set when found on the page.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://static.paruvendu.fr/2018073108/communfo/img/structuresite/home/logoparuvendufr2016.png'
    item['url'] = response.url
    item['description'] = ''

    heading = '//h1[@class="auto2012_dettophead1txt1"]'
    title = ' '.join(response.xpath(heading + '//text()').extract())
    item['title'] = title.replace('\n', '').replace('\r', '').strip()

    price = response.xpath('//div[@id="autoprix"]/text()').re(r'[\d.,]+')
    if price:
        item['price'] = ''.join(price).replace(',', '.')

    # The property type is taken from the URL path (third segment from
    # the end); the heading <span> lookup used before was always overwritten.
    item['type'] = response.url.split('/')[-3]

    # Last direct text node of the heading; empty when the heading is
    # missing (previously an IndexError).
    heading_texts = response.xpath(heading + '/text()').extract()
    addr_text = heading_texts[-1].strip() if heading_texts else ''
    if addr_text:
        addr_lines = addr_text.split('\n')
        item['city'] = addr_lines[-1].strip().split(' ')[0]
        district = re.findall(r'[\d]+', addr_text)
        # A single number is assumed to be the size, so a district is only
        # taken when there is more than one -- TODO confirm against the site.
        if len(district) > 1:
            item['district'] = int(district[-1])

    thumbnails = response.xpath(
        '//div[@class="imdet15-ConteneurMiniGlob"]//img/@src').extract()
    if thumbnails:
        item['images'] = ','.join(
            src.replace('88x88', '1000x1000') for src in thumbnails)

    desc = ''.join(
        response.xpath(
            '//div[@class="im12_txt_ann im12_txt_ann_auto"]/text()'
        ).extract()).strip()
    if desc != '':
        item['description'] = desc

    if addr_text:
        first_word = addr_text.split('\n')[0].strip().split(' ')[0]
        size = re.findall(r'[\d.,]+', first_word)
        if size:
            item['size'] = size[0]

    pieces = response.xpath(heading + '/strong/text()').re(r'[\d]+')
    if pieces:
        item['pieces'] = pieces[0]

    for option in response.xpath('//div[@class="im11_hd_det"]'):
        # A row without a <span> label yields None; treat it as no label.
        key = option.xpath('./span/text()').extract_first() or ''
        amount = option.xpath('./strong/text()').re(r'[\d.,]+')
        if not amount:
            continue
        if 'Dont charges/mois' in key:
            item['other_charges'] = amount[0]
        elif 'Dépôt de garantie' in key:
            item['deposit'] = ''.join(amount)
        elif 'Honoraires' in key:
            item['agency_fee'] = ''.join(amount)

    agency_name = response.xpath(
        '//div[@class="contact16-lheig"]/strong/text()').extract_first()
    if agency_name:
        item['agency_name'] = agency_name.strip().replace('\n', ' ')

    # Attribute name fixed from "@clas", which could never match.
    agency_logo = response.xpath(
        '//div[@class="im11_blc_visite_R"]/a/img/@src').extract_first()
    if agency_logo:
        item['agency_logo'] = agency_logo

    agency_address = response.xpath(
        '//div[@class="contact16-lheig contact16-lname"]/span/text()'
    ).extract_first()
    if agency_address:
        item['agency_address'] = agency_address.strip()
    else:
        address_lines = response.xpath(
            '//div[@class="contact16-adr"]/text()').extract()
        if address_lines:
            item['agency_address'] = '\n'.join(address_lines)

    for li_attr in response.xpath('//ul[@class="imdet15-infoscles"]/li'):
        if li_attr.xpath('./strong/text()').extract_first() == 'Meublé':
            item['furnished'] = 1

    floors = response.xpath(
        '//div[@class="im11_col_enr"]/dd/text()').extract()
    for text_floor in floors:
        if 'étage' in text_floor or 'Etage' in text_floor:
            numbers = re.findall(r'[\d]+', text_floor)
            # Stored under 'floor'; it used to be written to 'rent_buy' and
            # then immediately overwritten below.
            if numbers:
                item['floor'] = numbers[-1]

    item['rent_buy'] = 'rent' if 'location' in response.url else 'buy'
    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a rental ``RealestateItem`` from a fnaim.fr listing detail page.

    Core fields come from microdata and the breadcrumb. Charges, fees and
    deposit come from the non-description paragraphs. The remaining
    attributes come from the characteristics list. Optional fields are
    only set when found on the page.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://www.fnaim.fr/uploads/Image/6e/SIT_FNAIM_637_SIT_FNAIM_537_LOGOFNAIM-SSBASELINE-AGENCE.png'
    item['url'] = response.url
    item['description'] = ''

    title = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
    if title:
        item['title'] = title.strip()

    price = response.xpath(
        '//span[@itemprop="price"]/text()').extract_first()
    if price:
        item['price'] = price.replace(',', '.').replace(' ', '')

    images = response.xpath('//div[@id="carousel"]//img/@src').extract()
    if images:
        item['images'] = ','.join(images)

    # The fourth breadcrumb entry is read as "<city> <district>".
    # Shorter breadcrumbs used to raise IndexError.
    crumbs = response.xpath(
        '//p[@id="chemin"]/span//span[@itemprop="title"]/text()').extract()
    if len(crumbs) > 3:
        place = crumbs[3].split(' ')
        item['city'] = place[0]
        if len(place) > 1:
            district = ''.join(re.findall(r'[\d.,]+', place[1]))
            try:
                item['district'] = int(district)
            except ValueError:
                # No digits, or digits mixed with '.'/',': leave unset.
                pass

    area = response.xpath('//li[@class="picto surface"]/b/text()').re(
        r'[\d.,]+')
    if area:
        item['size'] = area[0].replace(',', '.')

    pieces = response.xpath('//li[@class="picto pieces"]/b/text()').re(
        r'[\d.,]+')
    if pieces:
        item['pieces'] = pieces[0]

    agency_name = response.xpath(
        '//div[@class="libelle"]/a/text()').extract_first()
    if agency_name:
        item['agency_name'] = agency_name.strip()

    agency_address = response.xpath(
        '//div[@class="coordonnees"]/p/text()').extract()
    if agency_address:
        item['agency_address'] = ' '.join(
            part.strip() for part in agency_address)

    item['agency_logo'] = response.xpath(
        '//a[@class="visuel"]/img/@src').extract_first()

    # Paragraphs next to the description carry the money details; all
    # numeric tokens of a matching paragraph are concatenated.
    money_fields = (
        ('provision pour charges', 'other_charges'),
        ('Honoraires charge locataire', 'agency_fee'),
        ('Dépôt de garantie', 'deposit'),
    )
    temps = response.xpath(
        '//div[@class="description"]/p[not(@itemprop="description")]/text()'
    ).extract()
    for t in temps:
        for needle, field in money_fields:
            if needle in t:
                amount = ''.join(re.findall(r'[\d.,]+', t))
                item[field] = amount.replace(',', '.')
                break

    descs = response.xpath(
        '//div[@class="description"]/p[@itemprop="description"]/text()'
    ).extract_first()
    if descs:
        item['description'] = descs.strip()

    characteristics = response.xpath(
        '//div[@class="caracteristique tab-left"]/ul/li')
    for td in characteristics:
        label = td.xpath('./label/text()').extract_first()
        if not label:
            continue
        if 'Type d’habitation' in label:
            type1 = td.xpath('./text()').extract_first()
            if type1:
                item['type'] = type1.strip()
        elif 'Surface habitable' in label:
            area = td.xpath('./text()').re(r'[\d.,]+')
            if area:
                item['size'] = area[0].replace(',', '.')
        elif 'Meublé' in label:
            furnished = td.xpath('./text()').extract_first()
            if furnished:
                item['furnished'] = 1 if furnished.strip() == 'Oui' else 0
        elif 'chambre' in label:
            rooms = td.xpath('./text()').re(r'[\d.,]+')
            if rooms:
                item['rooms'] = rooms[0]
        elif 'construction' in label:
            construction_year = td.xpath('./text()').re(r'[\d.,]+')
            if construction_year:
                item['construction_year'] = construction_year[0]
        elif 'Nombre d’étages:' in label:
            # Total number of floors is recognised but not stored.
            pass
        elif 'Étage' in label:
            floors = td.xpath('./text()').re(r'[\d.,]+')
            if floors:
                item['floor'] = floors[0]

    item['rent_buy'] = "rent"
    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a rental ``RealestateItem`` from a Figaro Immobilier detail page.

    Title and location come from the page heading, money fields from the
    ``js-complements-infos`` block, agency data from the agency blocks and
    the remaining attributes from the feature list. Optional fields are
    only set when found on the page.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://header-figaroimmobilier.figarocms.com/img/logo-figimmo.4f72456.svg'
    item['url'] = response.url
    item['description'] = ''

    title = response.xpath(
        '//div[@id="contenu"]/div/h1/text()').extract_first()
    if title:
        item['title'] = title.strip()

    images = response.xpath(
        '//div[@class="item js-img-popup"]/a/@href').extract()
    if images:
        item['images'] = ','.join(images)

    location = response.xpath(
        '//div[@id="contenu"]/div/h1/span/text()').extract_first()
    if location:
        locations = location.strip().split(' ')
        # The second word is the city, the optional third one an ordinal
        # district such as "11ème" or "1er". A single-word heading used to
        # raise IndexError.
        if len(locations) > 1:
            item['city'] = locations[1]
        if len(locations) > 2:
            district = locations[2].replace('me', '').replace(
                'è', '').replace('er', '')
            try:
                item['district'] = int(district)
            except ValueError:
                pass

    infos = '//div[@id="js-complements-infos"]'
    money_fields = (
        ('price', infos + '//span[@class="price"]/text()'),
        ('other_charges', infos + '//span[@class="charges"]/text()'),
        ('deposit',
         infos + '//span[@class="garantie"]/span[@class="value"]/text()'),
        ('agency_fee',
         infos + '//span[@class="honoraires"]/span[@class="value"]/text()'),
    )
    for field, path in money_fields:
        tokens = response.xpath(path).re(r'[\d.,]+')
        if tokens:
            item[field] = ''.join(tokens).replace(',', '.')

    agency_name = response.xpath(
        '//div[@class="container-agency-infos "]/span[@class="agency-name"]/text()'
    ).extract_first()
    if agency_name:
        item['agency_name'] = agency_name.strip()

    agency_blocks = response.xpath('//div[@class="agency-location"]')
    if agency_blocks:
        parts = [
            part.strip()
            for part in agency_blocks[0].xpath('./text()').extract()
        ]
        if parts:
            item['agency_address'] = ' '.join(parts)

    agency_logo = response.xpath(
        '//a[@class="agencyInformation"]/img/@src').extract_first()
    if agency_logo:
        item['agency_logo'] = agency_logo

    descs = response.xpath(
        '//p[@id="js-clicphone-description"]/text()').extract_first()
    if descs:
        item['description'] = descs.strip()

    secondary = response.xpath(
        '//div[@id="js-container-secondary-infos"]//ul[@class="unstyled flex"]/li/text()'
    ).extract()
    if len(secondary) > 1:
        item['type'] = secondary[1].strip()

    features = response.xpath(
        '//div[@class="container-features"]/ul[@class="list-features"]/li')
    for td in features:
        label = td.xpath('./text()').extract_first()
        if not label:
            continue
        numbers = td.xpath('./text()').re(r'[\d.,]+')
        if 'm²' in label:
            if numbers:
                item['size'] = numbers[0].replace(',', '.')
        elif 'pièce' in label:
            if numbers:
                item['pieces'] = numbers[0]
        elif 'chambre' in label:
            if numbers:
                item['rooms'] = numbers[0]
        elif 'salle de bain' in label or 'Toilettes:' in label:
            # Recognised but not stored. The bathroom branch used to assign
            # item['rooms'] from a stale (or unbound) bedroom value.
            pass
        elif 'étage' in label:
            if numbers:
                item['floor'] = numbers[0]

        # Only a feature that mentions "Meublé" decides the furnished flag;
        # before, every feature overwrote it.
        if 'Meublé' in label:
            item['furnished'] = 0 if 'Non' in label else 1

    item['rent_buy'] = "rent"
    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a rental ``RealestateItem`` from a spotahome.com listing page.

    City and district are first derived from the URL and title, then
    overridden by the address block when it is present. The size is taken
    from the cover button and overridden by the "Floor area:" fact when
    present. Optional fields are only set when found on the page.
    """
    page = response.text

    def fact(label):
        # Raw text between the last `label` and the following '</li>'.
        if label not in page:
            return ''
        return page.split(label)[-1].split('</li>')[0]

    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    # NOTE(review): the sibling callbacks store their logo under
    # 'website_logo'; this value looks like a logo too. Key kept as-is
    # because the item schema is not visible here -- confirm which is right.
    item['website_url'] = 'https://d26q4asbryw2nm.cloudfront.net/2390803/bundles/sahapp/favicon/largetile.png'
    item['url'] = response.url
    item['description'] = ''

    heading = response.xpath(
        '//div[@class="property-title-section"]/h1/text()')
    title = heading.extract_first()
    if title:
        item['title'] = title

    price = response.xpath(
        '//span[@class="rentable-unit-price"]/text()').re(r'[\d.,]+')
    if price:
        item['price'] = ''.join(price).replace(',', '.')

    # First path segment after the site root.
    item['city'] = response.url.split(
        'https://www.spotahome.com/')[-1].split('/')[0]

    district = heading.re(r'[\d.,]+')
    if district:
        try:
            item['district'] = int(district[-1].replace(',', ''))
        except ValueError:
            # Token was not a plain integer (e.g. contains '.').
            pass

    breadcrumb = response.xpath(
        '//section[@class="l-main-section l-property-main-section"]/div/div[@class="breadcrumb"]/span/text()'
    ).extract()
    if breadcrumb:
        item['address'] = breadcrumb[-1]

    address = response.xpath('//div[@itemprop="address"]/p/text()')
    addr = address.extract_first()
    if addr:
        item['city'] = addr.split(' ')[0]
    district = address.re(r'[\d.,]+')
    if len(district) > 1:
        item['district'] = district[-1]

    images = response.xpath('//meta[@itemprop="image"]/@content').extract()
    new_imgs = [img.strip() for img in images if img.strip()]
    if new_imgs:
        item['images'] = ','.join(new_imgs)

    area = response.xpath(
        '//div[@class="left-panel"]//div[@class="btn btn-default btn-rounded btn-top-cover-default bold btn-shadow"]/text()'
    ).re(r'[\d.,]+')
    if area:
        item['size'] = area[-1].replace(',', '.')

    furnished = response.xpath(
        '//div[@class="AvailableRoomFeatures"]/text()').extract_first()
    if furnished == 'Furnished':
        item['furnished'] = 1

    # The description is embedded in the page source as
    # "description":"<p>...</p>"; only the first such chunk is used.
    for chunk in page.split('"description":"'):
        if chunk[:3] == '<p>':
            body = chunk.split('"')[0].strip().replace('</p>', '')
            paragraphs = [p.strip() for p in body.split('<p>') if p.strip()]
            if paragraphs:
                item['description'] = '\n'.join(paragraphs)
            break

    type1 = fact('Property type:')
    if type1:
        item['type'] = type1.strip()

    area = re.findall(r'[\d.,]+', fact('Floor area:'))
    if area:
        item['size'] = area[0].replace(',', '.')

    floor = re.findall(r'[\d.,]+', fact('Floor:'))
    if floor:
        item['floor'] = floor[0]

    self.count += 1
    print("Total Count: " + str(self.count))
    item['rent_buy'] = 'rent'
    yield item
def final_parse(self, response):
    """Build a ``RealestateItem`` from a century21.fr listing detail page.

    City and district are taken from the " - "-separated page title.
    The other attributes come from the characteristics list, and the
    rent/buy flag comes from the breadcrumb. Optional fields are only set
    when found on the page.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://www.century21.fr/theme/generic/css/images/logo_century21-header.png'
    item['url'] = response.url
    item['description'] = ''

    title = response.xpath('//h1[@class="h1_page"]//text()').extract_first()
    if title:
        item['title'] = title

    price = response.xpath(
        '//div[@id="focusAnnonceV2"]/section/span[@class="yellow"]/b/text()'
    ).re(r'[\d.,]+')
    if price:
        item['price'] = ''.join(price).replace(',', '.')

    images = response.xpath(
        '//div[@class="zone-galerie"]/div/a//img/@src').extract()
    if images:
        item['images'] = ','.join(response.urljoin(img) for img in images)

    desc = response.xpath('//div[@class="desc-fr"]/p/text()').extract_first()
    if desc:
        item['description'] = desc.strip()

    # City and district are the third- and second-to-last title segments.
    # A missing title used to raise AttributeError, and titles with fewer
    # than three segments wrapped around to a negative index.
    parts = title.split(' - ') if title else []
    if len(parts) >= 3:
        item['city'] = parts[-3]
        try:
            item['district'] = int(parts[-2])
        except ValueError:
            pass

    item['furnished'] = 0
    characteristics = response.xpath(
        '//div[@class="col-gauche-slide"]/div/ul/li')
    for td in characteristics:
        label = ''.join(td.xpath('./span/text()').extract())
        if not label:
            continue
        if 'Location meublée' in label:
            item['furnished'] = 1
        elif 'Nombre de pièces' in label or 'Type d\'appartement' in label:
            # NOTE(review): the apartment type is stored under 'pieces' as
            # well, as in the original -- confirm this is intended.
            pieces = td.xpath('./text()').extract_first()
            if pieces:
                item['pieces'] = pieces.strip()
        elif 'Surface totale' in label:
            area = td.xpath('./text()').re(r'[\d.,]+')
            if area:
                item['size'] = area[0].replace(',', '.')
        elif 'Année construction' in label:
            construction_year = td.xpath('./text()').re(r'[\d.,]+')
            if construction_year:
                item['construction_year'] = construction_year[0]
        elif 'Honoraires charge locataire' in label:
            agency_fee = td.xpath('./text()').re(r'[\d.,]+')
            if agency_fee:
                item['agency_fee'] = agency_fee[0].replace(',', '.')
        elif 'Dépôt de garantie' in label:
            deposit = td.xpath('./text()').re(r'[\d.,]+')
            if deposit:
                item['deposit'] = deposit[0].replace(',', '.')
        elif 'Détail du loyer' in label:
            other_charges = td.xpath('./ul/li/text()').re(r'[\d.,]+')
            if other_charges:
                item['other_charges'] = other_charges[0].replace(',', '.')

    # Breadcrumb entries are matched as whole strings, not substrings.
    tt = response.xpath(
        '//div[@id="filAriane"]//span[@itemprop="title"]/text()').extract()
    if 'Location Appartement' in tt:
        item['type'] = 'Appartement'
    item['rent_buy'] = 'rent' if 'Location' in tt else 'buy'

    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a ``RealestateItem`` from an avendrealouer.fr listing detail page.

    Type, city and district are derived from the words of the page title.
    The remaining attributes come from the characteristics table, the
    pricing list and the agency block. Optional fields are only set when
    found on the page.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['url'] = response.url
    item['description'] = ''

    title = response.xpath(
        '//div[@class="fd-title"]/h1/span[@class="mainh1"]/text()'
    ).extract_first()
    if title:
        title = title.strip()
        item['title'] = title
        words = title.split(' ')
        # Every title-derived field is guarded: a missing or short title
        # used to raise, or to wrap around to a negative index.
        if len(words) > 1:
            item['type'] = words[1]
        # District: last number found after the last "Paris" in the title.
        numbers = re.findall(r'[\d]+', title.split('Paris')[-1])
        if numbers:
            item['district'] = int(numbers[-1])
        if len(words) >= 4:
            item['city'] = words[-4]

    images = response.xpath(
        '//div[@id="bxSliderContainer"]//img[contains(@id, "media")]/@src'
    ).extract()
    if images:
        item['images'] = ','.join(images)

    price = response.xpath('//span[@id="fd-price-val"]/text()').re(
        r'[\d.,]+')
    if price:
        item['price'] = ''.join(price)

    descs = response.xpath(
        '//div[@id="propertyDesc"]/text()').extract_first()
    if descs:
        item['description'] = descs.strip()

    item['furnished'] = 0
    cells = response.xpath(
        '//div[@class="property-description-characteristics"]/table//td')
    for td in cells:
        labels = td.xpath('./span/text()').extract()
        if len(labels) < 2:
            continue
        label = labels[0]
        value = td.xpath('./span[2]/text()')
        if 'Surface:' in label:
            area = value.re(r'[\d.,]+')
            if area:
                item['size'] = area[0]
        elif 'Pièce(s):' in label:
            pieces = value.re(r'[\d.,]+')
            if pieces:
                item['pieces'] = pieces[0]
        elif 'Chambre(s):' in label:
            rooms = value.re(r'[\d.,]+')
            if rooms:
                item['rooms'] = rooms[0]
        elif 'Salle(s)' in label or 'Toilettes:' in label:
            # Bathroom and toilet counts are recognised but not stored.
            pass
        elif 'Nombre d\'étages:' in label:
            # NOTE(review): the label reads as a total floor count, yet it
            # is stored under 'floor' as in the original -- confirm.
            floors = value.re(r'[\d.,]+')
            if floors:
                item['floor'] = floors[0]
        elif 'Construit en:' in label:
            construction_year = value.extract_first()
            if construction_year:
                item['construction_year'] = construction_year
        elif 'Meublé' == label:
            item['furnished'] = 1

    pricing_spans = response.xpath('//div[@class="pricing-data"]/ul/li/span')
    for span in pricing_spans:
        text = span.xpath('./text()').extract_first()
        if not text:
            continue
        if 'Loyer mensuel:' in text:
            # Monthly rent is recognised but not stored separately.
            pass
        elif 'Charges mensuelles:' in text:
            other_charges = span.xpath('./text()').re(r'[\d.,]+')
            if other_charges:
                item['other_charges'] = ''.join(other_charges)
        elif 'Honoraires à la charge du locataire:' in text:
            # Only the part before the first '(' is scanned for numbers.
            agency_fee = re.findall(r'[\d.,]+', text.split('(')[0])
            if agency_fee:
                item['agency_fee'] = ''.join(agency_fee)
        elif 'Dépôt de garantie:' in text:
            deposit = re.findall(r'[\d.,]+', text.split('(')[0])
            if deposit:
                item['deposit'] = ''.join(deposit)

    agency_name = response.xpath(
        '//div[@class="agency-title"]/span/@title').extract_first()
    if agency_name:
        item['agency_name'] = agency_name

    agency_address = response.xpath(
        '//div[@class="agency-address"]/span/text()').extract_first()
    if agency_address:
        item['agency_address'] = agency_address

    agency_logo = response.xpath(
        '//div[@class="agency-logo"]/img/@src').extract_first()
    if agency_logo:
        item['agency_logo'] = agency_logo

    item['rent_buy'] = "rent" if 'location' in str(response.url) else "buy"
    self.count += 1
    print("Total Count: " + str(self.count))
    item['website_logo'] = 'https://www.avendrealouer.fr/Content/Default/Images/57x57-logoAVAL.png'
    yield item
def final_parse(self, response):
    """Build a RealestateItem from a pap.fr listing detail page.

    Reads title, price, listing type, characteristics (pieces, size,
    rooms), city, district, images and description from ``response`` and
    yields a single populated item. Fields whose markup is missing are
    simply left unset. Increments ``self.count`` and prints the running
    total as a side effect.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://www.pap.fr/images/logos/logo-pap.png'
    item['url'] = response.url
    item['description'] = ''

    title = response.xpath(
        '//h1[@class="item-title"]/span[@class="h1"]/text()'
    ).extract_first()
    if title:
        item['title'] = title

    price = response.xpath(
        '//h1[@class="item-title"]/span[@class="item-price"]/text()'
    ).re(r'[\d.,]+')
    if price:
        # Dots are dropped from the joined number fragments
        # (presumably thousands separators -- TODO confirm).
        item['price'] = ''.join(price).replace('.', '')

    breadcrumb = response.xpath(
        '//a[@itemprop="item"]/span[@itemprop="name"]/text()'
    ).extract_first()
    # The breadcrumb may be absent; previously this raised AttributeError.
    if breadcrumb:
        words = breadcrumb.split(' ')
        if words[0] == 'Location':
            item['rent_buy'] = 'rent'
        listing_type = words[-1].strip()
        if listing_type:
            item['type'] = listing_type

    for tag in response.xpath('//ul[@class="item-tags"]/li/strong'):
        label = tag.xpath('./text()').extract_first()
        if not label:
            continue
        numbers = tag.xpath('./text()').re(r'[\d.,]+')
        if not numbers:
            continue
        if 'pièce' in label:
            item['pieces'] = numbers[0]
        elif 'm²' in label:
            item['size'] = numbers[0].replace(',', '.')
        elif 'chambre' in label:
            item['rooms'] = numbers[0]

    # The same heading carries both the city name and the district number.
    heading = response.xpath(
        '//div[@class="item-description margin-bottom-30"]/h2/text()'
    )
    addr = heading.extract_first()
    if addr:
        item['city'] = addr.strip().split(' ')[0]
    district = heading.re(r'[\d.,]+')
    if district:
        try:
            item['district'] = int(district[-1])
        except ValueError:
            # The last numeric fragment contained '.' or ',' -- not a district.
            pass

    images = response.xpath(
        '//div[@class="owl-thumbs"]/a/img/@src').extract()
    if images:
        item['images'] = ','.join(images)

    paragraphs = response.xpath(
        '//div[@class="margin-bottom-30"]/p/text()').extract()
    lines = [p.strip() for p in paragraphs if p.strip()]
    if lines:
        item['description'] = '\n'.join(lines)

    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a RealestateItem from a flatlooker.com listing detail page.

    Price, city and district are taken from the ' - '-separated page
    <title>; the remaining fields come from the page body. Missing markup
    leaves the corresponding field unset instead of raising. Increments
    ``self.count`` and prints the running total as a side effect.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['url'] = response.url
    item['description'] = ''

    title = response.xpath(
        '//div[@class="col-md-8 hidden-xs hidden-sm"]/h3/text()'
    ).extract_first()
    if title:
        item['title'] = title

    head_title = response.xpath('head/title/text()').extract_first()
    if head_title:
        parts = head_title.split(' - ')
        try:
            price = ''.join(re.findall(r'[\d.,]+', parts[-1]))
            item['price'] = price.replace(',', '.')
            # Indexing relative to len() is kept as in the original so that
            # short titles behave the same (fields set before a failure stay).
            item['city'] = parts[len(parts) - 2]
            item['district'] = int(parts[len(parts) - 3])
        except (IndexError, ValueError):
            pass

    url_type = response.url.replace('https://www.flatlooker.com/',
                                    '').split('/')[0]
    if url_type:
        item['type'] = 'appartement' if 'appartement' in url_type else url_type

    images = response.xpath(
        'body/div[@class="container-fluid"]/img/@src').extract()
    if images:
        item['images'] = ','.join(images)

    addr = response.xpath(
        '//div[@class="left-panel"]/h4[@class="orange bold"]/text()'
    ).extract_first()
    if addr:
        item['address'] = addr.strip()

    area = response.xpath(
        '//div[@class="left-panel"]//div[@class="btn btn-default btn-rounded btn-top-cover-default bold btn-shadow"]/text()'
    ).re(r'[\d.,]+')
    if area:
        # First number is stored as the piece count, last one as the surface.
        item['pieces'] = area[0]
        item['size'] = area[-1].replace(',', '.')

    furnished = response.xpath(
        '//div[@class="left-panel"]/div[@class="flex-vcenter"]/p/text()'
    ).extract_first()
    # Previously a missing node raised TypeError on the ``in`` test.
    if furnished is not None:
        item['furnished'] = 0 if 'Non' in furnished else 1

    descs = response.xpath(
        '//div[@id="annonce"]/div[2]//div[@class="block-with-text"]//text()'
    ).extract()
    lines = [d.strip() for d in descs if d.strip()]
    if lines:
        item['description'] = '\n'.join(lines)

    other_charges = response.xpath(
        '//span[@id="valueChargeRentProperty"]/text()').re(r'[\d.,]+')
    if other_charges:
        item['other_charges'] = ''.join(other_charges).replace(',', '.')

    for row in response.xpath('//table[@id="table-essentials"]//tr'):
        cells = row.xpath('./td/text()').extract()
        if '\nDépôt de garantie\n' in cells:
            field = 'deposit'
        elif '\nHonoraires de location\n' in cells:
            field = 'agency_fee'
        else:
            continue
        # The amount is expected in the second text cell; skip short rows.
        if len(cells) > 1 and cells[1].strip():
            item[field] = ''.join(re.findall(r'[\d.,]+', cells[1].strip()))

    cells = response.xpath(
        '//table[@id="table-mesure"]/tbody//td/text()').extract()
    for i, cell in enumerate(cells):
        # The floor value is the cell following the 'Étage' label.
        if 'Étage' not in cell or i + 1 >= len(cells):
            continue
        digits = re.findall(r'[\d.,]+', cells[i + 1].strip())
        if digits:
            item['floor'] = digits[0]

    if title and 'Location' in title:
        item['rent_buy'] = 'rent'

    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a RealestateItem from a fr.foncia.com listing detail page.

    City is hard-coded to 'Paris'; the listing type is the third-from-last
    path segment of the URL. Missing markup leaves the corresponding field
    unset instead of raising. Increments ``self.count`` and prints the
    running total as a side effect.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    # NOTE(review): '[email protected]' looks like an e-mail obfuscation
    # artifact in the logo file name -- confirm the real URL.
    item[
        'website_logo'] = 'https://fr.foncia.com/bundles/fonciainternet/images/logos/[email protected]'
    item['url'] = response.url
    item['description'] = response.xpath(
        '//div[@class="OfferDetails-content"]/p/text()').extract_first()

    page_title = response.xpath('//title/text()').extract_first()
    if page_title:
        item['title'] = page_title.replace('- Foncia', '').strip()

    price = response.xpath('//p[@class="OfferTop-price"]/text()').re(
        r'[\d.,]+')
    if price:
        item['price'] = ''.join(price).replace(',', '.')

    # The original overwrote a breadcrumb list with this URL segment and
    # then indexed it with [1], which stored a single character. Store the
    # whole segment instead.
    # NOTE(review): confirm the URL segment (not the breadcrumb) is intended.
    item['type'] = response.url.split('/')[-3]
    item['city'] = 'Paris'
    item['district'] = response.xpath(
        '//p[@class="OfferTop-loc"]/@data-gtm-zipcode').extract_first()

    images = response.xpath(
        '//ul[@class="OfferSlider-main-slides"]//img/@src').extract()
    item['images'] = ','.join(images)

    for li in response.xpath('//*[@class="List List--data"]/li'):
        key = ''.join(li.xpath('./span//text()').extract())
        if 'Honoraires charge' in key:
            item['agency_fee'] = ''.join(
                li.xpath('./strong/text()').re(r'[\d.,]+'))
        elif 'Dépôt de garantie' in key:
            item['deposit'] = ''.join(
                li.xpath('./strong/text()').re(r'[\d.,]+'))
        elif 'Année de construction' in key:
            item['construction_year'] = li.xpath(
                './strong/text()').extract_first()

    attrs = response.xpath(
        '//div[@class="MiniData-row MiniData-row--bg"]/p/text()').extract()
    for txt_attr in attrs:
        # endswith() is safe on empty strings, unlike txt_attr[-1].
        if txt_attr.endswith('m'):
            field, pattern = 'size', r'[\d.,]+'
        elif 'pièce' in txt_attr:
            field, pattern = 'pieces', r'[\d]+'
        elif 'chambre' in txt_attr:
            field, pattern = 'rooms', r'[\d]+'
        else:
            continue
        found = re.findall(pattern, txt_attr)
        if found:
            item[field] = found[0]

    agency_name = response.xpath(
        '//p[@class="OfferContact-address OfferContact-address--center rwd--noMobile rwd--noTablet"]/a/strong/text()'
    ).extract_first()
    if agency_name:
        item['agency_name'] = agency_name.strip().replace('\n', ' ')

    agency_address = ''.join(
        response.xpath(
            '//p[@class="OfferContact-address OfferContact-address--center rwd--noMobile rwd--noTablet"]/a/text()'
        ).extract()).replace(' ', '').strip().replace('\n', ' ')
    if agency_address:
        item['agency_address'] = agency_address.strip()

    address = response.xpath(
        '//p[@data-behat="adresseBien"]/text()').extract_first()
    if address:
        item['address'] = address.strip().replace(' ', '').replace('\n', ' ')

    item['rent_buy'] = 'rent' if 'location' in response.url else 'buy'

    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a RealestateItem from a classified-ad detail page.

    Fields are located through ``data-qa-id`` attributes. The publication
    date is parsed from the numbers found in the ``adview_date`` node, read
    as day, month, year and optionally hour and minute. Missing markup
    leaves the corresponding field unset. Increments ``self.count`` and
    prints the running total as a side effect.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['description'] = ''

    title = response.xpath('//h1[@class="_1KQme"]/text()').extract_first()
    if title:
        item['title'] = title.strip()

    images = response.xpath(
        '//div[@data-qa-id="adview_gallery_container"]//img/@src'
    ).extract_first()
    if images:
        item['images'] = images

    if 'colocations' in response.url:
        item['type'] = 'Appartement'
    else:
        item['type'] = response.xpath(
            '//div[@data-qa-id="adview_price"]/span/text()').extract_first()
    item['url'] = response.url

    price = response.xpath(
        '//div[@data-qa-id="adview_price"]/div/span/text()').extract_first()
    if price:
        try:
            item['price'] = float(price.replace(' ', ''))
        except ValueError:
            pass

    date_parts = [
        int(part) for part in response.xpath(
            '//div[@data-qa-id="adview_date"]/text()').re(r'\d+')
    ]
    # The original tested len > 3 before reading index 4 and len > 0 before
    # reading index 2; require the number of parts actually consumed.
    if len(date_parts) >= 5:
        day, month, year, hour, minute = date_parts[:5]
        item["date_added"] = datetime.datetime(year, month, day, hour,
                                               minute)
    elif len(date_parts) >= 3:
        day, month, year = date_parts[:3]
        item["date_added"] = datetime.datetime(year, month, day)

    type1 = response.xpath(
        '//div[@data-qa-id="criteria_item_real_estate_type"]/div/div[@class="_3Jxf3"]/text()'
    ).extract_first()
    if type1:
        # A dedicated criteria entry overrides the type guessed above.
        item['type'] = type1

    area = response.xpath(
        '//div[@data-qa-id="criteria_item_square"]/div/div[@class="_3Jxf3"]/text()'
    ).re(r'[\d.,]+')
    if area:
        item['size'] = area[0]

    rooms_count = response.xpath(
        '//div[@data-qa-id="criteria_item_rooms"]/div/div[@class="_3Jxf3"]/text()'
    ).re(r'[\d.,]+')
    if rooms_count:
        item['pieces'] = rooms_count[0]

    furnished = response.xpath(
        '//div[@data-qa-id="criteria_item_furnished"]/div/div[@class="_3Jxf3"]/text()'
    ).extract_first()
    if furnished:
        item['furnished'] = 0 if 'Non' in furnished else 1

    descs = response.xpath(
        '//div[@data-qa-id="adview_description_container"]/div/span[@class="_2wB1z"]/text()'
    ).extract()
    if descs:
        item['description'] = '\n'.join(descs)

    locations = response.xpath(
        '//div[@data-qa-id="adview_location_informations"]/span/text()'
    ).extract()
    if locations:
        item['city'] = locations[0]
        if len(locations) > 1:
            try:
                item['district'] = int(locations[-1])
            except ValueError:
                pass

    item['rent_buy'] = "rent" if 'location' in str(response.url) else "buy"

    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a RealestateItem from an orpi.com listing detail page.

    Every listing is marked as a rental (``rent_buy`` is always 'rent').
    Missing markup leaves the corresponding field unset. Increments
    ``self.count`` and prints the running total as a side effect.
    """
    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://www.orpi.com/mstile-310x310.png'
    item['url'] = response.url
    item['description'] = ''

    title = response.xpath(
        '//div[@class="synopsis-textcell"]/h1/span//text()').extract()
    if title:
        item['title'] = ''.join(title)

    price = response.xpath('//span[@class="price"]/text()').re(r'[\d.,]+')
    if price:
        item['price'] = ''.join(price).replace(',', '.')

    type1 = response.xpath(
        '//span[@class="c-vignette__type"]/text()').extract_first()
    if type1:
        type1 = type1.split(' ')[0]
        if type1:
            item['type'] = type1

    addr = response.xpath(
        '//span[@class="c-vignette__address"]/text()').extract_first()
    if addr:
        addr = addr.split(' ')
        item['city'] = addr[0]
        if len(addr) > 1:
            try:
                item['district'] = int(addr[1])
            except ValueError:
                pass

    images = response.xpath(
        '//ul[@class="estate-carousel-nav-dots show-for-large js-estate-carousel-nav"]/li/img/@src'
    ).extract()
    if images:
        item['images'] = ','.join(images)

    desc = response.xpath(
        '//div[@class="estateNeighborhood gutters brd-rg estateDescription"]//div[@class="paragraphs-textcell"]/p/text()'
    ).extract_first()
    if desc:
        item['description'] = desc

    # When present, the <address> element overrides city/district set above
    # (the district is stored here as text, not int, as in the original).
    address = response.xpath(
        '//address[@class="address"]/text()').extract_first()
    if address:
        parts = address.split(' ')
        item['city'] = parts[0]
        if len(parts) > 1:
            item['district'] = parts[1]

    for li in response.xpath(
            '//ul[@class="dotted-list dotted-list--ocom"]/li'):
        label = li.xpath('./mark[1]/text()').extract_first()
        if not label:
            continue
        value = li.xpath('./mark[2]/text()')
        if 'Nombre de pièce(s)' in label:
            pieces = value.extract_first()
            if pieces:
                item['pieces'] = pieces.strip()
        elif 'Surface' in label:
            area = value.re(r'[\d.,]+')
            if area:
                item['size'] = area[0].replace(',', '.')
        elif 'Nombre de chambre(s)' in label:
            rooms = value.re(r'[\d.,]+')
            if rooms:
                item['rooms'] = rooms[0]
        elif 'Année de construction' in label:
            construction_year = value.extract_first()
            if construction_year:
                item['construction_year'] = construction_year
        elif "Nombre d'étages de l'immeuble" in label:
            total_floors = value.extract_first()
            if total_floors:
                # NOTE(review): the building's floor count is stored under
                # 'toilettes'; kept as-is -- confirm the intended field.
                item['toilettes'] = total_floors
        elif 'Nombre de salle(s) de bain/d’eau' in label:
            bath_rooms = value.extract_first()
            if bath_rooms:
                item['bath_rooms'] = bath_rooms
        elif 'Étage' in label:
            # The original took floors[0] on a *string*, keeping only the
            # first character (e.g. '12' -> '1'). Keep the full number.
            floors = value.re(r'\d+')
            if floors:
                item['floor'] = floors[0]

    # (label fragment, item field) pairs, tested in this order.
    fee_fields = (
        ('Dépôt', 'deposit'),
        ('Honoraires TTC à la charge du locataire', 'agency_fee'),
        ("d'honoraires d'état des lieux", 'other_agency_fee'),
        ('Provisions pour charges', 'other_charges'),
    )
    for li in response.xpath(
            '//div[@class="onusBlock onusBlock--ocom"]/ul/li'):
        text = li.xpath('./text()').extract_first()
        if not text:
            continue
        for marker, field in fee_fields:
            if marker in text:
                amount = li.xpath('./text()').re(r'[\d.,]+')
                if amount:
                    item[field] = ''.join(amount).replace(',', '.')
                break

    item['rent_buy'] = "rent"
    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Build a RealestateItem from an ommi.fr listing detail page.

    Most fields are scraped from the inline JavaScript assignments found
    between ``var __property = {};`` and ``__property.serverUrl`` in the
    page body. The title is then overridden by the page <title> when one
    exists. Increments ``self.count`` and prints it as a side effect.
    """
    item = RealestateItem()
    # str() is applied to the raw body; the backslash clean-up below
    # presumably targets the escapes of a bytes repr -- TODO confirm.
    properties = str(response.body).split('var __property = {};')[-1].split(
        '__property.serverUrl')[0].split(';')

    main_image = ''
    for prop in properties:
        value = prop.split('=')[-1]
        cleaned = value.replace('\\', '').replace(';', '').strip()
        if 'property.title' in prop:
            item['title'] = cleaned
        elif 'property.rooms' in prop:
            item['pieces'] = value.replace(';', '').strip()
        elif 'property.description' in prop:
            # Drop the trailing quote, undo double escaping, then decode
            # the remaining escape sequences into real characters.
            item['description'] = value.strip()[:-1].replace(
                '\\\\', "\\").replace('"', '').encode('utf-8').decode(
                    'unicode-escape')
        elif 'property.size' in prop:
            item['size'] = cleaned
        elif 'property.loyer' in prop:
            item['price'] = cleaned
        elif 'property.charges' in prop:
            item['other_charges'] = cleaned
        elif 'property.main_photo' in prop:
            main_image = cleaned
        elif 'property.photos_all' in prop:
            datas = cleaned.replace(']', '').replace('[', '').replace('"', '')
            # The image prefix is derived from the main photo's directory,
            # with '/' turned into '_'; relies on main_photo appearing first.
            prefix = 'https://www.ommi.fr/image/by/w/900/h/900/i/' + main_image.replace(
                main_image.split('/')[-1], '').replace('/', '_').replace(
                    '"', '')
            item['images'] = ','.join(
                prefix + img for img in datas.split(','))
        elif 'property.short_address' in prop:
            address = cleaned
            if address and 'Paris' in address:
                item['city'] = 'Paris'
                try:
                    # Second word minus ordinal suffixes ('er' / 'e').
                    item['district'] = int(
                        address.split(' ')[1].replace('er', '').replace(
                            'e', '').replace('"', ''))
                except (IndexError, ValueError):
                    try:
                        # Fall back to the first number in the address.
                        item['district'] = int(
                            re.findall(r'[\d]+', address)[0])
                    except (IndexError, ValueError):
                        pass

    item['url'] = response.url
    title = response.xpath('//title/text()').extract_first()
    if title:
        item['title'] = title.split('-')[0].strip()

    item['type'] = response.url.split('/')[-1].split('-')[0]
    item['rent_buy'] = "buy" if 'achat' in str(response.url) else "rent"
    item['online'] = 1
    item['website'] = 'ommi'
    self.count += 1
    print(self.count)
    yield item
def final_parse(self, response):
    """Build a RealestateItem from a lesiteimmo.com listing detail page.

    City is hard-coded to 'Paris' and ``rent_buy`` is always 'rent' (the
    original computed it from the URL and then unconditionally overwrote
    it). Missing markup leaves the corresponding field unset instead of
    raising. Increments ``self.count`` and prints the running total as a
    side effect.
    """

    def first_amount(text):
        """Return the first digits/separators run of *text* without spaces, or None."""
        found = re.findall(r'[\d.,\s]+', text)
        return found[0].replace(' ', '') if found else None

    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://www.lesiteimmo.com/images/logo.svg?id=409b2bdfb76416b8f554'
    item['url'] = response.url

    description = response.xpath(
        '//div[@itemprop="description"]/text()').extract_first()
    item['description'] = description.strip() if description else ''
    item['title'] = response.xpath('//title/text()').extract_first()

    price = response.xpath(
        '//span[@class="text-xl font-medium"]/span[@class="value"]/text()'
    ).re(r'[\d.,]+')
    if price:
        item['price'] = ''.join(price).replace(',', '.')

    heading = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
    if heading:
        item['type'] = heading.strip().split(' ')[0]
    item['city'] = 'Paris'
    item['district'] = response.url.split('/')[-2].split('-')[-1]

    styles = response.xpath(
        '//div[contains(@class,"bg-cover h-64 lg:h-128 w-full")]/@style'
    ).extract()
    # Each style attribute embeds the picture as url('...').
    item['images'] = ','.join(
        style.split("url('")[-1].split("')")[0] for style in styles)

    agency_name = response.xpath(
        '//div[@class="font-medium text-grey-darkest"]/text()'
    ).extract_first()
    if agency_name:
        item['agency_name'] = agency_name.strip().replace('\n', ' ')

    agency_address = response.xpath(
        '//div[@class="text-grey"]/text()').extract_first()
    if agency_address:
        item['agency_address'] = agency_address.strip()

    agency_logo = response.xpath(
        '//div[@class="mb-2"]/img/@src').extract_first()
    if agency_logo:
        item['agency_logo'] = agency_logo

    attr_tags = response.xpath(
        '//div[@class="p-4 flex flex-wrap justify-start items-start"]//div[@class="flex w-full p-2 bg-grey-lightest"]'
    )
    for div in attr_tags:
        key = div.xpath(
            './div[@class="w-2/3 text-grey-darker mr-2"]/text()'
        ).extract_first()
        val = div.xpath(
            './div[@class="w-1/3 text-grey text-right"]/text()'
        ).extract_first()
        # Rows lacking a label or a value previously raised; skip them.
        if not key or val is None:
            continue
        if 'Étage' in key:
            digits = re.findall(r'[\d]+', val)
            if digits:
                item['floor'] = digits[0]
        elif 'Nbre. de chambres' in key:
            item['rooms'] = val.strip()
        elif 'Adresse' == key:
            item['address'] = val.strip()
        elif 'Nb. de pièces' in key:
            item['pieces'] = val.strip()
        elif 'Charges' == key:
            amount = first_amount(val)
            if amount is not None:
                item['other_charges'] = amount
        elif 'Dépôt de garantie' in key:
            amount = first_amount(val)
            if amount is not None:
                item['deposit'] = amount
        elif 'Honoraires' == key:
            amount = first_amount(val)
            if amount is not None:
                item['agency_fee'] = amount
        elif 'Année de construction' in key:
            item['construction_year'] = val.strip()
        elif 'Surface habitable' in key:
            amount = first_amount(val)
            if amount is not None:
                item['size'] = amount

    # NOTE(review): the URL-based rent/buy detection was always overwritten
    # with 'rent'; that effective behaviour is kept -- confirm intent.
    item['rent_buy'] = 'rent'

    self.count += 1
    print("Total Count: " + str(self.count))
    yield item
def final_parse(self, response):
    """Populate and yield one RealestateItem from a listing detail page.

    Scalar fields are read with page-level xpath queries; the
    characteristics list (``ul[@itemprop="description"]``) supplies floor,
    bathroom and furnished information. ``rent_buy`` is only set when the
    current navigation entry reads 'Location'. Increments ``self.count``
    and prints the running total as a side effect.
    """

    def first_text(query):
        # First matching text node, or None.
        return response.xpath(query).extract_first()

    def numbers(query, pattern=r'[\d.,]+'):
        # Every regex match over the selected text nodes.
        return response.xpath(query).re(pattern)

    listing = RealestateItem()
    listing['online'] = 1
    listing['website'] = self.name
    listing['url'] = response.url
    listing['description'] = ''

    heading = first_text('//table[@class="licom-breadcrumb"]//td/h1/text()')
    if heading:
        listing['title'] = heading

    price_parts = numbers('//div[@itemprop="price"]/h2/text()')
    if price_parts:
        try:
            listing['price'] = ''.join(price_parts).replace(',', '.')
        except:
            pass

    kind = first_text('//div[@class="col-xs-3 offer-type"]/p/text()')
    kind = kind.strip() if kind else kind
    if kind:
        listing['type'] = kind

    address_text = first_text('//div[@itemprop="address"]/p/text()')
    if address_text:
        listing['city'] = address_text.split(' ')[0]

    address_digits = numbers('//div[@itemprop="address"]/p/text()', r'[\d]+')
    # A district is only taken when the address holds at least two numbers.
    if len(address_digits) > 1:
        try:
            listing['district'] = int(address_digits[-1])
        except:
            pass

    picture_urls = response.xpath(
        '//div[@class="carousel-content noSlider"]//img/@src').extract()
    if picture_urls:
        listing['images'] = ','.join(picture_urls)

    label_fee = numbers('//span[@class="lbl-agencyfees"]/text()')
    if label_fee:
        listing['agency_fee'] = ''.join(label_fee).replace(',', '.')

    summary = first_text(
        '//div[@class="offer-description-text"]/meta/@content')
    if summary:
        listing['description'] = summary

    surface = numbers('//span[@class="offer-area-number"]/text()')
    if surface:
        listing['size'] = surface[0].replace(',', '.')

    piece_count = numbers('//span[@class="offer-rooms-number"]/text()')
    if piece_count:
        listing['pieces'] = piece_count[0]

    charges = numbers('//span[@id="valueChargeRentProperty"]/text()')
    if charges:
        listing['other_charges'] = ''.join(charges).replace(',', '.')

    guarantee = numbers('//span[@id="valueDepotRentGarantee"]/text()')
    if guarantee:
        listing['deposit'] = ''.join(guarantee).replace(',', '.')

    # When present, this value replaces the fee read from the label above.
    rent_fee = numbers('//span[@id="valueFeesRentAgency"]/text()')
    if rent_fee:
        listing['agency_fee'] = ''.join(rent_fee).replace(',', '.')

    seller = first_text('//span[@itemprop="seller"]/text()')
    if seller:
        listing['agency_name'] = seller

    seller_lines = response.xpath(
        '//p[@class="agency-infos size_12 nomargin"]//span/text()').extract()
    if seller_lines:
        listing['agency_address'] = '\n'.join(seller_lines)

    for entry in response.xpath('//ul[@itemprop="description"]/li'):
        label = entry.xpath('./div[1]/text()').extract_first()
        if not label:
            continue
        value = entry.xpath('./div[2]/text()').extract_first()
        if 'Nombre d\'étages de l\'immeuble' in label:
            # The building's floor count is written to the 'toilettes' field.
            if value:
                listing['toilettes'] = value
        elif 'Nombre de salle de bain' in label:
            if value:
                listing['bath_rooms'] = value
        elif 'Etage du bien' in label:
            if value:
                listing['floor'] = value.replace('e', '')
        elif 'Meublé' in label:
            if value == 'Oui':
                listing['furnished'] = 1

    current_tab = first_text('//li[@class="columns current"]/a/span/text()')
    if current_tab == 'Location':
        listing['rent_buy'] = 'rent'

    self.count += 1
    print("Total Count: " + str(self.count))
    yield listing
def parse(self, response):
    """Parse one page of bienici.com search results (JSON).

    For every ad in ``realEstateAds`` a RealestateItem is pre-filled and a
    request for the ad's detail JSON is yielded, with the item passed in
    ``meta['item']`` to ``self.final_parse``. If pre-filling an ad fails,
    the error is printed, ``self.count`` is incremented and the partially
    filled item is yielded directly. Finally the next results page is
    requested while ``from`` is below ``total``.
    """
    json_data = json.loads(response.body)
    for data in json_data['realEstateAds']:
        # Created outside the try so the except branch always has an item.
        item = RealestateItem()
        try:
            item['online'] = 1
            item['website'] = self.name
            item['website_logo'] = 'https://www.bienici.com/cacheForever/45ee97a38fe6a64ae66' \
                                   'c7a2310cf2192ec35f538/logos/logo_bienici.svg'
            city_slug = data['city'].strip().replace(' ', '-')
            if 'roomsQuantity' in data:
                item['url'] = 'https://www.bienici.com/annonce/location/{}/appartement/{}pie' \
                              'ces/{}?q=%2Frecherche%2Flocation%2Fparis-75000'\
                    .format(city_slug, data['roomsQuantity'], data['id'])
                item['pieces'] = data['roomsQuantity']
                item['rooms'] = data['roomsQuantity']
            else:
                # The original passed data['roomsQuantity'] here too, which
                # always raised KeyError in this branch.
                item['url'] = 'https://www.bienici.com/annonce/location/{}/appartement/{}?q=%' \
                              '2Frecherche%2Flocation%2Fparis-75000'\
                    .format(city_slug, data['id'])
            item['description'] = data['description']
            item['title'] = data['title']
            if 'surfaceArea' in data:
                item['size'] = data['surfaceArea']
            if item['title'] == "":
                # Read from ``data`` so a missing room count or surface
                # does not raise while building the fallback title.
                item['title'] = "Appartement {} pièces {} m²".format(
                    data.get('roomsQuantity', ''),
                    data.get('surfaceArea', ''))
            item['rent_buy'] = data['adType']
            item['city'] = data['city'].strip().split(' ')[0]
            item['district'] = data['postalCode']
            item['price'] = data['price']
            if 'floor' in data:
                item['floor'] = data['floor']
            if 'agencyRentalFee' in data:
                item['agency_fee'] = data['agencyRentalFee']
            if 'safetyDeposit' in data:
                item['deposit'] = data['safetyDeposit']
            if 'isFurnished' in data:
                # NOTE(review): set to 1 whenever the key exists, whatever
                # its value -- confirm against the API payload.
                item['furnished'] = 1
            if 'yearOfConstruction' in data:
                item['construction_year'] = data['yearOfConstruction']
            item['images'] = ','.join(img['url'] for img in data['photos'])
            yield Request(
                'https://www.bienici.com/realEstateAd.json?id={}&access_token=2lWi9yZU%2FR%'
                '2FuoEAybaCQI7Q0CMe3RD5aquaK7rLs63Y%3D%3A5b543410ac93c7009bfa3572'
                .format(data['id']),
                self.final_parse,
                meta={'item': item})
        except Exception as e:
            # str(e) is safe for any exception; e.args[0] may be missing
            # or not a string.
            print("err: " + str(e))
            self.count += 1
            print("Total Count: " + str(self.count))
            yield item

    total = int(json_data['total'])
    current = int(json_data['from'])
    if current < total:
        next_from = current + 24
        page = int((current / 24) + 1)
        next_page_url = 'https://www.bienici.com/realEstateAds.json?filters=%7B%22size%22%3A24%2C%22fr' \
                        'om%22%3A{}%2C%22filterType%22%3A%22rent%22%2C%22propertyType%22%3A%5B%22house%' \
                        '22%2C%22flat%22%5D%2C%22page%22%3A{}%2C%22resultsPerPage%22%3A24%2C%22maxAuth' \
                        'orizedResults%22%3A2400%2C%22sortBy%22%3A%22relevance%22%2C%22sortOrder%22%3A%' \
                        '22desc%22%2C%22onTheMarket%22%3A%5Btrue%5D%2C%22showAllModels%22%3Afalse%2C%22z' \
                        'oneIdsByTypes%22%3A%7B%22zoneIds%22%3A%5B%22-7444%22%5D%7D%7D&extensionType=exte' \
                        'ndedIfNoResult&leadingCount=2'.format(next_from, page)
        # Requested once only: the original also issued a second,
        # dont_filter=True request for the same URL, and referenced
        # next_page_url even when it was never assigned (last page).
        yield Request(next_page_url, self.parse)
def parse(self, response):
    """Parse a laforet.com results page and follow each listing.

    Every ``<li>`` of the results list carries a ``data-json`` attribute;
    its fields pre-fill a RealestateItem which is passed in
    ``meta['item']`` to ``self.final_parse`` for the detail page. Optional
    JSON keys that are missing are skipped. The 'Next' link, when present,
    is followed with the duplicate filter disabled.
    """
    for tag in response.xpath('//ul[@class="results-compact"]/li'):
        raw_json = tag.xpath('./@data-json').extract_first()
        if not raw_json:
            # No payload on this entry; json.loads(None) used to raise.
            continue
        data_json = json.loads(raw_json)

        item = RealestateItem()
        item['title'] = data_json['title'] + ' ' + data_json['title_city']
        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'http://www.laforet.com/sites/default/themes/laforet/logo.png'
        if 'description' in data_json:
            item['description'] = data_json['description']
        try:
            item['price'] = data_json['price'].replace(' ', '')
        except (KeyError, AttributeError):
            # Key absent, or the price is not a string.
            pass
        item['type'] = data_json['title'].split(' ')[0].lower()
        item['city'] = data_json['title_city'].split(' ')[0]
        if 'postalCode' in data_json:
            item['district'] = data_json['postalCode']
        else:
            # Fall back to the last number found in the city label.
            digits = re.findall(r'[\d]+', data_json['title_city'])
            if digits:
                item['district'] = digits[-1]

        # Three more pictures are derived from the main one by replacing
        # the last character before '.jpg' with b, c and d.
        main_image = data_json['imageUrl']
        stem = main_image.split('.jpg')[0][0:-1]
        item['images'] = ','.join(
            [main_image] + [stem + suffix + '.jpg' for suffix in 'bcd'])

        optional_fields = (
            ('size', 'surface'),
            ('pieces', 'roomsQuantity'),
            ('deposit', 'deposit'),
            ('agency_fee', 'fees'),
        )
        for field, key in optional_fields:
            if key in data_json:
                item[field] = data_json[key]

        item['rent_buy'] = 'rent'
        item['url'] = response.urljoin(data_json['url'])
        yield Request(item['url'], callback=self.final_parse,
                      meta={'item': item})

    next_page_url = response.xpath(
        '//*[@aria-label="Next"]/a[@aria-label="Next"]/@href'
    ).extract_first()
    if next_page_url:
        yield Request(response.urljoin(next_page_url), callback=self.parse,
                      dont_filter=True)
def final_parse(self, response):
    """Build a RealestateItem from a seloger.com detail page.

    The item is not yielded here: a follow-up request for the ad's
    characteristics JSON is issued and the item travels in
    ``meta['item']`` to ``self.final_attr``. Nothing is yielded when no ad
    id can be derived from the URL.
    """
    item = RealestateItem()
    title = response.xpath(
        '//h1[@class="detail-title title1"]/text()').extract_first()
    if title:
        title = title.strip()
    item['title'] = title
    item['type'] = response.xpath('//h2[@class="c-h2"]/text()').extract_first()
    item['url'] = response.url
    item['rent_buy'] = "buy" if 'achat' in str(response.url) else "rent"

    rooms = 0
    pieces = 0
    size = 0
    for detail_li in response.xpath('//ul[@class="criterion"]/li'):
        detail = detail_li.xpath('./text()').extract_first()
        if not detail:
            # <li> without direct text; the ``in`` tests used to raise.
            continue
        first_word = detail.split(" ")[0]
        if 'pièce' in detail:
            pieces = first_word
        if 'chambre' in detail:
            rooms = first_word
        if 'm²' in detail:
            size = first_word.replace(",", ".")
    item['rooms'] = rooms
    item['pieces'] = pieces
    item['size'] = size

    location = response.xpath('//p[@class="localite"]/text()').extract_first()
    if location and 'Paris' in location:
        city = 'Paris'
        words = location.split(' ')
        # The second word is the arrondissement (e.g. '15ème'), if any.
        parisian_district = (words[1].replace('ème', '')
                             if len(words) > 1 else None)
    else:
        city = location
        parisian_district = None
    item['city'] = city
    item['parisian_district'] = parisian_district
    try:
        item['district'] = int(parisian_district)
    except (TypeError, ValueError):
        pass

    item['agency_name'] = response.xpath(
        '//a[@class="agence-link"]/@title').extract_first()

    agency_address = response.xpath(
        '//div[@class="agence-adresse fi fi-map-pin"]/text()'
    ).extract_first()
    if agency_address is not None:
        agency_address = agency_address.strip()
    item['agency_address'] = agency_address

    item['agency_phone'] = response.xpath(
        '//a[@class="bub-phone tagClick"]/@data-phone').extract_first()

    # First address token accepted by RepresentsInt, or None.
    agency_postal_code = None
    if agency_address:
        agency_postal_code = next(
            (tok for tok in agency_address.split(" ") if RepresentsInt(tok)),
            None)
    item['agency_postal_code'] = agency_postal_code

    item['agency_logo'] = response.xpath(
        '//img[@class="agence-logo-img"]/@src').extract_first()

    # Lazy-loaded slides carry a JSON blob; eager ones a plain src.
    lazy_slides = response.xpath(
        '//div[contains(@class, "carrousel_slide")]/div/@data-lazy').extract()
    pictures = [
        json.loads(blob)['url'].replace("//", "") for blob in lazy_slides
    ]
    pictures.extend(response.xpath(
        '//div[contains(@class, "carrousel_slide")]/img/@src').extract())
    item['images'] = ",".join(pictures)

    item['online'] = 1
    item['website'] = 'seloger'
    item['website_logo'] = 'https://static-seloger.com/z/produits/sl/homepage/assets/img/bandeau_app/sl_logo_152x152_thumb.png'

    ad_id = str(response.url).split('/')[-1].split('.')[0]
    if ad_id:
        url = 'https://www.seloger.com/detail,json,caracteristique_bien.json?idannonce=' + ad_id
        yield Request(url, callback=self.final_attr, meta={'item': item},
                      dont_filter=True)
def final_parse(self, response):
    """Build and yield one listing item from a meilleursagents detail page.

    Fills a ``RealestateItem`` with static site metadata, the description,
    title, price, type, city, district, image URLs and agency name. It then
    reads the feature table (pieces, surface, rooms, floor, construction
    year) and the ``details`` table (rental charges, deposit, agency fee).
    ``rent_buy`` is ``'rent'`` when the URL contains ``location`` and
    ``'buy'`` otherwise.

    Optional values that cannot be found are left unset (or ``None`` for
    title, type and city) instead of raising. Increments ``self.count``
    and prints the running total before yielding the item.
    """

    def fill_from_rows(rows, key_xpath, value_xpath, rules):
        # For each row, the first rule whose marker occurs in the row's key
        # text decides which field is filled; the field is set to the first
        # regex match in the value cells and left untouched without a match.
        for row in rows:
            key = ''.join(row.xpath(key_xpath).extract())
            for marker, field, pattern in rules:
                if marker in key:
                    matches = row.xpath(value_xpath).re(pattern)
                    if matches:
                        item[field] = matches[0]
                    break

    item = RealestateItem()
    item['online'] = 1
    item['website'] = self.name
    item['website_logo'] = 'https://static.meilleursagents.com/3.4.0/img/www/logo-meilleursagents-std.png'
    item['url'] = response.url
    item['description'] = response.xpath(
        '//p[@class="tjustify chapo-small"]/text()').extract_first()

    title = response.xpath('//h1[@class="margin-none"]/text()').extract_first()
    item['title'] = title.strip() if title is not None else None

    # The price may be split into several digit groups; join them and use
    # a dot as decimal separator.
    price_parts = response.xpath('//div[@class="h2"]/strong/text()').re(
        r'[\d.,]+')
    if price_parts:
        item['price'] = ''.join(price_parts).replace(',', '.')

    # The type is the first word of the first cell of the feature table.
    type_text = response.xpath(
        '//table[@class="table table-striped chapo-small pull-left"]//tr/td/text()'
    ).extract_first()
    item['type'] = type_text.split(' ')[0] if type_text is not None else None

    muted = response.xpath('//div[@class="muted"]/text()')
    location = muted.extract_first()
    item['city'] = (
        location.strip().split(' ')[0] if location is not None else None)
    district = muted.re(r'[\d]+')
    if district:
        item['district'] = district[0]

    hrefs = response.xpath(
        '//div[@id="realtor_listing_carousel_pictures"]/a/@href').extract()
    # Each href is prefixed with the scheme before being stored.
    item['images'] = ','.join('https:' + href for href in hrefs)

    agency_name = response.xpath(
        '//h4[@class="tcenter margin-bottom"]/a/text()').extract_first()
    if agency_name:
        item['agency_name'] = agency_name.strip()

    # A studio defaults to one piece; an explicit "pièces" row in the
    # feature table below overrides this default.
    if item['type'] == "Studio":
        item['pieces'] = 1

    fill_from_rows(
        response.xpath('//div[not(@id)]/table/tr'),
        './td//text()',
        './td//text()',
        (
            ('pièces', 'pieces', r'[\d]+'),
            ('Surface de', 'size', r'[\d.,]+'),
            ('chambre', 'rooms', r'[\d]+'),
            ('Etage', 'floor', r'[\d]+'),
            ('Construit en', 'construction_year', r'[\d]+'),
        ),
    )

    fill_from_rows(
        response.xpath('//div[@id="details"]/table//tr'),
        './td[1]//text()',
        './td[2]//text()',
        (
            ('Charges locatives', 'other_charges', r'[\d]+'),
            ('Dépôt de garantie', 'deposit', r'[\d.,]+'),
            ('Honoraires charge', 'agency_fee', r'[\d]+'),
        ),
    )

    # Previously an unconditional assignment of 'rent' right after this
    # check made the 'buy' branch ineffective.
    item['rent_buy'] = 'rent' if 'location' in response.url else 'buy'

    self.count += 1
    print("Total Count: " + str(self.count))
    yield item