def parse_property_info(self, response):
    """Scrape one property-detail page into a single ``BuyItem``.

    Every field is extracted on a best-effort basis: a selector that
    matches nothing, or text that does not parse, leaves the field at its
    default (``0`` for numbers, ``""`` for text) instead of aborting the
    whole page.

    Args:
        response: page response exposing ``xpath()`` and ``url``.

    Yields:
        One ``BuyItem`` populated from the page.
    """

    def row_text(node, query):
        # Concatenate every text fragment matched by ``query`` under ``node``.
        return ''.join(node.xpath(query).extract())

    def page_text(query):
        # Page-level variant of ``row_text`` that never raises.
        try:
            return row_text(response, query)
        except Exception:
            return ''

    def flatten(text):
        # Remove the newlines and tabs the markup leaves inside values.
        return text.replace('\n', '').replace('\t', '')

    def number_in(text):
        # Keep digits and dots only, then parse (ValueError when none left).
        return float(''.join(ch for ch in text if ch.isdigit() or ch == '.'))

    item = BuyItem()
    price = price_per_sqft = min_area = max_area = 0
    bathrooms = bedrooms = SuperBuiltupArea = is_resale = 0
    is_price_fix = 1
    address = city = location = age_of_property = ""
    agent_name = agent_type = launch_date = status = ""
    # These three must exist even when their extraction fails, because they
    # are encoded unconditionally when the item is filled in below.
    description = posted_on_date = code = ""
    speciality = {}
    more_info = []

    # Headline price: the second whitespace-separated token is the amount,
    # scaled by the Lac / Cr marker found anywhere in the text.
    try:
        full_price = row_text(response, '//div[@class="nActualAmt"]//text()')
        amount = float(full_price.split()[1])
        if "Lac" in full_price:
            amount *= 100000
        if "Cr" in full_price:
            amount *= 10000000  # one crore is 10**7
        price = amount
    except Exception:
        pass

    try:
        SuperBuiltupArea = float(
            row_text(response, '//span[@id="coveredAreaDisplay"]//text()'))
    except Exception:
        pass
    try:
        min_area = max_area = float(
            row_text(response, '//span[@id="carpetAreaDisplay"]//text()'))
    except Exception:
        pass

    value_query = 'div[@class="dataVal"]//text()'

    more_rows = response.xpath(
        "//div[@class='nMoreListData']/div[@class='nDataRow']")
    for row in more_rows:
        try:
            label = row.xpath('div[@class="dataLabel"]//text()').extract()[0]
            if "Price" in label:
                if price == 0.0:
                    # Fallback price; parsed into the real variable only
                    # once the whole conversion has succeeded.
                    try:
                        shown = row.xpath(
                            'div[@class="dataVal"]'
                            '/span[contains(@class,"fBold")]//text()'
                        ).extract()[0]
                        price = number_in(
                            shown.split()[-1].replace(',', '').lower())
                    except Exception:
                        pass
                try:
                    rate = number_in(row.xpath(
                        'div[@class="dataVal"]/span[@class="light"]/text()'
                    ).extract()[1].split()[1])
                    unit = row.xpath(
                        'div[@class="dataVal"]/span[@class="light"]//text()'
                    ).extract()[1].split()[3].strip()
                    if "sqyrd" in unit:
                        # 1 sq yd = 9 sq ft, so a per-sq-yd rate is nine
                        # times the per-sq-ft rate.
                        rate /= 9
                    price_per_sqft = rate
                except Exception:
                    pass
            if "Address" in label:
                address = flatten(row_text(row, value_query))
            if "Water Availability" in label:
                speciality[label] = flatten(row_text(row, value_query))
            if "Status of Electricity" in label:
                speciality[label] = flatten(row_text(row, value_query))
            if "Flooring" in label:
                speciality[label] = flatten(row_text(
                    row, 'div[contains(@class,"dataVal")]//text()'))
        except Exception:
            pass

    if address == "":
        address = flatten(page_text('//div[@class="nProjNmLoc"]//text()'))
    location = page_text(
        '//span[@itemprop="streetAddress"]//text()').replace(',', ' ')
    city = page_text(
        '//span[@itemprop="addressLocality"]//text()').replace(',', ' ')
    if address == "":
        address = location + ' ' + city
    # Derive missing city / locality from the comma-separated address.
    if city == '':
        city = address.split(',')[-1]
    if location == '':
        try:
            location = address.split(',')[-2]
        except IndexError:
            pass

    info_rows = response.xpath(
        '//div[@class="nInfoDataBlock"]/div[@class="nDataRow"]')
    for row in info_rows:
        try:
            label = row_text(row, 'div[@class="dataLabel"]//text()')
            if "Configuration" in label:
                try:
                    bedrooms = int(row.xpath(
                        'div[@class="dataVal"]/span//text()'
                    ).extract()[0].split()[0])
                except Exception:
                    pass
                try:
                    extras = row_text(row, 'div[@class="dataVal"]/text()')
                    for part in extras.split(","):
                        part = part.replace('\n', ' ')
                        if "Bathroom" in part:
                            part = part.strip().split()[0]
                            bathrooms = int(part)
                        if "Room" in part:
                            speciality['addional_room'] = part.strip()
                except Exception:
                    pass
            if "Transaction" in label:
                if "Resale" in row_text(row, value_query).strip():
                    is_resale = 1
            if "Status" in label:
                status = row_text(row, value_query).replace('\n', '')
                if status == '':
                    status = row_text(
                        row, 'li/div[@class="dataVal"]//text()'
                    ).replace('\n', '')
            if "Age" in label:
                age_of_property = row_text(
                    row, value_query).replace('\n', '')
                if age_of_property == '':
                    age_of_property = row_text(
                        row, 'li/div[@class="dataVal"]//text()'
                    ).replace('\n', '')
            if "Furnish" in label:
                speciality['furnishing'] = row_text(row, value_query).strip()
            if "Car Parking" in label:
                speciality['parking'] = row_text(row, value_query).strip()
        except Exception:
            pass

    description = page_text(
        "//span[@class='dDetail']//text()").replace('\n', '')
    if description == "":
        description = flatten(page_text("//div[@class='nAboutBrf']//text()"))

    # The header block reads "<id label>: <code> | <date label>: <date>".
    # Build the date in a scratch list so a failure cannot leave a list
    # (which has no ``encode``) in ``posted_on_date``.
    try:
        header = response.xpath(
            '//div[@class="propIDnPDate"]//text()').extract()[0]
        parts = header.split('|')[1].split(':')[1].strip().replace(
            ',', ' ').split()
        parts[0], parts[1] = parts[1], parts[0]
        parts[1] = month.find_month(parts[1])
        posted_on_date = ' '.join(parts)
    except Exception:
        pass
    try:
        code = response.xpath(
            '//div[@class="propIDnPDate"][1]//text()'
        ).extract()[0].split("|")[0].split(":")[1].strip()
    except Exception:
        pass
    try:
        agent_name = response.xpath(
            '//div[@class="agntName"]//text()').extract()[-1]
    except Exception:
        pass

    amenities = ','.join(response.xpath(
        '//div[@id="normalAminities"]//li[not(@class="notAvail")]'
        '/span[@class="ameLabel"]/text()').extract())

    item['price'] = price
    item['price_per_sqft'] = price_per_sqft
    item['is_price_fix'] = is_price_fix
    item['address'] = address.encode('utf8')
    item['city'] = city.encode('utf8')
    item['location'] = location.encode('utf8')
    item['min_area'] = min_area
    item['max_area'] = max_area
    item['bathrooms'] = bathrooms
    item['bedrooms'] = bedrooms
    item['SuperBuiltupArea'] = SuperBuiltupArea
    item['age_of_property'] = age_of_property.encode('utf8')
    item['launch_date'] = launch_date.encode('utf8')
    item['posted_on'] = posted_on_date.encode('utf8')
    item['possession_status'] = status.encode('utf8')
    item['agent_name'] = agent_name.encode('utf8')
    item['agent_type'] = agent_type.encode('utf8')
    item['amenities'] = amenities.encode('utf8')
    item['speciality'] = speciality
    item['more_info'] = more_info
    item['is_resale'] = is_resale
    item['url'] = response.url
    item['code'] = code.encode('utf8')
    item['description'] = description.encode('utf8')
    yield item
def parse_property_info(self, response):
    """Scrape a property page into a single ``NewSpdItem``.

    Each field is extracted independently and on a best-effort basis, so a
    missing element leaves that field at its sentinel (``-1`` / ``-1.0``
    for numbers, ``""`` for text) rather than losing the whole page.

    Args:
        response: page response exposing ``xpath()`` and ``url``.

    Yields:
        One ``NewSpdItem``; an empty one if populating the item fails.
    """

    def page_text(query):
        # Concatenate every text fragment the query matches on the page.
        return ''.join(response.xpath(query).extract())

    def squeeze(text):
        # Drop newlines, then collapse runs of spaces to a single space.
        return re.sub(' +', ' ', text.replace('\n', ''))

    def first_number(text):
        # First number in ``text``; thousands separators are removed and a
        # decimal part is kept, so "1,200" -> 1200.0 and "45.5" -> 45.5.
        return float(re.findall(r'\d+(?:\.\d+)?', text.replace(',', ''))[0])

    item = NewSpdItem()
    maintainance = posted_by_details = posted_on_date = project_name = ""
    price_per_unit = location = address = city = ""
    carpet_area = super_built_area = price = -1.0
    washroom = bedrooms = -1
    is_price_fixed = True

    try:
        super_built_area = first_number(
            page_text('//i[@id="superbuiltupArea_span"]//text()'))
    except Exception:
        super_built_area = -1.0

    try:
        carpet_area = first_number(
            page_text('//i[@id="builtupArea_span"]//text()'))
    except Exception:
        carpet_area = -1.0
    if carpet_area == -1.0:
        # Built-up figure unavailable: fall back to the carpet-area span.
        try:
            carpet_area = first_number(
                page_text('//i[@id="carpetArea_span"]//text()'))
        except Exception:
            carpet_area = -1.0

    try:
        price_text = page_text(
            '//span[@class="redPd b"]/text()').replace(',', '').lower()
        price = first_number(price_text)
        if 'lac' in price_text:
            price *= 100000
        if 'cr' in price_text:
            price *= 10000000
    except Exception:
        price = -1.0

    try:
        for entry in response.xpath('//div[@class="mb10"]//li'):
            try:
                caption = ''.join(entry.xpath('i//text()').extract())
                if "Maintenance" in caption:
                    maintainance = squeeze(
                        entry.xpath('em/text()').extract()[-1])
                    break
            except Exception:
                pass
    except Exception:
        pass

    try:
        address = re.sub(' +', ' ', page_text(
            '//div[@id="AddTuplePd"]//text()'
        ).replace('Address:', '').replace('\n', ''))
        pieces = address.split(',')
        city = pieces[-2]
        location = pieces[-3]
    except Exception:
        pass

    # Parse the two counts independently so one bad value cannot leave the
    # other holding an unparsed string.
    try:
        washroom = int(response.xpath(
            '//div[@class="lf"]/b//text()').extract()[0].replace(':', ''))
    except Exception:
        washroom = -1
    try:
        bedrooms = int(response.xpath(
            '//div[@id="bedroom_numLabel"]/b//text()'
        ).extract()[-1].replace(':', ''))
    except Exception:
        bedrooms = -1

    try:
        project_name = squeeze(response.xpath(
            '//span[@class="addPdElip lf"]//text()').extract()[0])
    except Exception:
        pass

    # Build the date in a scratch list; only a fully built string is ever
    # stored, so ``posted_on_date`` can always be encoded below.
    try:
        parts = page_text(
            '//span[contains(@class,"PostdByPd")]//text()'
        ).replace('\n', '').replace('Posted on:', '').replace(',', '').split()
        parts[0], parts[1] = parts[1], parts[0]  # day first, then month
        parts[1] = find_month(parts[1])
        parts[0] = parts[0].zfill(2)  # zero-pad a single-digit day
        posted_on_date = re.sub(' +', '', '-'.join(parts[::-1]))
    except Exception:
        pass

    try:
        posted_by_details = re.sub(' +', ' ', page_text(
            '//a[@id="ContactPdBody"]/text()'
        ).replace('Contact', '').replace('\n', ''))
    except Exception:
        pass

    try:
        is_price_fixed = "Negotiable" not in page_text('//em/text()')
    except Exception:
        pass

    # Fill the item before yielding. Keeping ``yield`` outside the handler
    # means closing the generator is never mistaken for a scraping error.
    try:
        item['Price'] = price
        item['PricePerUnit'] = price_per_unit.encode('utf8')
        item['maintainance'] = maintainance.encode('utf8')
        item['is_price_fixed'] = is_price_fixed
        item['SuperBuiltupArea'] = super_built_area
        item['CarpetArea'] = carpet_area
        item['city'] = city.encode('utf8')
        item['address'] = address.encode('utf8')
        item['Location'] = location.encode('utf8')
        item['Washroom'] = washroom
        item['PostedBy'] = posted_by_details.encode('utf8')
        item['PostingDate'] = posted_on_date.encode('utf8')
        item['ProjectName'] = project_name.encode('utf8')
        item['Bedrooms'] = bedrooms
        item['URL'] = response.url
        item['website'] = response.url.split('/')[2]
    except Exception:
        item = NewSpdItem()
    yield item
def parse_property_info(self, response):
    """Render a property page with Selenium and scrape it into a ``BuyItem``.

    The URL is loaded in ``self.driver``; once the price block is present,
    the rendered page source is re-parsed and every field is extracted on a
    best-effort basis (a missing element leaves the field at ``0`` / ``""``).

    Args:
        response: the crawl response; only its ``url`` is used before the
            page is re-rendered.

    Yields:
        One ``BuyItem``. Nothing is yielded if the price block does not
        appear within 10 seconds.
    """
    item = BuyItem()
    self.driver.get(response.url)
    try:
        # Selenium locators must resolve to elements, so wait on the div
        # itself rather than on its text nodes.
        # NOTE(review): pages that only carry the ``pdPrice`` layout will
        # time out here, so the ``pd*`` fallbacks below only apply to pages
        # that also contain ``npPrice`` - confirm that is intended.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="npPrice"]')))
    except TimeoutException:
        return
    page = TextResponse(url=self.driver.current_url,
                        body=self.driver.page_source,
                        encoding='utf-8')

    def joined(query, sep=''):
        # Text fragments matched on the rendered page, joined by ``sep``;
        # returns "" if the query cannot be evaluated.
        try:
            return sep.join(page.xpath(query).extract())
        except Exception:
            return ''

    def area_in_sqft(text):
        # Leading number of ``text``; acre figures are converted to sq ft.
        value = float(text.split()[0])
        return value * 43560 if "acres" in text else value

    is_resale = price = bedrooms = bathrooms = price_per_sqft = 0
    is_price_fix = 1
    city = address = location = status = ""
    min_area = max_area = SuperBuiltupArea = 0.0
    launch_date = amenities = age_of_property = ''
    agent_name = agent_type = ""
    speciality = {}
    more_info = {}

    try:
        full_price = joined('//div[@class="npPrice"]//text()', ',')
        amount = float(full_price.split(',')[3])
        if 'Cr' in full_price:
            amount *= 10000000
        if "Lac" in full_price:
            amount *= 100000
        price = amount
    except Exception:
        pass
    if price == 0:
        try:
            full_price = joined('//span[@id="pdPrice"]//text()', ' ')
            amount = float(full_price.split()[0])
            if 'Cr' in full_price:
                amount *= 10000000
            if "Lacs" in full_price:
                amount *= 100000
            price = amount
        except Exception:
            pass

    try:
        price_per_sqft = float(page.xpath(
            '//div[@class="npBasePrice"]/span/text()').extract()[3])
    except Exception:
        pass
    if price_per_sqft == 0:
        # Alternate layout: second token of the per-unit-area text.
        try:
            price_per_sqft = float(
                joined('//div[@id="pricePerUnitArea"]/text()').split()[1])
        except Exception:
            pass

    address = joined(
        '//div[@class="project-location"]/span//text()').replace('\n', '')
    if address == "":
        address = joined('//span[@id="address"]/text()').replace('\n', '')
    try:
        city = address.split(',')[-2]
    except IndexError:
        pass
    try:
        location = page.xpath('//a[@class="ttlLink"]/text()').extract()[1]
    except Exception:
        pass

    try:
        status = ''.join(page.xpath(
            '//div[@class="npPossessionDate"]/text()').extract()[2])
    except Exception:
        pass
    if status == "":
        status = joined(
            '//div[@class="pdDetailInfoOther"]/div[3]/span/text()')

    try:
        sizes = [
            float(token)
            for token in joined(
                '//div[@class="npAreaPrice"]/span[1]/text()').split()
            if token.isdigit()
        ]
        try:
            min_area = sizes[0]
            max_area = sizes[1]
        except IndexError:
            # Only one size (or none) listed: treat it as a fixed size.
            max_area = min_area
    except Exception:
        pass

    # Parse into the variable only on success so a missing block leaves the
    # numeric default instead of an empty string.
    try:
        SuperBuiltupArea = area_in_sqft(
            joined('//div[@class="npPrjArea"] /span//text()', ' '))
    except Exception:
        pass
    if min_area == 0.0:
        try:
            min_area = float(
                joined('//span[@id="superbuiltupArea_span"]/text()'))
            max_area = min_area
        except Exception:
            pass
    try:
        SuperBuiltupArea = area_in_sqft(
            joined('//div[@id="socAreaOccupied"]/text()'))
    except Exception:
        pass

    # TODO(review): the posting date (//span[@class="pdPropDate"]) and the
    # description (//div[@id = "description"]) are not part of the item
    # emitted here; add them once the matching BuyItem fields are confirmed.

    amenities = joined(
        '//div[@id="amenitiesSection"]/div/div[2]/div/div/div//text()', ',')
    if amenities == "":
        amenities = joined('//div[@id="features"]/div/div//text()', ',')

    try:
        facts = page.xpath('//div[@class=" pdOtherFacts responsive"]/div')
        for fact in facts:
            try:
                # Read each fact's own spans, not those of the whole list.
                header = ''.join(fact.xpath('span[1]//text()').extract())
                text = ''.join(fact.xpath('span[2]//text()').extract())
                speciality[header] = text
            except Exception:
                pass
    except Exception:
        pass

    try:
        agent_type = joined(
            '//div[@id="QryFormPd"]'
            '//span[@class="dealerWidgetHeading"]//text()'
        ).replace('Details', '')
        agent_name = joined(
            '//div[@id="QryFormPd"]//div[@class="c2dInfo"]//text()', ','
        ).split()[0]
    except Exception:
        pass
    if agent_name == "":
        agent_name = joined(
            '//div[@id="QryFormPd"]//div[@class="c2dRunCaptionAbtDev "]'
            '//span[@class="spanBold"]//text()', ' '
        ).replace('About ', '')

    # Substring test on the joined text, so surrounding whitespace or extra
    # fragments do not hide the "Resale" marker.
    if 'Resale' in joined('//span[@id="transactionType"]//text()'):
        is_resale = 1

    try:
        bedrooms = int(joined('//div[@id="bedRoomNum"]//text()').split()[0])
    except Exception:
        pass
    try:
        bathrooms = int(
            joined('//div[@id="bathroomNum"]//text()').split()[0])
    except Exception:
        pass

    age_of_property = joined('//div[@id="agePossessionLbl"]//text()')
    amenities += ", " + joined('//div[@id="additionalRooms"]//text()')

    item['price'] = price
    item['price_per_sqft'] = price_per_sqft
    item['is_price_fix'] = is_price_fix
    item['address'] = address.encode('utf8')
    item['city'] = city.encode('utf8')
    item['location'] = location.encode('utf8')
    item['min_area'] = min_area
    item['max_area'] = max_area
    item['bathrooms'] = bathrooms
    item['bedrooms'] = bedrooms
    item['SuperBuiltupArea'] = SuperBuiltupArea
    item['age_of_property'] = age_of_property.encode('utf8')
    item['launch_date'] = launch_date.encode('utf8')
    item['possession_status'] = status.encode('utf8')
    item['agent_name'] = agent_name.encode('utf8')
    item['agent_type'] = agent_type.encode('utf8')
    item['amenities'] = amenities.encode('utf8')
    item['speciality'] = speciality
    item['more_info'] = more_info
    item['is_resale'] = is_resale
    item['url'] = page.url
    yield item
def parse_property_info(self, response):
    """Scrape a rental property page into a ``NewSpdItem``.

    Each field is extracted on a best-effort basis, so a missing element
    leaves that field at its sentinel (``-1`` / ``-1.0`` for numbers,
    ``""`` for text).

    Args:
        response: page response exposing ``xpath()`` and ``url``.

    Yields:
        The populated ``NewSpdItem``, or ``None`` when the project name is
        missing or the item could not be populated.
    """

    def page_text(query):
        # Concatenate the text fragments the query matches; "" on failure.
        try:
            return ''.join(response.xpath(query).extract())
        except Exception:
            return ''

    def number_in(text):
        # Keep digits and dots only, then parse (ValueError when none left).
        return float(''.join(ch for ch in text if ch.isdigit() or ch == '.'))

    def area_in_sqft(value_query, unit):
        # First number of the displayed area, converted from sq yd to sq ft
        # when the unit text says so; -1.0 when nothing can be parsed.
        try:
            shown = page_text(value_query).replace(',', '').replace('\n', '')
            area = float(re.findall(r'\d+(?:\.\d+)?', shown)[0])
            if "yrd" in unit:
                area *= 9
            return area
        except Exception:
            return -1.0

    item = NewSpdItem()
    maintainance = posted_by_details = posted_on_date = project_name = ""
    location = address = city = ""
    washroom = bedrooms = -1
    is_price_fixed = True
    price = price_per_unit = -1.0

    # The unit strings always exist (possibly empty), so a missing unit
    # element can no longer discard an otherwise valid area.
    super_built_area = area_in_sqft(
        '//span[@id="coveredAreaDisplay"]//text()',
        page_text('//div[@id="coveredAreaUnit"]//text()'))
    carpet_area = area_in_sqft(
        '//span[@id="carpetAreaDisplay"]//text()',
        page_text('//div[@id="carpetAreaUnit"]//text()'))

    try:
        more_rows = response.xpath(
            "//div[@class='nMoreListData']/div[@class='nDataRow']")
        for row in more_rows:
            try:
                label = row.xpath(
                    'div[@class="dataLabel"]//text()').extract()[0]
                if "Rent" in label:
                    try:
                        shown = row.xpath(
                            'div[@class="dataVal"]'
                            '/span[contains(@class,"fBold")]//text()'
                        ).extract()[0]
                        price = number_in(
                            shown.split()[-1].replace(',', '').lower())
                    except Exception:
                        price = -1.0
                    try:
                        rate = number_in(row.xpath(
                            'div[@class="dataVal"]'
                            '/span[@class="light"]/text()'
                        ).extract()[1].split()[1])
                        unit = row.xpath(
                            'div[@class="dataVal"]'
                            '/span[@class="light"]//text()'
                        ).extract()[1].split()[3].strip()
                        if "sqyrd" in unit:
                            # Areas above are normalised to sq ft; a
                            # per-sq-yd rate is nine times the per-sq-ft
                            # rate (1 sq yd = 9 sq ft).
                            rate /= 9
                        price_per_unit = rate
                    except Exception:
                        price_per_unit = -1.0
                if "Address" in label:
                    address = ''.join(row.xpath(
                        'div[@class="dataVal"]/text()'
                    ).extract()).replace('\n', '')
            except Exception:
                pass
    except Exception:
        pass

    location = page_text(
        '//span[@itemprop="streetAddress"]//text()').replace(',', ' ')
    city = page_text(
        '//span[@itemprop="addressLocality"]//text()').replace(',', ' ')
    if address == '':
        address = location + city

    try:
        info_rows = response.xpath(
            '//div[@class="nInfoDataBlock"]/div[@class="nDataRow"]')
        for row in info_rows:
            try:
                label = ''.join(row.xpath(
                    'div[@class="dataLabel"]//text()').extract())
                if "Configuration" not in label:
                    continue
                try:
                    bedrooms = int(row.xpath(
                        'div[@class="dataVal"]/span//text()'
                    ).extract()[0].split()[0])
                except Exception:
                    pass
                extras = ''.join(row.xpath(
                    'div[@class="dataVal"]/text()').extract())
                for part in extras.split(","):
                    try:
                        part = part.replace('\n', ' ')
                        if "Bathroom" in part:
                            washroom = int(part.strip().split()[0])
                    except Exception:
                        pass
            except Exception:
                pass
    except Exception:
        pass

    # Build the date in a scratch list; only a fully built string is ever
    # stored, so ``posted_on_date`` can always be encoded below.
    try:
        header = response.xpath(
            '//div[@class="propIDnPDate"]//text()').extract()[0]
        raw_date = header.split('|')[1].split(':')[1]
        raw_date = raw_date.replace(',', ' ').replace('\'', ' ')
        parts = re.sub(' +', ' ', raw_date).split()
        parts[0], parts[1] = parts[1], parts[0]  # day first, then month
        parts[1] = find_month(parts[1])
        parts[0] = parts[0].zfill(2)  # zero-pad a single-digit day
        posted_on_date = '-'.join(parts[::-1])
    except Exception:
        pass

    project_name = page_text(
        '//div[@class="nProjNmLoc"]/a//text()').replace('\n', '')

    try:
        posted_by_details = response.xpath(
            '//a[contains(@id,"agentBtn")]//text()'
        ).extract()[0].replace('Contact ', '')
    except Exception:
        pass

    # Decide what to emit first and yield afterwards. Keeping ``yield``
    # outside the handler means closing the generator is never mistaken for
    # a scraping error.
    result = None
    try:
        item['URL'] = response.url
        item['website'] = response.url.split('/')[2]
        item['Price'] = price
        item['PricePerUnit'] = price_per_unit
        item['maintainance'] = maintainance.encode('utf8')
        item['is_price_fixed'] = is_price_fixed
        item['SuperBuiltupArea'] = super_built_area
        item['CarpetArea'] = carpet_area
        item['city'] = city.encode('utf8')
        item['address'] = address.encode('utf8')
        item['Location'] = location.encode('utf8')
        item['Washroom'] = washroom
        item['Bedrooms'] = bedrooms
        item['PostedBy'] = posted_by_details.encode('utf8')
        item['PostingDate'] = posted_on_date.encode('utf8')
        item['ProjectName'] = project_name.encode('utf8')
        if project_name == '':
            # Single-argument print calls behave the same on Python 2 and 3.
            print(response.url)
            print("\n\n\nproject name missing\n\n\n")
        else:
            pprint(item)
            result = item
    except Exception:
        print("error1")
    yield result
def parse_property_info(self, response):
    """Scrape one property-detail page into a single ``BuyItem``.

    Every field is extracted on a best-effort basis: a selector that
    matches nothing, or text that does not parse, leaves the field at its
    default (``0`` for numbers, ``""`` for text) instead of aborting the
    whole page.

    Args:
        response: page response exposing ``xpath()`` and ``url``.

    Yields:
        One ``BuyItem`` populated from the page.
    """

    def row_text(node, query):
        # Concatenate every text fragment matched by ``query`` under ``node``.
        return ''.join(node.xpath(query).extract())

    def page_text(query):
        # Page-level variant of ``row_text`` that never raises.
        try:
            return row_text(response, query)
        except Exception:
            return ''

    def flatten(text):
        # Remove the newlines and tabs the markup leaves inside values.
        return text.replace('\n', '').replace('\t', '')

    def number_in(text):
        # Keep digits and dots only, then parse (ValueError when none left).
        return float(''.join(ch for ch in text if ch.isdigit() or ch == '.'))

    item = BuyItem()
    price = price_per_sqft = min_area = max_area = 0
    bathrooms = bedrooms = SuperBuiltupArea = is_resale = 0
    is_price_fix = 1
    address = city = location = age_of_property = ""
    agent_name = agent_type = launch_date = status = ""
    # These three must exist even when their extraction fails, because they
    # are encoded unconditionally when the item is filled in below.
    description = posted_on_date = code = ""
    speciality = {}
    more_info = []

    # Headline price: the second whitespace-separated token is the amount,
    # scaled by the Lac / Cr marker found anywhere in the text.
    try:
        full_price = row_text(response, '//div[@class="nActualAmt"]//text()')
        amount = float(full_price.split()[1])
        if "Lac" in full_price:
            amount *= 100000
        if "Cr" in full_price:
            amount *= 10000000  # one crore is 10**7
        price = amount
    except Exception:
        pass

    try:
        SuperBuiltupArea = float(
            row_text(response, '//span[@id="coveredAreaDisplay"]//text()'))
    except Exception:
        pass
    try:
        min_area = max_area = float(
            row_text(response, '//span[@id="carpetAreaDisplay"]//text()'))
    except Exception:
        pass

    value_query = 'div[@class="dataVal"]//text()'

    more_rows = response.xpath(
        "//div[@class='nMoreListData']/div[@class='nDataRow']")
    for row in more_rows:
        try:
            label = row.xpath('div[@class="dataLabel"]//text()').extract()[0]
            if "Price" in label:
                if price == 0.0:
                    # Fallback price; parsed into the real variable only
                    # once the whole conversion has succeeded.
                    try:
                        shown = row.xpath(
                            'div[@class="dataVal"]'
                            '/span[contains(@class,"fBold")]//text()'
                        ).extract()[0]
                        price = number_in(
                            shown.split()[-1].replace(',', '').lower())
                    except Exception:
                        pass
                try:
                    rate = number_in(row.xpath(
                        'div[@class="dataVal"]/span[@class="light"]/text()'
                    ).extract()[1].split()[1])
                    unit = row.xpath(
                        'div[@class="dataVal"]/span[@class="light"]//text()'
                    ).extract()[1].split()[3].strip()
                    if "sqyrd" in unit:
                        # 1 sq yd = 9 sq ft, so a per-sq-yd rate is nine
                        # times the per-sq-ft rate.
                        rate /= 9
                    price_per_sqft = rate
                except Exception:
                    pass
            if "Address" in label:
                address = flatten(row_text(row, value_query))
            if "Water Availability" in label:
                speciality[label] = flatten(row_text(row, value_query))
            if "Status of Electricity" in label:
                speciality[label] = flatten(row_text(row, value_query))
            if "Flooring" in label:
                speciality[label] = flatten(row_text(
                    row, 'div[contains(@class,"dataVal")]//text()'))
        except Exception:
            pass

    if address == "":
        address = flatten(page_text('//div[@class="nProjNmLoc"]//text()'))
    location = page_text(
        '//span[@itemprop="streetAddress"]//text()').replace(',', ' ')
    city = page_text(
        '//span[@itemprop="addressLocality"]//text()').replace(',', ' ')
    if address == "":
        address = location + ' ' + city
    # Derive missing city / locality from the comma-separated address.
    if city == '':
        city = address.split(',')[-1]
    if location == '':
        try:
            location = address.split(',')[-2]
        except IndexError:
            pass

    info_rows = response.xpath(
        '//div[@class="nInfoDataBlock"]/div[@class="nDataRow"]')
    for row in info_rows:
        try:
            label = row_text(row, 'div[@class="dataLabel"]//text()')
            if "Configuration" in label:
                try:
                    bedrooms = int(row.xpath(
                        'div[@class="dataVal"]/span//text()'
                    ).extract()[0].split()[0])
                except Exception:
                    pass
                try:
                    extras = row_text(row, 'div[@class="dataVal"]/text()')
                    for part in extras.split(","):
                        part = part.replace('\n', ' ')
                        if "Bathroom" in part:
                            part = part.strip().split()[0]
                            bathrooms = int(part)
                        if "Room" in part:
                            speciality['addional_room'] = part.strip()
                except Exception:
                    pass
            if "Transaction" in label:
                if "Resale" in row_text(row, value_query).strip():
                    is_resale = 1
            if "Status" in label:
                status = row_text(row, value_query).replace('\n', '')
                if status == '':
                    status = row_text(
                        row, 'li/div[@class="dataVal"]//text()'
                    ).replace('\n', '')
            if "Age" in label:
                age_of_property = row_text(
                    row, value_query).replace('\n', '')
                if age_of_property == '':
                    age_of_property = row_text(
                        row, 'li/div[@class="dataVal"]//text()'
                    ).replace('\n', '')
            if "Furnish" in label:
                speciality['furnishing'] = row_text(row, value_query).strip()
            if "Car Parking" in label:
                speciality['parking'] = row_text(row, value_query).strip()
        except Exception:
            pass

    description = page_text(
        "//span[@class='dDetail']//text()").replace('\n', '')
    if description == "":
        description = flatten(page_text("//div[@class='nAboutBrf']//text()"))

    # The header block reads "<id label>: <code> | <date label>: <date>".
    # Build the date in a scratch list so a failure cannot leave a list
    # (which has no ``encode``) in ``posted_on_date``.
    try:
        header = response.xpath(
            '//div[@class="propIDnPDate"]//text()').extract()[0]
        parts = header.split('|')[1].split(':')[1].strip().replace(
            ',', ' ').split()
        parts[0], parts[1] = parts[1], parts[0]
        parts[1] = month.find_month(parts[1])
        posted_on_date = ' '.join(parts)
    except Exception:
        pass
    try:
        code = response.xpath(
            '//div[@class="propIDnPDate"][1]//text()'
        ).extract()[0].split("|")[0].split(":")[1].strip()
    except Exception:
        pass
    try:
        agent_name = response.xpath(
            '//div[@class="agntName"]//text()').extract()[-1]
    except Exception:
        pass

    amenities = ','.join(response.xpath(
        '//div[@id="normalAminities"]//li[not(@class="notAvail")]'
        '/span[@class="ameLabel"]/text()').extract())

    item['price'] = price
    item['price_per_sqft'] = price_per_sqft
    item['is_price_fix'] = is_price_fix
    item['address'] = address.encode('utf8')
    item['city'] = city.encode('utf8')
    item['location'] = location.encode('utf8')
    item['min_area'] = min_area
    item['max_area'] = max_area
    item['bathrooms'] = bathrooms
    item['bedrooms'] = bedrooms
    item['SuperBuiltupArea'] = SuperBuiltupArea
    item['age_of_property'] = age_of_property.encode('utf8')
    item['launch_date'] = launch_date.encode('utf8')
    item['posted_on'] = posted_on_date.encode('utf8')
    item['possession_status'] = status.encode('utf8')
    item['agent_name'] = agent_name.encode('utf8')
    item['agent_type'] = agent_type.encode('utf8')
    item['amenities'] = amenities.encode('utf8')
    item['speciality'] = speciality
    item['more_info'] = more_info
    item['is_resale'] = is_resale
    item['url'] = response.url
    item['code'] = code.encode('utf8')
    item['description'] = description.encode('utf8')
    yield item
def parse_property_info(self, response):
    """Render a property page with Selenium and scrape it into a ``BuyItem``.

    The URL is loaded in ``self.driver``; once the price block is present,
    the rendered page source is re-parsed and every field is extracted on a
    best-effort basis (a missing element leaves the field at ``0`` / ``""``).

    Args:
        response: the crawl response; only its ``url`` is used before the
            page is re-rendered.

    Yields:
        One ``BuyItem``. Nothing is yielded if the price block does not
        appear within 10 seconds.
    """
    item = BuyItem()
    self.driver.get(response.url)
    try:
        # Selenium locators must resolve to elements, so wait on the div
        # itself rather than on its text nodes.
        # NOTE(review): pages that only carry the ``pdPrice`` layout will
        # time out here, so the ``pd*`` fallbacks below only apply to pages
        # that also contain ``npPrice`` - confirm that is intended.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="npPrice"]')))
    except TimeoutException:
        return
    page = TextResponse(url=self.driver.current_url,
                        body=self.driver.page_source,
                        encoding='utf-8')

    def joined(query, sep=''):
        # Text fragments matched on the rendered page, joined by ``sep``;
        # returns "" if the query cannot be evaluated.
        try:
            return sep.join(page.xpath(query).extract())
        except Exception:
            return ''

    def area_in_sqft(text):
        # Leading number of ``text``; acre figures are converted to sq ft.
        value = float(text.split()[0])
        return value * 43560 if "acres" in text else value

    is_resale = price = bedrooms = bathrooms = price_per_sqft = 0
    is_price_fix = 1
    city = address = location = status = ""
    min_area = max_area = SuperBuiltupArea = 0.0
    launch_date = amenities = age_of_property = ''
    agent_name = agent_type = ""
    speciality = {}
    more_info = {}

    try:
        full_price = joined('//div[@class="npPrice"]//text()', ',')
        amount = float(full_price.split(',')[3])
        if 'Cr' in full_price:
            amount *= 10000000
        if "Lac" in full_price:
            amount *= 100000
        price = amount
    except Exception:
        pass
    if price == 0:
        try:
            full_price = joined('//span[@id="pdPrice"]//text()', ' ')
            amount = float(full_price.split()[0])
            if 'Cr' in full_price:
                amount *= 10000000
            if "Lacs" in full_price:
                amount *= 100000
            price = amount
        except Exception:
            pass

    try:
        price_per_sqft = float(page.xpath(
            '//div[@class="npBasePrice"]/span/text()').extract()[3])
    except Exception:
        pass
    if price_per_sqft == 0:
        # Alternate layout: second token of the per-unit-area text.
        try:
            price_per_sqft = float(
                joined('//div[@id="pricePerUnitArea"]/text()').split()[1])
        except Exception:
            pass

    address = joined(
        '//div[@class="project-location"]/span//text()').replace('\n', '')
    if address == "":
        address = joined('//span[@id="address"]/text()').replace('\n', '')
    try:
        city = address.split(',')[-2]
    except IndexError:
        pass
    try:
        location = page.xpath('//a[@class="ttlLink"]/text()').extract()[1]
    except Exception:
        pass

    try:
        status = ''.join(page.xpath(
            '//div[@class="npPossessionDate"]/text()').extract()[2])
    except Exception:
        pass
    if status == "":
        status = joined(
            '//div[@class="pdDetailInfoOther"]/div[3]/span/text()')

    try:
        sizes = [
            float(token)
            for token in joined(
                '//div[@class="npAreaPrice"]/span[1]/text()').split()
            if token.isdigit()
        ]
        try:
            min_area = sizes[0]
            max_area = sizes[1]
        except IndexError:
            # Only one size (or none) listed: treat it as a fixed size.
            max_area = min_area
    except Exception:
        pass

    # Parse into the variable only on success so a missing block leaves the
    # numeric default instead of an empty string.
    try:
        SuperBuiltupArea = area_in_sqft(
            joined('//div[@class="npPrjArea"] /span//text()', ' '))
    except Exception:
        pass
    if min_area == 0.0:
        try:
            min_area = float(
                joined('//span[@id="superbuiltupArea_span"]/text()'))
            max_area = min_area
        except Exception:
            pass
    try:
        SuperBuiltupArea = area_in_sqft(
            joined('//div[@id="socAreaOccupied"]/text()'))
    except Exception:
        pass

    # TODO(review): the posting date (//span[@class="pdPropDate"]) and the
    # description (//div[@id = "description"]) are not part of the item
    # emitted here; add them once the matching BuyItem fields are confirmed.

    amenities = joined(
        '//div[@id="amenitiesSection"]/div/div[2]/div/div/div//text()', ',')
    if amenities == "":
        amenities = joined('//div[@id="features"]/div/div//text()', ',')

    try:
        facts = page.xpath('//div[@class=" pdOtherFacts responsive"]/div')
        for fact in facts:
            try:
                # Read each fact's own spans, not those of the whole list.
                header = ''.join(fact.xpath('span[1]//text()').extract())
                text = ''.join(fact.xpath('span[2]//text()').extract())
                speciality[header] = text
            except Exception:
                pass
    except Exception:
        pass

    try:
        agent_type = joined(
            '//div[@id="QryFormPd"]'
            '//span[@class="dealerWidgetHeading"]//text()'
        ).replace('Details', '')
        agent_name = joined(
            '//div[@id="QryFormPd"]//div[@class="c2dInfo"]//text()', ','
        ).split()[0]
    except Exception:
        pass
    if agent_name == "":
        agent_name = joined(
            '//div[@id="QryFormPd"]//div[@class="c2dRunCaptionAbtDev "]'
            '//span[@class="spanBold"]//text()', ' '
        ).replace('About ', '')

    # Substring test on the joined text, so surrounding whitespace or extra
    # fragments do not hide the "Resale" marker.
    if 'Resale' in joined('//span[@id="transactionType"]//text()'):
        is_resale = 1

    try:
        bedrooms = int(joined('//div[@id="bedRoomNum"]//text()').split()[0])
    except Exception:
        pass
    try:
        bathrooms = int(
            joined('//div[@id="bathroomNum"]//text()').split()[0])
    except Exception:
        pass

    age_of_property = joined('//div[@id="agePossessionLbl"]//text()')
    amenities += ", " + joined('//div[@id="additionalRooms"]//text()')

    item['price'] = price
    item['price_per_sqft'] = price_per_sqft
    item['is_price_fix'] = is_price_fix
    item['address'] = address.encode('utf8')
    item['city'] = city.encode('utf8')
    item['location'] = location.encode('utf8')
    item['min_area'] = min_area
    item['max_area'] = max_area
    item['bathrooms'] = bathrooms
    item['bedrooms'] = bedrooms
    item['SuperBuiltupArea'] = SuperBuiltupArea
    item['age_of_property'] = age_of_property.encode('utf8')
    item['launch_date'] = launch_date.encode('utf8')
    item['possession_status'] = status.encode('utf8')
    item['agent_name'] = agent_name.encode('utf8')
    item['agent_type'] = agent_type.encode('utf8')
    item['amenities'] = amenities.encode('utf8')
    item['speciality'] = speciality
    item['more_info'] = more_info
    item['is_resale'] = is_resale
    item['url'] = page.url
    yield item
def parse_property_info(self, response):
    """Build one ``BuyItem`` from a property detail page and yield it.

    Every field is scraped on a best-effort basis: when a value is
    missing or cannot be parsed, the field keeps its default (``0`` /
    ``0.0`` for numbers, ``""`` for text, empty container otherwise).

    Args:
        response: page object exposing ``xpath()`` and ``url``.

    Yields:
        A single ``BuyItem`` with price, address, area, agent, amenity
        and per-configuration (``more_info``) fields filled in.
    """

    def joined_text(node, query, sep=''):
        # Concatenate every string matched by `query` relative to `node`.
        return sep.join(node.xpath(query).extract())

    def leading_int(text):
        # First whitespace-separated token as an int (thousands commas
        # removed); None when there is no token or it is not numeric.
        try:
            return int(text.split()[0].replace(',', ''))
        except (ValueError, IndexError):
            return None

    item = BuyItem()
    is_resale = price = bedrooms = bathrooms = price_per_sqft = 0
    is_price_fix = 1

    # ---- price ---------------------------------------------------------
    try:
        price = int(joined_text(
            response, '//span[@class="price-info"]/@data-value'))
    except ValueError:
        # No usable numeric price on the page: flag it as not fixed.
        is_price_fix = 0

    # Two places may carry the per-unit price; the second one wins when
    # both parse.  Only a successfully parsed int is ever stored, so a raw
    # string can never leak into the item.
    for query in ('//div[@class="pp-container"]/span/text()',
                  '//div[@class="emi-sub-container"]/span/text()'):
        parsed = leading_int(joined_text(response, query))
        if parsed is not None:
            price_per_sqft = parsed

    # ---- address / city / location --------------------------------------
    address = joined_text(
        response, '//div[@class="location-info"]//text()').replace('\n', '')
    address_parts = address.split(',')
    city = address_parts[-1]
    if len(address_parts) >= 2:
        # The second-to-last address component takes precedence.
        location = address_parts[-2]
    else:
        # Otherwise fall back to the sixth search link text, if present.
        crumbs = response.xpath(
            '//a[@data-category="search"]/span/text()').extract()
        location = crumbs[5] if len(crumbs) > 5 else ""

    # ---- project info: possession, sizes, age ---------------------------
    status = age_of_property = ""
    min_area = max_area = 0.0
    for info in response.xpath(
            '//div[@class="project-info-container"]/div'):
        description = joined_text(
            info, 'div[@class="info-description"]//text()')
        value = joined_text(info, 'div[@class="info-value"]//text()')
        if "Possession" in description:
            status = value.replace('\n', '')
        if "Sizes" in description or "area" in description:
            try:
                sizes = [float(tok) for tok in value.split()
                         if tok.isdigit()]
            except ValueError:
                # isdigit() accepts some characters float() rejects.
                sizes = None
            if sizes is not None:
                if len(sizes) >= 2:
                    min_area, max_area = sizes[0], sizes[1]
                elif sizes:
                    min_area = max_area = sizes[0]
                else:
                    max_area = min_area
        if "Age of property" in description:
            age_of_property = value.replace('\n', '')

    # ---- overview card ---------------------------------------------------
    SuperBuiltupArea = 0.0
    launched = added = None
    for entity in response.xpath(
            '//div[@id="overview-card"]//span[@class="entity"]'):
        label = joined_text(entity, 'span/span[@class="text"]//text()')
        value = joined_text(entity, 'span/span[@class="value"]//text()')

        if "Area" in label:
            try:
                area = float(value.split()[0])
            except (ValueError, IndexError):
                pass
            else:
                # 43560 is the acre multiplier applied when the value
                # text mentions "Acres".
                SuperBuiltupArea = area * 43560 if "Acres" in value else area

        if "Launch" in label:
            tokens = value.strip().replace('\n', '').replace(',', '').split()
            try:
                tokens[0] = find_month(tokens[0])
                launched = ' '.join(tokens)
            except Exception:
                # find_month is defined elsewhere and may fail in ways not
                # visible here; keep the previous value instead of storing
                # a half-built token list.
                pass

        if "Price" in label and "negotiable" in value:
            is_price_fix = 0

        if "Added" in label:
            tokens = value.replace('\n', '').replace(',', '').split()
            try:
                # Keep only the digit characters of the first token.
                tokens[0] = ''.join(ch for ch in tokens[0] if ch.isdigit())
                tokens[1] = find_month(tokens[1])
                added = ' '.join(tokens)
            except Exception:
                # Same reasoning as for "Launch" above.
                pass

        if "Bedrooms" in label:
            count = leading_int(value)
            if count is not None:
                bedrooms = count

        if "Bathrooms" in label:
            count = leading_int(value)
            if count is not None:
                bathrooms = count

    # An "Added" date overrides a "Launch" date when both were parsed.
    # launch_date is always a string so the encode() below cannot fail.
    if added is not None:
        launch_date = added
    elif launched is not None:
        launch_date = launched
    else:
        launch_date = ''

    # ---- amenities / speciality -----------------------------------------
    amenities = ','.join(response.xpath(
        '//span[@class="amenity-entity"]//span[@class="text"]//text()'
    ).extract())
    speciality = {}
    for spec in response.xpath('//div[@class="amenity-entity"]'):
        # Query each block on its own so headers and texts pair up.
        header = joined_text(spec, 'span[@class="header"]//text()')
        speciality[header] = joined_text(
            spec, 'span[@class="texts"]//text()')

    # ---- agent -------------------------------------------------------------
    agent_name = joined_text(response, '//*[@class="name"]//text()')
    agent_type = joined_text(
        response, '//div[@class="info"]/div[@class="type"]//text()')

    # ---- per-configuration rows: (bhk, size, rate) ----------------------
    more_info = []
    bhk = 0
    for row in response.xpath(
            '//div[@class="nsv-list-item-container"]/div'):
        # './/' keeps the query inside this row; a leading '//' would
        # search the whole document.
        row_text = joined_text(row, './/text()')
        try:
            if "BHK" in row_text:
                # Heading row: remember the count for the rows after it.
                bhk = int(joined_text(row, 'span/span//text()').split()[0])
            else:
                size = float(joined_text(
                    row, 'div/div[@class="list-heading"]//text()'
                ).split()[0].replace(',', ''))
                full_rate = joined_text(
                    row, 'div/div[@class="list-price"]//span/text()')
                rate = float(full_rate.split()[0].replace(',', ''))
                if 'Lacs' in full_rate:
                    rate *= 100000
                if "Cr" in full_rate:
                    rate *= 10000000
                more_info.append((bhk, size, rate))
        except (ValueError, IndexError):
            # Row does not have the expected shape; skip it.
            continue

    if "resale" in response.url:
        is_resale = 1

    item['price'] = price
    item['price_per_sqft'] = price_per_sqft
    item['is_price_fix'] = is_price_fix
    item['address'] = address.encode('utf8')
    item['city'] = city.encode('utf8')
    item['location'] = location.encode('utf8')
    item['min_area'] = min_area
    item['max_area'] = max_area
    item['bathrooms'] = bathrooms
    item['bedrooms'] = bedrooms
    item['SuperBuiltupArea'] = SuperBuiltupArea
    item['age_of_property'] = age_of_property.encode('utf8')
    item['launch_date'] = launch_date.encode('utf8')
    item['possession_status'] = status.encode('utf8')
    item['agent_name'] = agent_name.encode('utf8')
    item['agent_type'] = agent_type.encode('utf8')
    item['amenities'] = amenities.encode('utf8')
    item['speciality'] = speciality
    item['more_info'] = more_info
    item['is_resale'] = is_resale
    item['url'] = response.url
    yield item
def parse_property_info(self, response):
    """Build one ``BuyItem`` from a property detail page and yield it.

    Every field is scraped on a best-effort basis: when a value is
    missing or cannot be parsed, the field keeps its default (``0`` /
    ``0.0`` for numbers, ``""`` for text, empty container otherwise).

    Args:
        response: page object exposing ``xpath()`` and ``url``.

    Yields:
        A single ``BuyItem`` with price, address, area, agent, amenity
        and per-configuration (``more_info``) fields filled in.
    """

    def joined_text(node, query, sep=''):
        # Concatenate every string matched by `query` relative to `node`.
        return sep.join(node.xpath(query).extract())

    def leading_int(text):
        # First whitespace-separated token as an int (thousands commas
        # removed); None when there is no token or it is not numeric.
        try:
            return int(text.split()[0].replace(',', ''))
        except (ValueError, IndexError):
            return None

    item = BuyItem()
    is_resale = price = bedrooms = bathrooms = price_per_sqft = 0
    is_price_fix = 1

    # ---- price ---------------------------------------------------------
    try:
        price = int(joined_text(
            response, '//span[@class="price-info"]/@data-value'))
    except ValueError:
        # No usable numeric price on the page: flag it as not fixed.
        is_price_fix = 0

    # Two places may carry the per-unit price; the second one wins when
    # both parse.  Only a successfully parsed int is ever stored, so a raw
    # string can never leak into the item.
    for query in ('//div[@class="pp-container"]/span/text()',
                  '//div[@class="emi-sub-container"]/span/text()'):
        parsed = leading_int(joined_text(response, query))
        if parsed is not None:
            price_per_sqft = parsed

    # ---- address / city / location --------------------------------------
    address = joined_text(
        response, '//div[@class="location-info"]//text()').replace('\n', '')
    address_parts = address.split(',')
    city = address_parts[-1]
    if len(address_parts) >= 2:
        # The second-to-last address component takes precedence.
        location = address_parts[-2]
    else:
        # Otherwise fall back to the sixth search link text, if present.
        crumbs = response.xpath(
            '//a[@data-category="search"]/span/text()').extract()
        location = crumbs[5] if len(crumbs) > 5 else ""

    # ---- project info: possession, sizes, age ---------------------------
    status = age_of_property = ""
    min_area = max_area = 0.0
    for info in response.xpath(
            '//div[@class="project-info-container"]/div'):
        description = joined_text(
            info, 'div[@class="info-description"]//text()')
        value = joined_text(info, 'div[@class="info-value"]//text()')
        if "Possession" in description:
            status = value.replace('\n', '')
        if "Sizes" in description or "area" in description:
            try:
                sizes = [float(tok) for tok in value.split()
                         if tok.isdigit()]
            except ValueError:
                # isdigit() accepts some characters float() rejects.
                sizes = None
            if sizes is not None:
                if len(sizes) >= 2:
                    min_area, max_area = sizes[0], sizes[1]
                elif sizes:
                    min_area = max_area = sizes[0]
                else:
                    max_area = min_area
        if "Age of property" in description:
            age_of_property = value.replace('\n', '')

    # ---- overview card ---------------------------------------------------
    SuperBuiltupArea = 0.0
    launched = added = None
    for entity in response.xpath(
            '//div[@id="overview-card"]//span[@class="entity"]'):
        label = joined_text(entity, 'span/span[@class="text"]//text()')
        value = joined_text(entity, 'span/span[@class="value"]//text()')

        if "Area" in label:
            try:
                area = float(value.split()[0])
            except (ValueError, IndexError):
                pass
            else:
                # 43560 is the acre multiplier applied when the value
                # text mentions "Acres".
                SuperBuiltupArea = area * 43560 if "Acres" in value else area

        if "Launch" in label:
            tokens = value.strip().replace('\n', '').replace(',', '').split()
            try:
                tokens[0] = find_month(tokens[0])
                launched = ' '.join(tokens)
            except Exception:
                # find_month is defined elsewhere and may fail in ways not
                # visible here; keep the previous value instead of storing
                # a half-built token list.
                pass

        if "Price" in label and "negotiable" in value:
            is_price_fix = 0

        if "Added" in label:
            tokens = value.replace('\n', '').replace(',', '').split()
            try:
                # Keep only the digit characters of the first token.
                tokens[0] = ''.join(ch for ch in tokens[0] if ch.isdigit())
                tokens[1] = find_month(tokens[1])
                added = ' '.join(tokens)
            except Exception:
                # Same reasoning as for "Launch" above.
                pass

        if "Bedrooms" in label:
            count = leading_int(value)
            if count is not None:
                bedrooms = count

        if "Bathrooms" in label:
            count = leading_int(value)
            if count is not None:
                bathrooms = count

    # An "Added" date overrides a "Launch" date when both were parsed.
    # launch_date is always a string so the encode() below cannot fail.
    if added is not None:
        launch_date = added
    elif launched is not None:
        launch_date = launched
    else:
        launch_date = ''

    # ---- amenities / speciality -----------------------------------------
    amenities = ','.join(response.xpath(
        '//span[@class="amenity-entity"]//span[@class="text"]//text()'
    ).extract())
    speciality = {}
    for spec in response.xpath('//div[@class="amenity-entity"]'):
        # Query each block on its own so headers and texts pair up.
        header = joined_text(spec, 'span[@class="header"]//text()')
        speciality[header] = joined_text(
            spec, 'span[@class="texts"]//text()')

    # ---- agent -------------------------------------------------------------
    agent_name = joined_text(response, '//*[@class="name"]//text()')
    agent_type = joined_text(
        response, '//div[@class="info"]/div[@class="type"]//text()')

    # ---- per-configuration rows: (bhk, size, rate) ----------------------
    more_info = []
    bhk = 0
    for row in response.xpath(
            '//div[@class="nsv-list-item-container"]/div'):
        # './/' keeps the query inside this row; a leading '//' would
        # search the whole document.
        row_text = joined_text(row, './/text()')
        try:
            if "BHK" in row_text:
                # Heading row: remember the count for the rows after it.
                bhk = int(joined_text(row, 'span/span//text()').split()[0])
            else:
                size = float(joined_text(
                    row, 'div/div[@class="list-heading"]//text()'
                ).split()[0].replace(',', ''))
                full_rate = joined_text(
                    row, 'div/div[@class="list-price"]//span/text()')
                rate = float(full_rate.split()[0].replace(',', ''))
                if 'Lacs' in full_rate:
                    rate *= 100000
                if "Cr" in full_rate:
                    rate *= 10000000
                more_info.append((bhk, size, rate))
        except (ValueError, IndexError):
            # Row does not have the expected shape; skip it.
            continue

    if "resale" in response.url:
        is_resale = 1

    item['price'] = price
    item['price_per_sqft'] = price_per_sqft
    item['is_price_fix'] = is_price_fix
    item['address'] = address.encode('utf8')
    item['city'] = city.encode('utf8')
    item['location'] = location.encode('utf8')
    item['min_area'] = min_area
    item['max_area'] = max_area
    item['bathrooms'] = bathrooms
    item['bedrooms'] = bedrooms
    item['SuperBuiltupArea'] = SuperBuiltupArea
    item['age_of_property'] = age_of_property.encode('utf8')
    item['launch_date'] = launch_date.encode('utf8')
    item['possession_status'] = status.encode('utf8')
    item['agent_name'] = agent_name.encode('utf8')
    item['agent_type'] = agent_type.encode('utf8')
    item['amenities'] = amenities.encode('utf8')
    item['speciality'] = speciality
    item['more_info'] = more_info
    item['is_resale'] = is_resale
    item['url'] = response.url
    yield item