def __getProjectListing(self, listing, isRedesign=False):
    """Build a Project, including its child properties, from one project card.

    The card is read for the project address, title, URL and feature list.
    Each child listing link inside the card becomes a ``Property``; the
    child's own page is downloaded with ``urlopen`` to read its type label.

    Args:
        listing: Parsed HTML node for a single project card. It must offer
            ``find``/``find_all`` (it is used here like a BeautifulSoup tag).
        isRedesign: When true, the ``listing-result-redesign__*`` CSS class
            names are used instead of the ``listing-result__*`` ones.

    Returns:
        The populated ``Project``.

    Raises:
        ValueError: If the project address is missing or malformed, or if
            the project URL, the child listings, a child URL, a child
            price, the child features or the child page address is missing.
    """
    # Both page layouts use the same class-name suffixes; only the prefix
    # differs.
    classPrefix = 'listing-result-redesign__' if isRedesign else 'listing-result__'
    addressNodeClass = classPrefix + 'project-address'
    titleNodeClass = classPrefix + 'project-title'
    urlNodeClass = classPrefix + 'project-title-wrapper'
    featureNodeClass = classPrefix + 'project-features'
    childNodeClass = classPrefix + 'listing'
    priceNodeClass = classPrefix + 'price'
    childFeatureClass = 'property-feature__feature-text-container'

    addressNode = listing.find('span', class_=addressNodeClass)
    if addressNode is None:
        print(str(listing))
        raise ValueError('No project address found.')

    # Expected shape: "<line 1>, <locality>, <region> <postcode>".
    addressParts = [part.strip() for part in addressNode.text.strip().split(',')]
    try:
        addressLine1 = addressParts[0]
        addressLocality = addressParts[1]
        # split() without an argument tolerates repeated whitespace between
        # the region and the postcode.
        addressRegion, postalCode = addressParts[2].split()
    except (IndexError, ValueError):
        print(addressNode.text)
        raise ValueError('Malformed project address: %r' % addressNode.text)

    project = Project(addressLine1, addressLocality, addressRegion, postalCode)

    titleNode = listing.find('h2', class_=titleNodeClass)
    if titleNode:
        project.setTitle(titleNode.text.strip())

    urlNode = listing.find('a', class_=urlNodeClass)
    if urlNode is None or not urlNode.has_attr('href'):
        print(str(listing))
        raise ValueError('No project URL found.')
    projectUrl = urlNode['href']
    project.setUrl(projectUrl)
    print(projectUrl)

    featureNode = listing.find('ul', class_=featureNodeClass)
    if featureNode:
        # Only the items of the project's own feature list count; searching
        # the whole card would also pick up unrelated <li> elements.
        for feature in featureNode.find_all('li'):
            project.setFeature(feature.text.strip())

    # get the child properties
    childListings = listing.find_all('a', class_=childNodeClass)
    if not childListings:
        print(str(listing))
        raise ValueError('No child property found.')

    for childListing in childListings:
        print(str(childListing))
        if not childListing.has_attr('href'):
            print(str(childListing))
            raise ValueError('No child URL found.')
        propertyUrl = childListing['href']

        # Reset per child so that a child without a price node can neither
        # hit an unbound name nor inherit the previous child's price.
        price = 0
        priceNode = childListing.find('h3', class_=priceNodeClass)
        if priceNode:
            price = self.__getPrice(priceNode.text.strip())
        if price == 0 and addressLine1:
            price = self.__getPriceFromAddress(addressLine1, addressLocality,
                                               addressRegion, postalCode)
        if price == 0:
            print(str(childListing))
            raise ValueError('No child price found.')

        childProperty = Property(price)
        childProperty.setUrl(propertyUrl)

        # Prefer the features that belong to this child; fall back to the
        # card-wide search when the child link itself holds none.
        childFeatureNodes = childListing.find_all('span', class_=childFeatureClass)
        if not childFeatureNodes:
            childFeatureNodes = listing.find_all('span', class_=childFeatureClass)
        if not childFeatureNodes:
            print(str(listing))
            raise ValueError('No child feature found.')
        for childFeatureNode in childFeatureNodes:
            childProperty.setFeature(childFeatureNode.text.strip())

        # Always release the HTTP response, even if parsing fails.
        childPage = urlopen(propertyUrl)
        try:
            childDom = BeautifulSoup(childPage, 'html.parser')
        finally:
            childPage.close()

        childAddressNode = childDom.find(
            'button', class_='listing-details__project-title-address')
        if childAddressNode is None:
            raise ValueError('No child address found.')
        childAddressText = childAddressNode.text.strip()
        print(childAddressText)
        # The text before the first '/' is something like:
        # Type A, Type B, Type C, Courtyard, etc.
        childProperty.setChildPropertyType(childAddressText.split('/')[0].strip())

        project.addChildProperty(childProperty)

    return project
def __getPropertyListing(self, listing):
    """Build a Property from one property card and its detail page.

    Price, URL and address come from the card. The detail page is downloaded
    with ``urlopen`` to read the property type, land area and description.
    The feature texts are then read from the card.

    Args:
        listing: Parsed HTML node for a single property card. It must offer
            ``find``/``find_all`` (it is used here like a BeautifulSoup tag).

    Returns:
        The populated ``Property``.

    Raises:
        ValueError: If the price tag, the property URL, the second address
            line, the description, the property type or the features are
            missing.
    """
    # Try the classic layout first, then the redesigned one.
    priceNode = listing.find('p', class_='listing-result__price')
    if priceNode is None:
        priceNode = listing.find('p', class_='listing-result-redesign__price')
    if priceNode is None:
        print(str(listing))
        raise ValueError('No price tag found.')
    priceText = priceNode.text.strip()

    addressNode = listing.find('a', class_='listing-result__address')
    if addressNode is None:
        addressNode = listing.find(
            'a', class_='listing-result-redesign__address')
    if addressNode is None or not addressNode.has_attr('href'):
        print(str(listing))
        raise ValueError('No property URL found.')
    propertyUrl = addressNode['href']

    addressLine1 = ''
    addressLine1Node = addressNode.find('span', class_='address-line1')
    if addressLine1Node:
        addressLine1 = addressLine1Node.text.strip()
        # Drop the separator comma that trails the street line.
        if addressLine1.endswith(','):
            addressLine1 = addressLine1[:-1]

    addressLine2Node = addressNode.find('span', class_='address-line2')
    addressLine2Parts = addressLine2Node.select('span') if addressLine2Node else []
    # Locality, region and postcode are read from the first three nested
    # spans; fewer than three is reported like a missing line instead of
    # escaping as an IndexError.
    if len(addressLine2Parts) < 3:
        print(str(listing))
        raise ValueError('No address line 2 found.')
    addressLocality = addressLine2Parts[0].text.strip()
    addressRegion = addressLine2Parts[1].text.strip()
    postalCode = addressLine2Parts[2].text.strip()

    price = self.__getPrice(priceText)
    if price == 0 and addressLine1:
        price = self.__getPriceFromAddress(addressLine1, addressLocality,
                                           addressRegion, postalCode)

    # Named listingProperty so the builtin ``property`` is not shadowed.
    listingProperty = Property(price, addressLine1, addressLocality,
                               addressRegion, postalCode)
    listingProperty.setUrl(propertyUrl)

    # Always release the HTTP response, even if parsing fails.
    page = urlopen(propertyUrl)
    try:
        dom = BeautifulSoup(page, 'html.parser')
    finally:
        page.close()

    keyFeatureNodes = dom.find_all(
        'div', class_='listing-details__key-features--item')
    for keyFeatureNode in keyFeatureNodes:
        keyNode = keyFeatureNode.find(
            'div', class_='listing-details__key-features--key')
        valueNode = keyFeatureNode.find(
            'div', class_='listing-details__key-features--value')
        if keyNode is None or valueNode is None:
            # Skip an incomplete item; a missing property type is still
            # reported by the check further down.
            continue
        key = keyNode.text.strip()
        value = valueNode.text.strip()
        if re.search(r'Property type', key, re.IGNORECASE):
            listingProperty.setType(value)
        elif re.search(r'Land area', key, re.IGNORECASE):
            # Accept thousands separators so "1,012" is read as 1012
            # rather than 1.
            match = re.search(r'\d{1,3}(?:,\d{3})+|\d+', value)
            if match:
                listingProperty.setLandArea(int(match.group(0).replace(',', '')))

    description = self.__getPropertyDescription(dom)
    if not description:
        print(propertyUrl)
        raise ValueError('No description found.')
    listingProperty.setDescription(description)

    if listingProperty.getType() == '':
        print(propertyUrl)
        raise ValueError('No property type found.')

    featureNodes = listing.find_all(
        'span', class_='property-feature__feature-text-container')
    if not featureNodes:
        print(str(listing))
        raise ValueError('No feature found.')
    for featureNode in featureNodes:
        listingProperty.setFeature(featureNode.text.strip())

    return listingProperty