def scrapeListing(url, soup=None):
    """Scrape a Craigslist listing page into a new Listing object.

    Args:
        url: Full listing URL. The path segments encode the listing type
            (e.g. 'roo' for rooms/shared, 'sub' for sublets) at index 3 and
            the numeric listing ID at index 4 ("<id>.html").
        soup: Optional pre-parsed BeautifulSoup document; when omitted the
            page is fetched from ``url`` with urllib.

    Returns:
        A populated Listing instance, or None when the page lacks the
        expected title/body markup.
    """
    logging.debug('Scraping %s', url)
    # URL layout assumed: https://<host>/<extension>/<listingID>.html
    extension = url.split('/')[3]
    listingID = int(url.split('/')[4].split('.')[0])
    logging.info('Scraping listing %s', listingID)
    newListing = Listing(listingID=listingID)

    # Retrieve page and extract contents.
    try:
        if not soup:
            page = urllib.request.urlopen(url)
            # NOTE(review): no explicit parser argument — BeautifulSoup picks
            # the best installed parser, which can vary between environments.
            soup = BeautifulSoup(page)
        postingtitle = soup.find('h2', 'postingtitle').get_text()
        postingbody = soup.find('section', id='postingbody').get_text()
    except AttributeError:
        # Page is missing the expected markup — give up on this listing.
        # (Original code assigned to an undefined name `self` here; this is a
        # plain function, so the intent was simply an early None return.)
        logging.error('Failed to parse listing %s', url)
        return None
    newListing.listingBody = postingbody
    newListing.title = postingtitle

    # Price: titles normally start with something like "$1234/m ...".
    try:
        price = postingtitle.split()[0]
        price = price.split('$')[1]
        price = price.split('/m')[0]
        newListing.price = int(price)
    except (IndexError, ValueError):
        # Title did not start with a $-prefixed price; leave price unset.
        # (Original logged newListing.price, which was never assigned on this
        # path, and had a stray comma in the format string.)
        logging.warning('Non-$ price in title: %s', postingtitle)

    # Type of listing, taken from the URL path segment.
    newListing.listingType = extension
    if newListing.listingType == 'roo':
        newListing.shared = True
    elif newListing.listingType == 'sub':
        newListing.sublet = True

    # Address (optional block on the page; commas stripped for downstream use).
    try:
        address = soup.find('div', 'mapaddress').get_text()
        newListing.address = address.replace(',', ' ')
    except AttributeError:
        pass

    # Posting / update timestamps.
    postinginfos = soup.find('div', 'postinginfos')
    logging.debug('postinginfos: %s', postinginfos)
    for p in postinginfos.find_all('p', 'postinginfo'):
        postinginfo = p.get_text()
        if 'posted' in postinginfo:
            newListing.dateListingPosted = p.find('time')['datetime']
        elif 'updated' in postinginfo:
            newListing.dateListingUpdated = p.find('time')['datetime']
    if not newListing.dateListingUpdated:
        # Never-updated listings report their posting date as the update date.
        newListing.dateListingUpdated = newListing.dateListingPosted

    # Block of discrete attributes (area, laundry, parking, pets, bedrooms...).
    attrgroup = soup.find('p', 'attrgroup')
    for span in attrgroup.find_all('span'):
        attribute = span.get_text()
        if 'ft2' in attribute:
            newListing.area = int(attribute.split('ft2')[0])
        elif 'laundry' in attribute or 'w/d' in attribute:
            newListing.laundry = attribute
        elif ('parking' in attribute or 'garage' in attribute
              or 'carport' in attribute):
            newListing.parking = attribute
        elif 'purrr' in attribute:
            # Craigslist encodes "cats are OK - purrr".
            newListing.catsAllowed = True
        elif 'wooof' in attribute:
            # Craigslist encodes "dogs are OK - wooof".
            newListing.dogsAllowed = True
        elif 'no smoking' in attribute:
            newListing.noSmoking = True
        elif 'BR' in attribute:
            newListing.bedrooms = int(attribute.split('BR')[0])
        else:
            logging.debug('Mystery attribute: %s', attribute)
    logging.debug('attrgroup: %s', attrgroup)
    newListing.attributeGroup = str(attrgroup)

    # URLs.
    newListing.listingUrl = url
    try:
        mapaddress = soup.find('p', 'mapaddress')
        newListing.mapUrl = mapaddress.find('a')['href']
    except AttributeError:
        pass

    # Geocode based on the Maps URL (best effort — any failure is ignored).
    try:
        mapQueryString = newListing.mapUrl.split('?q=')[1]
        geocode = Geocoder.geocode(mapQueryString)
        newListing.city = geocode.locality
        county = geocode.administrative_area_level_2
        newListing.county = county.split(' ')[0]
        newListing.neighborhood = geocode.neighborhood
        newListing.zip = geocode.postal_code
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate while keeping the best-effort behavior.
        logging.debug('Geocoding error')

    # Region.
    newListing.region = regionByZip(newListing)
    newListing.dateListingScraped = timezone.now()

    # Contact phone number: try the posting body first, then the reply page.
    try:
        newListing.phone = findPhone(postingbody)
    except AttributeError:
        logging.debug('No phone in posting, trying reply page')
    if not newListing.phone:
        try:
            replylink = soup.find('span', 'replylink').find('a')['href']
            if replylink:
                baseurl = url.split(extension)[0]
                replypage = str(urllib.request.urlopen(baseurl + replylink).read())
                newListing.phone = findPhone(replypage, True)
            else:
                logging.debug('No reply link on page')
        except Exception:
            # Was a bare `except:`; narrowed for the same reason as above.
            logging.debug('No phone from reply page')
    if not newListing.phone:
        newListing.phone = ''
    logging.debug('newListing (at the end) %s', newListing)
    return newListing