コード例 #1
0
ファイル: scrape.py プロジェクト: maxharlow/houseshares
	def scrape_listings(self, uri, to_listed_date):
		"""Scrape the adverts on a Gumtree listing page and the pages after it.

		Pages are followed through their 'pag-next' link until a page has no
		such link, a page has no 'ad-listings' list, or an advert posted
		before ``to_listed_date`` is found.

		Args:
			uri: Address of the first listing page to fetch.
			to_listed_date: Cut-off compared against each advert's posted
				date; scraping stops at the first advert posted before it.
				(Presumably listings are ordered newest first -- confirm.)

		Returns:
			A list of the Advert objects scraped, in the order encountered.

		Errors raised by ``urllib2.urlopen`` or by the extraction helpers are
		not caught here and propagate to the caller.
		"""
		adverts = []
		# Walk the pages in a loop rather than recursing once per page, so a
		# long run of pages cannot exhaust the interpreter's recursion limit
		# and the result list is not re-copied at every level.
		while True:
			print('Scraping Gumtree listing: ' + uri)
			page = urllib2.urlopen(uri)
			try:
				listing_html = BeautifulSoup(page)
			finally:
				page.close()  # release the connection once parsing is done
			listing_groups = listing_html.find_all('ul', class_='ad-listings')
			if not listing_groups:
				break  # there are no listings on this page -- invalid uri?
			# With more than one list present, the first is skipped (featured listings).
			listing_group = listing_groups[0] if len(listing_groups) == 1 else listing_groups[1]
			for listing_advert_html in listing_group.find_all('li', class_='hlisting'):
				advert_uri = listing_advert_html.find('a', class_='description')['href']
				advert = Advert(advert_uri)
				advert.date_posted = self._extract_date_posted(listing_advert_html)
				if advert.date_posted < to_listed_date:
					# Past the cut-off: stop scraping entirely, including later pages.
					return adverts
				# NOTE: a one-second delay between advert requests used to sit
				# here but was disabled; requests are currently unthrottled.
				self.scrape_advert(advert_uri, advert)
				adverts.append(advert)
			next_page = listing_html.find('li', class_='pag-next')
			if next_page is None:
				break  # no further pages
			uri = next_page.contents[0]['href']
		return adverts
コード例 #2
0
ファイル: scrape.py プロジェクト: maxharlow/houseshares
	def scrape_advert(self, uri, advert=None):
		"""Fetch a single Gumtree advert page and fill in an Advert from it.

		Args:
			uri: Address of the advert page to fetch.
			advert: Existing Advert to populate. When None, a new
				``Advert(uri)`` is created.

		Returns:
			The populated Advert (the same object that was passed in, if any).

		Errors raised by ``urllib2.urlopen`` or by the extraction helpers are
		not caught here and propagate to the caller.
		"""
		print('Scraping Gumtree advert: ' + uri)
		page = urllib2.urlopen(uri)
		try:
			advert_html = BeautifulSoup(page)
		finally:
			page.close()  # release the connection once parsing is done
		if advert is None:
			advert = Advert(uri)
		# Each field is pulled from the parsed page by its own helper.
		advert.title = self._extract_title(advert_html)
		advert.price = self._extract_price(advert_html)
		advert.location = self._extract_location(advert_html)
		advert.location_coordinates = self._extract_location_coordinates(advert_html)
		advert.room_type = self._extract_room_type(advert_html)
		advert.date_available = self._extract_date_available(advert_html)
		advert.property_type = self._extract_property_type(advert_html)
		advert.seller_type = self._extract_seller_type(advert_html)
		advert.phone_number = self._extract_phone_number(advert_html)
		advert.description = self._extract_description(advert_html)
		advert.photos = self._extract_photos(advert_html)
		return advert