def getPageAndStoreResults(self, letter, url, otherPages=True):
		"""Collect the anchor texts ("subnames") of one page into self._subnameList.

		The markup returned by Page.get(url) is parsed, the elements selected by
		self.getsubnameDiv are located inside <body>, and the text of every <a>
		they contain is appended to self._subnameList unless it is already there.
		Collection stops as soon as self.entryLimit (when > 0) is reached.

		When otherPages is true, the links found under <div class="span11"> /
		self.getsubnamePaginationDiv are each processed once through a recursive
		call made with otherPages=False, so pagination is never followed twice.

		Args:
			letter: Used in the progress message and to recognise the
				"<letter>-1.html" link, which is skipped (presumably the first
				page, i.e. the one already processed - verify against caller).
			url: Address handed to Page.get.
			otherPages: Whether the pagination links of this page are followed.
		"""
		def limitReached():
			# A non-positive entryLimit means "no limit"
			return self.entryLimit > 0 and len(self._subnameList) >= self.entryLimit

		pageSoup = BeautifulSoup(Page.get(url))

		# First, get all subnames in this page.
		# NOTE(review): the matched elements are serialised and parsed again
		# before the <a> lookup. This is kept as is because getsubnameDiv is not
		# visible here, so a direct traversal cannot be proven equivalent.
		subnameLinks = BeautifulSoup("%s" % pageSoup.body.find_all(self.getsubnameDiv)).find_all("a")

		# And bufferize them all
		for link in subnameLinks:
			# If we are on the entry limit, stop process
			if limitReached():
				break

			# .string is None when the anchor holds more than a single text node
			if link.string is None:
				continue

			# Store a plain string, not a NavigableString, so the list does not
			# keep the whole parse tree alive
			subname = "%s" % link.string
			if subname not in self._subnameList:
				self._subnameList += (subname, )

		print("Collected %d entries for letter %s - Total collected: %d" % (len(subnameLinks), letter, len(self._subnameList)))

		if not otherPages:
			return

		# Second, seek other pages (same serialise-and-parse-again approach as above)
		paginationLinks = BeautifulSoup("%s" %
			BeautifulSoup("%s" %
				pageSoup.find_all("div", {"class":"span11"})
			).find_all(self.getsubnamePaginationDiv)
		).find_all("a")

		# Literal text to look for; no regular expression is needed for this
		skippedPage = "%s-1.html" % letter

		for link in paginationLinks:
			# If we are on the entry limit, stop process
			if limitReached():
				break

			# Anchors without a target cannot be followed
			href = link.get("href")
			if href is None:
				continue

			if skippedPage not in href:
				self.getPageAndStoreResults(letter, "%s/%s" % (self.rootURL, href), False)
Beispiel #2
0
	def getPageAndStoreResults(self, pageId, getRange=True):
		"""Collect the names listed on page pageId into self._nameList.

		The page is obtained with Page.get("<self.baseURL>-<pageId>"). For every
		<label class="nameValue">, the text of its last <a> is stripped of spaces
		and newlines and appended to self._nameList unless already present.
		Collection stops as soon as self.entryLimit (when > 0) is reached.

		While self.lastPageId is still -1, it is set from the text of the last
		link found inside <div class="pageNo">. If there is no such link, it
		stays at -1 and no further page is requested.

		Args:
			pageId: Integer page number appended to self.baseURL.
			getRange: When true, pages 2..self.lastPageId are processed as well,
				each through a call made with getRange=False.

		Raises:
			ValueError: If the text of the last pagination link is not an integer.
		"""
		def limitReached():
			# A non-positive entryLimit means "no limit"
			return self.entryLimit > 0 and len(self._nameList) >= self.entryLimit

		pageSoup = BeautifulSoup(Page.get("%s-%d" % (self.baseURL, pageId)))

		# on genealogie.com we need to find how many pages of names we have
		if self.lastPageId == -1:
			pagerLinks = []
			for pager in pageSoup.body.find_all("div", {"class": "pageNo"}):
				pagerLinks.extend(pager.find_all("a"))

			# No pagination link (or one without plain text): leave lastPageId
			# untouched instead of failing on an empty result
			lastLabel = pagerLinks[-1].string if pagerLinks else None
			if lastLabel is not None:
				self.lastPageId = int(lastLabel.replace(" ", "").replace("\n", ""))

		for label in pageSoup.find_all("label", {"class": "nameValue"}):
			# If we are on the entry limit, stop process
			if limitReached():
				break

			# The name is carried by the last <a> element of the label
			anchors = label.find_all("a")
			if not anchors:
				continue

			# .string is None when the anchor holds more than a single text node
			rawName = anchors[-1].string
			if rawName is None:
				continue

			tmpName = rawName.replace(" ", "").replace("\n", "")
			if tmpName not in self._nameList:
				self._nameList += (tmpName,)

		print("Current ID %d - Total Collected Names %d" % (pageId, len(self._nameList)))

		# If page is page 1, then we loop the results
		if getRange:
			for pId in range(2, self.lastPageId + 1):
				# If we are on the entry limit, stop process
				if limitReached():
					break

				self.getPageAndStoreResults(pId, False)