def getPageAndStoreResults(self, letter, url, otherPages=True):
    """Fetch ``url`` and append the subnames it lists to ``self._subnameList``.

    The page is downloaded with ``Page.get`` and parsed with BeautifulSoup.
    The text of every ``<a>`` found in the elements selected by
    ``self.getsubnameDiv`` is appended to ``self._subnameList`` unless it is
    already there. Collection stops as soon as ``self.entryLimit`` entries
    are stored; a limit of zero or less means "no limit".

    When ``otherPages`` is true, every link found in the elements selected by
    ``self.getsubnamePaginationDiv`` (searched within ``div.span11``) is
    fetched too, through a recursive call made with ``otherPages=False`` so
    that pagination is followed one level deep only.

    Args:
        letter: Value shown in the progress message and used to build the
            ``<letter>-1.html`` marker; links containing that marker are not
            followed (presumably the first page, i.e. the one being
            processed -- confirm against the site layout).
        url: Address of the page to fetch.
        otherPages: Whether the pagination links of this page are followed.

    Side effects:
        Rebinds ``self._subnameList`` and prints one progress line per page.
    """

    def limitReached():
        # entryLimit <= 0 disables the limit.
        return self.entryLimit > 0 and len(self._subnameList) >= self.entryLimit

    pageSoup = BeautifulSoup(Page.get(url))

    # First, get all subnames in this page.
    # The matched elements are rendered back to text and parsed again before
    # the anchors are extracted. The round trip is kept as is because the
    # filter callables are defined outside this block.
    # NOTE(review): if the filters do not rely on it, search pageSoup directly.
    subnameDivs = pageSoup.body.find_all(self.getsubnameDiv)
    subnameLinks = BeautifulSoup("%s" % subnameDivs).find_all("a")

    for link in subnameLinks:
        if limitReached():
            break
        subname = link.string
        if subname is None:
            # .string is None when the anchor has no single text child;
            # storing None as a subname would be meaningless.
            continue
        if subname not in self._subnameList:
            self._subnameList += (subname,)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Collected %d entries for letter %s - Total collected: %d"
          % (len(subnameLinks), letter, len(self._subnameList)))

    if not otherPages:
        return

    # Second, seek other pages.
    firstPageMarker = "%s-1.html" % letter
    span11Divs = pageSoup.find_all("div", {"class": "span11"})
    paginationDivs = BeautifulSoup("%s" % span11Divs).find_all(
        self.getsubnamePaginationDiv)
    pageLinks = BeautifulSoup("%s" % paginationDivs).find_all("a")

    for link in pageLinks:
        if limitReached():
            break
        href = link.get("href")
        if href is None:
            # An anchor without a target cannot be followed.
            continue
        # Plain substring test: the marker is literal text, not a pattern.
        if firstPageMarker not in href:
            self.getPageAndStoreResults(
                letter, "%s/%s" % (self.rootURL, href), False)
def getPageAndStoreResults(self, pageId, getRange=True):
    """Fetch result page ``pageId`` and append its names to ``self._nameList``.

    The page at ``"<self.baseURL>-<pageId>"`` is downloaded with ``Page.get``
    and parsed with BeautifulSoup. For every ``<label class="nameValue">``
    the text of its last ``<a>`` is taken, spaces and newlines are removed,
    and the result is appended to ``self._nameList`` unless already present.
    Collection stops as soon as ``self.entryLimit`` entries are stored; a
    limit of zero or less means "no limit".

    While ``self.lastPageId`` is ``-1`` the number of pages is read from the
    last link inside ``div.pageNo``. A page without such links is treated as
    the only page.

    When ``getRange`` is true, pages ``2 .. self.lastPageId`` are fetched
    through recursive calls made with ``getRange=False``.

    Args:
        pageId: Integer page number appended to ``self.baseURL``.
        getRange: Whether the remaining pages are fetched after this one.

    Raises:
        ValueError: If the text of the last pagination link is not a number.
        TypeError: If the last pagination link has no single text child.

    Side effects:
        May set ``self.lastPageId``, rebinds ``self._nameList`` and prints
        one progress line per page.
    """

    def limitReached():
        # entryLimit <= 0 disables the limit.
        return self.entryLimit > 0 and len(self._nameList) >= self.entryLimit

    pageSoup = BeautifulSoup(Page.get("%s-%d" % (self.baseURL, pageId)))

    # On genealogie.com we need to find how many pages of names we have.
    if self.lastPageId == -1:
        pageLinks = [
            link
            for pager in pageSoup.body.find_all("div", {"class": "pageNo"})
            for link in pager.find_all("a")
        ]
        if pageLinks:
            self.lastPageId = int(re.sub("[ \n]", "", pageLinks[-1].string))
        else:
            # No pagination links: nothing beyond the current page to visit.
            self.lastPageId = 1

    for label in pageSoup.find_all("label", {"class": "nameValue"}):
        if limitReached():
            break
        anchors = label.find_all("a")
        if not anchors:
            continue
        # The last <a> of the label is the one whose text is used as the name.
        rawName = anchors[-1].string
        if rawName is None:
            # No single text child to read a name from.
            continue
        # NOTE(review): every space is removed, including those inside
        # multi-word names -- kept as the original did; confirm it is wanted.
        name = "%s" % re.sub("[ \n]", "", rawName)
        if name not in self._nameList:
            self._nameList += (name,)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Current ID %d - Total Collected Names %d"
          % (pageId, len(self._nameList)))

    # Only the initial call walks the remaining pages.
    if getRange:
        for nextId in range(2, self.lastPageId + 1):
            if limitReached():
                break
            self.getPageAndStoreResults(nextId, False)