# Module-level imports these methods rely on (shown here for completeness).
import time
from bs4 import BeautifulSoup

def getPageLinksFromContent(self, info):
    soup = BeautifulSoup(info["content"], "html.parser")
    links = soup.findAll('a')
    # Assumes getPageInformation() stored the page's own URL under
    # info["url"]; the original indexed info[0], which fails on a dict.
    base_url = info["url"]
    for item in links:
        # Read the attribute directly rather than regexing str(item);
        # the original's `str(item) != "" and not None` test was always true.
        href = item.get('href')
        if not href:
            continue
        if href.startswith(('http://', 'https://')):
            ans = href
        else:
            ans = base_url + href  # resolve a relative link against the page URL
        if Common.isValidUrl(self._url, ans):  # verify the link is legal
            info["url"] = ans
            self._db.saveToPlanLinks(info)  # insert into the plan-links queue
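# Common.isValidUrl is called throughout but never shown in the source, so
# the rule below is a hedged assumption: a minimal sketch that accepts only
# http(s) links on the same host as the crawl's root URL. The real helper
# may apply different rules.
from urlparse import urlparse  # Python 2; use urllib.parse on Python 3

class Common(object):
    @staticmethod
    def isValidUrl(root_url, url):
        # Hypothetical same-host check against the root URL.
        root = urlparse(root_url)
        target = urlparse(url)
        if target.scheme not in ('http', 'https'):
            return False
        return target.netloc == root.netloc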
def spider(self, i):
    try:
        str_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item = self._db.getPlanLink()              # pop the next queued link
        url = item[u'url'].encode('utf-8')
        level = int(item['level'])
        if not Common.isValidUrl(self._url, url):  # verify the link is legal
            return False
        if url != "":
            self._db.setPlanLinksIndexEd(url)      # mark the link as crawled
            print "spider url:", url
            info = self.getPageInformation(url)
            # Save the page at its own level; the original assigned the stale
            # self._level from the previous iteration before updating it.
            info["level"] = level
            self._db.saveToDb(info)
            self._level = level + 1                # links found here sit one level deeper
            info["level"] = self._level
            self.getPageLinksFromContent(info)
            print url, str_datetime
        return True
    except Exception, ex:
        print "[error = 001, num = " + str(i) + "]", ex
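# A hedged sketch of a driver loop for spider(). The Spider class name and
# its no-argument constructor are assumptions, since only the two methods
# above appear in the source; spider(i) returns True after a successful
# crawl step and False when the queued link is rejected.
if __name__ == '__main__':
    crawler = Spider()                 # hypothetical constructor
    for i in range(100):               # process at most 100 queued links
        if not crawler.spider(i):      # stop on an invalid link or an error
            break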