def get_label_a(self):
    """Queue the ``http://`` links found in the page held in ``self.data``.

    The markup is parsed with BeautifulSoup (``lxml`` parser). For every
    ``<a>`` tag whose ``href`` contains ``'http://'``, the link is handed to
    ``add_new_urls`` on a ``Mange`` built from ``self.new_urls`` and
    ``self.old_urls``.

    Anchors that have no ``href`` attribute are skipped one by one rather
    than aborting the scan of the remaining anchors. The method is
    best-effort: it returns ``None`` and does not raise, even when parsing
    fails.
    """
    try:
        soup = BeautifulSoup(self.data, 'lxml')
        mange = Mange(self.new_urls, self.old_urls)
        for anchor in soup.find_all('a'):
            # .get() yields None for a missing attribute; indexing with
            # anchor['href'] would raise KeyError and end the whole loop.
            href = anchor.get('href')
            if not href:
                continue
            # NOTE(review): only 'http://' is matched, so 'https://' links
            # are ignored -- confirm this is intended.
            if 'http://' in href:
                mange.add_new_urls(href)
    except Exception:
        # Best-effort by design: a page that cannot be parsed (or a manager
        # failure) simply contributes no further links.
        pass
def main():
    """Drive the crawl starting from ``args.url``.

    Reads ``args.url``, ``args.page`` and ``args.thread``. Returns
    immediately when no start URL was given. Otherwise the start URL is
    queued in a ``Mange`` built over the shared ``new_urls`` / ``old_urls``
    collections, and the loop runs while the manager reports pending URLs
    and fewer than ``args.page`` URLs are recorded in ``old_urls``.

    Each iteration pops one URL, starts a ``SpiderThread`` for it and marks
    the URL as old. Any exception raised while doing so is printed and the
    loop carries on.
    """
    if not args.url:
        return

    mange = Mange(new_urls, old_urls)
    mange.add_new_urls(args.url)

    # Stop once the queue is empty or the page budget has been reached.
    while mange.has_new_urls() and len(old_urls) < args.page:
        try:
            url = mange.pop()
            worker = SpiderThread(url, args.page, args.thread,
                                  new_urls, old_urls)
            worker.start()
            mange.add_old_urls(url)
        except Exception as e:
            # ``as`` / print(...) are valid on both Python 2.6+ and Python 3.
            print(e)
        # NOTE(review): presumably this pause lets the thread just started
        # queue newly found URLs before has_new_urls() is checked again --
        # confirm the sleep belongs at loop level.
        time.sleep(1)