def run(self):
    linklist = []
    middlelist = []
    from BeautifulSoup import BeautifulSoup
    from DownLoadWeb import DownloadWeb
    # also assumes: import wx; from wx.lib.pubsub import Publisher
    startUrl = self.url
    page = DownloadWeb(startUrl)
    assert isinstance(page, str)
    html = BeautifulSoup(page)
    for link in html.findAll('a'):
        if not self.is_alive:
            break
        link = unicode(link.get('href')).encode('utf8')
        if link.startswith('http') and 'sina' not in link:
            linklist.append(link)
            wx.CallAfter(Publisher().sendMessage, 'update', str(link))
        elif 'sina' not in link:
            middlelist.append(link)
    # strip the file name to get the base directory of the start URL
    url = '/'.join(startUrl.split('/')[:-1])
    for elink in middlelist:
        if not self.is_alive:
            break
        aurl = url + '/' + elink
        print aurl
        page = DownloadWeb(aurl)
        assert isinstance(page, str)
        html = BeautifulSoup(page)
        for link in html.findAll('a'):
            if not self.is_alive:
                break
            # convert to unicode before encoding, as in the first loop
            link = unicode(link.get('href')).encode('utf8')
            linklist.append(link)
            wx.CallAfter(Publisher().sendMessage, 'update', str(link))
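# The 'update' messages above are meant for a wxPython GUI using the old
# wx.lib.pubsub Publisher API. A minimal sketch of the receiving side;
# the frame class and handler names are illustrative assumptions, not
# part of the original code:
import wx
from wx.lib.pubsub import Publisher

class CrawlerFrame(wx.Frame):
    def __init__(self):
        wx.Frame.__init__(self, None, title='Crawler')
        self.listbox = wx.ListBox(self)
        # old-style pubsub: the listener receives a Message object
        Publisher().subscribe(self.onUpdate, 'update')

    def onUpdate(self, msg):
        # runs on the GUI thread because the worker used wx.CallAfter
        self.listbox.Append(msg.data)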
def getUrl(startUrl):
    print startUrl
    linklist = []
    middlelist = []
    page = DownloadWeb(startUrl)
    assert isinstance(page, str)  # DownloadWeb returns None on failure
    html = BeautifulSoup(page)
    for link in html.findAll('a'):
        link = unicode(link.get('href')).encode('utf8')
        if link.startswith('http'):
            linklist.append(link)
        else:
            middlelist.append(link)
    # http://www.sina.com.cn/ddt/wangzhi/index.html
    # to
    # http://www.sina.com.cn/ddt/wangzhi
    url = '/'.join(startUrl.split('/')[:-1])
    for elink in middlelist:
        aurl = url + '/' + elink
        print aurl
        page = DownloadWeb(aurl)
        assert isinstance(page, str)
        html = BeautifulSoup(page)
        for link in html.findAll('a'):
            # convert to unicode before encoding, as in the loop above
            link = unicode(link.get('href')).encode('utf8')
            linklist.append(link)
    print len(linklist)
    return linklist
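# Every snippet in this section calls a DownloadWeb helper that is defined
# elsewhere (in the DownLoadWeb module imported above). A minimal sketch of
# what it plausibly looks like, assuming it returns the page body as a str
# and None on any failure; the proxy handling is an assumption based on the
# enable_proxy flag used in the constructor further below:
import urllib2

def DownloadWeb(url, enable_proxy=False, timeout=10):
    try:
        if enable_proxy:
            # assumed: real proxy settings would be configured here
            opener = urllib2.build_opener(urllib2.ProxyHandler())
        else:
            opener = urllib2.build_opener()
        return opener.open(url, timeout=timeout).read()
    except Exception:
        return None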
def run(self):
    print 'Downloading the start page'
    page = DownloadWeb(self.starturl)
    if page is None:
        print 'Failed to download the start page'
        return
    links = GetLinks(page)
    for alink in links:
        self.urlqueue.put(alink)
    urlList = []
    urlList += links
    i = 0
    while len(urlList) > i:
        alink = urlList[i]
        i += 1
        print '\033[1;31;40m'  # ANSI escape: switch console output to red
        print 'Downloading:', alink
        page = DownloadWeb(alink)
        if page is None:
            print 'Failed to fetch page', alink
            continue
        links = GetLinks(page)
        print 'Number of links found:', len(links)
        for link in links:
            if 'http:' in link and link not in urlList:
                self.urlqueue.put(link)
                urlList.append(link)
        print 'Current length of urlList:', len(urlList)
        sleep(1)  # assumes: from time import sleep
def run(self):
    wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Downloading the start page')
    page = DownloadWeb(self.starturl)
    if page is None:
        wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Failed to download the start page')
        return
    links = GetLinks(page)
    for alink in links:
        self.urlqueue.put(alink)
    urlList = []
    urlList += links
    wx.CallAfter(Publisher().sendMessage, 'UpdateUrlNum', len(links))
    i = 0
    while len(urlList) > i and self.is_alive:
        alink = urlList[i]
        i += 1
        wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Downloading: ' + str(alink))
        page = DownloadWeb(alink)
        if page is None:
            wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Failed to fetch page ' + str(alink))
            continue
        links = GetLinks(page)
        wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Number of links found: ' + str(len(links)))
        count = 0
        for link in links:
            if 'http:' in link and link not in urlList and self.is_alive:
                self.urlqueue.put(link)
                urlList.append(link)
                count += 1
        print 'Current length of urlList:', len(urlList)
        wx.CallAfter(Publisher().sendMessage, 'UpdateUrlNum', count)
        sleep(1)
    wx.CallAfter(Publisher().sendMessage, 'UpdateProc', self.name + ' thread has finished')
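# The two run() variants above also rely on a GetLinks helper that is not
# shown in this section. A plausible sketch, assuming it extracts every
# href from the page with BeautifulSoup:
from BeautifulSoup import BeautifulSoup

def GetLinks(page):
    html = BeautifulSoup(page)
    return [unicode(a.get('href')).encode('utf8')
            for a in html.findAll('a') if a.get('href')]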
def urlAge(self):
    'Look up the registration age of the domain'
    seourl = 'http://seo.chinaz.com/?host='
    # percent-encode ':' and '/' so the URL survives as a query value
    target = self.url.replace(':', '%3a').replace('/', '%2f')
    page = DownloadWeb(seourl + target)
    if page is None:
        return u'Registration date unavailable'
    html = BeautifulSoup(page)
    info = html.findAll('font', attrs={'color': 'blue'})
    if len(info) > 3:  # info[3] needs at least four matches
        return info[3].text
    else:
        return u'Registration date unavailable'
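# The two replace() calls above are a hand-rolled percent-encoding;
# urllib.quote with an empty safe set does the same job (it emits
# uppercase hex such as %3A, which servers treat identically):
import urllib
target = urllib.quote('http://example.com/', safe='')
# 'http%3A%2F%2Fexample.com%2F'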
def isCopyright(self):
    'Check whether this site has an ICP filing (beian)'
    seourl = 'http://tool.chinaz.com/beian.aspx?s='
    target = seourl + self.url_parse.netloc
    page = DownloadWeb(target)
    if page is None:
        return u'No ICP filing found for this site'
    html = BeautifulSoup(page)
    info = html.findAll('td', attrs={'class': 'tdright'})
    if len(info) > 2:  # info[2] needs at least three matches
        return info[2].text
    else:
        return u'No ICP filing found for this site'
def __init__(self, url, enable_proxy=False):
    self.url = url
    self.url_parse = urlparse(url)  # assumes: from urlparse import urlparse
    self.page = None
    self.enable_proxy = enable_proxy
    self.html = None
    self.page = DownloadWeb(self.url, self.enable_proxy)
    if self.page is None:
        self.flag = False
    else:
        self.html = BeautifulSoup(self.page)
        # flag records whether the page was fetched and parsed
        self.flag = self.html is not None
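# urlAge, isCopyright, and the constructor above appear to be methods of a
# single site-analysis class whose name is not shown in this section. A
# hypothetical usage sketch (SiteInfo is an assumed name):
site = SiteInfo('http://example.com/')
if site.flag:
    print site.urlAge()       # registration age via seo.chinaz.com
    print site.isCopyright()  # ICP filing info via tool.chinaz.com
else:
    print 'page could not be downloaded'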