Example #1
def run(self):
    # Crawl the start URL: absolute links (anything not pointing at sina)
    # are collected and published to the GUI; relative links are resolved
    # against the start URL's directory and crawled one level deeper.
    linklist = []
    middlelist = []
    from BeautifulSoup import BeautifulSoup
    from DownLoadWeb import DownloadWeb
    startUrl = self.url
    page = DownloadWeb(startUrl)
    assert isinstance(page, str)
    html = BeautifulSoup(page)
    for link in html.findAll('a'):
        if not self.is_alive:
            break
        link = unicode(link.get('href')).encode('utf8')
        if link.startswith('http') and 'sina' not in link:
            linklist.append(link)
            wx.CallAfter(Publisher().sendMessage, 'update', str(link))
        elif 'sina' not in link:
            middlelist.append(link)
    # Drop the last path segment to get the base URL for relative links.
    url = '/'.join(startUrl.split('/')[:-1])
    for elink in middlelist:
        if not self.is_alive:
            break
        aurl = url + '/' + elink
        print aurl
        page = DownloadWeb(aurl)
        assert isinstance(page, str)
        html = BeautifulSoup(page)
        for link in html.findAll('a'):
            if not self.is_alive:
                break
            link = unicode(link.get('href')).encode('utf8')
            linklist.append(link)
            wx.CallAfter(Publisher().sendMessage, 'update', str(link))
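
Every example on this page calls a DownloadWeb(url) helper whose implementation is not shown. A minimal sketch of what it might look like, assuming it fetches the URL with urllib2 and returns the body as a str, or None on any failure (the proxy handling implied by Example #7's enable_proxy flag is assumed and omitted here):

import urllib2

def DownloadWeb(url, enable_proxy=False):
    # Hypothetical helper: fetch a page and return its body as a str.
    # Returns None on any error so callers can test `page is None`.
    # enable_proxy is accepted only because Example #7 passes it; the
    # actual proxy configuration is not shown anywhere on this page.
    try:
        return urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return None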
Example #2
def getUrl(startUrl):
    # Collect absolute links from the start page, then follow each relative
    # link (joined onto the start URL's directory) and collect its links too.
    print startUrl
    linklist = []
    middlelist = []

    page = DownloadWeb(startUrl)
    assert isinstance(page, str)
    html = BeautifulSoup(page)
    for link in html.findAll('a'):
        link = unicode(link.get('href')).encode('utf8')
        if link.startswith('http'):
            linklist.append(link)
            # print link
        else:
            middlelist.append(link)
    #http://www.sina.com.cn/ddt/wangzhi/index.html
    #to
    #http://www.sina.com.cn/ddt/wangzhi
    url = '/'.join(startUrl.split('/')[:-1])
    for elink in middlelist:
        aurl = url + '/' + elink
        print aurl
        page = DownloadWeb(aurl)
        assert isinstance(page, str)
        html = BeautifulSoup(page)
        for link in html.findAll('a'):
            link = unicode(link.get('href')).encode('utf8')
            linklist.append(link)
    print len(linklist)
    return linklist
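
A usage sketch for getUrl, based on the URL that appears in the snippet's own comments:

links = getUrl('http://www.sina.com.cn/ddt/wangzhi/index.html')
# links now holds every absolute URL found on the index page and its relative sub-pages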
Example #3
def run(self):
    # Breadth-first crawl starting from self.starturl: every newly found
    # absolute link is pushed onto self.urlqueue and appended to urlList.
    # Relies on the DownloadWeb and GetLinks helpers and `from time import sleep`.
    print 'Downloading the start page'
    page = DownloadWeb(self.starturl)
    if page is None:
        print 'Failed to download the start page'
        return
    links = GetLinks(page)
    for alink in links:
        self.urlqueue.put(alink)
    urlList = []
    urlList += links
    i = 0
    while len(urlList) > i:
        alink = urlList[i]
        i += 1
        print '\033[1;31;40m'          # ANSI escape: switch terminal text to red
        print 'Downloading:', alink
        page = DownloadWeb(alink)
        if page is None:
            print 'Failed to fetch page:', alink
            continue
        links = GetLinks(page)
        print 'Number of links found:', len(links)
        for link in links:
            if 'http:' in link and link not in urlList:
                self.urlqueue.put(link)
                urlList.append(link)
        print 'Current length of urlList:', len(urlList)
        sleep(1)
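
Examples #3 and #4 also depend on a GetLinks(page) helper that is not shown. A minimal sketch, assuming it simply returns the href of every anchor, in line with the BeautifulSoup parsing used in Examples #1 and #2:

from BeautifulSoup import BeautifulSoup

def GetLinks(page):
    # Hypothetical helper: return the href attribute of every <a> tag.
    html = BeautifulSoup(page)
    return [a.get('href') for a in html.findAll('a') if a.get('href')]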
Example #4
def run(self):
    # Same breadth-first crawl as Example #3, but progress is reported to
    # the GUI through wx.CallAfter and pubsub messages instead of print,
    # and the loops check self.is_alive so the thread can be stopped.
    wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Downloading the start page')
    page = DownloadWeb(self.starturl)
    if page is None:
        wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Failed to download the start page')
        return
    links = GetLinks(page)
    for alink in links:
        self.urlqueue.put(alink)
    urlList = []
    urlList += links
    wx.CallAfter(Publisher().sendMessage, 'UpdateUrlNum', len(links))
    i = 0
    while len(urlList) > i and self.is_alive:
        alink = urlList[i]
        i += 1
        wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Downloading: ' + str(alink))
        page = DownloadWeb(alink)
        if page is None:
            wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Failed to fetch page: ' + str(alink))
            continue
        links = GetLinks(page)
        wx.CallAfter(Publisher().sendMessage, 'UpdateProc', 'Number of links found: ' + str(len(links)))
        count = 0
        for link in links:
            if 'http:' in link and link not in urlList and self.is_alive:
                self.urlqueue.put(link)
                urlList.append(link)
                count += 1
        print 'Current length of urlList:', len(urlList)
        wx.CallAfter(Publisher().sendMessage, 'UpdateUrlNum', count)
        sleep(1)
    wx.CallAfter(Publisher().sendMessage, 'UpdateProc', self.name + ' thread finished')
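
Examples #1 and #4 publish progress through wx.CallAfter and the legacy wx.lib.pubsub Publisher API, so some GUI code must subscribe to the 'update', 'UpdateProc' and 'UpdateUrlNum' topics. A sketch of what that receiving side might look like; the frame, widget and handler names are illustrative assumptions:

import wx
from wx.lib.pubsub import Publisher

class CrawlerFrame(wx.Frame):
    def __init__(self, *args, **kwargs):
        wx.Frame.__init__(self, *args, **kwargs)
        self.log = wx.TextCtrl(self, style=wx.TE_MULTILINE | wx.TE_READONLY)
        # Subscribe to the topics published by the worker threads above.
        Publisher().subscribe(self.onProc, 'UpdateProc')
        Publisher().subscribe(self.onUrlNum, 'UpdateUrlNum')

    def onProc(self, msg):
        # msg.data carries the string passed to sendMessage.
        self.log.AppendText(unicode(msg.data) + '\n')

    def onUrlNum(self, msg):
        self.log.AppendText('New links queued: %s\n' % msg.data)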
Example #5
def urlAge(self):
    'Look up the domain registration age via seo.chinaz.com.'
    seourl = 'http://seo.chinaz.com/?host='
    target = self.url.replace(':', '%3a').replace('/', '%2f')
    page = DownloadWeb(seourl + target)
    if page is None:
        return u'registration time unavailable'
    html = BeautifulSoup(page)
    info = html.findAll('font', attrs={'color': 'blue'})
    if len(info) > 3:
        return info[3].text
    else:
        return u'registration time unavailable'
Example #6
def isCopyright(self):
    'Check whether the site has an ICP filing (备案) via tool.chinaz.com.'
    seourl = 'http://tool.chinaz.com/beian.aspx?s='
    target = seourl + self.url_parse.netloc
    page = DownloadWeb(target)
    if page is None:
        return u'no ICP filing found for this site'
    html = BeautifulSoup(page)
    info = html.findAll('td', attrs={'class': 'tdright'})
    if len(info) > 2:
        return info[2].text
    else:
        return u'no ICP filing found for this site'
Example #7
def __init__(self, url, enable_proxy=False):
    self.url = url
    self.url_parse = urlparse(url)
    self.enable_proxy = enable_proxy
    self.html = None
    # Download and parse the page up front; self.flag records whether
    # both steps succeeded.
    self.page = DownloadWeb(self.url, self.enable_proxy)
    if self.page is None:
        self.flag = False
    else:
        self.html = BeautifulSoup(self.page)

    if self.html is None:
        self.flag = False
    else:
        self.flag = True
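
Examples #5 to #7 appear to be methods of a single site-checking class whose name is not shown; SiteChecker below is only a placeholder for it. A usage sketch under that assumption:

checker = SiteChecker('http://www.example.com/')   # placeholder class name
if checker.flag:
    print checker.urlAge()        # registration age via seo.chinaz.com
    print checker.isCopyright()   # ICP filing lookup via tool.chinaz.com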