Example 1
def getNewsList(url, bank, try_times=1):
    # Fetch one news-list page for the given category ("bank"), visit every
    # article link on it, then follow pagination; retry the page fetch up to
    # three times on network errors.
    if try_times <= 3:
        try:
            html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
        except Exception as e:
            print('Bad URL: ' + url)
            print(e)
            print('Retrying, attempt %d' % (try_times + 1))
            getNewsList(url, bank, try_times=try_times + 1)
        else:
            soup = BeautifulSoup(html, "html.parser")
            try:
                tags = soup.find('div', id='wp_news_w10').find_all('a')
            except Exception:
                pass  # the expected news container is missing on this page
            else:
                for tag in tags:
                    if bank not in times:
                        times[bank] = 0
                    if not limit or times[bank] < limit:
                        times[bank] = times[bank] + 1
                        getContent(common.checkUrl(tag['href']), bank)
                # Keep paginating until the per-category limit is reached.
                if not limit or times[bank] < limit:
                    next_page = soup.find('a', class_='next')
                    if (next_page is not None
                            and next_page['href'] != 'javascript:void(0);'):
                        getNewsList(common.checkUrl(next_page['href']), bank)
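
All six snippets lean on the same module-level context that is not shown here: the urllib and BeautifulSoup imports, a shared times counter dict, a limit cap on articles per category, and the helpers common.checkUrl and getContent. A minimal sketch of that scaffolding, with the two helpers stubbed out as assumptions (the real project's versions may differ), could look like:

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup

times = {}   # articles fetched so far, keyed by category ("bank" or page title)
limit = 10   # per-category cap; a falsy value disables it

class common:
    # Stand-in for the project's common module: checkUrl is assumed to
    # resolve relative hrefs against the site root.
    BASE = 'http://www2.scut.edu.cn'

    @staticmethod
    def checkUrl(href):
        return urllib.parse.urljoin(common.BASE, href)

def getContent(url, bank):
    # Stub: the real function presumably downloads and stores the article.
    print('would fetch %s article: %s' % (bank, url))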
Example 2
def getNewsList(url, try_times=1):
    # Same crawl loop as above, but keyed by the page <title> instead of an
    # explicit category name; retries the fetch up to three times.
    if try_times <= 3:
        try:
            html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
        except Exception as e:
            print('Bad URL: ' + url)
            print(e)
            print('Retrying, attempt %d' % (try_times + 1))
            getNewsList(url, try_times=try_times + 1)
        else:
            soup = BeautifulSoup(html, "html.parser")
            title = soup.find('title').get_text()
            try:
                tags = soup.find('div', id='wp_news_w3').find_all('a')
            except Exception:
                pass  # the expected news container is missing on this page
            else:
                for tag in tags:
                    if title not in times:
                        times[title] = 0
                    if not limit or times[title] < limit:
                        times[title] = times[title] + 1
                        getContent(common.checkUrl(tag['href']), title)
                if not limit or times[title] < limit:
                    next_page = soup.find('a', class_='next')
                    if (next_page is not None
                            and next_page['href'] != 'javascript:void(0);'):
                        getNewsList('http://www2.scut.edu.cn' + next_page['href'])
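
Every function in these examples repeats the same fetch-and-retry preamble: up to three attempts, recursing with try_times + 1. One way to factor that out, sketched under the assumption that three attempts and UTF-8 decoding stay fixed, is an iterative helper:

def fetch_html(url, max_tries=3):
    # Iterative equivalent of the recursive retry above: returns the decoded
    # page, or None once max_tries attempts have failed.
    for attempt in range(1, max_tries + 1):
        try:
            return urllib.request.urlopen(url).read().decode(encoding='utf-8')
        except Exception as e:
            print('Bad URL: ' + url)
            print(e)
            if attempt < max_tries:
                print('Retrying, attempt %d' % (attempt + 1))
    return None

Each getNewsList/getMoreUrl/start body could then begin with html = fetch_html(url) and return early on None instead of recursing into itself.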
Example 3
def start(url, try_times=1):
    # Entry point for one school site: locate the four section links on the
    # homepage, crawl each section, then print the per-category counts.
    if try_times <= 3:
        try:
            html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
        except Exception as e:
            print('Bad URL: ' + url)
            print(e)
            print('Retrying, attempt %d' % (try_times + 1))
            start(url, try_times=try_times + 1)
        else:
            soup = BeautifulSoup(html, "html.parser")
            # The anchor texts/titles below must match the Chinese section
            # names on the page: school information, news center, research
            # results, and student affairs.
            tags = {}
            tags['学院信息'] = soup.find('a', string='学院信息')
            tags['新闻中心'] = soup.find('a', string='新闻中心')
            tags['研究成果'] = soup.find('a', string='研究成果')
            tags['学生工作'] = soup.find('a', title='学生工作')
            for key in tags:
                try:
                    href = tags[key]['href']
                except Exception:
                    print('Failed to get the %s link of 医学院 (School of Medicine)' % key)
                else:
                    getMoreUrl(common.checkUrl(href), key)
            for key in times:
                print(key + ':' + str(times[key]))
Example 4
def getMoreUrl(url, try_times=1):
    # Follow every "more" link on a section page into its full news list,
    # then print the running per-category counts.
    if try_times <= 3:
        try:
            html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
        except Exception as e:
            print('Bad URL: ' + url)
            print(e)
            print('Retrying, attempt %d' % (try_times + 1))
            getMoreUrl(url, try_times=try_times + 1)
        else:
            soup = BeautifulSoup(html, "html.parser")
            tags = soup.find_all('a', class_='more')
            for tag in tags:
                getNewsList(common.checkUrl(tag['href']))
            for key in times:
                print(key + ':' + str(times[key]))
Example 5
def start(url, try_times=1):
    # Entry point for the School of International Education: find its news
    # link by anchor id, then hand it off to getMoreUrl.
    if try_times <= 3:
        try:
            html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
        except Exception as e:
            print('Bad URL: ' + url)
            print(e)
            print('Retrying, attempt %d' % (try_times + 1))
            start(url, try_times=try_times + 1)
        else:
            soup = BeautifulSoup(html, "html.parser")
            tag = soup.find('a', id='p16c4996')
            try:
                href = tag['href']
            except Exception:
                print('Failed to get the news link of 国际教育学院 (School of International Education)')
            else:
                getMoreUrl(common.checkUrl(href))
Example 6
def getMoreUrl(url, bank, try_times=1):
    # Walk a section's sidebar menu: crawl each sub-column's news list, or
    # fall back to treating the page itself as a news list if there is none.
    if try_times <= 3:
        try:
            html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
        except Exception as e:
            print('Bad URL: ' + url)
            print(e)
            print('Retrying, attempt %d' % (try_times + 1))
            getMoreUrl(url, bank, try_times=try_times + 1)
        else:
            soup = BeautifulSoup(html, "html.parser")
            menuTag = soup.find('ul', class_='wp_listcolumn')
            # Guard against pages without a sidebar menu (find returns None).
            tags = menuTag.find_all('a') if menuTag else []
            if not tags:
                getNewsList(url, bank)
            else:
                for tag in tags:
                    try:
                        getNewsList(common.checkUrl(tag['href']),
                                    bank + '/' + tag['title'])
                    except Exception:
                        pass  # skip anchors missing an href or title
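
Taken together, the call chain is start → getMoreUrl → getNewsList → getContent, with the shared times dict recording how many articles were fetched per category and printed along the way. A hypothetical run (the entry URL below is an assumption; the real homepage path is not shown in these snippets) would look like:

if __name__ == '__main__':
    # Hypothetical entry URL for one school site under the SCUT domain.
    start('http://www2.scut.edu.cn/example_school/')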