def getNewsList(url, bank, try_times=1):
    """Crawl one paginated news-list page and dispatch every article link.

    Fetches ``url``, collects the article anchors inside the
    ``div#wp_news_w10`` container, hands each one to ``getContent``, then
    follows the "next" pagination link recursively.

    Args:
        url: Absolute URL of the news-list page.
        bank: Category key used to count fetched articles in the
            module-level ``times`` dict (capped by module-level ``limit``).
        try_times: Current fetch attempt; gives up silently after 3
            consecutive failures.
    """
    if try_times > 3:
        return  # retries exhausted
    try:
        html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
    except Exception as e:
        # Report the failure and retry the same URL (3 attempts total).
        print('错误URL:' + url)
        print(e)
        print('进行第%d次尝试' % (try_times + 1))
        getNewsList(url, bank, try_times=try_times + 1)
        return
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find('div', id='wp_news_w10')
    # Best-effort: pages lacking the expected container are skipped, which
    # matches the original try/except-pass behaviour, but pagination below
    # is still attempted.
    if container is not None:
        for tag in container.find_all('a'):
            if bank not in times:
                times[bank] = 0
            if not limit or times[bank] < limit:
                times[bank] += 1
                getContent(common.checkUrl(tag['href']), bank)
    # Fix: times.get() prevents a KeyError when a limit is configured but
    # this page contributed no article links; also `is not None` over != None.
    if not limit or times.get(bank, 0) < limit:
        next_page = soup.find('a', class_='next')
        if next_page is not None and next_page['href'] != 'javascript:void(0);':
            getNewsList(common.checkUrl(next_page['href']), bank)
def getNewsList(url, try_times=1):
    """Crawl one paginated news-list page, keyed by the page's own title.

    Fetches ``url``, uses the page ``<title>`` text as the counting key,
    collects the article anchors inside ``div#wp_news_w3``, hands each to
    ``getContent``, and follows the "next" pagination link recursively.

    Args:
        url: Absolute URL of the news-list page.
        try_times: Current fetch attempt; gives up silently after 3
            consecutive failures.
    """
    if try_times > 3:
        return  # retries exhausted
    try:
        html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
    except Exception as e:
        # Report the failure and retry the same URL (3 attempts total).
        print('错误URL:' + url)
        print(e)
        print('进行第%d次尝试' % (try_times + 1))
        getNewsList(url, try_times=try_times + 1)
        return
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('title').get_text()
    container = soup.find('div', id='wp_news_w3')
    # Best-effort: pages lacking the expected container are skipped
    # (original try/except-pass); pagination below still runs.
    if container is not None:
        for tag in container.find_all('a'):
            if title not in times:
                times[title] = 0
            if not limit or times[title] < limit:
                times[title] += 1
                getContent(common.checkUrl(tag['href']), title)
    # Fix: times.get() prevents a KeyError when a limit is configured but
    # this page contributed no article links; also `is not None` over != None.
    if not limit or times.get(title, 0) < limit:
        next_page = soup.find('a', class_='next')
        if next_page is not None and next_page['href'] != 'javascript:void(0);':
            # NOTE(review): pagination hard-codes the host here instead of
            # using common.checkUrl as elsewhere — presumably deliberate for
            # this site; confirm before unifying.
            getNewsList('http://www2.scut.edu.cn' + next_page['href'])
def start(url, try_times=1):
    """Entry point for the 医学院 site: locate the section anchors and crawl each.

    Fetches the home page, looks up the four known section links by their
    anchor text/title, hands each resolved URL to ``getMoreUrl`` keyed by
    the section name, and finally prints the per-section article counts
    accumulated in the module-level ``times`` dict.

    Args:
        url: Home-page URL of the site.
        try_times: Current fetch attempt; gives up silently after 3
            consecutive failures.
    """
    if try_times > 3:
        return  # retries exhausted
    try:
        html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
    except Exception as e:
        # Report the failure and retry the same URL (3 attempts total).
        print('错误URL:' + url)
        print(e)
        print('进行第%d次尝试' % (try_times + 1))
        start(url, try_times=try_times + 1)
        return
    soup = BeautifulSoup(html, "html.parser")
    tags = {
        '学院信息': soup.find('a', string='学院信息'),
        '新闻中心': soup.find('a', string='新闻中心'),
        '研究成果': soup.find('a', string='研究成果'),
        '学生工作': soup.find('a', title='学生工作'),
    }
    for key in tags:
        try:
            # Fix: use a fresh local instead of reassigning the `url`
            # parameter. Raises TypeError (caught) when find() returned None.
            href = tags[key]['href']
        except Exception:
            print('获取 医学院 %s 地址失败' % (key))
        else:
            getMoreUrl(common.checkUrl(href), key)
    for key in times:
        print(key + ':' + str(times[key]))
def getMoreUrl(url, try_times=1):
    """Open a landing page and crawl every list behind an `a.more` link.

    After all lists are crawled, print the accumulated per-key article
    counts from the module-level ``times`` dict.

    Args:
        url: Landing-page URL.
        try_times: Current fetch attempt; gives up silently after 3
            consecutive failures.
    """
    if try_times > 3:
        return  # retries exhausted
    try:
        page = urllib.request.urlopen(url).read().decode(encoding='utf-8')
    except Exception as err:
        # Report and retry the same URL.
        print('错误URL:' + url)
        print(err)
        print('进行第%d次尝试' % (try_times + 1))
        getMoreUrl(url, try_times=try_times + 1)
        return
    doc = BeautifulSoup(page, "html.parser")
    for anchor in doc.find_all('a', class_='more'):
        getNewsList(common.checkUrl(anchor['href']))
    for key in times:
        print(key + ':' + str(times[key]))
def start(url, try_times=1):
    """Entry point for the 国际教育学院 site.

    Fetches the home page, locates the news anchor ``a#p16c4996``, and
    hands its resolved URL to ``getMoreUrl``. Prints a failure notice when
    the anchor or its href is missing.

    Args:
        url: Home-page URL of the site.
        try_times: Current fetch attempt; gives up silently after 3
            consecutive failures.
    """
    if try_times > 3:
        return  # retries exhausted
    try:
        page = urllib.request.urlopen(url).read().decode(encoding='utf-8')
    except Exception as err:
        # Report and retry the same URL.
        print('错误URL:' + url)
        print(err)
        print('进行第%d次尝试' % (try_times + 1))
        start(url, try_times=try_times + 1)
        return
    anchor = BeautifulSoup(page, "html.parser").find('a', id='p16c4996')
    try:
        # TypeError (caught) when the anchor was not found.
        target = anchor['href']
    except Exception:
        print('获取 国际教育学院 新闻地址失败')
    else:
        getMoreUrl(common.checkUrl(target))
def getMoreUrl(url, bank, try_times=1):
    """Expand a section landing page into its sub-column news lists.

    Looks for the ``ul.wp_listcolumn`` side menu: each menu anchor becomes
    its own news list crawled under ``bank + '/' + anchor title``. A page
    with no menu entries is crawled directly under ``bank``.

    Args:
        url: Landing-page URL of the section.
        bank: Category key (path-like) for counting in the module-level
            ``times`` dict.
        try_times: Current fetch attempt; gives up silently after 3
            consecutive failures.
    """
    if try_times > 3:
        return  # retries exhausted
    try:
        html = urllib.request.urlopen(url).read().decode(encoding='utf-8')
    except Exception as e:
        # Report the failure and retry the same URL (3 attempts total).
        print('错误URL:' + url)
        print(e)
        print('进行第%d次尝试' % (try_times + 1))
        getMoreUrl(url, bank, try_times=try_times + 1)
        return
    soup = BeautifulSoup(html, "html.parser")
    menu = soup.find('ul', class_='wp_listcolumn')
    # Fix: the original raised AttributeError when the menu was absent
    # (menuTag.find_all on None); treat a missing menu like an empty one.
    tags = menu.find_all('a') if menu is not None else []
    if not tags:
        getNewsList(url, bank)
        return
    for tag in tags:
        try:
            getNewsList(common.checkUrl(tag['href']), bank + '/' + tag['title'])
        except Exception:
            # Best-effort: anchors missing an href or title are skipped,
            # preserving the original swallow-and-continue behaviour.
            pass