import spiderFunction
import requests
import re
from lib import FMysql
from bs4 import BeautifulSoup
from download import Download

# Crawl listing pages 1..10 of the sse.tongji.edu.cn "xwdt" list and, for each
# linked article not already known to the database, fetch the article page and
# collect its title, publication date, text and id into a dict.

LIST_URL = 'http://sse.tongji.edu.cn/Data/List/xwdt'
SITE_ROOT = "http://sse.tongji.edu.cn/"
# First YYYY-M-D style date found in the article's info line.
DATE_PATTERN = re.compile(r"(\d{4}-\d{1,2}-\d{1,2})")

download = Download()
mysql = FMysql.FMysql()

for page_num in range(1, 11):
    start_html = LIST_URL + '?page=' + str(page_num)
    # The object returned by download.get() is queried with .find()/.find_all();
    # presumably a parsed BeautifulSoup document -- confirm in download.py.
    Soup = download.get(start_html)
    link_list = Soup.find('div', class_='right-nr').find('ul')

    for a in link_list.find_all('a'):
        href = a['href']
        # The article id is whatever follows the last '/' of the link
        # (the whole href when it contains no '/').
        article_href = href.rpartition('/')[2]

        # Only articles for which the lookup reports no existing entry
        # are downloaded.
        if mysql.isUrlExist("software_engineering", article_href):
            continue

        page_soup = download.get(SITE_ROOT + href)
        article = page_soup.find('div', class_='right-nr')

        title_box = article.find('div', class_='view-title')
        article_title = title_box.find('h1').get_text()

        info_text = article.find('div', class_='view-info').find('span').get_text()
        # Raises AttributeError when the info line holds no date (search -> None).
        article_time = DATE_PATTERN.search(info_text).group()

        article_content = article.find('div', class_='view-cnt').get_text()

        # NOTE(review): dict_article is built but not consumed anywhere in the
        # visible code -- presumably it is meant to be stored; confirm.
        dict_article = {
            'title': article_title,
            'content': article_content,
            'time': article_time,
            'id': article_href,
        }
import spiderFunction
import requests
import re
from lib import FMysql
from bs4 import BeautifulSoup
from download import Download

# Walk the paginated "classid-5" news listing on news.tongji.edu.cn and fetch
# every linked article whose id is not yet reported by the database lookup.

start_html = 'https://news.tongji.edu.cn/classid-5.html'
base_html = 'https://news.tongji.edu.cn'
download = Download()
mysql = FMysql.FMysql()

Soup = download.get(start_html)

# The page count is read from the href of the last pager link: the text
# between its last '-' and its last '.'.
last_pager_href = str(Soup.find('div', class_='pager').find_all('a')[-1]['href'])
max_page = last_pager_href[last_pager_href.rfind('-') + 1:last_pager_href.rfind('.')]

# NOTE(review): range() excludes its upper bound, so the page numbered
# max_page itself is never requested -- confirm this is intended.
for page_num in range(1, int(max_page)):
    page_html = base_html + '/classid-5-' + str(page_num) + '.html'
    Soup = download.get(page_html)

    # Links are taken from the third 'news_list' div; only anchors that carry
    # a title attribute are kept.
    news_block = Soup.find_all('div', class_='news_list')[2]
    title_links = news_block.find_all('a', attrs={'title': True})

    for anchor in title_links:
        article_url = base_html + '/' + anchor['href']

        # The id sits between the last 'id' (skipping those two characters
        # plus one more, presumably a separator) and the last '-t'.
        id_begin = article_url.rfind('id') + 3
        id_end = article_url.rfind('-t')
        news_id = article_url[id_begin:id_end]

        if mysql.isIdExist('tongji_university_news', news_id):
            continue

        # NOTE(review): the fetched article page is not used in the visible
        # code; the name Soup is rebound here exactly as in the original.
        Soup = download.get(article_url)
def get_html_content(url, data, headers, method):
    """Fetch *url* through ``Download.post`` or ``Download.get``.

    Args:
        url: Address to request.
        data: Forwarded to ``Download.post``; ignored for any other method.
        headers: Forwarded to ``Download.post``; ignored for any other method.
        method: Converted with ``str()`` and compared case-insensitively with
            ``'post'``; every other value selects ``Download.get``.

    Returns:
        Whatever the selected ``Download`` call returns.
    """
    # NOTE(review): both calls go through the name ``Download`` directly, not
    # through an instance -- confirm that post/get are callable that way.
    wants_post = str(method).lower() == 'post'
    if wants_post:
        return Download.post(url, data, headers)
    return Download.get(url)