import copy
import re

import requests
from bs4 import BeautifulSoup

import contentDeal
import reviewDeal


def get_html_link(link_list):
    # Template dict for one scraped article
    data = {'title': '', 'url': '', 'review': '', 'content': '', 'time': '', 'type': ''}
    dataList = []
    url_list = []  # every URL visited so far, to avoid duplicate requests

    # Walk all child links
    for i in link_list:
        # Skip links we have already visited
        if i in url_list:
            continue
        url_list.append(i)

        # Plain request
        html_link = requests.get(i)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')

        title = soup.title.string
        url = i
        review = ""

        # Article body: every <p> plus every <h1>..<h5> inside .entry-content
        content_div = soup.select(".entry-content")[0]
        content_list = content_div.select("p")
        for k in range(1, 6):
            for ap in content_div.select("h" + str(k)):
                content_list.append(ap)
        content = ""
        for j in content_list:
            content = content + str(j)
        # Clean the body: replace stray characters, strip image markup
        content = contentDeal.deal_content(content)

        # Publication time comes from the article:published_time meta tag
        time = re.search(
            r"<meta property=\"article:published_time\" content=\"(.*?)\" />",
            html_link.text).group(1)
        type = "christmas"

        # Fill the dict and append it
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        dataList.append(data)
        # Shallow-copy so the next iteration writes to a fresh dict,
        # not the one already stored in dataList
        data = copy.copy(data)
    return dataList
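# The cleaning step above calls the project-local contentDeal module, which is not
# shown in this section. A minimal sketch of what deal_content might do, assuming
# its job is exactly what the comment describes (replace stray characters, strip
# image markup) -- an illustration, not the project's actual implementation:
def deal_content(content):
    # Drop <img ...> tags and HTML comments left in the serialized body
    content = re.sub(r"<img[^>]*>", "", content)
    content = re.sub(r"<!--.*?-->", "", content, flags=re.S)
    # Normalize non-breaking spaces and curly quotes
    return (content.replace("\xa0", " ")
                   .replace("\u201c", '"')
                   .replace("\u201d", '"'))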
def get_html_link(link_list):
    # Template dict for one scraped article
    data = {'title': '', 'url': '', 'review': '', 'content': '', 'time': '', 'type': ''}
    dataList = []
    url_list = []  # every URL visited so far, to avoid duplicate requests

    # Walk all child links
    for i in link_list:
        # Skip links we have already visited
        if i in url_list:
            continue
        url_list.append(i)

        # Plain request
        html_link = requests.get(i)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')

        title = soup.title.string
        url = i
        review = ""

        # Article body: every <p> inside .article-content-main
        content_list = soup.select(".article-content-main")[0].select("p")
        content = ""
        for j in content_list:
            content = content + str(j)
        # Clean the body: replace stray characters, strip image markup
        content = contentDeal.deal_content(content)

        # Publication time sits in the datetime attribute of the first <time> tag
        time = soup.select("time")[0]
        time = re.search(r"<time datetime=\"(.*)\">", str(time)).group(1)
        type = "science-and-technology"

        # Fill the dict and append it
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        dataList.append(data)
        # Shallow-copy so the next iteration writes to a fresh dict
        data = copy.copy(data)
    return dataList
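# A note on the time extraction above: serializing the <time> tag back to a string
# and regexing it is fragile (it breaks if attributes are reordered or quoted
# differently). BeautifulSoup already exposes parsed attributes, so the same value
# can be read directly; a self-contained demonstration:
from bs4 import BeautifulSoup

demo = BeautifulSoup('<time datetime="2019-01-01T08:00:00">Jan 1</time>', 'lxml')
print(demo.select("time")[0].get("datetime"))  # -> 2019-01-01T08:00:00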
def get_html_link(link_list):
    # Template dict for one scraped topic
    data = {'title': '', 'url': '', 'review': '', 'content': '', 'time': '', 'type': ''}
    dataList = []
    url_list = []  # every URL visited so far, to avoid duplicate requests

    # Walk all child links
    for i in link_list:
        # Skip links we have already visited
        if i in url_list:
            continue
        url_list.append(i)

        # Plain request
        html_link = requests.get(i)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')

        # Strip embedded JavaScript
        [s.extract() for s in soup('script')]
        # Drop the trailing forum marker from the title
        title = soup.title.string.replace(" - Finland Forum", "")
        url = i

        # Extract the opening post of the topic
        content = soup.select(".content")
        if content is None:
            print(i)
            print("Content is empty, skipping this site")
            continue
        if len(content) == 0:
            print(str(url) + "\tno content on this site, skipping it")
            continue
        content = content[0].text
        # Clean the body: replace stray characters, strip image markup
        content = contentDeal.deal_content(content)

        # Extract the replies: every .content block past the first two
        reviews = soup.select(".content")
        review = ""
        if len(reviews) > 2:
            for rev in reviews[2:]:
                review = review + "<p>" + reviewDeal.deal_review(rev.text) + "<p>"

        # Post time is the tail of the first .author block
        time = ""
        if len(soup.select('.author')) == 0:
            print(str(url) + "\ttime not available on this site")
        else:
            time = soup.select('.author')[0].get_text()[-26:-1]
        type = "forum"

        # Fill the dict and append it
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        dataList.append(data)
        # Shallow-copy so the next iteration writes to a fresh dict
        data = copy.copy(data)

        # Check how many pages the topic has
        page_total = re.search(r"<strong>(\d+)</strong></span></a>", html_link.text)
        if page_total is not None:
            page_total = int(page_total.group(1))
        else:
            page_total = 1
        # Single-page topic: nothing more to fetch
        if page_total == 1:
            continue
        # Build the pagination URL, then walk pages 2..page_total
        page_url = soup.find_all('a', {'class': 'button', 'role': 'button'})[1].get('href')
        page_url = re.search(r".(/.*)", page_url).group(1).replace("amp;", "")
        page_url = page_url[0:-2]
        for j in range(2, page_total + 1):
            url = page_url + str((j - 1) * 15)  # 15 posts per page
            inner_after(dataList, data, "https://www.finlandforum.org" + url)
            # Shallow-copy again before the next page
            data = copy.copy(data)
    return dataList
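# inner_after is defined elsewhere in the project. Judging only from its call
# signature (dataList, data, url) and the pagination loop above, a plausible sketch
# follows -- an assumption about its behavior, not the author's actual code: fetch
# one follow-up page of the topic, collect its posts as reviews, and append a copy
# of the current record under the page's URL.
def inner_after(dataList, data, url):
    html_link = requests.get(url)
    html_link.encoding = 'utf-8'
    soup = BeautifulSoup(html_link.text, 'lxml')
    # Strip embedded JavaScript, as in the first-page pass
    [s.extract() for s in soup('script')]
    review = ""
    for rev in soup.select(".content"):
        review = review + "<p>" + reviewDeal.deal_review(rev.text) + "<p>"
    page_data = copy.copy(data)
    page_data['url'] = url
    page_data['review'] = review
    dataList.append(page_data)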
def get_html_link(link_list):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Template dict for one scraped article
    data = {'title': '', 'url': '', 'review': '', 'content': '', 'time': '', 'type': ''}
    dataList = []
    url_list = []  # every URL visited so far, to avoid duplicate requests

    # Walk all child links
    for i in link_list:
        # Skip links we have already visited
        if i in url_list:
            continue
        url_list.append(i)

        # Plain request, with a browser User-Agent and TLS verification disabled
        html_link = requests.get(i, headers=headers, verify=False)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')

        title = soup.title.string
        url = i
        review = ""

        # Article body: every <p> plus every <h1>..<h5> inside .entry-content
        content_div = soup.select(".entry-content")[0]
        content_list = content_div.select("p")
        for k in range(1, 6):
            for ap in content_div.select("h" + str(k)):
                content_list.append(ap)
        content = ""
        for j in content_list:
            content = content + str(j)
        # Clean the body: replace stray characters, strip image markup
        content = contentDeal.deal_content(content)

        # Publication time comes from the article:published_time meta tag
        time = re.search(
            r"<meta property=\"article:published_time\" content=\"(.*?)\" />",
            html_link.text).group(1)
        type = "life"

        # Fill the dict and append it
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        dataList.append(data)
        # Shallow-copy so the next iteration writes to a fresh dict
        data = copy.copy(data)
    return dataList
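# verify=False above disables TLS certificate checking, and urllib3 flags each such
# request with an InsecureRequestWarning. During a long crawl the warnings can be
# silenced explicitly (the security trade-off itself remains):
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)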
def visit_single_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Template dict for one scraped topic
    data = {'title': '', 'url': '', 'review': '', 'content': '', 'time': '', 'type': ''}
    dataList = []

    # Plain request, with a browser User-Agent and TLS verification disabled
    html_link = requests.get(url, verify=False, headers=headers)
    html_link.encoding = 'utf-8'
    soup = BeautifulSoup(html_link.text, 'lxml')

    # Strip embedded JavaScript
    [s.extract() for s in soup('script')]
    # Drop the trailing forum marker from the title
    title = soup.title.string.replace(" - Finland Forum", "")

    # Extract the opening post of the topic
    content = soup.select(".content")
    if content is None:
        print(url)
        print("Content is empty, skipping this site")
        return dataList
    if len(content) == 0:
        print(str(url) + "\tno content on this site")
        return dataList
    content = content[0].text
    # Clean the body: replace stray characters, strip image markup
    content = contentDeal.deal_content(content)

    # Extract the replies: every .content block past the first two
    reviews = soup.select(".content")
    review = ""
    if len(reviews) > 2:
        for rev in reviews[2:]:
            review = review + "<p>" + reviewDeal.deal_review(rev.text) + "<p>"

    # Post time is the tail of the first .author block
    time = ""
    if len(soup.select('.author')) == 0:
        print(str(url) + "\ttime not available on this site")
    else:
        time = soup.select('.author')[0].get_text()[-26:-1]
    type = "forum"

    # Fill the dict and append it
    data['title'] = title
    data['url'] = url
    data['review'] = review
    data['content'] = content
    data['time'] = time
    data['type'] = type
    dataList.append(data)
    # Shallow-copy so later pages write to a fresh dict
    data = copy.copy(data)

    # Check how many pages the topic has
    page_total = re.search(r"<strong>(\d+)</strong></span></a>", html_link.text)
    if page_total is not None:
        page_total = int(page_total.group(1))
    else:
        page_total = 1
    # Single-page topic: done
    if page_total == 1:
        return dataList
    # Build the pagination URL, then walk pages 2..page_total
    page_url = soup.find_all('a', {'class': 'button', 'role': 'button'})[1].get('href')
    page_url = re.search(r".(/.*)", page_url).group(1).replace("amp;", "")
    page_url = page_url[0:-2]
    for j in range(2, page_total + 1):
        url = page_url + str((j - 1) * 15)  # 15 posts per page
        inner_after(dataList, data, "https://www.finlandforum.org" + url)
        # Shallow-copy again before the next page
        data = copy.copy(data)
    return dataList
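# Example driver for the forum scraper above; the topic URL is illustrative, not a
# real thread:
if __name__ == '__main__':
    topic = "https://www.finlandforum.org/viewtopic.php?t=12345"  # hypothetical id
    for record in visit_single_html(topic):
        print(record['time'], record['title'], len(record['review']))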