def piglify_response(context, flow):
    """Run every text string of an HTML response through ``piglify``.

    Responses without a ``Content-Type`` header mentioning ``text/html``
    are left untouched. Otherwise the body is parsed with BeautifulSoup,
    each string in the document is replaced by ``piglify(string)``, and the
    prettified markup is written back to ``flow.response.content`` as
    UTF-8 encoded bytes.

    Args:
        context: Unused here; kept so the signature stays unchanged.
        flow: Object whose ``response`` exposes ``headers`` (an iterable of
            ``(name, value)`` pairs) and a writable ``content``.

    Returns:
        None. The response is modified in place.
    """
    # NOTE(review): ``decoded`` and ``piglify`` are defined outside this
    # block; ``decoded`` presumably exposes the body with its content
    # encoding removed for the duration of the block -- confirm.
    with decoded(flow.response):
        # HTTP header names and media types are case-insensitive, so both
        # sides are compared in lower case.
        is_html = any(
            header[0].lower() == 'content-type'
            and 'text/html' in header[1].lower()
            for header in flow.response.headers
        )
        if not is_html:
            return

        soup = BeautifulSoup(flow.response.content, 'html.parser')

        # Snapshot the strings before mutating the tree they are
        # generated from.
        for text in list(soup._all_strings()):
            # Look the string up once and reuse the node instead of
            # searching the whole tree a second time.
            node = soup.find(text=text)
            if node:
                node.replace_with(piglify(text))

        # encode() already yields the byte string to store; wrapping it in
        # str() would store the literal text "b'...'" on Python 3.
        flow.response.content = soup.prettify().encode('utf-8')
# Dump every text string of a saved HTML page to a text file, one string
# per line.
from bs4 import BeautifulSoup
import bs4
import re

path = 'C:/Users/Regen/Desktop/C-sharp-learning/htmlFiles/31省份新增新冠肺炎确诊病例17例 均为境外输入病例.html'

# Read the whole page up front; the context manager closes the file even
# if reading or decoding fails.
with open(path, 'r', encoding='utf-8') as htmlfile:
    htmlhandle = htmlfile.read()

soup = BeautifulSoup(htmlhandle, 'lxml')

# NOTE: _all_strings() is a private BeautifulSoup helper; it is kept so the
# set of emitted strings stays exactly what the script produced before.
lines = list(soup._all_strings())

with open('C:/Users/Regen/Desktop/C-sharp-learning/htmlFiles/strings.txt', 'w', encoding='utf-8') as f:
    # Strings that are exactly a lone newline are skipped; every other
    # string is written followed by a newline.
    f.writelines(line + '\n' for line in lines if line != '\n')