Esempio n. 1
0
def piglify_response(context, flow):
    """Pass every text string of an HTML response through ``piglify``.

    Responses whose Content-Type header does not mention ``text/html`` are
    left untouched. For HTML responses the body is parsed, each text string
    is replaced by ``piglify(string)``, and the prettified document is
    written back to ``flow.response.content`` as UTF-8 bytes.

    Args:
        context: Not used by this function.
        flow: Object whose ``response.headers`` iterates as indexable
            ``(name, value)`` pairs and whose ``response.content`` can be
            read and assigned.
    """
    # NOTE(review): `decoded` is defined outside this block; presumably it
    # exposes the body without its content-encoding while the block runs --
    # confirm against the helper.
    with decoded(flow.response):
        # HTTP header names and media types are case-insensitive, so compare
        # lowercased values instead of the exact 'Content-Type' spelling.
        html_response = any(
            header[0].lower() == 'content-type'
            and 'text/html' in header[1].lower()
            for header in flow.response.headers
        )
        if not html_response:
            return

        soup = BeautifulSoup(flow.response.content, 'html.parser')

        # Snapshot the strings first so the tree is not modified while it is
        # still being iterated.
        for text in list(soup.strings):
            # Look the string up once and reuse the match instead of
            # searching the whole document a second time.
            match = soup.find(text=text)
            if match:
                match.replaceWith(piglify(text))

        # Assign the encoded bytes directly: wrapping them in str() is a
        # no-op on Python 2 but yields the "b'...'" repr on Python 3.
        flow.response.content = soup.prettify().encode('utf-8')
Esempio n. 2
0
from bs4 import BeautifulSoup
import bs4
import re

# Dump every text string of a saved HTML page to a text file, one per line.
path = 'C:/Users/Regen/Desktop/C-sharp-learning/htmlFiles/31省份新增新冠肺炎确诊病例17例 均为境外输入病例.html'

# Context managers close the files even if reading, parsing or writing raises.
with open(path, 'r', encoding='utf-8') as htmlfile:
    htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, 'lxml')

# `.strings` is the public accessor for the document's text strings.
lines = list(soup.strings)

with open('C:/Users/Regen/Desktop/C-sharp-learning/htmlFiles/strings.txt',
          'w',
          encoding='utf-8') as f:
    # Skip strings that are exactly a newline; terminate the rest with one.
    f.writelines(text + '\n' for text in lines if text != '\n')