import time

from bs4 import BeautifulSoup as bs
from bs4.diagnose import diagnose


def parse_html_file(file_to_parse, error_db):
    """
    Parses an HTML file into a searchable, manipulable Python tree using the
    BeautifulSoup library. Attempts to parse the file with a cascade of
    parsers {lxml -> html.parser -> html5lib}, ordered from fastest to
    slowest. Errors encountered while parsing are timestamped and logged in
    the error_files database.

    :param file_to_parse: string holding the path to the file to parse
    :param error_db: database collection used to log error messages
    :return: parsed BeautifulSoup object if the parse succeeded, None otherwise
    """
    timestamp = time.strftime('%x %X %Z')
    try:
        with open(file_to_parse) as ftp:
            # parse html file to tree object
            try:
                # lxml is the fastest, but least lenient parser
                soup = bs(ftp, 'lxml')
            except Exception as e:
                error_log = {
                    'Time': timestamp,
                    'File': file_to_parse,
                    'ErrorMessage': [
                        "Error parsing {0} with lxml: {1}".format(file_to_parse, e)
                    ]
                }
                try:
                    ftp.seek(0)  # rewind before retrying with another parser
                    # built-in parser, decent speed and leniency
                    soup = bs(ftp, 'html.parser')
                except Exception as e2:
                    error_log['ErrorMessage'].append(
                        "Error parsing {0} with html.parser: {1}".format(file_to_parse, e2))
                    try:
                        ftp.seek(0)
                        # slowest, most lenient parser
                        soup = bs(ftp, 'html5lib')
                    except Exception as e3:
                        error_log['ErrorMessage'].append(
                            "Error parsing {0} with html5lib: {1}".format(file_to_parse, e3))
                        ftp.seek(0)
                        error_log['ErrorMessage'].append(
                            "BeautifulSoup4 diagnosis: {0}".format(diagnose(ftp.read())))
                        error_db.insert_one(error_log)
                        return None
            return soup
    except Exception as e4:
        error_log = {
            'Time': timestamp,
            'File': file_to_parse,
            'ErrorMessage': ["File error with {0}: {1}".format(file_to_parse, e4)]
        }
        # write to error collection
        error_db.insert_one(error_log)
        return None
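# A minimal usage sketch for parse_html_file above, assuming error_db is a
# pymongo-style collection exposing insert_one(); the database/collection
# names and the sample file path are illustrative, not from the original.
from pymongo import MongoClient

client = MongoClient()
error_files = client['scrape_logs']['error_files']

tree = parse_html_file('listings/page1.html', error_files)
if tree is not None:
    print(tree.title)
else:
    print('Parse failed; see the error_files collection for details.')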
from bs4 import BeautifulSoup
from bs4.diagnose import diagnose
import requests

print("Success")

#url = 'http://my.ebay.com/ws/eBayISAPI.dll?MyEbay&gbh=1&CurrentPage=MyeBayAllSelling&ssPageName=STRK:ME:LNLK:MESX'
data = open("testing-sheriff.html").read()
diagnose(data)

"""response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")  #.encode('utf-8')
#print soup.prettify()
#print soup
table = soup.find('table', attrs={'id': 'v4-My_47_82_tab_0'})
#print table
for row in table.findAll('tr'):
    for cell in row.findAll('td'):
        print cell.text"""
# Tillie
# </a>

print('this is getting only id link2:')
print(
    BeautifulSoup(html_doc, "html.parser",
                  parse_only=only_tags_with_id_link2).prettify())
# <a class="sister" href="http://example.com/lacie" id="link2">
#  Lacie
# </a>

# bad_html = """
# <html><head><title>oops.</title></head></html>
# """
#
# # this is here to demonstrate an error when run
# print(BeautifulSoup(bad_html).prettify())

print('this is getting list of names of links:')
# diagnose() re-parses the document once with each installed parser,
# so the report it prints repeats several times
diagnose(html_doc)
# print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
# Elsie
# ,
# Lacie
# and
# Tillie
# ...

# soup = BeautifulSoup(html_doc, 'html.parser')
from bs4.diagnose import diagnose

# diagnose() reports how each installed parser handles this snippet,
# whose <script> body contains markup-like strings.
doc = """<script>
h=window.location.protocol+"//",r='<body onload="';
</script>"""

diagnose(doc)
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

print(BeautifulSoup(html_doc, 'html.parser', parse_only=only_a_tags).prettify())
print(
    BeautifulSoup(html_doc, 'html.parser',
                  parse_only=only_tags_with_id_link2).prettify())
print(BeautifulSoup(html_doc, 'html.parser', parse_only=only_short_strings))

soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.find_all(only_short_strings))

# Code diagnosis
from bs4.diagnose import diagnose
data = open("bad.html").read()
diagnose(data)

from urllib.request import urlopen
soup = BeautifulSoup(
    urlopen('https://blog.csdn.net/weixin_42184707/article/details/80361464'),
    'html.parser')
print(soup.prettify())
print(soup.find_all('pre'))
def soup(request):
    if 'logged_in' in request.session:
        guiltlink_list = GuiltLink.objects.all().order_by('-id')
        if request.method == 'POST':
            guiltlink_id = int(request.POST.get('guiltlink_id'))
            link = GuiltLink.objects.get(id=guiltlink_id)
            exceptions = []
            results = {}

            def soupArticle(soup, link):
                # Prefer Open Graph metadata, falling back to the <title> tag.
                title = soup.find_all("meta", property="og:title")
                description = soup.find_all("meta", property="og:description")
                image_url = soup.find_all("meta", property="og:image")
                if title:
                    link.title = title[0]["content"]
                else:
                    title = soup.find("title")
                    if title:
                        link.title = str(title)
                    else:
                        exceptions.append('no title')
                if description:
                    link.description = description[0]["content"]
                else:
                    exceptions.append('no description')
                if image_url:
                    link.image_url = image_url[0]["content"]
                else:
                    exceptions.append('no image url')
                link.save()
                results['title'] = link.title
                results['description'] = link.description
                results['image_url'] = link.image_url
                return results

            try:
                page = urlopen(link.link)
            except Exception as e:
                exceptions.append('Exception at first, so needed headers.')
                try:
                    USERAGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
                    HEADERS = {'User-Agent': USERAGENT}
                    req = Request(link.link, headers=HEADERS)
                    page = urlopen(req)
                except Exception as e:
                    exceptions.append('Exception even with headers.')

            # Read the response once; the stream is exhausted after the first read().
            content = page.read()
            try:
                soup = BeautifulSoup(content, "html.parser")
                soupArticle(soup, link)
            except Exception as e:
                diagnosis = diagnose(content)
                exceptions.append('html.parser didn\'t work.' + str(diagnosis))
                try:
                    soup = BeautifulSoup(content, "html5lib")
                    soupArticle(soup, link)
                except Exception as e:
                    exceptions.append('html5lib didn\'t work either.')
                    try:
                        soup = BeautifulSoup(content, "lxml")
                        soupArticle(soup, link)
                    except Exception as e:
                        exceptions.append('lxml didn\'t work either.')
                        try:
                            soup = bs3(content, "html5lib")
                            soupArticle(soup, link)
                        except Exception:
                            exceptions.append('bs3 didn\'t work either')
from bs4.diagnose import diagnose

# Small strings with mismatched or unclosed tags, plus a few HTML files,
# to compare how each installed parser handles bad markup.
testStrings = [
    "<a><b></b></a>",
    "<a><b></b> ",
    "<a><b> </a>",
    "<a> </b></a>",
    " <b></b></a>",
]

f_string = open('test3.html').read()
testStrings.append(f_string)
f_string = open('test4.html').read()
testStrings.append(f_string)
f_string = open('test5.html').read()
testStrings.append(f_string)

for test in testStrings:
    print("\n\ndiagnosing " + test)
    diagnose(test)
import os
import urllib2

from bs4 import BeautifulSoup
from bs4.diagnose import diagnose
from markov_python.cc_markov import MarkovChain

mc = MarkovChain()
text = urllib2.urlopen(
    "https://www.crummy.com/software/BeautifulSoup/bs4/doc/#parsing-only-part-of-a-document"
)
html = text.read()
diagnose(html)  # was diagnose(read), which raised NameError

#mc.add_string(example)
#new = mc.generate_text()
#print new
#!/usr/bin/env python
import urllib

from bs4.diagnose import diagnose

html = urllib.urlopen(
    "http://www.nhl.com/scores/htmlreports/20082009/GS021229.HTM").read()
diagnose(html)
# recursive=False restricts the search to direct children
print(soup.find_all('title', recursive=False))

# find returns a single result directly (not a list); find_all returns a list.
# The same pairing applies to the rest of the family:
# find_parents()            find_parent()
# find_next_siblings()      find_next_sibling()
# find_previous_siblings()  find_previous_sibling()
# find_all_next()           find_next()
# find_all_previous()       find_previous()

############### CSS selectors return a list
# select() always returns a list; iterate over it and call get_text()
# on each element to extract its contents.
soup.select('a')
# You can select by tag, class, id, attribute, and combinations, including child tags.
# A tag with multiple class attributes:
# soup.select('tagname.class1.class2')[0]
# soup.find('tagname', class_=['class1', 'class2'])

soup = BeautifulSoup(html, 'lxml')
print(type(soup.select('title')))
print(soup.select('title')[0].get_text())
for title in soup.select('title'):
    print(title.get_text())

from bs4.diagnose import diagnose
data = open('bad.html').read()
diagnose(data)
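# A minimal sketch of the select()/find() patterns noted above; the markup,
# class names, and id here are made up purely for illustration.
from bs4 import BeautifulSoup

sample = '<div class="card featured" id="first">Hello</div><div class="card">Bye</div>'
demo = BeautifulSoup(sample, 'html.parser')

# CSS selector for a tag carrying both classes (select() always returns a list)
print(demo.select('div.card.featured')[0].get_text())             # Hello
# class_ with a list matches a tag whose class contains ANY listed value,
# so a CSS selector is the safer way to require several classes at once
print(demo.find('div', class_=['card', 'featured']).get_text())   # Hello
# Selecting by id
print(demo.select('#first')[0].get_text())                        # Hello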