import time

from bs4 import BeautifulSoup as bs
from bs4.diagnose import diagnose


def parse_html_file(file_to_parse, error_db):
    """
    Parses an HTML file into a searchable, manipulable Python tree using the
    BeautifulSoup library. Attempts to parse the file with a cascade of HTML
    parsers {lxml -> html.parser -> html5lib}, ordered from fastest to slowest.
    Errors encountered while parsing are timestamped and logged in the
    error_files database.

    :param file_to_parse: string holding the path of the file to parse
    :param error_db: database collection that stores error messages as a log
    :return: parsed BeautifulSoup object if the parse succeeded, None otherwise
    """
    timestamp = time.strftime('%x %X %Z')
    try:
        with open(file_to_parse) as ftp:
            # Read the file once so every parser attempt sees the same markup;
            # re-reading an exhausted file handle would hand the fallback
            # parsers an empty string.
            markup = ftp.read()
            try:
                # lxml is the fastest, but least lenient parser
                soup = bs(markup, 'lxml')
            except Exception as e:
                error_log = {
                    'Time': timestamp,
                    'File': file_to_parse,
                    'ErrorMessage': [
                        "Error parsing {0} with lxml: {1}".format(file_to_parse, e)
                    ]
                }
                try:
                    # built-in parser, decent speed and leniency
                    soup = bs(markup, 'html.parser')
                except Exception as e2:
                    error_log['ErrorMessage'].append(
                        "Error parsing {0} with html.parser: {1}".format(
                            file_to_parse, e2))
                    try:
                        # slowest, most lenient parser
                        soup = bs(markup, 'html5lib')
                    except Exception as e3:
                        error_log['ErrorMessage'].append(
                            "Error parsing {0} with html5lib: {1}".format(
                                file_to_parse, e3))
                        error_log['ErrorMessage'].append(
                            "BeautifulSoup4 diagnosis: {0}".format(diagnose(markup)))
                        # write to error collection
                        error_db.insert_one(error_log)
                        return None
            return soup
    except Exception as e4:
        error_log = {
            'Time': timestamp,
            'File': file_to_parse,
            'ErrorMessage': ["File error parsing {0}: {1}".format(file_to_parse, e4)]
        }
        # write to error collection
        error_db.insert_one(error_log)
        return None
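
A minimal usage sketch, assuming the error log lives in a MongoDB collection reached through pymongo; the connection details, database and collection names, and the file path below are all hypothetical:

from pymongo import MongoClient

# hypothetical MongoDB connection; any collection object exposing insert_one() works here
client = MongoClient('localhost', 27017)
error_files = client['scrape_db']['error_files']

# hypothetical path to a previously downloaded page
soup = parse_html_file('pages/listing_0001.html', error_files)
if soup is not None:
    print(soup.title.string if soup.title else 'no <title> found')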
Example #2
from bs4 import BeautifulSoup
import requests
from bs4.diagnose import diagnose
print "Success"

#url = 'http://my.ebay.com/ws/eBayISAPI.dll?MyEbay&gbh=1&CurrentPage=MyeBayAllSelling&ssPageName=STRK:ME:LNLK:MESX'

data = open("testing-sheriff.html").read()
diagnose(data)

"""response = requests.get(url)
html = response.content

soup = BeautifulSoup(html, "html.parser") #.encode('utf-8')
#print soup.prettify()
#print soup
table = soup.find('table', attrs={'id': 'v4-My_47_82_tab_0'})
#print table

for row in table.findAll('tr'):
    for cell in row.findAll('td'):
        print cell.text"""
Example #3
#  Tillie
# </a>
print('this is getting only id link2:')
print(
    BeautifulSoup(html_doc, "html.parser",
                  parse_only=only_tags_with_id_link2).prettify())
# <a class="sister" href="http://example.com/lacie" id="link2">
#  Lacie
# </a>

# bad_html = """
#  <html><head><title>oops.</title></head></html>
#  """
# ## this is to print an error with running
# print(BeautifulSoup(bad_html).prettify())

print('this is getting list of names of links:')
# diagnose() re-parses the document once with each parser it finds installed (here 4 of them), printing what each produced
print(diagnose(html_doc))

# print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
# Elsie
# ,
# Lacie
# and
# Tillie
# ...
#

soup = BeautifulSoup(html_doc, 'html.parser')
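
The filters only_tags_with_id_link2, only_short_strings, and only_a_tags used in this and the later examples are not defined in the snippets themselves. A sketch of how such filters are typically built with SoupStrainer, following the "Parsing only part of a document" section of the BeautifulSoup docs; treat the exact definitions as assumptions:

from bs4 import SoupStrainer

# parse only tags whose id attribute is "link2"
only_tags_with_id_link2 = SoupStrainer(id="link2")

# parse only <a> tags
only_a_tags = SoupStrainer("a")

# parse only strings shorter than 10 characters
def is_short_string(string):
    return string is not None and len(string) < 10

only_short_strings = SoupStrainer(string=is_short_string)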
Example #4
doc = """<script>
h=window.location.protocol+"//",r='<body onload="';
</script>"""
from bs4.diagnose import diagnose
diagnose(doc)
Example #5
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

print(
    BeautifulSoup(html_doc, 'html.parser', parse_only=only_a_tags).prettify())

print(
    BeautifulSoup(html_doc, 'html.parser',
                  parse_only=only_tags_with_id_link2).prettify())

print(BeautifulSoup(html_doc, 'html.parser', parse_only=only_short_strings))

soup = BeautifulSoup(html_doc)
print(soup.find_all(only_short_strings))

# code diagnosis
from bs4.diagnose import diagnose

data = open("bad.html").read()
print(diagnose(data))

from urllib.request import urlopen

soup = BeautifulSoup(
    urlopen('https://blog.csdn.net/weixin_42184707/article/details/80361464'))
print(soup.prettify())

print(soup.find_all('pre'))
Example #6
def soup(request):
    if request.session.has_key('logged_in'):
        guiltlink_list = GuiltLink.objects.all().order_by('-id')
        if request.method == 'POST':
            guiltlink_id = int(request.POST.get('guiltlink_id'))
            link = GuiltLink.objects.get(id=guiltlink_id)
            exceptions = []
            results = {}

            def soupArticle(soup, link):
                title = soup.find_all("meta", property="og:title")
                description = soup.find_all("meta", property="og:description")
                image_url = soup.find_all("meta", property="og:image")
                if title:
                    link.title = title[0]["content"]
                else:
                    title = soup.find("title")
                    if title:
                        link.title = str(title)
                    else:
                        exceptions.append('no title')
                if description:
                    link.description = description[0]["content"]
                else:
                    exceptions.append('no description')
                if image_url:
                    link.image_url = image_url[0]["content"]
                else:
                    exceptions.append('no image url')
                link.save()
                results['title'] = link.title
                results['description'] = link.description
                results['image_url'] = link.image_url
                return results

            try:
                page = urlopen(link.link)
            except Exception, e:
                exceptions.append('Exception at first, so needed headers.')
                try:
                    USERAGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
                    HEADERS = {'User-Agent': USERAGENT}
                    req = Request(link.link, headers=HEADERS)
                    page = urlopen(req)
                except Exception, e:
                    exceptions.append('Exception even with headers.')
            try:
                # read the response body once; calling page.read() again would
                # return an empty string for the fallback parser attempts
                content = page.read()
                soup = BeautifulSoup(content, "html.parser")
                soupArticle(soup, link)
            except Exception, e:
                diagnosis = diagnose(content)
                exceptions.append('html.parser didn\'t work. ' + str(diagnosis))
                try:
                    soup = BeautifulSoup(content, "html5lib")
                    soupArticle(soup, link)
                except Exception, e:
                    exceptions.append('html5lib didn\'t work either.')
                    try:
                        soup = BeautifulSoup(content, "lxml")
                        soupArticle(soup, link)
                    except Exception, e:
                        exceptions.append('lxml didn\'t work either.')
                        try:
                            soup = bs3(content, "html5lib")
                            soupArticle(soup, link)
                        except:
                            exceptions.append('bs3 didn\'t work either.')
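
Outside the Django view, the Open Graph lookup inside soupArticle can be sketched on its own. A minimal standalone Python 3 version, where the URL is only a placeholder:

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

# placeholder URL; any page exposing Open Graph <meta> tags will do
req = Request('https://example.com/article',
              headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(urlopen(req).read(), 'html.parser')

og_title = soup.find('meta', property='og:title')
# fall back to the plain <title> element when og:title is missing
title = og_title['content'] if og_title else (soup.title.string if soup.title else None)
print(title)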
Example #7
from bs4.diagnose import diagnose

testStrings = [
    "<a><b></b></a>",
    "<a><b></b>    ",
    "<a><b>    </a>",
    "<a>   </b></a>",
    "   <b></b></a>",
]
f_string = open('test3.html').read()
testStrings.append(f_string)
f_string = open('test4.html').read()
testStrings.append(f_string)

f_string = open('test5.html').read()
testStrings.append(f_string)
for test in testStrings:
    print("\n\ndiagnosing " + test)
    diagnose(test)
Example #8
from markov_python.cc_markov import MarkovChain
import urllib2
from bs4 import BeautifulSoup
from bs4.diagnose import diagnose
import os

mc = MarkovChain()
text = urllib2.urlopen(
    "https://www.crummy.com/software/BeautifulSoup/bs4/doc/#parsing-only-part-of-a-document"
)
html = text.read()
diagnose(html)

#mc.add_string(example)
#new = mc.generate_text()

#print new
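
A sketch of the commented-out Markov step above, assuming only the cc_markov calls already shown in the comments (add_string and generate_text) and feeding them the visible text that BeautifulSoup extracts from the page:

soup = BeautifulSoup(html, 'html.parser')
# strip the markup down to its visible text before training the chain
mc.add_string(soup.get_text())
new = mc.generate_text()
print new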
Example #9
#!/usr/bin/env python
from bs4.diagnose import diagnose
import urllib

html = urllib.urlopen("http://www.nhl.com/scores/htmlreports/20082009/GS021229.HTM").read()

diagnose(html)
Example #10
File: bea1.py Project: dittoyy/Web
# recursive=False searches only the tag's direct children, not all descendants.
print soup.find_all('title', recursive=False)

# find returns the result directly, not a list
# find find_all
# find_parents() find_parent
# find_next_siblings()  find_next_sibling()
# find_previous_siblings()  find_previous_sibling()
# find_all_next()  find_next()
# find_all_previous() and find_previous()

############### CSS selectors return a list
# The select method always returns its results as a list,
# which you can iterate over for output,
# and then call the get_text() method to get each element's content.
soup.select('a')
# select works with tags, classes, ids, attributes, combinations of these, and child selectors
# a tag carrying more than one class value (a runnable sketch of these two lookups follows at the end of this example):
# soup.select('tagname.class1.class2')[0]
# soup.find('tagname', class_=['class1', 'class2'])

soup = BeautifulSoup(html, 'lxml')
print type(soup.select('title'))
print soup.select('title')[0].get_text()

for title in soup.select('title'):
    print title.get_text()

from bs4.diagnose import diagnose
data = open("bad.html").read()
diagnose(data)
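
A runnable sketch of the two multi-class lookups mentioned in the comments above; the markup is invented purely for illustration:

from bs4 import BeautifulSoup

markup = '<p class="class1 class2">both</p><p class="class1">only one</p>'
soup = BeautifulSoup(markup, 'html.parser')

# CSS selector: the element must carry class1 AND class2
print(soup.select('p.class1.class2')[0].get_text())            # both

# class_ with a list: matches an element carrying class1 OR class2, so the first <p> wins
print(soup.find('p', class_=['class1', 'class2']).get_text())  # both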
Example #11
doc = """<script>
h=window.location.protocol+"//",r='<body onload="';
</script>"""
from bs4.diagnose import diagnose

diagnose(doc)
Example #12
from markov_python.cc_markov import MarkovChain
import urllib2 
from bs4 import BeautifulSoup
from bs4.diagnose import diagnose
import os

mc = MarkovChain()
text = urllib2.urlopen("https://www.crummy.com/software/BeautifulSoup/bs4/doc/#parsing-only-part-of-a-document")
html = text.read()
diagnose(html)



#mc.add_string(example)
#new = mc.generate_text()

#print new