def get_reviews(work):
    """(str)->list

    dsc: get all reviews of given book work id. result includes user
    name of reviewer, rank he gave to work and the main text of the
    review
    """
    # offset=0 / showCount=10000 asks the AJAX endpoint for every review
    # in one page; the embedded newline is stripped below.
    url = """http://www.librarything.com/ajax_profilereviews.php?offset=0
&type=3&showCount=10000&workid=%s&languagePick=en&mode=profile""" % work
    url = url.replace('\n', '')
    result = []
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
        reviews = html.findAll('div', attrs={'class': 'bookReview'})
        if reviews:
            for review in reviews:
                text = review.find('div', attrs={'class': 'commentText'}).text
                text = text.encode('utf-8')
                cntl_itm = review.find('span', attrs={'class': 'controlItems'})
                user = cntl_itm.find('a').text
                rv_lnk = review.find('span', attrs={'class': 'rating'})
                if rv_lnk:
                    # the star rating is encoded in the image file name,
                    # e.g. ".../ss8.gif" -> rank "8"
                    rv_txt = rv_lnk.find('img')['src']
                    # raw string + escaped dot: '\d' in a plain string is an
                    # invalid escape, and '.' would match any character
                    rank = re.search(r'ss(\d+)\.gif', rv_txt).group(1)
                else:
                    rank = 'NA'
                result.append({'name': user, 'text': text, 'rank': rank})
    except HTTPError as err:
        # 'as' form works on Python 2.6+ and 3.x alike
        log("Error # " + str(err.code))
    return result
def get_all_tag_name(name):
    """(str)->dic

    dsc: get all tags and counts for given book

    NOTE(review): the parsed page is never returned or stored, so as
    written this only verifies the tag-cloud page loads — confirm
    whether a return value was intended (docstring promises a dict).
    """
    url = 'http://www.librarything.com/tagcloud.php?view=%s' % name
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
    except HTTPError as err:
        # 'as' form replaces the Python-2-only 'except X, e' syntax
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error " + str(err.code), 'Error')
def ChangeProtocol(Link):
    """Probe *Link* using a deliberately malformed HTTP version token.

    Forces the connection to announce 'HTPT/1.0' (sic — an intentionally
    invalid protocol string, not a typo) and records any server-version
    banner matched by the module-level Pattern, writing hits and errors
    to the module-level File.
    """
    try:
        if "http" not in Link:
            Link = "https://" + Link + "/"
        # Deliberately malformed version token ("HTPT", not "HTTP") to
        # see how the server reacts to a bad request line.
        httplib.HTTPSConnection._http_vsn_str = 'HTPT/1.0'
        Response = opener.open(Link)
        # Response body
        Response = BeautifulSoup(Response.read())
        for elem in Response(text=Pattern):
            File.write(Link + " | Version | " + str(elem.parent) + "\n")
            print(Link + " | Version | " + str(elem.parent))
    except Exception as e:
        # broad catch is intentional here: a probe failure is itself a
        # result worth logging, and scanning must continue
        File.write(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e) + "\n")
        print(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e))
def get_all_tag_work(work):
    """(str)->dic

    dsc: get all tags and counts for given book in a dictionary

    NOTE(review): the parsed page is never returned or stored, so as
    written this only verifies the page loads — confirm whether a
    return value was intended (docstring promises a dict).
    """
    # embedded newline in the triple-quoted URL is stripped below
    url = """http://www.librarything.com/ajaxinc_showbooktags.php?work=%s
&all=1&print=1&doit=1&lang=en""" % work
    url = url.replace("\n", "")
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
    except HTTPError as err:
        # 'as' form replaces the Python-2-only 'except X, e' syntax
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error " + str(err.code), 'Error')
class PageComparer:
    """Compare a live web page against a locally cached copy.

    self.new   -- BeautifulSoup parse of the page fetched from url
    self.local -- BeautifulSoup parse of the cached file at path
    """

    def __init__(self, url, path):
        self.url = url
        self.path = path
        # BeautifulSoup accepts the file-like urlopen response directly
        self.new = BeautifulSoup(urllib2.urlopen(self.url))
        # close the local file explicitly instead of leaking the
        # descriptor (the original rebound the name before closing)
        local_fp = open(self.path)
        try:
            self.local = BeautifulSoup(local_fp.read())
        finally:
            local_fp.close()

    def isChanged(self):
        # identical parse trees -> definitely unchanged
        if self.new == self.local:
            return False
        # pages whose only content is the "no announcements" notice
        # (連絡事項はありません) are also treated as unchanged
        return False if unicode(str(self.new), "utf-8").find(u"連絡事項はありません") != -1 else True

    def sync(self):
        """Write the freshly fetched page to disk.

        NOTE(review): writes to a fixed "index.html" in the working
        directory, not self.path — confirm that is intended.
        """
        self.fp = open("index.html", "w")
        try:
            self.fp.write(str(self.new))
        finally:
            # always release the handle, even if the write fails
            self.fp.close()
def get_shared_books(member_a, member_b): """(str, str)->str dsc: get numbers of shared books for between two users using LT catalog >>>get_shared_books('Des2', 'Jon.Roemer') 43 """ print 'Retrieving data to compare %s and %s...' % (member_a, member_b) base = 'http://www.librarything.com/' url = 'catalog_bottom.php?view=%s&compare=%s' % (member_a, member_b) try: html = urlopen(base+url) html = BeautifulSoup(html.read()) if html.find('td', attrs={'class': 'pbGroup'}): text = html.find('td', attrs={'class': 'pbGroup'}).text if re.search('(?<=of) \d*', text): return re.search('(?<=of) \d*', text).group(0)[1:] except HTTPError, err: if err.code == 404: log("Page not found!", 'Error') elif err.code == 403: log("Access denied!", 'Error') else: log("Error "+str(err.code), 'Error')
from BeautifulSoup import BeautifulSoup import urllib2 import re stuff = urllib2.urlopen("http://ci.bukkit.org/job/dev-CraftBukkit/lastSuccessfulBuild/") stuff = BeautifulSoup(stuff.read()) matches = re.search("Build #(\d+)", stuff.h1.contents[1]) print matches.group(1)
# Response body Response = BeautifulSoup(Response.read()) for elem in Response(text=Pattern): File.write(Link + " | Version | " + str(elem.parent) + "\n") print(Link + " | Version | " + str(elem.parent)) except Exception, e: File.write(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e) + "\n") print(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e)) try: # USe HTTP/3.0 httplib.HTTPSConnection._http_vsn_str = 'HTTP/3.0' Response = opener.open(Link) # Response body Response = BeautifulSoup(Response.read()) for elem in Response(text=Pattern): File.write(Link + " | Version | " + str(elem.parent) + "\n") print(Link + " | Version | " + str(elem.parent)) except Exception, e: File.write(Link + " | ChangeProtocol (HTTP/3.0)| " + str(e) + "\n") print(Link + " | ChangeProtocol (HTTP/3.0)| " + str(e)) def AddHeaders(Link): try: Headers.update({ 'content-type': 'application', 'X-Forwarded-For': 'xxxxxx' }) if "http" not in Link:
""" print 'Retrieving isbn for %s...' % title url = 'http://www.librarything.com/api/thingTitle/' title = title.encode('utf-8') try: xml = urlopen(url+title) except HTTPError, err: if err.code == 404: log("Page not found!", 'Error') elif err.code == 403: log("Access denied!", 'Error') else: log("Error "+str(err.code), 'Error') except URLError, err: log(str(err.reason), 'Error') xml = BeautifulSoup(xml.read()) if xml.find('isbn'): return xml.find('isbn').text else: log('No isbn found for '+str(title)) return get_work_title_retry(title) def get_work_title_retry(title): """(str)->str dsc: find workid from given title >>>get_work_title_retry('Freckle Juice')[:-1] '17' """ print 'Retrieving isbn for %s...' % title key = 'ba4a76cea44a763da0317089b6b4c103'
linkblock = aptbuilding.find("a", {"class": "permalink"}) apartmentname = linkblock["title"].encode("utf-8") link = linkblock["href"].encode("utf-8") bdrmsandprice = ( aptbuilding.find("div", {"class": "unit size3of5"}) .a.string.encode("utf-8") .replace(" ", "") .replace("\n", "") .replace("\t", "") ) split = bdrmsandprice.split(" | ") bedrooms = split[0] price = split[1] aptpage = urllib2.build_opener() aptpage = aptpage.open(link) aptpage = BeautifulSoup(aptpage.read()) locationinfo = aptpage.find("div", {"id": "addressForMap"}) lat = locationinfo.find("span", {"class": "lat hide"}).string.encode("utf-8") lon = locationinfo.find("span", {"class": "lon hide"}).string.encode("utf-8") address = locationinfo.find("span", {"class": "dBlock street-address"}).string.encode("utf-8") city = locationinfo.find("span", {"class": "locality"}).string.replace(",", "").encode("utf-8") state = locationinfo.find("span", {"class": "region"}).string.encode("utf-8") zipcode = locationinfo.find("span", {"class": "postal-code"}).string.encode("utf-8") except TypeError: pass try: matchedlist.append( ApartmentBuilding(apartmentname, link, bedrooms, price, lat, lon, address, city, state, zipcode) ) except NameError: