Example #1
def get_reviews(work):
    """(str)->list
    dsc: get all reviews of given book work id. result includes user name of
    reviewer, ranke he gave to work and the main text of the review
    """
    url = """http://www.librarything.com/ajax_profilereviews.php?offset=0
&type=3&showCount=10000&workid=%s&languagePick=en&mode=profile""" % work
    url = url.replace('\n', '')
    result = []
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
        reviews = html.findAll('div', attrs={'class': 'bookReview'})
        if reviews:
            for review in reviews:
                text = review.find('div', attrs={'class': 'commentText'}).text
                text = text.encode('utf-8')
                cntl_itm = review.find('span', attrs={'class': 'controlItems'})
                user = cntl_itm.find('a').text
                rv_lnk = review.find('span', attrs={'class': 'rating'})
                if rv_lnk:
                    rv_txt = rv_lnk.find('img')['src']
                    rank = re.search(r'ss(\d+)\.gif', rv_txt).group(1)
                else:
                    rank = 'NA'
                result.append({'name': user, 'text': text, 'rank': rank})
    except HTTPError, err:
        log("Error # "+str(err.code))
    return result
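These excerpts omit their module-level setup. A minimal sketch of the imports
the LibraryThing helpers in these examples appear to assume (Python 2,
BeautifulSoup 3; the log helper is the author's own, stubbed here with an
assumed signature):

import re
from urllib2 import urlopen, HTTPError, URLError
from BeautifulSoup import BeautifulSoup

def log(message, level='Info'):
    # stand-in for the author's logger; assumed signature log(message, level)
    print '[%s] %s' % (level, message)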
Example #2
def get_all_tag_name(name):
    """(str)->dic
    dsc: get all tags and counts for given book
    """
    url = 'http://www.librarything.com/tagcloud.php?view=%s' % name
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
    except HTTPError, err:
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error "+str(err.code), 'Error')
Example #3
def ChangeProtocol(Link):
    try:
        if "http" not in Link:
            Link = "https://" + Link + "/"
        # Use HTPT/1.0 (the malformed version token appears deliberate; the
        # same file later probes with HTTP/3.0 in the same way)
        httplib.HTTPSConnection._http_vsn_str = 'HTPT/1.0'
        Response = opener.open(Link)

        # Response body
        Response = BeautifulSoup(Response.read())
        for elem in Response(text=Pattern):
            File.write(Link + " | Version | " + str(elem.parent) + "\n")
            print(Link + " | Version | " + str(elem.parent))
    except Exception, e:
        File.write(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e) + "\n")
        print(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e))
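ChangeProtocol works by monkey-patching a class attribute: _http_vsn_str is
the version token httplib writes into the request line, so after the
assignment every request made through that connection class goes out as e.g.
"GET / HTPT/1.0". A standalone sketch of the same trick (example.com is a
placeholder; servers will often reject the malformed token, which is
presumably the point of the probe):

import httplib
import urllib2

# Patch only the string token; the companion _http_vsn integer is left
# untouched, so httplib still behaves as HTTP/1.1 internally.
httplib.HTTPSConnection._http_vsn_str = 'HTPT/1.0'
response = urllib2.urlopen('https://example.com/')
print response.getcode()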
Example #4
def get_all_tag_work(work):
    """(str)->dic
    dsc: get all tags and counts for given book in a dictionary
    """
    url = """http://www.librarything.com/ajaxinc_showbooktags.php?work=%s
&all=1&print=1&doit=1&lang=en""" % work
    url = url.replace("\n", "")
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
    except HTTPError, err:
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error "+str(err.code), 'Error')
Example #5
class PageComparer:
    def __init__(self, url, path):
        self.url = url
        self.path = path
        self.new = BeautifulSoup(urllib2.urlopen(self.url))
        self.local = open(self.path)
        self.local = BeautifulSoup(self.local.read())

    def isChanged(self):
        if self.new == self.local:
            return False

        # The page is treated as unchanged while it still shows the
        # "no announcements" notice (連絡事項はありません)
        return False if unicode(str(self.new), "utf-8").find(u"連絡事項はありません") != -1 else True

    def sync(self):
        # Refresh the local snapshot so the next comparison uses this version
        self.fp = open(self.path, "w")
        self.fp.write(str(self.new))
        self.fp.close()
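A hypothetical usage of PageComparer (URL and filename are placeholders, and
urllib2 plus the BeautifulSoup import sketched under Example #1 are assumed):
poll a notice page and refresh the local snapshot whenever it changes.

comparer = PageComparer('http://example.com/notices.html', 'index.html')
if comparer.isChanged():
    comparer.sync()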
Example #6
def get_shared_books(member_a, member_b):
    """(str, str)->str
    dsc: get numbers of shared books for between two users using LT catalog
    >>>get_shared_books('Des2', 'Jon.Roemer')
    43
    """
    print 'Retrieving data to compare %s and %s...' % (member_a, member_b)
    base = 'http://www.librarything.com/'
    url = 'catalog_bottom.php?view=%s&compare=%s' % (member_a, member_b)
    try:
        html = urlopen(base+url)
        html = BeautifulSoup(html.read())
        group = html.find('td', attrs={'class': 'pbGroup'})
        if group:
            match = re.search(r'(?<=of )\d+', group.text)
            if match:
                return match.group(0)
    except HTTPError, err:
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error "+str(err.code), 'Error')
Example #7
from BeautifulSoup import BeautifulSoup
import urllib2
import re

stuff = urllib2.urlopen("http://ci.bukkit.org/job/dev-CraftBukkit/lastSuccessfulBuild/")
stuff = BeautifulSoup(stuff.read())

matches = re.search(r"Build #(\d+)", stuff.h1.contents[1])
print matches.group(1)
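re.search returns None when the heading does not match, so matches.group(1)
would raise AttributeError on an unexpected page. A slightly more defensive
ending for the script above:

matches = re.search(r"Build #(\d+)", stuff.h1.contents[1])
if matches:
    print matches.group(1)
else:
    print "No build number found in the page heading"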
Example #8
        # (continuation of ChangeProtocol from Example #3)
        # Response body
        Response = BeautifulSoup(Response.read())
        for elem in Response(text=Pattern):
            File.write(Link + " | Version | " + str(elem.parent) + "\n")
            print(Link + " | Version | " + str(elem.parent))
    except Exception, e:
        File.write(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e) + "\n")
        print(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e))
    try:
        # Use HTTP/3.0
        httplib.HTTPSConnection._http_vsn_str = 'HTTP/3.0'
        Response = opener.open(Link)

        # Response body
        Response = BeautifulSoup(Response.read())
        for elem in Response(text=Pattern):
            File.write(Link + " | Version | " + str(elem.parent) + "\n")
            print(Link + " | Version | " + str(elem.parent))
    except Exception, e:
        File.write(Link + " | ChangeProtocol (HTTP/3.0)| " + str(e) + "\n")
        print(Link + " | ChangeProtocol (HTTP/3.0)| " + str(e))


def AddHeaders(Link):
    try:
        Headers.update({
            'content-type': 'application',
            'X-Forwarded-For': 'xxxxxx'
        })
        if "http" not in Link:
Example #9
    """
    print 'Retrieving isbn for %s...' % title
    url = 'http://www.librarything.com/api/thingTitle/'
    title = title.encode('utf-8')
    try:
        xml = urlopen(url+title)
    except HTTPError, err:
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error "+str(err.code), 'Error')
        return None
    except URLError, err:
        log(str(err.reason), 'Error')
        return None
    xml = BeautifulSoup(xml.read())
    if xml.find('isbn'):
        return xml.find('isbn').text
    else:
        log('No isbn found for '+str(title))
        return get_work_title_retry(title)


def get_work_title_retry(title):
    """(str)->str
    dsc: find workid from given title
    >>>get_work_title_retry('Freckle Juice')[:-1]
    '17'
    """
    print 'Retrieving isbn for %s...' % title
    key = 'ba4a76cea44a763da0317089b6b4c103'
Example #10
                linkblock = aptbuilding.find("a", {"class": "permalink"})
                apartmentname = linkblock["title"].encode("utf-8")
                link = linkblock["href"].encode("utf-8")
                bdrmsandprice = (
                    aptbuilding.find("div", {"class": "unit size3of5"})
                    .a.string.encode("utf-8")
                    .replace("  ", "")
                    .replace("\n", "")
                    .replace("\t", "")
                )
                split = bdrmsandprice.split(" | ")
                bedrooms = split[0]
                price = split[1]
                aptpage = urllib2.build_opener()
                aptpage = aptpage.open(link)
                aptpage = BeautifulSoup(aptpage.read())
                locationinfo = aptpage.find("div", {"id": "addressForMap"})
                lat = locationinfo.find("span", {"class": "lat hide"}).string.encode("utf-8")
                lon = locationinfo.find("span", {"class": "lon hide"}).string.encode("utf-8")
                address = locationinfo.find("span", {"class": "dBlock street-address"}).string.encode("utf-8")
                city = locationinfo.find("span", {"class": "locality"}).string.replace(",", "").encode("utf-8")
                state = locationinfo.find("span", {"class": "region"}).string.encode("utf-8")
                zipcode = locationinfo.find("span", {"class": "postal-code"}).string.encode("utf-8")
            except TypeError:
                pass

            try:
                matchedlist.append(
                    ApartmentBuilding(apartmentname, link, bedrooms, price, lat, lon, address, city, state, zipcode)
                )
            except NameError: