def get_reviews(work):
    """(str)->list

    dsc: get all reviews of given book work id. result includes user
    name of reviewer, rank he gave to work and the main text of the
    review
    """
    # offset=0 / showCount=10000 asks the AJAX endpoint for every review
    # in one page; the embedded newline is stripped below.
    url = """http://www.librarything.com/ajax_profilereviews.php?offset=0
&type=3&showCount=10000&workid=%s&languagePick=en&mode=profile""" % work
    url = url.replace('\n', '')
    result = []
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
        reviews = html.findAll('div', attrs={'class': 'bookReview'})
        if reviews:
            for review in reviews:
                text = review.find('div', attrs={'class': 'commentText'}).text
                text = text.encode('utf-8')
                cntl_itm = review.find('span', attrs={'class': 'controlItems'})
                user = cntl_itm.find('a').text
                rv_lnk = review.find('span', attrs={'class': 'rating'})
                if rv_lnk:
                    # the star rating is encoded in the image file name,
                    # e.g. ".../ss8.gif" -> rank "8"
                    rv_txt = rv_lnk.find('img')['src']
                    # raw string + escaped dot: '\d' in a plain string is an
                    # invalid escape, and '.' would match any character
                    rank = re.search(r'ss(\d+)\.gif', rv_txt).group(1)
                else:
                    rank = 'NA'
                result.append({'name': user, 'text': text, 'rank': rank})
    except HTTPError as err:
        # 'as' form works on Python 2.6+ and 3.x alike
        log("Error # " + str(err.code))
    return result
def get_all_tag_name(name):
    """(str)->dic

    dsc: get all tags and counts for given book

    NOTE(review): the parsed page is never returned or stored, so as
    written this only verifies the tag-cloud page loads — confirm
    whether a return value was intended (docstring promises a dict).
    """
    url = 'http://www.librarything.com/tagcloud.php?view=%s' % name
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
    except HTTPError as err:
        # 'as' form replaces the Python-2-only 'except X, e' syntax
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error " + str(err.code), 'Error')
def ChangeProtocol(Link):
    """Probe *Link* using a deliberately malformed HTTP version token.

    Forces the connection to announce 'HTPT/1.0' (sic — an intentionally
    invalid protocol string, not a typo) and records any server-version
    banner matched by the module-level Pattern, writing hits and errors
    to the module-level File.
    """
    try:
        if "http" not in Link:
            Link = "https://" + Link + "/"
        # Deliberately malformed version token ("HTPT", not "HTTP") to
        # see how the server reacts to a bad request line.
        httplib.HTTPSConnection._http_vsn_str = 'HTPT/1.0'
        Response = opener.open(Link)
        # Response body
        Response = BeautifulSoup(Response.read())
        for elem in Response(text=Pattern):
            File.write(Link + " | Version | " + str(elem.parent) + "\n")
            print(Link + " | Version | " + str(elem.parent))
    except Exception as e:
        # broad catch is intentional here: a probe failure is itself a
        # result worth logging, and scanning must continue
        File.write(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e) + "\n")
        print(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e))
def get_all_tag_work(work):
    """(str)->dic

    dsc: get all tags and counts for given book in a dictionary

    NOTE(review): the parsed page is never returned or stored, so as
    written this only verifies the page loads — confirm whether a
    return value was intended (docstring promises a dict).
    """
    # embedded newline in the triple-quoted URL is stripped below
    url = """http://www.librarything.com/ajaxinc_showbooktags.php?work=%s
&all=1&print=1&doit=1&lang=en""" % work
    url = url.replace("\n", "")
    try:
        html = urlopen(url)
        html = BeautifulSoup(html.read())
    except HTTPError as err:
        # 'as' form replaces the Python-2-only 'except X, e' syntax
        if err.code == 404:
            log("Page not found!", 'Error')
        elif err.code == 403:
            log("Access denied!", 'Error')
        else:
            log("Error " + str(err.code), 'Error')
class PageComparer:
    """Compare a live web page against a locally cached copy.

    self.new   -- BeautifulSoup parse of the page fetched from url
    self.local -- BeautifulSoup parse of the cached file at path
    """

    def __init__(self, url, path):
        self.url = url
        self.path = path
        # BeautifulSoup accepts the file-like urlopen response directly
        self.new = BeautifulSoup(urllib2.urlopen(self.url))
        # close the local file explicitly instead of leaking the
        # descriptor (the original rebound the name before closing)
        local_fp = open(self.path)
        try:
            self.local = BeautifulSoup(local_fp.read())
        finally:
            local_fp.close()

    def isChanged(self):
        # identical parse trees -> definitely unchanged
        if self.new == self.local:
            return False
        # pages whose only content is the "no announcements" notice
        # (連絡事項はありません) are also treated as unchanged
        return False if unicode(str(self.new), "utf-8").find(u"連絡事項はありません") != -1 else True

    def sync(self):
        """Write the freshly fetched page to disk.

        NOTE(review): writes to a fixed "index.html" in the working
        directory, not self.path — confirm that is intended.
        """
        self.fp = open("index.html", "w")
        try:
            self.fp.write(str(self.new))
        finally:
            # always release the handle, even if the write fails
            self.fp.close()
def get_shared_books(member_a, member_b): """(str, str)->str dsc: get numbers of shared books for between two users using LT catalog >>>get_shared_books('Des2', 'Jon.Roemer') 43 """ print 'Retrieving data to compare %s and %s...' % (member_a, member_b) base = 'http://www.librarything.com/' url = 'catalog_bottom.php?view=%s&compare=%s' % (member_a, member_b) try: html = urlopen(base+url) html = BeautifulSoup(html.read()) if html.find('td', attrs={'class': 'pbGroup'}): text = html.find('td', attrs={'class': 'pbGroup'}).text if re.search('(?<=of) \d*', text): return re.search('(?<=of) \d*', text).group(0)[1:] except HTTPError, err: if err.code == 404: log("Page not found!", 'Error') elif err.code == 403: log("Access denied!", 'Error') else: log("Error "+str(err.code), 'Error')
from BeautifulSoup import BeautifulSoup import urllib2 import re stuff = urllib2.urlopen("http://ci.bukkit.org/job/dev-CraftBukkit/lastSuccessfulBuild/") stuff = BeautifulSoup(stuff.read()) matches = re.search("Build #(\d+)", stuff.h1.contents[1]) print matches.group(1)
# Response body Response = BeautifulSoup(Response.read()) for elem in Response(text=Pattern): File.write(Link + " | Version | " + str(elem.parent) + "\n") print(Link + " | Version | " + str(elem.parent)) except Exception, e: File.write(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e) + "\n") print(Link + " | ChangeProtocol (HTPT/1.0)| " + str(e)) try: # USe HTTP/3.0 httplib.HTTPSConnection._http_vsn_str = 'HTTP/3.0' Response = opener.open(Link) # Response body Response = BeautifulSoup(Response.read()) for elem in Response(text=Pattern): File.write(Link + " | Version | " + str(elem.parent) + "\n") print(Link + " | Version | " + str(elem.parent)) except Exception, e: File.write(Link + " | ChangeProtocol (HTTP/3.0)| " + str(e) + "\n") print(Link + " | ChangeProtocol (HTTP/3.0)| " + str(e)) def AddHeaders(Link): try: Headers.update({ 'content-type': 'application', 'X-Forwarded-For': 'xxxxxx' }) if "http" not in Link:
""" print 'Retrieving isbn for %s...' % title url = 'http://www.librarything.com/api/thingTitle/' title = title.encode('utf-8') try: xml = urlopen(url+title) except HTTPError, err: if err.code == 404: log("Page not found!", 'Error') elif err.code == 403: log("Access denied!", 'Error') else: log("Error "+str(err.code), 'Error') except URLError, err: log(str(err.reason), 'Error') xml = BeautifulSoup(xml.read()) if xml.find('isbn'): return xml.find('isbn').text else: log('No isbn found for '+str(title)) return get_work_title_retry(title) def get_work_title_retry(title): """(str)->str dsc: find workid from given title >>>get_work_title_retry('Freckle Juice')[:-1] '17' """ print 'Retrieving isbn for %s...' % title key = 'ba4a76cea44a763da0317089b6b4c103'
linkblock = aptbuilding.find("a", {"class": "permalink"}) apartmentname = linkblock["title"].encode("utf-8") link = linkblock["href"].encode("utf-8") bdrmsandprice = ( aptbuilding.find("div", {"class": "unit size3of5"}) .a.string.encode("utf-8") .replace(" ", "") .replace("\n", "") .replace("\t", "") ) split = bdrmsandprice.split(" | ") bedrooms = split[0] price = split[1] aptpage = urllib2.build_opener() aptpage = aptpage.open(link) aptpage = BeautifulSoup(aptpage.read()) locationinfo = aptpage.find("div", {"id": "addressForMap"}) lat = locationinfo.find("span", {"class": "lat hide"}).string.encode("utf-8") lon = locationinfo.find("span", {"class": "lon hide"}).string.encode("utf-8") address = locationinfo.find("span", {"class": "dBlock street-address"}).string.encode("utf-8") city = locationinfo.find("span", {"class": "locality"}).string.replace(",", "").encode("utf-8") state = locationinfo.find("span", {"class": "region"}).string.encode("utf-8") zipcode = locationinfo.find("span", {"class": "postal-code"}).string.encode("utf-8") except TypeError: pass try: matchedlist.append( ApartmentBuilding(apartmentname, link, bedrooms, price, lat, lon, address, city, state, zipcode) ) except NameError: