def updateUbuWebDBOld(self):
    """Deprecated crawler: scrape the UbuWeb film index and populate the
    names/films tables directly.

    NOTE(review): superseded by parseFilmListingPage/parseNamePage
    elsewhere in this file; kept for reference. Performs network I/O
    (urllib2) and writes through self.db; assumes self.BASE and
    self.fileRE are initialised elsewhere -- confirm before reviving.
    """
    # Open Ubuweb film page
    req = urllib2.Request("http://www.ubu.com/film/")
    response = urllib2.urlopen(req)
    result = response.read()
    response.close()
    soup = BS.BeautifulSoup(result)
    # Artist links live in the second <table> of the index page.
    links = soup.findAll("table")[1].findAll("a")
    # Deliberately limited to a small slice (links 1-4) in this old run.
    fewerLinks = links[1:5]
    c = self.db.cursor()
    for link in fewerLinks:
        name = link.text
        # SHA-1 hex digest of the artist name keys the names/films rows.
        nameHash = sha1.sha(name).hexdigest()
        # Strip the leading "./" from the relative href.
        href = link["href"][2:]
        c.execute('insert into names (name, hash, link) values (?,?,?)',
                  (unescape(name), nameHash, href))
        self.db.commit()
        # Open up new requests to load film links
        req = urllib2.Request(self.BASE + href)
        response = urllib2.urlopen(req)
        result = response.read()
        response.close()
        nameSoup = BS.BeautifulSoup(result)
        print "Working on %s" % (self.BASE + href)
        # Film anchors follow the <img> tags inside the third <font> of
        # the second table -- presumably UbuWeb's markup at the time;
        # verify against the current page layout before reuse.
        potentialFilmLinks = nameSoup.findAll("table")[1].findAll(
            "font")[2].findAll("img")
        for potentialFilmLink in potentialFilmLinks:
            a = potentialFilmLink.findNext()
            name = a.text
            potentialHref = a["href"]
            print "Working on %s" % (self.BASE + potentialHref)
            filmRequest = urllib2.Request(self.BASE + potentialHref)
            try:
                filmResponse = urllib2.urlopen(filmRequest)
            except urllib2.HTTPError:
                # Dead film page: skip it and keep crawling.
                continue
            filmResult = filmResponse.read()
            filmResponse.close()
            filmSoup = BS.BeautifulSoup(filmResult)
            # The downloadable file URL is embedded in an inline
            # <script>; self.fileRE extracts the candidate link(s).
            for s in filmSoup.findAll("script"):
                r = self.fileRE.findall(s.text)
                if r != []:
                    c.execute(
                        "insert into films (hash, title, link) values (?,?,?)",
                        (nameHash, unescape(name), r[0]))
            #filmLink = self.fileRE.findall(filmSoup.findAll("script")[2].text)[0]
        self.db.commit()
def updateUbuWebDBOld(self):
    """Deprecated crawler: scrape the UbuWeb film index and populate the
    names/films tables directly.

    NOTE(review): this is a duplicate definition of updateUbuWebDBOld in
    the same file (the later def wins at class-creation time) and is
    superseded by parseFilmListingPage/parseNamePage. Performs network
    I/O (urllib2) and writes through self.db; assumes self.BASE and
    self.fileRE are initialised elsewhere -- confirm before reviving.
    """
    # Open Ubuweb film page
    req = urllib2.Request("http://www.ubu.com/film/")
    response = urllib2.urlopen(req)
    result = response.read()
    response.close()
    soup = BS.BeautifulSoup(result)
    # Artist links live in the second <table> of the index page.
    links = soup.findAll("table")[1].findAll("a")
    # Deliberately limited to a small slice (links 1-4) in this old run.
    fewerLinks = links[1:5]
    c = self.db.cursor()
    for link in fewerLinks:
        name = link.text
        # SHA-1 hex digest of the artist name keys the names/films rows.
        nameHash = sha1.sha(name).hexdigest()
        # Strip the leading "./" from the relative href.
        href = link["href"][2:]
        c.execute("insert into names (name, hash, link) values (?,?,?)",
                  (unescape(name), nameHash, href))
        self.db.commit()
        # Open up new requests to load film links
        req = urllib2.Request(self.BASE + href)
        response = urllib2.urlopen(req)
        result = response.read()
        response.close()
        nameSoup = BS.BeautifulSoup(result)
        print "Working on %s" % (self.BASE + href)
        # Film anchors follow the <img> tags inside the third <font> of
        # the second table -- presumably UbuWeb's markup at the time;
        # verify against the current page layout before reuse.
        potentialFilmLinks = nameSoup.findAll("table")[1].findAll("font")[2].findAll("img")
        for potentialFilmLink in potentialFilmLinks:
            a = potentialFilmLink.findNext()
            name = a.text
            potentialHref = a["href"]
            print "Working on %s" % (self.BASE + potentialHref)
            filmRequest = urllib2.Request(self.BASE + potentialHref)
            try:
                filmResponse = urllib2.urlopen(filmRequest)
            except urllib2.HTTPError:
                # Dead film page: skip it and keep crawling.
                continue
            filmResult = filmResponse.read()
            filmResponse.close()
            filmSoup = BS.BeautifulSoup(filmResult)
            # The downloadable file URL is embedded in an inline
            # <script>; self.fileRE extracts the candidate link(s).
            for s in filmSoup.findAll("script"):
                r = self.fileRE.findall(s.text)
                if r != []:
                    c.execute(
                        "insert into films (hash, title, link) values (?,?,?)",
                        (nameHash, unescape(name), r[0])
                    )
            # filmLink = self.fileRE.findall(filmSoup.findAll("script")[2].text)[0]
        self.db.commit()
def __init__(self, **kw):
    """Build a subscription record from keyword arguments.

    Required keys (popped from *kw*, KeyError if absent): email,
    notif_type, lang, location. Optional keys (None when absent):
    first_name, last_name, organisation, country. Any leftover keys
    are copied verbatim onto the instance, then a pseudo-random
    confirmation key and a creation timestamp are attached.

    @todo: Make a heartbeat to clean-up temporary subscriptions
    """
    # Mandatory fields: pop() raises KeyError when one is missing.
    for field in ("email", "notif_type", "lang", "location"):
        setattr(self, field, kw.pop(field))
    # Optional fields: left in kw, so __dict__.update re-sets the
    # same values harmlessly below.
    for field in ("first_name", "last_name", "organisation", "country"):
        setattr(self, field, kw.get(field))
    # Confirmation token: SHA digest of time + a small random number.
    # NOTE(review): not cryptographically strong.
    self.key = sha("%s%s" % (time(), random.randrange(1, 10000))).hexdigest()
    self.__dict__.update(kw)
    self.datetime = datetime.now()
def parseFilmListingPage(self, filmPage="http://www.ubu.com/film", numLinks=10, startLink=1): # Open Ubuweb film page req = urllib2.Request("http://www.ubu.com/film/") response = urllib2.urlopen(req) result = response.read() response.close() soup = BS.BeautifulSoup(result) links = soup.findAll("table")[1].findAll("a") # Select a subset (or all) if numLinks is not None: totalLinks = links[startLink:(startLink + numLinks)] else: totalLinks = links[startLink:] c = self.db.cursor() currentLink = startLink for link in totalLinks: print "Working on link %d" % currentLink name = link.text nameHash = sha1.sha(name).hexdigest() nameLink = link["href"][2:] result = self.parseNamePage(self.BASE + nameLink) if (result is not None): c.execute( 'insert into names (name, hash, link, comments) values (?,?,?,?)', (unescape(name), nameHash, nameLink, result["comments"])) self.db.commit() for film in result["allFilms"]: c.execute( "insert into Films(hash, title, link, originalLink, comments) values (?, ?, ?, ?, ?)", (nameHash, film["filmName"], film["link"], film["originalLink"], film["comments"])) # Sleep for a bit to cutdown on usage sleepTime = random.randrange(5, 10) print "Sleeping for %d" % sleepTime time.sleep(sleepTime) currentLink += 1 self.db.commit() c.close()
def __init__(self, **kw):
    """Build a subscription record from keyword arguments.

    Required keys (popped from *kw*, KeyError if absent): email,
    notif_type, lang, content_types, location. Optional keys (None
    when absent): first_name, last_name, organisation, country. Any
    leftover keys are copied verbatim onto the instance, then a
    pseudo-random confirmation key and a creation timestamp are
    attached.

    @todo: Make a heartbeat to clean-up temporary subscriptions
    """
    # Mandatory fields: pop() raises KeyError when one is missing.
    for field in ("email", "notif_type", "lang", "content_types", "location"):
        setattr(self, field, kw.pop(field))
    # Optional fields: left in kw, so __dict__.update re-sets the
    # same values harmlessly below.
    for field in ("first_name", "last_name", "organisation", "country"):
        setattr(self, field, kw.get(field))
    # Confirmation token: SHA digest of time + a small random number.
    # NOTE(review): not cryptographically strong.
    self.key = sha("%s%s" % (time(), random.randrange(1, 10000))).hexdigest()
    self.__dict__.update(kw)
    self.datetime = datetime.now()
def parseFilmListingPage(self, filmPage="http://www.ubu.com/film", numLinks=10, startLink=1): # Open Ubuweb film page req = urllib2.Request("http://www.ubu.com/film/") response = urllib2.urlopen(req) result = response.read() response.close() soup = BS.BeautifulSoup(result) links = soup.findAll("table")[1].findAll("a") # Select a subset (or all) if numLinks is not None: totalLinks = links[startLink : (startLink + numLinks)] else: totalLinks = links[startLink:] c = self.db.cursor() currentLink = startLink for link in totalLinks: print "Working on link %d" % currentLink name = link.text nameHash = sha1.sha(name).hexdigest() nameLink = link["href"][2:] result = self.parseNamePage(self.BASE + nameLink) if result is not None: c.execute( "insert into names (name, hash, link, comments) values (?,?,?,?)", (unescape(name), nameHash, nameLink, result["comments"]), ) self.db.commit() for film in result["allFilms"]: c.execute( "insert into Films(hash, title, link, originalLink, comments) values (?, ?, ?, ?, ?)", (nameHash, film["filmName"], film["link"], film["originalLink"], film["comments"]), ) # Sleep for a bit to cutdown on usage sleepTime = random.randrange(5, 10) print "Sleeping for %d" % sleepTime time.sleep(sleepTime) currentLink += 1 self.db.commit() c.close()
def hashAndBase64(s):
    """Digest *s* with the legacy ``sha`` module (SHA-1) and return the
    result encoded through stringToBase64."""
    digest = sha.sha(s).digest()
    return stringToBase64(digest)