def importHTML(self, rawhtml, url = ''):
    """Populate this object by scraping one or more chunks of HTML.

    rawhtml : A string, or a list/tuple of strings. The thread-level
              fields (id, url, forum, title, numpages) are scraped
              from the first chunk only; posts are scraped from
              every chunk.
    url     : Optional. Specify the URL explicitly in situations
              where it is known. Many vB installations use only
              relative links, so it can be hard to discover a URL
              from the markup. When empty, the URL is scraped from
              the first chunk instead.

    Raises IndexError if rawhtml is an empty sequence, because there
    is then no first chunk to read the thread-level fields from.
    """
    # Normalise the input to a list of cleaned chunks. isinstance()
    # is used instead of an exact type comparison so that list
    # subclasses and tuples are also treated as a sequence of chunks
    # rather than being handed to cleanEncoding() as a single value.
    if isinstance(rawhtml, (list, tuple)):
        html = [vbutils.cleanEncoding(h) for h in rawhtml]
    else:
        html = [vbutils.cleanEncoding(rawhtml)]

    first = html[0]

    self.id = vbscrape.scrapeThreadID(first)

    # An explicitly supplied URL always wins over a scraped one.
    if url:
        self.url = url
    else:
        self.url = vbscrape.scrapeThreadURL(self.id, first)

    self.forum = vbutils.makeSlug(vbscrape.scrapeForumName(first))
    self.title = vbutils.makeSlug(vbscrape.scrapeThreadTitle(first))
    self.numpages = vbscrape.scrapeNumPages(first)

    # Merge the posts found in every chunk into a single dict.
    # NOTE(review): assumes scrapePosts() returns a mapping; with
    # dict.update() a key seen in a later chunk replaces an earlier one.
    self.post = {}
    for chunk in html:
        self.post.update(vbscrape.scrapePosts(chunk))
def importJSON(self, jsondata):
    """Populate this object from a string of JSON data.

    jsondata : A string of JSON, or a list of strings which are
               concatenated before parsing.

    Returns None. If the input cannot be parsed as JSON, an error
    message is printed and the object is left unmodified.

    Raises KeyError if the parsed data lacks one of the expected
    keys ("post", "lastupdate", "forum", "id", "numpages", "title",
    "url").
    """
    # ''.join is used to accommodate list input
    clean = vbutils.cleanEncoding(''.join(jsondata), isHTML = False)

    # Try to load JSON data from the input str. json.loads raises
    # ValueError for malformed JSON and TypeError for non-string
    # input; only those are treated as "no JSON found". Anything
    # else (including KeyboardInterrupt) is allowed to propagate
    # instead of being swallowed by a bare except.
    try:
        j = json.loads(clean)
    except (TypeError, ValueError):
        print("Error: Could not find JSON data.")
        return None

    # Loop over posts creating Post objs
    self.post = {}
    print("Found %s posts." % len(j["post"]))
    for post_id, p in j["post"].iteritems():
        print("Importing Post #%s ..." % str(post_id))
        if isinstance(p, dict):
            # Keyword args must be str
            # but our JSON dict has unicode keys
            # so we must convert before passing
            # into the Post.__init__() method
            kw = vbutils.convertKeysToStr(p)
            self.post[post_id] = vbpost.Post(**kw)
        elif isinstance(p, (str, unicode)):
            # The post was stored as a JSON string of its own; hand
            # it to Post via the jsonstr keyword.
            self.post[post_id] = vbpost.Post(jsonstr = p)
        # Values of any other type are skipped.

    self.lastupdate = j["lastupdate"]
    self.forum = j["forum"]
    self.id = j["id"]
    self.numpages = j["numpages"]
    self.title = j["title"]
    self.url = j["url"]
def importHTML(self, rawhtml):
    """Fill in this object's fields by scraping a chunk of HTML.

    rawhtml is first passed through vbutils.cleanEncoding(); every
    field is then extracted from the cleaned markup by the matching
    vbscrape.scrapePost* helper.
    """
    # TODO clean up the HTML
    # Encoding is normalised first (to UTF-8, per the original note,
    # because JSON uses UTF-8).
    markup = vbutils.cleanEncoding(rawhtml)

    # These three are explicitly coerced to int. The post count
    # lookup needs the post id, so the id must be set first.
    self.id = int(vbscrape.scrapePostID(markup))
    self.postcount = int(vbscrape.scrapePostCount(markup, self.id))
    self.authorid = int(vbscrape.scrapePostAuthorID(markup))

    # The remaining fields are stored exactly as the scraper
    # returns them, in this order.
    passthrough = (
        ("permalink", vbscrape.scrapePostPermalink),
        ("dateposted", vbscrape.scrapePostDate),
        ("title", vbscrape.scrapePostTitle),
        ("message", vbscrape.scrapePostMessage),
        ("sig", vbscrape.scrapePostSig),
        ("editnote", vbscrape.scrapePostEditNote),
    )
    for attr, scrape in passthrough:
        setattr(self, attr, scrape(markup))
def importJSON(self, jsondata):
    """Populate this object from a JSON string.

    jsondata : A string of JSON, or a list of strings which are
               concatenated before parsing.

    Returns None. If the input cannot be parsed as JSON, an error
    message is printed and the object is left unmodified.

    Raises KeyError if the parsed data lacks one of the expected
    keys ("permalink", "id", "postcount", "dateposted", "title",
    "authorid", "message", "sig", "editnote").
    """
    # ''.join is used to accommodate list input
    clean = vbutils.cleanEncoding(''.join(jsondata), isHTML = False)

    # Try to load JSON data from the input str. Malformed JSON makes
    # json.loads raise ValueError (TypeError is only raised for
    # non-string input), so both must be caught for the error message
    # below to be reachable on bad data.
    try:
        j = json.loads(clean)
    except (TypeError, ValueError):
        print("Error: Could not find JSON data.")
        return None

    self.permalink = j["permalink"]
    self.id = j["id"]
    self.postcount = j["postcount"]
    self.dateposted = j["dateposted"]
    self.title = j["title"]
    self.authorid = j["authorid"]
    self.message = j["message"]
    self.sig = j["sig"]
    self.editnote = j["editnote"]