def scrapeThreadURL(id, html):
    """Return the URL of thread ``id`` found in a str of HTML.

    Searches ``html`` for the first absolute ``http://`` link that contains
    ``showthread`` and a ``t=<id>`` query parameter.

    Args:
        id: Thread id (number or string); it is matched literally.
        html: Page source to search.

    Returns:
        The matched link, stripped and passed through vbutils.cleanURL(),
        or '' when no such link is present.
    """
    # TODO Some vB installation have only relative links
    # TODO Not sure how to get the URL in that case
    # TODO(review): only http:// links are recognised; confirm that
    # vbutils.cleanURL copes with https:// before widening the pattern.

    # re.escape: the id must match literally, not as regex syntax.
    # \b before "t=": do not match inside longer names such as "sort=".
    # (?!\d) after the id: thread 12 must not match a link to thread 123.
    thread_id = re.escape("%s" % id)
    pattern = r'http://[^\'"]*showthread[^\'"]*\bt=%s(?!\d)[^\'"]*' % thread_id
    m = re.search(pattern, html)
    if m:
        return vbutils.cleanURL(m.group(0).strip())
    return ''
def update(self, url = ''):
    """Download every page of the thread and import the HTML.

    When ``url`` is empty the URL already stored on the instance is used.
    The URL is run through vbutils.cleanURL() and stored back in
    ``self.url``; ``self.id``, ``self.numpages`` and ``self.lastupdate``
    are refreshed, and the list of fetched pages is handed to
    ``self.importHTML`` together with the URL.
    """
    # Single parenthesised argument: prints the same text whether print
    # is parsed as a statement or called as a function.
    self.url = vbutils.cleanURL(url or self.url)
    self.id = vbutils.findThreadID(self.url)

    print("Scraping %s ..." % self.url)
    first_page = getPage(self.url)
    pages = [first_page]

    # The page count is read from the first page.
    self.numpages = int(vbscrape.scrapeNumPages(first_page))
    print("Found %s pages." % self.numpages)

    # Page 1 is already in hand; fetch pages 2..numpages in order.
    total = self.numpages
    for page_number in range(2, total + 1):
        print("Scraping page %s of %s ..." % (page_number, total))
        pages.append(getPage(self.url, page_number))

    print("Importing data from HTML ...")
    self.importHTML(pages, self.url)

    self.lastupdate = vbutils.getDateTime()
    print("Thread update completed at %s" % self.lastupdate)
def init(argv):
    """Parse the command line and return the run-time parameters.

    Args:
        argv: Command-line arguments without the program name, in the
            form accepted by getopt.getopt().

    Returns:
        dict with the keys
            "utc"      -- 1 when -u/--utc was given, otherwise 0
            "localdir" -- argument of -l/--localdir with trailing slashes
                          stripped; '.' by default
            "platform" -- "windows" or "unix"
            "url"      -- the thread URL after vbutils.cleanURL()

    Exits via sys.exit(2) on a getopt error, a missing URL argument or a
    URL rejected by vbutils.isValidURL(); exits via sys.exit() after
    printing the usage message for -h/--help.
    """
    params = {
        "utc": 0,
        "localdir": '.',
    }

    # wget uses _platform for escaping illegal chrs in filenames
    # Equality, not substring membership: `x in "win32"` is also true for
    # "win", "32" or the empty string.
    if sys.platform == "win32":
        params["platform"] = "windows"
    else:
        params["platform"] = "unix"

    # params["useragent"] = "HTTP_USER_AGENT:Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.17) Gecko/2010010604 Ubuntu/9.04 (jaunty) Firefox/3.0.17"

    # Check the commandline for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "dhl:uvw",
            ["debug", "help", "localdir=", "utc", "verbose", "windows"])
    except getopt.GetoptError as err:
        # Raised for an unknown flag and also for an option that lacks its
        # required argument, so report getopt's own message.
        print("Error: %s" % err)
        usage()
        sys.exit(2)

    # Evaluate commandline options, arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--verbose"):
            pass    # accepted, currently without effect
        elif opt in ("-d", "--debug"):
            pass    # accepted, currently without effect
        elif opt in ("-u", "--utc"):
            params["utc"] = 1
        elif opt in ("-l", "--localdir"):
            # TODO need to validate the directory
            # strip trailing slashes
            params["localdir"] = arg.rstrip('/')
        elif opt in ("-w", "--windows"):
            params["platform"] = "windows"
        else:
            print("opt: %s, arg: %s" % (opt, arg))

    # Should be exactly 1 positional argument
    # URL to the thread should be the only argument
    try:
        raw_URL = args[0]
    except IndexError:
        print("getthread.py: missing URL")
        usage()
        sys.exit(2)

    # Is raw_URL a valid vB thread URL?
    print("Validating URL ...")
    if not vbutils.isValidURL(raw_URL):
        print("Error: %s is not a valid vBulletin thread URL." % raw_URL)
        print("")   # blank line after the error message
        sys.exit(2)

    # Clean up URL from the commandline
    # Keep the domain, sub dirs, showthread, and &t=
    params["url"] = vbutils.cleanURL(raw_URL)
    # Report the cleaned URL from params; there is no local named `url`.
    print("Valid thread URL: %s" % params["url"])

    return params