def _wgetArgs(url, page, targetdir, platform, logfile):
    """Build the wget argument vector for one page of a thread.

    The list is assembled directly instead of formatting one command
    string and re-tokenising it with shlex.split(): that round trip split
    targetdir on whitespace, dropped backslashes from Windows paths and
    mis-parsed URLs containing quote characters.
    """
    # Count up the subdirs in this URL
    # Sometimes vBulletin is installed in root
    # Sometimes it is in a subdir
    # wget needs this number for --cut-dirs
    subdirs = url.count('/') - 3

    args = ['wget']

    # Force special character conversion to win32
    # e.g. ? becomes @ (see wget man pages for details)
    if platform == 'windows':
        args.append('--restrict-file-names=windows')

    # Are we specifying a special logfile location?
    # Truthiness check so that both '' and None mean "no logfile".
    if logfile:
        args.extend(['-a', logfile])

    args.extend([
        '-v', '-nH', '--cut-dirs=%s' % subdirs,
        '-k', '-K', '-E', '-H', '-p',
        '-P', targetdir,
    ])

    # The first page is fetched with -S, every later page with -N and
    # an explicit &page= query parameter.
    if page == 1:
        args.append('-S')
        target = url
    else:
        args.append('-N')
        target = '%s&page=%s' % (url, page)

    args.extend(['-w', '1', '--random-wait', '--no-cache', '--no-cookies',
                 target])
    return args


def wgetPage(thread, page, targetdir, platform = 'unix', logfile = ''):
    """Call wget to download page of a thread to targetdir for local archive

    thread    -- object whose .url attribute holds the thread URL
    page      -- page number; page 1 is fetched from thread.url as-is,
                 any other value is appended as "&page=<page>"
    targetdir -- directory handed to wget's -P option
    platform  -- 'windows' adds --restrict-file-names=windows
    logfile   -- if non-empty, handed to wget's -a option

    Returns False when thread.url fails vbutils.isValidURL() or when the
    wget process could not be started; True otherwise.  wget's own exit
    status is not inspected.
    """
    if not vbutils.isValidURL(thread.url):
        return False

    print("Found %s sub-dir in the URL ..." % (thread.url.count('/') - 3))

    args = _wgetArgs(thread.url, page, targetdir, platform, logfile)

    # subprocess.call() starts wget and halts execution until it is done.
    try:
        subprocess.call(args)
    except OSError as err:
        # Typically: wget is not installed or not on the PATH.
        print("Error: could not run wget: %s" % err)
        return False

    # Assuming success!
    return True
def init(argv):
    """Build the run parameters from the options and params in argv

    argv -- list of command-line arguments handed to getopt.getopt()
            (presumably sys.argv[1:] -- confirm against the caller)

    Returns a dict with the keys:
        "utc"      -- 0, or 1 when -u/--utc was given
        "localdir" -- '.' or the -l/--localdir value without trailing '/'
        "platform" -- "windows" or "unix"
        "url"      -- cleaned up URL of the thread to archive

    Calls sys.exit() after -h/--help, and sys.exit(2) on an unrecognized
    flag, a missing URL or a URL rejected by vbutils.isValidURL().
    """
    params = {
        "utc" : 0,
        "localdir" : '.'
    }

    # wget uses _platform for escaping illegal chrs in filenames
    # (equality test: the former `in "win32"` was a substring match)
    if sys.platform == "win32":
        params["platform"] = "windows"
    else:
        params["platform"] = "unix"

    # Check the commandline for arguments
    try:
        opts, args = getopt.getopt(
            argv, "dhl:uvw",
            ["debug", "help", "localdir=", "utc", "verbose", "windows"])
    except getopt.GetoptError:
        # Found a flag not in our known list
        # Returning a short usage message ...
        print("Error: unrecognized flag")
        usage()
        # ... and bye-bye!
        sys.exit(2)

    # Evaluate commandline options, arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--verbose"):
            pass
        elif opt in ("-d", "--debug"):
            pass
        elif opt in ("-u", "--utc"):
            params["utc"] = 1
        elif opt in ("-l", "--localdir"):
            # TODO need to validate the directory
            # strip trailing slashes, but keep the argument untouched
            # when it consists only of slashes (e.g. "/") so the root
            # directory does not turn into an empty string
            params["localdir"] = arg.rstrip('/') or arg
        elif opt in ("-w", "--windows"):
            params["platform"] = "windows"
        else:
            print("opt: %s, arg: %s" % (opt, arg))

    # Should be exactly 1 positional argument
    # URL to the thread should be the only argument
    if not args:
        print("getthread.py: missing URL")
        usage()
        sys.exit(2)
    raw_URL = args[0]

    # Is raw_URL a valid vB thread URL?
    print("Validating URL ...")
    if not vbutils.isValidURL(raw_URL):
        print("Error: %s is not a valid vBulletin thread URL." % raw_URL)
        print("")
        sys.exit(2)

    # Clean up URL from the commandline
    # Keep the domain, sub dirs, showthread, and &t=
    params["url"] = vbutils.cleanURL(raw_URL)
    # Report the cleaned URL; the bare name `url` was never defined here.
    print("Valid thread URL: %s" % params["url"])

    return params