def save_session(wb, queue=None):
    hostname = urlrewrite.get_hostname(wb.root.url)
    filename = urlrewrite.hostname_to_filename(hostname)
    io.write_err("Saving session to %s ..." %
                 shcolor.color(shcolor.YELLOW, filename + ".{web,session}"))
    io.serialize(wb, filename + ".web", dir=io.LOGDIR)
    if queue:
        io.serialize(queue, filename + ".session", dir=io.LOGDIR)
    # only the web is being saved, i.e. spidering is complete, so remove the old session
    elif io.file_exists(filename + ".session", dir=io.LOGDIR):
        io.delete(filename + ".session", dir=io.LOGDIR)
    io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
def restore_session(url):
    hostname = urlrewrite.get_hostname(url)
    filename = urlrewrite.hostname_to_filename(hostname)
    q, wb = None, None
    if io.file_exists(filename + ".web", dir=io.LOGDIR):
        io.write_err("Restoring web from %s ..." %
                     shcolor.color(shcolor.YELLOW, filename + ".web"))
        wb = io.deserialize(filename + ".web", dir=io.LOGDIR)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    if io.file_exists(filename + ".session", dir=io.LOGDIR):
        io.write_err("Restoring session from %s ..." %
                     shcolor.color(shcolor.YELLOW, filename + ".session"))
        q = io.deserialize(filename + ".session", dir=io.LOGDIR)
        q = recipe.overrule_records(q)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    return q, wb
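# Illustrative sketch (not part of the original module): how restore_session and
# save_session are typically paired when resuming a crawl. The helper name
# resume_or_start is hypothetical; the fallbacks mirror the ones used in the
# option-handling block further down.
#
# def resume_or_start(url):
#     queue, wb = restore_session(url)
#     if wb is None:
#         wb = web.Web(url)
#     if queue is None:
#         queue = recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
#     return queue, wb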
def assert_in_web(self, url):
    if url not in self.index:
        io.write_err("Url %s not in the web\n" %
                     shcolor.color(shcolor.YELLOW, url))
        sys.exit(1)
def write_abort():
    write_err("\n%s\n" % shcolor.color(shcolor.RED, "User aborted"))
a("--fetch", action="store_true", help="Fetch urls, don't dump") a("--dump", action="store_true", help="Dump urls, don't fetch") a("--host", action="store_true", help="Only spider this host") a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth") (opts, args) = io.parse_args(parser) try: if opts.fetch: os.environ["FETCH_ALL"] = "1" elif opts.dump: os.environ["DUMP_ALL"] = "1" if opts.host: os.environ["HOST_FILTER"] = "1" if opts.depth: os.environ["DEPTH"] = str(opts.depth) url = args[0] (q, w) = restore_session(url) if opts.recipe: rules = recipe.load_recipe(opts.recipe, url) else: pattern = args[1] rules = recipe.get_recipe(pattern, url) queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER) wb = w or web.Web(url) except recipe.PatternError, e: io.write_err(shcolor.color(shcolor.RED, "%s\n" % e)) sys.exit(1) except IndexError: io.opts_help(None, None, None, parser) main(queue, rules, wb)
type="int", metavar="<depth>", dest="depth", help="Spider to this depth") (opts, args) = io.parse_args(parser) try: if opts.fetch: os.environ["FETCH_ALL"] = "1" elif opts.dump: os.environ["DUMP_ALL"] = "1" if opts.host: os.environ["HOST_FILTER"] = "1" if opts.depth: os.environ["DEPTH"] = str(opts.depth) url = args[0] (q, w) = restore_session(url) if opts.recipe: rules = recipe.load_recipe(opts.recipe, url) else: pattern = args[1] rules = recipe.get_recipe(pattern, url) queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER) wb = w or web.Web(url) except recipe.PatternError, e: io.write_err(shcolor.color(shcolor.RED, "%s\n" % e)) sys.exit(1) except IndexError: io.opts_help(None, None, None, parser) main(queue, rules, wb)