def print_popular(self):
    # Rank urls by number of incoming links, most referenced first
    tuples = [(len(n.incoming), n) for n in self.index.values()]
    tuples.sort(reverse=True)
    # Column width: the widest count, but at least two characters
    ln = len(str(tuples[0][0]).rjust(2))
    io.write_err("Showing most referenced urls:\n")
    for (i, node) in tuples[:10]:
        io.write_err(" %s %s\n" % (str(i).rjust(ln), node.url))
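# Output sketch (hypothetical counts), as formatted by print_popular above:
#
#   Showing most referenced urls:
#    14 http://example.com/
#     3 http://example.com/about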
def save_session(wb, queue=None):
    hostname = urlrewrite.get_hostname(wb.root.url)
    filename = urlrewrite.hostname_to_filename(hostname)
    io.write_err("Saving session to %s ..." %
                 shcolor.color(shcolor.YELLOW, filename + ".{web,session}"))
    io.serialize(wb, filename + ".web", dir=io.LOGDIR)
    if queue:
        io.serialize(queue, filename + ".session", dir=io.LOGDIR)
    # only web being saved, i.e. spidering complete, remove old session
    elif io.file_exists(filename + ".session", dir=io.LOGDIR):
        io.delete(filename + ".session", dir=io.LOGDIR)
    io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
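# Usage sketch, assuming a web.Web under construction and a fetch queue from
# recipe.get_queue (names as used elsewhere in this module):
#
#   save_session(wb, queue=queue)  # crawl interrupted: keep .web and .session
#   save_session(wb)               # crawl complete: keep .web, drop stale .session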
import os
import subprocess
import sys
import traceback

# logerror() and the io module are project-local helpers, assumed importable.
def main():
    args = None  # so the error report below works even if we fail early
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            # Dump the stream to a local file named after the url's last segment
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            retval = subprocess.call(args)
            if retval:
                # non-zero exit from mplayer: record the failed url
                logerror(path)
            line = sys.stdin.readline()
    except KeyboardInterrupt:
        io.write_abort()
    except Exception, e:
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        s += "Invocation string: %s\n" % str(args)
        io.write_err(s)
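# Usage sketch (hypothetical script name): one stream url per line on stdin;
# -dumpstream/-dumpfile are standard mplayer flags for saving a stream to disk.
#
#   $ cat urls.txt | python dumpstream.py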
def restore_session(url):
    hostname = urlrewrite.get_hostname(url)
    filename = urlrewrite.hostname_to_filename(hostname)
    q, wb = None, None
    if io.file_exists(filename + ".web", dir=io.LOGDIR):
        io.write_err("Restoring web from %s ..." %
                     shcolor.color(shcolor.YELLOW, filename + ".web"))
        wb = io.deserialize(filename + ".web", dir=io.LOGDIR)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    if io.file_exists(filename + ".session", dir=io.LOGDIR):
        io.write_err("Restoring session from %s ..." %
                     shcolor.color(shcolor.YELLOW, filename + ".session"))
        q = io.deserialize(filename + ".session", dir=io.LOGDIR)
        q = recipe.overrule_records(q)
        io.write_err(shcolor.color(shcolor.GREEN, "done\n"))
    return q, wb
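# Usage sketch: either element of the returned pair may be None when the
# corresponding file is missing from io.LOGDIR, so callers fall back to a
# fresh queue/web, as the option-parsing block below does:
#
#   q, w = restore_session(url)
#   queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
#   wb = w or web.Web(url)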
def print_multiple(self):
    ss = []
    for n in self.index.values():
        if len(n.aliases) > 1:
            pair = (len(n.aliases), n.aliases)
            if pair not in ss:
                ss.append(pair)
    if ss:
        ss.sort(reverse=True)
        ln = len(str(ss[0][0]))  # length of highest count
        io.write_err("Showing documents with multiple urls:\n")
        for pair in ss:
            (count, aliases) = pair
            for url in aliases:
                prefix = "".rjust(ln)
                if aliases.index(url) == 0:
                    prefix = str(count).rjust(ln)
                io.write_err(" %s %s\n" % (prefix, url))
            if not ss.index(pair) == len(ss) - 1:
                io.write_err("\n")
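# Output sketch (hypothetical data): each group lists every alias of one
# document, with the alias count printed only on the first line:
#
#   Showing documents with multiple urls:
#   2 http://example.com/index.html
#     http://example.com/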
def print_trace(self, path):
    if path:
        io.write_err("Showing trace from root:\n")
        for (i, hop) in enumerate(path):
            io.write_err(" %s %s\n" % (str(i).rjust(1 + (len(path) / 10)), hop))
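# Usage sketch (hypothetical path): `path` is a list of urls leading from the
# root to a target node; the index column widens by one digit per ten hops.
#
#   wb.print_trace(["http://example.com/", "http://example.com/a/", url])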
type="int", metavar="<depth>", dest="depth", help="Spider to this depth") (opts, args) = io.parse_args(parser) try: if opts.fetch: os.environ["FETCH_ALL"] = "1" elif opts.dump: os.environ["DUMP_ALL"] = "1" if opts.host: os.environ["HOST_FILTER"] = "1" if opts.depth: os.environ["DEPTH"] = str(opts.depth) url = args[0] (q, w) = restore_session(url) if opts.recipe: rules = recipe.load_recipe(opts.recipe, url) else: pattern = args[1] rules = recipe.get_recipe(pattern, url) queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER) wb = w or web.Web(url) except recipe.PatternError, e: io.write_err(shcolor.color(shcolor.RED, "%s\n" % e)) sys.exit(1) except IndexError: io.opts_help(None, None, None, parser) main(queue, rules, wb)
a("--fetch", action="store_true", help="Fetch urls, don't dump") a("--dump", action="store_true", help="Dump urls, don't fetch") a("--host", action="store_true", help="Only spider this host") a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth") (opts, args) = io.parse_args(parser) try: if opts.fetch: os.environ["FETCH_ALL"] = "1" elif opts.dump: os.environ["DUMP_ALL"] = "1" if opts.host: os.environ["HOST_FILTER"] = "1" if opts.depth: os.environ["DEPTH"] = str(opts.depth) url = args[0] (q, w) = restore_session(url) if opts.recipe: rules = recipe.load_recipe(opts.recipe, url) else: pattern = args[1] rules = recipe.get_recipe(pattern, url) queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER) wb = w or web.Web(url) except recipe.PatternError, e: io.write_err(shcolor.color(shcolor.RED, "%s\n" % e)) sys.exit(1) except IndexError: io.opts_help(None, None, None, parser) main(queue, rules, wb)
def print_stats(self): s = "Root url : %s\n" % self.root.url s += "Web size : %s urls\n" % len(self.index) io.write_err(s)
a("--pause", type="int", metavar="<pause>", dest="pause", help="Pause for x seconds between requests") a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth") (opts, args) = io.parse_args(parser) try: if opts.fetch: os.environ["FETCH_ALL"] = "1" elif opts.dump: os.environ["DUMP_ALL"] = "1" if opts.host: os.environ["HOST_FILTER"] = "1" if opts.pause: os.environ["PAUSE"] = str(opts.pause) if opts.depth: os.environ["DEPTH"] = str(opts.depth) url = args[0] (q, w) = restore_session(url) if opts.recipe: rules = recipe.load_recipe(opts.recipe, url) else: pattern = args[1] rules = recipe.get_recipe(pattern, url) queue = q or recipe.get_queue(url, mode=fetch.Fetcher.SPIDER) wb = w or web.Web(url) except recipe.PatternError, e: io.write_err(ansicolor.red("%s\n" % e)) sys.exit(1) except IndexError: io.opts_help(None, None, None, parser) main(queue, rules, wb)
def assert_in_web(self, url):
    if url not in self.index:
        io.write_err("Url %s not in the web\n" % ansicolor.yellow(url))
        sys.exit(1)