def print_popular(self):
    """Print the ten most-referenced urls in the web, highest count first.

    Each line shows the incoming-link count (right-justified) and the url.
    Does nothing beyond the lookup if the web is empty.
    """
    tuples = [(len(n.incoming), n) for n in self.index.values()]
    if not tuples:
        return  # empty web: nothing to show, and tuples[0] would raise
    # Sort on the count only: a plain tuple sort would fall through to
    # comparing node objects on tied counts, which raises TypeError on
    # Python 3.
    tuples.sort(key=lambda pair: pair[0], reverse=True)
    # Column width: at least 2 chars, otherwise the width of the top count.
    ln = len(str(tuples[0][0]).rjust(2))
    ioutils.write_err("Showing most referenced urls:\n")
    for (i, node) in tuples[:10]:
        ioutils.write_err(" %s %s\n" % (str(i).rjust(ln), node.url))
def write_progress(self, rate=None, prestart=None, wait=None, complete=False, error=None):
    """Render a one-line download progress report to stderr and log terminal states.

    Exactly one state is reported, checked in priority order:
    error > prestart > wait > complete > in-progress (rate is bytes/sec).
    The line shows action, rate/status, url (with an inverse-video progress
    bar while downloading) and size, and is redrawn in place with "\\r"
    unless the transfer errored/completed or DEBUG_FETCH is set.
    """
    # compute string lengths
    action = self.action.rjust(self.actionwidth)
    if error:
        rate = error
    elif prestart:
        rate = "starting"
    elif wait:
        # countdown until the retry fires, e.g. "5s..."
        rate = ("%s" % self.retry_wait) + "s..."
    elif complete:
        rate = "done"
    else:
        rate = "%s/s" % self.format_size(rate)
    rate = rate.ljust(self.ratewidth)
    url = self.url_fmt
    # prefer the known total size; fall back to bytes fetched so far
    if self.totalsize:
        size = self.format_size(self.totalsize)
    elif self.download_size:
        size = self.format_size(self.download_size)
    else:
        size = "????? B"  # size unknown (no Content-Length, nothing fetched yet)
    size = (" %s" % size).ljust(self.sizewidth)
    # add formatting
    if error:
        rate = ansicolor.red(rate)
    elif prestart or wait:
        rate = ansicolor.cyan(rate)
    elif complete:
        rate = ansicolor.green(rate)
    else:
        rate = ansicolor.yellow(rate)
    # draw progress bar
    if not (error or prestart or complete) and self.totalsize:
        # c = how many chars of the url field are "filled" by progress;
        # rendered by reversing video on the first c characters
        c = int(self.urlwidth * self.download_size / self.totalsize)
        url = ansicolor.wrap_string(self.url_fmt, c, None, reverse=True)
    if not self.totalsize:
        # highlight the size as provisional when the total is unknown
        size = ansicolor.yellow(size)
    line = "%s :: %s " % (action, rate)
    # "\r" redraws in place; DEBUG_FETCH forces one line per update
    term = (os.environ.get("DEBUG_FETCH") and "\n") or "\r"
    if error or complete:
        term = "\n"  # terminal states keep the line on screen
    ioutils.write_err("%s%s%s%s" % (line, url, size, term))
    # log download
    if error:
        self.log_url(error, error=True)
    elif complete:
        self.log_url("done")
def save(self):
    """Persist the web graph (and, while spidering, the queue) to LOGDIR.

    Files are named after the root url's hostname with .web / .session
    suffixes. When the queue is empty the spider has finished, so any
    stale .session file is removed instead.
    """
    host = urlrewrite.get_hostname(self.wb.root.url)
    base = urlrewrite.hostname_to_filename(host)
    webfile, sessionfile = base + ".web", base + ".session"
    ioutils.write_err("Saving session to %s ..." % ansicolor.yellow(base + ".{web,session}"))
    ioutils.serialize(self.wb, webfile, dir=ioutils.LOGDIR)
    if self.queue:
        ioutils.serialize(self.queue, sessionfile, dir=ioutils.LOGDIR)
    elif ioutils.file_exists(sessionfile, dir=ioutils.LOGDIR):
        # only the web is being saved, ie. spidering complete: drop the old session
        ioutils.delete(sessionfile, dir=ioutils.LOGDIR)
    ioutils.write_err(ansicolor.green("done\n"))
def run_script():
    """Command-line entry point: parse options, build/restore a session, spider.

    Usage: <url> ['<pattern>'] [options]. Options are exported to the
    environment (FETCH_ALL, DUMP_ALL, HOST_FILTER, PAUSE, DEPTH) so that
    downstream components can read them. Exits with status 1 on a bad
    recipe pattern; prints help if positional args are missing.
    """
    (parser, a) = ioutils.init_opts("<url> ['<pattern>'] [options]")
    a("--recipe", metavar="<recipe>", dest="recipe", help="Use a spidering recipe")
    a("--fetch", action="store_true", help="Fetch urls, don't dump")
    a("--dump", action="store_true", help="Dump urls, don't fetch")
    a("--host", action="store_true", help="Only spider this host")
    a("--pause", type="int", metavar="<pause>", dest="pause", help="Pause for x seconds between requests")
    a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth")
    (opts, args) = ioutils.parse_args(parser)
    try:
        # communicate mode flags to the rest of the program via env vars
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        if opts.pause:
            os.environ["PAUSE"] = str(opts.pause)
        if opts.depth:
            os.environ["DEPTH"] = str(opts.depth)
        # args[0] is required; missing -> IndexError -> help below
        url = args[0]
        if opts.recipe:
            rules = recipe.load_recipe(opts.recipe, url)
        else:
            # no recipe file: second positional arg is the url pattern
            pattern = args[1]
            rules = recipe.get_recipe(pattern, url)
        session = Session.restore(url)
        session.rules = rules
        # fresh session (nothing restored from disk): seed queue and web
        if session.queue is None:
            session.queue = recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
        if session.wb is None:
            session.wb = web.Web(url)
    except recipe.PatternError as e:
        ioutils.write_err(ansicolor.red("%s\n" % e))
        sys.exit(1)
    except IndexError:
        # missing positional argument(s): show usage
        ioutils.opts_help(None, None, None, parser)
    spiderfetcher = SpiderFetcher(session)
    spiderfetcher.main()
def main():
    """Read stream urls from stdin, one per line, and dump each with mplayer.

    A non-zero mplayer exit status records the url via logerror(). Ctrl-C
    aborts cleanly; any other exception is reported with a traceback and
    the last mplayer invocation attempted.
    """
    # Pre-initialize so the generic handler below can always report it:
    # previously a failure before the first assignment (e.g. in readline())
    # raised NameError on str(args) inside the except block.
    args = None
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            # non-zero exit status means the dump failed for this url
            retval = subprocess.call(args)
            if retval:
                logerror(path)
            line = sys.stdin.readline()
    except KeyboardInterrupt:
        ioutils.write_abort()
    except Exception as e:
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        s += "Invocation string: %s\n" % str(args)
        ioutils.write_err(s)
def restore(cls, url):
    """Rebuild a session from serialized state on disk, if any exists.

    Looks in LOGDIR for <hostname>.web (the web graph) and
    <hostname>.session (the queue); either may be absent, in which case
    the corresponding attribute is None on the returned instance.
    """
    host = urlrewrite.get_hostname(url)
    base = urlrewrite.hostname_to_filename(host)
    queue, webgraph = None, None
    webfile = base + ".web"
    if ioutils.file_exists(webfile, dir=ioutils.LOGDIR):
        ioutils.write_err("Restoring web from %s ..." % ansicolor.yellow(webfile))
        webgraph = ioutils.deserialize(webfile, dir=ioutils.LOGDIR)
        ioutils.write_err(ansicolor.green("done\n"))
    sessionfile = base + ".session"
    if ioutils.file_exists(sessionfile, dir=ioutils.LOGDIR):
        ioutils.write_err("Restoring session from %s ..." % ansicolor.yellow(sessionfile))
        queue = ioutils.deserialize(sessionfile, dir=ioutils.LOGDIR)
        # re-apply current recipe rules to the restored records
        queue = recipe.overrule_records(queue)
        ioutils.write_err(ansicolor.green("done\n"))
    return cls(wb=webgraph, queue=queue)
def print_multiple(self):
    """Print documents reachable under more than one url, most aliases first.

    Each group shows its alias count next to the first url and the
    remaining aliases indented below; groups are separated by a blank line.
    """
    ss = []
    for n in self.index.values():
        if len(n.aliases) > 1:
            pair = (len(n.aliases), n.aliases)
            if pair not in ss:
                ss.append(pair)
    if ss:
        ss.sort(reverse=True)
        ln = len(str(ss[0][0]))  # length of highest count
        ioutils.write_err("Showing documents with multiple urls:\n")
        last = len(ss) - 1
        # enumerate instead of list.index(): index() is a linear scan per
        # iteration and returns the *first* match, so a duplicate alias
        # string would mislabel later occurrences as the group head.
        for (idx, (count, aliases)) in enumerate(ss):
            for (pos, url) in enumerate(aliases):
                prefix = str(count).rjust(ln) if pos == 0 else "".rjust(ln)
                ioutils.write_err(" %s %s\n" % (prefix, url))
            if idx != last:
                ioutils.write_err("\n")
def assert_in_web(self, url):
    """Terminate the process with an error message unless url is a known node."""
    if url in self.index:
        return
    ioutils.write_err("Url %s not in the web\n" % ansicolor.yellow(url))
    sys.exit(1)
def print_stats(self):
    """Print the root url and the number of urls indexed in the web."""
    stats = "Root url : %s\nWeb size : %s urls\n" % (self.root.url, len(self.index))
    ioutils.write_err(stats)
def print_trace(self, path):
    """Print the hop-by-hop trace from the root, one numbered url per line.

    Does nothing if path is empty/None. The index column grows one digit
    per factor of ten in the path length.
    """
    if path:
        ioutils.write_err("Showing trace from root:\n")
        # Integer division: len(path) / 10 is a float on Python 3 and
        # str.rjust() raises TypeError for a non-int width.
        width = 1 + (len(path) // 10)
        for (i, hop) in enumerate(path):
            ioutils.write_err(" %s %s\n" % (str(i).rjust(width), hop))