Example #1
    def print_popular(self):
        # rank urls by how many pages link to them, highest count first
        tuples = [(len(n.incoming), n) for n in self.index.values()]
        # sort on the count alone so node objects never need to be orderable
        tuples.sort(key=lambda t: t[0], reverse=True)
        # column width: as wide as the top count, but at least two characters
        ln = len(str(tuples[0][0]).rjust(2))
        ioutils.write_err("Showing most referenced urls:\n")
        for (i, node) in tuples[:10]:
            ioutils.write_err(" %s  %s\n" % (str(i).rjust(ln), node.url))
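Every snippet on this page routes its status output through ioutils.write_err rather than print. A minimal stand-in, assuming the helper does nothing more than emit unbuffered text on stderr (the real ioutils may add logging or encoding handling), could look like this:

import sys

def write_err(msg):
    # assumed behaviour: write to stderr and flush immediately, so that
    # "\r" progress redraws (see Example #2) appear without delay
    sys.stderr.write(msg)
    sys.stderr.flush()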
Example #2
    def write_progress(self, rate=None, prestart=None, wait=None, complete=False, error=None):
        # compute string lengths
        action = self.action.rjust(self.actionwidth)

        if error:
            rate = error
        elif prestart:
            rate = "starting"
        elif wait:
            rate = ("%s" % self.retry_wait) + "s..."
        elif complete:
            rate = "done"
        else:
            rate = "%s/s" % self.format_size(rate)
        rate = rate.ljust(self.ratewidth)

        url = self.url_fmt

        if self.totalsize:
            size = self.format_size(self.totalsize)
        elif self.download_size:
            size = self.format_size(self.download_size)
        else:
            size = "????? B"
        size = ("  %s" % size).ljust(self.sizewidth)

        # add formatting
        if error:
            rate = ansicolor.red(rate)
        elif prestart or wait:
            rate = ansicolor.cyan(rate)
        elif complete:
            rate = ansicolor.green(rate)
        else:
            rate = ansicolor.yellow(rate)

        # draw progress bar
        if not (error or prestart or complete) and self.totalsize:
            c = int(self.urlwidth * self.download_size / self.totalsize)
            url = ansicolor.wrap_string(self.url_fmt, c, None, reverse=True)

        if not self.totalsize:
            size = ansicolor.yellow(size)

        line = "%s ::  %s  " % (action, rate)

        term = (os.environ.get("DEBUG_FETCH") and "\n") or "\r"
        if error or complete:
            term = "\n"
        ioutils.write_err("%s%s%s%s" % (line, url, size, term))

        # log download
        if error:
            self.log_url(error, error=True)
        elif complete:
            self.log_url("done")
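The mutually exclusive keyword arguments pick one status line per stage of a download. A hypothetical call sequence (the progress object and the byte rate are invented for illustration) would be:

progress.write_progress(prestart=True)   # cyan "starting", redrawn in place with "\r"
progress.write_progress(rate=51200)      # yellow transfer rate, bar drawn over the url
progress.write_progress(wait=True)       # cyan retry countdown built from retry_wait
progress.write_progress(error="404")     # red error text, newline, logged with error=True
progress.write_progress(complete=True)   # green "done", newline, logged via log_url("done")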
Example #3
    def save(self):
        hostname = urlrewrite.get_hostname(self.wb.root.url)
        filename = urlrewrite.hostname_to_filename(hostname)
        ioutils.write_err("Saving session to %s ..." %
                          ansicolor.yellow(filename + ".{web,session}"))
        ioutils.serialize(self.wb, filename + ".web", dir=ioutils.LOGDIR)
        if self.queue:
            ioutils.serialize(self.queue, filename + ".session", dir=ioutils.LOGDIR)
        # only web being saved, ie. spidering complete, remove old session
        elif ioutils.file_exists(filename + ".session", dir=ioutils.LOGDIR):
            ioutils.delete(filename + ".session", dir=ioutils.LOGDIR)
        ioutils.write_err(ansicolor.green("done\n"))
Example #4
def run_script():
    (parser, a) = ioutils.init_opts("<url> ['<pattern>'] [options]")
    a("--recipe", metavar="<recipe>", dest="recipe", help="Use a spidering recipe")
    a("--fetch", action="store_true", help="Fetch urls, don't dump")
    a("--dump", action="store_true", help="Dump urls, don't fetch")
    a("--host", action="store_true", help="Only spider this host")
    a("--pause", type="int", metavar="<pause>", dest="pause", help="Pause for x seconds between requests")
    a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth")
    (opts, args) = ioutils.parse_args(parser)
    try:
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        if opts.pause:
            os.environ["PAUSE"] = str(opts.pause)
        if opts.depth:
            os.environ["DEPTH"] = str(opts.depth)

        url = args[0]
        if opts.recipe:
            rules = recipe.load_recipe(opts.recipe, url)
        else:
            pattern = args[1]
            rules = recipe.get_recipe(pattern, url)

        session = Session.restore(url)
        session.rules = rules

        if session.queue is None:
            session.queue = recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
        if session.wb is None:
            session.wb = web.Web(url)

    except recipe.PatternError as e:
        ioutils.write_err(ansicolor.red("%s\n" % e))
        sys.exit(1)
    except IndexError:
        ioutils.opts_help(None, None, None, parser)

    spiderfetcher = SpiderFetcher(session)
    spiderfetcher.main()
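The option names and positional arguments are all declared above, so a hedged driver for this entry point might look like the following (the script name and url are made up; a pattern is only needed when --recipe does not supply the rules):

import sys

# hypothetical invocation, assuming ioutils.parse_args reads sys.argv as optparse does
sys.argv = ["spiderfetch.py", "http://example.com/", "*.jpg",
            "--host", "--depth", "2", "--pause", "1"]
run_script()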
Example #5
def main():
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            retval = subprocess.call(args)
            if retval:
                logerror(path)

            line = sys.stdin.readline()
    except KeyboardInterrupt:
        ioutils.write_abort()
    except Exception as e:
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        s += "Invocation string: %s\n" % str(args)
        ioutils.write_err(s)
Example #6
    @classmethod
    def restore(cls, url):
        hostname = urlrewrite.get_hostname(url)
        filename = urlrewrite.hostname_to_filename(hostname)
        q, wb = None, None
        if ioutils.file_exists(filename + ".web", dir=ioutils.LOGDIR):
            ioutils.write_err("Restoring web from %s ..." %
                              ansicolor.yellow(filename + ".web"))
            wb = ioutils.deserialize(filename + ".web", dir=ioutils.LOGDIR)
            ioutils.write_err(ansicolor.green("done\n"))
        if ioutils.file_exists(filename + ".session", dir=ioutils.LOGDIR):
            ioutils.write_err("Restoring session from %s ..." %
                              ansicolor.yellow(filename + ".session"))
            q = ioutils.deserialize(filename + ".session", dir=ioutils.LOGDIR)
            q = recipe.overrule_records(q)
            ioutils.write_err(ansicolor.green("done\n"))
        return cls(wb=wb, queue=q)
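restore() is the counterpart of save() in Example #3: both derive their file names from the hostname and go through ioutils.serialize / ioutils.deserialize under LOGDIR. Assuming Session is the class that carries both methods (as the call in Example #4 suggests), the round trip is simply:

# hypothetical round trip; the url is made up
session = Session.restore("http://example.com/")  # loads <host>.web / <host>.session if present
# ... spider and fetch, updating session.wb and session.queue ...
session.save()  # rewrites <host>.web and drops the .session file once the queue is empty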
Example #7
    def print_multiple(self):
        ss = []
        for n in self.index.values():
            if len(n.aliases) > 1:
                pair = (len(n.aliases), n.aliases)
                if pair not in ss:
                    ss.append(pair)
        if ss:
            ss.sort(reverse=True)
            ln = len(str(ss[0][0]))  # length of highest count
            ioutils.write_err("Showing documents with multiple urls:\n")
            for pair in ss:
                (count, aliases) = pair
                for url in aliases:
                    prefix = "".rjust(ln)
                    if aliases.index(url) == 0:
                        prefix = str(count).rjust(ln)
                    ioutils.write_err(" %s  %s\n" % (prefix, url))
                if ss.index(pair) != len(ss) - 1:
                    ioutils.write_err("\n")
Example #8
    def assert_in_web(self, url):
        if url not in self.index:
            ioutils.write_err("Url %s not in the web\n" % ansicolor.yellow(url))
            sys.exit(1)
Example #9
    def print_stats(self):
        s = "Root url : %s\n" % self.root.url
        s += "Web size : %s urls\n" % len(self.index)
        ioutils.write_err(s)
Example #10
    def print_trace(self, path):
        if path:
            ioutils.write_err("Showing trace from root:\n")
            for (i, hop) in enumerate(path):
                # integer division: rjust() needs an int width on Python 3
                ioutils.write_err(" %s  %s\n" % (str(i).rjust(1 + len(path) // 10), hop))