Example #1
import os
import subprocess
import sys
import traceback

import io  # the project's own helper module, not the stdlib io (assumption)


def logerror(path):
    io.savelog("Path failed: %s\n" % path, "error_dumpstream")


def main():
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            retval = subprocess.call(args)
            if retval:  # non-zero exit status: mplayer failed on this path
                logerror(path)

            line = sys.stdin.readline()
    except KeyboardInterrupt:
        io.write_abort()
    except Exception as e:
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        s += "Invocation string: %s\n" % str(args)
        io.write_err(s)


if __name__ == "__main__":
    (parser, a) = io.init_opts("< <file>")
    (opts, args) = io.parse_args(parser)
    main()
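
The io module used throughout these examples is evidently the project's own helper rather than the standard library io. Purely as illustration, here is a minimal sketch of savelog; only the (message, logname) signature is taken from the call in logerror above, the rest is guesswork:

def savelog(msg, logname):
    # Hypothetical sketch: append the message to a named log file.
    # Only the signature is grounded in the call site above; the file
    # naming and location are guesses.
    with open(logname + ".log", "a") as f:
        f.write(msg)
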
Example #2
                node.outgoing[n] = None
        #for node in self.index.values():
        #    print node.incoming
        #    print node.outgoing

    def _from_pickle(self):
        # Restore the graph after unpickling: the link values were nulled
        # out before pickling (the tail of that loop is visible at the top
        # of this excerpt), so point each url key back at the live Node.
        for node in self.index.values():
            for n in node.incoming:
                node.incoming[n] = self.index[n]
            for n in node.outgoing:
                node.outgoing[n] = self.index[n]
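
Only the tail of the pickling-side preparation survives at the top of this excerpt. Reconstructed by symmetry with _from_pickle, the counterpart plausibly looks like this (the method name _to_pickle is an assumption):

    def _to_pickle(self):
        # Hypothetical reconstruction: null the Node references so that
        # pickling the index does not chase every incoming/outgoing link
        # through the entire web.
        for node in self.index.values():
            for n in node.incoming:
                node.incoming[n] = None
            for n in node.outgoing:
                node.outgoing[n] = None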



if __name__ == "__main__":
    (parser, a) = io.init_opts("<web> [options]")
    a("--dump", action="store_true", help="Dump all urls in web")
    a("--in", metavar="<url>", dest="into", help="Find incoming urls to <url>")
    a("--out", metavar="<url>", help="Find outgoing urls from <url>")
    a("--aliases", metavar="<url>", help="Find other urls for the document at <url>")
    a("--multiple", action="store_true", help="Find documents with multiple urls")
    a("--trace", metavar="<url>", help="Trace path from root to <url>")
    a("--deepest", action="store_true", help="Trace url furthest from root")
    a("--popular", action="store_true", help="Find the most referenced urls")
    a("--test", action="store_true", help="Run trace loop test")
    (opts, args) = io.parse_args(parser)
    try:
        if opts.test:
            wb = Web()
            wb.root = Node("a")
            wb.index["a"] = wb.root
Example #3
        if len(stack) > 1:  # more than one layer of color
            col_bold = True
        if len(stack) > 0:  # at least one layer
            col = stack[-1]  # peek at the innermost color without popping

        str_fmt += str[cursor:pos] + shcolor.code(col, bold=col_bold)
        cursor = pos
    str_fmt += str[cursor:-1]  # note: [:-1] drops the final character

    return str_fmt
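
The excerpt above layers colors with a stack, rendering nested regions bold. To show the technique standing alone, here is a minimal sketch using raw ANSI escapes; ansi() is a stand-in for shcolor.code (whose real interface is not shown here), and the (start, end, color) span format is an assumption:

RESET = "\033[0m"

def ansi(col, bold=False):
    # Stand-in for shcolor.code: build a basic 8-color ANSI escape,
    # e.g. ansi(1) -> red, ansi(2, bold=True) -> bold green.
    return "\033[%d;3%dm" % (1 if bold else 0, col)

def colorize(s, spans):
    # spans is a list of (start, end, color) regions that may nest.
    # Walk the string once, pushing a color when a region opens and
    # removing it when the region closes; always paint with the
    # innermost color, bold when nested -- the same stack discipline
    # as the excerpt above.
    events = sorted([(a, 1, c) for a, b, c in spans] +
                    [(b, 0, c) for a, b, c in spans])
    out, cursor, stack = "", 0, []
    for pos, opening, c in events:
        out += s[cursor:pos]
        if opening:
            stack.append(c)
        else:
            stack.remove(c)
        if stack:
            out += ansi(stack[-1], bold=len(stack) > 1)
        else:
            out += RESET
        cursor = pos
    return out + s[cursor:] + RESET

For example, colorize("abcdef", [(0, 6, 1), (2, 4, 2)]) paints the whole string red with a bold green middle.
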



if __name__ == "__main__":
    (parser, a) = io.init_opts("[ <url|file> [options] | --test ]")
    a("--dump", action="store_true", help="Dump urls")
    a("--test", action="store_true", help="Run spider testsuite")
    (opts, args) = io.parse_args(parser)
    try:
        url = None
        if opts.test:
            data = testcases
        else:
            url = args[0]
            data = urllib.urlopen(url).read()

        if opts.dump:
            for u in unique(unbox_it_to_ss(findall(data, url))):
                print u
        else:
Example #4
            for n in node.outgoing:
                node.outgoing[n] = None
        #for node in self.index.values():
        #    print node.incoming
        #    print node.outgoing

    def _from_pickle(self):
        for node in self.index.values():
            for n in node.incoming:
                node.incoming[n] = self.index[n]
            for n in node.outgoing:
                node.outgoing[n] = self.index[n]


if __name__ == "__main__":
    (parser, a) = io.init_opts("<web> [options]")
    a("--dump", action="store_true", help="Dump all urls in web")
    a("--in", metavar="<url>", dest="into", help="Find incoming urls to <url>")
    a("--out", metavar="<url>", help="Find outgoing urls from <url>")
    a("--aliases",
      metavar="<url>",
      help="Find other urls for the document at <url>")
    a("--multiple",
      action="store_true",
      help="Find documents with multiple urls")
    a("--trace", metavar="<url>", help="Trace path from root to <url>")
    a("--deepest", action="store_true", help="Trace url furthest from root")
    a("--popular", action="store_true", help="Find the most referenced urls")
    a("--test", action="store_true", help="Run trace loop test")
    (opts, args) = io.parse_args(parser)
    try:
Example #5
import os
import subprocess
import sys
import traceback

import io  # the project's own helper module, not the stdlib io (assumption)


def logerror(path):
    io.savelog("Path failed: %s\n" % path, "error_dumpstream")


def main():
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            retval = subprocess.call(args)
            if retval:  # non-zero exit status: mplayer failed on this path
                logerror(path)

            line = sys.stdin.readline()
    except KeyboardInterrupt:
        io.write_abort()
    except Exception as e:
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        s += "Invocation string: %s\n" % str(args)
        io.write_err(s)


if __name__ == "__main__":
    (parser, a) = io.init_opts("< <file>")
    (opts, args) = io.parse_args(parser)
    main()
Example #6
        while queue:
            if depth > 0:
                depth -= 1
            elif depth == 0:
                # There may still be records in the queue, but since depth is reached
                # no more spidering is allowed, so we allow one more iteration, but
                # only for fetching
                queue, outer_queue = split_queue(
                    queue,
                    rules.index(rule) == len(rules) - 1)

            queue = process_records(queue, rule, wb)

    save_session(wb)
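
split_queue itself is never shown in these excerpts. The sketch below is reconstructed only from its call site and the comment above it; the record attribute and what the last_rule flag actually changes are pure assumptions:

def split_queue(queue, last_rule):
    # Hypothetical: once the depth budget is spent, keep the records that
    # only need fetching and defer the spidering records to an outer
    # queue; on the last rule there is no later pass, so nothing is
    # deferred (assumption).
    inner, outer = [], []
    for record in queue:
        if getattr(record, "fetch_only", False) or last_rule:
            inner.append(record)
        else:
            outer.append(record)
    return inner, outer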


if __name__ == "__main__":
    (parser, a) = io.init_opts("<url> ['<pattern>'] [options]")
    a("--recipe", metavar="<recipe>", dest="recipe", help="Use a spidering recipe")
    a("--fetch", action="store_true", help="Fetch urls, don't dump")
    a("--dump", action="store_true", help="Dump urls, don't fetch")
    a("--host", action="store_true", help="Only spider this host")
    a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth")
    (opts, args) = io.parse_args(parser)
    try:
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        if opts.depth:
            os.environ["DEPTH"] = str(opts.depth)
Example #7
                depth -= 1
            elif depth == 0:
                # There may still be records in the queue, but since depth is reached
                # no more spidering is allowed, so we allow one more iteration, but
                # only for fetching
                queue, outer_queue = split_queue(
                    queue,
                    rules.index(rule) == len(rules) - 1)

            queue = process_records(queue, rule, wb)

    save_session(wb)


if __name__ == "__main__":
    (parser, a) = io.init_opts("<url> ['<pattern>'] [options]")
    a("--recipe",
      metavar="<recipe>",
      dest="recipe",
      help="Use a spidering recipe")
    a("--fetch", action="store_true", help="Fetch urls, don't dump")
    a("--dump", action="store_true", help="Dump urls, don't fetch")
    a("--host", action="store_true", help="Only spider this host")
    a("--depth",
      type="int",
      metavar="<depth>",
      dest="depth",
      help="Spider to this depth")
    (opts, args) = io.parse_args(parser)
    try:
        if opts.fetch: