Code example #1
File: spiderfetch.py  Project: numerodix/spiderfetch
def run_script():
    (parser, a) = ioutils.init_opts("<url> ['<pattern>'] [options]")
    a("--recipe", metavar="<recipe>", dest="recipe", help="Use a spidering recipe")
    a("--fetch", action="store_true", help="Fetch urls, don't dump")
    a("--dump", action="store_true", help="Dump urls, don't fetch")
    a("--host", action="store_true", help="Only spider this host")
    a("--pause", type="int", metavar="<pause>", dest="pause", help="Pause for x seconds between requests")
    a("--depth", type="int", metavar="<depth>", dest="depth", help="Spider to this depth")
    (opts, args) = ioutils.parse_args(parser)
    try:
        # expose selected options to the rest of the program via environment variables
        if opts.fetch:
            os.environ["FETCH_ALL"] = "1"
        elif opts.dump:
            os.environ["DUMP_ALL"] = "1"
        if opts.host:
            os.environ["HOST_FILTER"] = "1"
        if opts.pause:
            os.environ["PAUSE"] = str(opts.pause)
        if opts.depth:
            os.environ["DEPTH"] = str(opts.depth)

        url = args[0]
        # build spidering rules from a recipe file if given, otherwise from the pattern argument
        if opts.recipe:
            rules = recipe.load_recipe(opts.recipe, url)
        else:
            pattern = args[1]
            rules = recipe.get_recipe(pattern, url)

        session = Session.restore(url)
        session.rules = rules

        if session.queue is None:
            session.queue = recipe.get_queue(url, mode=fetch.Fetcher.SPIDER)
        if session.wb is None:
            session.wb = web.Web(url)

    except recipe.PatternError as e:
        ioutils.write_err(ansicolor.red("%s\n" % e))
        sys.exit(1)
    except IndexError:
        # a required positional argument (url or pattern) was missing
        ioutils.opts_help(None, None, None, parser)

    spiderfetcher = SpiderFetcher(session)
    spiderfetcher.main()
Code example #2
File: spider.py  Project: numerodix/spiderfetch
    # group match objects by the id of the regex that produced them
    for (rx_id, match) in it:
        try:
            regexs[rx_id].append(match)
        except KeyError:
            regexs[rx_id] = [match]

    # one list of (start, end) spans of the 'url' group per regex id
    spanlists = [[m.span('url') for m in regexs[rx_id]]
                 for rx_id in sorted(regexs.keys())]
    return ansicolor.highlight_string(str, *spanlists)
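
# The grouping above can also be written with collections.defaultdict, which
# avoids the KeyError handling. A minimal sketch of the same idea, using
# hypothetical stand-in data rather than the project's regex match objects:
from collections import defaultdict

pairs = [(1, "m1"), (0, "m2"), (1, "m3")]   # hypothetical (rx_id, match) pairs

groups = defaultdict(list)
for rx_id, match in pairs:
    groups[rx_id].append(match)

# one list of matches per regex id, in sorted id order, as in the excerpt
grouped = [groups[rx_id] for rx_id in sorted(groups)]
print(grouped)  # [['m2'], ['m1', 'm3']]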



if __name__ == "__main__":
    (parser, a) = ioutils.init_opts("[ <url|file> [options] | --test ]")
    a("--dump", action="store_true", help="Dump urls")
    a("--test", action="store_true", help="Run spider testsuite")
    (opts, args) = ioutils.parse_args(parser)
    try:
        url = None
        if opts.test:
            data = testcases
        else:
            url = args[0]
            data = urllib.urlopen(url).read()

        if opts.dump:
            for u in unique(unbox_it_to_ss(findall(data, url))):
                print(u)
        else:
Code example #3
File: fetch.py  Project: numerodix/spiderfetch
            self.launch()

            # stop unless there was an error and it is classified as transient
            if not self.error or not err.is_temporal(self.error):
                return

            # retry budget exhausted
            if self.tries < 1:
                return

            # retry after a short delay
            self.write_progress(wait=True)
            time.sleep(self.retry_wait)
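
# The loop above retries only errors classified as transient, stops when the
# retry budget runs out, and sleeps between attempts. A hedged, standalone
# sketch of that pattern with hypothetical names (not the project's Fetcher API):
import time

def run_with_retries(operation, is_transient, tries=3, retry_wait=5):
    while True:
        try:
            return operation()
        except Exception as exc:
            # give up on permanent errors or when no retries remain
            if not is_transient(exc) or tries < 1:
                raise
            tries -= 1
            time.sleep(retry_wait)  # short pause before the next attempt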



if __name__ == "__main__":
    (parser, a) = ioutils.init_opts("<url>+ [options]")
    a("--fullpath", action="store_true",
      help="Use full path as filename to avoid name collisions")
    a("-c", "--continue", dest="cont", action="store_true", help="Resume downloads")
    a("-t", "--tries", dest="tries", type="int", action="store", help="Number of retries")
    a("-q", "--quiet", dest="quiet", action="store_true", help="Turn off logging")
    a("--spidertest", action="store_true", help="Test spider with url")
    (opts, args) = ioutils.parse_args(parser)
    if getattr(opts, 'cont', None):
        os.environ["CONT"] = "1"
    if getattr(opts, 'tries', None):
        if not os.environ.get("TRIES"):
            os.environ["TRIES"] = str(opts.tries)
    if opts.quiet:
        os.environ["LOGGING"] = str(False)
    try:
Code example #4
File: dumpstream.py  Project: numerodix/spiderfetch
def logerror(path):
    ioutils.savelog("Path failed: %s\n" % path, "error_dumpstream")

def main():
    args = None  # defined up front so the error handler below can report it even on early failure
    try:
        line = sys.stdin.readline()
        while line:
            path = line.strip()
            filename = os.path.basename(path)
            args = ["mplayer", "-dumpstream", "-dumpfile", filename, path]
            retval = subprocess.call(args)
            if retval:
                logerror(path)

            line = sys.stdin.readline()
    except KeyboardInterrupt:
        ioutils.write_abort()
    except Exception as e:
        s = "%s\n" % traceback.format_exc()
        s += "%s\n" % str(e)
        s += "Invocation string: %s\n" % str(args)
        ioutils.write_err(s)



if __name__ == "__main__":
    (parser, a) = ioutils.init_opts("< <file>")
    (opts, args) = ioutils.parse_args(parser)
    main()
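
Since main() consumes one path per line on stdin, dumpstream.py is normally driven by shell redirection (the "< <file>" usage string above), but it can also be fed programmatically. A minimal, hypothetical sketch:

import subprocess

# feed two example stream paths to dumpstream.py over stdin
paths = "http://example.com/a.wmv\nhttp://example.com/b.wmv\n"
subprocess.run(["python", "dumpstream.py"], input=paths, text=True)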
Code example #5
File: web.py  Project: numerodix/spiderfetch
                node.outgoing[n] = None
        #for node in self.index.values():
        #    print(node.incoming)
        #    print(node.outgoing)

    def _from_pickle(self):
        # after unpickling, re-link node references from the url index
        for node in self.index.values():
            for n in node.incoming:
                node.incoming[n] = self.index[n]
            for n in node.outgoing:
                node.outgoing[n] = self.index[n]



if __name__ == "__main__":
    (parser, a) = ioutils.init_opts("<web> [options]")
    a("--dump", action="store_true", help="Dump all urls in web")
    a("--in", metavar="<url>", dest="into", help="Find incoming urls to <url>")
    a("--out", metavar="<url>", help="Find outgoing urls from <url>")
    a("--aliases", metavar="<url>", help="Find other urls for the document at <url>")
    a("--multiple", action="store_true", help="Find documents with multiple urls")
    a("--trace", metavar="<url>", help="Trace path from root to <url>")
    a("--deepest", action="store_true", help="Trace url furthest from root")
    a("--popular", action="store_true", help="Find the most referenced urls")
    a("--test", action="store_true", help="Run trace loop test")
    (opts, args) = ioutils.parse_args(parser)
    try:
        if opts.test:
            wb = Web()
            wb.root = Node("a")
            wb.index["a"] = wb.root