Example #1
def test_scrape_urls(self):
    # The '{}' placeholder in url_format is filled in with each relative URL
    # by scrape.scrape_urls(); URLs matching '^/doc' are skipped.
    url_format = 'http://something{}'
    self.results = {}
    scrape.scrape_urls(url_format, self.callback, ignore_regexp='^/doc')
    self.assertSetEqual({'/',
                         '/path/to/file1',
                         '/path/to/image.png'}, set(self.results.keys()))
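
The test assumes a unittest.TestCase fixture whose self.callback records each page it is handed into self.results, keyed by relative URL. A minimal sketch of such a fixture, under that assumption (the class name and bookkeeping here are hypothetical, not beancount's actual test code):

import unittest

from beancount.web import scrape

class ScrapeUrlsTestBase(unittest.TestCase):
    # Hypothetical fixture: self.results is reset by each test before the
    # scrape, as in test_scrape_urls() above.

    def callback(self, response, url):
        # Per the scrape_webapp docstrings below, the callback receives the
        # response and the url, and should raise to signal a validation
        # failure. Here it just records the url for later assertions.
        self.results[url] = response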
Example #2
def scrape_webapp(webargs, callback, ignore_regexp):
    """Run a web server on a Beancount file and scrape it.

    This is the main entry point of this module.

    Args:
      webargs: An argparse.Namespace container of the arguments provided in
        web.add_web_arguments().
      callback: A callback function to invoke on each page to validate it.
        The function is called with the response and the url as arguments.
        This function should trigger an error on failure (via an exception).
      ignore_regexp: A regular expression string, the urls to ignore.
    Returns:
      A set of all the processed URLs and a set of all the skipped URLs.
    """
    # The doubled braces escape to a literal '{}', leaving a placeholder that
    # scrape.scrape_urls() fills in with each relative URL.
    url_format = 'http://localhost:{}{{}}'.format(webargs.port)

    thread = thread_server_start(webargs)

    # Skips:
    # - Docs cannot be read for external files.
    #
    # - Component views: there are just too many, and they make the tests
    #   impossibly slow. Keep just the 'A's so some are covered.
    url_lists = scrape.scrape_urls(url_format, callback, ignore_regexp)

    thread_server_shutdown(thread)

    return url_lists
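
A hedged usage sketch for this variant, which takes a prebuilt argparse.Namespace; it assumes an http.client-style response object, and validate_page and the ledger path are hypothetical:

import argparse

def validate_page(response, url):
    # Hypothetical validator: the docstring says to trigger an error on
    # failure via an exception.
    if response.status != 200:
        raise AssertionError('Failed to render {}'.format(url))

# Build the argparse.Namespace this variant expects, using the
# add_web_arguments() helper named in the docstring.
parser = argparse.ArgumentParser()
add_web_arguments(parser)
webargs = parser.parse_args(['/path/to/ledger.beancount'])

# scrape_webapp() returns the processed and skipped URL sets.
processed, skipped = scrape_webapp(webargs, validate_page,
                                   ignore_regexp='^/doc')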
Example #3
File: web.py Project: droogmic/beancount
def scrape_webapp(filename,
                  callback,
                  port,
                  ignore_regexp,
                  quiet=True,
                  no_colons=False,
                  extra_args=None):
    """Run a web server on a Beancount file and scrape it.

    This is the main entry point of this module.

    Args:
      filename: A string, the name of the file to parse.
      callback: A callback function to invoke on each page to validate it.
        The function is called with the response and the url as arguments.
        This function should trigger an error on failure (via an exception).
      port: An integer, a free port to use for serving the pages.
      ignore_regexp: A regular expression string, the urls to ignore.
      quiet: True if we shouldn't log the web server pages.
      no_colons: True if we should avoid rendering colons in URLs (for Windows).
      extra_args: Extra arguments to bean-web that we want to start the
        server with.
    Returns:
      A set of all the processed URLs and a set of all the skipped URLs.
    """
    url_format = 'http://localhost:{}{{}}'.format(port)

    # Create a set of valid arguments to run the app.
    argparser = version.ArgumentParser()
    group = add_web_arguments(argparser)
    group.set_defaults(filename=filename,
                       port=port,
                       no_colons=no_colons,
                       quiet=quiet)

    all_args = [filename]
    if extra_args:
        all_args.extend(extra_args)
    args = argparser.parse_args(args=all_args)

    thread = thread_server_start(args)

    # Skips:
    # - Docs cannot be read for external files.
    #
    # - Component views: there are just too many, and they make the tests
    #   impossibly slow. Keep just the 'A's so some are covered.
    url_lists = scrape.scrape_urls(url_format, callback, ignore_regexp)

    thread_server_shutdown(thread)

    return url_lists
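
And a matching sketch for this standalone variant, which builds its own server arguments internally; the file name, port, and assert_ok helper are hypothetical:

def assert_ok(response, url):
    # Hypothetical check, assuming an http.client-style response; raising
    # here flags the page as broken, as the docstring requires.
    if response.status != 200:
        raise AssertionError('{} returned HTTP {}'.format(url, response.status))

processed, skipped = scrape_webapp('example.beancount',
                                   assert_ok,
                                   port=9475,
                                   ignore_regexp='^/doc',
                                   quiet=True)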