def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page.

    Parses the command line from ``sys.argv``, starts a browser, has a
    ``brozzler.BrozzlerWorker`` brozzle the requested url, and logs the
    sorted outlinks the worker returns. The ``on_screenshot`` callback
    handed to the worker writes the png bytes it receives to a file under
    ``/tmp`` named after the url and the current time.

    A ``brozzler.ReachedLimit`` raised while starting or brozzling is
    logged as an error instead of propagating. ``browser.stop()`` is
    called whether or not startup and brozzling succeed.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    # NOTE(review): the helper is spelled "chome" here; confirm that this
    # matches the name actually defined in this module.
    arg_parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chome_exe(),
        help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
        '--enable-warcprox-features', dest='enable_warcprox_features',
        action='store_true', help=(
            'enable special features that assume the configured proxy '
            'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
        id=-1, seed=args.url, proxy=args.proxy,
        enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        # Anything that is not an ascii letter or digit becomes '_' so the
        # url can be embedded in a file name.
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
            datetime.datetime.now())
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        # Start inside the try block so that a failure part-way through
        # startup still reaches browser.stop() below instead of leaving a
        # half-started browser behind.
        # NOTE(review): assumes Browser.stop() tolerates a browser that did
        # not finish starting -- confirm.
        browser.start(proxy=site.proxy)
        outlinks = worker.brozzle_page(
            browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page.

    Starts a browser, lets a ``brozzler.BrozzlerWorker`` brozzle the url
    given on the command line, and logs the sorted outlinks the worker
    returns. The screenshot callback passed to the worker saves the png
    bytes it is given to a file under ``/tmp``.

    Args:
        argv: full argument vector with the program name at index 0;
            ``sys.argv`` is used when this is ``None`` or empty.

    A ``brozzler.ReachedLimit`` raised while starting or brozzling is
    logged as an error; ``browser.stop()`` runs in every case. The value
    of ``--behavior-parameters`` is decoded with ``json.loads`` and a
    decoding error is not handled here.
    '''
    if not argv:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=('json blob of parameters to populate the javascript behavior '
              'template, e.g. {"parameter_username":"******",'
              '"parameter_password":"******"}'))
    parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    # Boolean switches that are accepted but left out of the --help output.
    for flag, dest in (
            ('--skip-extract-outlinks', 'skip_extract_outlinks'),
            ('--skip-visit-hashtags', 'skip_visit_hashtags'),
            ('--skip-youtube-dl', 'skip_youtube_dl')):
        parser.add_argument(
            flag, dest=dest, action='store_true', help=argparse.SUPPRESS)
    add_common_options(parser, argv)

    args = parser.parse_args(args=argv[1:])
    configure_logging(args)

    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    else:
        behavior_parameters = {}

    site_settings = {
        'id': -1,
        'seed': args.url,
        'behavior_parameters': behavior_parameters,
        'username': args.username,
        'password': args.password,
    }
    site = brozzler.Site(None, site_settings)
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
        frontier=None,
        proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def save_screenshot(screenshot_png):
        # Keep ascii letters and digits of the url, replace the rest with
        # '_', and append the current time to form the file name.
        allowed = string.ascii_letters + string.digits
        url_part = ''.join(
            char if char in allowed else '_' for char in args.url)
        target = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            url_part, datetime.datetime.now())
        with open(target, 'wb') as out:
            out.write(screenshot_png)
        logging.info('wrote screenshot to %s', target)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        found = worker.brozzle_page(
            browser, site, page,
            on_screenshot=save_screenshot,
            enable_youtube_dl=not args.skip_youtube_dl)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(found)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page.

    After parsing the command line and calling
    ``brozzler.chrome.check_version`` on the selected chrome executable,
    starts a browser, lets a ``brozzler.BrozzlerWorker`` brozzle the url,
    and logs the sorted outlinks the worker returns. The screenshot
    callback passed to the worker saves the png bytes it is given to a
    file under ``/tmp``.

    Args:
        argv: full argument vector with the program name at index 0;
            ``sys.argv`` is used when this is ``None`` or empty.

    ``brozzler.ReachedLimit`` and ``brozzler.PageInterstitialShown``
    raised while starting or brozzling are logged as errors;
    ``browser.stop()`` runs in every case. The value of
    ``--behavior-parameters`` is decoded with ``json.loads`` and a
    decoding error is not handled here.
    '''
    if not argv:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=(
            'json blob of parameters to populate the javascript behavior '
            'template, e.g. {"parameter_username":"******",'
            '"parameter_password":"******"}'))
    parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    # Plain boolean switches, registered without any help text.
    for flag, dest in (
            ('--skip-extract-outlinks', 'skip_extract_outlinks'),
            ('--skip-visit-hashtags', 'skip_visit_hashtags'),
            ('--skip-youtube-dl', 'skip_youtube_dl')):
        parser.add_argument(flag, dest=dest, action='store_true')
    add_common_options(parser, argv)

    args = parser.parse_args(args=argv[1:])
    configure_logging(args)
    brozzler.chrome.check_version(args.chrome_exe)

    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    else:
        behavior_parameters = {}

    site_settings = {
        'id': -1,
        'seed': args.url,
        'behavior_parameters': behavior_parameters,
        'username': args.username,
        'password': args.password,
    }
    site = brozzler.Site(None, site_settings)
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
        frontier=None,
        proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def save_screenshot(screenshot_png):
        # Keep ascii letters and digits of the url, replace the rest with
        # '_', and append the current time to form the file name.
        allowed = string.ascii_letters + string.digits
        url_part = ''.join(
            char if char in allowed else '_' for char in args.url)
        target = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            url_part, datetime.datetime.now())
        with open(target, 'wb') as out:
            out.write(screenshot_png)
        logging.info('wrote screenshot to %s', target)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        found = worker.brozzle_page(
            browser, site, page,
            on_screenshot=save_screenshot,
            enable_youtube_dl=not args.skip_youtube_dl)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(found)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    except brozzler.PageInterstitialShown as e:
        logging.error('page interstitial shown %s', e)
    finally:
        browser.stop()
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page.

    Parses the command line from ``sys.argv``, starts a browser, has a
    ``brozzler.BrozzlerWorker`` brozzle the requested url, and logs the
    sorted outlinks the worker returns. The ``on_screenshot`` callback
    handed to the worker writes the png bytes it receives to a file under
    ``/tmp`` named after the url and the current time.

    The value of ``--behavior-parameters``, when given, is decoded with
    ``json.loads`` and passed to ``brozzler.Site``; a decoding error is not
    handled here. A ``brozzler.ReachedLimit`` raised while starting or
    brozzling is logged as an error instead of propagating.
    ``browser.stop()`` is called whether or not startup and brozzling
    succeed.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters',
        default=None, help=(
            'json blob of parameters to populate the javascript behavior '
            'template, e.g. {"parameter_username":"******",'
            '"parameter_password":"******"}'))
    arg_parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
        '--enable-warcprox-features', dest='enable_warcprox_features',
        action='store_true', help=(
            'enable special features that assume the configured proxy '
            'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    behavior_parameters = {}
    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    site = brozzler.Site(
        id=-1, seed=args.url, proxy=args.proxy,
        enable_warcprox_features=args.enable_warcprox_features,
        behavior_parameters=behavior_parameters)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        # Anything that is not an ascii letter or digit becomes '_' so the
        # url can be embedded in a file name.
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
            datetime.datetime.now())
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        # Start inside the try block so that a failure part-way through
        # startup still reaches browser.stop() below instead of leaving a
        # half-started browser behind.
        # NOTE(review): assumes Browser.stop() tolerates a browser that did
        # not finish starting -- confirm.
        browser.start(proxy=site.proxy)
        outlinks = worker.brozzle_page(
            browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()