def run():
    # get commandline args
    args = parse_args()

    initialize_database()

    # store start time & args, plus get an ID for this crawl
    with dataset.connect(DATABASE_URL) as db:
        crawl_id = db["crawl"].insert(
            dict(args=" ".join(sys.argv[1:]), start_time=datetime.now())
        )

    url_queue = Queue()  # (url, num_timeouts) tuples
    result_queue = Queue()

    # read in URLs and populate the job queue
    with args.urls:
        urls = list(args.urls)

    # randomize crawl order
    shuffle(urls)

    for url in urls:
        url = url.strip()
        if not urlparse(url).scheme:
            url = "http://" + url
        url_queue.put((url, 0))

    log = Logger().log if not args.quiet else lambda *args, **kwargs: None

    # launch browsers
    crawlers = []
    for i in range(args.num_crawlers):
        crawler = Process(
            target=Crawler,
            args=(i + 1,),
            kwargs={
                "crx": args.crx,
                "headless": args.headless,
                "logger": log,
                "timeout": args.timeout,
                "url_queue": url_queue,
                "result_queue": result_queue,
            },
        )
        crawler.start()
        crawlers.append(crawler)

    # start the collector process
    Process(target=collect, args=(crawl_id, result_queue, log)).start()

    # wait for all browsers to finish
    for crawler in crawlers:
        crawler.join()

    # tell collector we are done
    result_queue.put(None)

    # store completion time
    with dataset.connect(DATABASE_URL) as db:
        db["crawl"].update(dict(id=crawl_id, end_time=datetime.now()), "id")

    log("Main process all done!")
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from datetime import datetime

from flask import Flask, render_template, request

from utils.database import DATABASE_URL, initialize_database

import dataset

app = Flask(__name__)
app.config['DATABASE_URL'] = DATABASE_URL
app.jinja_env.trim_blocks = True
app.jinja_env.lstrip_blocks = True

initialize_database()


def get_canvases(result_ids):
    canvases = {}

    # cast each ID to int to sanitize it before interpolating into the query
    sql = """SELECT canvas.id, data_url FROM canvas
        JOIN result ON result.canvas_id = canvas.id
        WHERE result.id IN (%s)""" % ','.join(
        [str(int(id)) for id in result_ids])

    with dataset.connect(app.config['DATABASE_URL']) as db:
        for row in db.query(sql):
            canvases[row['id']] = row['data_url']

    return canvases