Example #1
# Module-level imports assumed for this snippet (Python 3); parse_args, Logger,
# Crawler and collect are project-local helpers imported elsewhere.
import sys
from datetime import datetime
from multiprocessing import Process, Queue
from random import shuffle
from urllib.parse import urlparse

import dataset

from utils.database import DATABASE_URL, initialize_database


def run():
    # get commandline args
    args = parse_args()

    initialize_database()

    # store start time & args, plus get an ID for this crawl
    with dataset.connect(DATABASE_URL) as db:
        crawl_id = db["crawl"].insert(dict(args=" ".join(sys.argv[1:]), start_time=datetime.now()))

    url_queue = Queue()  # (url, num_timeouts) tuples
    result_queue = Queue()

    # read in URLs and populate the job queue
    with args.urls:
        urls = list(args.urls)
        # randomize crawl order
        shuffle(urls)
    for url in urls:
        url = url.strip()
        if not urlparse(url).scheme:
            url = "http://" + url
        url_queue.put((url, 0))

    log = Logger().log if not args.quiet else lambda *args, **kwargs: None

    # launch browsers
    crawlers = []
    for i in range(args.num_crawlers):
        crawler = Process(
            target=Crawler,
            args=(i + 1,),
            kwargs={
                "crx": args.crx,
                "headless": args.headless,
                "logger": log,
                "timeout": args.timeout,
                "url_queue": url_queue,
                "result_queue": result_queue,
            },
        )
        crawler.start()
        crawlers.append(crawler)

    # start the collector process
    Process(target=collect, args=(crawl_id, result_queue, log)).start()

    # wait for all browsers to finish
    for crawler in crawlers:
        crawler.join()

    # tell collector we are done
    result_queue.put(None)

    # store completion time
    with dataset.connect(DATABASE_URL) as db:
        db["crawl"].update(dict(id=crawl_id, end_time=datetime.now()), "id")

    log("Main process all done!")
Example #2
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from datetime import datetime
from flask import Flask, render_template, request
from utils.database import DATABASE_URL, initialize_database

import dataset

app = Flask(__name__)

app.config['DATABASE_URL'] = DATABASE_URL
app.jinja_env.trim_blocks = True
app.jinja_env.lstrip_blocks = True

initialize_database()


def get_canvases(result_ids):
    """Map each matching canvas id to its data_url for the given result ids."""
    canvases = {}

    # Build the IN (...) list by hand; each id is coerced to int first, so the
    # interpolation cannot inject arbitrary SQL.
    sql = """SELECT canvas.id, data_url
        FROM canvas
        JOIN result ON result.canvas_id = canvas.id
        WHERE result.id IN (%s)""" % ','.join(
        [str(int(id)) for id in result_ids])

    with dataset.connect(app.config['DATABASE_URL']) as db:
        for row in db.query(sql):
            canvases[row['id']] = row['data_url']

    return canvases
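
A hypothetical call site for get_canvases is sketched below; the route, query parameter, and template name are illustrative assumptions, not code from the project above.

@app.route('/results')
def results():
    # e.g. /results?ids=1,2,3 -> {canvas_id: data_url} for those result rows
    ids = [int(i) for i in request.args.get('ids', '').split(',') if i]
    canvases = get_canvases(ids)
    return render_template('results.html', canvases=canvases)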