def __init__(self,
                 db_path,
                 browser_params,
                 num_browsers,
                 task_description=None):
        # sets up the information needed to write to the database
        self.desc = task_description
        self.db_path = db_path

        # sets up the crawl data database
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'),
                  'r') as f:
            self.db.executescript(f.read())

        # prepares browser settings
        self.num_browsers = num_browsers
        # special case: for singleton dictionary, we perform deep copies so that number of dicts is <num_browsers>
        if type(browser_params) is not list:
            browser_params = [
                copy.deepcopy(browser_params) for i in xrange(0, num_browsers)
            ]

        if len(browser_params) != num_browsers:
            raise Exception(
                "Number of browser parameter dictionaries is not the same as <num_browsers>"
            )

        # sets up the DataAggregator + associated queues
        self.aggregator_status_queue = None  # queue used for sending graceful KILL command to DataAggregator
        self.data_aggregator = self.launch_data_aggregator()
        # socket location: (address, port)
        self.aggregator_address = self.aggregator_status_queue.get()

        # open client socket
        self.sock = clientsocket()
        self.sock.connect(self.aggregator_address[0],
                          self.aggregator_address[1])

        # update task table
        cur = self.db.cursor()
        cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc, ))
        self.db.commit()
        self.task_id = cur.lastrowid

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self.initialize_browsers(
            browser_params)  # List of the Browser(s)

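
The `clientsocket` helper recurs throughout these examples. A descriptive summary of its interface as used here (not the implementation; the port below is an example value):

# clientsocket usage pattern seen across the examples
sock = clientsocket(serialization='json')       # 'dill' and a default also appear
sock.connect('127.0.0.1', 5555)                 # positional (address, port)
sock.send(('table_name', {'column': 'value'}))  # (table, record) tuples
sock.close()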
Example 2
def dump_profile_cookies(start_time, visit_id, webdriver, browser_params,
                         manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to the `get` call that
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this approach is likely to miss changes still present in the sqlite
    `wal` files. This function will likely be removed in a future version.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Cookies
    rows = get_cookies(browser_params['profile_path'], start_time)
    if rows is not None:
        for row in rows:
            data = dict(row)
            data["crawl_id"] = browser_params['crawl_id']
            data["visit_id"] = visit_id
            sock.send(("profile_cookies", data))

    # Close connection to db
    sock.close()
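
The `get_cookies` helper is not shown in this example. A minimal sketch of what it might do, assuming Firefox's standard `moz_cookies` table and that `start_time` uses the same microsecond epoch as the `creationTime` column (both assumptions):

import os
import sqlite3

def get_cookies(profile_path, start_time):
    # Sketch only: query the profile's cookie store for rows created
    # after `start_time`; return None if the store does not exist.
    db_path = os.path.join(profile_path, 'cookies.sqlite')
    if not os.path.isfile(db_path):
        return None
    db = sqlite3.connect(db_path)
    db.row_factory = sqlite3.Row  # lets callers do dict(row), as above
    try:
        return db.execute(
            "SELECT * FROM moz_cookies WHERE creationTime >= ?",
            (start_time,)).fetchall()
    finally:
        db.close()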
Example 3
    def __init__(self, db_path, browser_params, num_browsers, log_file = '~/openwpm.log', process_watchdog=False, task_description=None):
        # Flow control
        self.closing = False
        self.failure_flag = False
        self.threadlock = threading.Lock()
        self.failurecount = 0

        # sets up the information needed to write to the database
        self.desc = task_description
        self.db_path = db_path

        self.log_file = log_file
        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        
        # prepares browser settings
        self.num_browsers = num_browsers
        # special case: for singleton dictionary, we perform deep copies so that number of dicts is <num_browsers>
        if type(browser_params) is not list:
            browser_params = [copy.deepcopy(browser_params) for i in xrange(0, num_browsers)]

        if len(browser_params) != num_browsers:
            raise Exception("Number of browser parameter dictionaries is not the same as <num_browsers>")

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        self.logger_address = self.logging_status_queue.get()  # socket location: (address, port)
        self.logger = MPLogger.loggingclient(*self.logger_address)
        
        # sets up the DataAggregator + associated queues
        self.aggregator_status_queue = None  # queue used for sending graceful KILL command to DataAggregator
        self.data_aggregator = self._launch_data_aggregator()
        self.aggregator_address = self.aggregator_status_queue.get()  # socket location: (address, port)

        # open client socket
        self.sock = clientsocket()
        self.sock.connect(self.aggregator_address[0], self.aggregator_address[1])

        # update task table
        cur = self.db.cursor()
        cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc,))
        self.db.commit()
        self.task_id = cur.lastrowid
        
        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()
Example 4
def BrowserManager(command_queue, status_queue, browser_params, crash_recovery):
    logger = loggingclient(*browser_params['logger_address'])
    
    # Start the proxy
    proxy_site_queue = None  # used to pass the current site down to the proxy
    if browser_params['proxy']:
        (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(browser_params['aggregator_address'],
                                                                      browser_params['logger_address'],
                                                                      browser_params['crawl_id'])
        browser_params['proxy'] = local_port

    # Start the virtualdisplay (if necessary), webdriver, and browser
    (driver, prof_folder, browser_settings) = deploy_browser.deploy_browser(status_queue, browser_params, crash_recovery)

    # Read the extension port -- if extension is enabled
    # TODO: This needs to be cleaner
    if browser_params['browser'] == 'firefox' and browser_params['extension']['enabled']:
        logger.debug("BROWSER %i: Looking for extension port information in %s" % (browser_params['crawl_id'], prof_folder))
        while not os.path.isfile(prof_folder + 'extension_port.txt'):
            time.sleep(0.1)
        time.sleep(0.5)
        with open(prof_folder + 'extension_port.txt', 'r') as f:
            port = f.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port))
    else:
        extension_socket = None

    # signals the TaskManager that the browser is successfully set up
    status_queue.put('READY')
    browser_params['profile_path'] = prof_folder

    # starts accepting arguments until told to die
    while True:
        # no command for now -> sleep to avoid pegging CPU on blocking get
        if command_queue.empty():
            time.sleep(0.001)
            continue

        # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
        command = command_queue.get()
        logger.info("BROWSER %i: EXECUTING COMMAND: %s" % (browser_params['crawl_id'], str(command)))

        # attempts to perform an action and return an OK signal
        # if command fails for whatever reason, tell the TaskManager to kill and restart its worker processes
        try:
            command_executor.execute_command(command, driver, proxy_site_queue, browser_settings, browser_params, extension_socket)
            status_queue.put("OK")
        except Exception as e:
            logger.info("BROWSER %i: Crash in driver, restarting browser manager \n %s \n %s" % (browser_params['crawl_id'], str(type(e)), str(e)))
            status_queue.put("FAILED")
            break
Example 5
    def __init__(self, db_path, browser_params, num_browsers, task_description=None):
        # sets up the information needed to write to the database
        self.desc = task_description
        self.db_path = db_path

        # sets up the crawl data database
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        
        # prepares browser settings
        self.num_browsers = num_browsers
        # special case: for singleton dictionary, we perform deep copies so that number of dicts is <num_browsers>
        if type(browser_params) is not list:
            browser_params = [copy.deepcopy(browser_params) for i in xrange(0, num_browsers)]

        if len(browser_params) != num_browsers:
            raise Exception("Number of browser parameter dictionaries is not the same as <num_browsers>")

        # sets up the DataAggregator + associated queues
        self.aggregator_status_queue = None  # queue used for sending graceful KILL command to DataAggregator
        self.data_aggregator = self.launch_data_aggregator()
        self.aggregator_address = self.aggregator_status_queue.get()  # socket location: (address, port)

        # open client socket
        self.sock = clientsocket()
        self.sock.connect(self.aggregator_address[0], self.aggregator_address[1])

        # update task table
        cur = self.db.cursor()
        cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc,))
        self.db.commit()
        self.task_id = cur.lastrowid
        
        # sets up the BrowserManager(s) + associated queues
        self.browsers = self.initialize_browsers(browser_params)  # List of the Browser(s)
        
Example 6
def BrowserManager(command_queue, status_queue, browser_params, crash_recovery):
    # sets up the proxy (for now, mitmproxy) if necessary
    proxy_site_queue = None  # used to pass the current site down to the proxy
    if browser_params['proxy']:
        (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(browser_params['aggregator_address'],
                                                                      browser_params['crawl_id'])
        browser_params['proxy'] = local_port

    # Gets the WebDriver, profile folder (i.e. where history/cookies are stored) and display pid (None if not headless)
    (driver, prof_folder, display_pid, browser_settings) = deploy_browser.deploy_browser(browser_params, crash_recovery)

    # Read the extension port -- if extension is enabled
    # TODO: This needs to be cleaner
    if browser_params['browser'] == 'firefox' and browser_params['extension']['enabled']:
        while not os.path.isfile(prof_folder + 'extension_port.txt'):
            time.sleep(0.01)
        with open(prof_folder + 'extension_port.txt', 'r') as f:
            port = f.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port))
    else:
        extension_socket = None

    # passes the profile folder, WebDriver pid and display pid back to the TaskManager
    # now, the TaskManager knows that the browser is successfully set up
    status_queue.put((prof_folder, int(driver.binary.process.pid), display_pid, browser_settings))
    browser_params['profile_path'] = prof_folder

    # starts accepting arguments until told to die
    while True:
        # no command for now -> sleep to avoid pegging CPU on blocking get
        if command_queue.empty():
            time.sleep(0.001)
            continue

        # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
        command = command_queue.get()
        print "EXECUTING COMMAND: " + str(command)

        # attempts to perform an action and return an OK signal
        # if command fails for whatever reason, tell the TaskManager to kill and restart its worker processes
        try:
            command_executor.execute_command(command, driver, proxy_site_queue, browser_settings, browser_params, extension_socket)
            status_queue.put("OK")
        except Exception as ex:
            print "CRASH IN DRIVER ORACLE:" + str(ex) + " RESTARTING BROWSER MANAGER"
            status_queue.put("FAILED")
            break
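
The TaskManager side of this command loop is not part of the example. A minimal sketch of how a command might be dispatched and its status collected, assuming the same two queues (the helper name is hypothetical):

def dispatch_command(command_queue, status_queue, command, timeout=60):
    # Put a command tuple on the worker's queue and block for the
    # "OK"/"FAILED" reply; raises Queue.Empty if the worker hangs.
    command_queue.put(command)  # e.g. ('GET', 'http://example.com', 60)
    status = status_queue.get(True, timeout)
    return status == "OK"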
Example 7
    def _launch_aggregators(self):
        """Launch the necessary data aggregators"""
        if self.manager_params["output_format"] == "local":
            self.data_aggregator = LocalAggregator.LocalAggregator(
                self.manager_params, self.browser_params)
        elif self.manager_params["output_format"] == "s3":
            self.data_aggregator = S3Aggregator.S3Aggregator(
                self.manager_params, self.browser_params)
        else:
            raise Exception("Unrecognized output format: %s" %
                            self.manager_params["output_format"])
        self.data_aggregator.launch()
        self.manager_params[
            'aggregator_address'] = self.data_aggregator.listener_address

        # open connection to aggregator for saving crawl details
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])
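
A hypothetical `manager_params` fragment showing the field this launcher reads; only 'local' and 's3' are accepted, anything else raises:

manager_params = {
    'output_format': 'local',  # selects LocalAggregator; 's3' selects S3Aggregator
}
# after _launch_aggregators() returns, manager_params['aggregator_address']
# holds the (address, port) pair of the aggregator's listener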
Example 8
        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with `scheme` and save in table `table_name` """
            driver = kwargs['driver']
            manager_params = kwargs['manager_params']
            link_urls = [
                x
                for x in (element.get_attribute("href")
                          for element in driver.find_elements_by_tag_name('a'))
                if x.startswith(scheme + '://')
            ]
            current_url = driver.current_url

            sock = clientsocket()
            sock.connect(*manager_params['aggregator_address'])

            query = ("CREATE TABLE IF NOT EXISTS %s ("
                     "top_url TEXT, link TEXT);" % table_name)
            sock.send(("create_table", query))

            for link in link_urls:
                query = (table_name, {"top_url": current_url, "link": link})
                sock.send(query)
            sock.close()
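
A hypothetical example of how a custom command like `collect_links` might be scheduled, assuming OpenWPM's CommandSequence/TaskManager interface (not shown in these examples):

# schedule the custom function after a page visit; `manager` is a TaskManager
command_sequence = CommandSequence.CommandSequence('http://example.com')
command_sequence.get(sleep=3, timeout=60)
command_sequence.run_custom_function(collect_links, ('page_links', 'http'))
manager.execute_command_sequence(command_sequence)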
Example 9
def dump_flash_cookies(start_time, visit_id, webdriver, browser_params,
                       manager_params):
    """ Save newly changed Flash LSOs to database

    We determine which LSOs to save by the `start_time` timestamp.
    This timestamp should be taken prior to the `get` call that
    creates these changes.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Flash cookies
    flash_cookies = get_flash_cookies(start_time)
    for cookie in flash_cookies:
        data = cookie._asdict()
        data["crawl_id"] = browser_params["crawl_id"]
        data["visit_id"] = visit_id
        sock.send(("flash_cookies", data))

    # Close connection to db
    sock.close()
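
Each `sock.send` above ships a (table, record) tuple to the DataAggregator. A minimal sketch of how a receiving aggregator might turn one into a parameterized INSERT (an assumption; the real aggregator is not shown):

def handle_record(db, table, data):
    # Build an INSERT whose columns mirror the record dict's keys;
    # assumes the destination table defines matching columns.
    cols = sorted(data.keys())
    statement = "INSERT INTO %s (%s) VALUES (%s)" % (
        table, ",".join(cols), ",".join("?" * len(cols)))
    db.execute(statement, [data[c] for c in cols])
    db.commit()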
Example 10
def BrowserManager(command_queue, status_queue, browser_params, manager_params, crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.
    """
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the proxy
        proxy_site_queue = None  # used to pass the current site down to the proxy
        if browser_params['proxy']:
            (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(browser_params,
                                                                          manager_params,
                                                                          status_queue)
            browser_params['proxy'] = local_port
        status_queue.put(('STATUS', 'Proxy Ready', 'READY'))

        # Start the virtualdisplay (if necessary), webdriver, and browser
        (driver, prof_folder, browser_settings) = deploy_browser.deploy_browser(status_queue, browser_params, manager_params, crash_recovery)

        # Read the extension port -- if extension is enabled
        # TODO: This needs to be cleaner
        if browser_params['browser'] == 'firefox' and browser_params['extension_enabled']:
            logger.debug("BROWSER %i: Looking for extension port information in %s" % (browser_params['crawl_id'], prof_folder))
            while not os.path.isfile(prof_folder + 'extension_port.txt'):
                time.sleep(0.1)
            time.sleep(0.5)
            with open(prof_folder + 'extension_port.txt', 'r') as f:
                port = f.read().strip()
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        # signals the TaskManager that the browser is successfully set up
        status_queue.put(('STATUS', 'Browser Ready', 'READY'))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep to avoid pegging CPU on blocking get
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" % (browser_params['crawl_id'], str(command)))
            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, tell the TaskManager to kill and restart its worker processes
            command_executor.execute_command(command,
                                             driver,
                                             proxy_site_queue,
                                             browser_settings,
                                             browser_params,
                                             manager_params,
                                             extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', cPickle.dumps(err_info)))
        return
    except Exception as e:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info("BROWSER %i: Crash in driver, restarting browser manager \n %s" % (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
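
A sketch of how the parent might consume these status tuples, assuming the pickled exc_info round-trips (which requires traceback pickling support, e.g. tblib; the helper name is hypothetical):

import cPickle  # Python 2, matching the example above

def interpret_status(status_queue):
    tag, payload = status_queue.get()
    if tag == 'CRITICAL':
        # payload is the child's pickled sys.exc_info(); surface the error
        raise cPickle.loads(payload)[1]
    elif tag == 'FAILED':
        return 'restart'  # non-fatal crash: recycle the browser manager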
Example 11
    def __init__(self, manager_params, browser_params, process_watchdog=False, task_description=None):
        # Make paths absolute in manager_params
        manager_params['data_directory'] = os.path.expanduser(manager_params['data_directory'])
        manager_params['log_directory'] = os.path.expanduser(manager_params['log_directory'])
        manager_params['database_name'] = os.path.join(manager_params['data_directory'],manager_params['database_name'])
        manager_params['log_file'] = os.path.join(manager_params['log_directory'],manager_params['log_file'])
        self.manager_params = manager_params
        
        # Flow control
        self.closing = False
        self.failure_flag = False
        self.threadlock = threading.Lock()
        self.failurecount = 0
        
        self.desc = task_description
        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        
        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()
        
        # open client socket
        self.sock = clientsocket()
        self.sock.connect(*self.manager_params['aggregator_address'])

        # update task table
        cur = self.db.cursor()
        cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc,))
        self.db.commit()
        self.task_id = cur.lastrowid
        
        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()
Example 12
    def __init__(self, manager_params, browser_params, process_watchdog=False):

        # Make paths absolute in manager_params
        for path in ['data_directory','log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(manager_params['data_directory'],manager_params['database_name'])
        manager_params['log_file'] = os.path.join(manager_params['log_directory'],manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception("Number of <browser_params> dicts is not the same as manager_params['num_browsers']")

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'), 'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(*self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])

        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()
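
The visit-id resume logic above (the SELECT MAX block) can be collapsed into a single query; an equivalent drop-in sketch:

        cur.execute("SELECT COALESCE(MAX(visit_id), 0) + 1 FROM site_visits")
        self.next_visit_id = cur.fetchone()[0]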
Example 13
def BrowserManager(command_queue, status_queue, browser_params,
                   crash_recovery):
    logger = loggingclient(*browser_params['logger_address'])

    # Start the proxy
    proxy_site_queue = None  # used to pass the current site down to the proxy
    if browser_params['proxy']:
        (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(
            browser_params['aggregator_address'],
            browser_params['logger_address'], browser_params['crawl_id'])
        browser_params['proxy'] = local_port

    # Start the virtualdisplay (if necessary), webdriver, and browser
    (driver, prof_folder,
     browser_settings) = deploy_browser.deploy_browser(status_queue,
                                                       browser_params,
                                                       crash_recovery)

    # Read the extension port -- if extension is enabled
    # TODO: This needs to be cleaner
    if browser_params['browser'] == 'firefox' and browser_params['extension'][
            'enabled']:
        logger.debug(
            "BROWSER %i: Looking for extension port information in %s" %
            (browser_params['crawl_id'], prof_folder))
        while not os.path.isfile(prof_folder + 'extension_port.txt'):
            time.sleep(0.1)
        time.sleep(0.5)
        with open(prof_folder + 'extension_port.txt', 'r') as f:
            port = f.read().strip()
        extension_socket = clientsocket()
        extension_socket.connect('127.0.0.1', int(port))
    else:
        extension_socket = None

    # signals the TaskManager that the browser is successfully set up
    status_queue.put('READY')
    browser_params['profile_path'] = prof_folder

    # starts accepting arguments until told to die
    while True:
        # no command for now -> sleep to avoid pegging CPU on blocking get
        if command_queue.empty():
            time.sleep(0.001)
            continue

        # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
        command = command_queue.get()
        logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                    (browser_params['crawl_id'], str(command)))

        # attempts to perform an action and return an OK signal
        # if command fails for whatever reason, tell the TaskManager to kill and restart its worker processes
        try:
            command_executor.execute_command(command, driver, proxy_site_queue,
                                             browser_settings, browser_params,
                                             extension_socket)
            status_queue.put("OK")
        except Exception as e:
            logger.info(
                "BROWSER %i: Crash in driver, restarting browser manager \n %s \n %s"
                % (browser_params['crawl_id'], str(type(e)), str(e)))
            status_queue.put("FAILED")
            break
Example 14
    def __init__(self,
                 db_path,
                 browser_params,
                 num_browsers,
                 log_file='~/openwpm.log',
                 process_watchdog=False,
                 task_description=None):
        # Flow control
        self.closing = False
        self.failure_flag = False
        self.threadlock = threading.Lock()
        self.failurecount = 0

        # sets up the information needed to write to the database
        self.desc = task_description
        self.db_path = db_path

        self.log_file = log_file
        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'),
                  'r') as f:
            self.db.executescript(f.read())

        # prepares browser settings
        self.num_browsers = num_browsers
        # special case: for singleton dictionary, we perform deep copies so that number of dicts is <num_browsers>
        if type(browser_params) is not list:
            browser_params = [
                copy.deepcopy(browser_params) for i in xrange(0, num_browsers)
            ]

        if len(browser_params) != num_browsers:
            raise Exception(
                "Number of browser parameter dictionaries is not the same as <num_browsers>"
            )

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.logger_address = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(*self.logger_address)

        # sets up the DataAggregator + associated queues
        self.aggregator_status_queue = None  # queue used for sending graceful KILL command to DataAggregator
        self.data_aggregator = self._launch_data_aggregator()
        # socket location: (address, port)
        self.aggregator_address = self.aggregator_status_queue.get()

        # open client socket
        self.sock = clientsocket()
        self.sock.connect(self.aggregator_address[0],
                          self.aggregator_address[1])

        # update task table
        cur = self.db.cursor()
        cur.execute("INSERT INTO task (description) VALUES (?)", (self.desc, ))
        self.db.commit()
        self.task_id = cur.lastrowid

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(
            browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()
Example 15
def BrowserManager(command_queue, status_queue, browser_params, manager_params,
                   crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.
    """
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the proxy
        proxy_site_queue = None  # used to pass the current site down to the proxy
        if browser_params['proxy']:
            (local_port, proxy_site_queue) = deploy_mitm_proxy.init_proxy(
                browser_params, manager_params, status_queue)
            browser_params['proxy'] = local_port
        status_queue.put(('STATUS', 'Proxy Ready', 'READY'))

        # Start the virtualdisplay (if necessary), webdriver, and browser
        (driver, prof_folder,
         browser_settings) = deploy_browser.deploy_browser(
             status_queue, browser_params, manager_params, crash_recovery)

        # Read the extension port -- if extension is enabled
        # TODO: This needs to be cleaner
        if browser_params['browser'] == 'firefox' and browser_params[
                'extension_enabled']:
            logger.debug(
                "BROWSER %i: Looking for extension port information in %s" %
                (browser_params['crawl_id'], prof_folder))
            while not os.path.isfile(prof_folder + 'extension_port.txt'):
                time.sleep(0.1)
            time.sleep(0.5)
            with open(prof_folder + 'extension_port.txt', 'r') as f:
                port = f.read().strip()
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        # signals the TaskManager that the browser is successfully set up
        status_queue.put(('STATUS', 'Browser Ready', 'READY'))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep to avoid pegging CPU on blocking get
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))
            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, tell the TaskManager to kill and restart its worker processes
            command_executor.execute_command(command, driver, proxy_site_queue,
                                             browser_settings, browser_params,
                                             manager_params, extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                    (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', cPickle.dumps(err_info)))
        return
    except Exception as e:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info(
            "BROWSER %i: Crash in driver, restarting browser manager \n %s" %
            (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
Example 16
    def __init__(self, manager_params, browser_params, process_watchdog=False):

        # Make paths absolute in manager_params
        for path in ['data_directory', 'log_directory']:
            if manager_params[path] is not None:
                manager_params[path] = os.path.expanduser(manager_params[path])
        manager_params['database_name'] = os.path.join(
            manager_params['data_directory'], manager_params['database_name'])
        manager_params['log_file'] = os.path.join(
            manager_params['log_directory'], manager_params['log_file'])
        manager_params['screenshot_path'] = os.path.join(
            manager_params['data_directory'], 'screenshots')
        manager_params['source_dump_path'] = os.path.join(
            manager_params['data_directory'], 'sources')
        self.manager_params = manager_params

        # Create data directories if they do not exist
        if not os.path.exists(manager_params['screenshot_path']):
            os.makedirs(manager_params['screenshot_path'])
        if not os.path.exists(manager_params['source_dump_path']):
            os.makedirs(manager_params['source_dump_path'])

        # check size of parameter dictionary
        self.num_browsers = manager_params['num_browsers']
        if len(browser_params) != self.num_browsers:
            raise Exception(
                "Number of <browser_params> dicts is not the same as manager_params['num_browsers']"
            )

        # Flow control
        self.closing = False
        self.failure_status = None
        self.threadlock = threading.Lock()
        self.failurecount = 0
        if manager_params['failure_limit'] is not None:
            self.failure_limit = manager_params['failure_limit']
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        self.process_watchdog = process_watchdog

        # sets up the crawl data database
        db_path = manager_params['database_name']
        if not os.path.exists(manager_params['data_directory']):
            os.mkdir(manager_params['data_directory'])
        self.db = sqlite3.connect(db_path)
        with open(os.path.join(os.path.dirname(__file__), 'schema.sql'),
                  'r') as f:
            self.db.executescript(f.read())
        self.db.commit()

        # sets up logging server + connect a client
        self.logging_status_queue = None
        self.loggingserver = self._launch_loggingserver()
        # socket location: (address, port)
        self.manager_params['logger_address'] = self.logging_status_queue.get()
        self.logger = MPLogger.loggingclient(
            *self.manager_params['logger_address'])

        # Mark if LDBAggregator is needed (if js is enabled on any browser)
        self.ldb_enabled = False
        for params in browser_params:
            if params['save_javascript'] or params['save_javascript_proxy']:
                self.ldb_enabled = True
                break

        # Initialize the data aggregators
        self._launch_aggregators()

        # open client socket
        self.sock = clientsocket(serialization='dill')
        self.sock.connect(*self.manager_params['aggregator_address'])
        self._save_configuration(browser_params)

        # read the last used site visit id
        cur = self.db.cursor()
        cur.execute("SELECT MAX(visit_id) from site_visits")
        last_visit_id = cur.fetchone()[0]
        if last_visit_id is None:
            last_visit_id = 0
        self.next_visit_id = last_visit_id + 1

        # sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(
            browser_params)  # List of the Browser(s)
        self._launch_browsers()

        # start the manager watchdog
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.start()
Example 17
def BrowserManager(command_queue, status_queue, browser_params, manager_params,
                   crash_recovery):
    """
    The BrowserManager function runs in each new browser process.
    It is responsible for listening to command instructions from
    the Task Manager and passing them to the command module to execute
    and interface with Selenium. Command execution status is sent back
    to the TaskManager.
    """
    try:
        logger = loggingclient(*manager_params['logger_address'])

        # Start the virtualdisplay (if necessary), webdriver, and browser
        driver, prof_folder, browser_settings = deploy_browser.deploy_browser(
            status_queue, browser_params, manager_params, crash_recovery)
        if prof_folder[-1] != '/':
            prof_folder += '/'

        # Read the extension port -- if extension is enabled
        # TODO: Initial communication from extension to TM should use sockets
        if (browser_params['browser'] == 'firefox'
                and browser_params['extension_enabled']):
            logger.debug("BROWSER %i: Looking for extension port information "
                         "in %s" % (browser_params['crawl_id'], prof_folder))
            elapsed = 0
            port = None
            ep_filename = os.path.join(prof_folder, 'extension_port.txt')
            while elapsed < 5:
                try:
                    with open(ep_filename, 'rt') as f:
                        port = int(f.read().strip())
                        break
                except IOError as e:
                    if e.errno != errno.ENOENT:
                        raise
                time.sleep(0.1)
                elapsed += 0.1
            if port is None:
                # try one last time, allowing all exceptions to propagate
                with open(ep_filename, 'rt') as f:
                    port = int(f.read().strip())

            logger.debug("BROWSER %i: Connecting to extension on port %i" %
                         (browser_params['crawl_id'], port))
            extension_socket = clientsocket(serialization='json')
            extension_socket.connect('127.0.0.1', int(port))
        else:
            extension_socket = None

        logger.debug("BROWSER %i: BrowserManager ready." %
                     browser_params['crawl_id'])

        # passes the profile folder, WebDriver pid and display pid back to the
        # TaskManager to signal a successful startup
        status_queue.put(('STATUS', 'Browser Ready', (prof_folder, 'READY')))
        browser_params['profile_path'] = prof_folder

        # starts accepting arguments until told to die
        while True:
            # no command for now -> sleep to avoid pegging CPU on blocking get
            if command_queue.empty():
                time.sleep(0.001)
                continue

            # reads in the command tuple of form:
            # (command, arg0, arg1, arg2, ..., argN) where N is variable
            command = command_queue.get()
            logger.info("BROWSER %i: EXECUTING COMMAND: %s" %
                        (browser_params['crawl_id'], str(command)))
            # attempts to perform an action and return an OK signal
            # if command fails for whatever reason, tell the TaskManager to
            # kill and restart its worker processes
            command_executor.execute_command(command, driver, browser_settings,
                                             browser_params, manager_params,
                                             extension_socket)
            status_queue.put("OK")

    except (ProfileLoadError, BrowserConfigError, AssertionError) as e:
        logger.info("BROWSER %i: %s thrown, informing parent and raising" %
                    (browser_params['crawl_id'], e.__class__.__name__))
        err_info = sys.exc_info()
        status_queue.put(('CRITICAL', pickle.dumps(err_info)))
        return
    except Exception:
        excp = traceback.format_exception(*sys.exc_info())
        logger.info("BROWSER %i: Crash in driver, restarting browser manager "
                    "\n %s" % (browser_params['crawl_id'], ''.join(excp)))
        status_queue.put(('FAILED', None))
        return
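
The port-file handshake above is a general wait-for-file-with-timeout pattern. A standalone version of the same logic (the helper name is hypothetical):

import errno
import time

def wait_for_int_file(path, timeout=5.0, poll=0.1):
    # Poll until `path` exists and parses as an int; after `timeout`
    # seconds, try once more and let any exception propagate.
    elapsed = 0.0
    while elapsed < timeout:
        try:
            with open(path, 'rt') as f:
                return int(f.read().strip())
        except IOError as e:
            if e.errno != errno.ENOENT:
                raise
        time.sleep(poll)
        elapsed += poll
    with open(path, 'rt') as f:
        return int(f.read().strip())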