예제 #1
0
    def shutdown_browser(self, during_init):
        """
        Run the shutdown sequence for this Browser/BrowserManager.

        In order: wait (with a bounded timeout) for the command thread, kill
        the BrowserManager, archive the profile if an archive directory is
        configured, and finally delete the on-disk profile directory.

        <during_init> when true, the profile is never archived
        """
        tag = "BROWSER %i: " % self.crawl_id

        # Wait for the command thread, but never indefinitely
        worker = self.command_thread
        if worker is not None:
            self.logger.debug(tag + "Joining command thread")
            join_started = time.time()
            # 10 seconds on top of the current command timeout, else a flat 60
            wait = 60 if self.current_timeout is None else self.current_timeout + 10
            worker.join(wait)
            self.logger.debug(tag + "%f seconds to join command thread" % (time.time() - join_started))

        # Take down the BrowserManager process
        self.logger.debug(tag + "Killing browser manager...")
        self.kill_browser_manager()

        # Optionally archive the profile before it is deleted below
        archive_dir = self.browser_params['profile_archive_dir']
        self.logger.debug(tag + "during_init=%s | profile_archive_dir=%s" % (str(during_init), archive_dir))
        if not during_init and archive_dir is not None:
            self.logger.debug(tag + "Archiving browser profile directory to %s" % (archive_dir,))
            # flash data is saved only when 'disable_flash' is exactly False
            keep_flash = self.browser_params['disable_flash'] is False
            profile_commands.dump_profile(
                self.current_profile_path,
                self.manager_params,
                self.browser_params,
                archive_dir,
                close_webdriver=False,
                browser_settings=self.browser_settings,
                compress=True,
                save_flash=keep_flash)

        # Remove the profile directory; removal errors are ignored
        profile_dir = self.current_profile_path
        if profile_dir is not None:
            shutil.rmtree(profile_dir, ignore_errors=True)
예제 #2
0
    def launch_browser_manager(self, spawn_timeout=30):
        """
        Spawns a BrowserManager process and waits for it to report a launched browser.

        If self.current_profile_path is already set, the launch is treated as
        a crash recovery: the old profile is dumped to a temporary directory,
        self.browser_params is pointed at that dump, and both the dump and the
        old profile directory are removed once a new browser is running.

        <spawn_timeout> is the number of seconds to wait for each
        BrowserManager to report success; a manager that has not reported
        within that time is killed and a new one is spawned. Attempts repeat
        until one succeeds (there is no attempt limit).

        Returns the started BrowserManager Process. On return
        self.command_queue, self.status_queue, self.current_profile_path,
        self.browser_pid, self.display_pid, self.browser_settings and
        self.is_fresh have been updated.
        """

        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if self.current_profile_path is not None:
            crashed_profile_path = self.current_profile_path
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp() + "/"
            profile_commands.dump_profile(crashed_profile_path, tempdir, close_webdriver=False,
                                          browser_settings=self.browser_settings, full_profile=True)
            self.browser_params['profile_tar'] = tempdir  # make sure browser loads crashed profile
            self.browser_params['random_attributes'] = False  # don't re-randomize attributes
            crash_recovery = True
        else:
            tempdir = None
            crashed_profile_path = None
            crash_recovery = False

        # keep trying to spawn a BrowserManager until we have a successful launch within the timeout limit
        browser_manager = None
        successful_spawn = False
        while not successful_spawn:
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params, crash_recovery)
            browser_manager = Process(target=BrowserManager, args=args)
            browser_manager.start()

            # waits for BrowserManager to send success tuple i.e.
            # (current_profile_path, browser pid, display pid, browser settings).
            # The wait is bounded by a wall-clock deadline rather than by a
            # count of 1ms sleeps: a sleep may last longer than requested, so
            # counting iterations overshoots <spawn_timeout>.
            deadline = time.time() + float(spawn_timeout)
            while time.time() < deadline:
                # no status for now -> sleep to avoid pegging CPU on blocking get
                if self.status_queue.empty():
                    time.sleep(0.001)
                    continue

                (self.current_profile_path, self.browser_pid, self.display_pid, self.browser_settings) \
                    = self.status_queue.get()
                successful_spawn = True
                break

            # kill the BrowserManager if it failed to start up the browser
            if not successful_spawn:
                os.kill(browser_manager.pid, signal.SIGKILL)

        # if recovering from a crash, new browser has a new profile dir
        # so the crashed dir and temporary tar dump can be cleaned up
        if tempdir is not None:
            subprocess.call(["rm", "-r", tempdir])
        if crashed_profile_path is not None:
            subprocess.call(["rm", "-r", crashed_profile_path])

        self.is_fresh = crashed_profile_path is None  # browser is fresh iff it starts from a blank profile
        return browser_manager
예제 #3
0
    def shutdown_browser(self, during_init):
        """
        Closing tasks for this Browser/BrowserManager.

        Joins the command thread (bounded wait), kills the BrowserManager,
        archives the browser profile when an archive directory is configured
        and <during_init> is false, then removes the profile directory.
        """
        def debug(message):
            # every message carries the same "BROWSER <id>: " prefix
            self.logger.debug("BROWSER %i: %s" % (self.crawl_id, message))

        # --- command thread ---
        if self.command_thread is not None:
            debug("Joining command thread")
            began = time.time()
            if self.current_timeout is None:
                join_timeout = 60
            else:
                # give the running command its timeout plus 10 seconds
                join_timeout = self.current_timeout + 10
            self.command_thread.join(join_timeout)
            debug("%f seconds to join command thread" % (time.time() - began))

        # --- BrowserManager process ---
        debug("Killing browser manager...")
        self.kill_browser_manager()

        # --- profile archive (only if requested, never during init) ---
        target_dir = self.browser_params['profile_archive_dir']
        debug("during_init=%s | profile_archive_dir=%s" % (str(during_init), target_dir))
        if not during_init:
            if target_dir is not None:
                debug("Archiving browser profile directory to %s" % (target_dir,))
                dump_options = dict(
                    close_webdriver=False,
                    browser_settings=self.browser_settings,
                    compress=True,
                    # flash data is saved only when 'disable_flash' is exactly False
                    save_flash=self.browser_params['disable_flash'] is False)
                profile_commands.dump_profile(self.current_profile_path,
                                              self.manager_params,
                                              self.browser_params,
                                              target_dir,
                                              **dump_options)

        # --- temporary files ---
        if self.current_profile_path is not None:
            shutil.rmtree(self.current_profile_path, ignore_errors=True)
예제 #4
0
    def launch_browser_manager(self):
        """
        Spawns a BrowserManager process and reads its launch status messages.

        If ENABLE_CRASH_RECOVERY is set and self.current_profile_path is not
        None, the existing profile is dumped to a temporary directory and
        self.browser_params is pointed at that dump so the new browser loads
        it. self.is_fresh is set to False in that case and True otherwise.

        Up to self._UNSUCCESSFUL_SPAWN_LIMIT spawn attempts are made. Each
        status message is awaited for at most self._SPAWN_TIMEOUT seconds.
        After every failed attempt the spawned manager is killed and the
        profile directory it reported (if any) is removed.

        Returns True if a browser was spawned, False otherwise. On success
        self.current_profile_path is replaced by the new profile path and the
        previous profile directory and the temporary dump are deleted.

        A 'CRITICAL' status message is re-raised in this process via reraise.
        """
        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if ENABLE_CRASH_RECOVERY and self.current_profile_path is not None:
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp() + "/"
            profile_commands.dump_profile(
                self.current_profile_path,
                self.manager_params,
                self.browser_params,
                tempdir,
                close_webdriver=False,
                browser_settings=self.browser_settings)
            self.browser_params[
                'profile_tar'] = tempdir  # make sure browser loads crashed profile
            self.browser_params[
                'random_attributes'] = False  # don't re-randomize attributes
            crash_recovery = True
        else:
            tempdir = None
            crash_recovery = False
        self.is_fresh = not crash_recovery

        # Try to spawn the browser within the timelimit
        unsuccessful_spawns = 0
        success = False

        def check_queue(launch_status):
            """Read one (tag, name, payload) message; record STATUS names in launch_status."""
            result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
            if result[0] == 'STATUS':
                launch_status[result[1]] = True
                return result[2]
            elif result[0] == 'CRITICAL':
                # NOTE: the payload is unpickled; it arrives over status_queue
                # from the BrowserManager process started below
                reraise(*cPickle.loads(result[1]))
            elif result[0] == 'FAILED':
                raise BrowserCrashError(
                    'Browser spawn returned failure status')

        while not success and unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT:
            self.logger.debug("BROWSER %i: Spawn attempt %i " %
                              (self.crawl_id, unsuccessful_spawns))
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params,
                    self.manager_params, crash_recovery)
            self.browser_manager = Process(target=BrowserManager, args=args)
            self.browser_manager.daemon = True
            self.browser_manager.start()

            # Read success status of browser manager
            launch_status = dict()
            try:
                check_queue(launch_status)  # proxy enabled (if necessary)
                spawned_profile_path = check_queue(
                    launch_status)  # selenium profile created
                check_queue(launch_status)  # profile tar loaded (if necessary)
                (self.display_pid, self.display_port) = check_queue(
                    launch_status)  # Display launched
                check_queue(launch_status)  # browser launch attempted
                (self.browser_pid, self.browser_settings) = check_queue(
                    launch_status)  # Browser launched
                if check_queue(launch_status) != 'READY':
                    self.logger.error(
                        "BROWSER %i: Mismatch of status queue return values, trying again..."
                        % self.crawl_id)
                    unsuccessful_spawns += 1
                    # The next attempt overwrites self.browser_manager, so this
                    # manager and its profile must be cleaned up now or they
                    # are leaked.
                    self.kill_browser_manager()
                    shutil.rmtree(spawned_profile_path, ignore_errors=True)
                    continue
                success = True
            except (EmptyQueue, BrowserCrashError):
                unsuccessful_spawns += 1
                status_strings = [
                    'Proxy Ready', 'Profile Created', 'Profile Tar', 'Display',
                    'Launch Attempted', 'Browser Launched', 'Browser Ready'
                ]
                # report which launch stages were reached before the failure
                error_string = ''.join(
                    " | %s: %s " % (string, launch_status.get(string, False))
                    for string in status_strings)
                self.logger.error("BROWSER %i: Spawn unsuccessful %s" %
                                  (self.crawl_id, error_string))
                self.kill_browser_manager()
                # spawned_profile_path is only bound for this attempt once the
                # 'Profile Created' status has been received
                if 'Profile Created' in launch_status:
                    shutil.rmtree(spawned_profile_path, ignore_errors=True)

        # If the browser spawned successfully, we should update the
        # current profile path class variable and clean up the tempdir
        # and previous profile path.
        if success:
            self.logger.debug("BROWSER %i: Browser spawn sucessful!" %
                              self.crawl_id)
            previous_profile_path = self.current_profile_path
            self.current_profile_path = spawned_profile_path
            if previous_profile_path is not None:
                shutil.rmtree(previous_profile_path, ignore_errors=True)
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)

        return success
예제 #5
0
    def launch_browser_manager(self, spawn_timeout=30):
        """
        Sets up the BrowserManager and records the profile path, browser pid,
        display pid and browser settings it reports.

        When self.current_profile_path is not None the launch is a crash
        recovery: the old profile is dumped into a fresh temporary directory,
        self.browser_params['profile_tar'] is pointed at it and
        'random_attributes' is switched off. The dump and the old profile
        directory are removed after a successful spawn.

        <spawn_timeout> is the timeout, in seconds, for each BrowserManager
        to report success. A manager that misses it is killed and another is
        spawned; this repeats until a spawn succeeds.

        Returns the BrowserManager Process that launched successfully.
        """

        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if self.current_profile_path is not None:
            crashed_profile_path = self.current_profile_path
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp() + "/"
            profile_commands.dump_profile(
                crashed_profile_path,
                tempdir,
                close_webdriver=False,
                browser_settings=self.browser_settings,
                full_profile=True)
            self.browser_params[
                'profile_tar'] = tempdir  # make sure browser loads crashed profile
            self.browser_params[
                'random_attributes'] = False  # don't re-randomize attributes
            crash_recovery = True
        else:
            tempdir = None
            crashed_profile_path = None
            crash_recovery = False

        # keep trying to spawn a BrowserManager until we have a successful launch within the timeout limit
        browser_manager = None
        successful_spawn = False
        while not successful_spawn:
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params,
                    crash_recovery)
            browser_manager = Process(target=BrowserManager, args=args)
            browser_manager.start()

            # waits for BrowserManager to send the success tuple. Elapsed time
            # is measured against a deadline so that <spawn_timeout> is
            # honoured even when each short sleep runs longer than requested.
            give_up_at = time.time() + float(spawn_timeout)
            while time.time() < give_up_at:
                # no status for now -> sleep to avoid pegging CPU on blocking get
                if self.status_queue.empty():
                    time.sleep(0.001)
                    continue

                (self.current_profile_path, self.browser_pid, self.display_pid, self.browser_settings) \
                    = self.status_queue.get()
                successful_spawn = True
                break

            # kill the BrowserManager if it failed to start up the browser
            if not successful_spawn:
                os.kill(browser_manager.pid, signal.SIGKILL)

        # if recovering from a crash, new browser has a new profile dir
        # so the crashed dir and temporary tar dump can be cleaned up
        if tempdir is not None:
            subprocess.call(["rm", "-r", tempdir])
        if crashed_profile_path is not None:
            subprocess.call(["rm", "-r", crashed_profile_path])

        self.is_fresh = crashed_profile_path is None  # browser is fresh iff it starts from a blank profile
        return browser_manager
예제 #6
0
    def launch_browser_manager(self):
        """
        Sets up the BrowserManager and reads the status messages it sends
        while launching (profile path, display pid/port, browser pid/settings).

        If self.current_profile_path is not None, the existing profile is
        first dumped to a temporary directory and self.browser_params is
        updated so the new browser loads that dump; self.is_fresh is then
        False, otherwise True.

        At most self._UNSUCCESSFUL_SPAWN_LIMIT attempts are made and each
        status message is awaited for self._SPAWN_TIMEOUT seconds. A failed
        attempt kills the spawned manager and removes the profile directory it
        created.

        Returns True on a successful spawn, in which case
        self.current_profile_path is switched to the new profile and the old
        profile directory and temporary dump are removed. Returns False if
        every attempt failed. A 'CRITICAL' message is re-raised via reraise.
        """
        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if self.current_profile_path is not None:
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp() + "/"
            profile_commands.dump_profile(self.current_profile_path,
                                          self.manager_params,
                                          self.browser_params,
                                          tempdir,
                                          close_webdriver=False,
                                          browser_settings=self.browser_settings)
            self.browser_params['profile_tar'] = tempdir  # make sure browser loads crashed profile
            self.browser_params['random_attributes'] = False  # don't re-randomize attributes
            crash_recovery = True
        else:
            tempdir = None
            crash_recovery = False
        self.is_fresh = not crash_recovery

        # Try to spawn the browser within the timelimit
        unsuccessful_spawns = 0
        success = False

        def check_queue(launch_status):
            """Fetch the next status message and return its payload.

            'STATUS' messages mark their name as reached in launch_status.
            """
            result = self.status_queue.get(True, self._SPAWN_TIMEOUT)
            if result[0] == 'STATUS':
                launch_status[result[1]] = True
                return result[2]
            elif result[0] == 'CRITICAL':
                # NOTE: unpickles data received over status_queue from the
                # BrowserManager process spawned below
                reraise(*cPickle.loads(result[1]))
            elif result[0] == 'FAILED':
                raise BrowserCrashError('Browser spawn returned failure status')

        while not success and unsuccessful_spawns < self._UNSUCCESSFUL_SPAWN_LIMIT:
            self.logger.debug("BROWSER %i: Spawn attempt %i " % (self.crawl_id, unsuccessful_spawns))
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params, self.manager_params, crash_recovery)
            self.browser_manager = Process(target=BrowserManager, args=args)
            self.browser_manager.daemon = True
            self.browser_manager.start()

            # Read success status of browser manager
            launch_status = dict()
            try:
                check_queue(launch_status)  # proxy enabled (if necessary)
                spawned_profile_path = check_queue(launch_status)  # selenium profile created
                check_queue(launch_status)  # profile tar loaded (if necessary)
                (self.display_pid, self.display_port) = check_queue(launch_status)  # Display launched
                check_queue(launch_status)  # browser launch attempted
                (self.browser_pid, self.browser_settings) = check_queue(launch_status)  # Browser launched
                if check_queue(launch_status) != 'READY':
                    self.logger.error("BROWSER %i: Mismatch of status queue return values, trying again..." % self.crawl_id)
                    unsuccessful_spawns += 1
                    # Clean up before retrying: the retry replaces
                    # self.browser_manager, after which this process and its
                    # profile directory could no longer be reached.
                    self.kill_browser_manager()
                    shutil.rmtree(spawned_profile_path, ignore_errors=True)
                    continue
                success = True
            except (EmptyQueue, BrowserCrashError):
                unsuccessful_spawns += 1
                status_strings = ['Proxy Ready', 'Profile Created', 'Profile Tar', 'Display',
                                  'Launch Attempted', 'Browser Launched', 'Browser Ready']
                # one " | <stage>: <reached> " entry per launch stage
                error_string = ''.join(" | %s: %s " % (string, launch_status.get(string, False))
                                       for string in status_strings)
                self.logger.error("BROWSER %i: Spawn unsuccessful %s" % (self.crawl_id, error_string))
                self.kill_browser_manager()
                # the profile path is only known for this attempt once
                # 'Profile Created' was reported
                if 'Profile Created' in launch_status:
                    shutil.rmtree(spawned_profile_path, ignore_errors=True)

        # If the browser spawned successfully, we should update the
        # current profile path class variable and clean up the tempdir
        # and previous profile path.
        if success:
            self.logger.debug("BROWSER %i: Browser spawn sucessful!" % self.crawl_id)
            previous_profile_path = self.current_profile_path
            self.current_profile_path = spawned_profile_path
            if previous_profile_path is not None:
                shutil.rmtree(previous_profile_path, ignore_errors=True)
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)

        return success
예제 #7
0
    def launch_browser_manager(self, spawn_timeout=120):
        """
        Sets up the BrowserManager and reads the profile path, display
        pid/port and browser pid/settings it reports while launching.

        If self.current_profile_path is not None the launch is a crash
        recovery: that profile is dumped to a temporary directory and
        self.browser_params is updated so the new browser loads the dump.

        <spawn_timeout> is the timeout, in seconds, for each status message
        from the BrowserManager. Up to 4 spawn attempts are made.

        Returns True if a browser was spawned and False otherwise.
        self.current_profile_path and self.is_fresh are only updated on
        success; after a failed launch self.current_profile_path still refers
        to the profile it held on entry (or None), never to the deleted
        profile of a failed attempt.
        """
        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if self.current_profile_path is not None:
            crashed_profile_path = self.current_profile_path
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp() + "/"
            profile_commands.dump_profile(
                crashed_profile_path,
                tempdir,
                close_webdriver=False,
                browser_settings=self.browser_settings,
                full_profile=True)
            self.browser_params[
                'profile_tar'] = tempdir  # make sure browser loads crashed profile
            self.browser_params[
                'random_attributes'] = False  # don't re-randomize attributes
            crash_recovery = True
        else:
            tempdir = None
            crashed_profile_path = None
            crash_recovery = False

        # Try to spawn the browser within the timelimit
        unsuccessful_spawns = 0
        success = False
        # Profile reported by the current attempt. Kept local until the spawn
        # succeeds so a failed attempt cannot clobber self.current_profile_path.
        spawned_profile_path = None
        while not success and unsuccessful_spawns < 4:
            self.logger.debug("BROWSER %i: Spawn attempt %i " %
                              (self.crawl_id, unsuccessful_spawns))
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params,
                    crash_recovery)
            self.browser_manager = Process(target=BrowserManager, args=args)
            self.browser_manager.daemon = True
            self.browser_manager.start()

            # Read success status of browser manager
            prof_done = disp_done = browser_done = launch_attempted = False
            try:
                spawned_profile_path = self.status_queue.get(
                    True, spawn_timeout)
                prof_done = True
                (self.display_pid, self.display_port) = self.status_queue.get(
                    True, spawn_timeout)
                disp_done = True
                # launch-attempted marker; its payload is not used
                self.status_queue.get(True, spawn_timeout)
                launch_attempted = True
                (self.browser_pid,
                 self.browser_settings) = self.status_queue.get(
                     True, spawn_timeout)
                browser_done = True
                if self.status_queue.get(True, spawn_timeout) != 'READY':
                    self.logger.error(
                        "BROWSER %i: Mismatch of status queue return values, trying again..."
                        % self.crawl_id)
                    unsuccessful_spawns += 1
                    # The retry replaces self.browser_manager, so this manager
                    # and its profile have to be cleaned up here.
                    self.kill_browser_manager()
                    if spawned_profile_path is not None:
                        shutil.rmtree(spawned_profile_path,
                                      ignore_errors=True)
                    continue
                success = True
            except EmptyQueue:
                unsuccessful_spawns += 1
                self.logger.error(
                    "BROWSER %i: Spawn unsuccessful | Profile: %s | Display: %s | Launch attempted: %s | Browser: %s"
                    % (self.crawl_id, str(prof_done), str(disp_done),
                       str(launch_attempted), str(browser_done)))
                self.kill_browser_manager()
                # Only remove a profile that this attempt actually reported;
                # without prof_done the path would belong to an earlier
                # attempt (already removed).
                if prof_done and spawned_profile_path is not None:
                    shutil.rmtree(spawned_profile_path,
                                  ignore_errors=True)

        # if recovering from a crash, new browser has a new profile dir
        # so the crashed dir and temporary tar dump can be cleaned up
        if success:
            self.logger.debug("BROWSER %i: Browser spawn sucessful!" %
                              self.crawl_id)
            self.current_profile_path = spawned_profile_path
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)
            if crashed_profile_path is not None:
                shutil.rmtree(crashed_profile_path, ignore_errors=True)

            self.is_fresh = crashed_profile_path is None  # browser is fresh iff it starts from a blank profile

        return success
예제 #8
0
    def launch_browser_manager(self, spawn_timeout=120):
        """
        Sets up the BrowserManager and reads, in order, the status messages
        it sends: profile path, profile tar step, display pid/port, launch
        attempt, browser pid/settings and the final 'READY'.

        If self.current_profile_path is not None the launch is a crash
        recovery: that profile is dumped to a temporary directory and
        self.browser_params is updated so the new browser loads the dump.

        <spawn_timeout> is the timeout, in seconds, for each status message
        from the BrowserManager. Up to 4 spawn attempts are made.

        Returns True if a browser was spawned and False otherwise.
        self.current_profile_path and self.is_fresh are only updated on
        success, so a failed launch leaves self.current_profile_path at the
        value it had on entry instead of at an already-deleted directory.
        """
        # if this is restarting from a crash, update the tar location
        # to be a tar of the crashed browser's history
        if self.current_profile_path is not None:
            crashed_profile_path = self.current_profile_path
            # tar contents of crashed profile to a temp dir
            tempdir = tempfile.mkdtemp() + "/"
            profile_commands.dump_profile(crashed_profile_path,
                                          self.manager_params,
                                          self.browser_params,
                                          tempdir,
                                          close_webdriver=False,
                                          browser_settings=self.browser_settings)
            self.browser_params['profile_tar'] = tempdir  # make sure browser loads crashed profile
            self.browser_params['random_attributes'] = False  # don't re-randomize attributes
            crash_recovery = True
        else:
            tempdir = None
            crashed_profile_path = None
            crash_recovery = False

        # Try to spawn the browser within the timelimit
        unsuccessful_spawns = 0
        success = False
        # Profile reported by the attempt in progress; only promoted to
        # self.current_profile_path once the whole launch has succeeded.
        spawned_profile_path = None
        while not success and unsuccessful_spawns < 4:
            self.logger.debug("BROWSER %i: Spawn attempt %i " % (self.crawl_id, unsuccessful_spawns))
            # Resets the command/status queues
            (self.command_queue, self.status_queue) = (Queue(), Queue())

            # builds and launches the browser_manager
            args = (self.command_queue, self.status_queue, self.browser_params, self.manager_params, crash_recovery)
            self.browser_manager = Process(target=BrowserManager, args=args)
            self.browser_manager.daemon = True
            self.browser_manager.start()

            # Read success status of browser manager
            prof_done = prof_tar_done = disp_done = browser_done = launch_attempted = False
            try:
                spawned_profile_path = self.status_queue.get(True, spawn_timeout)
                prof_done = True
                self.status_queue.get(True, spawn_timeout)  # profile tar step; payload unused
                prof_tar_done = True
                (self.display_pid, self.display_port) = self.status_queue.get(True, spawn_timeout)
                disp_done = True
                self.status_queue.get(True, spawn_timeout)  # launch attempted; payload unused
                launch_attempted = True
                (self.browser_pid, self.browser_settings) = self.status_queue.get(True, spawn_timeout)
                browser_done = True
                if self.status_queue.get(True, spawn_timeout) != 'READY':
                    self.logger.error("BROWSER %i: Mismatch of status queue return values, trying again..." % self.crawl_id)
                    unsuccessful_spawns += 1
                    # Kill this manager and drop its profile before retrying;
                    # the retry overwrites self.browser_manager and would
                    # otherwise leave both behind.
                    self.kill_browser_manager()
                    if spawned_profile_path is not None:
                        shutil.rmtree(spawned_profile_path, ignore_errors=True)
                    continue
                success = True
            except EmptyQueue:
                unsuccessful_spawns += 1
                self.logger.error("BROWSER %i: Spawn unsuccessful | Profile Created: %s | Profile Tar: %s | Display: %s | Launch attempted: %s | Browser: %s" %
                        (self.crawl_id, str(prof_done), str(prof_tar_done), str(disp_done), str(launch_attempted), str(browser_done)))
                self.kill_browser_manager()
                # Remove only the profile created by this attempt; if the
                # timeout hit before the profile message there is nothing new
                # to delete.
                if prof_done and spawned_profile_path is not None:
                    shutil.rmtree(spawned_profile_path, ignore_errors=True)

        # if recovering from a crash, new browser has a new profile dir
        # so the crashed dir and temporary tar dump can be cleaned up
        if success:
            self.logger.debug("BROWSER %i: Browser spawn sucessful!" % self.crawl_id)
            self.current_profile_path = spawned_profile_path
            if tempdir is not None:
                shutil.rmtree(tempdir, ignore_errors=True)
            if crashed_profile_path is not None:
                shutil.rmtree(crashed_profile_path, ignore_errors=True)

            self.is_fresh = crashed_profile_path is None  # browser is fresh iff it starts from a blank profile

        return success