Exemple #1
0
    def serve_build(self):
        """Run one serving cycle of the ``DeployAgent``.

        Sends the current environment reports to the server, applies the
        deploy goal that comes back, and then keeps processing goals for as
        long as the latest response carries an op code other than
        ``OpCode.NOOP``. Stale builds are cleaned once the loop is done.

        The method returns early (without cleaning stale builds) when the
        initial goal cannot be applied or when a deploy step ends in an
        agent failure, too many retries, or a script timeout.
        """
        log.info('The deploy agent is starting.')
        if not self._executor:
            self._executor = Executor(callback=PingServer(self),
                                      config=self._config)

        # first ping: ask the server for the latest deploy goal
        self._response = self._client.send_reports(self._envs)

        if self._response:
            report = self._update_internal_deploy_goal(self._response)
            if report.status_code != AgentStatus.SUCCEEDED:
                # the goal could not be applied locally: tell the server and stop
                self._update_ping_reports(deploy_report=report)
                self._client.send_reports(self._envs)
                return

        # statuses that end the serving cycle immediately
        fatal_statuses = (AgentStatus.AGENT_FAILED,
                          AgentStatus.TOO_MANY_RETRY,
                          AgentStatus.SCRIPT_TIMEOUT)

        # NOTE(review): self._response is not reassigned in this loop; it is
        # presumably refreshed by update_deploy_status / the executor's ping
        # callback -- confirm.
        while (self._response and self._response.opCode
               and self._response.opCode != OpCode.NOOP):
            try:
                if not self._response.deployGoal:
                    log.info('No new deploy goal to get updated')
                    deploy_report = DeployReport(AgentStatus.SUCCEEDED)
                else:
                    # work on the current deploy goal
                    deploy_report = self.process_deploy(self._response)

                if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                    # re-read the response: it may have changed while deploying
                    new_goal = self._response.deployGoal
                    log.info('switch to the new deploy goal: {}'.format(new_goal))
                    continue

            except Exception:
                # whatever escapes up to here is reported as an agent failure
                deploy_report = DeployReport(
                    status_code=AgentStatus.AGENT_FAILED,
                    error_code=1,
                    output_msg=traceback.format_exc(),
                    retry_times=1)

            self.update_deploy_status(deploy_report)
            if deploy_report.status_code in fatal_statuses:
                log.error('Unexpeted exceptions: {}, error message {}'.format(
                    deploy_report.status_code, deploy_report.output_msg))
                return

        self.clean_stale_builds()
        if self._response and self._response.deployGoal:
            self._update_internal_deploy_goal(self._response)

        if not self._response:
            log.info('Failed to get response from server, exit.')
        else:
            log.info('Complete the current deploy with response: {}.'.format(
                self._response))
Exemple #2
0
    def _update_internal_deploy_goal(self, response):
        """Apply the deploy goal carried by ``response`` to the agent state.

        Creates (if needed) and updates the ``DeployStatus`` entry keyed by
        the goal's ``envName``, writes the goal's script variables to
        ``<agent_directory>/<envName>_SCRIPT_CONFIG`` as ``key=value`` lines,
        and pushes the resulting status into the config and the executor.

        :param response: ping response; only ``deployGoal`` is read here.
        :return: a ``DeployReport`` with ``AgentStatus.SUCCEEDED``. Failures
            (for example while writing the script-variable file) are not
            caught here and propagate to the caller as exceptions.
        """
        deploy_goal = response.deployGoal
        if not deploy_goal:
            log.info('No deploy goal to be updated.')
            return DeployReport(status_code=AgentStatus.SUCCEEDED)

        # use envName as status map key
        env_name = deploy_goal.envName
        # The env map may be missing entirely; create it before indexing,
        # otherwise the assignment below would subscript None.
        if self._envs is None:
            self._envs = {}
        if self._envs.get(env_name) is None:
            self._envs[env_name] = DeployStatus()

        # update deploy_status from response for the environment
        self._envs[env_name].update_by_response(response)

        # update script variables
        if deploy_goal.scriptVariables:
            log.info('Start to generate script variables for deploy: {}'.
                     format(deploy_goal.deployId))
            env_dir = self._config.get_agent_directory()
            working_dir = os.path.join(env_dir, "{}_SCRIPT_CONFIG".format(env_name))
            # "w+" truncates the file, so stale variables never survive
            with open(working_dir, "w+") as f:
                for key, value in deploy_goal.scriptVariables.items():
                    f.write("{}={}\n".format(key, value))

        # load deploy goal to the config
        self._curr_report = self._envs[env_name]
        self._config.update_variables(self._curr_report)
        self._executor.update_configs(self._config)
        log.info('current deploy goal is: {}'.format(deploy_goal))
        return DeployReport(status_code=AgentStatus.SUCCEEDED)
Exemple #3
0
    def process_deploy(self, response):
        """Carry out the operation requested by a ping ``response``.

        For ``OpCode.TERMINATE`` / ``OpCode.DELETE`` the environment is
        removed from the in-memory status map and, if it is the one currently
        being reported on, the current report is cleared. For any other op
        code the goal's deploy stage is executed through the executor.

        :param response: ping response; ``opCode`` and ``deployGoal`` are read.
        :return: the ``DeployReport`` describing the outcome.
        """
        op_code = response.opCode
        deploy_goal = response.deployGoal
        if op_code == OpCode.TERMINATE or op_code == OpCode.DELETE:
            # NOTE(review): _resolve_deleted_env_name is defined elsewhere;
            # presumably it maps the goal to the key used in self._envs.
            envName = self._resolve_deleted_env_name(deploy_goal.envName, deploy_goal.envId)
            if envName in self._envs:
                del self._envs[envName]
            else:
                log.info('Cannot find env {} in the ping report'.format(envName))

            # _curr_report is None once it has been cleared (or before any
            # goal has been loaded); guard so that a repeated delete does not
            # raise AttributeError.
            if self._curr_report and \
                    self._curr_report.report.envName == deploy_goal.envName:
                self._curr_report = None

            return DeployReport(AgentStatus.SUCCEEDED, retry_times=1)
        else:
            curr_stage = deploy_goal.deployStage
            # DOWNLOADING and STAGING are two reserved deploy stages owned by agent:
            # DOWNLOADING: download the tarball from pinrepo
            # STAGING: In this step, deploy agent will chmod and change the symlink
            #   pointing to new service code, and etc.
            if curr_stage == DeployStage.DOWNLOADING:
                return self._executor.run_cmd(self.get_download_script(deploy_goal=deploy_goal))
            elif curr_stage == DeployStage.STAGING:
                log.info("set up symbolink for the package: {}".format(deploy_goal.deployId))
                return self._executor.run_cmd(self.get_staging_script())
            else:
                # every other stage maps to a user-provided script
                return self._executor.execute_command(curr_stage)
Exemple #4
0
    def test_report_with_deploy_goal(self):
        """A DEPLOY ping followed by a NOOP leaves exactly one tracked env.

        The server mock answers the first ping with a deploy goal and the
        second with a NOOP; afterwards the agent's current report must
        reflect the goal's env id and deploy stage.
        """
        status_file = '/tmp/env_status'
        if os.path.exists(status_file):
            os.remove(status_file)

        script_variables = {'id': '123', 'url': 'https://test'}
        deploy_goal = {
            'deployId': '123',
            'envName': '456',
            'envId': '789',
            'stageName': 'beta',
            'deployStage': DeployStage.PRE_DOWNLOAD,
            'scriptVariables': script_variables,
        }
        ping_response = {'deployGoal': deploy_goal, 'opCode': OpCode.DEPLOY}

        # first ping returns the goal, second ping ends the serving loop
        client = mock.Mock()
        client.send_reports = mock.Mock(side_effect=[
            PingResponse(jsonValue=ping_response),
            PingResponse(jsonValue=self.ping_noop_response),
        ])

        agent = DeployAgent(client=client,
                            estatus=self.estatus,
                            conf=self.config,
                            executor=self.executor,
                            helper=self.helper)
        succeeded = DeployReport(AgentStatus.SUCCEEDED)
        agent.process_deploy = mock.Mock(return_value=succeeded)
        agent.serve_build()

        current = agent._curr_report.report
        self.assertEqual(current.envId, '789')
        self.assertEqual(current.deployStage, DeployStage.PRE_DOWNLOAD)
        self.assertEqual(len(agent._envs), 1)
Exemple #5
0
    def execute_command(self, script):
        """Run the named script from the configured script directory.

        :param script: script file name, resolved against
            ``self._config.get_script_directory()``.
        :return: a ``DeployReport``. A missing script directory or a missing
            script is reported as success or as agent failure depending on
            the ``DEPLOY_STEP`` environment variable; any exception raised
            along the way is turned into an agent-failure report.
        """
        try:
            deploy_step = os.getenv('DEPLOY_STEP')
            if not os.path.exists(self._config.get_script_directory()):
                # If the teletraan directory does not exist in the pre stage
                # steps, it means it's a newly added host (never deployed
                # before): warn and report success. Otherwise treat it as an
                # agent failure (nothing to execute).
                error_msg = ("teletraan directory cannot be found "
                             "in the tar ball in step {}!").format(deploy_step)
                if deploy_step not in PRE_STAGE_STEPS:
                    log.error(error_msg)
                    return DeployReport(status_code=AgentStatus.AGENT_FAILED,
                                        error_code=1,
                                        retry_times=1,
                                        output_msg=error_msg)
                log.warning(error_msg)
                return DeployReport(status_code=AgentStatus.SUCCEEDED)

            # from here on `script` is the full path (also used by the
            # failure log in the except clause below)
            script = os.path.join(self._config.get_script_directory(), script)
            if not os.path.exists(script):
                if deploy_step != 'RESTARTING':
                    log.info('script: {} does not exist.'.format(script))
                    return DeployReport(status_code=AgentStatus.SUCCEEDED)
                # RESTARTING script is required
                error_msg = 'RESTARTING script does not exist.'
                log.error(error_msg)
                return DeployReport(status_code=AgentStatus.AGENT_FAILED,
                                    error_code=1,
                                    retry_times=1,
                                    output_msg=error_msg)

            os.chdir(self._config.get_script_directory())
            # make sure the owner is allowed to execute the script
            current_mode = os.stat(script).st_mode
            os.chmod(script, current_mode | stat.S_IXUSR)
            return self.run_cmd(script)
        except Exception as e:
            error_msg = str(e)
            log.error('Failed to execute command: {}. Reason: {}'.format(
                script, error_msg))
            log.error(traceback.format_exc())
            return DeployReport(status_code=AgentStatus.AGENT_FAILED,
                                error_code=1,
                                output_msg=error_msg)
Exemple #6
0
    def _update_ping_reports(self, deploy_report):
        """Fold ``deploy_report`` into the current report and persist all envs.

        :param deploy_report: the ``DeployReport`` to merge into
            ``self._curr_report`` (skipped when there is no current report).
        """
        if self._curr_report:
            self._curr_report.update_by_deploy_report(deploy_report)

        dumped = self._env_status.dump_envs(self._envs)
        if dumped or not self._curr_report:
            return

        # Dumping the status to disk failed. Flag the current report as an
        # agent failure so the server is notified and can tell the agent to
        # abort the current deploy.
        dump_failure = DeployReport(status_code=AgentStatus.AGENT_FAILED,
                                    error_code=1,
                                    output_msg='Failed to dump status to the disk')
        self._curr_report.update_by_deploy_report(dump_failure)
Exemple #7
0
    def run_cmd(self, cmd, **kw):
        """Run ``cmd`` as a subprocess with retries and return a ``DeployReport``.

        The child's stdout and stderr are appended to ``self.LOG_FILENAME``
        and the child is started in its own session (``os.setsid``). While it
        runs, the server is pinged via ``ping_server_if_possible``.

        :param cmd: command as a list of arguments, or a string that is split
            on single spaces.
        :param kw: extra keyword arguments forwarded to ``subprocess.Popen``.
        :return: the report, whose ``status_code`` is one of:
            ``SUCCEEDED`` (exit code 0), ``ABORTED_BY_SERVER`` (reported by
            the ping), ``SCRIPT_TIMEOUT`` (one attempt ran for
            ``MAX_RUNNING_TIME`` seconds or more) or ``TOO_MANY_RETRY``
            (``MAX_RETRY`` failed attempts).
        """
        if not isinstance(cmd, list):
            cmd = cmd.split(' ')
        cmd_str = ' '.join(cmd)
        log.info('Running: {} with {} retries.'.format(cmd_str, self.MAX_RETRY))

        deploy_report = DeployReport(status_code=AgentStatus.UNKNOWN,
                                     error_code=0,
                                     retry_times=0)
        process_interval = self.PROCESS_POLL_INTERVAL
        # `start` is threaded through ping_server_if_possible;
        # `init_start` marks when the current attempt began (timeout basis).
        start = datetime.datetime.now()
        init_start = datetime.datetime.now()
        total_retry = 0

        with open(self.LOG_FILENAME, 'a+') as fdout:
            while total_retry < self.MAX_RETRY:
                try:
                    # remember where this attempt's output starts in the log
                    fdout.seek(0, 2)
                    file_pos = fdout.tell()
                    process = subprocess.Popen(cmd, stdout=fdout, stderr=fdout,
                                               preexec_fn=os.setsid, **kw)
                    while process.poll() is None:
                        start, deploy_report = \
                            self.ping_server_if_possible(start, cmd, deploy_report)
                        # terminate case 1:
                        # the server changed the deploy goal, return to the agent to
                        # handle next deploy step
                        if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                            Executor._kill_process(process)
                            return deploy_report

                        # terminate case 2:
                        # the script gets timeout error, return to the agent to report
                        # to the server.
                        # total_seconds() is used because timedelta.seconds drops
                        # whole days and would wrap for very long limits.
                        elapsed = (datetime.datetime.now() - init_start).total_seconds()
                        if elapsed >= self.MAX_RUNNING_TIME:
                            Executor._kill_process(process)
                            # the best way to get output is to tail the log
                            deploy_report.output_msg = self.get_subprocess_output(fd=fdout,
                                                                                  file_pos=file_pos)
                            log.info("Exceed max running time: {}.".format(self.MAX_RUNNING_TIME))
                            log.info("Output from subprocess: {}".format(deploy_report.output_msg))
                            deploy_report.status_code = AgentStatus.SCRIPT_TIMEOUT
                            deploy_report.error_code = 1
                            return deploy_report

                        # sleep some seconds before next poll
                        sleep_time = self._get_sleep_interval(start, self.PROCESS_POLL_INTERVAL)
                        time.sleep(sleep_time)

                    # finish executing sub process
                    deploy_report.error_code = process.returncode
                    deploy_report.output_msg = self.get_subprocess_output(fd=fdout,
                                                                          file_pos=file_pos)
                    if process.returncode == 0:
                        log.info('Running: {} succeeded.'.format(cmd_str))
                        deploy_report.status_code = AgentStatus.SUCCEEDED
                        return deploy_report
                except Exception:
                    # NOTE(review): if the exception came from the polling loop
                    # the child may still be running when the next attempt starts.
                    error_msg = traceback.format_exc()
                    deploy_report.error_code = 1
                    deploy_report.output_msg = error_msg
                    log.error(error_msg)

                # fails when:
                # subprocess execution fails
                # popen throws
                deploy_report.status_code = AgentStatus.SCRIPT_FAILED
                deploy_report.retry_times += 1
                total_retry += 1

                # Terminate case 3:
                # Too many failed retries, return to the agent and report to the server.
                if total_retry >= self.MAX_RETRY:
                    deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
                    return deploy_report

                init_start = datetime.datetime.now()  # reset the initial start time

                log.info('Failed: {}, at {} retry. Error:\n{}'.format(cmd_str,
                                                                      deploy_report.retry_times,
                                                                      deploy_report.output_msg))
                sleep_time = self._get_sleep_interval(start, process_interval)
                time.sleep(sleep_time)
                start, deploy_report = self.ping_server_if_possible(start, cmd, deploy_report)
                if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                    return deploy_report

                # sleep the rest of the time
                if process_interval - sleep_time > 0:
                    time.sleep(process_interval - sleep_time)
                # exponential backoff
                process_interval = min(process_interval * self.BACK_OFF, self.MAX_SLEEP_INTERVAL)

        # only reached when the retry loop never ran (MAX_RETRY <= 0)
        deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
        return deploy_report
Exemple #8
0
    def run_cmd(self, cmd, **kw):
        """Execute ``cmd`` in a child process, retrying failed attempts.

        Output of the child (stdout and stderr) is appended to
        ``self.LOG_FILENAME``; the child runs in its own session
        (``os.setsid``). The server is pinged through
        ``ping_server_if_possible`` while the child is alive and between
        retries.

        :param cmd: argument list, or a string split on single spaces.
        :param kw: additional keyword arguments passed to ``subprocess.Popen``.
        :return: a ``DeployReport`` whose ``status_code`` is ``SUCCEEDED``
            (exit code 0), ``ABORTED_BY_SERVER`` (reported by the ping),
            ``SCRIPT_TIMEOUT`` (an attempt lasted ``MAX_RUNNING_TIME``
            seconds or more) or ``TOO_MANY_RETRY`` (``MAX_RETRY`` failures).
        """
        if not isinstance(cmd, list):
            cmd = cmd.split(' ')
        cmd_str = ' '.join(cmd)
        log.info('Running: {} with {} retries.'.format(cmd_str,
                                                       self.MAX_RETRY))

        deploy_report = DeployReport(status_code=AgentStatus.UNKNOWN,
                                     error_code=0,
                                     retry_times=0)
        process_interval = self.PROCESS_POLL_INTERVAL
        # `start` is handed to and returned by ping_server_if_possible;
        # `init_start` is the beginning of the current attempt.
        start = datetime.datetime.now()
        init_start = datetime.datetime.now()
        total_retry = 0

        with open(self.LOG_FILENAME, 'a+') as fdout:
            while total_retry < self.MAX_RETRY:
                try:
                    # record the log offset so this attempt's output can be read back
                    fdout.seek(0, 2)
                    file_pos = fdout.tell()
                    process = subprocess.Popen(cmd,
                                               stdout=fdout,
                                               stderr=fdout,
                                               preexec_fn=os.setsid,
                                               **kw)
                    while process.poll() is None:
                        start, deploy_report = \
                            self.ping_server_if_possible(start, cmd, deploy_report)
                        # terminate case 1:
                        # the server changed the deploy goal, return to the agent to
                        # handle next deploy step
                        if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                            Executor._kill_process(process)
                            return deploy_report
                        # terminate case 2:
                        # the script gets timeout error, return to the agent to report
                        # to the server.
                        # timedelta.seconds ignores the days component, so the full
                        # duration is taken from total_seconds() instead.
                        running_for = (datetime.datetime.now() -
                                       init_start).total_seconds()
                        if running_for >= self.MAX_RUNNING_TIME:
                            Executor._kill_process(process)
                            # the best way to get output is to tail the log
                            deploy_report.output_msg = self.get_subprocess_output(
                                fd=fdout, file_pos=file_pos)
                            log.info("Exceed max running time: {}.".format(
                                self.MAX_RUNNING_TIME))
                            log.info("Output from subprocess: {}".format(
                                deploy_report.output_msg))
                            deploy_report.status_code = AgentStatus.SCRIPT_TIMEOUT
                            deploy_report.error_code = 1
                            return deploy_report

                        # sleep some seconds before next poll
                        sleep_time = self._get_sleep_interval(
                            start, self.PROCESS_POLL_INTERVAL)
                        time.sleep(sleep_time)

                    # finish executing sub process
                    deploy_report.error_code = process.returncode
                    deploy_report.output_msg = self.get_subprocess_output(
                        fd=fdout, file_pos=file_pos)
                    if process.returncode == 0:
                        log.info('Running: {} succeeded.'.format(cmd_str))
                        deploy_report.status_code = AgentStatus.SUCCEEDED
                        return deploy_report
                except Exception:
                    # NOTE(review): an exception raised while polling leaves the
                    # child unattended; the next attempt starts a new one.
                    error_msg = traceback.format_exc()
                    deploy_report.error_code = 1
                    deploy_report.output_msg = error_msg
                    log.error(error_msg)

                # fails when:
                # subprocess execution fails
                # popen throws
                deploy_report.status_code = AgentStatus.SCRIPT_FAILED
                deploy_report.retry_times += 1
                total_retry += 1
                # Terminate case 3:
                # Too many failed retries, return to the agent and report to the server.
                if total_retry >= self.MAX_RETRY:
                    deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
                    return deploy_report

                # reset the initial start time
                init_start = datetime.datetime.now()

                log.info('Failed: {}, at {} retry. Error:\n{}'.format(
                    cmd_str, deploy_report.retry_times,
                    deploy_report.output_msg))
                sleep_time = self._get_sleep_interval(start, process_interval)
                time.sleep(sleep_time)
                start, deploy_report = self.ping_server_if_possible(
                    start, cmd, deploy_report)
                if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                    return deploy_report

                # sleep the rest of the time
                if process_interval - sleep_time > 0:
                    time.sleep(process_interval - sleep_time)
                # exponential backoff
                process_interval = min(process_interval * self.BACK_OFF,
                                       self.MAX_SLEEP_INTERVAL)

        # only reached when the retry loop never ran (MAX_RETRY <= 0)
        deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
        return deploy_report
Exemple #9
0
    def setUpClass(cls):
        """Build the mocks and canned ping responses shared by the tests.

        Sets up mocked env-status, config, executor and helper objects, six
        deploy goals covering different deploy stages, and the ping-response
        dictionaries (DEPLOY, DELETE and NOOP) that wrap them.
        """
        # NOTE(review): unittest calls setUpClass on the class, so this must be
        # decorated with @classmethod; the decorator is not visible in this
        # excerpt -- confirm it is present.
        cls.estatus = mock.Mock()
        cls.estatus.load_envs = mock.Mock(return_value=None)
        cls.config = mock.Mock()
        cls.config.load_env_and_configs = mock.Mock()
        cls.config.get_var = mock.Mock(return_value='')
        # Configure the stub (previously the mock was merely *called* with
        # return_value=1, which configured nothing).
        cls.config.get_intvar = mock.Mock(return_value=1)
        cls.config.get_target = mock.Mock(return_value='/tmp/tests')
        cls.config.get_config_filename = mock.Mock(return_value='/etc/deployagent.conf')
        cls.config.get_agent_directory = mock.Mock(return_value='/tmp/deployd/')
        cls.config.get_builds_directory = mock.Mock(return_value='/tmp/deployd/builds/')
        cls.config.get_log_directory = mock.Mock(return_value='/tmp/logs/')
        ensure_dirs(cls.config)
        cls.executor = mock.Mock()
        cls.executor.execute_command = \
            mock.Mock(return_value=(DeployReport(AgentStatus.SUCCEEDED)))
        cls.executor.run_cmd = mock.Mock(return_value=(DeployReport(AgentStatus.SUCCEEDED)))
        cls.helper = mock.Mock()
        cls.helper.get_stale_builds = mock.Mock(return_value=[])

        build = {}
        build['id'] = '123'
        build['name'] = 'abc'
        build['commitShort'] = '345'
        build['artifactUrl'] = 'https://test'

        envvar = {}
        envvar['id'] = 'abc'
        envvar['url'] = 'https://test'

        # goal 1: PRE_DOWNLOAD with script variables
        cls.deploy_goal1 = {}
        cls.deploy_goal1['deployId'] = '123'
        cls.deploy_goal1['envName'] = 'abc'
        cls.deploy_goal1['envId'] = 'def'
        cls.deploy_goal1['stageName'] = 'beta'
        cls.deploy_goal1['deployStage'] = DeployStage.PRE_DOWNLOAD
        cls.deploy_goal1['scriptVariables'] = envvar

        # goal 2: DOWNLOADING with a build
        cls.deploy_goal2 = {}
        cls.deploy_goal2['deployId'] = '123'
        cls.deploy_goal2['envName'] = 'abc'
        cls.deploy_goal2['envId'] = 'def'
        cls.deploy_goal2['stageName'] = 'beta'
        cls.deploy_goal2['deployStage'] = DeployStage.DOWNLOADING
        cls.deploy_goal2['build'] = build

        # goal 3: STAGING
        cls.deploy_goal3 = {}
        cls.deploy_goal3['deployId'] = '123'
        cls.deploy_goal3['envName'] = 'abc'
        cls.deploy_goal3['envId'] = 'def'
        cls.deploy_goal3['stageName'] = 'beta'
        cls.deploy_goal3['deployStage'] = DeployStage.STAGING

        # goal 4: PRE_RESTART
        cls.deploy_goal4 = {}
        cls.deploy_goal4['deployId'] = '123'
        cls.deploy_goal4['envName'] = 'abc'
        cls.deploy_goal4['envId'] = 'def'
        cls.deploy_goal4['stageName'] = 'beta'
        cls.deploy_goal4['deployStage'] = DeployStage.PRE_RESTART

        # goal 5: a different deploy ('234') at PRE_DOWNLOAD with a build
        cls.deploy_goal5 = {}
        cls.deploy_goal5['deployId'] = '234'
        cls.deploy_goal5['envName'] = 'abc'
        cls.deploy_goal5['envId'] = 'def'
        cls.deploy_goal5['stageName'] = 'beta'
        cls.deploy_goal5['deployStage'] = DeployStage.PRE_DOWNLOAD
        cls.deploy_goal5['build'] = build

        # goal 6: deploy '234' at SERVING_BUILD
        cls.deploy_goal6 = {}
        cls.deploy_goal6['deployId'] = '234'
        cls.deploy_goal6['envName'] = 'abc'
        cls.deploy_goal6['envId'] = 'def'
        cls.deploy_goal6['stageName'] = 'beta'
        cls.deploy_goal6['deployStage'] = DeployStage.SERVING_BUILD

        cls.ping_response1 = {'deployGoal': cls.deploy_goal1, 'opCode': OpCode.DEPLOY}
        cls.ping_response2 = {'deployGoal': cls.deploy_goal2, 'opCode': OpCode.DEPLOY}
        cls.ping_response3 = {'deployGoal': cls.deploy_goal3, 'opCode': OpCode.DEPLOY}
        cls.ping_response4 = {'deployGoal': cls.deploy_goal4, 'opCode': OpCode.DEPLOY}
        cls.ping_response5 = {'deployGoal': cls.deploy_goal5, 'opCode': OpCode.DELETE}
        cls.ping_response6 = {'deployGoal': cls.deploy_goal6, 'opCode': OpCode.DELETE}
        cls.ping_noop_response = {'deployGoal': None, 'opCode': OpCode.NOOP}