def serve_build(self):
    """Main entry point of the ``DeployAgent``.

    Pings the server once to learn the current deploy goal, then keeps
    processing goals while the latest response carries an op code other
    than ``OpCode.NOOP``. Returns early (without cleaning stale builds)
    when the initial goal cannot be recorded or when a step ends in
    ``AGENT_FAILED``, ``TOO_MANY_RETRY`` or ``SCRIPT_TIMEOUT``.
    """
    log.info('The deploy agent is starting.')
    if not self._executor:
        self._executor = Executor(callback=PingServer(self), config=self._config)

    # The first ping tells us which deploy goal (if any) to work on.
    self._response = self._client.send_reports(self._envs)
    if self._response:
        goal_report = self._update_internal_deploy_goal(self._response)
        if goal_report.status_code != AgentStatus.SUCCEEDED:
            # Recording the goal failed: push the failure to the server and stop.
            self._update_ping_reports(deploy_report=goal_report)
            self._client.send_reports(self._envs)
            return

    # NOTE(review): the loop relies on self._response being refreshed elsewhere
    # (presumably by update_deploy_status / the executor's ping callback) -- confirm.
    while (self._response
           and self._response.opCode
           and self._response.opCode != OpCode.NOOP):
        try:
            if not self._response.deployGoal:
                log.info('No new deploy goal to get updated')
                outcome = DeployReport(AgentStatus.SUCCEEDED)
            else:
                outcome = self.process_deploy(self._response)

            if outcome.status_code == AgentStatus.ABORTED_BY_SERVER:
                # Skip the status update and re-evaluate the loop condition
                # against whatever self._response holds now.
                log.info('switch to the new deploy goal: {}'.format(
                    self._response.deployGoal))
                continue
        except Exception:
            # Anything that escapes a deploy step is reported as an agent failure.
            outcome = DeployReport(status_code=AgentStatus.AGENT_FAILED,
                                   error_code=1,
                                   output_msg=traceback.format_exc(),
                                   retry_times=1)

        self.update_deploy_status(outcome)
        if outcome.status_code in (AgentStatus.AGENT_FAILED,
                                   AgentStatus.TOO_MANY_RETRY,
                                   AgentStatus.SCRIPT_TIMEOUT):
            log.error('Unexpeted exceptions: {}, error message {}'.format(
                outcome.status_code, outcome.output_msg))
            return

    self.clean_stale_builds()
    if self._response and self._response.deployGoal:
        self._update_internal_deploy_goal(self._response)

    if not self._response:
        log.info('Failed to get response from server, exit.')
    else:
        log.info('Complete the current deploy with response: {}.'.format(
            self._response))
def _update_internal_deploy_goal(self, response):
    """Record the deploy goal carried by ``response`` as the current one.

    Creates (if needed) and updates the per-environment status entry keyed
    by the goal's ``envName``, writes the goal's script variables to
    ``<agent_directory>/<envName>_SCRIPT_CONFIG`` as ``key=value`` lines,
    and loads the resulting status into the config and the executor.

    :param response: ping response; only ``response.deployGoal`` is read
        here, the whole object is handed to ``update_by_response``.
    :return: ``DeployReport`` with status ``AgentStatus.SUCCEEDED`` (also
        when the response carries no deploy goal).
    :raises: whatever ``open``/``write`` raise while writing the script
        variables file; nothing is caught here.
    """
    deploy_goal = response.deployGoal
    if not deploy_goal:
        log.info('No deploy goal to be updated.')
        return DeployReport(status_code=AgentStatus.SUCCEEDED)

    # use envName as status map key
    env_name = deploy_goal.envName
    if self._envs is None:
        # The status map may not exist yet; create it so the item
        # assignment below cannot fail with a TypeError on None.
        self._envs = {}
    if self._envs.get(env_name) is None:
        self._envs[env_name] = DeployStatus()

    # update deploy_status from response for the environment
    self._envs[env_name].update_by_response(response)

    # update script variables
    if deploy_goal.scriptVariables:
        log.info('Start to generate script variables for deploy: {}'.format(
            deploy_goal.deployId))
        env_dir = self._config.get_agent_directory()
        script_config_path = os.path.join(env_dir, "{}_SCRIPT_CONFIG".format(env_name))
        with open(script_config_path, "w+") as f:
            for key, value in deploy_goal.scriptVariables.items():
                f.write("{}={}\n".format(key, value))

    # load deploy goal to the config
    self._curr_report = self._envs[env_name]
    self._config.update_variables(self._curr_report)
    self._executor.update_configs(self._config)
    log.info('current deploy goal is: {}'.format(deploy_goal))
    return DeployReport(status_code=AgentStatus.SUCCEEDED)
def process_deploy(self, response):
    """Carry out the operation requested by a ping response.

    For ``OpCode.TERMINATE`` / ``OpCode.DELETE`` the environment is removed
    from the local status map and, if it is the one currently being
    deployed, the current report is cleared. For any other op code the
    goal's deploy stage is executed through the executor.

    :param response: ping response providing ``opCode`` and ``deployGoal``.
    :return: ``DeployReport`` -- ``SUCCEEDED`` with ``retry_times=1`` for a
        terminate/delete, otherwise whatever the executor returns.
    """
    op_code = response.opCode
    deploy_goal = response.deployGoal

    if op_code in (OpCode.TERMINATE, OpCode.DELETE):
        env_name = self._resolve_deleted_env_name(deploy_goal.envName, deploy_goal.envId)
        if env_name in self._envs:
            del self._envs[env_name]
        else:
            log.info('Cannot find env {} in the ping report'.format(env_name))

        # _curr_report can already be None (this method itself clears it),
        # so guard before dereferencing it.
        if self._curr_report and self._curr_report.report.envName == deploy_goal.envName:
            self._curr_report = None

        return DeployReport(AgentStatus.SUCCEEDED, retry_times=1)

    curr_stage = deploy_goal.deployStage
    # DOWNLOADING and STAGING are two reserved deploy stages owned by agent:
    #   DOWNLOADING: download the tarball from pinrepo
    #   STAGING: In this step, deploy agent will chmod and change the symlink
    #            pointing to new service code, and etc.
    if curr_stage == DeployStage.DOWNLOADING:
        return self._executor.run_cmd(self.get_download_script(deploy_goal=deploy_goal))
    if curr_stage == DeployStage.STAGING:
        log.info("set up symbolink for the package: {}".format(deploy_goal.deployId))
        return self._executor.run_cmd(self.get_staging_script())
    # Every other stage is handed to the executor as a script name.
    return self._executor.execute_command(curr_stage)
def test_report_with_deploy_goal(self):
    """serve_build() should adopt the deploy goal delivered by the first ping.

    The mocked client answers the first ping with a DEPLOY goal and the
    second with the NOOP response; ``process_deploy`` is stubbed to succeed.
    Afterwards the agent's current report must reflect the goal and exactly
    one environment must be tracked.
    """
    # Start from a clean slate: drop any status file left by an earlier run.
    status_file = '/tmp/env_status'
    if os.path.exists(status_file):
        os.remove(status_file)

    script_variables = {'id': '123', 'url': 'https://test'}
    goal = {
        'deployId': '123',
        'envName': '456',
        'envId': '789',
        'stageName': 'beta',
        'deployStage': DeployStage.PRE_DOWNLOAD,
        'scriptVariables': script_variables,
    }
    deploy_response = {'deployGoal': goal, 'opCode': OpCode.DEPLOY}

    client = mock.Mock()
    client.send_reports = mock.Mock(side_effect=[
        PingResponse(jsonValue=deploy_response),
        PingResponse(jsonValue=self.ping_noop_response),
    ])

    agent = DeployAgent(client=client,
                        estatus=self.estatus,
                        conf=self.config,
                        executor=self.executor,
                        helper=self.helper)
    agent.process_deploy = mock.Mock(return_value=DeployReport(AgentStatus.SUCCEEDED))
    agent.serve_build()

    self.assertEqual(agent._curr_report.report.envId, '789')
    self.assertEqual(agent._curr_report.report.deployStage, DeployStage.PRE_DOWNLOAD)
    self.assertEqual(len(agent._envs), 1)
def execute_command(self, script):
    """Run a deploy-stage script from the configured script directory.

    The current step is read from the ``DEPLOY_STEP`` environment variable.
    A missing script directory is tolerated (``SUCCEEDED``) only for steps in
    ``PRE_STAGE_STEPS``; a missing script is tolerated for every step except
    ``RESTARTING``. Otherwise the script is made owner-executable and passed
    to ``run_cmd``.

    :param script: script file name, relative to the script directory.
    :return: ``DeployReport``; any exception raised here is converted into
        an ``AGENT_FAILED`` report instead of propagating.
    """
    try:
        deploy_step = os.getenv('DEPLOY_STEP')

        if not os.path.exists(self._config.get_script_directory()):
            # If the teletraan directory does not exist in the pre stage
            # steps, it means it's a newly added host (never deployed
            # before): show a warning message and exit. Otherwise we treat
            # it as an agent failure (nothing to execute).
            message = ("teletraan directory cannot be found "
                       "in the tar ball in step {}!".format(deploy_step))
            if deploy_step not in PRE_STAGE_STEPS:
                log.error(message)
                return DeployReport(status_code=AgentStatus.AGENT_FAILED,
                                    error_code=1,
                                    retry_times=1,
                                    output_msg=message)
            log.warning(message)
            return DeployReport(status_code=AgentStatus.SUCCEEDED)

        # From here on ``script`` is the full path; the except handler
        # below logs whichever form it has when the failure happens.
        script = os.path.join(self._config.get_script_directory(), script)

        if os.path.exists(script):
            os.chdir(self._config.get_script_directory())
            # Add the owner-execute bit before running the script.
            current_mode = os.stat(script).st_mode
            os.chmod(script, current_mode | stat.S_IXUSR)
            return self.run_cmd(script)

        if deploy_step != 'RESTARTING':
            log.info('script: {} does not exist.'.format(script))
            return DeployReport(status_code=AgentStatus.SUCCEEDED)

        # RESTARTING script is required
        message = 'RESTARTING script does not exist.'
        log.error(message)
        return DeployReport(status_code=AgentStatus.AGENT_FAILED,
                            error_code=1,
                            retry_times=1,
                            output_msg=message)
    except Exception as exc:
        reason = str(exc)
        log.error('Failed to execute command: {}. Reason: {}'.format(script, reason))
        log.error(traceback.format_exc())
        return DeployReport(status_code=AgentStatus.AGENT_FAILED,
                            error_code=1,
                            output_msg=str(exc))
def _update_ping_reports(self, deploy_report):
    """Fold ``deploy_report`` into the current report and persist all envs.

    :param deploy_report: result of the latest deploy step; applied to the
        current report when there is one.
    """
    if self._curr_report:
        self._curr_report.update_by_deploy_report(deploy_report)

    dumped = self._env_status.dump_envs(self._envs)
    if dumped or not self._curr_report:
        return

    # If we failed to dump the status to the disk, we should notify the
    # server as agent failure. We set the current report to be agent
    # failure, so server would tell agent to abort current deploy, then exit.
    dump_failure = DeployReport(status_code=AgentStatus.AGENT_FAILED,
                                error_code=1,
                                output_msg='Failed to dump status to the disk')
    self._curr_report.update_by_deploy_report(dump_failure)
def run_cmd(self, cmd, **kw):
    """Run ``cmd`` as a subprocess with retries and return a ``DeployReport``.

    The child's stdout/stderr are appended to ``self.LOG_FILENAME``. While
    the child runs, the server is pinged via ``ping_server_if_possible``.
    The method returns as soon as one of these happens:

    * the command exits with code 0 -> ``SUCCEEDED``;
    * the ping reports ``ABORTED_BY_SERVER`` -> child is killed, report
      returned as is;
    * one attempt runs for ``MAX_RUNNING_TIME`` seconds or more -> child is
      killed, ``SCRIPT_TIMEOUT``;
    * ``MAX_RETRY`` attempts failed -> ``TOO_MANY_RETRY``.

    :param cmd: command as a list of arguments, or a string that is split
        on single spaces.
    :param kw: extra keyword arguments forwarded to ``subprocess.Popen``.
    :return: ``DeployReport`` describing the final outcome.
    """
    if not isinstance(cmd, list):
        cmd = cmd.split(' ')
    cmd_str = ' '.join(cmd)
    log.info('Running: {} with {} retries.'.format(cmd_str, self.MAX_RETRY))

    deploy_report = DeployReport(status_code=AgentStatus.UNKNOWN,
                                 error_code=0,
                                 retry_times=0)
    process_interval = self.PROCESS_POLL_INTERVAL
    start = datetime.datetime.now()
    init_start = datetime.datetime.now()
    total_retry = 0

    with open(self.LOG_FILENAME, 'a+') as fdout:
        while total_retry < self.MAX_RETRY:
            try:
                # Remember where this attempt's output starts in the log.
                fdout.seek(0, 2)
                file_pos = fdout.tell()
                # os.setsid puts the child in its own session.
                process = subprocess.Popen(cmd, stdout=fdout, stderr=fdout,
                                           preexec_fn=os.setsid, **kw)
                while process.poll() is None:
                    start, deploy_report = \
                        self.ping_server_if_possible(start, cmd, deploy_report)

                    # Terminate case 1: the server changed the deploy goal,
                    # return to the agent to handle next deploy step.
                    if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                        Executor._kill_process(process)
                        return deploy_report

                    # Terminate case 2: the script gets timeout error,
                    # return to the agent to report to the server.
                    # total_seconds() is used instead of ``.seconds``: the
                    # latter is only the 0..86399 component of the delta, so
                    # a negative delta (clock stepped back) would look like
                    # ~24h elapsed and a limit of a day or more would never
                    # be reached.
                    elapsed = (datetime.datetime.now() - init_start).total_seconds()
                    if elapsed >= self.MAX_RUNNING_TIME:
                        Executor._kill_process(process)
                        # the best way to get output is to tail the log
                        deploy_report.output_msg = self.get_subprocess_output(
                            fd=fdout, file_pos=file_pos)
                        log.info("Exceed max running time: {}.".format(
                            self.MAX_RUNNING_TIME))
                        log.info("Output from subprocess: {}".format(
                            deploy_report.output_msg))
                        deploy_report.status_code = AgentStatus.SCRIPT_TIMEOUT
                        deploy_report.error_code = 1
                        return deploy_report

                    # sleep some seconds before next poll
                    sleep_time = self._get_sleep_interval(
                        start, self.PROCESS_POLL_INTERVAL)
                    time.sleep(sleep_time)

                # finish executing sub process
                deploy_report.error_code = process.returncode
                deploy_report.output_msg = self.get_subprocess_output(
                    fd=fdout, file_pos=file_pos)
                if process.returncode == 0:
                    log.info('Running: {} succeeded.'.format(cmd_str))
                    deploy_report.status_code = AgentStatus.SUCCEEDED
                    return deploy_report
            except Exception:
                error_msg = traceback.format_exc()
                deploy_report.error_code = 1
                deploy_report.output_msg = error_msg
                log.error(error_msg)

            # We get here when the subprocess exited non-zero or when
            # starting/polling it raised.
            deploy_report.status_code = AgentStatus.SCRIPT_FAILED
            deploy_report.retry_times += 1
            total_retry += 1

            # Terminate case 3: too many failed retries, return to the
            # agent and report to the server.
            if total_retry >= self.MAX_RETRY:
                deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
                return deploy_report

            init_start = datetime.datetime.now()  # reset the initial start time

            log.info('Failed: {}, at {} retry. Error:\n{}'.format(
                cmd_str, deploy_report.retry_times, deploy_report.output_msg))
            sleep_time = self._get_sleep_interval(start, process_interval)
            time.sleep(sleep_time)
            start, deploy_report = self.ping_server_if_possible(
                start, cmd, deploy_report)
            if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                return deploy_report

            # sleep the rest of the time
            if process_interval - sleep_time > 0:
                time.sleep(process_interval - sleep_time)
            # exponential backoff
            process_interval = min(process_interval * self.BACK_OFF,
                                   self.MAX_SLEEP_INTERVAL)

    # Only reachable when the retry loop never ran (MAX_RETRY <= 0).
    deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
    return deploy_report
def run_cmd(self, cmd, **kw):
    """Run ``cmd`` as a subprocess with retries and return a ``DeployReport``.

    The child's stdout/stderr are appended to ``self.LOG_FILENAME``. While
    the child runs, the server is pinged via ``ping_server_if_possible``.
    The method returns as soon as one of these happens:

    * the command exits with code 0 -> ``SUCCEEDED``;
    * the ping reports ``ABORTED_BY_SERVER`` -> child is killed, report
      returned as is;
    * one attempt runs for ``MAX_RUNNING_TIME`` seconds or more -> child is
      killed, ``SCRIPT_TIMEOUT``;
    * ``MAX_RETRY`` attempts failed -> ``TOO_MANY_RETRY``.

    :param cmd: command as a list of arguments, or a string that is split
        on single spaces.
    :param kw: extra keyword arguments forwarded to ``subprocess.Popen``.
    :return: ``DeployReport`` describing the final outcome.
    """
    if not isinstance(cmd, list):
        cmd = cmd.split(' ')
    cmd_str = ' '.join(cmd)
    log.info('Running: {} with {} retries.'.format(cmd_str, self.MAX_RETRY))

    deploy_report = DeployReport(status_code=AgentStatus.UNKNOWN,
                                 error_code=0,
                                 retry_times=0)
    process_interval = self.PROCESS_POLL_INTERVAL
    start = datetime.datetime.now()
    init_start = datetime.datetime.now()
    total_retry = 0

    with open(self.LOG_FILENAME, 'a+') as fdout:
        while total_retry < self.MAX_RETRY:
            try:
                # Remember where this attempt's output starts in the log.
                fdout.seek(0, 2)
                file_pos = fdout.tell()
                # os.setsid puts the child in its own session.
                process = subprocess.Popen(cmd, stdout=fdout, stderr=fdout,
                                           preexec_fn=os.setsid, **kw)
                while process.poll() is None:
                    start, deploy_report = \
                        self.ping_server_if_possible(start, cmd, deploy_report)

                    # Terminate case 1: the server changed the deploy goal,
                    # return to the agent to handle next deploy step.
                    if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                        Executor._kill_process(process)
                        return deploy_report

                    # Terminate case 2: the script gets timeout error,
                    # return to the agent to report to the server.
                    # total_seconds() is used instead of ``.seconds``: the
                    # latter is only the 0..86399 component of the delta, so
                    # a negative delta (clock stepped back) would look like
                    # ~24h elapsed and a limit of a day or more would never
                    # be reached.
                    elapsed = (datetime.datetime.now() - init_start).total_seconds()
                    if elapsed >= self.MAX_RUNNING_TIME:
                        Executor._kill_process(process)
                        # the best way to get output is to tail the log
                        deploy_report.output_msg = self.get_subprocess_output(
                            fd=fdout, file_pos=file_pos)
                        log.info("Exceed max running time: {}.".format(
                            self.MAX_RUNNING_TIME))
                        log.info("Output from subprocess: {}".format(
                            deploy_report.output_msg))
                        deploy_report.status_code = AgentStatus.SCRIPT_TIMEOUT
                        deploy_report.error_code = 1
                        return deploy_report

                    # sleep some seconds before next poll
                    sleep_time = self._get_sleep_interval(
                        start, self.PROCESS_POLL_INTERVAL)
                    time.sleep(sleep_time)

                # finish executing sub process
                deploy_report.error_code = process.returncode
                deploy_report.output_msg = self.get_subprocess_output(
                    fd=fdout, file_pos=file_pos)
                if process.returncode == 0:
                    log.info('Running: {} succeeded.'.format(cmd_str))
                    deploy_report.status_code = AgentStatus.SUCCEEDED
                    return deploy_report
            except Exception:
                error_msg = traceback.format_exc()
                deploy_report.error_code = 1
                deploy_report.output_msg = error_msg
                log.error(error_msg)

            # We get here when the subprocess exited non-zero or when
            # starting/polling it raised.
            deploy_report.status_code = AgentStatus.SCRIPT_FAILED
            deploy_report.retry_times += 1
            total_retry += 1

            # Terminate case 3: too many failed retries, return to the
            # agent and report to the server.
            if total_retry >= self.MAX_RETRY:
                deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
                return deploy_report

            init_start = datetime.datetime.now()  # reset the initial start time

            log.info('Failed: {}, at {} retry. Error:\n{}'.format(
                cmd_str, deploy_report.retry_times, deploy_report.output_msg))
            sleep_time = self._get_sleep_interval(start, process_interval)
            time.sleep(sleep_time)
            start, deploy_report = self.ping_server_if_possible(
                start, cmd, deploy_report)
            if deploy_report.status_code == AgentStatus.ABORTED_BY_SERVER:
                return deploy_report

            # sleep the rest of the time
            if process_interval - sleep_time > 0:
                time.sleep(process_interval - sleep_time)
            # exponential backoff
            process_interval = min(process_interval * self.BACK_OFF,
                                   self.MAX_SLEEP_INTERVAL)

    # Only reachable when the retry loop never ran (MAX_RETRY <= 0).
    deploy_report.status_code = AgentStatus.TOO_MANY_RETRY
    return deploy_report
def setUpClass(cls):
    """Create the mocks and canned ping responses shared by the tests.

    Sets up mocked env-status, config, executor and helper objects, six
    deploy-goal dicts covering different deploy stages, and the matching
    ping-response dicts (plus a NOOP response with no goal).
    """
    cls.estatus = mock.Mock()
    cls.estatus.load_envs = mock.Mock(return_value=None)

    cls.config = mock.Mock()
    cls.config.load_env_and_configs = mock.Mock()
    cls.config.get_var = mock.Mock(return_value='')
    # Assign a configured mock; calling get_intvar(return_value=1) would
    # only invoke the auto-created mock and leave its return value a Mock.
    cls.config.get_intvar = mock.Mock(return_value=1)
    cls.config.get_target = mock.Mock(return_value='/tmp/tests')
    cls.config.get_config_filename = mock.Mock(return_value='/etc/deployagent.conf')
    cls.config.get_agent_directory = mock.Mock(return_value='/tmp/deployd/')
    cls.config.get_builds_directory = mock.Mock(return_value='/tmp/deployd/builds/')
    cls.config.get_log_directory = mock.Mock(return_value='/tmp/logs/')
    ensure_dirs(cls.config)

    cls.executor = mock.Mock()
    cls.executor.execute_command = \
        mock.Mock(return_value=(DeployReport(AgentStatus.SUCCEEDED)))
    cls.executor.run_cmd = mock.Mock(return_value=(DeployReport(AgentStatus.SUCCEEDED)))

    cls.helper = mock.Mock()
    cls.helper.get_stale_builds = mock.Mock(return_value=[])

    build = {
        'id': '123',
        'name': 'abc',
        'commitShort': '345',
        'artifactUrl': 'https://test',
    }
    envvar = {'id': 'abc', 'url': 'https://test'}

    def make_goal(stage, deploy_id='123', **extra):
        # All goals share env 'abc'/'def' in stage 'beta'; only the deploy
        # id, the deploy stage and the optional payload keys differ.
        goal = {
            'deployId': deploy_id,
            'envName': 'abc',
            'envId': 'def',
            'stageName': 'beta',
            'deployStage': stage,
        }
        goal.update(extra)
        return goal

    cls.deploy_goal1 = make_goal(DeployStage.PRE_DOWNLOAD, scriptVariables=envvar)
    cls.deploy_goal2 = make_goal(DeployStage.DOWNLOADING, build=build)
    cls.deploy_goal3 = make_goal(DeployStage.STAGING)
    cls.deploy_goal4 = make_goal(DeployStage.PRE_RESTART)
    cls.deploy_goal5 = make_goal(DeployStage.PRE_DOWNLOAD, deploy_id='234', build=build)
    cls.deploy_goal6 = make_goal(DeployStage.SERVING_BUILD, deploy_id='234')

    cls.ping_response1 = {'deployGoal': cls.deploy_goal1, 'opCode': OpCode.DEPLOY}
    cls.ping_response2 = {'deployGoal': cls.deploy_goal2, 'opCode': OpCode.DEPLOY}
    cls.ping_response3 = {'deployGoal': cls.deploy_goal3, 'opCode': OpCode.DEPLOY}
    cls.ping_response4 = {'deployGoal': cls.deploy_goal4, 'opCode': OpCode.DEPLOY}
    cls.ping_response5 = {'deployGoal': cls.deploy_goal5, 'opCode': OpCode.DELETE}
    cls.ping_response6 = {'deployGoal': cls.deploy_goal6, 'opCode': OpCode.DELETE}
    cls.ping_noop_response = {'deployGoal': None, 'opCode': OpCode.NOOP}