def run(self, state): for index, stage in enumerate(self.stages): state.log.info('running stage "{}"'.format(stage)) try: stage.run(state) state.log.info('finished "{}"'.format(stage)) state.log.info('active hosts after this stage: {}'.format( hosts.format_hosts(state.active_hosts))) except (KeyboardInterrupt, Exception) as e: state.log.exception(e) state.log.error('stage "{}" failed completely'.format(stage)) for host in set(state.active_hosts): host.fail(stage, 'stage completely failed') if len(state.failed_hosts) > 0: state.log.warning('failed hosts after "{}": {}'.format( stage, hosts.format_hosts(state.failed_hosts))) state.log.warning('doing rollback for those') for stage in reversed(self.stages[:index + 1]): try: stage.rollback(state) except Exception as e: state.log.exception(e) state.log.error('rollback of "{}" failed') state.all_failed_hosts.update(state.failed_hosts) state.failed_hosts.clear() if len(state.active_hosts) == 0: state.log.error('all the hosts failed, stopping now') return False if len(state.all_failed_hosts) > 0: state.log.warning('finished. Failed hosts are: ') for host in sorted(state.all_failed_hosts, key=lambda host: host.name): stage, reason = host.failure state.log.warning('{}, stage: {}, reason: {}'.format( host.name, stage, reason)) else: state.log.info('finished.') return True
def run(self, state): for index, stage in enumerate(self.stages): state.log.info('running stage "{}"'.format(stage)) try: stage.run(state) state.log.info('finished "{}"'.format(stage)) state.log.info('active hosts after this stage: {}'.format( hosts.format_hosts(state.active_hosts))) except (KeyboardInterrupt, Exception) as e: state.log.exception(e) state.log.error('stage "{}" failed completely'.format(stage)) for host in set(state.active_hosts): host.fail(stage, 'stage completely failed') if len(state.failed_hosts) > 0: state.log.warning('failed hosts after "{}": {}'.format( stage, hosts.format_hosts(state.failed_hosts))) state.log.warning('doing rollback for those') for stage in reversed(self.stages[:index+1]): try: stage.rollback(state) except Exception as e: state.log.exception(e) state.log.error('rollback of "{}" failed') state.all_failed_hosts.update(state.failed_hosts) state.failed_hosts.clear() if len(state.active_hosts) == 0: state.log.error('all the hosts failed, stopping now') return False if len(state.all_failed_hosts) > 0: state.log.warning('finished. Failed hosts are: ') for host in sorted(state.all_failed_hosts, key=lambda host: host.name): stage, reason = host.failure state.log.warning('{}, stage: {}, reason: {}'.format( host.name, stage, reason)) else: state.log.info('finished.') return True
def run(self, state): for i in range(self.tries): if i != 0: time.sleep(self.pause) rv, failed = self.step(state) if not rv: state.log.warning('call to sinfo failed') failed = state.active_hosts break if len(failed) == 0: return state.log.warning( 'still waiting for SLURM to come up on ' + 'the following hosts: {}'.format(hosts.format_hosts(failed))) for host in failed: host.fail(self, 'timed out while waiting for SLURM to come up')
def run(self, state): for i in range(self.tries): if i != 0: time.sleep(self.pause) rv, failed = self.step(state) if not rv: state.log.warning('call to sinfo failed') failed = state.active_hosts break if len(failed) == 0: return state.log.warning('still waiting for SLURM to come up on ' + 'the following hosts: {}'.format( hosts.format_hosts(failed))) for host in failed: host.fail(self, 'timed out while waiting for SLURM to come up')