def starter_run_impl(self):
    """Spawn all starter instances and wait for the cluster to come up.

    Launches every starter in ``self.starter_instances``, polls until each
    starter process answers, then waits for the cluster instances behind
    them to be detected.  Finally sets the root passvoid ("cluster") on
    every node and records it on ``self.passvoid``.
    """
    lh.subsection("instance setup")
    for manager in self.starter_instances:
        logging.info("Spawning instance")
        manager.run_starter()

    logging.info("waiting for the starters to become alive")
    not_started = self.starter_instances[:]  # explicit copy: entries are popped as they come up
    while not_started:
        # NOTE: fixed typo in the original message ("mananger");
        # use lazy %-style args so the str() only happens when DEBUG is on.
        logging.debug("waiting for manager with logfile:%s", not_started[-1].log_file)
        if not_started[-1].is_instance_up():
            not_started.pop()
        progress(".")
        time.sleep(1)

    logging.info("waiting for the cluster instances to become alive")
    for node in self.starter_instances:
        node.detect_instances()
        node.detect_instance_pids()
        # self.basecfg.add_frontend('http', self.basecfg.publicip, str(node.get_frontend_port()))

    logging.info("instances are ready - JWT: %s", self.starter_instances[0].get_jwt_header())
    # Only the first node actually changes the passvoid (second arg True);
    # the remaining nodes merely record it.
    for count, node in enumerate(self.starter_instances):
        node.set_passvoid("cluster", count == 0)
    self.passvoid = "cluster"
def jam_attempt_impl(self):
    """Kill the active-failover leader and validate the failover sequence.

    Terminates the current leader, waits for a follower to take over,
    checks the new leader's UI endpoint answers 200, respawns the old
    leader and verifies it rejoins as a follower (its frontend must
    answer 503).  ``self.success`` is cleared on any mismatch.
    """
    self.first_leader.terminate_instance()
    logging.info("waiting for new leader...")
    self.new_leader = None
    # Poll the remaining followers once per second until one claims leadership.
    while self.new_leader is None:
        for candidate in self.follower_nodes:
            candidate.detect_leader()
            if candidate.is_leader:
                logging.info('have a new leader: %s', str(candidate.arguments))
                self.new_leader = candidate
                self.leader = candidate
                break
        progress('.')
        time.sleep(1)

    if self.selenium:
        active_cfg = self.new_cfg if self.new_cfg else self.cfg
        self.selenium.connect_server(self.leader.get_frontends(), '_system', active_cfg)
        self.selenium.check_old(active_cfg, 1)
    print()
    logging.info(str(self.new_leader))

    # The new leader must serve the replication UI with a 200.
    replication_url = '{host}/_db/_system/_admin/aardvark/index.html#replication'.format(
        host=self.new_leader.get_frontend().get_local_url(''))
    resp = requests.get(replication_url, auth=HTTPBasicAuth('root', self.leader.passvoid))
    logging.info(str(resp))
    if resp.status_code != 200:
        logging.info(resp.text)
        self.success = False
    self.set_frontend_instances()

    prompt_user(self.basecfg, '''The leader failover has happened. please revalidate the UI states on the new leader; you should see *one* follower.''')

    # Bring the old leader back; it has to rejoin as a follower.
    self.first_leader.respawn_instance()
    self.first_leader.detect_instances()
    logging.info("waiting for old leader to show up as follower")
    while not self.first_leader.active_failover_detect_host_now_follower():
        progress('.')
        time.sleep(1)
    print()

    # A follower's frontend is expected to refuse service with 503.
    follower_url = self.first_leader.get_frontend().get_local_url('')
    resp = requests.get(follower_url, auth=HTTPBasicAuth('root', self.leader.passvoid))
    logging.info(str(resp))
    logging.info(str(resp.text))
    if resp.status_code != 503:
        self.success = False

    prompt_user(self.basecfg,
                'The old leader has been respawned as follower (%s),'
                ' so there should be two followers again.'
                % self.first_leader.get_frontend().get_public_url('root@'))

    logging.info("state of this test is: %s", "Success" if self.success else "Failed")
    if self.selenium:
        self.selenium.check_old(self.new_cfg if self.new_cfg else self.cfg, 2, 20)
def upload_status(self, backup_name: str, status_id: str, instance_count: int, timeout: int = 180):
    """Poll a backup transfer's status until all instances report COMPLETED.

    Repeatedly invokes ``arangobackup upload --status-id`` and tallies the
    per-instance ``Status:`` lines from its output.

    :param backup_name: name of the backup the transfer belongs to
    :param status_id: status id returned when the transfer was started
    :param instance_count: number of instances expected to reach COMPLETED
    :param timeout: maximum number of polls (one per second) before giving up
    :raises Exception: if any instance reports FAILED
    :raises TimeoutError: if COMPLETED is not reached within *timeout* polls
    """
    args = [
        "upload",
        "--status-id",
        status_id,
    ]
    # Hoisted out of the loop: the pattern is constant across iterations.
    status_re = re.compile(r".*Status: (.*)")
    while True:
        out = self.run_backup(args, backup_name, True)
        progress(".")
        counts = {
            "ACK": 0,
            "STARTED": 0,
            "COMPLETED": 0,
            "FAILED": 0,
            "CANCELLED": 0,
        }
        for line in out.split("\n"):
            match = status_re.match(str(line))
            if match:
                which = match.group(1)
                try:
                    counts[which] += 1
                except KeyError:
                    # BUGFIX: an unknown status key raises KeyError, not
                    # AttributeError — the original handler never fired and
                    # an unexpected status crashed the poll loop.
                    print("Line with unknown status [%s]: %s %s" % (which, line, str(counts)))
        if counts["COMPLETED"] == instance_count:
            print("all nodes have completed to restore the backup")
            return
        if counts["FAILED"] > 0:
            raise Exception("failed to create backup: " + str(out))
        print("have to retry. " + str(counts) + " - " + str(instance_count))
        timeout -= 1
        if timeout <= 0:
            raise TimeoutError("failed to find %d 'COMPLETED' status for upload status" % instance_count)
        time.sleep(1)