def grow(self, by=1, timeout=30, with_test=True):
    """Join `by` new workers to the running cluster.

    Allocates three fresh ports (control, data, external) per new
    worker, starts each via `add_runner`, and — when `with_test` is
    true — verifies partition migration against the pre-join snapshot.

    Returns the list of newly created Runner objects.
    """
    logging.debug("grow(by={}, timeout={}, with_test={})".format(
        by, timeout, with_test))
    # Snapshot partitions before joining so migration can be confirmed.
    pre_partitions = self.get_partition_data() if with_test else None
    fresh_ports = get_port_values(num=3 * by, host=self.host,
                                  base_port=25000)
    # One (control, data, external) address triple per joining worker.
    addr_triples = [
        ["{}:{}".format(self.host, p)
         for p in fresh_ports[3 * idx:3 * idx + 3]]
        for idx in range(by)]
    joined = []
    for control, data, external in addr_triples:
        new_runner = add_runner(
            worker_id=self._worker_id_counter,
            runners=self.workers,
            command=self.command,
            source_addrs=self.source_addrs,
            sink_addrs=self.sink_addrs,
            metrics_addr=self.metrics_addr,
            control_addr=self.workers[0].control,
            res_dir=self.res_dir,
            workers=by,
            my_control_addr=control,
            my_data_addr=data,
            my_external_addr=external)
        self._worker_id_counter += 1
        joined.append(new_runner)
        self.runners.append(new_runner)
    if with_test:
        workers = {'joining': [w.name for w in joined], 'leaving': []}
        self.confirm_migration(pre_partitions, workers)
    return joined
def __init__(self, command, host='127.0.0.1', sources=1, workers=1,
             sinks=1, sink_mode='framed', worker_join_timeout=30,
             is_ready_timeout=30, res_dir=None, runner_data=None):
    """Start a cluster: metrics receiver, sinks, and worker runners.

    Parameters:
        command: the worker command line to launch.
        host: interface for all sockets.
        sources/workers/sinks: counts of each component to create.
        sink_mode: mode passed to each Sink (default 'framed').
        worker_join_timeout / is_ready_timeout: timeouts in seconds.
        res_dir: resilience data directory; a temp dir is created
            under /tmp/ when None.
        runner_data: optional list of per-runner data; defaults to a
            fresh empty list per instance.

    Raises: re-raises any error hit during startup, after recording it
    in `self.errors` and running `self.__finally__()` cleanup.
    """
    # Create attributes
    self._finalized = False
    self.command = command
    self.host = host
    self.workers = TypedList(types=(Runner,))
    self.dead_workers = TypedList(types=(Runner,))
    self.runners = TypedList(types=(Runner,))
    self.source_addrs = []
    self.sink_addrs = []
    self.sinks = []
    self.senders = []
    self.worker_join_timeout = worker_join_timeout
    self.is_ready_timeout = is_ready_timeout
    self.metrics = Metrics(host, mode='framed')
    self.errors = []
    self._worker_id_counter = 0
    if res_dir is None:
        self.res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    else:
        self.res_dir = res_dir
    # Bug fix: the default used to be the mutable literal `[]`, which
    # is shared across every instance constructed with the default.
    self.runner_data = [] if runner_data is None else runner_data
    # Try to start everything... clean up on exception
    try:
        setup_resilience_path(self.res_dir)
        self.metrics.start()
        self.metrics_addr = ":".join(
            map(str, self.metrics.get_connection_info()))
        # `_` avoids shadowing the Sink-typed `s` used just below.
        for _ in range(sinks):
            self.sinks.append(Sink(host, mode=sink_mode))
            self.sinks[-1].start()
            if self.sinks[-1].err is not None:
                raise self.sinks[-1].err
        self.sink_addrs = ["{}:{}"
                           .format(*map(str, s.get_connection_info()))
                           for s in self.sinks]
        # One port per source plus three per worker
        # (control, data, external).
        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        addresses = ['{}:{}'.format(host, p) for p in ports]
        (self.source_addrs, worker_addrs) = (
            addresses[:sources],
            [addresses[sources:][i:i + 3]
             for i in xrange(0, len(addresses[sources:]), 3)])
        start_runners(self.workers, self.command, self.source_addrs,
                      self.sink_addrs, self.metrics_addr, self.res_dir,
                      workers, worker_addrs)
        self.runners.extend(self.workers)
        self._worker_id_counter = len(self.workers)
        # Wait for all runners to report ready to process
        self.wait_to_resume_processing(self.is_ready_timeout)
        # make sure `workers` runners are active and listed in the
        # cluster status query
        logging.debug("Testing cluster size via obs query")
        self.query_observability(
            cluster_status_query,
            self.runners[0].external,
            tests=[(worker_count_matches, [workers])])
    except Exception as err:
        # Bug fix: message said "and error"; and bare `raise` keeps the
        # original traceback, unlike the previous `raise err`.
        logging.error("Encountered an error when starting up the cluster")
        logging.exception(err)
        self.errors.append(err)
        self.__finally__()
        raise