def transition_when_all_children_responded(self, sender, msg, expected_status, new_status, transition):
    """
    Waits until all children have sent a specific response message and then transitions this actor to a new status.

    :param sender: The child actor that has responded.
    :param msg: The response message.
    :param expected_status: The status in which this actor should be upon calling this method.
    :param new_status: The new status once all child actors have responded.
    :param transition: A parameter-less function to call immediately after changing the status.
    """
    if self.is_current_status_expected(expected_status):
        self.received_responses.append(msg)
        response_count = len(self.received_responses)
        expected_count = len(self.children)

        self.logger.debug("[%d] of [%d] child actors have responded for transition from [%s] to [%s].",
                          response_count, expected_count, self.status, new_status)
        if response_count == expected_count:
            self.logger.debug("All [%d] child actors have responded. Transitioning now from [%s] to [%s].",
                              expected_count, self.status, new_status)
            # all nodes have responded, change status
            self.status = new_status
            self.received_responses = []
            transition()
        elif response_count > expected_count:
            raise exceptions.RallyAssertionError(
                "Received [%d] responses but only [%d] were expected to transition from [%s] to [%s]. The responses are: %s" %
                (response_count, expected_count, self.status, new_status, self.received_responses))
    else:
        raise exceptions.RallyAssertionError("Received [%s] from [%s] but we are in status [%s] instead of [%s]." %
                                             (type(msg), sender, self.status, expected_status))
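# Minimal, self-contained sketch (not from the original source) of the counting
# logic above: a parent tracks responses and flips its status only once every
# child has answered. "Parent" is a hypothetical stand-in, not Rally's actor class.
class Parent:
    def __init__(self, children):
        self.children = children
        self.received_responses = []
        self.status = "stopping"

    def on_child_response(self, msg, new_status, transition):
        self.received_responses.append(msg)
        if len(self.received_responses) == len(self.children):
            self.status = new_status
            self.received_responses = []
            transition()

p = Parent(children=["a", "b", "c"])
for child in p.children:
    p.on_child_response("stopped", "stopped", lambda: print("all children stopped"))
# prints "all children stopped" exactly once, after the third response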
def drive(self):
    task = None
    # skip non-tasks in the task list
    while task is None:
        task = self.tasks[self.current_task]
        self.current_task += 1

    if isinstance(task, JoinPoint):
        logger.info("client [%d] reached join point [%s]." % (self.client_id, task))
        # clients that don't execute tasks don't need to care about waiting
        if self.executor_future is not None:
            self.executor_future.result()
            self.send_samples()
        self.executor_future = None
        self.sampler = None
        self.send(self.master, JoinPointReached(self.client_id, task))
    elif isinstance(task, track.Task):
        logger.info("Client [%d] is executing [%s]." % (self.client_id, task))
        self.sampler = Sampler(self.client_id, task.operation, self.start_timestamp)
        schedule = schedule_for(self.track, task, self.client_id)
        self.executor_future = self.pool.submit(execute_schedule, schedule, self.es, self.sampler)
        self.wakeupAfter(datetime.timedelta(seconds=LoadGenerator.WAKEUP_INTERVAL_SECONDS))
    else:
        raise exceptions.RallyAssertionError("Unknown task type [%s]" % type(task))
def create(cfg, metrics_store, node_ip, node_http_port, all_node_ips, all_node_ids,
           sources=False, distribution=False, external=False, docker=False):
    race_root_path = paths.race_root(cfg)
    node_ids = cfg.opts("provisioning", "node.ids", mandatory=False)
    node_name_prefix = cfg.opts("provisioning", "node.name.prefix")
    car, plugins = load_team(cfg, external)

    if sources or distribution:
        s = supplier.create(cfg, sources, distribution, car, plugins)
        p = []
        all_node_names = ["%s-%s" % (node_name_prefix, n) for n in all_node_ids]
        for node_id in node_ids:
            node_name = "%s-%s" % (node_name_prefix, node_id)
            p.append(provisioner.local(cfg, car, plugins, node_ip, node_http_port, all_node_ips, all_node_names,
                                       race_root_path, node_name))
        l = launcher.ProcessLauncher(cfg)
    elif external:
        raise exceptions.RallyAssertionError("Externally provisioned clusters should not need to be managed by Rally's mechanic")
    elif docker:
        if len(plugins) > 0:
            raise exceptions.SystemSetupError("You cannot specify any plugins for Docker clusters. Please remove "
                                              "\"--elasticsearch-plugins\" and try again.")
        s = lambda: None
        p = []
        for node_id in node_ids:
            node_name = "%s-%s" % (node_name_prefix, node_id)
            p.append(provisioner.docker(cfg, car, node_ip, node_http_port, race_root_path, node_name))
        l = launcher.DockerLauncher(cfg)
    else:
        # It is a programmer error (and not a user error) if this function is called with wrong parameters
        raise RuntimeError("One of sources, distribution, docker or external must be True")

    return Mechanic(cfg, metrics_store, s, p, l)
def iteration_count_based(target_throughput, warmup_iterations, iterations, runner, params):
    """
    Calculates the necessary schedule based on a given number of iterations.

    :param target_throughput: The desired target throughput in operations / second or None if throughput should not be limited.
    :param warmup_iterations: The number of warmup iterations to run. 0 if no warmup should be performed.
    :param iterations: The number of measurement iterations to run.
    :param runner: The runner for a given operation.
    :param params: The parameter source for a given operation.
    :return: A generator for the corresponding parameters.
    """
    wait_time = 1 / target_throughput if target_throughput else 0
    total_iterations = warmup_iterations + iterations
    if total_iterations == 0:
        raise exceptions.RallyAssertionError("Operation must run at least for one iteration.")
    for i in range(0, warmup_iterations):
        yield (wait_time * i, lambda start: metrics.SampleType.Warmup, i, total_iterations, runner, params.params())
    for i in range(0, iterations):
        yield (wait_time * (warmup_iterations + i), lambda start: metrics.SampleType.Normal, i, total_iterations, runner, params.params())
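# Illustrative sketch (not from the original source): the timing math above
# spaces iterations evenly at 1 / target_throughput seconds. With a target of
# 4 ops/s, 2 warmup and 3 measurement iterations, the expected start offsets are:
target_throughput = 4
warmup_iterations, iterations = 2, 3
wait_time = 1 / target_throughput
offsets = [wait_time * i for i in range(warmup_iterations + iterations)]
print(offsets)  # [0.0, 0.25, 0.5, 0.75, 1.0] -> the first two are warmup samples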
def partition(self, partition_index, total_partitions):
    if self.total_partitions is None:
        self.total_partitions = total_partitions
    elif self.total_partitions != total_partitions:
        raise exceptions.RallyAssertionError(
            f"Total partitions is expected to be [{self.total_partitions}] but was [{total_partitions}]")
    self.partitions.append(partition_index)
def after_request(self, now, weight, unit, request_meta_data):
    if weight > 0 and (self.first_request or self.current_weight != weight):
        expected_unit = self.task.target_throughput.unit
        actual_unit = f"{unit}/s"
        if actual_unit != expected_unit:
            # *temporary* workaround to convert mismatching units to ops/s to stay backwards-compatible.
            #
            # This ensures that we throttle based on ops/s but report based on the original unit (as before).
            if expected_unit == "ops/s":
                weight = 1
                if self.first_request:
                    logging.getLogger(__name__).warning(
                        "Task [%s] throttles based on [%s] but reports [%s]. Please specify the target throughput in [%s] instead.",
                        self.task, expected_unit, actual_unit, actual_unit)
            else:
                raise exceptions.RallyAssertionError(
                    f"Target throughput for [{self.task}] is specified in "
                    f"[{expected_unit}] but the task throughput is measured "
                    f"in [{actual_unit}].")

        self.first_request = False
        self.current_weight = weight
        # throughput in requests/s for this client
        target_throughput = self.task.target_throughput.value / self.task.clients / self.current_weight
        self.scheduler = self.scheduler_class(self.task, target_throughput)
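# Illustrative sketch (not from the original source): the per-client target
# throughput above divides the task-level target by the number of clients and
# by the request weight (e.g. a bulk of 500 docs has weight 500 when the unit
# is docs/s). All numbers are hypothetical.
task_target = 10000  # e.g. 10,000 docs/s for the whole task
clients = 8
bulk_size = 500      # weight of a single bulk request
per_client_requests_per_second = task_target / clients / bulk_size
print(per_client_requests_per_second)  # 2.5 bulk requests/s per client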
def _do_wait(es, expected_cluster_status):
    reached_cluster_status = None
    use_wait_for_relocating_shards = False
    for attempt in range(10):
        try:
            if use_wait_for_relocating_shards:
                # fall back to the parameter name that predates Elasticsearch 5.0
                result = es.cluster.health(wait_for_status=expected_cluster_status, wait_for_relocating_shards=0, timeout="3s")
            else:
                result = es.cluster.health(wait_for_status=expected_cluster_status, timeout="3s",
                                           params={"wait_for_no_relocating_shards": True})
        except (socket.timeout, elasticsearch.exceptions.ConnectionError):
            pass
        except elasticsearch.exceptions.TransportError as e:
            if 400 <= e.status_code < 500:
                logger.exception("Client error in health API. Falling back to 'wait_for_relocating_shards'.")
                use_wait_for_relocating_shards = True
        else:
            reached_cluster_status = result["status"]
            relocating_shards = result["relocating_shards"]
            logger.info("GOT: %s" % str(result))
            logger.info("ALLOC:\n%s" % es.cat.allocation(v=True))
            logger.info("RECOVERY:\n%s" % es.cat.recovery(v=True))
            logger.info("SHARDS:\n%s" % es.cat.shards(v=True))
            if reached_cluster_status == expected_cluster_status and relocating_shards == 0:
                return reached_cluster_status, relocating_shards
            else:
                time.sleep(0.5)
    msg = "Cluster did not reach status [%s]. Last reached status: [%s]" % (expected_cluster_status, reached_cluster_status)
    logger.error(msg)
    raise exceptions.RallyAssertionError(msg)
def __next__(self):
    if self.conflicting_ids is not None:
        if self.conflict_probability and self.id_up_to > 0 and self.rand() <= self.conflict_probability:
            # a recency of zero means that we don't care about recency and just take a random number
            # within the whole interval.
            if self.recency == 0:
                idx = self.randint(0, self.id_up_to - 1)
            else:
                # A recency > 0 biases id selection towards more recent ids. The recency parameter decides
                # by how much we bias. See docs for the resulting curve.
                #
                # idx_range is in the interval [0, 1].
                idx_range = min(self.randexp(GenerateActionMetaData.RECENCY_SLOPE * self.recency), 1)
                # the resulting index is in the range [0, self.id_up_to). Note that a smaller idx_range
                # biases towards more recently used ids (higher indexes).
                idx = round((self.id_up_to - 1) * (1 - idx_range))
            doc_id = self.conflicting_ids[idx]
            action = self.on_conflict
        else:
            if self.id_up_to >= len(self.conflicting_ids):
                raise StopIteration()
            doc_id = self.conflicting_ids[self.id_up_to]
            self.id_up_to += 1
            action = "index"

        if action == "index":
            return "index", self.meta_data_index_with_id % doc_id
        elif action == "update":
            return "update", self.meta_data_update_with_id % doc_id
        else:
            raise exceptions.RallyAssertionError("Unknown action [{}]".format(action))
    else:
        return "index", self.meta_data_index_no_id
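# Illustrative sketch (not from the original source): how an exponentially
# distributed draw biases id selection towards recent (higher) indexes.
# random.expovariate stands in for self.randexp, and the slope constant 0.8 is
# a hypothetical stand-in for GenerateActionMetaData.RECENCY_SLOPE.
import random
from collections import Counter

RECENCY_SLOPE = 0.8
recency = 5.0
id_up_to = 10

counts = Counter()
for _ in range(10000):
    idx_range = min(random.expovariate(RECENCY_SLOPE * recency), 1)
    counts[round((id_up_to - 1) * (1 - idx_range))] += 1
print(sorted(counts.items()))  # mass concentrates near idx 9, the most recent id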
def union(self, other):
    if self.name != other.name:
        raise exceptions.RallyAssertionError("Both document corpora must have the same name")
    if self is other:
        return self
    else:
        return DocumentCorpus(self.name, list(set(self.documents).union(other.documents)))
def receiveMsg_WakeupMessage(self, msg, sender):
    if msg.payload == MechanicActor.WAKEUP_RESET_RELATIVE_TIME:
        self.reset_relative_time()
    elif msg.payload == MechanicActor.WAKEUP_FLUSH_METRICS:
        logger.info("Flushing cluster-wide system metrics store.")
        self.metrics_store.flush(refresh=False)
        self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS, payload=MechanicActor.WAKEUP_FLUSH_METRICS)
    else:
        raise exceptions.RallyAssertionError("Unknown wakeup reason [{}]".format(msg.payload))
def partition(self, partition_index, total_partitions):
    chosen_indices = [idx for idx in self.indices if idx.matches(self.index_name)]
    if not chosen_indices:
        raise exceptions.RallyAssertionError("The provided index [%s] does not match any of the indices [%s]." %
                                             (self.index_name, ",".join([str(i) for i in self.indices])))

    logger.info("Choosing indices [%s] for partition [%d] of [%d]." %
                (",".join([str(i) for i in chosen_indices]), partition_index, total_partitions))
    return PartitionBulkIndexParamSource(chosen_indices, partition_index, total_partitions, self.action_metadata,
                                         self.batch_size, self.bulk_size, self.id_conflicts, self.pipeline, self._params)
def __call__(self, es, params):
    source_index = mandatory(params, "source-index", self)
    target_index = mandatory(params, "target-index", self)
    # we need to inject additional settings so we better copy the body
    target_body = deepcopy(mandatory(params, "target-body", self))
    shrink_node = params.get("shrink-node")
    # Choose a random data node if none is specified
    if not shrink_node:
        node_names = []
        # choose a random data node
        for node in es.nodes.info()["nodes"].values():
            if "data" in node["roles"]:
                node_names.append(node["name"])
        if not node_names:
            raise exceptions.RallyAssertionError("Could not choose a suitable shrink-node automatically. Please specify it explicitly.")
        shrink_node = random.choice(node_names)
    self.logger.info("Using [%s] as shrink node.", shrink_node)
    self.logger.info("Preparing [%s] for shrinking.", source_index)

    # prepare index for shrinking
    es.indices.put_settings(index=source_index,
                            body={
                                "settings": {
                                    "index.routing.allocation.require._name": shrink_node,
                                    "index.blocks.write": "true"
                                }
                            },
                            preserve_existing=True)

    self.logger.info("Waiting for relocation to finish for index [%s]...", source_index)
    self._wait_for(es, source_index, "shard relocation for index [{}]".format(source_index))
    self.logger.info("Shrinking [%s] to [%s].", source_index, target_index)
    if "settings" not in target_body:
        target_body["settings"] = {}
    target_body["settings"]["index.routing.allocation.require._name"] = None
    target_body["settings"]["index.blocks.write"] = None
    # kick off the shrink operation
    es.indices.shrink(index=source_index, target=target_index, body=target_body)

    self.logger.info("Waiting for shrink to finish for index [%s]...", source_index)
    self._wait_for(es, target_index, "shrink for index [{}]".format(target_index))
    self.logger.info("Shrinking [%s] to [%s] has finished.", source_index, target_index)
    # ops_count is not really important for this operation...
    return 1, "ops"
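# Illustrative sketch (not from the original source): the params dict this
# runner consumes. Index names and target-body settings are hypothetical.
shrink_params = {
    "source-index": "logs-2024",
    "target-index": "logs-2024-shrunk",
    "target-body": {
        "settings": {
            "index.number_of_shards": 1,
            "index.codec": "best_compression"
        }
    }
    # "shrink-node" may be set explicitly; otherwise a random data node is chosen
}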
def assert_doc_count(self):
    if self.expected_doc_count is not None:
        stats = self.cluster.indices_stats(index=self.indices, metric="_all", level="shards")
        actual_doc_count = stats["_all"]["primaries"]["docs"]["count"]
        if self.expected_doc_count != actual_doc_count:
            msg = "Wrong number of documents: expected %s but got %s. If you benchmark against an external cluster be sure to " \
                  "start with all indices empty." % (self.expected_doc_count, actual_doc_count)
            logger.error(msg)
            raise exceptions.RallyAssertionError(msg)
def union(self, other):
    """
    Creates a new corpus based on the current and the provided other corpus.

    This is not meant as a generic union of two arbitrary corpora but rather to unify the documents referenced by
    two instances of the same corpus. This is useful when two tasks reference different subsets of a corpus and a
    unified view (e.g. for downloading the appropriate document files) is required.

    :param other: The other corpus to unify with this one. Must have the same name and meta-data.
    :return: A document corpus instance with the same name and meta-data but with documents from both corpora.
    """
    if self.name != other.name:
        raise exceptions.RallyAssertionError(f"Corpora names differ: [{self.name}] and [{other.name}].")
    if self.meta_data != other.meta_data:
        raise exceptions.RallyAssertionError(f"Corpora meta-data differ: [{self.meta_data}] and [{other.meta_data}].")
    if self is other:
        return self
    else:
        return DocumentCorpus(name=self.name,
                              documents=list(set(self.documents).union(other.documents)),
                              meta_data=dict(self.meta_data))
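# Minimal, self-contained sketch (not from the original source) of the union
# semantics above: same name and meta-data required, shared documents
# deduplicated via set union (which requires hashable document objects).
# "MiniCorpus" and its values are hypothetical stand-ins for DocumentCorpus.
class MiniCorpus:
    def __init__(self, name, documents, meta_data):
        self.name = name
        self.documents = documents
        self.meta_data = meta_data

    def union(self, other):
        assert self.name == other.name and self.meta_data == other.meta_data
        return MiniCorpus(self.name, list(set(self.documents) | set(other.documents)), dict(self.meta_data))

a = MiniCorpus("geonames", ["part-0.json", "part-1.json"], {"license": "CC"})
b = MiniCorpus("geonames", ["part-1.json", "part-2.json"], {"license": "CC"})
print(sorted(a.union(b).documents))  # ['part-0.json', 'part-1.json', 'part-2.json']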
def _wait_for(self, es, idx, description):
    # wait a little bit before the first check
    time.sleep(3)
    result = self.cluster_health(es, params={
        "index": idx,
        "retries": sys.maxsize,
        "request-params": {
            "wait_for_no_relocating_shards": "true"
        }
    })
    if not result["success"]:
        raise exceptions.RallyAssertionError("Failed to wait for [{}].".format(description))
def _do_wait(es, es_version, expected_cluster_status):
    reached_cluster_status = None
    relocating_shards = -1
    major, minor, patch, suffix = versions.components(es_version)
    if major < 5:
        use_wait_for_relocating_shards = True
    elif major == 5 and minor == 0 and patch == 0 and suffix and suffix.startswith("alpha"):
        use_wait_for_relocating_shards = True
    else:
        use_wait_for_relocating_shards = False

    for attempt in range(10):
        try:
            if use_wait_for_relocating_shards:
                result = es.cluster.health(wait_for_status=expected_cluster_status, wait_for_relocating_shards=0, timeout="3s")
            else:
                result = es.cluster.health(wait_for_status=expected_cluster_status, timeout="3s",
                                           params={"wait_for_no_relocating_shards": True})
        except (socket.timeout, elasticsearch.exceptions.ConnectionError):
            pass
        except elasticsearch.exceptions.TransportError as e:
            if e.status_code == 408:
                logger.info("Timed out waiting for cluster health status. Retrying shortly...")
                time.sleep(0.5)
            else:
                raise e
        else:
            reached_cluster_status = result["status"]
            relocating_shards = result["relocating_shards"]
            logger.info("GOT: %s" % str(result))
            logger.info("ALLOC:\n%s" % es.cat.allocation(v=True))
            logger.info("RECOVERY:\n%s" % es.cat.recovery(v=True))
            logger.info("SHARDS:\n%s" % es.cat.shards(v=True))
            if reached_cluster_status == expected_cluster_status and relocating_shards == 0:
                return reached_cluster_status, relocating_shards
            else:
                time.sleep(0.5)

    if reached_cluster_status != expected_cluster_status:
        msg = "Cluster did not reach status [%s]. Last reached status: [%s]" % (expected_cluster_status, reached_cluster_status)
    else:
        msg = "Cluster reached expected status [%s] but there were [%d] relocating shards and we require zero relocating shards " \
              "(Use the /_cat/shards API to check which shards are relocating.)" % (reached_cluster_status, relocating_shards)
    logger.error(msg)
    raise exceptions.RallyAssertionError(msg)
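# Illustrative sketch (not from the original source): the version gate above
# keeps the legacy 'wait_for_relocating_shards' parameter for clusters older
# than 5.0.0 and for the 5.0.0 alpha releases, which predate the rename to
# 'wait_for_no_relocating_shards'.
def needs_legacy_param(major, minor, patch, suffix):
    if major < 5:
        return True
    return major == 5 and minor == 0 and patch == 0 and bool(suffix) and suffix.startswith("alpha")

print(needs_legacy_param(2, 4, 6, None))      # True  -> wait_for_relocating_shards=0
print(needs_legacy_param(5, 0, 0, "alpha5"))  # True
print(needs_legacy_param(5, 0, 0, None))      # False -> wait_for_no_relocating_shards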
def send_to_children_and_transition(self, sender, msg, expected_status, new_status):
    """
    Sends the provided message to all child actors and immediately transitions to the new status.

    :param sender: The actor from which we forward this message (in case it is message forwarding). Otherwise our own address.
    :param msg: The message to send.
    :param expected_status: The status in which this actor should be upon calling this method.
    :param new_status: The new status.
    """
    if self.is_current_status_expected(expected_status):
        self.logger.info("Transitioning from [%s] to [%s].", self.status, new_status)
        self.status = new_status
        for m in filter(None, self.children):
            self.send(m, msg)
    else:
        raise exceptions.RallyAssertionError("Received [%s] from [%s] but we are in status [%s] instead of [%s]." %
                                             (type(msg), sender, self.status, expected_status))
def used_corpora(self, t, params):
    corpora = []
    track_corpora_names = [corpus.name for corpus in t.corpora]
    corpora_names = params.get("corpora", track_corpora_names)
    if isinstance(corpora_names, str):
        corpora_names = [corpora_names]

    for corpus in t.corpora:
        if corpus.name in corpora_names:
            filtered_corpus = corpus.filter(source_format=track.Documents.SOURCE_FORMAT_BULK,
                                            target_indices=params.get("indices"))
            if filtered_corpus.number_of_documents(source_format=track.Documents.SOURCE_FORMAT_BULK) > 0:
                corpora.append(filtered_corpus)

    # the track has corpora but none of them match
    if t.corpora and not corpora:
        raise exceptions.RallyAssertionError("The provided corpus %s does not match any of the corpora %s." %
                                             (corpora_names, track_corpora_names))
    return corpora
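# Illustrative sketch (not from the original source): the "corpora" parameter
# accepts either a single name or a list of names; a bare string is normalized
# to a one-element list before matching. The corpus name is hypothetical.
params = {"corpora": "geonames"}
corpora_names = params.get("corpora", ["all-track-corpora"])
if isinstance(corpora_names, str):
    corpora_names = [corpora_names]
print(corpora_names)  # ['geonames']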
def __next__(self):
    if self.conflicting_ids is not None:
        if self.conflict_probability and self.id_up_to > 0 and self.rand() <= self.conflict_probability:
            doc_id = self.conflicting_ids[self.randint(0, self.id_up_to - 1)]
            action = self.on_conflict
        else:
            if self.id_up_to >= len(self.conflicting_ids):
                raise StopIteration()
            doc_id = self.conflicting_ids[self.id_up_to]
            self.id_up_to += 1
            action = "index"

        if action == "index":
            return "index", '{"index": {"_index": "%s", "_type": "%s", "_id": "%s"}}' % (self.index_name, self.type_name, doc_id)
        elif action == "update":
            return "update", '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}' % (self.index_name, self.type_name, doc_id)
        else:
            raise exceptions.RallyAssertionError("Unknown action [{}]".format(action))
    else:
        return "index", '{"index": {"_index": "%s", "_type": "%s"}}' % (self.index_name, self.type_name)
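# Illustrative sketch (not from the original source): the bulk meta-data lines
# produced above, with hypothetical index/type/id values.
index_name, type_name, doc_id = "geonames", "docs", "100"
print('{"index": {"_index": "%s", "_type": "%s", "_id": "%s"}}' % (index_name, type_name, doc_id))
# {"index": {"_index": "geonames", "_type": "docs", "_id": "100"}}
print('{"index": {"_index": "%s", "_type": "%s"}}' % (index_name, type_name))
# {"index": {"_index": "geonames", "_type": "docs"}} -> id auto-generated by Elasticsearch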
def receiveMsg_WakeupMessage(self, msg, sender):
    if msg.payload == MechanicActor.WAKEUP_RESET_RELATIVE_TIME:
        self.reset_relative_time()
    else:
        raise exceptions.RallyAssertionError("Unknown wakeup reason [{}]".format(msg.payload))
def _do_wait(es, expected_cluster_status, sleep=time.sleep):
    import elasticsearch
    from enum import Enum
    from functools import total_ordering

    @total_ordering
    class ClusterHealthStatus(Enum):
        UNKNOWN = 0
        RED = 1
        YELLOW = 2
        GREEN = 3

        def __lt__(self, other):
            if self.__class__ is other.__class__:
                return self.value < other.value
            return NotImplemented

    def status(v):
        try:
            return ClusterHealthStatus[v.upper()]
        except (KeyError, AttributeError):
            return ClusterHealthStatus.UNKNOWN

    reached_cluster_status = None
    relocating_shards = -1
    major, minor, patch, suffix = versions.components(es.info()["version"]["number"])
    if major < 5:
        use_wait_for_relocating_shards = True
    elif major == 5 and minor == 0 and patch == 0 and suffix and suffix.startswith("alpha"):
        use_wait_for_relocating_shards = True
    else:
        use_wait_for_relocating_shards = False

    max_attempts = 10
    for attempt in range(max_attempts):
        try:
            # Is this the last attempt? Then just retrieve the status.
            if attempt + 1 == max_attempts:
                result = es.cluster.health()
            elif use_wait_for_relocating_shards:
                result = es.cluster.health(wait_for_status=expected_cluster_status, timeout="3s",
                                           params={"wait_for_relocating_shards": 0})
            else:
                result = es.cluster.health(wait_for_status=expected_cluster_status, timeout="3s",
                                           wait_for_no_relocating_shards=True)
        except (socket.timeout, elasticsearch.exceptions.ConnectionError):
            pass
        except elasticsearch.exceptions.TransportError as e:
            if e.status_code == 408:
                logger.info("Timed out waiting for cluster health status. Retrying shortly...")
                sleep(0.5)
            else:
                raise e
        else:
            reached_cluster_status = result["status"]
            relocating_shards = result["relocating_shards"]
            logger.info("GOT: %s" % str(result))
            logger.info("ALLOC:\n%s" % es.cat.allocation(v=True))
            logger.info("RECOVERY:\n%s" % es.cat.recovery(v=True))
            logger.info("SHARDS:\n%s" % es.cat.shards(v=True))
            if status(reached_cluster_status) >= status(expected_cluster_status) and relocating_shards == 0:
                return reached_cluster_status, relocating_shards
            else:
                sleep(0.5)

    if status(reached_cluster_status) < status(expected_cluster_status):
        msg = "Cluster did not reach status [%s]. Last reached status: [%s]" % (expected_cluster_status, reached_cluster_status)
    else:
        msg = "Cluster reached status [%s] which is equal or better than the expected status [%s] but there were [%d] relocating shards " \
              "and we require zero relocating shards (Use the /_cat/shards API to check which shards are relocating.)" % \
              (reached_cluster_status, expected_cluster_status, relocating_shards)
    logger.error(msg)
    raise exceptions.RallyAssertionError(msg)
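# Illustrative sketch (not from the original source): the total ordering above
# lets a "better" health state satisfy a weaker expectation, e.g. a green
# cluster satisfies an expected status of yellow.
from enum import Enum
from functools import total_ordering

@total_ordering
class ClusterHealthStatus(Enum):
    UNKNOWN = 0
    RED = 1
    YELLOW = 2
    GREEN = 3

    def __lt__(self, other):
        if self.__class__ is other.__class__:
            return self.value < other.value
        return NotImplemented

print(ClusterHealthStatus["GREEN"] >= ClusterHealthStatus["YELLOW"])  # True
print(ClusterHealthStatus["RED"] >= ClusterHealthStatus["YELLOW"])    # False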