def _add_clients(num_of_clients, version=None, version_separator='_'):
    # TODO make a generic function that _add_clients can use
    """Add client replicas to the namespace.

    :param num_of_clients: int, number of replicas
    :param version: string, the wanted client version
    :param version_separator: string, separator to separate between client key and client version
    :return: list, all created client pods
    """
    if version and not isinstance(version, str):
        raise ValueError("version must be type string")
    if not setup_bootstrap.pods:
        raise Exception("Could not find bootstrap node")

    bs_info = setup_bootstrap.pods[0]
    # resolve the testconfig key: "client" or e.g. "client_v2" when a version is given
    client_key = 'client' if not version else f'client{version_separator}{version}'
    cspec = get_conf(bs_info, testconfig[client_key], testconfig['genesis_delta'])
    return add_multi_clients(testconfig, setup_bootstrap.deployment_id, cspec,
                             size=num_of_clients)
def _add_single_client():
    """Deploy exactly one client pod, record its name in the module-global
    ``client_name``, and return that name.

    :return: string, name of the created client pod
    :raises Exception: when no bootstrap pod is available
    """
    global client_name

    if not setup_bootstrap.pods:
        raise Exception("Could not find bootstrap node")

    bootstrap_info = setup_bootstrap.pods[0]
    client_spec = get_conf(bootstrap_info, testconfig['client'], testconfig['genesis_delta'])
    created_pods = add_multi_clients(testconfig, setup_bootstrap.deployment_id, client_spec, 1)
    client_name = created_pods[0]
    return client_name
def setup_clients_in_namespace(namespace, bs_deployment_info, client_deployment_info,
                               client_config, genesis_time,
                               name="client", file_path=None, oracle=None, poet=None,
                               dep_time_out=120):
    """Create a client deployment (or statefulset) in the given namespace and
    fill ``client_deployment_info`` with the resulting deployment name and pods.

    :param namespace: k8s namespace to create the clients in
    :param bs_deployment_info: bootstrap deployment info used to build the client conf
    :param client_deployment_info: deployment-info object updated in place and returned
    :param client_config: dict, client configuration (may contain "deployment_type")
    :param genesis_time: genesis delta passed through to get_conf
    :param name: label selector value used to list the created pods
    :param file_path: optional explicit spec file path
    :param oracle: optional oracle setup forwarded to get_conf
    :param poet: optional poet setup forwarded to get_conf
    :param dep_time_out: seconds to wait for the k8s object to be ready
    :return: the updated client_deployment_info, or None if the spec files
             could not be resolved
    """
    # setting stateful and deployment configuration files
    # default deployment method is 'deployment'
    dep_method = client_config[
        "deployment_type"] if "deployment_type" in client_config.keys(
        ) else "deployment"
    try:
        dep_file_path, ss_file_path = _setup_dep_ss_file_path(
            file_path, dep_method, 'client')
    except ValueError as e:
        # best-effort: report the problem and bail out without raising
        print(f"error setting up client specification file: {e}")
        return None

    # this function used to be the way to extract the client title
    # in case we want a different title (client_v2 for example) we can specify it
    # directly in "name" input
    # NOTE(review): intentionally unused, kept for reference per the comment above
    def _extract_label():
        return client_deployment_info.deployment_name.split('-')[1]

    cspec = get_conf(bs_deployment_info,
                     client_config,
                     genesis_time,
                     setup_oracle=oracle,
                     setup_poet=poet)

    # pick the deployment vs statefulset creation path based on client_config
    k8s_file, k8s_create_func = choose_k8s_object_create(
        client_config, dep_file_path, ss_file_path)
    resp = k8s_create_func(k8s_file,
                           namespace,
                           deployment_id=client_deployment_info.deployment_id,
                           replica_size=client_config['replicas'],
                           container_specs=cspec,
                           time_out=dep_time_out)

    # NOTE(review): reads the private backing attribute of the k8s response
    # metadata — presumably equivalent to resp.metadata.name; confirm against
    # the kubernetes client version in use
    dep_name = resp.metadata._name
    client_deployment_info.deployment_name = dep_name
    client_pods = (CoreV1ApiClient().list_namespaced_pod(
        namespace,
        include_uninitialized=True,
        label_selector=("name={0}".format(name))).items)
    # keep only pods belonging to the deployment we just created
    client_deployment_info.pods = [{
        'name': c.metadata.name,
        'pod_ip': c.status.pod_ip
    } for c in client_pods if c.metadata.name.startswith(dep_name)]
    return client_deployment_info
def test_add_many_clients(init_session, setup_bootstrap, setup_clients):
    """Add four extra clients and verify each one logs a discovery bootstrap
    message in elasticsearch.

    :param init_session: session fixture
    :param setup_bootstrap: bootstrap deployment fixture (provides pods/deployment_id)
    :param setup_clients: clients fixture (initial cluster)
    """
    # fix: resolve the current ES index explicitly, as the sibling tests do
    # (previously `current_index` was referenced without being bound here)
    current_index = get_curr_ind()
    bs_info = setup_bootstrap.pods[0]
    cspec = get_conf(bs_info, testconfig['client'], testconfig['genesis_delta'])
    pods = add_multi_clients(testconfig, setup_bootstrap.deployment_id, cspec, size=4)

    # wait for the new clients to finish bootstrap and for logs to get to elasticsearch
    time.sleep(40 * timeout_factor)
    fields = {'M': 'discovery_bootstrap'}
    for p in pods:
        # NOTE(review): findFails=True here vs findFails=False in
        # test_late_bootstraps for the same query — confirm which is intended
        hits = poll_query_message(indx=current_index,
                                  namespace=testconfig['namespace'],
                                  client_po_name=p,
                                  fields=fields,
                                  findFails=True,
                                  expected=1)
        assert len(hits) == 1, "Could not find new Client bootstrap message pod:{0}".format(p)
def test_add_delayed_nodes(init_session, add_curl, setup_bootstrap, start_poet, save_log_on_exit):
    """Start a cluster of 20 miners, add 10 more per epoch for several epochs,
    then validate hare participation, layer hashes and the ATX count of the
    last epoch.
    """
    current_index = get_curr_ind()
    bootstrap_pod = setup_bootstrap.pods[0]
    cspec = get_conf(bootstrap_pod, test_config['client'], test_config['genesis_delta'],
                     setup_oracle=None,
                     setup_poet=setup_bootstrap.pods[0]['pod_ip'])
    ns = test_config['namespace']

    client_args = test_config['client']['args']
    layer_duration = int(client_args['layer-duration-sec'])
    layers_per_epoch = int(client_args['layers-per-epoch'])
    epoch_duration = layer_duration * layers_per_epoch

    # start with 20 miners
    start_count = 20
    new_client_in_namespace(ns, setup_bootstrap, cspec, start_count)
    sleep_and_print(epoch_duration)  # wait epoch duration

    # add 10 each epoch
    num_to_add = 10
    num_epochs_to_add_clients = 4
    clients = []
    for batch in range(num_epochs_to_add_clients):
        clients.append(new_client_in_namespace(ns, setup_bootstrap, cspec, num_to_add))
        print("Added client batch ", batch, clients[batch].pods[batch]['name'])
        sleep_and_print(epoch_duration)

    print("Done adding clients. Going to wait for two epochs")
    # wait two more epochs
    wait_epochs = 3
    sleep_and_print(wait_epochs * epoch_duration)

    # total = bootstrap + first clients + added clients
    total = 1 + start_count + num_epochs_to_add_clients * num_to_add
    total_epochs = 1 + num_epochs_to_add_clients + wait_epochs  # add 1 for first epoch
    total_layers = layers_per_epoch * total_epochs
    first_layer_of_last_epoch = total_layers - layers_per_epoch
    f = int(client_args['hare-max-adversaries'])

    # validate
    print("Waiting 2 minutes for logs to propagate")
    sleep_and_print(120)

    print("Running validation")
    # validate hare
    expect_hare(current_index, ns, first_layer_of_last_epoch, total_layers - 1, total, f)
    atx_last_epoch = query_atx_published(current_index, ns, first_layer_of_last_epoch)
    queries.assert_equal_layer_hashes(current_index, ns)
    assert len(atx_last_epoch) == total  # validate num of atxs in last epoch
def test_late_bootstraps(init_session, setup_bootstrap, setup_clients):
    """Add single clients one at a time after the network is up and verify
    each late joiner still manages to bootstrap (discovery_bootstrap log).

    :param init_session: session fixture
    :param setup_bootstrap: bootstrap deployment fixture
    :param setup_clients: clients fixture (initial cluster)
    """
    # fix: resolve the current ES index explicitly, as the sibling tests do
    # (previously `current_index` was referenced without being bound here)
    current_index = get_curr_ind()
    TEST_NUM = 10
    testnames = []
    for i in range(TEST_NUM):
        # get_conf is called per iteration on purpose: add_multi_clients may
        # mutate the spec, so each client gets a fresh one
        client = add_multi_clients(testconfig, setup_bootstrap.deployment_id,
                                   get_conf(setup_bootstrap.pods[0],
                                            testconfig['client'],
                                            testconfig['genesis_delta']),
                                   1)
        testnames.append((client[0], datetime.now()))

    # Need to sleep for a while in order to enable the
    # propagation of the gossip message
    time.sleep(TEST_NUM * timeout_factor)
    fields = {'M': 'discovery_bootstrap'}
    for pod_name, _created_at in testnames:
        hits = poll_query_message(indx=current_index,
                                  namespace=testconfig['namespace'],
                                  client_po_name=pod_name,
                                  fields=fields,
                                  findFails=False,
                                  expected=1)
        assert len(hits) == 1, "Could not find new Client bootstrap message. client: {0}".format(pod_name)
def test_sync_stress(init_session, setup_bootstrap, save_log_on_exit):
    """Sync-stress test: start a cluster that downloads pre-recorded chain
    data, then add one fresh node (no remote data) and assert it fully syncs
    within the allotted time budget.
    """
    # currently the only data we have is for 2.5 days, ~700+ layers
    max_time_in_mins = 20
    max_time_for_sync_mins = max_time_in_mins
    clients_num = testconfig["client"]["replicas"]
    bs_info = setup_bootstrap.pods[0]
    cspec = get_conf(bs_info, testconfig['client'], testconfig['genesis_delta'])
    _ = add_multi_clients(testconfig, init_session, cspec, clients_num)

    hits = []
    number_of_pods = clients_num + 1  # add 1 for bootstrap pod
    tts = 70
    # poll until every pod (clients + bootstrap) reports it finished
    # downloading the pre-recorded data
    while len(hits) != number_of_pods:
        print(
            f"waiting for all clients to finish downloading all files, sleeping for {tts} seconds"
        )
        time.sleep(tts)
        hits = q.get_all_msg_containing(init_session, init_session, "Done downloading")

    # the new node must sync from peers: strip the remote data source
    # (mutates cspec in place — intentional)
    del cspec.args['remote-data']
    cspec.args['data-folder'] = ""

    # Adding a single new client
    res_lst = add_multi_clients(testconfig, init_session, cspec, 1, 'client')
    new_client = res_lst[0]

    # wait for the new node to start syncing
    while True:
        start_sync_hits = q.get_all_msg_containing(init_session, new_client, START_SYNC,
                                                   is_print=False)
        if start_sync_hits:
            print(f"new client started syncing\n")
            break
        tts = 60
        print(f"new client did not start syncing yet sleeping for {tts} secs")
        time.sleep(tts)

    curr_try = 0
    # longest run witnessed ~18:00 minutes (12:00 minutes is the shortest), 2.5 days data, 700+ layers
    max_retries = max_time_in_mins
    interval_time = 60
    print("waiting for new client to be synced")
    # poll once a minute until SYNC_DONE shows up; the assert bounds the wait
    while True:
        hits = q.get_all_msg_containing(init_session, new_client, SYNC_DONE, is_print=False)
        if hits:
            print(
                f"synced after {curr_try}/{max_retries} tries of {interval_time} seconds each\n"
            )
            break
        print(
            f"not synced after {curr_try}/{max_retries} tries of {interval_time} secs each",
            end="\r")
        time.sleep(interval_time)
        curr_try += 1
        assert curr_try <= max_retries, f"node failed syncing after waiting for {max_retries} minutes"

    # There are several messages containing "start synchronize" according to Almog,
    # this is due to a bug in the sync test binary.
    # We would like the timestamp of the latest one.
    start_sync_hits = q.get_all_msg_containing(init_session, new_client, START_SYNC,
                                               is_print=False)
    last_sync_msg = start_sync_hits[-1]
    # parsing sync start time
    st = convert_ts_to_datetime(last_sync_msg["T"])
    et = convert_ts_to_datetime(hits[0]["T"])
    ass_err = f"it took too long for syncing: {str(et - st)}, max {max_retries} minutes"
    # NOTE(review): .seconds drops whole days from the timedelta — fine only
    # because sync is asserted to finish within max_retries minutes above
    passed_minutes = (et - st).seconds / 60
    assert passed_minutes < max_time_for_sync_mins, ass_err
    # total time since starting sync until finishing
    print(f"new client is synced after {str(et - st)}")
    assert 1
def test_sync_gradually_add_nodes(init_session, setup_bootstrap, save_log_on_exit):
    """Gradually add peer-syncing nodes to a running cluster, restart one of
    them, and wait until every added node reports it finished syncing; then
    check persistence-on-restart and layer-hash agreement.
    """
    current_index = get_curr_ind()
    bootstrap_info = setup_bootstrap.pods[0]

    genesis_delta = testconfig['genesis_delta']
    cspec = get_conf(bootstrap_info, testconfig['client'], genesis_delta)
    cspec2 = get_conf(bootstrap_info, testconfig['clientv2'], genesis_delta)

    _initial_pods = add_multi_clients(testconfig, init_session, cspec, 10)

    # later nodes must sync from peers rather than pre-recorded data
    del cspec.args['remote-data']
    del cspec.args['data-folder']

    num_clients = 4
    clients = [None] * num_clients
    clients[0] = add_multi_clients(testconfig, init_session, cspec2, 1, 'clientv2')[0]
    time.sleep(10)
    clients[1] = add_multi_clients(testconfig, init_session, cspec, 1, 'client')[0]
    time.sleep(20)
    clients[2] = add_multi_clients(testconfig, init_session, cspec, 1, 'client')[0]
    time.sleep(20)
    clients[3] = add_multi_clients(testconfig, init_session, cspec, 1, 'client')[0]

    print("take pod down ", clients[0])
    delete_pod(testconfig['namespace'], clients[0])
    print("sleep for 20 sec")
    time.sleep(20)

    print("waiting for pods to be done with sync")
    start = time.time()
    sleep = 30  # seconds
    num_iter = 25  # total of 5 minutes
    for _attempt in range(num_iter):
        done = 0
        for pod_name in clients:
            if not check_pod_logs(pod_name, SYNC_DONE):
                # not all done: stop checking and go back to sleep
                print("pod " + pod_name + " still not done. Going to sleep")
                break
            print("pod " + pod_name + " done")
            done += 1
        if done == num_clients:
            print("all pods done")
            break
        print("not done yet sleep for " + str(sleep) + " seconds")
        time.sleep(sleep)

    assert done == num_clients
    end = time.time()

    # the restarted pod should report it recovered persisted data
    check_pod_logs(clients[0], PERSISTENT_DATA)
    queries.assert_equal_layer_hashes(current_index, testconfig['namespace'])

    print("it took " + str(end - start) + " to sync all nodes with " +
          cspec.args['expected-layers'] + "layers")
    print("done!!")
def test_add_node_validate_atx(init_session, setup_network):
    """Add a miner mid-run and validate, epoch by epoch, that block
    distribution and ATX publication account for the new node once its ATX
    becomes known to the network.
    """
    curr_epoch = 0
    epochs_to_sleep = 2
    layer_duration = int(testconfig['client']['args']['layer-duration-sec'])
    layers_per_epoch = int(testconfig['client']['args']['layers-per-epoch'])
    layer_avg_size = int(testconfig['client']['args']['layer-average-size'])
    num_miners = int(testconfig['client']['replicas']) + 1  # add 1 for bs node
    print(
        f"\nlayer duration={layer_duration}, layers per epoch={layers_per_epoch}, layer avg size={layer_avg_size}"
    )

    # wait for 2 epochs
    last_layer = epochs_to_sleep * layers_per_epoch
    print(f"wait until second epoch to layer {last_layer}")
    _ = q.wait_for_latest_layer(init_session, last_layer, layers_per_epoch, num_miners)

    # ========================== epoch i+2 ==========================
    curr_epoch += epochs_to_sleep
    print("\n\n-------- current epoch", curr_epoch, "--------")
    print("adding a new miner")
    bs_info = setup_network.bootstrap.pods[0]
    cspec = get_conf(bs_info, testconfig['client'], testconfig['genesis_delta'])
    new_pod_name = add_multi_clients(testconfig, init_session, cspec, 1)[0]

    # wait for next epoch
    last_layer = layers_per_epoch * (curr_epoch + 1)
    print(f"wait until next epoch to layer {last_layer}")
    _ = q.wait_for_latest_layer(init_session, last_layer, layers_per_epoch, num_miners + 1)

    # ========================== epoch i+3 ==========================
    curr_epoch += 1
    print("\n\n-------- current epoch", curr_epoch, "--------")
    block_map, _ = q.get_blocks_per_node_and_layer(init_session)
    print(
        f"-------- validating blocks per nodes up to layer {last_layer} --------"
    )
    # we're querying for block creation without epoch constrain, this will result
    # with epochs where new or deleted nodes will return 0 blocks in certain epochs
    # we should ignore those
    new_pod_id = get_pod_id(init_session, new_pod_name)
    ignore_lst = [new_pod_id]
    validate_blocks_per_nodes(block_map, 0, last_layer, layers_per_epoch, layer_avg_size,
                              num_miners, ignore_lst=ignore_lst)

    # wait an epoch
    prev_layer = last_layer
    last_layer = layers_per_epoch * (curr_epoch + 1)
    print(f"wait until next epoch to layer {last_layer}")
    _ = q.wait_for_latest_layer(init_session, last_layer, layers_per_epoch, num_miners + 1)

    # ========================== epoch i+4 ==========================
    curr_epoch += 1
    print("\n\n-------- current epoch", curr_epoch, "--------")
    block_map, _ = q.get_blocks_per_node_and_layer(init_session)
    # assert that each node has created layer_avg/number_of_nodes
    print(
        f"-------- validating blocks per nodes up to layer {last_layer} --------"
    )
    # new pod is still ignored: its ATX is not yet effective for block eligibility
    validate_blocks_per_nodes(block_map, prev_layer, last_layer, layers_per_epoch,
                              layer_avg_size, num_miners, ignore_lst=ignore_lst)

    print("-------- validating all nodes ATX creation in last epoch --------")
    atx_hits = q.query_atx_per_epoch(init_session, curr_epoch - 1)
    assert len(atx_hits) == num_miners + 1  # add 1 for new miner
    print("-------- validation succeed --------")

    last_layer = layers_per_epoch * (curr_epoch + 2)
    print(f"wait 2 epochs for layer {last_layer}")
    _ = q.wait_for_latest_layer(init_session, last_layer, layers_per_epoch, num_miners + 1)

    # ========================== epoch i+6 ==========================
    curr_epoch += 2
    print("\n\n-------- current epoch", curr_epoch, "--------")
    # previous epoch all nodes are supposed to know our new node ATX
    num_miners += 1
    # assert each node has created layer_avg/number_of_nodes
    print(
        f"-------- validating blocks per nodes up to layer {last_layer} --------"
    )
    block_map, _ = q.get_blocks_per_node_and_layer(init_session)
    prev_layer = last_layer - layers_per_epoch
    validate_blocks_per_nodes(block_map, prev_layer, last_layer, layers_per_epoch,
                              layer_avg_size, num_miners)
def test_unsync_while_genesis(init_session, setup_bootstrap, start_poet, add_curl):
    """Start a cluster, let it publish blocks, then add one late node and
    verify it starts, syncs, and (ideally) does not create blocks before
    finishing sync.
    """
    time_to_create_block_since_startup = 10
    time_before_first_block = testconfig["genesis_delta"] + time_to_create_block_since_startup
    layers_to_wait = 4
    layer_duration = int(testconfig['client']['args']['layer-duration-sec'])

    bs_info = setup_bootstrap.pods[0]
    cspec = get_conf(bs_info, testconfig['client'], testconfig['genesis_delta'],
                     setup_oracle=None,
                     setup_poet=setup_bootstrap.pods[0]['pod_ip'])

    # Create a cluster of nodes
    _ = new_client_in_namespace(testconfig['namespace'], setup_bootstrap, cspec, 9)

    # Sleep to enable block creation
    print(f"sleeping for {time_before_first_block} seconds in order to enable blocks to be published\n")
    time.sleep(time_before_first_block)

    # Validate a block was published
    nodes_published_block, _ = q.get_blocks_per_node_and_layer(init_session)
    assert nodes_published_block, f"no blocks were published during the first {time_before_first_block} seconds"

    # Create a new node in cluster
    unsynced_cl = new_client_in_namespace(testconfig['namespace'], setup_bootstrap, cspec, 1)

    # Sleep until layers_to_wait layer, default is 4
    print(f"sleeping for {layer_duration * layers_to_wait} seconds\n")
    time.sleep(layer_duration * layers_to_wait)

    # Found by Almogs: "validate votes failed" error has occurred following a known bug
    print("validating no 'validate votes failed' messages has arrived")
    hits_val_failed = q.get_all_msg_containing(init_session, init_session,
                                               "validate votes failed")
    assert hits_val_failed == [], 'got a "validate votes" failed message'
    print("validation succeeded")

    # Get the msg when app started on the late node
    app_started_hits = q.get_app_started_msgs(init_session, unsynced_cl.pods[0]["name"])
    assert app_started_hits, f"app did not start for new node after {layers_to_wait} layers"

    # Check if the new node has finished syncing
    hits_synced = q.get_done_syncing_msgs(init_session, unsynced_cl.pods[0]["name"])
    assert hits_synced, f"New node did not sync, waited for {layers_to_wait} layers"
    print(f"{hits_synced[0].kubernetes.pod_name} has performed sync")

    # validate no new blocks were received before being synced
    sync_ts = hits_synced[0].T
    app_started_ts = app_started_hits[0].T
    hits_msg_block = q.get_block_creation_msgs(init_session, unsynced_cl.pods[0]["name"],
                                               from_ts=app_started_ts, to_ts=sync_ts)
    # deliberately a warning, not an assert: pre-sync block creation is a
    # known soft failure here
    if hits_msg_block:
        print("\n\n############ WARNING: node created blocks before syncing!!!! ############\n\n")

    hits_errors = q.find_error_log_msgs(init_session, init_session)
    if hits_errors:
        print_hits_entry_count(hits_errors, "message")
        # assert 0, "found log errors"

    print("successfully finished")
    assert 1