def _test_turnoff_translog_retention_after_upgraded(self, path):
    """Check that the translog is drained once the cluster is fully upgraded.

    With soft-deletes enabled the upgraded version should not need to
    retain translog operations, so after upgrading every node the table's
    translog must be empty.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        replicas = random.randint(0, 2)
        cursor.execute(
            ''' create table doc.test(x int) clustered into 1 shards with( number_of_replicas =?, "soft_deletes.enabled" = true) ''',
            (replicas, ))
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # base data set, flushed to disk via refresh
        insert_data(conn, 'doc', 'test', random.randint(100, 200))
        cursor.execute('refresh table doc.test')
        # optionally leave some un-refreshed operations in the translog
        extra_docs = random.randint(0, 100)
        if extra_docs > 0:
            insert_data(conn, 'doc', 'test', extra_docs)
        # update the cluster to the new version
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        cursor.execute('refresh table doc.test')
        self._assert_translog_is_empty(conn, 'doc', 'test')
def _test_recovery(self, path):
    """Verify shard recovery through a rolling upgrade.

    Creates a replicated table on the old version, optionally writes and
    refreshes some documents, then asserts green health on the mixed
    cluster and again after the full upgrade.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        cursor.execute(''' create table doc.test(x int) clustered into 1 shards with( number_of_replicas = 1, "unassigned.node_left.delayed_timeout" = '100ms', "allocation.max_retries" = '0') ''')
        doc_count = random.randint(0, 10)
        if doc_count > 0:
            insert_data(conn, 'doc', 'test', doc_count)
        # randomly refresh so recovery sometimes runs with pending translog ops
        if random.choice([True, False]):
            cursor.execute("refresh table doc.test")
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # upgrade fully to the new version
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        if random.choice([True, False]):
            cursor.execute("refresh table doc.test")
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
def _test_closed_index_during_rolling_upgrade(self, path):
    """Create and close a table at every stage of the rolling upgrade.

    After each stage all previously closed tables must still be closed
    and replicated.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        # stage 1: homogeneous old-version cluster
        cursor.execute(''' create table doc.old_cluster(x int) clustered into 1 shards with( number_of_replicas = 0) ''')
        self._assert_is_green(conn, 'doc', 'old_cluster')
        cursor.execute('alter table doc.old_cluster close')
        self._assert_is_closed(conn, 'doc', 'old_cluster')
        # stage 2: mixed-version cluster
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        self._assert_is_closed(conn, 'doc', 'old_cluster')
        cursor.execute(''' create table doc.mixed_cluster(x int) clustered into 1 shards with( number_of_replicas = 0) ''')
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'mixed_cluster'))
        cursor.execute('alter table doc.mixed_cluster close')
        self._assert_is_closed(conn, 'doc', 'mixed_cluster')
        # stage 3: fully upgraded cluster
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        self._assert_is_closed(conn, 'doc', 'old_cluster')
        self._assert_is_closed(conn, 'doc', 'mixed_cluster')
        cursor.execute(''' create table doc.upgraded_cluster(x int) clustered into 1 shards with( number_of_replicas = 0) ''')
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'upgraded_cluster'))
        cursor.execute('alter table doc.upgraded_cluster close')
        self._assert_is_closed(conn, 'doc', 'upgraded_cluster')
def _test_retention_leases_established_when_promoting_primary(self, path):
    """Check peer-recovery retention leases during a rolling upgrade.

    With soft-deletes disabled, retention leases must be established,
    renewed and kept in sync both on the mixed-version cluster and after
    the full upgrade.
    """
    node_count = 3
    cluster = self._new_cluster(path.from_version, node_count)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        shard_count = random.randint(1, 5)
        replica_count = random.randint(0, 1)
        cursor.execute(
            '''create table doc.test(x int) clustered into ? shards with( "number_of_replicas" = ?, "soft_deletes.enabled" = false, "allocation.max_retries" = 0, "unassigned.node_left.delayed_timeout" = '100ms' )''',
            (shard_count, replica_count, ))
        doc_count = random.randint(0, 10)
        if doc_count > 0:
            insert_data(conn, 'doc', 'test', doc_count)
        if random.choice([True, False]):
            cursor.execute('refresh table doc.test')
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # mixed-version stage
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        assert_busy(lambda: self._assert_ensure_peer_recovery_retention_leases_renewed_and_synced(conn, 'doc', 'test'))
        # upgrade fully to the new version
        self._upgrade_cluster(cluster, path.to_version, node_count)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        assert_busy(lambda: self._assert_ensure_peer_recovery_retention_leases_renewed_and_synced(conn, 'doc', 'test'))
def _test_recovery_with_concurrent_indexing(self, path):
    """Verify replicas recover while writes keep flowing during the upgrade.

    Allocation is restricted to primaries around each upgrade step; data is
    inserted on the mixed and on the fully upgraded cluster, and the
    expected document count (60, then 105) is asserted on every node.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        cursor.execute(''' create table doc.test(x int) clustered into 1 shards with( number_of_replicas = 2, "unassigned.node_left.delayed_timeout" = '100ms', "allocation.max_retries" = '0') ''')
        # seed the initial homogeneous cluster
        insert_data(conn, 'doc', 'test', 10)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # keep only primaries allocated so we can index while replicas recover
        cursor.execute('''alter table doc.test set ("routing.allocation.enable"='primaries')''')
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        cursor.execute('''alter table doc.test set ("routing.allocation.enable"='all')''')
        # write into the mixed cluster
        insert_data(conn, 'doc', 'test', 50)
        cursor.execute('refresh table doc.test')
        cursor.execute('select count(*) from doc.test')
        self.assertEqual(cursor.fetchone()[0], 60)
        # verify the count on each node individually
        cursor.execute('select id from sys.nodes')
        node_rows = cursor.fetchall()
        self.assertEqual(len(node_rows), self.NUMBER_OF_NODES)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        for row in node_rows:
            assert_busy(lambda: self._assert_num_docs_by_node_id(conn, 'doc', 'test', row[0], 60))
        cursor.execute('''alter table doc.test set ("routing.allocation.enable"='primaries')''')
        # upgrade the full cluster
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        cursor.execute('''alter table doc.test set ("routing.allocation.enable"='all')''')
        insert_data(conn, 'doc', 'test', 45)
        cursor.execute('refresh table doc.test')
        cursor.execute('select count(*) from doc.test')
        self.assertEqual(cursor.fetchone()[0], 105)
        cursor.execute('select id from sys.nodes')
        node_rows = cursor.fetchall()
        self.assertEqual(len(node_rows), self.NUMBER_OF_NODES)
        for row in node_rows:
            assert_busy(lambda: self._assert_num_docs_by_node_id(conn, 'doc', 'test', row[0], 105))
def _test_recovery_closed_index(self, path):
    """Verify a closed table survives the rolling upgrade.

    Closes the table while every node still runs the old version, then
    asserts it stays closed on the mixed cluster and after the full
    upgrade.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        cursor.execute(''' create table doc.test(x int) clustered into 1 shards with( number_of_replicas = 1, "unassigned.node_left.delayed_timeout" = '100ms', "allocation.max_retries" = '0') ''')
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        cursor.execute('alter table doc.test close')
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        self._assert_is_closed(conn, 'doc', 'test')
        # upgrade fully to the new version
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        self._assert_is_closed(conn, 'doc', 'test')
def _test_update_docs(self, path):
    """Run primary-key upserts at every stage of the rolling upgrade.

    Inserts 100 rows, then upserts all of them on the mixed-version
    cluster and again on the fully upgraded cluster, asserting each
    upsert statement reports a rowcount of 1.

    Bug fix: the payloads were built with ``str(random.randint)``, which
    stringifies the bound method object itself (the same address-bearing
    string for every row) instead of producing a random number — the call
    was missing its arguments.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        c = conn.cursor()
        c.execute(''' create table doc.test(id int primary key, data text) clustered into 1 shards with( "unassigned.node_left.delayed_timeout" = '100ms', "number_of_replicas" = 2) ''')
        # actually call random.randint so every row gets its own random payload
        inserts = [(i, str(random.randint(0, 100))) for i in range(0, 100)]
        c.executemany('''insert into doc.test(id, data) values (?, ?)''', inserts)
        # ensure all shards are active before upgrading a node. otherwise the cluster tries to allocate new
        # replicas if the upgraded node contained the primary, which will fail due to node version allocation rules.
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        if random.choice([True, False]):
            assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # update the data in a mixed cluster
        updates = [(i, str(random.randint(0, 100))) for i in range(0, 100)]
        res = c.executemany(
            'insert into doc.test(id, data) values(?, ?) on conflict(id) do update set data = excluded.data',
            updates)
        self.assertEqual(len(res), 100)
        for result in res:
            self.assertEqual(result['rowcount'], 1)
        if random.choice([True, False]):
            assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # upgrade fully to the new version
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        updates = [(i, str(random.randint(0, 100))) for i in range(0, 100)]
        res = c.executemany(
            'insert into doc.test(id, data) values(?, ?) on conflict(id) do update set data = excluded.data',
            updates)
        self.assertEqual(len(res), 100)
        for result in res:
            self.assertEqual(result['rowcount'], 1)
def _test_operation_based_recovery(self, path):
    """Exercise recovery with synced checkpoints across the upgrade.

    Writes a large base set plus small batches at every upgrade stage,
    asserting each time that the table is green and that the shard
    checkpoints are in sync.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        cursor.execute(''' create table doc.test(x int) clustered into 1 shards with( "number_of_replicas" = 2, "soft_deletes.enabled" = true) ''')
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        insert_data(conn, 'doc', 'test', random.randint(100, 200))
        cursor.execute('refresh table doc.test')
        self._assert_ensure_checkpoints_are_synced(conn, 'doc', 'test')
        batch = random.randint(0, 3)
        if batch > 0:
            insert_data(conn, 'doc', 'test', batch)
        # mixed-version stage
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        batch = random.randint(0, 3)
        if batch > 0:
            insert_data(conn, 'doc', 'test', batch)
        self._assert_ensure_checkpoints_are_synced(conn, 'doc', 'test')
        # upgrade fully to the new version
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        batch = random.randint(0, 3)
        if batch > 0:
            insert_data(conn, 'doc', 'test', batch)
        self._assert_ensure_checkpoints_are_synced(conn, 'doc', 'test')
def test_snapshot_restore_and_drop_in_parallel(self):
    """Test to run the drop and restore operation on two different snapshots in parallel.

    The purpose of this test is to validate that the snapshot mechanism of
    CrateDB can handle the two operations in parallel. Here, Minio is used
    as s3 backend for the repository, but this should work on any other
    backend as well.
    """
    with MinioServer() as minio:
        # run Minio (the s3 backend) in a background daemon thread and wait
        # until it accepts connections
        t = threading.Thread(target=minio.run)
        t.daemon = True
        t.start()
        wait_until(lambda: _is_up('127.0.0.1', 9000))

        # randomized cluster/table dimensions
        num_nodes = random.randint(3, 5)
        number_of_shards = random.randint(1, 3)
        number_of_replicas = random.randint(0, 2)
        num_docs = random.randint(1, 100)

        cluster_settings = {
            'cluster.name': gen_id(),
            'path.data': self.PATH_DATA
        }
        # start from a clean data directory
        shutil.rmtree(self.PATH_DATA, ignore_errors=True)
        cluster = self._new_cluster('latest-nightly', num_nodes, settings=cluster_settings)
        cluster.start()
        with connect(cluster.node().http_url, error_trace=True) as conn:
            c = conn.cursor()
            wait_for_active_shards(c)
            c.execute(
                ''' create table doc.test(x int) clustered into ? shards with( number_of_replicas =?) ''',
                (number_of_shards, number_of_replicas, ))
            insert_data(conn, 'doc', 'test', num_docs)

            # register the s3 repository and take two snapshots of the table
            c.execute(''' CREATE REPOSITORY repo TYPE S3 WITH (access_key = 'minio', secret_key = 'miniostorage', bucket='backups', endpoint = '127.0.0.1:9000', protocol = 'http') ''')
            c.execute('CREATE SNAPSHOT repo.snapshot1 TABLE doc.test WITH (wait_for_completion = true)')
            c.execute('CREATE SNAPSHOT repo.snapshot2 TABLE doc.test WITH (wait_for_completion = true)')
            c.execute('DROP TABLE doc.test')
            # Drop snapshot2 while the restore of snapshot1 is still running
            # (wait_for_completion = false returns before the restore finishes)
            c.execute('RESTORE SNAPSHOT repo.snapshot1 ALL WITH (wait_for_completion = false)')
            try:
                c.execute('DROP SNAPSHOT repo.snapshot2')
            except ProgrammingError:
                self.fail("Restore and Drop Snapshot operation should work in parallel")
            # the restore must still complete with all documents intact
            assert_busy(lambda: self._assert_num_docs(conn, num_docs))
        cluster.stop()
def _test_auto_expand_indices_during_rolling_upgrade(self, path):
    """Check auto-expanding replicas ("0-N") across a rolling upgrade.

    While any old-version node remains, replicas expand to all nodes even
    when one node is excluded via allocation filtering (health turns
    yellow on the mixed cluster). Once every node runs the new version the
    excluded node is respected and the replica count shrinks by one.
    """
    node_count = 3
    cluster = self._new_cluster(path.from_version, node_count)
    cluster.start()
    # all nodes without the primary
    max_replicas = node_count - 1
    replicas_without_excluded_node = max_replicas - 1
    with connect(cluster.node().http_url, error_trace=True) as conn:
        cursor = conn.cursor()
        cursor.execute('''select id from sys.nodes''')
        node_ids = cursor.fetchall()
        self.assertEqual(len(node_ids), node_count)
        cursor.execute(
            '''create table doc.test(x int) clustered into 1 shards with( "number_of_replicas" = ?)''',
            (f"0-{max_replicas}", ))
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # exclude one node from allocation; this has no effect yet because
        # all nodes are still on the old version
        cursor.execute(
            'alter table doc.test set ("routing.allocation.exclude._id" = ?)',
            (random.choice(node_ids)[0], ))
        # replicas still expand to all nodes, including the excluded one
        assert_busy(lambda: self._assert_number_of_replicas(conn, 'doc', 'test', max_replicas))
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        # yellow: replicas are expanded but the one on the excluded node
        # cannot be allocated due to allocation filtering
        assert_busy(lambda: self._assert_is_yellow(conn, 'doc', 'test'))
        assert_busy(lambda: self._assert_number_of_replicas(conn, 'doc', 'test', max_replicas))
        # upgrade fully to the new version
        self._upgrade_cluster(cluster, path.to_version, node_count)
        # with every node on the new version the expansion honours the
        # allocation filter, so the table becomes green with one replica less
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        assert_busy(lambda: self._assert_number_of_replicas(conn, 'doc', 'test', replicas_without_excluded_node))
def _test_relocation_with_concurrent_indexing(self, path):
    """Relocate the primary from an old-version node to an upgraded node
    while documents are being written, then verify the doc counts.
    """
    cluster = self._new_cluster(path.from_version, self.NUMBER_OF_NODES)
    cluster.start()
    with connect(cluster.node().http_url, error_trace=True) as conn:
        c = conn.cursor()
        c.execute(''' create table doc.test(x int) clustered into 1 shards with( "number_of_replicas" = 2, "unassigned.node_left.delayed_timeout" = '100ms', "allocation.max_retries" = '0') ''')
        insert_data(conn, 'doc', 'test', 10)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # make sure that no shards are allocated, so we can make sure the primary stays
        # on the old node (when one node stops, we lose the master too, so a replica
        # will not be promoted)
        c.execute('''alter table doc.test set("routing.allocation.enable"='none')''')
        self._upgrade_to_mixed_cluster(cluster, path.to_version)
        # pick one upgraded node and one old-version node by sorting on the version
        c.execute('''select id from sys.nodes order by version['number'] desc limit 1''')
        new_node_id = c.fetchone()[0]
        c.execute('''select id from sys.nodes order by version['number'] asc limit 1''')
        old_node_id = c.fetchone()[0]
        # remove the replica and guarantee the primary is placed on the old node
        c.execute(
            '''alter table doc.test set ( "number_of_replicas"=0, "routing.allocation.enable"='all', "routing.allocation.include._id"=? )''',
            (old_node_id, ))
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        # retarget allocation to the upgraded node and index 50 docs while
        # the shard relocates
        c.execute(
            '''alter table doc.test set ("routing.allocation.include._id"=?)''',
            (new_node_id, ))
        insert_data(conn, 'doc', 'test', 50)
        # ensure the relocation from old node to new node has occurred; otherwise the table is green
        # even though shards haven't moved to the new node yet (allocation was throttled).
        assert_busy(lambda: self._assert_shard_state(conn, 'doc', 'test', new_node_id, 'STARTED'))
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        c.execute('refresh table doc.test')
        # 10 initial + 50 concurrent docs must all be on the new node now
        self._assert_num_docs_by_node_id(conn, 'doc', 'test', new_node_id, 60)
        # upgrade fully to the new version
        self._upgrade_cluster(cluster, path.to_version, self.NUMBER_OF_NODES)
        # re-add replicas, lift the allocation filter, and index 45 more docs
        c.execute('''alter table doc.test set("number_of_replicas"=2)''')
        c.execute('''alter table doc.test reset("routing.allocation.include._id")''')
        insert_data(conn, 'doc', 'test', 45)
        assert_busy(lambda: self._assert_is_green(conn, 'doc', 'test'))
        c.execute('refresh table doc.test')
        c.execute('select id from sys.nodes')
        node_ids = c.fetchall()
        self.assertEqual(len(node_ids), self.NUMBER_OF_NODES)
        # with 2 replicas on 3 nodes, every node must hold all 105 documents
        for node_id in node_ids:
            self._assert_num_docs_by_node_id(conn, 'doc', 'test', node_id[0], 105)