Ejemplo n.º 1
0
def storage_server(docker_client, container_network):
    """Yield a started ``storage_server`` container, then remove it.

    The container is created through ``get_container`` from the
    ``centos:centos7`` image with the storage server bootstrap script.
    Before yielding, the function polls TCP port 22 on the container's IP
    once per second for up to 30 minutes. If the port never opens, the
    container is yielded anyway.

    NOTE(review): this generator is presumably registered as a pytest
    fixture; the decorator is not visible in this snippet.

    :param docker_client: Docker client; its ``api.remove_container`` is
        used for teardown.
    :param container_network: Network argument forwarded to
        ``get_container``.
    :return: Generator yielding the container mapping (it is indexed by
        ``"ip"`` and ``"Id"``).
    """
    bootstrap_script = "/twindb-backup/support/bootstrap/storage_server.sh"
    container = get_container(
        "storage_server",
        docker_client,
        container_network,
        bootstrap_script=bootstrap_script,
        image="centos:centos7",
        last_n=3,
    )

    timeout = time.time() + 30 * 60

    while time.time() < timeout:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            if sock.connect_ex((container["ip"], 22)) == 0:
                break
        finally:
            # Close every probe socket; otherwise one descriptor leaks per
            # polling iteration for as long as the port stays closed.
            sock.close()
        time.sleep(1)

    yield container

    # Teardown: runs when the generator is resumed after the yield.
    if container:
        LOG.info("Removing container %s", container["Id"])
        docker_client.api.remove_container(container=container["Id"],
                                           force=True)
Ejemplo n.º 2
0
def storage_server(docker_client, container_network):
    """Yield a started ``storage_server`` container, then remove it.

    The container is created through ``get_container`` from the
    ``centos:centos7`` image with the storage server bootstrap script.
    Before yielding, the function polls TCP port 22 on the container's IP
    once per second for up to 30 minutes. If the port never opens, the
    container is yielded anyway.

    NOTE(review): this generator is presumably registered as a pytest
    fixture; the decorator is not visible in this snippet.

    :param docker_client: Docker client; its ``api.remove_container`` is
        used for teardown.
    :param container_network: Network argument forwarded to
        ``get_container``.
    :return: Generator yielding the container mapping (it is indexed by
        ``'ip'`` and ``'Id'``).
    """
    bootstrap_script = '/twindb-backup/support/bootstrap/storage_server.sh'
    container = get_container(
        'storage_server',
        docker_client,
        container_network,
        bootstrap_script=bootstrap_script,
        image="centos:centos7",
        last_n=2
    )

    timeout = time.time() + 30 * 60

    while time.time() < timeout:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            if sock.connect_ex((container['ip'], 22)) == 0:
                break
        finally:
            # Close every probe socket; otherwise one descriptor leaks per
            # polling iteration for as long as the port stays closed.
            sock.close()
        time.sleep(1)

    yield container

    # Teardown: runs when the generator is resumed after the yield.
    if container:
        LOG.info('Removing container %s', container['Id'])
        docker_client.api.remove_container(container=container['Id'],
                                           force=True)
Ejemplo n.º 3
0
def test__update_with_host_maintenance_and_agent_down(stateless_job,
                                                      maintenance):
    """
    1. Create a large stateless job (that take up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of the
       job. MaximumUnavailableInstances=2 ensures that not more than 2 pods
       are unavailable due to host maintenance at a time.
    3. Take down another host which has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on the
       host A should not be killed once LOST event is received. Verify that
       host A does not transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    # Order hosts by how many of this job's tasks they run, busiest first.
    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # pick another host which has pods of the job to take down
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        stateless_job.job_spec.instance_spec[10].containers.extend([
            pod_pb2.ContainerSpec(resource=pod_pb2.ResourceSpec(
                disk_limit_mb=20))
        ])
        update = StatelessUpdate(stateless_job,
                                 updated_job_spec=stateless_job.job_spec,
                                 batch_size=0)
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        # NOTE(review): wait_for_host_state presumably raises when the host
        # never reaches DOWN, which is the expected outcome - confirm.
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except Exception:
        # Any failure above, including the AssertionError raised when the
        # host did reach DOWN, lands here; the host must still be DRAINING.
        # KeyboardInterrupt/SystemExit are deliberately not intercepted.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    finally:
        # Always bring the stopped host back, whatever the test outcome.
        host_container.start()
Ejemplo n.º 4
0
def master2(docker_client, container_network):
    """Yield a started ``master2`` container, then remove it.

    The container is created through ``get_container`` with the master2
    bootstrap script. Before yielding, the function polls TCP port 22 on
    the container's IP once per second for up to 30 minutes. If the port
    never opens, the container is yielded anyway.

    NOTE(review): this generator is presumably registered as a pytest
    fixture; the decorator is not visible in this snippet.

    :param docker_client: Docker client; its ``api.remove_container`` is
        used for teardown.
    :param container_network: Network argument forwarded to
        ``get_container``.
    :return: Generator yielding the container mapping (it is indexed by
        ``'ip'`` and ``'Id'``).
    """
    bootstrap_script = '/twindb-backup/support/bootstrap/master2.sh'
    container = get_container('master2', bootstrap_script, docker_client,
                              container_network, 2)
    timeout = time.time() + 30 * 60
    while time.time() < timeout:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            if sock.connect_ex((container['ip'], 22)) == 0:
                break
        finally:
            # Close every probe socket; otherwise one descriptor leaks per
            # polling iteration for as long as the port stays closed.
            sock.close()
        time.sleep(1)

    yield container
    # Teardown: runs when the generator is resumed after the yield.
    if container:
        LOG.info('Removing container %s', container['Id'])
        docker_client.api.remove_container(container=container['Id'],
                                           force=True)
Ejemplo n.º 5
0
def master1(docker_client, container_network):
    """Yield a started ``master1`` container with grants loaded, then remove it.

    The container is created through ``get_container`` with the master1
    bootstrap script. The function polls TCP port 3306 on the container's
    IP once per second for up to 30 minutes, then runs the ``mysql`` client
    inside the container to load ``mysql_grants.sql`` into the ``mysql``
    database. If the port never opens, the grants command is still
    attempted and the container is yielded anyway.

    NOTE(review): this generator is presumably registered as a pytest
    fixture; the decorator is not visible in this snippet.

    :param docker_client: Docker client; ``containers.get`` is used to run
        the grants command and ``api.remove_container`` for teardown.
    :param container_network: Network argument forwarded to
        ``get_container``.
    :return: Generator yielding the container mapping (it is indexed by
        ``'ip'`` and ``'Id'``).
    """
    bootstrap_script = '/twindb-backup/support/bootstrap/master1.sh'
    container = get_container('master1', bootstrap_script, docker_client,
                              container_network, 1)

    timeout = time.time() + 30 * 60

    while time.time() < timeout:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            if sock.connect_ex((container['ip'], 3306)) == 0:
                break
        finally:
            # Close every probe socket; otherwise one descriptor leaks per
            # polling iteration for as long as the port stays closed.
            sock.close()
        time.sleep(1)

    raw_container = docker_client.containers.get(container['Id'])
    privileges_file = "/twindb-backup/vagrant/environment/puppet/" \
                      "modules/profile/files/mysql_grants.sql"
    # The shell is needed for the input redirection into the mysql client.
    raw_container.exec_run('bash -c "mysql mysql < %s"' % privileges_file)

    yield container
    # Teardown: runs when the generator is resumed after the yield.
    if container:
        LOG.info('Removing container %s', container['Id'])
        docker_client.api.remove_container(container=container['Id'],
                                           force=True)