Exemple #1
0
                                      seconds=45,
                                      restart_daemons=["Accumulo-All"],
                                      use_flush=True)
# XXX: keep this outage shorter than the ZK heartbeat interval.
fail_node_transient = faults.fail_network(
    bastion_host=bastion,
    seconds=10,
    restart_daemons=["Accumulo-All"],
    use_flush=True,
)

profile = [
    triggers.Periodic(
        # How often a failure is attempted. For master nodes, leave enough
        # time for recovery between failures (roughly 5-15 minutes).
        60,
        metafaults.maybe_fault(
            # How likely a failure is on each attempt. Lowering this makes
            # failures line up across nodes less often.
            0.33,
            metafaults.pick_fault(
                [
                    # Adjust the weights to model different kinds of
                    # flaky nodes.
                    (1, fail_node_long),
                    (1, fail_node_short),
                    (2, fail_node_transient),
                ]
            ),
        ),
    ),
]

########NEW FILE########
__FILENAME__ = hbase
#!/usr/bin/env python
#
# Licensed to Cloudera, Inc. under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  Cloudera, Inc. licenses this file
Exemple #2
0
# SIGKILL the DataNode daemon. The trailing 3 is presumably the delay before
# the daemon is restarted -- TODO confirm against faults.kill_daemons.
dn_kill_short = faults.kill_daemons(["DataNode"], signal.SIGKILL, 3)

# Pause faults for the region server and the DataNode. The numeric argument
# is presumably the pause duration in seconds -- TODO confirm against
# faults.pause_daemons.
rs_pause = faults.pause_daemons(["HRegionServer"], 62)
dn_pause = faults.pause_daemons(["DataNode"], 20)

# This fault isn't that useful yet, since it only drops inbound packets
# but outbound packets (e.g., the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(["HRegionServer"], 64)

profile = [
    triggers.Periodic(
        # Trigger period.
        45,
        # Each entry is a (weight, fault) pair.
        metafaults.pick_fault(
            [
                # kill -9s
                (5, rs_kill_long),
                (1, dn_kill_long),
                # fast kill -9s
                (5, rs_kill_short),
                (1, dn_kill_short),
                # pauses (simulate GC?)
                (10, rs_pause),
                (1, dn_pause),
                # drop packets (simulate network outage) -- currently disabled
                #(1, faults.drop_packets_to_daemons(["DataNode"], 20)),
                #(1, rs_drop_inbound_packets),
            ]
        ),
    ),
    #  triggers.WebServerTrigger(12321)
]
Exemple #3
0
# Long outage: the network fault is requested for 300 seconds.
fail_node_long = faults.fail_network(
    bastion_host=bastion,
    seconds=300,
    restart_daemons=["Accumulo-All"],
    use_flush=True,
)

# XXX: keep this outage longer than the ZK heartbeat interval.
fail_node_short = faults.fail_network(
    bastion_host=bastion,
    seconds=45,
    restart_daemons=["Accumulo-All"],
    use_flush=True,
)

# XXX: keep this outage shorter than the ZK heartbeat interval.
fail_node_transient = faults.fail_network(
    bastion_host=bastion,
    seconds=10,
    restart_daemons=["Accumulo-All"],
    use_flush=True,
)

profile = [
    triggers.Periodic(
        # How often a failure is attempted. For master nodes, leave enough
        # time for recovery between failures (roughly 5-15 minutes).
        60,
        metafaults.maybe_fault(
            # How likely a failure is on each attempt. Lowering this makes
            # failures line up across nodes less often.
            0.33,
            metafaults.pick_fault(
                [
                    # Adjust the weights to model different kinds of
                    # flaky nodes.
                    (1, fail_node_long),
                    (1, fail_node_short),
                    (2, fail_node_transient),
                ]
            ),
        ),
    ),
]
Exemple #4
0
def random_fault():
  """Build a fault picker over the enabled faults and invoke it at once.

  Each entry is a (weight, fault) pair handed to metafaults.pick_fault.
  Uncomment entries to enable the long-kill or pause variants. The result
  of the invocation is not returned.
  """
  enabled_faults = [
    (1, kill_short_zk),
    (1, kill_short_server),
    (1, kill_short_client),

    #(1, kill_long_zk),
    #(1, kill_long_server),
    #(1, kill_long_client),

    #(1, pause_zk),
    #(1, pause_server),
    #(1, pause_client),
  ]
  run_picked_fault = metafaults.pick_fault(enabled_faults)
  run_picked_fault()

# Periodic trigger that invokes random_fault with a period of 10
# (presumably seconds -- confirm against triggers.Periodic).
random_periodic = triggers.Periodic(10, random_fault)

# The periodic random-fault trigger is the only trigger in this profile.
profile = [random_periodic]

def signal_handler(sig, frame):
  """Signal handler: stop the periodic trigger, wait for it, then exit.

  Args:
    sig: signal number delivered to the handler (unused).
    frame: stack frame that was interrupted (unused).

  Exits the process with status 0 via sys.exit.
  """
  # NOTE(review): the results of these three calls are discarded. If
  # faults.kill_daemons returns a fault callable rather than acting
  # immediately, nothing is killed here -- confirm whether the returned
  # faults were meant to be invoked.
  faults.kill_daemons(["QuorumPeerMain"], signal.SIGKILL, 60)
  faults.kill_daemons(["LoadBalancerEchoServer"], signal.SIGKILL, 60)
  faults.kill_daemons(["LoadBalancerEchoClient"], signal.SIGKILL, 60)
  # Actually call stop/join: the bare attribute references used previously
  # were no-ops, so the trigger was never stopped or waited for.
  random_periodic.stop()
  random_periodic.join()
  sys.exit(0)

# Route SIGINT (e.g. Ctrl-C) to signal_handler.
signal.signal(signal.SIGINT, signal_handler)
Exemple #5
0
from gremlins import faults, metafaults, triggers, tc

# Faults built once here and referenced by the profile's triggers.
clear_network_faults = faults.clear_network_faults()
introduce_partition = faults.introduce_network_partition()
# NOTE(review): introduce_latency is not referenced by the profile visible
# in this file -- confirm whether it should be added to the weighted list.
introduce_latency = faults.introduce_network_latency()

# Period passed to triggers.Periodic (presumably seconds -- TODO confirm).
INTERVAL=30

profile = [
    # Start from a clean slate: clear any existing configurations.
    triggers.OneShot(clear_network_faults),
    # Every INTERVAL, either clear the faults or introduce a partition
    # (equal weights). Other faults are available, but let's start simply.
    triggers.Periodic(
        INTERVAL,
        metafaults.pick_fault(
            [
                (10, clear_network_faults),
                (10, introduce_partition),
            ]
        ),
    ),
]