seconds=45, restart_daemons=["Accumulo-All"], use_flush=True) # XXX make sure this is less than ZK heartbeats fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10, restart_daemons=["Accumulo-All"], use_flush=True) profile = [ triggers.Periodic( # How often do you want a failure? for master nodes, you should probably give enough time for recovery ~5-15 minutes 60, metafaults.maybe_fault( # How likely do you want a failure? decreasing this will make failures line up across nodes less often. 0.33, metafaults.pick_fault([ # You can change the weights here to see different kinds of flaky nodes (1, fail_node_long), (1, fail_node_short), (2, fail_node_transient), ]))), ] ########NEW FILE######## __FILENAME__ = hbase #!/usr/bin/env python # # Licensed to Cloudera, Inc. under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. Cloudera, Inc. licenses this file
# HBase chaos profile: periodically kill or pause HBase/HDFS daemons.
# NOTE(review): rs_kill_long, dn_kill_long and rs_kill_short are referenced
# in the profile below but defined in an earlier (not shown) part of this file.

# Fast SIGKILL of the DataNode, restarted after only 3 seconds.
dn_kill_short = faults.kill_daemons(["DataNode"], signal.SIGKILL, 3)

# SIGSTOP/SIGCONT pauses — simulate a long GC pause or a stalled machine.
# 62s for the region server (presumably chosen to outlast the ZK session
# timeout — TODO confirm) and 20s for the DataNode.
rs_pause = faults.pause_daemons(["HRegionServer"], 62)
dn_pause = faults.pause_daemons(["DataNode"], 20)

# This fault isn't that useful yet, since it only drops inbound packets
# but outbound packets (eg, the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(["HRegionServer"], 64)

# Every 45 seconds, pick one fault from the weighted list below and run it.
profile = [
  triggers.Periodic(
    45,
    metafaults.pick_fault([
      # kill -9s
      (5, rs_kill_long),
      (1, dn_kill_long),
      # fast kill -9s
      (5, rs_kill_short),
      (1, dn_kill_short),
      # pauses (simulate GC?)
      (10, rs_pause),
      (1, dn_pause),
      # drop packets (simulate network outage)
      #(1, faults.drop_packets_to_daemons(["DataNode"], 20)),
      #(1, rs_drop_inbound_packets),
      ])),
#  triggers.WebServerTrigger(12321)
]
fail_node_long = faults.fail_network(bastion_host=bastion, seconds=300, restart_daemons=["Accumulo-All"], use_flush=True) # XXX make sure this is greater than ZK heartbeats fail_node_short = faults.fail_network(bastion_host=bastion, seconds=45, restart_daemons=["Accumulo-All"], use_flush=True) # XXX make sure this is less than ZK heartbeats fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10, restart_daemons=["Accumulo-All"], use_flush=True) profile = [ triggers.Periodic( # How often do you want a failure? for master nodes, you should probably give enough time for recovery ~5-15 minutes 60, metafaults.maybe_fault( # How likely do you want a failure? decreasing this will make failures line up across nodes less often. 0.33, metafaults.pick_fault([ # You can change the weights here to see different kinds of flaky nodes (1, fail_node_long), (1, fail_node_short), (2, fail_node_transient), ]))), ]
def random_fault():
  """Pick one of the enabled short-kill faults at random and run it.

  pick_fault returns a callable fault; the trailing () invokes it immediately.
  NOTE(review): kill_short_zk/server/client are defined in an earlier
  (not shown) part of this file.
  """
  metafaults.pick_fault([
    (1, kill_short_zk),
    (1, kill_short_server),
    (1, kill_short_client),

    #(1, kill_long_zk),
    #(1, kill_long_server),
    #(1, kill_long_client),

    #(1, pause_zk),
    #(1, pause_server),
    #(1, pause_client),
  ])()

# Fire a random fault every 10 seconds.
random_periodic = triggers.Periodic(10, random_fault)

profile = [ random_periodic ]

def signal_handler(sig, frame):
  """SIGINT handler: hard-kill all test daemons, stop the trigger, and exit.

  Args:
    sig: signal number delivered (unused).
    frame: current stack frame (unused).
  """
  # BUG FIX: kill_daemons is a factory that returns a fault callable (see how
  # other profiles in this file assign its result for later triggering), so
  # the returned closure must be invoked with () for the kill to happen.
  faults.kill_daemons(["QuorumPeerMain"], signal.SIGKILL, 60)()
  faults.kill_daemons(["LoadBalancerEchoServer"], signal.SIGKILL, 60)()
  faults.kill_daemons(["LoadBalancerEchoClient"], signal.SIGKILL, 60)()
  # BUG FIX: the original accessed .stop and .join as bare attributes without
  # calling them, so the Periodic trigger thread was never actually stopped
  # or joined before exiting.
  random_periodic.stop()
  random_periodic.join()
  sys.exit(0)

# Install the cleanup handler so Ctrl-C tears the test down cleanly.
signal.signal(signal.SIGINT, signal_handler)
from gremlins import faults, metafaults, triggers, tc

# Network chaos profile: alternate between healthy networking and a partition.
# NOTE(review): `tc` is imported but unused in the visible code — presumably
# the faults below drive traffic control internally; confirm before removing.

clear_network_faults = faults.clear_network_faults()
introduce_partition = faults.introduce_network_partition()
# Defined but not currently wired into the profile below.
introduce_latency = faults.introduce_network_latency()

# Seconds between periodic fault decisions.
INTERVAL=30

profile = [
  # clear any existing configurations
  triggers.OneShot(clear_network_faults),
  # every INTERVAL seconds, either clear faults or introduce a partition
  # (introduce_latency is available above but not enabled here).
  # other faults are available, but let's start simply
  triggers.Periodic(
    INTERVAL,
    metafaults.pick_fault([
      (10, clear_network_faults),
      (10, introduce_partition),
    ])),
]