Example #1
from gremlins import metafaults

# kill_short_zk, kill_short_server, and kill_short_client (and the commented
# alternatives) are fault functions defined earlier in the full profile.
def random_fault():
  metafaults.pick_fault([
    (1, kill_short_zk),
    (1, kill_short_server),
    (1, kill_short_client),

    #(1, kill_long_zk),
    #(1, kill_long_server),
    #(1, kill_long_client),

    #(1, pause_zk),
    #(1, pause_server),
    #(1, pause_client),
  ])()
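
Note the trailing `()`: `metafaults.pick_fault` takes (weight, fault) pairs and returns a callable; each invocation selects one fault with probability proportional to its weight and runs it. A rough sketch of that behavior (illustrative only, not the gremlins source):

import random

def pick_fault_sketch(weighted_faults):
    """Illustrative: return a callable that, per invocation, runs one fault
    chosen with probability proportional to its weight."""
    def fire():
        total = sum(weight for weight, _ in weighted_faults)
        threshold = random.uniform(0, total)
        running = 0
        for weight, fault in weighted_faults:
            running += weight
            if running >= threshold:
                fault()
                return
    return fire

# With weights (1, 1, 1) each fault fires about a third of the time;
# pick_fault_sketch([(1, a), (1, b), (1, c)])() mirrors random_fault() above.
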
Example #2
from gremlins import faults, metafaults, triggers

# bastion, fail_node_long, and fail_node_short are defined earlier in the
# full profile; Example #4 below shows the complete setup.

# XXX make sure this is less than ZK heartbeats
fail_node_transient = faults.fail_network(bastion_host=bastion,
                                          seconds=10,
                                          restart_daemons=["Accumulo-All"],
                                          use_flush=True)

profile = [
    triggers.Periodic(
        # How often do you want a failure? For master nodes, you should
        # probably allow enough time for recovery (~5-15 minutes).
        60,
        metafaults.maybe_fault(
            # How likely do you want a failure? Decreasing this makes
            # failures line up across nodes less often.
            0.33,
            metafaults.pick_fault([
                # Change the weights here to see different kinds of flaky nodes.
                (1, fail_node_long),
                (1, fail_node_short),
                (2, fail_node_transient),
            ]))),
]
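
`maybe_fault(p, fault)` composes the same way as `pick_fault`: it wraps a fault in a callable that, each time the Periodic trigger fires, runs the fault with probability p and otherwise does nothing. A hypothetical equivalent (name and implementation are assumptions, not the gremlins source):

import random

def maybe_fault_sketch(probability, fault):
    """Illustrative: wrap `fault` so each invocation runs it with the
    given probability and is otherwise a no-op."""
    def fire():
        if random.random() < probability:
            fault()
    return fire
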

Example #3
import signal

from gremlins import faults, metafaults, triggers

# rs_kill_long, dn_kill_long, and rs_kill_short are kill_daemons faults
# defined earlier in the full profile.
dn_kill_short = faults.kill_daemons(["DataNode"], signal.SIGKILL, 3)

rs_pause = faults.pause_daemons(["HRegionServer"], 62)
dn_pause = faults.pause_daemons(["DataNode"], 20)

# This fault isn't that useful yet, since it only drops inbound packets;
# outbound packets (e.g., the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(["HRegionServer"], 64)

profile = [
    triggers.Periodic(
        45,
        metafaults.pick_fault([
            # kill -9s
            (5, rs_kill_long),
            (1, dn_kill_long),
            # fast kill -9s
            (5, rs_kill_short),
            (1, dn_kill_short),

            # pauses (simulate GC?)
            (10, rs_pause),
            (1, dn_pause),

            # drop packets (simulate network outage)
            #(1, faults.drop_packets_to_daemons(["DataNode"], 20)),
            #(1, rs_drop_inbound_packets),
        ])),
    #  triggers.WebServerTrigger(12321)
]
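
The pause faults exist to mimic long GC pauses. A common way to freeze a process for a fixed window is SIGSTOP followed by SIGCONT; the sketch below shows that technique for a single PID (a simplified illustration, not necessarily how faults.pause_daemons is implemented):

import os
import signal
import time

def pause_process(pid, seconds):
    """Freeze a process with SIGSTOP, then resume it with SIGCONT.

    While stopped, the process stops heartbeating, so peers such as the
    ZK session tracker may declare it dead -- exactly the failure mode
    a long GC pause produces.
    """
    os.kill(pid, signal.SIGSTOP)
    try:
        time.sleep(seconds)
    finally:
        os.kill(pid, signal.SIGCONT)
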
Example #4
import logging
import os

from gremlins import faults, hostutils, metafaults, triggers

bastion = os.getenv("GREMLINS_BASTION_HOST", hostutils.guess_remote_host())

if not bastion:
  raise Exception("GREMLINS_BASTION_HOST not set, and I couldn't guess your remote host.")

logging.info("Using %s as bastion host for network failures. "
             "You should be able to ssh from that host at all times." % bastion)

fail_node_long = faults.fail_network(bastion_host=bastion, seconds=300, restart_daemons=["Accumulo-All"], use_flush=True)
# XXX make sure this is greater than ZK heartbeats
fail_node_short = faults.fail_network(bastion_host=bastion, seconds=45, restart_daemons=["Accumulo-All"], use_flush=True)
# XXX make sure this is less than ZK heartbeats
fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10, restart_daemons=["Accumulo-All"], use_flush=True)

profile = [
  triggers.Periodic(
    # How often do you want a failure? For master nodes, you should probably
    # allow enough time for recovery (~5-15 minutes).
    60,
    metafaults.maybe_fault(
      # How likely do you want a failure? Decreasing this makes failures
      # line up across nodes less often.
      0.33,
      metafaults.pick_fault([
        # Change the weights here to see different kinds of flaky nodes.
        (1, fail_node_long),
        (1, fail_node_short),
        (2, fail_node_transient),
      ]))
    ),
  ]
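
Judging from the bastion_host parameter and the log message above, fail_network cuts the node off from the network for `seconds` while keeping the bastion reachable, then (with use_flush) clears the rules and restarts the listed daemons. A rough iptables sketch of that shape (commands and ordering are assumptions, not the gremlins implementation; the daemon restart step is omitted):

import subprocess
import time

def fail_network_sketch(bastion_host, seconds, use_flush=True):
    """Illustrative only: drop inbound traffic except from the bastion host."""
    # Keep the bastion reachable so the operator can still ssh in.
    subprocess.check_call(
        ["iptables", "-I", "INPUT", "1", "-s", bastion_host, "-j", "ACCEPT"])
    subprocess.check_call(["iptables", "-A", "INPUT", "-j", "DROP"])
    try:
        time.sleep(seconds)
    finally:
        if use_flush:
            # Blunt recovery: flush the whole INPUT chain, including any
            # pre-existing rules.
            subprocess.check_call(["iptables", "-F", "INPUT"])
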

Example #5
from gremlins import faults, metafaults, triggers

# rs_kill_long, dn_kill_long, rs_kill_short, and dn_kill_short are
# kill_daemons faults defined earlier in the full profile.
rs_pause = faults.pause_daemons(["HRegionServer"], 62)
dn_pause = faults.pause_daemons(["DataNode"], 20)

# This fault isn't that useful yet, since it only drops inbound packets;
# outbound packets (e.g., the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(["HRegionServer"], 64)

profile = [
  triggers.Periodic(
    45,
    metafaults.pick_fault([
      # kill -9s
      (5, rs_kill_long),
      (1, dn_kill_long),
      # fast kill -9s
      (5, rs_kill_short),
      (1, dn_kill_short),

      # pauses (simulate GC?)
      (10, rs_pause),
      (1, dn_pause),

      # drop packets (simulate network outage)
      #(1, faults.drop_packets_to_daemons(["DataNode"], 20)),
      #(1, rs_drop_inbound_packets),
    ])),
  #  triggers.WebServerTrigger(12321)
  ]

Example #6
from gremlins import faults, metafaults, triggers

# bastion is the ssh bastion host, resolved earlier in the full profile
# (see Example #4).
fail_node_long = faults.fail_network(bastion_host=bastion, seconds=300, restart_daemons=["Accumulo-All"], use_flush=True)
# XXX make sure this is greater than ZK heartbeats
fail_node_short = faults.fail_network(bastion_host=bastion, seconds=45, restart_daemons=["Accumulo-All"], use_flush=True)
# XXX make sure this is less than ZK heartbeats
fail_node_transient = faults.fail_network(bastion_host=bastion, seconds=10, restart_daemons=["Accumulo-All"], use_flush=True)

profile = [
  triggers.Periodic(
    # How often do you want a failure? For master nodes, you should probably
    # allow enough time for recovery (~5-15 minutes).
    60,
    metafaults.maybe_fault(
      # How likely do you want a failure? Decreasing this makes failures
      # line up across nodes less often.
      0.33,
      metafaults.pick_fault([
        # Change the weights here to see different kinds of flaky nodes.
        (1, fail_node_long),
        (1, fail_node_short),
        (2, fail_node_transient),
      ]))
    ),
  ]


Example #7
from gremlins import faults, metafaults, triggers

# bastion is the ssh bastion host, resolved earlier in the full profile
# (see Example #4).
fail_node_long = faults.fail_network(bastion_host=bastion,
                                     seconds=300,
                                     restart_daemons=["Accumulo-All"],
                                     use_flush=True)
# XXX make sure this is greater than ZK heartbeats
fail_node_short = faults.fail_network(bastion_host=bastion,
                                      seconds=45,
                                      restart_daemons=["Accumulo-All"],
                                      use_flush=True)
# XXX make sure this is less than ZK heartbeats
fail_node_transient = faults.fail_network(bastion_host=bastion,
                                          seconds=10,
                                          restart_daemons=["Accumulo-All"],
                                          use_flush=True)

profile = [
    triggers.Periodic(
        # How often do you want a failure? For master nodes, you should
        # probably allow enough time for recovery (~5-15 minutes).
        60,
        metafaults.maybe_fault(
            # How likely do you want a failure? Decreasing this makes
            # failures line up across nodes less often.
            0.33,
            metafaults.pick_fault([
                # Change the weights here to see different kinds of flaky nodes.
                (1, fail_node_long),
                (1, fail_node_short),
                (2, fail_node_transient),
            ]))),
]
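
Every profile in these examples hangs off triggers.Periodic(period, fault), which fires the fault on a fixed interval. A bare-bones version of such a trigger could be a loop on a timer; the class below is illustrative (the real trigger presumably runs on its own thread with start/stop plumbing):

import threading

class PeriodicSketch(object):
    """Illustrative periodic trigger: call `fault` every `period` seconds."""
    def __init__(self, period, fault):
        self.period = period
        self.fault = fault
        self._stop = threading.Event()

    def run(self):
        # Event.wait returns False on timeout, True once stop() is called.
        while not self._stop.wait(self.period):
            self.fault()

    def stop(self):
        self._stop.set()
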
Example #8
from gremlins import faults, metafaults, triggers, tc

clear_network_faults = faults.clear_network_faults()
introduce_partition = faults.introduce_network_partition()
introduce_latency = faults.introduce_network_latency()  # defined but not yet used below

INTERVAL = 30

profile = [
    # Clear any existing configurations.
    triggers.OneShot(clear_network_faults),
    # Every INTERVAL seconds, either clear faults or introduce a partition.
    # Other faults are available, but let's start simply.
    triggers.Periodic(
        INTERVAL, metafaults.pick_fault([
            (10, clear_network_faults),
            (10, introduce_partition),
        ])),
]
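
The `tc` import hints that these network faults sit on top of Linux traffic control. As a hedged sketch, introduce_network_latency might install a netem qdisc and clear_network_faults might delete it; the interface name and delay values below are placeholders, not the gremlins defaults:

import subprocess

def add_latency_sketch(interface="eth0", delay_ms=250, jitter_ms=50):
    """Illustrative: add latency and jitter with a netem qdisc."""
    subprocess.check_call([
        "tc", "qdisc", "add", "dev", interface, "root", "netem",
        "delay", "%dms" % delay_ms, "%dms" % jitter_ms,
    ])

def clear_faults_sketch(interface="eth0"):
    """Illustrative: remove any root qdisc, restoring normal networking."""
    # Plain call(): `tc qdisc del` exits nonzero if no qdisc is installed.
    subprocess.call(["tc", "qdisc", "del", "dev", interface, "root"])
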
Example #9
import signal

from gremlins import faults, metafaults, triggers

clear_network_faults = faults.clear_network_faults()
introduce_packet_loss = faults.introduce_network_packet_loss()
introduce_partition = faults.introduce_network_partition()
introduce_latency = faults.introduce_network_latency()
introduce_packet_reordering = faults.introduce_packet_reordering()

# Regex matching the command line of the netcat test server on port 4242.
server_cmd = "nc.*4242"
nc_kill = faults.kill_processes([server_cmd], signal.SIGKILL)
nc_pause = faults.pause_processes([server_cmd], 5)

profile = [
    triggers.OneShot(clear_network_faults),
    # triggers.Periodic(
    #     10, metafaults.pick_fault([
    #         # kill -9s
    #         # (5, nc_kill),
    #         # pauses (simulate GC)
    #         (10, nc_pause),
    #     ])),
    triggers.Periodic(
        10, metafaults.pick_fault([
            (10, clear_network_faults),
            # (10, introduce_packet_loss),
            (10, introduce_partition),
            (10, introduce_latency),
            # (10, introduce_packet_reordering),
        ])),
    #  triggers.WebServerTrigger(12321)
]
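
kill_processes and pause_processes select their targets by matching a regex ("nc.*4242" above) against process command lines. A minimal illustration of that find-then-signal pattern, using pgrep -f (not the gremlins implementation):

import os
import signal
import subprocess

def signal_matching_processes(pattern, signum):
    """Illustrative: send `signum` to every process whose command line
    matches `pattern`. Note pgrep exits nonzero when nothing matches,
    so check_output raises CalledProcessError in that case."""
    pids = subprocess.check_output(["pgrep", "-f", pattern]).decode().split()
    for pid in pids:
        os.kill(int(pid), signum)

# e.g. signal_matching_processes("nc.*4242", signal.SIGKILL)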