Example #1
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import signal

from gremlins import faults, metafaults, triggers

# kill -9 faults: one "long" (100) and one "short" (3) variant per daemon.
# NOTE(review): the numeric argument is presumably a duration in seconds --
# confirm against gremlins.faults.
rs_kill_long, rs_kill_short = (
    faults.kill_daemons(["HRegionServer"], signal.SIGKILL, delay)
    for delay in (100, 3)
)
dn_kill_long, dn_kill_short = (
    faults.kill_daemons(["DataNode"], signal.SIGKILL, delay)
    for delay in (100, 3)
)

# Pause faults for the same two daemons, each with its own pause length.
rs_pause, dn_pause = (
    faults.pause_daemons([daemon], pause)
    for daemon, pause in (("HRegionServer", 62), ("DataNode", 20))
)

# Of limited use so far: only inbound packets are dropped, while outbound
# traffic (e.g. the ZK pings) still gets through.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(
    ["HRegionServer"],
    64,
)

profile = [
    triggers.Periodic(
        45,
        metafaults.pick_fault([
            # kill -9s
            (5, rs_kill_long),
            (1, dn_kill_long),
            # fast kill -9s
            (5, rs_kill_short),
Example #2
0
# Register service-2 (cluster-1, "degrader" balancer) in the load-balancer
# store addressed by the zk:// URL below.
# NOTE(review): the script path points into one developer's local checkout,
# so this only runs on that machine -- consider making it configurable.
procutils.run([
    "/Users/criccomi/svn/pegasus/trunk/d2/scripts/lb-tool.sh",
    "--put_service", "service-2",
    "--cluster", "cluster-1",
    "--path", "/service-2",
    "--balancer", "degrader",
    "--store", "zk://localhost:2181/echo/lb/services",
])

# start server and client
procutils.start_daemon('LoadBalancerEchoServer')
procutils.start_daemon('LoadBalancerEchoClient')

# Declare faults: for each of the three daemons (ZooKeeper quorum peer, echo
# server, echo client) a short kill, a long kill and a pause.
# NOTE(review): the numeric argument (5 / 60) is presumably a duration in
# seconds -- confirm against gremlins.faults.
kill_short_zk, kill_short_server, kill_short_client = (
    faults.kill_daemons([daemon], signal.SIGKILL, 5)
    for daemon in (
        "QuorumPeerMain",
        "LoadBalancerEchoServer",
        "LoadBalancerEchoClient",
    )
)

kill_long_zk, kill_long_server, kill_long_client = (
    faults.kill_daemons([daemon], signal.SIGKILL, 60)
    for daemon in (
        "QuorumPeerMain",
        "LoadBalancerEchoServer",
        "LoadBalancerEchoClient",
    )
)

pause_zk, pause_server, pause_client = (
    faults.pause_daemons([daemon], 60)
    for daemon in (
        "QuorumPeerMain",
        "LoadBalancerEchoServer",
        "LoadBalancerEchoClient",
    )
)

def random_fault():
  metafaults.pick_fault([
    (1, kill_short_zk),
    (1, kill_short_server),
    (1, kill_short_client),
    
    #(1, kill_long_zk),
    #(1, kill_long_server),
    #(1, kill_long_client),
    
    #(1, pause_zk),
    #(1, pause_server),
Example #3
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import signal

from gremlins import faults, metafaults, triggers

# kill -9 faults in a long (100) and a short (3) flavour for each target.
# NOTE(review): "aa1dc150d996" is used here as the daemon name for all rs_*
# faults; it looks like a generated identifier -- confirm it is the intended
# match string.
rs_kill_long, rs_kill_short = (
    faults.kill_daemons(["aa1dc150d996"], signal.SIGKILL, delay)
    for delay in (100, 3)
)
dn_kill_long, dn_kill_short = (
    faults.kill_daemons(["DataNode"], signal.SIGKILL, delay)
    for delay in (100, 3)
)

# Pause faults, each target with its own pause length.
rs_pause, dn_pause = (
    faults.pause_daemons([daemon], pause)
    for daemon, pause in (("aa1dc150d996", 120), ("DataNode", 20))
)

# Not very useful for now: inbound packets are dropped, but outbound packets
# (e.g. the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(
    ["aa1dc150d996"],
    64,
)

profile = [
    triggers.Periodic(
        45,
        metafaults.pick_fault([
            # kill -9s
            (5, rs_kill_long),
            (1, dn_kill_long),
            # fast kill -9s
            (5, rs_kill_short),
Example #4
0
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import signal

from gremlins import faults, metafaults, triggers

# SIGKILL faults for the region server and the data node, each in a long
# (100) and a short (3) variant.
# NOTE(review): the last argument is presumably a duration in seconds --
# confirm against gremlins.faults.
rs_kill_long, rs_kill_short = (
    faults.kill_daemons(["HRegionServer"], signal.SIGKILL, delay)
    for delay in (100, 3)
)
dn_kill_long, dn_kill_short = (
    faults.kill_daemons(["DataNode"], signal.SIGKILL, delay)
    for delay in (100, 3)
)

# Pause faults: 62 for the region server, 20 for the data node.
rs_pause, dn_pause = (
    faults.pause_daemons([daemon], pause)
    for daemon, pause in (("HRegionServer", 62), ("DataNode", 20))
)

# Limited value at the moment: this drops inbound packets only, so outbound
# packets (e.g. the ZK pings) keep going.
rs_drop_inbound_packets = faults.drop_packets_to_daemons(
    ["HRegionServer"],
    64,
)

profile = [
  triggers.Periodic(
    45,
    metafaults.pick_fault([
    # kill -9s
      (5, rs_kill_long),
      (1, dn_kill_long),
    # fast kill -9s
      (5, rs_kill_short),
Example #5
0
# See the License for the specific language governing permissions and
# limitations under the License.

import signal

from gremlins import faults, metafaults, triggers

# kill -9 faults for the Flume node (fn_*) and Flume master (fm_*).
# The last argument is how long before reset (100 = long, 3 = short).
# NOTE(review): presumably seconds -- confirm against gremlins.faults.
fn_kill_long = faults.kill_daemons(["FlumeNode"], signal.SIGKILL, 100)
fn_kill_short = faults.kill_daemons(["FlumeNode"], signal.SIGKILL, 3)

fm_kill_long = faults.kill_daemons(["FlumeMaster"], signal.SIGKILL, 100)
fm_kill_short = faults.kill_daemons(["FlumeMaster"], signal.SIGKILL, 3)

# Pause faults. 62 is the ZK timeout in HBase.
# NOTE(review): confirm 62 is still the right pause length for Flume.
# The node fault targets "FlumeNode", matching the fn_kill_* faults above
# (it previously named the non-existent "FlumeName").
fn_pause = faults.pause_daemons(["FlumeNode"], 62)
fm_pause = faults.pause_daemons(["FlumeMaster"], 20)

# This fault isn't that useful yet, since it only drops inbound packets
# while outbound packets (e.g. the ZK pings) keep going.
# It targets "FlumeNode" like the other fn_* faults; the earlier
# "HRegionServer" target was a leftover from the HBase profile.
fn_drop_inbound_packets = faults.drop_packets_to_daemons(["FlumeNode"], 64)

profile = [
  triggers.Periodic(
    45, # ever 45 seconds 
    metafaults.pick_fault([  # (weight, fault to fire)
    # kill -9s
      (5, fn_kill_long),
      (1, fm_kill_long),
    # fast kill -9s
      (5, fn_kill_short),