def reset(self):
    # Reset clocks and state for each disk
    for disk in self.disks:
        disk.init_clock(0)
        disk.init_state()
    # Reset clocks and state for each node
    for node in self.nodes:
        node.init_clock(0)
        node.init_state()
    # Reset clocks and state for each rack
    for rack in self.racks:
        rack.init_state()
    # Reset system state
    self.state = State(self.num_disks, self.num_nodes)
    # Reset repair queue
    self.repair_queue = []
    # Regenerate new placement
    self.placement = Placement(self.num_racks, self.nodes_per_rack,
                               self.disks_per_node, self.capacity_per_disk,
                               self.num_stripes, self.chunk_size,
                               self.code_type, self.n, self.k,
                               self.place_type, self.chunk_rack_config,
                               self.l)
    # Reset LR
    self.lr = float(1.)
    self.total_failure_rate = 0.
    self.total_failure_rate_cnt = 0
    self.total_repair_rate = 0.
    self.total_repair_rate_cnt = 0
def __init__(self, mission_time, plus_one, num_servers, num_disks_per_server,
             num_spares_per_server, k, m, fb, dp_type, failure_type, mtbf,
             failure_percent, rebuildIO, slaTime, copybackIO, diskCap, useRatio):
    #---------------------------
    # compressed time window
    #---------------------------
    self.mission_time = mission_time
    #---------------------------
    # system and placement
    #---------------------------
    self.sys = Campaign(plus_one, num_servers, num_disks_per_server,
                        num_spares_per_server, k, m, fb, dp_type, diskCap,
                        useRatio)
    self.place = Placement(self.sys)
    #--------------------------------------
    # fast rebuild + copyback phases
    #--------------------------------------
    self.rebuild = Rebuild(self.sys, rebuildIO)
    self.copyback = Copyback(copybackIO, slaTime)
    #--------------------------------------
    # failures distribution and mtbf
    #--------------------------------------
    self.mtbf = mtbf
    self.failure_type = failure_type
    self.failure_percent = failure_percent
def testRedundancyMultipleOccurence(self):
    tiles = ['s', 'a', 'f', 'e', None, None, 's', 'a', 'f', None]
    nodes = self.create_horizontal_nodes(tiles)

    placements = Placement.placements('safe', nodes[0])
    placement = placements[0]
    self.assertFalse(Collision.safe(placement))

    placements = Placement.placements('safe', nodes[6])
    placement = placements[0]
    self.assertTrue(Collision.safe(placement))
def create_cylinder(self, key, density, length, radius, pos, base=0, rot=0, R=0.):
    """Creates a cylinder body and corresponding geom.

    Arguments:
    key : number id to assign to the cylinder
    density : density of the given body
    length : length of the cylinder
    radius : radius of the cylinder
    pos : position of the center of the cylinder (x,y,z list)
    base : place new object at negative end of base object
    """
    # Auto label the joint key or not.
    key = len(self.bodies) if key == -1 else key

    # create cylinder body (aligned along the z-axis so that it matches the
    # GeomCylinder created below, which is aligned along the z-axis by
    # default)
    body = ode.Body(self.world)
    M = ode.Mass()
    M.setCylinder(density, 3, radius, length)
    body.setMass(M)

    # create a cylinder geom for collision detection
    geom = ode.GeomCylinder(self.space, radius, length)
    geom.setBody(body)

    # set the position of the cylinder
    body.setPosition((pos[0], pos[1], pos[2]))

    # set parameters for drawing the body
    body.shape = "cylinder"
    body.length = length
    body.radius = radius

    # set the rotation of the cylinder
    if rot:
        body.setRotation(self.form_rotation(rot))

    # set the rotation of the cylinder directly
    if R:
        body.setRotation(R)

    self.bodies[key] = body
    self.geoms[key] = geom

    if base:
        Placement.place_object(self.bodies[base], body)

    if self.fluid_dynamics:
        self.create_surfaces(key, 1.)

    return key
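# Hypothetical usage of create_cylinder(), assuming `sim` is an instance of the
# ODE wrapper class that defines it (names and values here are illustrative only).
# key=-1 auto-assigns the key; passing base= chains the new body onto an existing
# one through Placement.place_object().
base_key = sim.create_cylinder(key=-1, density=1.0, length=0.6, radius=0.05,
                               pos=[0., 0., 0.5])
sim.create_cylinder(key=-1, density=1.0, length=0.3, radius=0.05,
                    pos=[0., 0., 0.], base=base_key)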
def testRedundancy(self):
    # Run the test again, but without any empty space.
    tiles = ['s', 'a', 'f', 'e', None, None]
    nodes = self.create_horizontal_nodes(tiles)
    placements = Placement.placements('safe', nodes[0])
    placement = placements[0]
    self.assertFalse(Collision.safe(placement))

    tiles = ['a', 'b', 's', 'a', 'f', 'e', 't', 'b', 'z', 'x', 'h']
    nodes = self.create_horizontal_nodes(tiles)
    placements = Placement.placements('safe', nodes[2])
    placement = placements[0]
    self.assertEqual(placement.node(0), nodes[2])
    self.assertEqual(placement.node(0).letter, 's')
    self.assertFalse(Collision.safe(placement))
def record_tourney(self, tournament, player):
    """
    Record the results from a tournament.
    Needs the tournament json object and the player json object.
    Player json object needs to be from the same tournament.
    """
    self.placings.append(Placement(self.name, tournament, player))
def testHorizontalCollision(self):
    tiles = [None, 's', 'a', 'e', None]
    nodes = self.create_horizontal_nodes(tiles)
    placements = Placement.placements('safe', nodes[1])

    # We expect the vertical and horizontal placements.
    placement = placements[0]
    self.assertEqual(placement.node(0), nodes[1])
    self.assertFalse(Collision.safe(placement))
def create_capsule(self, key, density, length, radius, pos, base=0, rot=0):
    """Creates a capsule body and corresponding geom.

    Arguments:
    key : number id to assign to the capsule
    density : density of the given body
    length : length of the capsule
    radius : radius of the capsule
    pos : position of the center of the capsule (x,y,z list)
    base : place new object at negative end of base object
    """
    # create capsule body (aligned along the z-axis so that it matches the
    # GeomCCylinder created below, which is aligned along the z-axis by
    # default)
    body = ode.Body(self.world)
    M = ode.Mass()
    M.setCapsule(density, 3, radius, length)
    body.setMass(M)

    # create a capsule geom for collision detection
    geom = ode.GeomCCylinder(self.space, radius, length)
    geom.setBody(body)

    # set the position of the capsule
    body.setPosition((pos[0], pos[1], pos[2]))

    # set parameters for drawing the body
    body.shape = "capsule"
    body.length = length
    body.radius = radius

    # set the rotation of the capsule
    if rot:
        body.setRotation(self.form_rotation(rot))

    self.bodies[key] = body
    self.geoms[key] = geom

    if base:
        Placement.place_object(self.bodies[base], body)
def testNodesMatchingLetter(self):
    tiles = [['', '', '', 'x', 'z', 's', '', 'f', '']]
    board = Board(tiles)
    first_row = board.nodes[0]
    placements = Placement.placements('safe', first_row[5])
    placement = placements[0]

    nodes = Collision.nodes_matching_letter(placement, 'f')
    self.assertEqual(len(nodes), 1)

    f_node = nodes[0]
    self.assertEqual(f_node.letter, 'f')
    self.assertTrue(f_node.placed)
def initPlacements(self):
    """
    Initialises the placement objects for this board (1 for each player)
    :return: List of placement objects
    """
    player1placements = Placement(playerNumber=1)
    player2placements = Placement(playerNumber=2)
    player3placements = None
    player4placements = None
    if self.playerCount >= 3:
        player3placements = Placement(playerNumber=3)
    if self.playerCount == 4:
        player4placements = Placement(playerNumber=4)

    placements = []
    for p in [player1placements, player2placements,
              player3placements, player4placements]:
        if p is not None:
            placements.append(p)
    return placements
def testPreexistingLetters(self):
    tiles = [None, None, None, 'x', 'z', 's', None, 'f', None]
    nodes = self.create_vertical_nodes(tiles)
    placements = Placement.placements('safe', nodes[5])
    self.assertEqual(len(placements), 2)

    placement = placements[1]
    self.assertTrue(placement, Collision.safe(placement))

    # The preexisting letters should contain the letters 's' and 'f' because
    # when placing the word 'safe' across from the letter 's' the letter 'f'
    # would be used as well.
    letters = Collision.preexisting_letters(placement)
    self.assertEqual(len(letters), 2)
    self.assertEqual(letters[0], 's')
    self.assertEqual(letters[1], 'f')
def runjob(mission_time, num_racks, node_per_rack, disks_per_node, onumgroup,
           numgroup, capacity_per_disk, chunk_size, num_stripes, bandwidth,
           code_n, code_k, code_m, use_ratio, weibull, ssd_fail):
    placement = Placement(disks_per_node, node_per_rack, num_racks, onumgroup,
                          numgroup, num_stripes, code_n, code_k)
    placement.generate_palcement()
    network = Network(num_racks, num_racks * node_per_rack, onumgroup, numgroup,
                      node_per_rack, disks_per_node, bandwidth,
                      capacity_per_disk * use_ratio, chunk_size,
                      code_n, code_k, code_m)
    sim = Simulation(weibull, ssd_fail, placement, network, onumgroup,
                     disks_per_node, node_per_rack, num_racks, mission_time)
    res = sim.run()
    print res
    '''
    file = open("result", "a+")
    fcntl.flock(file.fileno(), fcntl.LOCK_EX)
    file.write(str(res) + "\n")
    file.close()
    '''
    return res
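# Illustrative call of runjob(); every value below is a placeholder chosen for
# the example, not taken from the source.
res = runjob(mission_time=87600, num_racks=8, node_per_rack=8, disks_per_node=4,
             onumgroup=2, numgroup=4, capacity_per_disk=4096, chunk_size=256,
             num_stripes=10000, bandwidth=1.0, code_n=9, code_k=6, code_m=3,
             use_ratio=0.75, weibull=1.0, ssd_fail=0.01)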
def testHorizontalSafe(self):
    tiles = [None, 's', None, 'f', None]
    nodes = self.create_horizontal_nodes(tiles)
    placements = Placement.placements('safe', nodes[1])
    self.assertEqual(len(placements), 2)

    placement = placements[0]
    # Test that we have the right start node.
    self.assertEqual(placement.node(0), nodes[1])
    # Make sure that there is no collision.
    self.assertTrue(Collision.safe(placement))

    vertical_placement = placements[1]
    self.assertEqual(vertical_placement.node(0), nodes[1])
    self.assertFalse(Collision.safe(vertical_placement))
def testVerticalSafe(self):
    tiles = [None, 's', None, 'f', None]
    nodes = self.create_vertical_nodes(tiles)
    placements = Placement.placements('safe', nodes[1])

    # There should be two placement objects returned from the previous static
    # method.
    # One would attempt to place it to the right, and the other downward. Of
    # course the downward placement would fall right off the board, and should
    # not pass the collision safety test.
    self.assertEqual(len(placements), 2)

    placement = placements[1]
    # Test that we have the right start node.
    self.assertEqual(placement.node(0), nodes[1])
    # Make sure that there is no collision.
    self.assertTrue(Collision.safe(placement))

    horizontal_placement = placements[0]
    self.assertEqual(horizontal_placement.node(0), nodes[1])
    self.assertFalse(Collision.safe(horizontal_placement))
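# The tests above rely on create_horizontal_nodes()/create_vertical_nodes(), which
# are not shown. A hypothetical sketch of what they might look like, assuming the
# Board constructor from testNodesMatchingLetter accepts a grid (list of rows) and
# exposes a .nodes grid of the same shape:
def create_horizontal_nodes(self, tiles):
    # one row of tiles -> the nodes of that row
    return Board([tiles]).nodes[0]

def create_vertical_nodes(self, tiles):
    # one column of tiles -> the first node of each row
    return [row[0] for row in Board([[tile] for tile in tiles]).nodes]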
class RegularSimulation(Simulation):
    ##
    # __init__() from Simulation
    #

    ##
    # Initialize the simulation
    #
    def init(self):
        # Initialize the state of the system
        self.state = State(self.num_disks)
        # Employ priority queue to keep all the failures and repairs
        # The element in the queue is (event_time, event_type, device_id)
        self.events_queue = []
        # Keep failed disks awaiting repair
        self.wait_repair_queue = []
        # Keep delayed stripes due to unavailable nodes
        # Key is the disk_idx delayed, value is the list of delayed stripes
        self.delayed_repair_dict = dict()
        self.enable_transient_failures = False

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.ERROR)
        # self.logger.setLevel(logging.INFO)
        self.logger.addHandler(console)
        self.logger.propagate = False

    ##
    # Reset the simulation
    #
    def reset(self, ite=0):
        # Generate node transient and permanent failure events from trace
        if self.use_trace:
            for i in xrange(self.num_nodes):
                self.nodes[i] = Node(None, None, None,
                                     Trace(self.trace_id, i, 'p'),
                                     Trace(self.trace_id, i, 't'),
                                     Trace(self.trace_id, i, 'r'))

        self.state = State(self.num_disks)
        for disk in self.disks:
            disk.init_clock(0)
            disk.init_state()
        for node in self.nodes:
            node.init_state()
        for rack in self.racks:
            rack.init_state()

        self.events_queue = []
        self.wait_repair_queue = []
        self.delayed_repair_dict = dict()

        # generate disk failures and put them into events_queue
        for disk_id in xrange(len(self.disks)):
            disk_fail_time = self.disk_fail_dists.draw()
            if disk_fail_time <= self.mission_time:
                self.events_queue.append(
                    (disk_fail_time, Disk.EVENT_DISK_FAIL, disk_id))

        # generate node failures and push them into events_queue
        for node_id in xrange(self.num_nodes):
            if not self.use_trace:
                self.events_queue.append((self.node_fail_dists.draw(),
                                          Node.EVENT_NODE_FAIL, node_id))
                if self.enable_transient_failures:
                    self.events_queue.append(
                        (self.node_transient_fail_dists.draw(),
                         Node.EVENT_NODE_TRANSIENT_FAIL, node_id))
            else:
                for node_failure_time in self.nodes[
                        node_id].node_fail_trace.get_trace_ls():
                    # push node failure event to event_queue
                    self.events_queue.append(
                        (node_failure_time, Node.EVENT_NODE_FAIL, node_id))
                node_transient_failure_ls = self.nodes[
                    node_id].node_transient_fail_trace.get_trace_ls()
                node_transient_repair_ls = self.nodes[
                    node_id].node_transient_repair_trace.get_trace_ls()
                for ls_idx in xrange(len(node_transient_failure_ls)):
                    node_transient_failure_time = node_transient_failure_ls[ls_idx]
                    node_transient_repair_time = node_transient_repair_ls[ls_idx]
                    self.events_queue.append(
                        (node_transient_failure_time,
                         Node.EVENT_NODE_TRANSIENT_FAIL, node_id))
                    self.events_queue.append(
                        (node_transient_failure_time + node_transient_repair_time,
                         Node.EVENT_NODE_TRANSIENT_REPAIR, node_id))

        # generate rack failures and push them into events_queue
        if not self.use_power_outage and self.enable_transient_failures:
            for rack_id in xrange(len(self.racks)):
                self.events_queue.append((self.rack_fail_dists.draw(),
                                          Rack.EVENT_RACK_FAIL, rack_id))

        # correlated failures caused by power outage
        if (not self.use_trace) and self.use_power_outage:
            for rack_id in xrange(self.num_racks):
                occur_time = float(0) + self.power_outage_dist.draw()
                while occur_time < self.mission_time:
                    self.events_queue.append(
                        (occur_time, Rack.EVENT_RACK_FAIL, rack_id))
                    occur_time += random.expovariate(
                        (1 / float(self.power_outage_duration)))
                    self.events_queue.append(
                        (occur_time, Rack.EVENT_RACK_REPAIR, rack_id))
                    for i in xrange(self.nodes_per_rack):
                        # draw a bernoulli distribution
                        if nprandom.binomial(1, 0.01):
                            self.events_queue.append(
                                (occur_time, Node.EVENT_NODE_FAIL,
                                 (self.nodes_per_rack * rack_id + i)))
                    occur_time += self.power_outage_dist.draw()

        heapify(self.events_queue)

        self.placement = Placement(self.num_racks, self.nodes_per_rack,
                                   self.disks_per_node, self.capacity_per_disk,
                                   self.num_stripes, self.chunk_size,
                                   self.code_type, self.n, self.k,
                                   self.place_type, self.chunk_rack_config,
                                   self.l)
        self.network = Network(self.num_racks, self.nodes_per_rack,
                               self.network_setting)

        self.num_stripes_repaired = 0
        self.num_stripes_repaired_single_chunk = 0
        self.num_stripes_delayed = 0

    ##
    # Generate permanent disk failure event
    #
    def set_disk_fail(self, disk_idx, curr_time):
        heappush(self.events_queue, (self.disk_fail_dists.draw() + curr_time,
                                     Disk.EVENT_DISK_FAIL, disk_idx))

    ##
    # Generate repair event for permanent disk failure
    #
    def set_disk_repair(self, disk_idx, curr_time):
        if not self.use_network:
            # get the repair time from a pre-defined repair distribution
            heappush(self.events_queue,
                     (self.disk_repair_dists.draw() + curr_time,
                      Disk.EVENT_DISK_REPAIR, disk_idx))
        else:
            # repair time = cross-rack repair traffic / available cross-rack bandwidth
            rack_id = disk_idx / (self.nodes_per_rack * self.disks_per_node)
            # If there is no available bandwidth or the rack is under transient failure
            if self.network.get_avail_cross_rack_repair_bwth() == 0 or \
               self.racks[rack_id].get_curr_state() != Rack.STATE_RACK_NORMAL:
                heappush(self.wait_repair_queue, (curr_time, disk_idx))
            else:
                cross_rack_download = 0
                stripes_to_repair = self.placement.get_stripes_to_repair(disk_idx)
                self.num_stripes_repaired += len(stripes_to_repair)
                stripes_to_delay = []
                # for each stripe to repair
                for stripe_id in stripes_to_repair:
                    num_failed_chunk = 0
                    num_alive_chunk_same_rack = 0
                    num_unavail_chunk = 0
                    idx = 0
                    fail_idx = 0
                    alive_chunk_same_rack = []
                    # check the status of each chunk in the stripe
                    for disk_id in self.placement.get_stripe_location(stripe_id):
                        # get the total number of unavailable chunk (due to
                        # permanent/transient failures) in this stripe
                        if self.disks[disk_id].state != Disk.STATE_NORMAL:
                            num_unavail_chunk += 1
                        # for RS, DRC
                        if self.placement.code_type != Placement.CODE_TYPE_LRC:
                            if self.disks[disk_id].get_curr_state() == Disk.STATE_CRASHED:
                                num_failed_chunk += 1
                            elif (disk_id / (self.nodes_per_rack *
                                             self.disks_per_node)) == rack_id:
                                num_alive_chunk_same_rack += 1
                        # for LRC
                        else:
                            if self.disks[disk_id].get_curr_state() == Disk.STATE_CRASHED:
                                num_failed_chunk += 1
                                if disk_idx == disk_id:
                                    fail_idx = idx
                            elif (disk_id / (self.nodes_per_rack *
                                             self.disks_per_node)) == rack_id:
                                num_alive_chunk_same_rack += 1
                                alive_chunk_same_rack.append(idx)
                        idx += 1

                    # this is a single-chunk repair
                    if num_failed_chunk == 1:
                        self.num_stripes_repaired_single_chunk += 1
                    # the repair for this stripe is delayed
                    if num_unavail_chunk > (self.n - self.k):
                        stripes_to_delay.append(stripe_id)

                    # RS
                    if self.placement.code_type == Placement.CODE_TYPE_RS:
                        if num_alive_chunk_same_rack < self.k:
                            cross_rack_download += (self.k - num_alive_chunk_same_rack)
                    # LRC
                    elif self.placement.code_type == Placement.CODE_TYPE_LRC:
                        if num_failed_chunk == 1:
                            # global parity
                            if fail_idx in self.placement.lrc_global_parity:
                                if num_alive_chunk_same_rack < self.k:
                                    cross_rack_download += self.k - num_alive_chunk_same_rack
                            # data chunk or local parity
                            else:
                                # find which group that the failed chunk is in
                                fail_gid = 0
                                for gid in xrange(self.l):
                                    if fail_idx in self.placement.lrc_data_group[gid] or \
                                       fail_idx == self.placement.lrc_local_parity[gid]:
                                        fail_gid = gid
                                        break
                                # find how many chunk in the same rack can be used for repair
                                num_alive_chunk_same_rack = 0
                                for each in alive_chunk_same_rack:
                                    if each in self.placement.lrc_data_group[fail_gid] or \
                                       each == self.placement.lrc_data_group[fail_gid]:
                                        num_alive_chunk_same_rack += 1
                                if num_alive_chunk_same_rack < self.k / self.l:
                                    cross_rack_download += self.k / self.l - num_alive_chunk_same_rack
                        else:
                            if num_alive_chunk_same_rack < self.k:
                                cross_rack_download += (self.k - num_alive_chunk_same_rack)
                    # DRC
                    elif self.placement.code_type == Placement.CODE_TYPE_DRC:
                        if num_failed_chunk == 1:
                            if self.k == 5 and self.n == 9:
                                cross_rack_download += 1.0
                            elif self.k == 6 and self.n == 9:
                                cross_rack_download += 2.0
                            else:
                                print "Only support DRC - (9,6,3), (9,5,3)"
                        else:
                            if num_alive_chunk_same_rack < self.k:
                                cross_rack_download += (self.k - num_alive_chunk_same_rack)
                    else:
                        print "Not correct code type in set_disk_repair()!"

                repair_bwth = self.network.get_avail_cross_rack_repair_bwth()
                self.network.update_avail_cross_rack_repair_bwth(0)
                repair_time = cross_rack_download * self.chunk_size / float(repair_bwth)  # seconds
                repair_time /= float(3600)  # hours
                if len(stripes_to_delay) != 0:
                    self.num_stripes_delayed += len(stripes_to_delay)
                    self.delayed_repair_dict[disk_idx] = stripes_to_delay
                self.logger.debug("repair_time = %d, repair_bwth = %d" %
                                  (repair_time, repair_bwth))
                heappush(self.events_queue,
                         (repair_time + curr_time, Disk.EVENT_DISK_REPAIR,
                          disk_idx, repair_bwth))

    ##
    # Generate permanent node failure event
    #
    def set_node_fail(self, node_idx, curr_time):
        heappush(self.events_queue, (self.node_fail_dists.draw() + curr_time,
                                     Node.EVENT_NODE_FAIL, node_idx))

    ##
    # Generate repair event for permanent node failure
    # The repair for the failed node is conducted by the repair for the failed disks on that node
    #
    def set_node_repair(self, node_idx, curr_time):
        for i in xrange(self.disks_per_node):
            disk_idx = node_idx * self.disks_per_node + i
            self.set_disk_repair(disk_idx, curr_time)

    ##
    # Generate transient node failure event
    #
    def set_node_transient_fail(self, node_idx, curr_time):
        heappush(self.events_queue,
                 (self.nodes[node_idx].node_transient_fail_distr.draw() + curr_time,
                  Node.EVENT_NODE_TRANSIENT_FAIL, node_idx))

    ##
    # Generate repair event for transient node failure
    #
    def set_node_transient_repair(self, node_idx, curr_time):
        heappush(self.events_queue,
                 (self.nodes[node_idx].node_transient_repair_distr.draw() + curr_time,
                  Node.EVENT_NODE_TRANSIENT_REPAIR, node_idx))

    ##
    # Generate transient rack failure
    #
    def set_rack_fail(self, rack_idx, curr_time):
        heappush(self.events_queue, (self.rack_fail_dists.draw() + curr_time,
                                     Rack.EVENT_RACK_FAIL, rack_idx))

    ##
    # Generate repair for transient rack failure
    #
    def set_rack_repair(self, rack_idx, curr_time):
        heappush(self.events_queue, (self.rack_repair_dists.draw() + curr_time,
                                     Rack.EVENT_RACK_REPAIR, rack_idx))

    ##
    # Get the next event from the event queue
    #
    def get_next_event(self, curr_time):
        self.logger.debug(
            "len(delayed_repair_dict) = %d, len(wait_repair_queue) = %d" %
            (len(self.delayed_repair_dict), len(self.wait_repair_queue)))

        # If there are some stripes delayed
        if len(self.delayed_repair_dict) != 0:
            items_to_remove = []  # keep the key of the items to remove
            for key in self.delayed_repair_dict:
                tmp_dict_value = []
                for stripe_id in self.delayed_repair_dict[key]:
                    repair_delay = False
                    num_unavail_chunk = 0
                    for disk_idx in self.placement.get_stripe_location(stripe_id):
                        if self.disks[disk_idx].state != Disk.STATE_NORMAL:
                            num_unavail_chunk += 1
                        if num_unavail_chunk > (self.n - self.k):
                            repair_delay = True
                            break
                    if repair_delay:
                        # stripe whose repair is delayed
                        tmp_dict_value.append(stripe_id)
                if len(tmp_dict_value) == 0:
                    items_to_remove.append(key)
                else:
                    self.delayed_repair_dict[key] = tmp_dict_value
            for key in items_to_remove:
                self.delayed_repair_dict.pop(key)

        # If there are some failed disks awaiting repair
        if len(self.wait_repair_queue) != 0:
            disk_id = self.wait_repair_queue[0][1]
            rack_id = disk_id / (self.nodes_per_rack * self.disks_per_node)
            if self.use_network and self.network.get_avail_cross_rack_repair_bwth() != 0 and \
               self.network.get_avail_intra_rack_repair_bwth(rack_id) != 0 and \
               self.racks[rack_id].get_curr_state() == Rack.STATE_RACK_NORMAL:
                heappop(self.wait_repair_queue)
                self.set_disk_repair(disk_id, curr_time)

        next_event = heappop(self.events_queue)
        next_event_time = next_event[0]
        next_event_type = next_event[1]
        if next_event_time > self.mission_time:
            return (next_event_time, None, None)

        device_idx_set = []
        device_idx_set.append(next_event[2])
        repair_bwth_set = []
        # If use network bandwidth to calculate repair_time
        if self.use_network and next_event_type == Disk.EVENT_DISK_REPAIR:
            repair_bwth_set.append(next_event[3])

        # Gather the events with the same occurring time and event type
        while self.events_queue[0][0] == next_event_time and \
              self.events_queue[0][1] == next_event_type:
            next_event = heappop(self.events_queue)
            device_idx_set.append(next_event[2])
            if self.use_network and next_event_type == Disk.EVENT_DISK_REPAIR:
                repair_bwth_set.append(next_event[3])

        # disk permanent failure
        if next_event_type == Disk.EVENT_DISK_FAIL:
            fail_time = next_event_time
            for device_idx in device_idx_set:
                # avoid the case that this disk is under repair
                if self.disks[device_idx].get_curr_state() != Disk.STATE_CRASHED:
                    if self.delayed_repair_dict.has_key(device_idx):
                        self.delayed_repair_dict.pop(device_idx)
                    # update the state of the disk
                    self.disks[device_idx].fail_disk(fail_time)
                    # generate the repair event
                    self.set_disk_repair(device_idx, fail_time)
            return (fail_time, Disk.EVENT_DISK_FAIL, device_idx_set)

        # node permanent failure
        elif next_event_type == Node.EVENT_NODE_FAIL:
            failed_disks_set = set([])
            fail_time = next_event_time
            for device_idx in device_idx_set:
                # avoid the case that the node is under repair
                if self.nodes[device_idx].get_curr_state() != Node.STATE_NODE_CRASHED:
                    # update the state of node
                    self.nodes[device_idx].fail_node(fail_time)
                    for i in xrange(self.disks_per_node):
                        disk_idx = device_idx * self.disks_per_node + i
                        failed_disks_set.add(disk_idx)
                        # avoid the case that the disk is under repair
                        if self.disks[disk_idx].get_curr_state() != Disk.STATE_CRASHED:
                            if self.delayed_repair_dict.has_key(device_idx):
                                self.delayed_repair_dict.pop(device_idx)
                            # update the state of the disk
                            self.disks[disk_idx].fail_disk(fail_time)
                            # generate the repair event
                            self.set_disk_repair(disk_idx, fail_time)
            return (fail_time, Node.EVENT_NODE_FAIL, failed_disks_set)

        # node transient failure
        elif next_event_type == Node.EVENT_NODE_TRANSIENT_FAIL:
            fail_time = next_event_time
            for device_idx in device_idx_set:
                if self.nodes[device_idx].get_curr_state() == Node.STATE_NODE_NORMAL:
                    # update the state of node
                    self.nodes[device_idx].offline_node()
                    for i in xrange(self.disks_per_node):
                        disk_id = device_idx * self.disks_per_node + i
                        if self.disks[disk_id].get_curr_state() == Disk.STATE_NORMAL:
                            # update the state of disk
                            self.disks[disk_id].offline_disk(fail_time)
                    # generate the repair event
                    if not self.use_trace:
                        self.set_node_transient_repair(device_idx, fail_time)
            return (fail_time, Node.EVENT_NODE_TRANSIENT_FAIL, None)

        # transient rack failure
        elif next_event_type == Rack.EVENT_RACK_FAIL:
            fail_time = next_event_time
            for device_idx in device_idx_set:
                if self.racks[device_idx].get_curr_state() == Rack.STATE_RACK_NORMAL:
                    # update the state of the rack
                    self.racks[device_idx].fail_rack(fail_time)
                    for i in xrange(self.nodes_per_rack):
                        # update the state of the node
                        node_idx = device_idx * self.nodes_per_rack + i
                        if self.nodes[node_idx].get_curr_state() == Node.STATE_NODE_NORMAL:
                            self.nodes[node_idx].offline_node()
                            for j in xrange(self.disks_per_node):
                                # update the state of the disk
                                disk_idx = node_idx * self.disks_per_node + j
                                if self.disks[disk_idx].get_curr_state() == Disk.STATE_NORMAL:
                                    self.disks[disk_idx].offline_disk(fail_time)
                    # generate the repair event
                    if not self.use_power_outage:
                        self.set_rack_repair(device_idx, fail_time)
            return (fail_time, Rack.EVENT_RACK_FAIL, None)

        # repair for permanent disk failure
        elif next_event_type == Disk.EVENT_DISK_REPAIR:
            repair_time = next_event_time
            for repair_disk_idx in device_idx_set:
                if self.disks[repair_disk_idx].get_curr_state() == Disk.STATE_CRASHED:
                    # update the state of the disk
                    self.disks[repair_disk_idx].repair_disk(repair_time)
                    # generate next permanent disk failure
                    self.set_disk_fail(repair_disk_idx, repair_time)
                    # if the repair event is caused by permanent node failure
                    node_idx = repair_disk_idx / self.disks_per_node
                    if self.nodes[node_idx].get_curr_state() == Node.STATE_NODE_CRASHED:
                        all_disk_ok = True
                        for i in xrange(self.disks_per_node):
                            disk = self.disks[node_idx * self.disks_per_node + i]
                            if disk.get_curr_state() != disk.STATE_NORMAL:
                                all_disk_ok = False
                                break
                        if all_disk_ok:
                            # update the state of the node
                            self.nodes[node_idx].repair_node()
                            # generate next permanent node failure
                            if not self.use_trace:
                                self.set_node_fail(node_idx, repair_time)

            # update the network status
            if self.use_network:
                idx = 0
                for repair_disk_idx in device_idx_set:
                    repair_bwth = repair_bwth_set[idx]
                    self.network.update_avail_cross_rack_repair_bwth(
                        self.network.get_avail_cross_rack_repair_bwth() + repair_bwth)
                    idx += 1
            # return the set of repaired disks
            return (repair_time, Disk.EVENT_DISK_REPAIR, device_idx_set)

        # repair for node transient failure
        elif next_event_type == Node.EVENT_NODE_TRANSIENT_REPAIR:
            repair_time = next_event_time
            for repair_node_idx in device_idx_set:
                # update the state of the node
                if self.nodes[repair_node_idx].get_curr_state() == Node.STATE_NODE_UNAVAILABLE:
                    self.nodes[repair_node_idx].online_node()
                    # update the state of the disk
                    for i in xrange(self.disks_per_node):
                        disk_id = repair_node_idx * self.disks_per_node + i
                        if self.disks[disk_id].get_curr_state() == Disk.STATE_UNAVAILABLE:
                            self.disks[disk_id].online_disk(repair_time)
                    # generate the next transient node failure
                    if not self.use_trace:
                        self.set_node_transient_fail(repair_node_idx, repair_time)
            return (repair_time, Node.EVENT_NODE_TRANSIENT_REPAIR, None)

        # repair for rack transient failure
        elif next_event_type == Rack.EVENT_RACK_REPAIR:
            repair_time = next_event_time
            for repair_rack_idx in device_idx_set:
                if self.racks[repair_rack_idx].get_curr_state() == Rack.STATE_RACK_UNAVAILABLE:
                    # update the state of the rack
                    self.racks[repair_rack_idx].repair_rack()
                    for i in xrange(self.nodes_per_rack):
                        node_idx = repair_rack_idx * self.nodes_per_rack + i
                        # update the state of the node
                        if self.nodes[node_idx].get_curr_state() == Node.STATE_NODE_UNAVAILABLE:
                            self.nodes[node_idx].online_node()
                            for j in xrange(self.disks_per_node):
                                disk_idx = node_idx * self.disks_per_node + j
                                # update the state of the disk
                                if self.disks[disk_idx].get_curr_state() == Disk.STATE_UNAVAILABLE:
                                    self.disks[disk_idx].online_disk(repair_time)
                    # generate the next transient rack failure
                    if not self.use_power_outage:
                        self.set_rack_fail(repair_rack_idx, repair_time)
            return (repair_time, Rack.EVENT_RACK_REPAIR, None)

        else:
            self.logger.error('Wrong type of next_event in get_next_event()!')
            return None

    ##
    # Run an iteration of the simulator
    #
    def run_iteration(self, ite=0):
        self.reset()
        curr_time = 0
        self.logger.info(
            "Regular Simulator: begin an iteration %d, num_failed_disks = %d, "
            "avail_cross_rack_bwth = %d" %
            (ite, len(self.state.get_failed_disks()),
             self.network.get_avail_cross_rack_repair_bwth()))

        while True:
            (event_time, event_type, disk_id_set) = self.get_next_event(curr_time)
            curr_time = event_time
            if curr_time > self.mission_time:
                break

            # update the whole status
            if not self.state.update_state(event_type, disk_id_set):
                self.logger.error('update_state failed!')
            if event_type != None:
                self.logger.debug(
                    "Time %s, Event type: %s, Number of failed disks: %s\n" %
                    (event_time, event_type, self.state.get_num_failed_disks()))

            # Check durability when disk_failure/node_failure happens
            if event_type == Disk.EVENT_DISK_FAIL or event_type == Node.EVENT_NODE_FAIL:
                if ite == 1:
                    self.logger.info(
                        "Time %s, Event type: %s, Number of failed disks: %s\n" %
                        (event_time, event_type, self.state.get_num_failed_disks()))
                failed_disks = self.state.get_failed_disks()
                if self.placement.check_data_loss(failed_disks):
                    # the number of failed stripes and the number of lost chunks
                    (num_failed_stripes, num_lost_chunks) = \
                        self.placement.get_num_failed_status(failed_disks)
                    # Count in the delayed stripes
                    if len(self.delayed_repair_dict) != 0:
                        for key in self.delayed_repair_dict:
                            num_failed_stripes += len(self.delayed_repair_dict[key])
                            num_lost_chunks += len(self.delayed_repair_dict[key])

                    # Calculate blocked ratio
                    sum_unavail_time = 0
                    for disk_id in xrange(self.num_disks):
                        sum_unavail_time += self.disks[disk_id].get_unavail_time(curr_time) * \
                            self.placement.get_num_chunks_per_disk(disk_id)
                    blocked_ratio = sum_unavail_time / (self.placement.num_chunks * curr_time)

                    # Calculate the single-chunk repair ratio
                    single_chunk_repair_ratio = 0
                    self.logger.info(
                        "num_stripes_repaired_single_chunk = %d, num_stripes_repaired = %d" %
                        (self.num_stripes_repaired_single_chunk, self.num_stripes_repaired))
                    if self.num_stripes_repaired != 0:
                        single_chunk_repair_ratio = float(self.num_stripes_repaired_single_chunk) / \
                            float(self.num_stripes_repaired)

                    return (1, "(%d, %d, %f, %f)" %
                            (num_failed_stripes, num_lost_chunks,
                             blocked_ratio, single_chunk_repair_ratio))

        # No data loss
        # Calculate blocked ratio
        sum_unavail_time = 0
        for disk_id in xrange(self.num_disks):
            sum_unavail_time += self.disks[disk_id].get_unavail_time(self.mission_time) * \
                self.placement.get_num_chunks_per_disk(disk_id)
        blocked_ratio = sum_unavail_time / (self.placement.num_chunks * self.mission_time)

        # Calculate the single-chunk repair ratio
        single_chunk_repair_ratio = 0
        if self.num_stripes_repaired != 0:
            single_chunk_repair_ratio = float(self.num_stripes_repaired_single_chunk) / \
                float(self.num_stripes_repaired)

        return (0, "(0, 0, %f, %f)" % (blocked_ratio, single_chunk_repair_ratio))
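# RegularSimulation.run_iteration() above returns (1, "...") when data loss occurs
# within the mission time and (0, "...") otherwise, so a Monte Carlo driver can
# estimate the probability of data loss as the mean of the returned flags.
# Minimal sketch, assuming `sim` is an already-constructed RegularSimulation
# (its constructor arguments are project-specific and omitted here):
def estimate_data_loss_probability(sim, num_iterations=1000):
    losses = 0
    for ite in range(num_iterations):
        flag, info = sim.run_iteration(ite)  # flag is 1 on data loss, 0 otherwise
        losses += flag
    return float(losses) / num_iterations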
'''
if False:
    ####### Placement routine.
    object_height = 0.1
    #SCALE = 1
    resolution = [.01*SCALE, .01*SCALE]  # sets resolution of occupancy grid
    print 'NOTE: Resolution is ', 100*resolution[0], 'cm'

    ###
    polygon = label_object()
    polygon.add_point([0, 0])
    polygon.add_point([0, 5*SCALE])
    polygon.add_point([10*SCALE, 5*SCALE])
    polygon.add_point([10*SCALE, 0])
    ###object_height = 0.1

    print 'creating placement object'
    pl = Placement(pc, resolution)  ###REPLACE WITH MY OWN CLASS DEFINITION WITH FUNCTIONs

    if displayOn:
        placement_point = pl.test_placement(polygon, object_height)
    else:
        placement_point = pl.find_placement(polygon, object_height)  # Add param True to get debug popups

    placement_point -= pc.scan_dataset.ground_plane_translation  # Assumes 'codyRobot'==ROBOT  # This should be optional

    ### Formerly, the robot would reach out and place object at this point
    #import mekabot.coord_frames as mcf
    #placement_point_global = mcf.thok0Tglobal(placement_point)
    print 'placement point in global coordinate frame:', placement_point_global.T
class UnifBFBSimulation(Simulation):
    ##
    # __init__() from Simulation
    #

    ##
    # Initialize UnifBFBSimulation
    #
    def init(self):
        self.logger = logging.getLogger(__name__)
        # self.logger.setLevel(logging.ERROR)
        self.logger.setLevel(logging.INFO)
        # self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(console)
        self.logger.propagate = False

        # Failure biasing prob
        self.fb_prob = float(self.is_parms.fb_prob)
        # Arrival rate of homogeneous Poisson process, beta
        self.poisson_rate = float(self.is_parms.beta)
        # Likelihood ratio
        self.lr = float(1.)
        self.logger.debug(
            "UnifBFBSimulation init() - fb_prob = %.6f, poisson_rate = %.6f",
            self.fb_prob, self.poisson_rate)

    ##
    # Reset the simulator
    #
    def reset(self):
        # Reset clocks and state for each disk
        for disk in self.disks:
            disk.init_clock(0)
            disk.init_state()
        # Reset clocks and state for each node
        for node in self.nodes:
            node.init_clock(0)
            node.init_state()
        # Reset clocks and state for each rack
        for rack in self.racks:
            rack.init_state()
        # Reset system state
        self.state = State(self.num_disks, self.num_nodes)
        # Reset repair queue
        self.repair_queue = []
        # Regenerate new placement
        self.placement = Placement(self.num_racks, self.nodes_per_rack,
                                   self.disks_per_node, self.capacity_per_disk,
                                   self.num_stripes, self.chunk_size,
                                   self.code_type, self.n, self.k,
                                   self.place_type, self.chunk_rack_config,
                                   self.l)
        # Reset LR
        self.lr = float(1.)
        self.total_failure_rate = 0.
        self.total_failure_rate_cnt = 0
        self.total_repair_rate = 0.
        self.total_repair_rate_cnt = 0

    ##
    # Get failure rate
    #
    def get_failure_rate(self):
        fail_rate = float(0)
        for disk in self.disks:
            fail_rate += disk.curr_disk_fail_rate()
        for node in self.nodes:
            fail_rate += node.curr_node_fail_rate()
        # self.logger.debug("get_failure_rate(): fail_rate = %.6f", fail_rate)
        # print("get_failure_rate(): fail_rate = %.6f" % fail_rate)
        return fail_rate

    ##
    # Get the probability of node failure
    # To decide whether a failure event is node failure or disk failure
    #
    def get_node_failure_prob(self):
        comp_fail_rate = float(0)
        node_fail_rate = float(0)
        for disk in self.disks:
            comp_fail_rate += disk.curr_disk_fail_rate()
        for node in self.nodes:
            node_fail_rate += node.curr_node_fail_rate()
        return node_fail_rate / (node_fail_rate + comp_fail_rate)

    ##
    # Calculate the repair time for a failed component
    # The repair time = the amount of cross_rack data to download / cross_rack bandwidth
    #
    def get_disk_repair_duration(self, disk_idx):
        if not self.use_network:
            # get the repair time from a pre-defined repair distribution
            return self.disk_repair_dists.draw()
        else:
            # repair time = cross-rack repair traffic / available cross-rack bandwidth
            rack_id = disk_idx / (self.nodes_per_rack * self.disks_per_node)
            cross_rack_download = 0
            stripes_to_repair = self.placement.get_stripes_to_repair(disk_idx)
            # self.num_stripes_repaired += len(stripes_to_repair)
            # stripes_to_delay = []
            # print("len(stripes_to_repair) = %d" % len(stripes_to_repair))

            # for each stripe to repair
            for stripe_id in stripes_to_repair:
                num_failed_chunk = 0
                num_alive_chunk_same_rack = 0
                idx = 0
                fail_idx = 0
                alive_chunk_same_rack = []
                # check the status of each chunk in the stripe
                for disk_id in self.placement.get_stripe_location(stripe_id):
                    # for RS, DRC
                    if self.placement.code_type != Placement.CODE_TYPE_LRC:
                        if self.disks[disk_id].get_curr_state() == Disk.STATE_CRASHED:
                            num_failed_chunk += 1
                        elif (disk_id / (self.nodes_per_rack *
                                         self.disks_per_node)) == rack_id:
                            num_alive_chunk_same_rack += 1
                    # for LRC
                    else:
                        if self.disks[disk_id].get_curr_state() == Disk.STATE_CRASHED:
                            num_failed_chunk += 1
                            if disk_idx == disk_id:
                                fail_idx = idx
                        elif (disk_id / (self.nodes_per_rack *
                                         self.disks_per_node)) == rack_id:
                            num_alive_chunk_same_rack += 1
                            alive_chunk_same_rack.append(idx)
                    idx += 1

                # # this is a single-chunk repair
                # if num_failed_chunk == 1:
                #     self.num_stripes_repaired_single_chunk += 1

                # RS
                if self.placement.code_type == Placement.CODE_TYPE_RS:
                    if num_alive_chunk_same_rack < self.k:
                        cross_rack_download += (self.k - num_alive_chunk_same_rack)
                # LRC
                elif self.placement.code_type == Placement.CODE_TYPE_LRC:
                    if num_failed_chunk == 1:
                        # global parity
                        if fail_idx in self.placement.lrc_global_parity:
                            if num_alive_chunk_same_rack < self.k:
                                cross_rack_download += self.k - num_alive_chunk_same_rack
                        # data chunk or local parity
                        else:
                            # find which group that the failed chunk is in
                            fail_gid = 0
                            for gid in xrange(self.l):
                                if fail_idx in self.placement.lrc_data_group[gid] or \
                                   fail_idx == self.placement.lrc_local_parity[gid]:
                                    fail_gid = gid
                                    break
                            # find how many chunk in the same rack can be used for repair
                            num_alive_chunk_same_rack = 0
                            for each in alive_chunk_same_rack:
                                if each in self.placement.lrc_data_group[fail_gid] or \
                                   each == self.placement.lrc_data_group[fail_gid]:
                                    num_alive_chunk_same_rack += 1
                            if num_alive_chunk_same_rack < self.k / self.l:
                                cross_rack_download += self.k / self.l - num_alive_chunk_same_rack
                    else:
                        if num_alive_chunk_same_rack < self.k:
                            cross_rack_download += (self.k - num_alive_chunk_same_rack)
                # DRC
                elif self.placement.code_type == Placement.CODE_TYPE_DRC:
                    if num_failed_chunk == 1:
                        if self.k == 5 and self.n == 9:
                            cross_rack_download += 1.0
                        elif self.k == 6 and self.n == 9:
                            cross_rack_download += 2.0
                        else:
                            print "Only support DRC - (9,6,3), (9,5,3)"
                    else:
                        if num_alive_chunk_same_rack < self.k:
                            cross_rack_download += (self.k - num_alive_chunk_same_rack)
                else:
                    print "Not correct code type in set_disk_repair()!"

            repair_duration = cross_rack_download * self.chunk_size / \
                float(self.network.get_avail_cross_rack_repair_bwth())  # seconds
            # print "repair_time = %.1f" % (repair_duration / 3600.)
            # print("repair_duration = %.10f, cross_rack_download=%d" % \
            #       (repair_duration / 3600., cross_rack_download))
            if repair_duration != 0:
                self.total_repair_rate += 3600. / repair_duration
                self.total_repair_rate_cnt += 1
            return repair_duration / 3600.  # hours

    def get_earliest_repair_time(self, curr_time):
        earliest_repair_time = curr_time
        if len(self.repair_queue) > 0:
            for repair_event in self.repair_queue:
                repair_event_time = repair_event[0]
                if repair_event_time > earliest_repair_time:
                    earliest_repair_time = repair_event_time
        return earliest_repair_time

    ##
    # Set next repair time for disk indexed with disk_index
    #
    def set_disk_repair(self, disk_idx, curr_time):
        heappush(self.repair_queue,
                 (self.get_disk_repair_duration(disk_idx) +
                  self.get_earliest_repair_time(curr_time),
                  Disk.EVENT_DISK_REPAIR, disk_idx))

    ##
    # Set new node repair time for node node_idx
    #
    def set_node_repair(self, node_idx, curr_time):
        node_repair_duration = 0
        # Get the repair duration of each disk on this node
        for i in xrange(self.disks_per_node):
            disk_idx = self.disks_per_node * node_idx + i
            node_repair_duration += self.get_disk_repair_duration(disk_idx)
        heappush(
            self.repair_queue,
            (node_repair_duration + self.get_earliest_repair_time(curr_time),
             Node.EVENT_NODE_REPAIR, node_idx))

    ##
    # Get the next event in UnifBFBSimulation
    #
    def get_next_event(self, curr_time):
        # Update clock for each disk
        for disk in self.disks:
            disk.update_clock(curr_time)
        # Update clock for each node
        for node in self.nodes:
            node.update_clock(curr_time)

        # If not in a failed state, then draw for next failure
        if self.state.get_sys_state() == self.state.CURR_STATE_OK:
            failure_queue = []
            for each_disk in range(self.num_disks):
                failure_queue.append(
                    (self.disks[each_disk].disk_fail_distr.draw_inverse_transform(
                        self.disks[each_disk].read_clock()) + curr_time,
                     Disk.EVENT_DISK_FAIL, each_disk))
            for each_node in range(self.num_nodes):
                failure_queue.append(
                    (self.nodes[each_node].node_fail_distr.draw_inverse_transform(
                        self.nodes[each_node].read_clock()) + curr_time,
                     Node.EVENT_NODE_FAIL, each_node))
            heapify(failure_queue)

            (next_event_time, next_event_type,
             next_event_subsystem) = heappop(failure_queue)
            if next_event_type == Disk.EVENT_DISK_FAIL:
                self.disks[next_event_subsystem].fail_disk(next_event_time)
                self.set_disk_repair(next_event_subsystem, next_event_time)
            elif next_event_type == Node.EVENT_NODE_FAIL:
                self.nodes[next_event_subsystem].fail_node(next_event_time)
                for each_disk_on_this_node in range(
                        next_event_subsystem * self.disks_per_node,
                        (next_event_subsystem + 1) * self.disks_per_node):
                    self.disks[each_disk_on_this_node].fail_disk(next_event_time)
                self.set_node_repair(next_event_subsystem, next_event_time)
            else:
                self.logger.error(
                    "UnifBFBSimulation - get_next_event(): wrong next_event_type!")
            return (next_event_time, next_event_type, next_event_subsystem)

        elif self.state.get_sys_state() == self.state.CURR_STATE_DEGRADED:
            if not self.repair_queue:
                self.logger.error(
                    "UnifBFBSimulation - get_next_event(): repair_queue is empty!")
                sys.exit(2)
            (repair_time, repair_event, subsystem_idx) = self.repair_queue[0]
            next_event_time = nprandom.exponential(self.poisson_rate) + curr_time

            if repair_time <= next_event_time:
                heappop(self.repair_queue)
                if repair_event == Disk.EVENT_DISK_REPAIR:
                    self.disks[subsystem_idx].repair_disk(repair_time)
                    return (repair_time, Disk.EVENT_DISK_REPAIR, subsystem_idx)
                elif repair_event == Node.EVENT_NODE_REPAIR:
                    self.nodes[subsystem_idx].repair_node()
                    for i in range(self.disks_per_node):
                        disk_idx = subsystem_idx * self.disks_per_node + i
                        self.disks[disk_idx].repair_disk(repair_time)
                    return (repair_time, Node.EVENT_NODE_REPAIR, subsystem_idx)
                else:
                    self.logger.error(
                        "UnifBFBSimulation - get_next_event(): wrong repair_event!")

            for disk in self.disks:
                disk.update_clock(next_event_time)
            for node in self.nodes:
                node.update_clock(next_event_time)

            self.total_failure_rate += self.get_failure_rate()
            self.total_failure_rate_cnt += 1

            draw = nprandom.uniform()
            # Determine whether it is a "real" event or "pseudo" event
            if draw > self.fb_prob:
                # It is a pseudo event
                old_lr = self.lr
                self.lr *= (1. - self.get_failure_rate() / self.poisson_rate) / \
                    (1. - self.fb_prob)
                self.logger.debug(
                    "get_next_event(): pseudo event - old_lr = %.10f, update, lr = %.10f",
                    old_lr, self.lr)
                # Return nothing because we are staying in the current state
                return (next_event_time, None, None)
            else:
                # Randomly fail a disk or node
                # prob_node_failure = self.get_node_failure_prob()
                if nprandom.uniform() > self.get_node_failure_prob():
                    # disk failure
                    avail_disks = self.state.get_avail_disks()
                    fail_disk_idx = avail_disks[random.randint(0, len(avail_disks) - 1)]
                    old_lr = self.lr
                    # self.lr *= (self.disks[fail_disk_idx].curr_disk_fail_rate() / self.poisson_rate) \
                    #     / (self.fb_prob * (1 - prob_node_failure) / len(avail_disks))
                    # The above equation equals to the following
                    self.lr *= (self.get_failure_rate() / self.poisson_rate) / self.fb_prob
                    self.logger.debug(
                        "get_next_event(): disk failure event, lr = %.10f, update, lr = %.10f",
                        old_lr, self.lr)
                    self.disks[fail_disk_idx].fail_disk(next_event_time)
                    self.set_disk_repair(fail_disk_idx, next_event_time)
                    return (next_event_time, Disk.EVENT_DISK_FAIL, fail_disk_idx)
                else:
                    avail_nodes = self.state.get_avail_nodes()
                    fail_node_idx = avail_nodes[random.randint(0, len(avail_nodes) - 1)]
                    old_lr = self.lr
                    # self.lr *= (self.nodes[fail_node_idx].curr_node_fail_rate() / self.poisson_rate) \
                    #     / (self.fb_prob * prob_node_failure / len(avail_nodes))
                    # The above equation equals to the following
                    self.lr *= (self.get_failure_rate() / self.poisson_rate) / self.fb_prob
                    self.logger.debug(
                        "get_next_event(): node failure event - old_lr = %.10f, update, lr = %.10f",
                        old_lr, self.lr)
                    # Update internal node state
                    self.nodes[fail_node_idx].fail_node(next_event_time)
                    for each_disk_on_failed_node in range(
                            fail_node_idx * self.disks_per_node,
                            (fail_node_idx + 1) * self.disks_per_node):
                        self.disks[each_disk_on_failed_node].fail_disk(next_event_time)
                    # Schedule repair for the failed node
                    self.set_node_repair(fail_node_idx, next_event_time)
                    return (next_event_time, Node.EVENT_NODE_FAIL, fail_node_idx)

    ##
    # Run an iteration in UnifBFBSimulation
    #
    def run_iteration(self, ite=0):
        self.reset()
        curr_time = 0
        self.logger.info(
            "UnifBFBSimulator: begin an iteration %d, num_failed_disks = %d, "
            "avail_cross_rack_bwth = %d" %
            (ite, len(self.state.get_failed_disks()),
             self.network.get_avail_cross_rack_repair_bwth()))

        while True:
            (event_time, event_type, subsystem_idx) = self.get_next_event(curr_time)
            curr_time = event_time
            if event_time > self.mission_time:
                break

            if event_type != None:
                self.logger.debug(
                    "Time: %.3f, event = %s, subsystem = %d, "
                    "number_failed_disks = %d, number_failed_nodes = %d" %
                    (event_time, event_type, subsystem_idx,
                     self.state.get_num_failed_disks(),
                     self.state.get_num_failed_nodes()))

            if not self.state.update_state_unifbfb(event_type, subsystem_idx):
                self.logger.error('Update_state_unifbfb failed!')

            # Check durability when disk failure or node failure happens
            if event_type == Disk.EVENT_DISK_FAIL or event_type == Node.EVENT_NODE_FAIL:
                failed_disks = self.state.get_failed_disks()
                if self.placement.check_data_loss(failed_disks):
                    self.logger.debug(
                        "===== END of one iteration, self.lr = %.10f",
                        min(self.lr, 1))
                    (num_failed_stripes, num_lost_chunks) = \
                        self.placement.get_num_failed_status(failed_disks)
                    self.logger.info("avg_failure_rate = %.6f" %
                                     (self.total_failure_rate / self.total_failure_rate_cnt))
                    self.logger.info("avg_repair_rate = %.6f" %
                                     (self.total_repair_rate / self.total_repair_rate_cnt))
                    return (min(self.lr, 1),
                            "(%d, %d, 0, 0)" % (num_failed_stripes, num_lost_chunks))

        # No data loss
        self.logger.debug("END of one iteration, self.lr = 0 because no data loss")
        return (0, "(0, 0, 0, 0)")
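# UnifBFBSimulation.run_iteration() above returns the (capped) likelihood ratio
# when data loss is detected and 0 otherwise, so under uniformized failure biasing
# the probability of data loss can be estimated as the mean of the returned ratios.
# Sketch only, assuming `sim` is an already-constructed UnifBFBSimulation:
def estimate_unreliability(sim, num_iterations=10000):
    total_lr = 0.0
    for ite in range(num_iterations):
        lr, info = sim.run_iteration(ite)  # lr > 0 only on data-loss iterations
        total_lr += lr
    return total_lr / num_iterations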
print "# ble", index, "pin", pin, ":", selection, "(", subblock[ pin], ")" inputs[index * self.bitgen.inputs + pin - 1] = selection self.bitgen.gen_lb(inputs, functions, flops) if __name__ == '__main__': import sys if len(sys.argv) != 6: sys.stderr.write( "usage: {:s} <placement.out> <routing.out> <netlist.net> <logic.blif>\n" .format(sys.argv[0])) sys.exit(1) placement = Placement(sys.argv[1]) routing = Routing(sys.argv[2]) netlist = NET(sys.argv[3]) blif = BLIF(sys.argv[4]) tracks = int(sys.argv[5]) / 2 bitgen = Bitgen(cluster_size=4, ble_inputs=6, lb_inputs_per_side=4, tracks_per_direction=tracks, mux_size=5) fpga = FPGA(placement, routing, netlist, blif, bitgen) fpga.generate()
class Simulate:
    def __init__(self, mission_time, plus_one, num_servers, num_disks_per_server,
                 num_spares_per_server, k, m, fb, dp_type, failure_type, mtbf,
                 failure_percent, rebuildIO, slaTime, copybackIO, diskCap, useRatio):
        #---------------------------
        # compressed time window
        #---------------------------
        self.mission_time = mission_time
        #---------------------------
        # system and placement
        #---------------------------
        self.sys = Campaign(plus_one, num_servers, num_disks_per_server,
                            num_spares_per_server, k, m, fb, dp_type, diskCap,
                            useRatio)
        self.place = Placement(self.sys)
        #--------------------------------------
        # fast rebuild + copyback phases
        #--------------------------------------
        self.rebuild = Rebuild(self.sys, rebuildIO)
        self.copyback = Copyback(copybackIO, slaTime)
        #--------------------------------------
        # failures distribution and mtbf
        #--------------------------------------
        self.mtbf = mtbf
        self.failure_type = failure_type
        self.failure_percent = failure_percent

    def reset(self):
        #----------------------------------------------
        # failures arrive by using poisson distribution
        #----------------------------------------------
        if self.failure_type == 0:
            trace = Poisson(self.sys.num_disks, self.failure_percent, self.mtbf)
        if self.failure_type == 1:
            trace = Exponential(self.sys.num_disks, self.failure_percent, self.mtbf)
        if self.failure_type == 2:
            trace = Batch(self.sys.num_disks, self.failure_percent, self.mtbf,
                          cascade_factor=10.0)
        self.trace_entry = trace.generate_failures()

        #------------------------------------------
        # put the disk failures in the event queue
        #------------------------------------------
        self.events_queue = []
        for disk_fail_time, diskId in self.trace_entry:
            heappush(self.events_queue, (disk_fail_time, Disk.EVENT_FAIL, diskId))
            print ">>>>> reset disk", diskId, Disk.EVENT_FAIL, "@", disk_fail_time
            self.mission_time = disk_fail_time
        print " - system mission time - ", self.mission_time

        #------------------------------
        # initialize the system state
        #------------------------------
        self.state = State(self.sys, self.rebuild, self.copyback, self.events_queue)

    def get_next_wait_events(self):
        events = []
        #---------------------------------------------------------------------------------------
        if self.sys.dp_type == 0 or self.sys.dp_type == 1 or self.sys.dp_type == 2:
            #---------------------------------------------------------------------------------------
            for serverId in self.sys.servers:
                if self.state.servers[serverId].wait_queue:
                    avail_spares = self.state.servers[serverId].avail_spares
                    while avail_spares and self.state.servers[serverId].wait_queue:
                        print "\n@wait_queue in server [", serverId, "] avail spares:", \
                            self.state.servers[serverId].avail_spares
                        deviceset = []
                        next_event = heappop(self.state.servers[serverId].wait_queue)
                        #------------------------------------------
                        next_event_time = next_event[0]
                        next_event_type = next_event[1]
                        deviceset.append(next_event[2])
                        avail_spares -= 1
                        while self.state.servers[serverId].wait_queue and \
                              self.state.servers[serverId].wait_queue[0][0] == next_event_time and \
                              self.state.servers[serverId].wait_queue[0][1] == next_event_type and \
                              avail_spares > 0:
                            simultaneous_event = heappop(self.state.servers[serverId].wait_queue)
                            deviceset.append(simultaneous_event[2])
                            avail_spares -= 1
                        print ">>>>> pop server wait disk", deviceset, next_event_type, \
                            " - time - ", next_event_time
                        events.append((next_event_time, next_event_type, deviceset))
        return events

    def get_next_events(self):
        #--------------------------------------------------------------
        wait_events = self.get_next_wait_events()
        if len(wait_events) > 0:
            return wait_events
        #--------------------------------------------------------------
        if self.events_queue:
            deviceset = []
            next_event = heappop(self.events_queue)
            #------------------------------------------
            next_event_time = next_event[0]
            next_event_type = next_event[1]
            deviceset.append(next_event[2])
            #----------------------------------------------
            # gather the simultaneous failure/repair events
            #----------------------------------------------
            while self.events_queue and self.events_queue[0][0] == next_event_time and \
                  self.events_queue[0][1] == next_event_type:
                simultaneous_event = heappop(self.events_queue)
                deviceset.append(simultaneous_event[2])
            print "\n\n>>>>> pop next event -", deviceset, next_event_type, next_event_time
            return [(next_event_time, next_event_type, deviceset)]
        else:
            return [(None, None, None)]

    def run_simulation(self, iterations_per_worker, traces_per_worker):
        results = []
        for one_iter in range(iterations_per_worker):
            results.append(self.run_iteration(one_iter))
        return results

    def run_iteration(self, num_iter):
        self.reset()
        curr_time = 0
        loss = 0
        loopflag = True
        eventDL = 0
        while loopflag:
            for each_event in self.get_next_events():
                (event_time, event_type, deviceset) = each_event
                #-----------------------------
                # if invalid event, then exit
                #-----------------------------
                if event_time == None:
                    loopflag = False
                    break
                #----------------------------------
                # update the system time and state
                #----------------------------------
                if curr_time < event_time:
                    curr_time = event_time
                #---------------------------
                # exceed mission-time, exit
                #---------------------------
                if curr_time > self.mission_time:
                    loopflag = False
                    loss = self.place.calculate_dataloss(self.state)
                    break
                #----------------------------------
                self.state.update_clock(event_type, curr_time)
                self.state.update_state(event_type, deviceset)
                self.state.update_event(event_type, deviceset)
                #-------------------------------------------------------
                # degraded rebuild or copyback event, continue
                #-------------------------------------------------------
                if event_type == Disk.EVENT_DEGRADEDREBUILD or event_type == Disk.EVENT_COPYBACK:
                    continue
                #------------------------------------------
                # check the PDL according to failure events
                #------------------------------------------
                if event_type == Disk.EVENT_FAIL:
                    eventDL = eventDL + 1
                    if self.place.check_global_dataloss(self.state, deviceset):
                        print "############### data loss ##############", eventDL, \
                            "deviceset", deviceset, curr_time, \
                            ">>> unrecoverables - ", self.state.MTTDL, "\n"
        return (self.state.MTTDL, loss)
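# Illustrative driver for the Simulate class above; every parameter value below is
# a placeholder chosen for the example, not taken from the source. run_simulation()
# returns one (MTTDL, loss) tuple per iteration.
if __name__ == '__main__':
    sim = Simulate(mission_time=87600, plus_one=0, num_servers=4,
                   num_disks_per_server=30, num_spares_per_server=2,
                   k=8, m=2, fb=1, dp_type=0, failure_type=1, mtbf=1000000,
                   failure_percent=1.0, rebuildIO=100, slaTime=24,
                   copybackIO=50, diskCap=4000, useRatio=0.75)
    results = sim.run_simulation(iterations_per_worker=10, traces_per_worker=1)
    for mttdl, loss in results:
        print mttdl, loss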