def printAll(self):
    """Log the default configuration, then placement- and recovery-specific extras."""
    segments = [
        "Default Configurations: \t total_time: ", str(self.total_time),
        ", disk capacity: ", str(self.disk_capacity), "TB",
        ", disks per machine: ", str(self.disks_per_machine),
        ", machines per rack: ", str(self.machines_per_rack),
        ", rack count: ", str(self.rack_count),
        ", chunk size: ", str(self.chunk_size), "MB",
        ", total active storage: ", str(self.total_active_storage), "PB",
        ", data redundancy: ", self.data_redundancy,
        ", data placement: ", self.data_placement,
        ", recovery bandwidth cross rack: ", str(self.recovery_bandwidth_cross_rack),
        ", installment size: ", str(self.installment_size),
        ", event file path: ", self.event_file,
        ", outputs: ", str(self.outputs),
        ", auto repair: ", str(self.auto_repair),
        ", hierarchical: ", str(self.hierarchical),
        ", parallel repair: ", str(self.parallel_repair),
        ", lazy recovery flag: ", str(self.lazy_recovery),
        ", lazy only available: ", str(self.lazy_only_available),
        ", recovery threshold: ", str(self.recovery_threshold),
        ", rafi recovery flag: ", str(self.rafi_recovery),
    ]
    # Placement- and hierarchy-specific knobs only exist in those modes.
    if self.data_placement == "copyset":
        segments.extend([", scatter width: ", str(self.scatter_width)])
    if self.hierarchical:
        segments.extend([", distinct racks: ", str(self.distinct_racks)])
    info_logger.info("".join(segments))

    recovery_infos = ""
    if self.rafi_recovery:
        recovery_infos += " detect intervals: " + str(self.detect_intervals)
    # NOTE(review): when only lazy_recovery is set this logs an empty
    # string, same as the original behavior -- confirm intent.
    if self.lazy_recovery or self.rafi_recovery:
        info_logger.info(recovery_infos)
def run(self):
    """Run one simulation iteration.

    Builds the storage topology from the configuration, schedules
    upgrade / correlated-failure / scaling events plus the regular
    failure events, then drains the event queue through the handler.
    Returns the handler's Result object.
    """
    conf = Configuration(self.conf_path)
    xml = XMLParser(conf)
    # Hierarchical layouts use their own distributer implementation.
    if conf.hier:
        self.distributer = HierSSSDistribute(xml)
    else:
        self.distributer = SSSDistribute(xml)
    self.conf = self.distributer.returnConf()
    self.event_handler = EventHandler
    self.distributer.start()
    events_handled = 0
    events = EventQueue()

    if self.conf.system_upgrade:
        for info in self.conf.system_upgrade_infos:
            # info[0] == 1 selects this upgrade style; info[-1] (when set)
            # drives additional upgrade-check events.
            if info[0] == 1:
                upgrade_start_times = self.addSystemUpgrade(info, self.conf.total_time)
                if info[-1] is not None:
                    self.addUpgradeCheckEvents(events, upgrade_start_times, info[-1])
    if self.conf.correlated_failures:
        for info in self.conf.correlated_failures_infos:
            # Replay the correlated-failure pattern once per year
            # (8760 hours) for ten years; copy so the shifted start time
            # does not mutate the configured template.
            for i in xrange(10):
                cf_info = deepcopy(list(info))
                cf_info[0] += i * 8760
                print "correlated_failures info:", cf_info
                self.addCorrelatedFailures(cf_info)
    if self.conf.system_scaling:
        for info in self.conf.system_scaling_infos:
            self.addSystemScaling(info)

    info_logger.info("disk usage is: " + str(self.distributer.diskUsage()*100) + "%\n")
    self.distributer.getRoot().printAll()
    root = self.distributer.getRoot()
    # Generate the regular failure/recovery events over the whole horizon.
    root.generateEvents(events, 0, self.conf.total_time, True)

    # Full-system check at each configured upgrade timestamp.
    for ts in self.conf.upgrade_ts:
        full_system_check_event = Event(Event.EventType.UpgradeCheck, ts, root, 6)
        events.addEvent(full_system_check_event)

    # Optionally dump the event queue to a per-iteration file.
    if self.conf.event_file != None:
        events_file = self.conf.event_file + '-' + self.ts
        events.printAll(events_file, "Iteration number: "+str(self.iteration_times))
    self.iteration_times += 1

    handler = self.event_handler(self.distributer)
    print "total slices:", handler.total_slices
    # Drain the queue: handlers may enqueue follow-up events while running.
    e = events.removeFirst()
    while e is not None:
        handler.handleEvent(e, events)
        e = events.removeFirst()
        events_handled += 1
    self.total_events_handled += events_handled

    result = handler.end()
    info_logger.info(result.toString())
    return result
def handleLatentDefect(self, u, time, e):
    """Handle a latent sector error (LSE) event on a disk.

    Picks one random slice hosted on disk *u*, marks the hit block as
    LatentError, and updates availability/durability bookkeeping.
    Raises if *u* is not a Disk.
    """
    current_total_slices = self.calCurrentTotalSlices(time)
    if isinstance(u, Disk):
        slice_count = len(u.getChildren())
        if slice_count == 0:
            return
        self._my_assert(slice_count > 10)
        # An LSE hits one randomly chosen slice stored on this disk.
        slice_index = choice(u.getChildren())
        # Slice not yet in the (possibly still growing) system at this time.
        if slice_index >= current_total_slices:
            return
        if self.status[slice_index] == self.lost_slice:
            self.total_skipped_latent += 1
            return

        repairable_before = self.isRepairable(slice_index)

        index = self.slice_locations[slice_index].index(u)
        # A LSE cannot hit lost blocks or a same block multiple times
        if self.status[slice_index][
            index] == UnitState.Corrupted or self.status[slice_index][
                index] == UnitState.LatentError:
            self.total_skipped_latent += 1
            return
        self._my_assert(self.durableCount(slice_index) >= 0)
        self.sliceDegraded(slice_index)

        self.status[slice_index][index] = UnitState.LatentError
        u.slices_hit_by_LSE.append(slice_index)
        self.total_latent_failures += 1

        repairable_current = self.isRepairable(slice_index)
        # Slice just crossed from repairable to unrepairable: open an
        # unavailability window.
        if repairable_before and not repairable_current:
            self.unavailable_slice_count += 1
            self.startUnavailable(slice_index, time)

        if self.isLost(slice_index):
            info_logger.info(
                str(time) + " slice: " + str(slice_index) + " durCount: " +
                str(self.durableCount(slice_index)) + " latDefect " +
                str(True) + " due to ===latent=== error " + " on disk " +
                str(u.getID()))
            self.undurable_slice_count += 1
            self.endUnavailable(slice_index, time)
            self.status[slice_index] = self.lost_slice
    else:
        raise Exception("Latent defect should only happen for disk")

    # Record degradation history for later statistics.
    self.slices_degraded_list.append(
        (e.getTime(), self.current_slice_degraded))
    self.slices_degraded_avail_list.append(
        (e.getTime(), self.current_avail_slice_degraded))
def printPerYearStart(self, per_day_start, description):
    """Log the yearly average of the per-day samples under *description*."""
    days_in_year = 365
    acc = 0
    # Day indexing starts at 1; index 0 is skipped as in the day numbering.
    for day in xrange(1, len(per_day_start)):
        acc += per_day_start[day]
        if day % days_in_year == 0:
            # End of a full year: emit its average and reset.
            acc /= 365
            info_logger.info(description + " " + str(day / days_in_year) + " " + str(acc))
            acc = 0
    # Trailing partial year -- still divided by a full 365 days.
    info_logger.info(description + " " + str(len(per_day_start) / days_in_year) +
                     " " + str(acc / 365))
def end(self): ret = Result() # data loss probability and data unvailable probability data_loss_prob = format( float(self.undurable_slice_count) / self.total_slices, ".4e") unavailable_prob = self.calUnavailProb() Result.undurable_count = self.undurable_slice_count Result.unavailable_durations = self.unavailable_durations Result.data_loss_prob = data_loss_prob Result.unavailable_prob = unavailable_prob # repair bandwidth in GBs Result.total_repair_transfers = format( float(self.total_repair_transfers) / 1024, ".4e") info_logger.info( "anomalous available count: %d, total latent failure: %d,\ total scrubs: %d, total scrubs repairs: %d, \ total disk failures:%d, total disk repairs:%d, \ total machine failures:%d, total machine repairs:%d, \ total permanent machine failures:%d, \ total short temperary machine failures:%d, \ total long temperary machine failures:%d, \ total machine failures due to rack failures:%d, \ total eager machine repairs:%d, total eager slice repairs:%d, \ total skipped latent:%d, total incomplete recovery:%d\n \ max recovery bandwidth:%f\n \ undurable_slice_count:%d\n \ total repairs:%d, total optimal repairs:%d" % (self.anomalous_available_count, self.total_latent_failures, self.total_scrubs, self.total_scrub_repairs, self.total_disk_failures, self.total_disk_repairs, self.total_machine_failures, self.total_machine_repairs, self.total_perm_machine_failures, self.total_short_temp_machine_failures, self.total_long_temp_machine_failures, self.total_machine_failures_due_to_rack_failures, self.total_eager_machine_repairs, self.total_eager_slice_repairs, self.total_skipped_latent, self.total_incomplete_recovery_attempts, self.max_recovery_bandwidth, self.undurable_slice_count, self.total_repairs, self.total_optimal_repairs)) self.printDegradedStat(self.slices_degraded_list, "Avg_durable_degraded_", "slices") self.printDegradedStat(self.slices_degraded_avail_list, "Avg_available_degraded_", "slices") self.analyzeBandwidth() return ret
def handleLatentDefect(self, u, time, e):
    """Handle a latent sector error (LSE) event on a disk.

    Numeric-status variant: block states are encoded as ints; from the
    comparisons below, -1 marks a corrupted/lost block and -2 a latent
    error (mirroring the UnitState-based variant -- confirm against the
    status-encoding definition).
    """
    if isinstance(u, Disk):
        slice_count = len(u.getChildren())
        if slice_count == 0:
            return
        self._my_assert(slice_count > 10)
        # An LSE hits one randomly chosen slice stored on this disk.
        slice_index = choice(u.getChildren())
        if slice_index >= self.total_slices:
            return
        if self.status[slice_index] == self.lost_slice:
            self.total_skipped_latent += 1
            return

        repairable_before = self.isRepairable(slice_index)

        index = self.slice_locations[slice_index].index(u)
        # A LSE cannot hit lost blocks or a same block multiple times
        if self.status[slice_index][index] == -1 or self.status[
                slice_index][index] == -2:
            self.total_skipped_latent += 1
            return
        self._my_assert(self.durableCount(slice_index) >= 0)
        self.sliceDegraded(slice_index)

        # Mark the block as hit by a latent error.
        self.status[slice_index][index] = -2
        u.slices_hit_by_LSE.append(slice_index)
        self.total_latent_failures += 1

        repairable_current = self.isRepairable(slice_index)
        # Slice just became unrepairable: open an unavailability window
        # (a list of [start] timestamps, closed elsewhere on recovery).
        if repairable_before and not repairable_current:
            self.unavailable_slice_count += 1
            if slice_index in self.unavailable_slice_durations.keys():
                self.unavailable_slice_durations[slice_index].append(
                    [time])
            else:
                self.unavailable_slice_durations[slice_index] = [[time]]

        if self.isLost(slice_index):
            info_logger.info(
                str(time) + " slice: " + str(slice_index) + " durCount: " +
                str(self.durableCount(slice_index)) + " latDefect " +
                str(True) + " due to ===latent=== error " + " on disk " +
                str(u.getID()))
            self.undurable_slice_count += 1
            self.undurable_slice_infos.append(
                (slice_index, time, "LSE " + str(u.getID())))
            self.status[slice_index] = self.lost_slice
    else:
        raise Exception("Latent defect should only happen for disk")
def printAll(self):
    """Log the default configuration, then upgrade / correlated-failure settings."""
    segments = [
        "Default Configurations: \t total_time: ", str(self.total_time),
        ", disk capacity: ", str(self.disk_capacity), "TB",
        ", disks per machine: ", str(self.disks_per_machine),
        ", machines per rack: ", str(self.machines_per_rack),
        ", rack count: ", str(self.rack_count),
        ", chunk size: ", str(self.chunk_size), "MB",
        ", total active storage: ", str(self.total_active_storage), "PB",
        ", data redundancy: ", str(self.data_redundancy),
        ", hierarchical:", str(self.hier),
        ", recovery bandwidth cross rack: ", str(self.recovery_bandwidth_cross_rack),
        ", xml file path: ", self.xml_file_path,
        ", event file path: ", self.event_file,
        ", parallel repair: ", str(self.parallel_repair),
        ", upgrade flag: ", str(self.upgrades),
        ", correlated failures flag: ", str(self.correlated_failures),
    ]
    info_logger.info("".join(segments))

    if self.upgrades:
        info_logger.info("Upgrade Configurations: " +
                         str(self.hard_upgrade_infos))
        info_logger.info("Upgrade Configurations: " +
                         str(self.soft_upgrade_infos))
    if self.correlated_failures:
        info_logger.info("Correlated Failures Configurations: " +
                         str(self.correlated_failures_infos))
def printAll(self):
    """Log the default configuration, then scaling / upgrade / correlated-failure settings."""
    segments = [
        "Default Configurations: \t total_time: ", str(self.total_time),
        ", disk capacity: ", str(self.disk_capacity), "TB",
        ", disks per machine: ", str(self.disks_per_machine),
        ", machines per rack: ", str(self.machines_per_rack),
        ", rack count: ", str(self.rack_count),
        ", chunk size: ", str(self.chunk_size), "MB",
        ", total active storage: ", str(self.total_active_storage), "PB",
        ", data redundancy: ", str(self.data_redundancy),
        ", hierarchical:", str(self.hier),
        ", recovery bandwidth cross rack: ", str(self.recovery_bandwidth_cross_rack),
        ", xml file path: ", self.xml_file_path,
        ", event file path: ", self.event_file,
        ", outputs: ", str(self.outputs),
        ", parallel repair: ", str(self.parallel_repair),
        ", system Scaling flag: ", str(self.system_scaling),
        ", system upgrade flag: ", str(self.system_upgrade),
        ", correlated failures flag: ", str(self.correlated_failures),
    ]
    info_logger.info("".join(segments))

    if self.system_scaling:
        info_logger.info("System scaling Configurations: " +
                         str(self.system_scaling_infos))
    if self.system_upgrade:
        # upgrade info format: (style, domain, freq, interval, downtime)
        info_logger.info("System upgrade Configurations: " +
                         str(self.system_upgrade_infos))
    if self.correlated_failures:
        info_logger.info("Correlated Failures Configurations: " +
                         str(self.correlated_failures_infos))
def run(self):
    """Run one simulation iteration (RAFI-aware variant).

    Selects the distributer by placement/hierarchy, picks the RAFI or
    normal event handler per configuration, generates and drains all
    events, and returns the handler's Result.
    """
    conf = Configuration(self.conf_path)
    xml = XMLParser(conf)
    # Distributer class is looked up from placement + hierarchy settings.
    distributer_class = returnDistributer(conf.data_placement,
                                          conf.hierarchical)
    self.distributer = distributer_class(xml)
    self.conf = self.distributer.returnConf()
    # RAFI recovery requires its specialized event handler.
    if self.conf.rafi_recovery:
        self.event_handler = RAFIEventHandler
    else:
        self.event_handler = EventHandler
    self.distributer.start()
    # self.distributer.printGroupsToFile()
    info_logger.info("disk usage is: " +
                     str(self.distributer.diskUsage() * 100) + "%\n")
    self.distributer.getRoot().printAll()

    events_handled = 0
    events = EventQueue()
    root = self.distributer.getRoot()
    root.generateEvents(events, 0, self.conf.total_time, True)

    # Optionally dump the event queue to a per-iteration file.
    # if False:
    if self.conf.event_file != None:
        events_file = self.conf.event_file + '-' + self.ts
        events.printAll(events_file,
                        "Iteration number: " + str(self.iteration_times))
    self.iteration_times += 1

    handler = self.event_handler(self.distributer)
    print "total slices:", handler.total_slices
    # Drain the queue: handlers may enqueue follow-up events while running.
    e = events.removeFirst()
    while e is not None:
        handler.handleEvent(e, events)
        e = events.removeFirst()
        events_handled += 1
    self.total_events_handled += events_handled

    result = handler.end()
    info_logger.info(result.toString())
    return result
def main(self, conf_path): events_handled = 0 ge = GenerateEvents(conf_path) distributer = ge.getDistributer() events = ge.main() handler = NormalDistributeEventHandler(distributer) print "total slices:", handler.total_slices e = events.removeFirst() while e is not None: handler.handleEvent(e, events) e = events.removeFirst() events_handled += 1 result = handler.end() info_logger.info(result.toString()) info_logger.info("Events handled: %d" % events_handled)
def handleFailure(self, u, time, e, queue):
    """Propagate a failure event down the topology (RAFI + UnitState variant).

    Machine failures mark hosted blocks Crashed (availability loss) and
    feed newly endangered slices into RAFI recovery groups; disk failures
    mark hosted blocks Corrupted (durability loss) and project the repair
    bandwidth they will need; any other unit recurses into its children.
    """
    if e.ignore:
        return
    UnfinishRAFIEvents.queue = queue
    outtoin_slices = {}   # slices that just entered a RAFI event
    intoin_slices = {}    # slices escalating to a higher-risk RAFI event
    current_total_slices = self.calCurrentTotalSlices(time)
    if isinstance(u, Machine):
        self.total_machine_failures += 1
        u.setLastFailureTime(e.getTime())
        # e.info distinguishes failure flavors: 3 = permanent,
        # 1 = short transient, 2 = long transient, else rack-induced.
        if e.info == 3:
            self.total_perm_machine_failures += 1
        else:
            if e.info == 1:
                self.total_short_temp_machine_failures += 1
            elif e.info == 2:
                self.total_long_temp_machine_failures += 1
            else:
                self.total_machine_failures_due_to_rack_failures += 1
                # Classify the rack-induced outage as short/long by its
                # scheduled recovery time.
                if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                    self.total_short_temp_machine_failures += 1
                else:
                    self.total_long_temp_machine_failures += 1
        disks = u.getChildren()
        for child in disks:
            slice_indexes = child.getChildren()
            for slice_index in slice_indexes:
                # Skip slices not yet created at this point in time.
                if slice_index >= current_total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue
                self.sliceDegradedAvailability(slice_index)
                repairable_before = self.isRepairable(slice_index)
                index = self.slice_locations[slice_index].index(child)
                # Only a Normal block transitions to Crashed here.
                if self.status[slice_index][index] == UnitState.Normal:
                    self.status[slice_index][index] = UnitState.Crashed
                self._my_assert(self.availableCount(slice_index) >= 0)

                repairable_current = self.isRepairable(slice_index)
                # Slice just became unrepairable: open an unavailability window.
                if repairable_before and not repairable_current:
                    self.unavailable_slice_count += 1
                    self.startUnavailable(slice_index, time)

                # rafi start
                unavailable = self.n - self.availableCount(slice_index)
                fs = FailedSlice()
                fs.addInfo(time, e.next_recovery_time)
                self.failed_slices[slice_index] = fs
                rafi_flag = fs.check(time)
                # slice from not in rafi event to in a rafi event
                if rafi_flag == FailedSlice.RAFITransition.OutToIn:
                    outtoin_slices[slice_index] = fs
                # slice from lower risk rafi event to higher risk rafi event
                elif rafi_flag == FailedSlice.RAFITransition.InToIn:
                    intoin_slices[slice_index] = fs
                else:
                    # don't care other two situations
                    pass

        outtoin_slice_indexes = outtoin_slices.keys()
        intoin_slice_indexes = intoin_slices.keys()
        new_rafi_slices = []
        upgraded_rafi_slices = []
        # Only slices at or below the recovery threshold become RAFI work.
        for slice_index in outtoin_slice_indexes:
            if self.availableCount(slice_index) <= self.recovery_threshold:
                new_rafi_slices.append(slice_index)
        for slice_index in intoin_slice_indexes:
            if self.availableCount(slice_index) <= self.recovery_threshold:
                upgraded_rafi_slices.append(slice_index)

        if new_rafi_slices != []:
            # Bucket slices by their number of unavailable blocks.
            groups_in_new = [[] for i in xrange(self.n - self.k)]
            for slice_index in new_rafi_slices:
                unavailable = outtoin_slices[slice_index].failedNum()
                groups_in_new[unavailable-1].append(slice_index)
            for group in groups_in_new:
                if group != []:
                    # timestamp of data starts to recover(ts+detect time+identify time)
                    # NOTE(review): "unavailable" is the leftover value from
                    # the bucketing loop above, not this group's own failure
                    # count -- looks suspicious, confirm intent.
                    recover_time = ceil(time/self.node_state_check)*self.node_state_check + \
                        self.detect_intervals[unavailable-1]
                    self.unfinished_rafi_events.addEvent(group, recover_time)
        if upgraded_rafi_slices != []:
            groups_in_upgraded = [[] for i in xrange(self.n - self.k)]
            for slice_index in upgraded_rafi_slices:
                unavailable = intoin_slices[slice_index].failedNum()
                groups_in_upgraded[unavailable-1].append(slice_index)
            for group in groups_in_upgraded:
                if group != []:
                    # NOTE(review): same leftover-"unavailable" concern as above.
                    recover_time = ceil(time/self.node_state_check)*self.node_state_check + \
                        self.detect_intervals[unavailable-1]
                    self.unfinished_rafi_events.updateEvent(group, recover_time)

        self.slices_degraded_avail_list.append((e.getTime(), self.current_avail_slice_degraded))
    elif isinstance(u, Disk):
        self.total_disk_failures += 1
        u.setLastFailureTime(e.getTime())
        # need to compute projected reovery b/w needed
        projected_bandwidth_need = 0.0
        slice_indexes = u.getChildren()
        for slice_index in slice_indexes:
            if slice_index >= current_total_slices:
                continue
            if self.status[slice_index] == self.lost_slice:
                continue
            self.sliceDegraded(slice_index)
            repairable_before = self.isRepairable(slice_index)
            index = self.slice_locations[slice_index].index(u)
            # Already corrupted: nothing more to account for this block.
            if self.status[slice_index][index] == UnitState.Corrupted:
                continue
            self.status[slice_index][index] = UnitState.Corrupted
            self._my_assert(self.durableCount(slice_index) >= 0)
            repairable_current = self.isRepairable(slice_index)
            # exclude the disk lost caused by node lost, it has already considered in node lost
            if e.info != self.inherit_lost and repairable_before and not repairable_current:
                self.unavailable_slice_count += 1
                self.startUnavailable(slice_index, time)

            if self.isLost(slice_index):
                info_logger.info(
                    "time: " + str(time) + " slice:" + str(slice_index) +
                    " durCount:" + str(self.durableCount(slice_index)) +
                    " due to disk " + str(u.getID()))
                self.status[slice_index] = self.lost_slice
                self.undurable_slice_count += 1
                self.endUnavailable(slice_index, time)
                continue

            # is this slice one that needs recovering? if so, how much
            # data to recover?
            if self.status[slice_index] != self.lost_slice:
                threshold_crossed = False
                num_undurable = self.n - self.durableCount(slice_index)
                if num_undurable >= self.n - self.recovery_threshold:
                    threshold_crossed = True
                num_unavailable = 0
                if self.availability_counts_for_recovery:
                    num_unavailable = self.n - \
                        self.availableCount(slice_index)
                    if num_unavailable >= self.n - self.recovery_threshold:
                        threshold_crossed = True
                if threshold_crossed:
                    # k-1 reads plus one write per non-Normal block.
                    projected_bandwidth_need += self.k - 1 + \
                        (self.n - self.status[slice_index].count(UnitState.Normal))

        # current recovery bandwidth goes up by projected bandwidth need
        projected_bandwidth_need /= (e.next_recovery_time - e.getTime())
        u.setLastBandwidthNeed(projected_bandwidth_need)
        self._my_assert(self.current_recovery_bandwidth >= 0)
        self.current_recovery_bandwidth += projected_bandwidth_need
        self._my_assert(self.current_recovery_bandwidth >= 0)
        if self.current_recovery_bandwidth > self.max_recovery_bandwidth:
            self.max_recovery_bandwidth = self.current_recovery_bandwidth
        self._my_assert(self.current_recovery_bandwidth >= 0)

        self.slices_degraded_list.append((e.getTime(), self.current_slice_degraded))
        self.slices_degraded_avail_list.append(
            (e.getTime(), self.current_avail_slice_degraded))
    else:
        # Higher-level unit (rack, root, ...): recurse into the children.
        for child in u.getChildren():
            self.handleFailure(child, time, e, queue)
def handleFailure(self, u, time, e, queue):
    """Propagate a failure event down the topology (RAFI, numeric-status variant).

    Machines: a permanent failure (e.info == 3) marks every hosted block
    as lost (-1); transient failures mark available blocks (1) as
    temporarily unavailable (0).  Newly endangered slices are grouped
    into RAFI recovery events keyed by their unavailable-block count.
    Disks: every hosted block is marked lost (-1).
    Any other unit recurses into its children.
    """
    if e.ignore:
        return
    UnfinishRAFIEvents.queue = queue
    outtoin_slices = {}   # slices that just entered a RAFI event
    intoin_slices = {}    # slices escalating to a higher-risk RAFI event
    if isinstance(u, Machine):
        self.total_machine_failures += 1
        u.setLastFailureTime(e.getTime())
        # e.info: 3 = permanent, 1 = short transient, 2 = long transient,
        # else the machine failed because its rack failed.
        if e.info == 3:
            self.total_perm_machine_failures += 1
        else:
            if e.info == 1:
                self.total_short_temp_machine_failures += 1
            elif e.info == 2:
                self.total_long_temp_machine_failures += 1
            else:
                self.total_machine_failures_due_to_rack_failures += 1
                # Classify the rack-induced outage as short/long by its
                # scheduled recovery time.
                if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                    self.total_short_temp_machine_failures += 1
                else:
                    self.total_long_temp_machine_failures += 1
        disks = u.getChildren()
        for child in disks:
            slice_indexes = child.getChildren()
            for slice_index in slice_indexes:
                if slice_index >= self.total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue
                # Permanent failures degrade durability, transient ones
                # only degrade availability.
                if e.info == 3:
                    self.sliceDegraded(slice_index)
                else:
                    self.sliceDegradedAvailability(slice_index)
                repairable_before = self.isRepairable(slice_index)
                index = self.slice_locations[slice_index].index(child)
                # Block already lost: nothing more to do for it.
                if self.status[slice_index][index] == -1:
                    continue
                if e.info == 3:
                    # BUGFIX: was "== -1", a no-op comparison.  A permanent
                    # machine failure must mark the block as lost, otherwise
                    # the isLost()/undurable accounting below never fires.
                    self.status[slice_index][index] = -1
                    self._my_assert(self.durableCount(slice_index) >= 0)
                else:
                    # Transient: an available block (1) becomes unavailable (0).
                    if self.status[slice_index][index] == 1:
                        self.status[slice_index][index] = 0
                    self._my_assert(self.availableCount(slice_index) >= 0)

                repairable_current = self.isRepairable(slice_index)
                # Slice just became unrepairable: open an unavailability
                # window (list of [start] timestamps, closed on recovery).
                if repairable_before and not repairable_current:
                    self.unavailable_slice_count += 1
                    if slice_index in self.unavailable_slice_durations.keys():
                        self.unavailable_slice_durations[slice_index].append(
                            [time])
                    else:
                        self.unavailable_slice_durations[slice_index] = [[time]]

                # rafi start
                unavailable = self.n - self.availableCount(slice_index)
                fs = FailedSlice()
                fs.addInfo(time, e.next_recovery_time)
                self.failed_slices[slice_index] = fs
                rafi_flag = fs.check(time)
                if rafi_flag == FailedSlice.RAFITransition.OutToIn:
                    # slice from not in rafi event to in a rafi event
                    outtoin_slices[slice_index] = fs
                elif rafi_flag == FailedSlice.RAFITransition.InToIn:
                    # slice from lower risk rafi event to higher risk rafi event
                    intoin_slices[slice_index] = fs
                else:
                    # don't care other two situations
                    pass

                if e.info == 3:
                    # lost stripes have been recorded in unavailable_slice_durations
                    if self.isLost(slice_index):
                        info_logger.info(
                            "time: " + str(time) + " slice:" +
                            str(slice_index) + " durCount:" +
                            str(self.durableCount(slice_index)) +
                            " due to machine " + str(u.getID()))
                        self.status[slice_index] = self.lost_slice
                        self.undurable_slice_count += 1
                        self.undurable_slice_infos.append(
                            (slice_index, time, "machine " + str(u.getID())))
                        continue

        outtoin_slice_indexes = outtoin_slices.keys()
        intoin_slice_indexes = intoin_slices.keys()
        new_rafi_slices = []
        upgraded_rafi_slices = []
        # Only slices at or below the recovery threshold become RAFI work.
        for slice_index in outtoin_slice_indexes:
            if self.availableCount(slice_index) <= self.recovery_threshold:
                new_rafi_slices.append(slice_index)
        for slice_index in intoin_slice_indexes:
            if self.availableCount(slice_index) <= self.recovery_threshold:
                upgraded_rafi_slices.append(slice_index)

        if new_rafi_slices != []:
            # Bucket slices by their number of unavailable blocks.
            groups_in_new = [[] for i in xrange(self.n - self.k)]
            for slice_index in new_rafi_slices:
                unavailable = outtoin_slices[slice_index].failedNum()
                groups_in_new[unavailable - 1].append(slice_index)
            for group in groups_in_new:
                if group != []:
                    # timestamp of data starts to recover (ts + detect time)
                    # NOTE(review): "unavailable" is the leftover value from
                    # the bucketing loop, not this group's own failure
                    # count -- preserved as-is, confirm intent.
                    recover_time = time + self.detect_intervals[unavailable - 1]
                    self.unfinished_rafi_events.addEvent(group, recover_time)
        if upgraded_rafi_slices != []:
            groups_in_upgraded = [[] for i in xrange(self.n - self.k)]
            for slice_index in upgraded_rafi_slices:
                unavailable = intoin_slices[slice_index].failedNum()
                groups_in_upgraded[unavailable - 1].append(slice_index)
            for group in groups_in_upgraded:
                if group != []:
                    # NOTE(review): same leftover-"unavailable" concern as above.
                    recover_time = time + self.detect_intervals[unavailable - 1]
                    self.unfinished_rafi_events.updateEvent(group, recover_time)
    elif isinstance(u, Disk):
        self.total_disk_failures += 1
        u.setLastFailureTime(e.getTime())
        slice_indexes = u.getChildren()
        for slice_index in slice_indexes:
            if slice_index >= self.total_slices:
                continue
            if self.status[slice_index] == self.lost_slice:
                continue
            self.sliceDegraded(slice_index)
            repairable_before = self.isRepairable(slice_index)
            index = self.slice_locations[slice_index].index(u)
            # Block already lost: nothing more to account for.
            if self.status[slice_index][index] == -1:
                continue
            self.status[slice_index][index] = -1
            self._my_assert(self.durableCount(slice_index) >= 0)
            repairable_current = self.isRepairable(slice_index)
            if repairable_before and not repairable_current:
                self.unavailable_slice_count += 1
                if slice_index in self.unavailable_slice_durations.keys():
                    self.unavailable_slice_durations[slice_index].append(
                        [time])
                else:
                    self.unavailable_slice_durations[slice_index] = [[time]]

            if self.isLost(slice_index):
                info_logger.info(
                    "time: " + str(time) + " slice:" + str(slice_index) +
                    " durCount:" + str(self.durableCount(slice_index)) +
                    " due to disk " + str(u.getID()))
                self.status[slice_index] = self.lost_slice
                self.undurable_slice_count += 1
                self.undurable_slice_infos.append(
                    (slice_index, time, "disk " + str(u.getID())))
                continue
    else:
        # Higher-level unit (rack, root, ...): recurse into the children.
        for child in u.getChildren():
            self.handleFailure(child, time, e, queue)
def handleFailure(self, u, time, e, queue):
    """Propagate a failure event down the topology (UnitState, non-RAFI variant).

    Machine failures mark hosted blocks Crashed (availability loss);
    disk failures mark hosted blocks Corrupted (durability loss) and
    project the repair bandwidth they will need; any other unit recurses
    into its children.
    """
    if e.ignore:
        return
    current_total_slices = self.calCurrentTotalSlices(time)
    if isinstance(u, Machine):
        self.total_machine_failures += 1
        u.setLastFailureTime(e.getTime())
        # e.info: 3 = permanent, 1 = short transient, 2 = long transient,
        # else the machine failed because its rack failed.
        if e.info == 3:
            self.total_perm_machine_failures += 1
        else:
            if e.info == 1:
                self.total_short_temp_machine_failures += 1
            elif e.info == 2:
                self.total_long_temp_machine_failures += 1
            else:
                self.total_machine_failures_due_to_rack_failures += 1
                # Classify the rack-induced outage as short/long by its
                # scheduled recovery time.
                if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                    self.total_short_temp_machine_failures += 1
                else:
                    self.total_long_temp_machine_failures += 1
        disks = u.getChildren()
        for child in disks:
            slice_indexes = child.getChildren()
            for slice_index in slice_indexes:
                # Skip slices not yet created at this point in time.
                if slice_index >= current_total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue
                self.sliceDegradedAvailability(slice_index)
                repairable_before = self.isRepairable(slice_index)
                index = self.slice_locations[slice_index].index(child)
                # Only a Normal block transitions to Crashed here.
                if self.status[slice_index][index] == UnitState.Normal:
                    self.status[slice_index][index] = UnitState.Crashed
                self._my_assert(self.availableCount(slice_index) >= 0)
                repairable_current = self.isRepairable(slice_index)
                # Slice just became unrepairable: open an unavailability window.
                if repairable_before and not repairable_current:
                    self.unavailable_slice_count += 1
                    self.startUnavailable(slice_index, time)
        self.slices_degraded_avail_list.append(
            (e.getTime(), self.current_avail_slice_degraded))
    elif isinstance(u, Disk):
        self.total_disk_failures += 1
        u.setLastFailureTime(e.getTime())
        # need to compute projected reovery b/w needed
        projected_bandwidth_need = 0.0
        slice_indexes = u.getChildren()
        for slice_index in slice_indexes:
            if slice_index >= current_total_slices:
                continue
            if self.status[slice_index] == self.lost_slice:
                continue
            self.sliceDegraded(slice_index)
            repairable_before = self.isRepairable(slice_index)
            index = self.slice_locations[slice_index].index(u)
            # Already corrupted: nothing more to account for this block.
            if self.status[slice_index][index] == UnitState.Corrupted:
                continue
            self.status[slice_index][index] = UnitState.Corrupted
            self._my_assert(self.durableCount(slice_index) >= 0)
            repairable_current = self.isRepairable(slice_index)
            # exclude the disk lost caused by node lost, it has already considered in node lost
            if e.info != self.inherit_lost and repairable_before and not repairable_current:
                self.unavailable_slice_count += 1
                self.startUnavailable(slice_index, time)
            if self.isLost(slice_index):
                info_logger.info("time: " + str(time) + " slice:" +
                                 str(slice_index) + " durCount:" +
                                 str(self.durableCount(slice_index)) +
                                 " due to disk " + str(u.getID()))
                self.status[slice_index] = self.lost_slice
                self.undurable_slice_count += 1
                self.endUnavailable(slice_index, time)
                continue
            # is this slice one that needs recovering? if so, how much
            # data to recover?
            if self.status[slice_index] != self.lost_slice:
                threshold_crossed = False
                num_undurable = self.n - self.durableCount(slice_index)
                if num_undurable >= self.n - self.recovery_threshold:
                    threshold_crossed = True
                num_unavailable = 0
                if self.availability_counts_for_recovery:
                    num_unavailable = self.n - \
                        self.availableCount(slice_index)
                    if num_unavailable + num_undurable >= self.n - \
                            self.recovery_threshold:
                        threshold_crossed = True
                if threshold_crossed:
                    # k-1 reads plus one write per non-Normal block.
                    projected_bandwidth_need += self.k - 1 + \
                        (self.n - self.status[slice_index].count(UnitState.Normal))
        # current recovery bandwidth goes up by projected bandwidth need
        projected_bandwidth_need /= (e.next_recovery_time - e.getTime())
        u.setLastBandwidthNeed(projected_bandwidth_need)
        self._my_assert(self.current_recovery_bandwidth >= 0)
        self.current_recovery_bandwidth += projected_bandwidth_need
        self._my_assert(self.current_recovery_bandwidth >= 0)
        if self.current_recovery_bandwidth > self.max_recovery_bandwidth:
            self.max_recovery_bandwidth = self.current_recovery_bandwidth
        self._my_assert(self.current_recovery_bandwidth >= 0)
        self.slices_degraded_list.append(
            (e.getTime(), self.current_slice_degraded))
        self.slices_degraded_avail_list.append(
            (e.getTime(), self.current_avail_slice_degraded))
    else:
        # Higher-level unit (rack, root, ...): recurse into the children.
        for child in u.getChildren():
            self.handleFailure(child, time, e, queue)
def printDegradedStat(self, degraded, description, unit):
    """Resample the (timestamp, value) series *degraded* into per-minute
    values, average them into 24h windows, and log mean/stdev/max plus a
    per-year summary.

    `degraded` is assumed to be time-ordered (ts, value) pairs -- TODO
    confirm against the producers (slices_degraded_*_list).
    """
    current_sample_average = 0
    current_time = 0
    sampling_period = 24
    # sampling per min, so 24*60 items in below list
    values_per_sample = []
    # One sample per 24h window, plus padding for a partial window and
    # one extra slot.
    samples = int(self.conf.total_time / 24)
    if self.conf.total_time % 24 != 0:
        samples += 1
    samples += 1
    day_samples = [0] * samples
    previous_window_value = 0
    avg_of_avgs = 0
    avg_count = 0
    max_v = 0
    it = iter(degraded)
    try:
        t = it.next()
    except StopIteration:
        t = None
    while t is not None:
        values_per_sample = [0] * (24 * 60)
        for i in xrange(sampling_period * 60):
            if t is None:
                break
            per_sample_count = 0
            while True:
                # Event is beyond this minute: carry the last seen value
                # forward (zero-order hold).
                if t[0] > current_time + i / 60:
                    per_sample_count = 0
                    values_per_sample[i] = previous_window_value
                    break
                else:
                    # Running mean of all events that fall in this minute.
                    values_per_sample[i] = (values_per_sample[i] * per_sample_count+t[1]) /\
                        (per_sample_count+1)
                    previous_window_value = t[1]
                    per_sample_count += 1
                    try:
                        t = it.next()
                    except StopIteration:
                        t = None
                        break
        # Average (and track the max of) the minute values of this window.
        current_sample_average = 0
        for i in xrange(sampling_period * 60):
            current_sample_average += values_per_sample[i]
            if max_v < values_per_sample[i]:
                max_v = values_per_sample[i]
        current_sample_average /= (sampling_period * 60)
        if int(current_time / 24) >= samples:
            break
        day_samples[int(current_time / 24)] = current_sample_average
        current_time += sampling_period
        avg_of_avgs += current_sample_average
        avg_count += 1
    # NOTE(review): raises ZeroDivisionError if *degraded* is empty
    # (avg_count stays 0) -- assumed non-empty by callers, confirm.
    avg_of_avgs /= avg_count
    stdev = 0.0
    for val in day_samples:
        stdev += (val - avg_of_avgs) * (val - avg_of_avgs)
    info_logger.info("%s_per_%dh_%s %d stdev:%f max:%d" %
                     (description, sampling_period, unit, avg_of_avgs,
                      sqrt(stdev / (len(day_samples) - 1)), max_v))
    self.printPerYearStart(day_samples, description)
def end(self):
    """Finalize the simulation run: compute summary metrics, publish them
    on the Result class, log a statistics dump, and return a Result.

    NOTE(review): metrics are assigned to the `Result` CLASS (not the
    `ret` instance), so they are shared across all Result objects —
    presumably intentional for cross-iteration aggregation; verify.
    """
    ret = Result()
    # data loss probability and data unavailable probability
    # PDL = undurable slices / total units (total_slices * n), as "x.xxxxe±yy"
    data_loss_prob = format(
        float(self.undurable_slice_count) / (self.total_slices * self.n),
        ".4e")
    Result.undurable_count = self.undurable_slice_count
    Result.unavailable_count = self.unavailable_slice_count
    Result.undurable_infos = self.undurable_slice_infos
    Result.unavailable_slice_durations = self.unavailable_slice_durations
    Result.PDL = data_loss_prob
    # time-to-failure / time-to-repair series drive the unavailability prob
    TTFs, TTRs = self.processDuration()
    Result.PUA = self.calUA(TTFs, TTRs)
    # Result.unavailable_prob1 = self.calUADowntime(TTRs)
    Result.undurable_count_details = self.calUndurableDetails()
    Result.NOMDL = self.NOMDL()
    # total repair cost in PiBs (total_repair_transfers / 2^30)
    Result.TRC = format(
        float(self.total_repair_transfers) / pow(2, 30), ".2e")
    # 8760 hours per year — simulation time unit appears to be hours
    years = self.end_time / 8760
    # total storage cost in PiB*year (raw capacity = active storage * n/k)
    Result.TSC = format(
        float(self.conf.total_active_storage) * self.n / self.k * years,
        ".2e")
    if not self.queue_disable:
        # repair-contention queueing statistics, if the queue model is on
        queue_times, avg_queue_time = self.contention_model.statistics()
        Result.queue_times = queue_times
        Result.avg_queue_time = format(avg_queue_time, ".4f")
        info_logger.info(
            "total times of queuing: %d, average queue time: %f" %
            (queue_times, avg_queue_time))
    # one consolidated counter dump; string kept verbatim (including the
    # historical "temperary" spelling) since it is runtime log output
    info_logger.info(
        "anomalous available count: %d, total latent failure: %d,\
        total scrubs: %d, total scrubs repairs: %d, \
        total disk failures:%d, total disk repairs:%d, \
        total machine failures:%d, total machine repairs:%d, \
        total permanent machine failures:%d, \
        total short temperary machine failures:%d, \
        total long temperary machine failures:%d, \
        total machine failures due to rack failures:%d, \
        total eager machine repairs:%d, total eager slice repairs:%d, \
        total skipped latent:%d, total incomplete recovery:%d\n \
        max recovery bandwidth:%f\n \
        undurable_slice_count:%d\n \
        total repairs:%d, total optimal repairs:%d" %
        (self.anomalous_available_count, self.total_latent_failures,
         self.total_scrubs, self.total_scrub_repairs,
         self.total_disk_failures, self.total_disk_repairs,
         self.total_machine_failures, self.total_machine_repairs,
         self.total_perm_machine_failures,
         self.total_short_temp_machine_failures,
         self.total_long_temp_machine_failures,
         self.total_machine_failures_due_to_rack_failures,
         self.total_eager_machine_repairs, self.total_eager_slice_repairs,
         self.total_skipped_latent, self.total_incomplete_recovery_attempts,
         self.max_recovery_bandwidth,
         self.undurable_slice_count,
         self.total_repairs, self.total_optimal_repairs))
    return ret
def handleFailure(self, u, time, e, queue):
    """Propagate a failure event down the topology and update slice state.

    `u` is a topology unit: Machine and Disk are handled directly; any
    other unit recurses into its children.  `e.info` distinguishes machine
    failure flavors (observed values here: 1 = short temporary,
    2 = long temporary, 3 = permanent, other = caused by rack failure —
    inferred from the counters each branch increments; confirm against the
    event producer).  Slice status conventions visible in this code:
    -1 = unit lost (durability), 0/1 = unavailable/available, and
    `self.lost_slice` marks a permanently lost stripe.
    """
    if e.ignore:
        return
    if isinstance(u, Machine):
        self.total_machine_failures += 1
        u.setLastFailureTime(e.getTime())
        if e.info == 3:
            self.total_perm_machine_failures += 1
        else:
            if e.info == 1:
                self.total_short_temp_machine_failures += 1
            elif e.info == 2:
                self.total_long_temp_machine_failures += 1
            else:
                # rack-induced machine failure; classify short/long by
                # whether recovery completes within the fail timeout
                self.total_machine_failures_due_to_rack_failures += 1
                if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                    self.total_short_temp_machine_failures += 1
                else:
                    self.total_long_temp_machine_failures += 1
        # walk every slice stored on every disk of this machine
        disks = u.getChildren()
        for child in disks:
            slice_indexes = child.getChildren()
            for slice_index in slice_indexes:
                if slice_index >= self.total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue
                # permanent failure degrades durability; temporary ones
                # only degrade availability
                if e.info == 3:
                    self.sliceDegraded(slice_index)
                else:
                    self.sliceDegradedAvailability(slice_index)
                repairable_before = self.isRepairable(slice_index)
                index = self.slice_locations[slice_index].index(child)
                if self.status[slice_index][index] == -1:
                    # unit already lost; nothing further to record
                    continue
                if e.info == 3:
                    self.status[slice_index][index] = -1
                    self._my_assert(self.durableCount(slice_index) >= 0)
                else:
                    if self.status[slice_index][index] == 1:
                        self.status[slice_index][index] = 0
                    self._my_assert(self.availableCount(slice_index) >= 0)
                repairable_current = self.isRepairable(slice_index)
                if repairable_before and not repairable_current:
                    # slice just crossed into unrepairable: open a new
                    # unavailability interval (closed elsewhere on repair)
                    self.unavailable_slice_count += 1
                    if slice_index in self.unavailable_slice_durations.keys(
                    ):
                        self.unavailable_slice_durations[
                            slice_index].append([time])
                    else:
                        self.unavailable_slice_durations[slice_index] = [[
                            time
                        ]]
                if e.info == 3:
                    # lost stripes have been recorded in unavailable_slice_durations
                    if self.isLost(slice_index):
                        info_logger.info(
                            "time: " + str(time) + " slice:" +
                            str(slice_index) + " durCount:" +
                            str(self.durableCount(slice_index)) +
                            " due to machine " + str(u.getID()))
                        self.status[slice_index] = self.lost_slice
                        self.undurable_slice_count += 1
                        self.undurable_slice_infos.append(
                            (slice_index, time, "machine " + str(u.getID())))
                        continue
    elif isinstance(u, Disk):
        self.total_disk_failures += 1
        u.setLastFailureTime(e.getTime())
        # need to compute projected recovery b/w needed
        # NOTE(review): projected_bandwidth_need is initialized but never
        # updated in this branch (unlike the sibling implementation that
        # accumulates it) — confirm whether that is intentional here.
        projected_bandwidth_need = 0.0
        slice_indexes = u.getChildren()
        for slice_index in slice_indexes:
            if slice_index >= self.total_slices:
                continue
            if self.status[slice_index] == self.lost_slice:
                continue
            self.sliceDegraded(slice_index)
            repairable_before = self.isRepairable(slice_index)
            index = self.slice_locations[slice_index].index(u)
            if self.status[slice_index][index] == -1:
                continue
            # disk failure permanently loses this unit
            self.status[slice_index][index] = -1
            self._my_assert(self.durableCount(slice_index) >= 0)
            repairable_current = self.isRepairable(slice_index)
            if repairable_before and not repairable_current:
                self.unavailable_slice_count += 1
                if slice_index in self.unavailable_slice_durations.keys():
                    self.unavailable_slice_durations[slice_index].append(
                        [time])
                else:
                    self.unavailable_slice_durations[slice_index] = [[
                        time
                    ]]
            if self.isLost(slice_index):
                info_logger.info("time: " + str(time) + " slice:" +
                                 str(slice_index) + " durCount:" +
                                 str(self.durableCount(slice_index)) +
                                 " due to disk " + str(u.getID()))
                self.status[slice_index] = self.lost_slice
                self.undurable_slice_count += 1
                self.undurable_slice_infos.append(
                    (slice_index, time, "disk " + str(u.getID())))
                continue
    else:
        # racks/other aggregates: recurse into children
        for child in u.getChildren():
            self.handleFailure(child, time, e, queue)