Example #1
    def addCorrelatedFailures(self, cf_info):
        choose_from_una = cf_info[-1]

        # interval formats: [failure start time, failure end time, lost flag]
        #                   lost flag = False, unavailable event, default
        #                   lost flag = True, lost event.
        una_interval = [cf_info[0], cf_info[2] + cf_info[0], False]
        una_components = self._failedComponents(cf_info[1], una_interval)

        if cf_info[3] is not None:
            dl_interval = [cf_info[0], cf_info[4] + cf_info[0], True]
            if cf_info[3] == cf_info[1]:
                for component in una_components:
                    component.addFailureInterval(dl_interval)
            else:
                if choose_from_una:
                    failed_num = splitMethod(cf_info[3], '_')
                    components_for_choosen = []
                    if failed_num[1] == "machine":
                        for component in una_components:
                            if isinstance(component, Rack):
                                for child in component.getChildren():
                                    components_for_choosen.append(child)
                            if isinstance(component, Machine):
                                components_for_choosen.append(component)
                        dl_components = sample(components_for_choosen, int(failed_num[0]))
                    elif failed_num[1] == "disk":
                        for component in una_components:
                            if isinstance(component, Rack):
                                disks = []
                                self.distributer.getAllDisksInRack(component, disks)
                                for disk in disks:
                                    components_for_choosen.append(disk)
                            if isinstance(component, Machine):
                                for disk in component.getChildren():
                                    components_for_choosen.append(disk)
                            if isinstance(component, Disk):
                                components_for_choosen.append(component)
                        dl_components = sample(components_for_choosen, int(failed_num[0]))
                    else:
                        raise Exception("Unsupport lost event!")
                    for component in dl_components:
                        component.addFailureInterval(dl_interval)
                else:
                    # Here, components that are already unavailable should be
                    # excluded, but that is a little complicated to implement.
                    dl_components = self._failedComponents(cf_info[3], dl_interval)

                for component in dl_components:
                    if (component in una_components) or (component.getParent() in una_components) or (component.getParent().getParent() in una_components):
                        component.updateFailureInterval(dl_interval, cf_info[2] + cf_info[0])

        return self.distributer.getRoot()
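
The interval construction at the top of the method can be traced with a concrete cf_info; the layout below is inferred from the indices the method reads, and the values are illustrative only:

    # [0] failure start time        [1] unavailable scope, e.g. "1_rack"
    # [2] unavailable duration      [3] lost scope or None, e.g. "2_disk"
    # [4] lost duration             [-1] choose-lost-from-unavailable flag
    cf_info = [100.0, "1_rack", 24.0, "2_disk", 720.0, True]
    una_interval = [cf_info[0], cf_info[2] + cf_info[0], False]   # [100.0, 124.0, False]
    dl_interval = [cf_info[0], cf_info[4] + cf_info[0], True]     # [100.0, 820.0, True]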
Example #2
    def _failedComponents(self, info, interval):
        info = info.strip()
        scope = splitMethod(info, '_')
        failure_components = []
        failed_amounts = []

        length = len(scope) / 2
        for i in xrange(length):
            failed_amounts.append(scope[2 * i:2 * (i + 1)])

        # racks come from the system tree, so we cannot modify the list in place.
        racks = self.distributer.getAllRacks()
        for num, component in failed_amounts:
            if component == "rack":
                failed_racks = sample(racks, int(num))
                for rack in failed_racks:
                    rack.addFailureInterval(interval)
                    failure_components.append(rack)
            elif component == "machine":
                machines = []
                for rack in racks:
                    if rack in failure_components:
                        continue
                    else:
                        machines += rack.getChildren()
                failed_machines = sample(machines, int(num))
                for machine in failed_machines:
                    machines.remove(machine)
                    machine.addFailureInterval(interval)
                    failure_components.append(machine)
            elif component == "disk":
                disks = []
                for rack in racks:
                    if rack in failure_components:
                        continue
                    rack_disks = []
                    self.distributer.getAllDisksInRack(rack, rack_disks)
                    disks += rack_disks
                failed_disks = sample(disks, int(num))
                for disk in failed_disks:
                    disk.addFailureInterval(interval)
                    failure_components.append(disk)

        return failure_components
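
The scope string parsed here pairs a count with a component type; a minimal sketch, assuming splitMethod(info, '_') behaves like str.split('_'):

    scope = "2_rack_3_machine".split('_')               # ['2', 'rack', '3', 'machine']
    pairs = [scope[2 * i:2 * (i + 1)] for i in xrange(len(scope) / 2)]
    print pairs   # [['2', 'rack'], ['3', 'machine']]: fail 2 racks, then 3 machines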
Example #3
    def insertUpgradeDurations(self, upgrade_infos):
        freq = int(upgrade_infos[0])
        domain = upgrade_infos[1]
        unit_num, unit = splitMethod(domain, '_')
        num = int(unit_num)
        # `downtime` was undefined in the original; its position in
        # upgrade_infos is an assumption (compare addSystemUpgrade).
        downtime = float(upgrade_infos[2])

        upgrade_durations = []

        num_of_rollings = int(self.conf.total_time) / freq
        start_ts = round(uniform(0, float(self.conf.total_time) % freq), 2)
        for j in xrange(1, num_of_rollings + 1):
            rolling_start = start_ts + j * freq

            if unit == "rack":
                loops = self.conf.rack_count / num
                remainder = self.conf.rack_count % num
                racks = self.distributer.getAllRacks()
                for i in xrange(loops):
                    start_time = rolling_start + downtime * i
                    duration = Duration(Duration.DurationType.Unavailable,
                                        start_time, start_time + downtime,
                                        racks[i * num:(i + 1) * num])
                    upgrade_durations.append(duration)
                if remainder != 0:
                    start_time = rolling_start + downtime * loops
                    duration = Duration(Duration.DurationType.Unavailable,
                                        start_time, start_time + downtime,
                                        racks[-remainder:])
                    upgrade_durations.append(duration)
            elif unit == "machine":
                pass
            else:
                raise Exception("Upgrade domain must be in rack or machine")

        if len(upgrade_infos) == 6:
            self.insertHardUpgrades(upgrade_infos)
        if len(upgrade_infos) == 4:
            self.insertSoftUpgrades(upgrade_infos)
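
The rolling schedule is plain arithmetic; a worked sketch with illustrative numbers (total_time and freq are made up for the example):

    from random import uniform
    total_time, freq = 87600, 8000                       # e.g. ten years in hours, ~11-month cycle
    num_of_rollings = total_time / freq                  # 10 (Python 2 integer division)
    start_ts = round(uniform(0, total_time % freq), 2)   # random phase in [0, 7600)
    rolling_starts = [start_ts + j * freq for j in xrange(1, num_of_rollings + 1)]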
Example #4
    def __init__(self, path=None):
        if path is None:
            conf_path = Configuration.path
        else:
            conf_path = path
        self.conf = getConfParser(conf_path)

        try:
            d = self.conf.defaults()
        except ConfigParser.NoSectionError:
            raise Exception("No Default Section!")

        self.total_time = int(d["total_time"])
        # total active storage in PBs
        self.total_active_storage = float(d["total_active_storage"])

        self.chunk_size = int(d["chunk_size"])
        self.disk_capacity = float(d["disk_capacity"])
        # translate the manufacturer's TB (10^12 bytes) into GiB (2^30 bytes)
        self.actual_disk_capacity = self.disk_capacity * pow(10, 12) / pow(2, 30)
        self.max_chunks_per_disk = int(floor(self.actual_disk_capacity * 1024 / self.chunk_size))

        self.disks_per_machine = int(d["disks_per_machine"])
        self.machines_per_rack = int(d["machines_per_rack"])
        self.rack_count = int(d["rack_count"])

        self.datacenters = int(d.pop("datacenters", 1))

        self.event_file = d.pop("event_file", None)

        # If n <= 15 in each stripe, no two chunks are on the same rack.
        self.num_chunks_diff_racks = 15

        self.data_placement = d["data_placement"]
        if self.data_placement.lower() == "copyset":
            self.scatter_width = int(d["scatter_width"])

        self.data_redundancy = d.pop("data_redundancy")
        data_redundancy = extractDRS(self.data_redundancy)

        # True means auto repair; False means manual repair
        self.auto_repair = self._bool(d.pop("auto_repair", "true"))
        self.overall_calculation = self._bool(d.pop("overall_calculation", "true"))

        self.lazy_recovery = self._bool(d.pop("lazy_recovery", "false"))
        self.lazy_only_available = self._bool(d.pop("lazy_only_available", "true"))
        self.recovery_bandwidth_cross_rack = int(d["recovery_bandwidth_cross_rack"])
        self.queue_disable = self._bool(d.pop("queue_disable", "true"))
        self.bandwidth_contention = d["bandwidth_contention"]
        self.node_bandwidth = int(d["node_bandwidth"])

        self.hierarchical = self._bool(d.pop("hierarchical", "false"))
        if self.hierarchical:
            self.distinct_racks = int(d.pop("distinct_racks"))
            self.recovery_bandwidth_intra_rack = int(d["recovery_bandwidth_intra_rack"])

        self.parallel_repair = self._bool(d.pop("parallel_repair", "false"))

        self.availability_counts_for_recovery = self._bool(
            d["availability_counts_for_recovery"])

        self.availability_to_durability_threshold = splitIntMethod(d["availability_to_durability_threshold"])
        self.recovery_probability = splitIntMethod(d["recovery_probability"])
        self.max_degraded_slices = float(d["max_degraded_slices"])
        self.installment_size = int(d["installment_size"])

        self.outputs = splitMethod(d["outputs"])

        detect_intervals = d.pop("detect_intervals", None)
        if detect_intervals is None:
            self.rafi_recovery = False
            self.detect_intervals = detect_intervals
        else:
            self.rafi_recovery = True
            self.detect_intervals = splitFloatMethod(detect_intervals)

        self.drs_handler = getDRSHandler(data_redundancy[0], data_redundancy[1:])
        if not self.lazy_recovery:
            self.recovery_threshold = self.drs_handler.n - 1
        else:
            self.recovery_threshold = int(d.pop("recovery_threshold"))

        # the total slice count is computed this way only when redundancy is homogeneous
        self.total_slices = int(ceil(self.total_active_storage * pow(2, 30) / (self.drs_handler.k * self.chunk_size)))

        self.chunk_repair_time, self.disk_repair_time, self.node_repair_time = self._repairTime()
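
The capacity and slice arithmetic is worth tracing once by hand; a sketch with assumed units (disk_capacity in TB, chunk_size in MiB, total_active_storage in PB), matching the conversions above:

    from math import floor, ceil
    disk_capacity = 4.0                                  # TB as advertised (illustrative)
    actual = disk_capacity * pow(10, 12) / pow(2, 30)    # ~3725.29 GiB
    chunk_size = 256                                     # MiB (assumed unit)
    max_chunks_per_disk = int(floor(actual * 1024 / chunk_size))                    # 14901
    total_active_storage, k = 1.0, 6                     # 1 PB of data, k = 6 (illustrative)
    total_slices = int(ceil(total_active_storage * pow(2, 30) / (k * chunk_size)))  # 699051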
Example #5
    def addSystemUpgrade(self, upgrade_info, end_time):
        style = upgrade_info[0]
        domain_infos = splitMethod(upgrade_info[1], '_')
        freq = upgrade_info[2]
        interval = upgrade_info[3]
        downtime = upgrade_info[4]

        upgrade_start = round(uniform(0, float(end_time)%freq), 3)

        upgrade_start_times = []

        upgrade_times = int((end_time - upgrade_start)/freq)
        for j in xrange(1, upgrade_times+1):
            system_upgrade_start = j*freq + upgrade_start
            upgrade_start_times.append(system_upgrade_start)

            if domain_infos[1] == "rack":
                upgrade_domain_in_racks = int(domain_infos[0])
                machines = self.distributer.getAllMachines()
                # racks = self.distributer.getAllRacks()
                loops = len(machines)/upgrade_domain_in_racks
                remainder = len(machines) % upgrade_domain_in_racks
                for i in xrange(loops):
                    start_time = system_upgrade_start + (downtime + interval)*i
                    upgrade_machines = machines[i*upgrade_domain_in_racks:(i+1)*upgrade_domain_in_racks]
                    for item in upgrade_machines:
                        for machine in item:
                            machine.addFailureInterval([start_time, start_time + downtime, False])
                if remainder != 0:
                    start_time = system_upgrade_start + (downtime + interval) * loops
                    upgrade_machines = machines[-remainder:]
                    for item in upgrade_machines:
                        for machine in item:
                            machine.addFailureInterval([start_time, start_time + downtime, False])
            elif domain_infos[1] == "machine":
                # assumes machines_per_rack is divisible by upgrade_domain_in_machines
                upgrade_domain_in_machines = int(domain_infos[0])
                machines_in_racks = self.distributer.getAllMachines()
                rack_count = len(machines_in_racks)
                machines_per_rack = len(machines_in_racks[0])
                loops = (rack_count*machines_per_rack)/upgrade_domain_in_machines
                remainder = (rack_count*machines_per_rack)%upgrade_domain_in_machines

                machines = []
                for rack in machines_in_racks:
                    machines += rack

                for i in xrange(loops):
                    # rack_index = i*upgrade_domain_in_machines/machines_per_rack
                    # current_rack = machines[rack_index]
                    start_time = system_upgrade_start + (downtime + interval)*i
                    for a in xrange(upgrade_domain_in_machines):
                        machine = machines[a + i*upgrade_domain_in_machines]
                        machine.addFailureInterval([start_time, start_time+downtime, False])
                if remainder != 0:
                    # current_rack = machines[-1]
                    start_time = system_upgrade_start + (downtime + interval) * loops
                    # with a "machine" upgrade domain, remainder <= machines_per_rack
                    for j in xrange(-remainder, 0):
                        machine = machines[j]
                        machine.addFailureInterval([start_time, start_time+downtime, False])
            else:
                pass

        return upgrade_start_times
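
The per-batch start times follow directly from freq, interval, and downtime; a sketch with illustrative values:

    from random import uniform
    end_time, freq = 87600.0, 8000.0
    interval, downtime = 1.0, 0.5
    upgrade_start = round(uniform(0, end_time % freq), 3)    # random phase in [0, 7600)
    upgrade_times = int((end_time - upgrade_start) / freq)   # 10 system upgrades
    # batch i of rolling upgrade j then starts at:
    #   j * freq + upgrade_start + (downtime + interval) * i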
Example #6
    def __init__(self, path=None):
        if path is None:
            conf_path = Configuration.path
        else:
            conf_path = path
        self.conf = getConfParser(conf_path)

        try:
            d = self.conf.defaults()
        except ConfigParser.NoSectionError:
            raise Exception("No Default Section!")

        self.total_time = int(d["total_time"])
        # total active storage in PBs
        self.total_active_storage = float(d["total_active_storage"])

        self.chunk_size = int(d["chunk_size"])
        self.disk_capacity = float(d["disk_capacity"])
        # translate the manufacturer's TB (10^12 bytes) into GiB (2^30 bytes)
        self.actual_disk_capacity = self.disk_capacity * pow(10, 12) / pow(
            2, 30)
        self.max_chunks_per_disk = int(
            floor(self.actual_disk_capacity * 1024 / self.chunk_size))
        self.disks_per_machine = int(d["disks_per_machine"])
        self.machines_per_rack = int(d["machines_per_rack"])
        self.rack_count = int(d["rack_count"])
        self.datacenters = 1

        self.xml_file_path = d.pop("xml_file_path")
        self.event_file = d.pop("event_file", None)

        # If n <= 15 in each stripe, no two chunks are on the same rack.
        self.num_chunks_diff_racks = 15

        self.data_redundancy = d["data_redundancy"]
        data_redundancy = extractDRS(self.data_redundancy)

        self.lazy_recovery = self._bool(d.pop("lazy_recovery", "false"))
        self.lazy_only_available = self._bool(
            d.pop("lazy_only_available", "true"))

        self.recovery_bandwidth_cross_rack = int(
            d["recovery_bandwidth_cross_rack"])
        self.queue_disable = self._bool(d.pop("queue_disable", "true"))
        self.bandwidth_contention = d["bandwidth_contention"]
        self.node_bandwidth = int(d["node_bandwidth"])

        self.parallel_repair = self._bool(d.pop("parallel_repair", "false"))

        self.availability_counts_for_recovery = self._bool(
            d["availability_counts_for_recovery"])

        self.availability_to_durability_threshold = splitIntMethod(
            d["availability_to_durability_threshold"])
        self.recovery_probability = splitIntMethod(d["recovery_probability"])
        self.max_degraded_slices = float(d["max_degraded_slices"])
        self.installment_size = int(d["installment_size"])

        self.outputs = splitMethod(d["outputs"])

        self.drs_handler = getDRSHandler(data_redundancy[0],
                                         data_redundancy[1:])
        if not self.lazy_recovery:
            self.recovery_threshold = self.drs_handler.n - 1
        else:
            self.recovery_threshold = int(d.pop("recovery_threshold"))

        # flat or hierarchical placement; if hierarchical, r is distinct_racks
        self.hier = self._bool(d.pop("hierarchical", "false"))
        if self.hier:
            self.r = int(d["distinct_racks"])
        else:
            self.r = self.drs_handler.n

        self.system_scaling = self._bool(d.pop("system_scaling", "false"))
        if self.system_scaling:
            sections = self._getSections("System Scaling")
            if sections == []:
                raise Exception(
                    "No System Scaling section in configuration file")
            self.system_scaling_infos = []
            for section in sections:
                self.system_scaling_infos.append(
                    self.parserScalingSettings(section))

        self.system_upgrade = self._bool(d.pop("system_upgrade", "false"))
        self.upgrade_ts = []
        self.failure_generator = None
        self.lse_generator = None
        if self.system_upgrade:
            sections = self._getSections("System Upgrade")
            if sections == []:
                raise Exception(
                    "No System Upgrade section in configuration file")
            self.system_upgrade_infos = []
            for section in sections:
                upgrade_info = self.parserUpgradeSettings(section)
                self.system_upgrade_infos.append(upgrade_info)
                if upgrade_info[0] != 1:
                    times = self.total_time / upgrade_info[2]
                    self.upgrade_ts = [
                        upgrade_info[2] * i for i in xrange(1, times + 1)
                    ]

                    # upgrading to new disks (like SSDs) needs new failure and LSE generators
                    if upgrade_info[0] == 3:
                        disk_failure_generator = self.conf.get(
                            section, "disk_failure_generator")
                        self.failure_generator = returnEventGenerator(
                            "failureGenerator", disk_failure_generator)
                        latent_error_generator = self.conf.get(
                            section, "latent_error_generator")
                        self.lse_generator = returnEventGenerator(
                            "latentErrorGenerator", latent_error_generator)

        self.correlated_failures = self._bool(
            d.pop("correlated_failures", "false"))
        if self.correlated_failures:
            sections = self._getSections("Correlated Failures")
            if sections == []:
                raise Exception(
                    "No Correlated Failures section in configuration file")
            self.correlated_failures_infos = []
            for section in sections:
                self.correlated_failures_infos.append(
                    self.parserCorrelatedSetting(section))

        self.disk_repair_time, self.node_repair_time = self.comRepairTime()

        self.total_slices = int(
            ceil(self.total_active_storage * pow(2, 30) /
                 (self.drs_handler.k * self.chunk_size)))
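
A minimal sketch of reading the DEFAULT section this constructor consumes, assuming getConfParser wraps the standard-library ConfigParser (Python 2) and using a hypothetical file name:

    import ConfigParser
    parser = ConfigParser.ConfigParser()
    parser.read("simulator.conf")            # hypothetical file name
    d = parser.defaults()                    # the DEFAULT section as a dict
    total_time = int(d["total_time"])        # keys match those consumed above
    data_redundancy = d["data_redundancy"]   # e.g. "RS_9_6" (format assumed)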