Ejemplo n.º 1
0
    def __init__(self, name, parent, parameters):
        """Create a machine node and load its failure-related settings.

        Args:
            name: forwarded unchanged to the parent-class constructor.
            parent: forwarded unchanged to the parent-class constructor.
            parameters: object queried through ``get``; supplies
                ``fail_fraction``, ``fail_timeout``, ``fast_forward`` and
                ``eager_recovery_enabled``.
        """
        # Take the next id from the class-wide counter.
        self.my_id = Machine.id_counter
        Machine.id_counter += 1
        super(Machine, self).__init__(name, parent, parameters)

        # Recovery generator for permanent machine failure.
        self.recovery_generator2 = None

        lookup = parameters.get

        # Fraction of machine failures that are permanent.
        # NOTE(review): this is stored on the class, so each new Machine
        # overwrites the value seen by every instance - confirm intended.
        Machine.fail_fraction = float(lookup("fail_fraction", 0.008))

        # Amount of time after which a machine failure is treated as
        # permanent, and eager disk recovery is begun, if
        # eager_recovery_enabled is True.
        self.fail_timeout = float(lookup("fail_timeout", 0.25))

        # If True, machine failure and recovery durations will be generated
        # but ignored.
        self.fast_forward = bool(lookup("fast_forward"))
        self.eager_recovery_enabled = bool(lookup("eager_recovery_enabled"))

        self.fail_durations = []

        self.machine_repair_time = Configuration().node_repair_time
Ejemplo n.º 2
0
    def run(self):
        """Run one simulation iteration and return the handler's result.

        Steps, in order: build the configuration and the slice distributer,
        schedule the optional system-upgrade, correlated-failure and
        system-scaling scenarios, generate the event queue from the root of
        the distributer's tree, optionally dump the events to a file, then
        feed every queued event to a fresh event handler.

        Returns:
            The object returned by ``handler.end()``; its ``toString()`` is
            also written to ``info_logger``.
        """
        conf = Configuration(self.conf_path)
        xml = XMLParser(conf)
        if conf.hier:
            self.distributer = HierSSSDistribute(xml)
        else:
            self.distributer = SSSDistribute(xml)
        # From here on use the configuration object owned by the distributer.
        self.conf = self.distributer.returnConf()

        self.event_handler = EventHandler
        self.distributer.start()
        events = EventQueue()

        if self.conf.system_upgrade:
            for info in self.conf.system_upgrade_infos:
                # Only entries whose first field is 1 are scheduled here.
                if info[0] != 1:
                    continue
                upgrade_start_times = self.addSystemUpgrade(
                    info, self.conf.total_time)
                if info[-1] is not None:
                    self.addUpgradeCheckEvents(events, upgrade_start_times,
                                               info[-1])

        if self.conf.correlated_failures:
            for info in self.conf.correlated_failures_infos:
                # Repeat each configured failure ten times, shifting the
                # first field by 8760 per repetition.
                # NOTE(review): 8760 == 24 * 365, presumably hours per year -
                # confirm the time unit.
                for i in range(10):
                    cf_info = deepcopy(list(info))
                    cf_info[0] += i * 8760
                    print("correlated_failures info: %s" % (cf_info,))
                    self.addCorrelatedFailures(cf_info)

        if self.conf.system_scaling:
            for info in self.conf.system_scaling_infos:
                self.addSystemScaling(info)

        info_logger.info("disk usage is: " +
                         str(self.distributer.diskUsage() * 100) + "%\n")
        self.distributer.getRoot().printAll()

        root = self.distributer.getRoot()
        root.generateEvents(events, 0, self.conf.total_time, True)
        for ts in self.conf.upgrade_ts:
            full_system_check_event = Event(Event.EventType.UpgradeCheck, ts,
                                            root, 6)
            events.addEvent(full_system_check_event)

        # Identity test: None is a singleton and "!=" can be overridden.
        if self.conf.event_file is not None:
            events_file = self.conf.event_file + '-' + self.ts
            events.printAll(events_file,
                            "Iteration number: " + str(self.iteration_times))
        self.iteration_times += 1

        handler = self.event_handler(self.distributer)

        print("total slices: %s" % (handler.total_slices,))

        # Drain the queue; handlers may push follow-up events while running,
        # which is why the queue itself is passed to handleEvent.
        events_handled = 0
        e = events.removeFirst()
        while e is not None:
            handler.handleEvent(e, events)
            events_handled += 1
            e = events.removeFirst()

        self.total_events_handled += events_handled

        result = handler.end()
        info_logger.info(result.toString())
        return result
Ejemplo n.º 3
0
    def __init__(self, durations):
        """Store the durations and derive the failure tolerance.

        Args:
            durations: container of failure durations kept on the instance
                as ``self.durations``.
        """
        self.durations = durations

        self.conf = Configuration()
        handler = self.conf.getDRSHandler()
        self.drs_handler = handler
        self.isMDS = handler.isMDS

        redundancy = handler.n - handler.k
        # We only consider LRC with l = 2, so for non-MDS codes we need to
        # consider failures more than n-k-1.
        self.ft = redundancy if self.isMDS else redundancy - 1

        # Statistics counters, all starting at zero.
        self.concurrent_count = 0
        self.lost_concurrent_count = 0
        self.total_failure_slice_count = 0
Ejemplo n.º 4
0
 def __init__(self, name, parent, parameters):
     """Create a disk node and copy its settings from the configuration.

     All three arguments are forwarded unchanged to the parent-class
     constructor.
     """
     super(Disk, self).__init__(name, parent, parameters)

     settings = Configuration()
     self.disk_capacity = settings.disk_capacity
     self.disk_repair_time = settings.disk_repair_time
     self.chunk_repair_time = settings.chunk_repair_time

     # No latent-sector-error hits recorded yet.
     self.slices_hit_by_LSE = []

     # Both generators start unset.
     self.latent_error_generator = None
     self.scrub_generator = None
Ejemplo n.º 5
0
    def __init__(self, conf_path):
        """Load the configuration and build the matching distributer.

        Args:
            conf_path: passed to ``Configuration`` to load the settings.
        """
        self.iteration_times = 1
        # Timestamp string fixed once at construction time.
        self.ts = strftime("%Y%m%d.%H.%M.%S")
        self.total_events_handled = 0

        self.conf = Configuration(conf_path)
        parsed = XMLParser(self.conf)
        # The hierarchical layout is used when the configuration asks for it.
        distribute_cls = HierSSSDistribute if self.conf.hier else SSSDistribute
        self.distributer = distribute_cls(parsed)
Ejemplo n.º 6
0
    def run(self):
        """Run one simulation iteration and return the handler's result.

        Steps, in order: build the configuration and the distributer chosen
        by ``returnDistributer``, pick the event-handler class, generate the
        event queue from the root of the distributer's tree, optionally dump
        the events to a file, then feed every queued event to a fresh
        handler.

        Returns:
            The object returned by ``handler.end()``; its ``toString()`` is
            also written to ``info_logger``.
        """
        conf = Configuration(self.conf_path)
        xml = XMLParser(conf)
        distributer_class = returnDistributer(conf.data_placement,
                                              conf.hierarchical)
        self.distributer = distributer_class(xml)
        # From here on use the configuration object owned by the distributer.
        self.conf = self.distributer.returnConf()

        # RAFI recovery has its own handler class.
        if self.conf.rafi_recovery:
            self.event_handler = RAFIEventHandler
        else:
            self.event_handler = EventHandler
        self.distributer.start()

        info_logger.info("disk usage is: " +
                         str(self.distributer.diskUsage() * 100) + "%\n")
        self.distributer.getRoot().printAll()

        events = EventQueue()

        root = self.distributer.getRoot()
        root.generateEvents(events, 0, self.conf.total_time, True)

        # Identity test: None is a singleton and "!=" can be overridden.
        if self.conf.event_file is not None:
            events_file = self.conf.event_file + '-' + self.ts
            events.printAll(events_file,
                            "Iteration number: " + str(self.iteration_times))
        self.iteration_times += 1

        handler = self.event_handler(self.distributer)

        print("total slices: %s" % (handler.total_slices,))

        # Drain the queue; handlers may push follow-up events while running,
        # which is why the queue itself is passed to handleEvent.
        events_handled = 0
        e = events.removeFirst()
        while e is not None:
            handler.handleEvent(e, events)
            events_handled += 1
            e = events.removeFirst()

        self.total_events_handled += events_handled

        result = handler.end()
        info_logger.info(result.toString())
        return result
Ejemplo n.º 7
0
def returnLayerArch(layer):
    """Build the system tree from the configured component counts.

    The tree is rooted at "SYS"; below it come "DC<i>" nodes, then "R<i>",
    then "M<i>", and finally "H<i>" leaves, with the fan-out at each level
    read from the configuration.

    Args:
        layer: not used by this function.

    Returns:
        The populated ``Tree``.
    """
    conf = Configuration()
    dc_total = conf.datacenters
    rack_total = conf.racks
    machine_total = conf.machines_per_rack
    disk_total = conf.disks_per_machine

    layer_tree = Tree("SYS")
    for dc_no in xrange(dc_total):
        dc_node = layer_tree.addChild("DC" + str(dc_no))
        for rack_no in xrange(rack_total):
            rack_node = dc_node.addChild("R" + str(rack_no))
            for machine_no in xrange(machine_total):
                machine_node = rack_node.addChild("M" + str(machine_no))

                # SSD media are not considered yet; every leaf is "H<i>".
                for disk_no in xrange(disk_total):
                    machine_node.addChild("H" + str(disk_no))

    return layer_tree
Ejemplo n.º 8
0
    CopySet(Cidon et al. 2013)
    """
    pass


class RandomDistributeSameRack(RandomDistribute):
    """
    Random placement with some blocks sharing a rack.

    HDFS: three replicas; two of them on different machines of the same
        rack, the third one on a different rack.
    QFS: n blocks, several of them on the same rack.

    This subclass defines no members of its own; all behaviour comes from
    RandomDistribute.
    """


class RandomDistributeDRC(RandomDistribute):
    """
    DRC: Double Regenerating Codes (Hu et al. 2017).

    This subclass defines no members of its own; all behaviour comes from
    RandomDistribute.
    """
    pass


if __name__ == "__main__":
    # Manual run: build the layout and dump it, then scale the system once
    # and dump it again.
    conf = Configuration()
    xml = XMLParser(conf)
    sss = RandomDistributeSSS(xml)
    sss.start()

    # First pass dumps the initial layout (no scaling); the second pass
    # applies the scaling step before dumping.
    for scaling_args in (None, (1000, 0.1, 20000, 3, 9000, True)):
        if scaling_args is not None:
            sss.systemScaling(*scaling_args)
        sss.printTest()
        sss.printToFile()
Ejemplo n.º 9
0
def returnLayers():
    """Return one ``Layer`` per configured tier, numbered from 1.

    Returns:
        A list of ``Layer`` objects built from ids 1 to ``conf.tier_num``
        inclusive, in ascending order.
    """
    tier_total = Configuration().tier_num
    return [Layer(tier_id) for tier_id in xrange(1, tier_total + 1)]
Ejemplo n.º 10
0
 def __init__(self, name, parent, parameters):
     """Create a disk node and copy its settings from the configuration.

     All three arguments are forwarded unchanged to the parent-class
     constructor.
     """
     super(Disk, self).__init__(name, parent, parameters)

     settings = Configuration()
     self.disk_capacity = settings.disk_capacity
     self.disk_repair_time = settings.disk_repair_time

     # No latent-sector-error hits recorded yet.
     self.slices_hit_by_LSE = []
Ejemplo n.º 11
0
        machines_set = [[0 for i in xrange(len(machines[0]))]
                        for j in xrange(len(machines))]
        for i, copy_set in enumerate(copy_sets):
            format_output = "copy set " + str(i) + ": "
            for machine in copy_set:
                format_output += "  " + machine.toString()
                for j, rack_machines in enumerate(machines):
                    if machine in rack_machines:
                        machine_index = rack_machines.index(machine)
                        machines_set[j][machine_index] += 1
        return copy_sets


if __name__ == "__main__":
    # Manual run against a fixed configuration file: distribute the
    # configured number of slices and write the layout to a file.
    conf = Configuration("/root/CR-SIM/conf/cr-sim.conf")
    xml = XMLParser(conf)
    distribute = COPYSETDistribute(xml)

    # The result is not used below; the call itself is kept.
    machines = distribute.getAllMachines()

    root_node = distribute.getRoot()
    slice_total = distribute.conf.total_slices
    distribute.distributeSlices(root_node, slice_total)
    distribute.printToFile()

# The bare string below is disabled debugging code; it is never executed.
"""
    total_slices = 419431
    distribute.distributeSlices(distribute.getRoot(), total_slices)
    for i in xrange(total_slices):
        format_output = "slice index " + str(i) + ": "
        for disk in distribute.slice_locations[i]:
            format_output += "  " + disk.toString()
        print format_output"""
Ejemplo n.º 12
0
 def __init__(self, layer_id):
     """Parse the XML description of one layer and load the configuration.

     Args:
         layer_id: used to build the file name ``layer_<layer_id>.xml``
             inside ``CONF_PATH``.
     """
     file_name = "layer_" + str(layer_id) + ".xml"
     layer_path = os.sep.join([CONF_PATH, file_name])

     # Keep both the parsed document and its root element.
     self.tree = ET.parse(layer_path)
     self.root = self.tree.getroot()
     self.conf = Configuration()
Ejemplo n.º 13
0
class HandleDuration(object):
    """Find overlapping failure durations and count the slices they hit.

    ``findConcurrent`` scans the durations in the order the container yields
    them and records every period in which more than ``ft`` durations
    overlap. ``process`` then counts, per period, how many slices have more
    than ``ft`` failed units.
    """

    # Default for the flag reported by isHandleLost(). Nothing in this class
    # assigns it, so without a default the accessor raised AttributeError.
    handle_only_lost = False

    def __init__(self, durations):
        """Store the durations and derive the failure tolerance ``ft``.

        Args:
            durations: container of durations; ``findConcurrent`` calls
                ``clone()``, ``size()`` and ``removeFirst()`` on it.
        """
        self.durations = durations

        self.conf = Configuration()
        self.drs_handler = self.conf.getDRSHandler()
        self.isMDS = self.drs_handler.isMDS
        ft = self.drs_handler.n - self.drs_handler.k
        if self.isMDS:
            self.ft = ft
        else:
            # We only consider LRC with l = 2, so we
            # need to consider failures more than n-k-1
            self.ft = ft - 1

        self.concurrent_count = 0
        self.lost_concurrent_count = 0
        self.total_failure_slice_count = 0

    def returnConcurrentCount(self):
        """Return the number of concurrent periods found so far."""
        return self.concurrent_count

    def returnFailureSliceCount(self):
        """Return the failed-slice total computed by the last ``process``."""
        return self.total_failure_slice_count

    def isHandleLost(self):
        """Return the ``handle_only_lost`` flag (False unless set)."""
        return self.handle_only_lost

    def findConcurrent(self):
        """Collect the periods in which more than ``ft`` durations overlap.

        Works on a clone of ``self.durations``, so the stored container is
        left untouched. Increments ``concurrent_count`` for every period
        found and ``lost_concurrent_count`` for those triggered by a duration
        of type ``Duration.DurationType.Loss``.

        Returns:
            ``(lost_concurrent_durations, concurrent_durations)``; both are
            dicts of the form ``{(start time, end time): [units], ...}``.
        """
        active = []
        concurrent_durations = {}
        lost_concurrent_durations = {}
        last_concurrent_period = None

        durations = self.durations.clone()
        print("duration size: %s" % (durations.size(),))

        while durations.size() != 0:
            d = durations.removeFirst()
            current_time = d.getStartTime()

            # Forget durations that ended at or before the new start time.
            active = [a for a in active if a.getEndTime() > current_time]
            active.append(d)
            if len(active) <= self.ft:
                continue

            # The overlap runs from the latest start to the earliest end.
            concurrent_period = (max(a.getStartTime() for a in active),
                                 min(a.getEndTime() for a in active))

            if (last_concurrent_period is not None
                    and concurrent_period[0] < last_concurrent_period[1]):
                # The new period begins inside the previous one: replace the
                # previous entry by the part before the new period and, if
                # any, the part after it.
                pop_units = concurrent_durations.pop(last_concurrent_period)
                concurrent_durations[(last_concurrent_period[0],
                                      concurrent_period[0])] = pop_units
                if concurrent_period[1] < last_concurrent_period[1]:
                    concurrent_durations[(concurrent_period[1],
                                          last_concurrent_period[1])] = pop_units

            concurrent_units = [a.getUnit() for a in active]
            concurrent_durations[concurrent_period] = concurrent_units

            if d.getType() == Duration.DurationType.Loss:
                lost_concurrent_durations[concurrent_period] = concurrent_units
                self.lost_concurrent_count += 1

            last_concurrent_period = concurrent_period
            self.concurrent_count += 1

        print("lost concurrent count: %s" % (self.lost_concurrent_count,))
        print("concurrent count: %s" % (self.concurrent_count,))
        return lost_concurrent_durations, concurrent_durations

    def process(self, concurrent_durations, distributer):
        """Count slice failures over the given concurrent periods.

        Resets and then accumulates ``self.total_failure_slice_count``.

        Args:
            concurrent_durations: dict ``{(start, end): [units], ...}``.
            distributer: object providing ``diskUsage()`` and
                ``getAllDisksInRack(rack, out_list)``.

        Returns:
            A tuple ``(total_failure_times, total_failure_slice_count,
            total_failure_period, failure_period_with_weight)`` where the
            last value is the sum of period length times failed slices.

        Raises:
            Exception: if a unit is not a Sector, Rack, Machine or Disk.
        """
        total_failure_times = 0
        self.total_failure_slice_count = 0
        total_failure_period = 0.0
        # failure_period * failure_slice_count
        failure_period_with_weight = 0.0

        for period, f_units in concurrent_durations.items():
            period_length = period[1] - period[0]
            # key:slice_index, value:failure count in slice
            slice_failures = {}

            for u in f_units:
                if isinstance(u, Sector):
                    # A sector error only matters when it lands on a used
                    # part of the disk.
                    if random() > distributer.diskUsage():
                        continue
                    hit_slices = [choice(u.parent.getChildren())]
                else:
                    disks = []
                    if isinstance(u, Rack):
                        distributer.getAllDisksInRack(u, disks)
                    elif isinstance(u, Machine):
                        disks += u.getChildren()
                    elif isinstance(u, Disk):
                        disks.append(u)
                    else:
                        raise Exception("Invalid unit")
                    hit_slices = [s for disk in disks
                                  for s in disk.getChildren()]

                for slice_index in hit_slices:
                    slice_failures[slice_index] = (
                        slice_failures.get(slice_index, 0) + 1)

            failure_slice_count = 0
            for num in slice_failures.values():
                if num <= self.ft:
                    continue
                if not self.isMDS and num == self.ft + 1:
                    # Non-MDS code with exactly ft + 1 failures: counted as
                    # a failure only when the draw is below the threshold.
                    r2 = random()
                    if r2 >= self.drs_handler.threshold:
                        continue
                    failure_slice_count += 1
                    print("random:%f, failures:%d,period:%f, threshold:%f" % (
                        r2, failure_slice_count, period_length,
                        self.drs_handler.threshold))
                else:
                    failure_slice_count += 1

            self.total_failure_slice_count += failure_slice_count
            failure_period_with_weight += failure_slice_count * period_length

            # A period counts once as soon as any slice failed in it.
            if failure_slice_count:
                total_failure_times += 1
                total_failure_period += period_length

        return (total_failure_times, self.total_failure_slice_count,
                total_failure_period, failure_period_with_weight)

    def getConcurrent(self, concurrent_durations, num):
        """Return the periods that contain exactly ``num`` failed units.

        Args:
            concurrent_durations: dict ``{(start, end): [units], ...}``.
            num: required length of the unit list.

        Returns:
            A new dict holding only the matching entries.
        """
        return {period: units
                for period, units in concurrent_durations.items()
                if len(units) == num}

    def printAll(self, concurrent_durations):
        """Print one line per period, in ascending period order.

        Each line is "<start>  <end> " followed by " <unit.toString()>" for
        every unit of the period.
        """
        # sorted() works on both list and view results of iterating a dict.
        for period in sorted(concurrent_durations):
            prefix = str(period[0]) + "  " + str(period[1]) + " "
            units = "".join(" " + u.toString()
                            for u in concurrent_durations[period])
            print(prefix + units)

    def printToFile(self, file_path, concurrent_durations):
        """Placeholder: currently does nothing."""
        pass