Exemple #1
0
    def test_number_of_machines(self):
        print 'test_number_of_machines'
        result = False
        wf = Workflow()
        wf.add_workflow(self.tutorial_dir_1, None)

        try:
            entries, n, costs = number_of_machines(wf, [], [], 1,
                                                   datetime(2000, 1, 1), 1)
            print i, n, costs
            print_sched(entries)
        except BudgetException as e:
            print e

        for i in range(1, 5):
            try:
                entries, n, costs = number_of_machines(wf, [], [], i,
                                                       datetime(2000, 1, 1),
                                                       10)
                print i, n, costs
                print_sched(entries)
            except BudgetException as e:
                print e

        result = True
        self.assertTrue(result)
Exemple #2
0
 def test_nmax(self):
     print 'test_nmax'
     result = False
     wf = Workflow()
     wf.add_workflow(self.tutorial_dir_1, None)
     
     print get_nmax(wf, [], [], datetime(2000,1,1)) 
     result = True
     self.assertTrue(result)
Exemple #3
0
    def test_nmax(self):
        print 'test_nmax'
        result = False
        wf = Workflow()
        wf.add_workflow(self.tutorial_dir_1, None)

        print get_nmax(wf, [], [], datetime(2000, 1, 1))
        result = True
        self.assertTrue(result)
Exemple #4
0
 def test_workflow(self):
     print 'test_workflow'
     result = False
     
     w = Workflow()
     w.add_workflow(self.tutorial_dir_1, None)
     for j in w.jobs:
         print j.id
     
     print [j.rank for j in w.ranked_jobs]
Exemple #5
0
    def test_cost_n(self):
        print 'test_cost_n'
        result = False
        wf = Workflow()
        wf.add_workflow(self.tutorial_dir_1, None)

        for i in [1, 2, 3]:
            entries, cost = sched_cost_n(wf, [], [], i, datetime(2000, 1, 1))
            print i, cost
            print_sched(entries)

        result = True
        self.assertTrue(result)
Exemple #6
0
    def test_workflow(self):
        print 'test_workflow'
        result = False

        w = Workflow()
        w.add_workflow(self.tutorial_dir_1, None)
        for j in w.jobs:
            print j.id

        print[j.rank for j in w.ranked_jobs]

        result = True
        self.assertTrue(result)
Exemple #7
0
 def test_cost_n(self):
     print 'test_cost_n'
     result = False
     wf = Workflow()
     wf.add_workflow(self.tutorial_dir_1, None)
     
     for i in [1,2,3]:
         entries, cost = sched_cost_n(wf, [], [], i, datetime(2000,1,1))
         print i, cost
         print_sched(entries)
     
     
     result = True
     self.assertTrue(result)
Exemple #8
0
    def test_merge_workflows(self):
        print 'test_merge_workflows'
        result = False

        w = Workflow()
        w.add_workflow(self.tutorial_dir_1, None)
        w.add_workflow(self.tutorial_dir_2, None)

        for wid in Set([j.wf_id for j in w.jobs]):
            print wid
            for j in [j for j in w.jobs if j.wf_id == wid]:
                print j.id

        result = True
        self.assertTrue(result)
Exemple #9
0
 def test_merge_workflows(self):
     print 'test_merge_workflows'
     result = False
     
     w = Workflow()
     w.add_workflow(self.tutorial_dir_1, None)
     w.add_workflow(self.tutorial_dir_2, None)
      
     for wid in Set([j.wf_id for j in w.jobs]):
         print wid
         for j in [j for j in w.jobs if j.wf_id == wid]:
             print j.id           
     
     result = True 
     self.assertTrue(result)
Exemple #10
0
    def test_number_of_machines(self):
        print 'test_number_of_machines'
        result = False
        wf = Workflow()
        wf.add_workflow(self.tutorial_dir_1, None)

        try:
            entries, n, costs = number_of_machines(wf, [], [], 1, datetime(2000,1,1), 1)
            print i, n, costs
            print_sched(entries)
        except BudgetException as e:
            print e
        
        for i in range(1,5):
            try:
                entries, n, costs = number_of_machines(wf, [], [], i, datetime(2000,1,1), 10)
                print i, n, costs
                print_sched(entries)
            except BudgetException as e:
                print e

            
        result = True
        self.assertTrue(result)
Exemple #11
0
class Monitor():
    def __init__(self):
        self.workflow = Workflow()
        self.creation_timestamp = self.timestamp = datetime.now()
        self.logwatcher = LogWatcher()

        manager = Machine()
        manager.status = MachineStatus.manager
        manager.condor_slot = 'local'
        self.machines = [manager]

        boot_entry = ScheduleEntry(Job('boot', None), manager, None, None)
        boot_entry.real_start = self.timestamp
        boot_entry.real_end = self.timestamp
        boot_entry.status = EntryStatus.completed
        self.entries = [boot_entry]
        self.entries_cid = {}

    def add_workflow(self, workflow_dir):
        wf_id = self.workflow.add_workflow(workflow_dir)
        self.logwatcher.add(wf_id, workflow_dir)

    def sync_machines(self):
        slots = condor_slots()
        for s in slots:
            if s not in [m.condor_slot for m in self.machines]:
                machine = Machine()
                machine.status = MachineStatus.running
                machine.condor_slot = s
                boot_job = Job('boot', None)
                boot_entry = ScheduleEntry(boot_job, machine, None, None)
                boot_entry.log[LogKey.real_start] = self.creation_timestamp
                boot_entry.log[LogKey.real_end] = self.timestamp
                boot_entry.status = EntryStatus.completed
                self.entries.append(boot_entry)
                self.machines.append(machine)
                print "++Machine", s

    def sync_jobs(self):
        log_entries = self.logwatcher.nexts()
        for le in log_entries:
            if le.id in self.entries_cid:  # in dict keys
                entry = self.entries_cid[le.id]
            else:
                entry = ScheduleEntry(condor_id=le.id)
                self.entries.append(entry)
                self.entries_cid[le.id] = entry
                print "++Job", le.id

            entry.log[le.event] = le.timestamp

            if le.event == LogKey.execute:
                entry.status = EntryStatus.executing
            elif le.event == LogKey.job_terminated:
                entry.status = EntryStatus.completed
                wf_id, dag_job_id, slot = condor_history(le.id)

                job = next(
                    (j for j in self.workflow.jobs
                     if j.dag_job_id == dag_job_id and j.wf_id == wf_id), None)
                if job:
                    entry.job = job
                    entry.host = next(
                        (m for m in self.machines if m.condor_slot == slot),
                        self.machines[0])
                    print "--Job", le.id, dag_job_id, entry.host.condor_slot

    def update_timestamp(self):
        self.timestamp = datetime.now()
Exemple #12
0
class Monitor():
    def __init__(self):
        self.workflow = Workflow()
        self.creation_timestamp = self.timestamp = datetime.now()
        self.logwatcher = LogWatcher()
        
        manager = Machine()
        manager.status = MachineStatus.manager
        manager.condor_slot = 'local'
        self.machines = [manager]
        
        boot_entry = ScheduleEntry(Job('boot', None), manager, None, None)
        boot_entry.real_start = self.timestamp
        boot_entry.real_end = self.timestamp
        boot_entry.status = EntryStatus.completed
        self.entries = [boot_entry]
        self.entries_cid = {}
        
    def add_workflow(self, workflow_dir):
        wf_id = self.workflow.add_workflow(workflow_dir)
        self.logwatcher.add(wf_id, workflow_dir)
            
    def sync_machines(self):
        slots = condor_slots()
        for s in slots:
            if s not in [m.condor_slot for m in self.machines]:
                machine = Machine()
                machine.status = MachineStatus.running
                machine.condor_slot = s
                boot_job = Job('boot', None)
                boot_entry = ScheduleEntry(boot_job, machine, None, None)
                boot_entry.log[LogKey.real_start] = self.creation_timestamp
                boot_entry.log[LogKey.real_end] = self.timestamp
                boot_entry.status = EntryStatus.completed
                self.entries.append(boot_entry)
                self.machines.append(machine)
                print "++Machine", s
                
    def sync_jobs(self):
        log_entries = self.logwatcher.nexts()
        for le in log_entries:
            if le.id in self.entries_cid: # in dict keys
                entry = self.entries_cid[le.id]
            else:
                entry = ScheduleEntry(condor_id=le.id)
                self.entries.append(entry)
                self.entries_cid[le.id] = entry
                print "++Job", le.id
                
            entry.log[le.event] = le.timestamp
            
            if le.event == LogKey.execute:
                entry.status = EntryStatus.executing
            elif le.event == LogKey.job_terminated:
                entry.status = EntryStatus.completed
                wf_id, dag_job_id, slot = condor_history(le.id)
                
                job = next((j for j in self.workflow.jobs if j.dag_job_id == dag_job_id and j.wf_id == wf_id), None)
                if job:
                    entry.job = job
                    entry.host = next((m for m in self.machines if m.condor_slot == slot), self.machines[0])
                    print "--Job", le.id, dag_job_id, entry.host.condor_slot
            
    def update_timestamp(self):
        self.timestamp = datetime.now()
Exemple #13
0
class Provisioner():
    def __init__(self, vm_limit, azure_config, skip_setup, local):
        self.vm_limit = vm_limit # user input
        self.budget = 0
        self.timestamp = datetime.now()
        self.cost_pred = 0
        self.wf_end = None
        
        self.jobs_terminated = False
        self.last_resched = None
        
        self.workflow = Workflow()
        self.logwatcher = LogWatcher()
        
        self.schedule = Schedule()
        
        manager = Machine()
        manager.status = MachineStatus.manager
        manager.condor_slot = 'manager'
        self.machines = [manager]
        
        boot_entry = ScheduleEntry(Job('boot', None), manager, self.timestamp, self.timestamp)
        boot_entry.real_start = self.timestamp
        boot_entry.real_end = self.timestamp
        boot_entry.status = EntryStatus.completed
        self.schedule.add_entry_host(boot_entry, manager)
        
        self.local = local
        if azure_config and not local:
            hostname = socket.gethostname()
            self.exp = AzureExperiment(azure_config, skip_setup=skip_setup, name=hostname)
            self.master_addr = socket.gethostbyname(hostname)
            self.user = azure_config.admin_username
        else:
            self.exp = self.master_addr = self.user = None
        
    def add_workflow(self, workflow_dir, prediction_file, budget):
        self.budget = self.budget + int(round(float(budget)))
        wf_id = self.workflow.add_workflow(workflow_dir, prediction_file=prediction_file)
        self.logwatcher.add(wf_id, workflow_dir)
            
    def update_schedule(self):
        print 'UPDATE SCHED'
        self.update_budget_timestamp()
        self.last_resched = self.timestamp 
    
        # completed and running entries will not change
        self.schedule.rm_scheduled_entries()

        if self.workflow.has_jobs_to_sched(self.schedule):
            # Max number of vms
            nmax = get_nmax(self.workflow, self.machines, self.schedule, self.vm_limit, self.timestamp, self.local)

            print 'NMAX',nmax
            
            # Get the number of machines to be used
            schedule, _cost, _n = sched_number_of_machines(self.workflow, self.machines, self.schedule, nmax, self.timestamp, self.budget, self.local)
            print "N", _n, 'budget', self.budget
            
            # Update schedule
            self.schedule = schedule

    def update_budget_timestamp(self):
        timestamp = datetime.now()
        if self.timestamp != None:
            # Supondo vm_cost em cost/second
            # Supondo que não houve mudança no número de máquinas
            # desde o ultimo self.timestamp
            delta = (timestamp - self.timestamp).seconds
            charged = delta * len(self.machines) * VM_COST_PER_SEC
            self.budget = self.budget - charged
        self.timestamp = timestamp
        
    def update_wf_pred(self):
        self.cost_pred, self.wf_end = sched_cost_pred(self.machines, self.schedule, self.timestamp)

    def allocate_new_vms(self):
        # boot entries
        if self.schedule != None:
            for m in self.schedule.entries_host.keys():
                entry = self.schedule.entries_host[m][0]
                if entry.status == EntryStatus.scheduled and entry.start() <= self.timestamp:
                    m.allocate(self.exp, self.master_addr, self.user)
                    
                    self.machines.append(m)
                    entry.status = EntryStatus.executing
                    entry.log[LogKey.real_start] = self.timestamp
        
    
    def deallocate_vms(self):
        for m in self.machines:
            if m.status == MachineStatus.manager:
                continue
            
            # if there's no more budget or
            # if there's nothing executing or scheduled to the machine
            if self.schedule == None or len([e for e in self.schedule.entries_host[m] if e.status != EntryStatus.completed]) == 0:
                m.deallocate(self.exp)
                print "--Machine", m.condor_slot
                
        # update machine list
        self.machines = [m for m in self.machines if m.status != MachineStatus.deallocating]
    
    
    def sync_machines(self):
        slots_addrs = condor_slots()
        running_machines = [m for m in self.machines if m.status == MachineStatus.running]
        allocating_machines = [m for m in self.machines if m.status == MachineStatus.allocating]
        #allocating_machines.sort(key=lambda x: self.schedule.entries_host[x][0].start())
        i = 0
        for (slot,addr) in slots_addrs:
            if slot not in [m.condor_slot for m in running_machines]:
                allocated_machine = None
                if not self.local:
                    allocated_machine = next((m for m in allocating_machines if m.priv_addr == addr), None)
                elif len(allocating_machines[i:]) > 0:
                    # update machine
                    allocated_machine = allocating_machines[i]
                
                if allocated_machine:
                    allocated_machine.status = MachineStatus.running
                    allocated_machine.condor_slot = slot
                    
                    # update entry
                    boot_entry = self.schedule.entries_host[allocated_machine][0]
                    boot_entry.log[LogKey.real_end] = self.timestamp
                    boot_entry.status = EntryStatus.completed
                
                    i += 1
                    print "++Machine", allocated_machine.condor_slot
                else:
                    if next((e for e in self.schedule.entries if e.host.priv_addr == addr and e.status != EntryStatus.completed), None):
                        print "ERROR: slot not found", slot, addr, 'nr', len(running_machines), 'na', len(allocating_machines)
                

    
    def _handle_log_events(self):
        jobs_terminated = False
        log_entries = self.logwatcher.nexts()
        
        for le in log_entries:
            if le.id in self.schedule.entries_cid:
                sched_entry = self.schedule.entries_cid[le.id]
            else:
                sched_entry = next((e for e in self.schedule.entries if e.job.dag_job_id == le.name and e.job.wf_id == le.wf_id), None)
                if sched_entry:
                    sched_entry.condor_id = le.id
                    self.schedule.add_entry_cid(sched_entry)
            if sched_entry:
                sched_entry.log[le.event] = le.timestamp
                
                if le.event == LogKey.execute:
                    sched_entry.status = EntryStatus.executing
            
                elif le.event == LogKey.job_terminated:
                    sched_entry.status = EntryStatus.completed 
                    sched_entry.log[LogKey.real_end] = self.timestamp
                    print "--Job", le.id, sched_entry.job.dag_job_id, sched_entry.host.condor_slot
                    jobs_terminated = True
            else:
                print 'could not find sched_entry for:', le.id
        return jobs_terminated
                
    def _handle_ready_jobs(self):    
        need_condor_resched = False
        idle_cjobs = condor_idle() # idle jobs

        for cjob in idle_cjobs:
            condor_id, wf_id, dag_job_id = cjob.split()
            if condor_id in self.schedule.entries_cid:
                sched_entry = self.schedule.entries_cid[condor_id]
            else:
                sched_entry = next((e for e in self.schedule.entries \
                                    if e.job.dag_job_id == dag_job_id \
                                    and e.job.wf_id == wf_id ), None)
                if sched_entry:
                    sched_entry.condor_id = condor_id
                    self.schedule.add_entry_cid(sched_entry)

            if sched_entry and sched_entry.status == EntryStatus.scheduled \
                    and sched_entry.host.status == MachineStatus.running:
                sched_entry.status = EntryStatus.executing
                sched_entry.log[LogKey.real_start] = self.timestamp
                print "++Job", condor_id, dag_job_id, sched_entry.host.condor_slot
                condor_qedit(condor_id, wf_id, dag_job_id, sched_entry.host.condor_slot)
                need_condor_resched = True

        if need_condor_resched:
            condor_reschedule()

    def update_jobs(self):
        
        # handle log events and check if any job terminated
        self.jobs_terminated = self._handle_log_events() or self.jobs_terminated
        
        # need to update schedule (?)
        if self.last_resched and self.jobs_terminated and \
        ((self.timestamp - self.last_resched).seconds > SCHED_TIMEOUT):
            self.update_schedule()
            self.jobs_terminated = False
        
        # handle jobs that are ready to execute
        self._handle_ready_jobs()