Esempio n. 1
0
 def wait_async_file(self, id, eof=None, bytes=None):
     """Wait on one of this object's ongoing fetches and report its progress.

     The fetch is located in ``self.ongoing_fetches`` by ``fetch.ref.id``.
     When ``eof`` is not None the call blocks in ``wait_eof()``; otherwise
     it blocks in ``wait_bytes(bytes)``.

     Returns ``{"success": False}`` if no fetch with that id is active, or
     if the fetch is done without success.  Otherwise returns a dict with
     ``size`` (``int(fetch.bytes)``), ``done`` and ``"success": True``.
     """
     target = next(
         (candidate for candidate in self.ongoing_fetches
          if candidate.ref.id == id),
         None)
     if target is None:
         ciel.log(
             "Failed to wait for async-fetch %s: not an active transfer" %
             id, "EXEC", logging.WARNING)
         return {"success": False}

     if eof is None:
         # NOTE(review): if bytes is also None the %d formatting below
         # raises TypeError; callers presumably always supply one of them.
         ciel.log(
             "Waiting for fetch %s length to exceed %d bytes" % (id, bytes),
             "EXEC", logging.DEBUG)
         target.wait_bytes(bytes)
     else:
         ciel.log("Waiting for fetch %s to complete" % id, "EXEC",
                  logging.DEBUG)
         target.wait_eof()

     if target.done and not target.success:
         ciel.log("Wait %s complete: transfer has failed" % id, "EXEC",
                  logging.WARNING)
         return {"success": False}

     progress = {
         "size": int(target.bytes),
         "done": target.done,
         "success": True
     }
     ciel.log(
         "Wait %s complete: new length=%d, EOF=%s" %
         (id, progress["size"], progress["done"]), "EXEC", logging.DEBUG)
     return progress
Esempio n. 2
0
 def copy_loop(self):
     """Copy ``self.read_filename`` into ``self.write_filename`` as data arrives.

     Data is read in 4096-byte chunks until a short read, after which the
     thread waits on ``self.condvar`` until ``bytes_available`` reaches the
     next threshold (``bytes_copied + fetch_ip.chunk_size``) or
     ``fetch_done`` is set.  The loop ends, setting ``stream_done`` and
     notifying all condvar waiters, once ``success`` is False or every
     available byte has been copied and the fetch is done.  Any exception
     is logged and likewise marks the stream as done.
     """
     try:
         self.fetch_ip.set_filename(self.write_filename, True)
         with open(self.read_filename, "r") as input_fp:
             with open(self.write_filename, "w") as output_fp:
                 while True:
                     # Drain whatever is currently readable from the input.
                     while True:
                         buf = input_fp.read(4096)
                         output_fp.write(buf)
                         self.bytes_copied += len(buf)
                         with self.lock:
                             # Finish on failure, or when the fetch is done and
                             # everything it made available has been copied.
                             if self.success is False or (self.bytes_copied == self.bytes_available and self.fetch_done):
                                 self.stream_done = True
                                 self.condvar.notify_all()
                                 ciel.log("FIFO-push for %s complete (success: %s)" % (self.ref, self.success), "EXEC", logging.INFO)
                                 return
                         if len(buf) < 4096:
                             # EOF, for now.
                             break
                     with self.lock:
                         # Wait for roughly one more chunk of input, or for the
                         # fetch to end.  Presumably condvar is bound to
                         # self.lock -- confirm where they are created.
                         self.next_threshold = self.bytes_copied + self.fetch_ip.chunk_size
                         while self.bytes_available < self.next_threshold and not self.fetch_done:
                             self.condvar.wait()
     except Exception as e:
         ciel.log("Push thread for %s died with exception %s" % (self.ref, e), "EXEC", logging.WARNING)
         # Wake any waiters so they do not block on a dead copy thread.
         with self.lock:
             self.stream_done = True
             self.condvar.notify_all()
Esempio n. 3
0
 def _backoff_request(self, url, method, payload=None, num_attempts=1, initial_wait=0, need_result=True, callback=None):
     """Send a GET or POST to ``url``, retrying with a randomly growing delay.

     Returns ``(200, content)`` on success.  A POST made with
     ``need_result`` false and a single attempt is handed to
     ``post_string_noreturn`` and the method returns None immediately.
     Once the attempts are used up (or ``self.stop_event`` is set) it
     raises WorkerShutdownException if the stop event is set, otherwise
     MasterNotRespondingException.
     """
     # The initial_wait argument is not consulted: the first delay is always 5.
     delay = 5
     for _attempt in range(num_attempts):
         if self.stop_event.is_set():
             break
         try:
             try:
                 if method == "GET":
                     content = get_string(url)
                 elif method == "POST":
                     if not (need_result or num_attempts > 1):
                         # Fire-and-forget: the outcome goes to the callback.
                         if callback is None:
                             callback = self.master_post_result_callback
                         post_string_noreturn(url, payload, result_callback=callback)
                         return
                     content = post_string(url, payload)
                 else:
                     raise Exception("Invalid method %s" % method)
                 return 200, content
             except Exception as e:
                 ciel.log("Backoff-request failed with exception %s; re-raising MasterNotResponding" % e, "MASTER_PROXY", logging.ERROR)
                 raise MasterNotRespondingException()
         except:
             # The failure raised just above is logged here and then retried.
             ciel.log.error("Error contacting master", "MSTRPRXY", logging.WARN, True)
         self.stop_event.wait(delay)
         delay = delay + delay * random.uniform(0.5, 1.5)
     ciel.log.error("Given up trying to contact master", "MSTRPRXY", logging.ERROR, True)
     if self.stop_event.is_set():
         raise WorkerShutdownException()
     raise MasterNotRespondingException()
Esempio n. 4
0
    def thread_main(self):
        """Consume queued log entries and dispatch each one to its handler.

        Entries are sequences whose first element selects the action:
        'U' adds a standby URL, 'P' publishes refs, 'W' adds a worker,
        'J' adds a job and 'D' adds data.  The method returns when
        ``THREAD_TERMINATOR`` is dequeued, or when ``is_running`` is found
        to be false right after a dequeue.  Handler failures and
        unrecognised entries are logged and the loop carries on.
        """
        # NOTE(review): if is_running turns false while an entry is being
        # handled, the inner loop exits and this outer loop spins without
        # blocking -- confirm whether that is intended.
        while True:

            while self.is_running:

                try:
                    log_entry = self.queue.get(block=True)
                except Empty:
                    # Nothing was dequeued, so there is nothing to dispatch.
                    continue

                if not self.is_running or log_entry is THREAD_TERMINATOR:
                    return

                try:
                    kind = log_entry[0]
                    if kind == 'U':
                        self.standby_urls.add(log_entry[1])
                    elif kind == 'P':
                        self.do_publish_refs(log_entry[1], log_entry[2])
                    elif kind == 'W':
                        self.do_add_worker(log_entry[1])
                    elif kind == 'J':
                        self.do_add_job(log_entry[1], log_entry[2])
                    elif kind == 'D':
                        self.do_add_data(log_entry[1], log_entry[2])
                    else:
                        # An explicit error: a bare ``raise`` has no active
                        # exception to re-raise at this point.
                        raise ValueError('Unknown log entry type %r' % (kind,))
                except Exception:
                    ciel.log('Error passing log to backup master.',
                             'BACKUP_SENDER', logging.WARN, True)
Esempio n. 5
0
def sync_retrieve_refs(refs, task_record, accept_string=False):
    """Start a fetch for every reference in ``refs`` and wait for all of them.

    One SynchronousTransfer is created per reference and passed to
    ``fetch_ref_async``; when ``accept_string`` is true its
    ``return_string`` method is supplied as the ``string_callback``.

    Returns the list of transfer contexts, in the order of ``refs``.
    Raises MissingInputException, carrying a dict of tombstone references
    keyed by ref id, if any transfer reports ``success`` as false.
    """
    ctxs = []

    for ref in refs:
        sync_transfer = SynchronousTransfer(ref, task_record)
        ciel.log("Synchronous fetch ref %s" % ref.id, "BLOCKSTORE", logging.DEBUG)
        kwargs = {}
        if accept_string:
            kwargs["string_callback"] = sync_transfer.return_string
        fetch_ref_async(
            ref,
            sync_transfer.result,
            sync_transfer.reset,
            sync_transfer.start_filename,
            task_record=task_record,
            **kwargs
        )
        ctxs.append(sync_transfer)

    for ctx in ctxs:
        ctx.wait()

    # Build a real list: filter() may return a lazy iterator, which has no
    # len() and is always truthy.
    failed_transfers = [ctx for ctx in ctxs if not ctx.success]
    if failed_transfers:
        raise MissingInputException(
            dict(
                (ctx.ref.id, SW2_TombstoneReference(ctx.ref.id, ctx.ref.location_hints))
                for ctx in failed_transfers
            )
        )
    return ctxs
Esempio n. 6
0
 def backoff_request(self,
                     url,
                     method,
                     payload=None,
                     need_result=True,
                     callback=None):
     """Make a single GET or POST request to ``url`` without retrying.

     Returns None straight away if ``self.stop_event`` is set.  A POST
     with ``need_result`` false is handed to ``post_string_noreturn``
     (using ``self.master_post_result_callback`` when no callback is
     given) and also returns None.  Otherwise returns ``(200, content)``.
     Any exception is logged and re-raised.
     """
     if self.stop_event.is_set():
         return
     try:
         if method == "GET":
             content = get_string(url)
         elif method == "POST":
             if not need_result:
                 # Fire-and-forget: the outcome is delivered to the callback.
                 if callback is None:
                     callback = self.master_post_result_callback
                 post_string_noreturn(url,
                                      payload,
                                      result_callback=callback)
                 return
             content = post_string(url, payload)
         else:
             raise Exception("Invalid method %s" % method)
         return 200, content
     except:
         ciel.log("Error attempting to contact master, aborting",
                  "MSTRPRXY", logging.WARNING, True)
         raise
Esempio n. 7
0
 def task_runnable(self, task):
     """React to ``task`` becoming runnable by queueing it locally if eligible.

     Nothing is queued unless ``self.execution_features.can_run`` accepts
     the task's handler.  Root tasks (ids in ``self.root_task_ids``) are
     always queued; any other task is queued only when
     ``task.worker_private['hint']`` equals ``'small_task'``.
     """
     def enqueue():
         # Use the queue for the task's scheduling class, else the '*' queue.
         try:
             self.runnable_queues[task.scheduling_class].put(task)
         except KeyError:
             try:
                 self.runnable_queues['*'].put(task)
             except KeyError:
                 ciel.log('Scheduling class %s not supported on this worker (for task %s)' % (task.scheduling_class, task.task_id), 'LTG', logging.ERROR)
                 raise

     ciel.log('Task %s became runnable!' % task.task_id, 'LTG', logging.DEBUG)
     if not self.execution_features.can_run(task.handler):
         return

     if task.task_id in self.root_task_ids:
         ciel.log('Putting task %s in the runnableQ because it is a root' % task.task_id, 'LTG', logging.DEBUG)
         enqueue()
         task.taskset.inc_runnable_count()
         return

     try:
         if task.worker_private['hint'] == 'small_task':
             ciel.log('Putting task %s in the runnableQ because it is small' % task.task_id, 'LTG', logging.DEBUG)
             enqueue()
             # NOTE(review): the root path counts on task.taskset, this one
             # on self.taskset -- confirm which is intended.
             self.taskset.inc_runnable_count()
     except (KeyError, AttributeError):
         # A missing hint is expected.  NOTE(review): this also swallows
         # the KeyError re-raised by enqueue() and any AttributeError from
         # the count update above.
         pass
Esempio n. 8
0
    def get_random_worker_with_capacity_weight(self, scheduling_class):
        """
        Select a worker at random, weighted by each worker's capacity in
        the given scheduling class.  If the class is unknown, the '*'
        class is used instead.

        Returns the chosen worker, or None when the class has a total
        capacity of 0 or when the candidates run out before the selected
        slot is reached (the latter is logged as an error).
        """
        with self._lock:
            try:
                candidates = self.scheduling_class_capacities[scheduling_class]
                total_capacity = self.scheduling_class_total_capacities[
                    scheduling_class]
            except KeyError:
                scheduling_class = '*'
                candidates = self.scheduling_class_capacities['*']
                total_capacity = self.scheduling_class_total_capacities['*']

            # random.randrange(0) raises ValueError, so a class with no
            # capacity simply has no worker to offer.
            if total_capacity == 0:
                return None

            selected_slot = random.randrange(total_capacity)
            curr_slot = 0

            # Walk the cumulative capacity until it passes the chosen slot.
            for worker, capacity in candidates:
                curr_slot += capacity
                if curr_slot > selected_slot:
                    return worker

            ciel.log(
                'Ran out of workers in capacity-weighted selection class=%s selected=%d total=%d'
                % (scheduling_class, selected_slot, total_capacity),
                'WORKER_POOL', logging.ERROR)
            return None
Esempio n. 9
0
 def prepare_task_descriptor_for_execute(cls, task_descriptor, task_record, block_store):
     """Replace the ``task_private`` reference in ``task_descriptor`` with its object.

     The reference is resolved with ``retrieve_object_for_ref`` using
     ``BaseExecutor.TASK_PRIVATE_ENCODING`` and stored back under the same
     key.  Any failure is logged and re-raised.  ``block_store`` is
     accepted but not used here.
     """
     try:
         private_ref = task_descriptor["task_private"]
         private_obj = retrieve_object_for_ref(private_ref, BaseExecutor.TASK_PRIVATE_ENCODING, task_record)
         task_descriptor["task_private"] = private_obj
     except:
         ciel.log('Error retrieving task_private reference from task', 'BASE_EXECUTOR', logging.WARN, True)
         raise
Esempio n. 10
0
 def _advertisment(self, bytes=None, done=None, absent=None, failed=None):
     """Apply a remote advertisement about the stream being fetched.

     Ignored once the fetch is cancelled.  Any of ``done``, ``absent`` or
     ``failed`` being truthy clears ``subscribed_to_remote_adverts``.  An
     absent or failed report marks the remote as failed and, when no data
     fetch is in flight, completes the fetch unsuccessfully.  Otherwise
     the advertised length and done flag are recorded and, when no data
     fetch is in flight, ``check_complete()`` is called.
     """
     if self.cancelled:
         return
     if done or absent or failed:
         self.subscribed_to_remote_adverts = False

     if absent is True or failed is True:
         if absent is True:
             failure_msg = (
                 "Stream-fetch %s: advertisment subscription reported file absent"
                 % self.ref.id)
         else:
             failure_msg = (
                 "Stream-fetch %s: advertisment reported remote production failure"
                 % self.ref.id)
         ciel.log(failure_msg, "CURL_FETCH", logging.WARNING)
         self.remote_failed = True
         if self.current_data_fetch is None:
             self.complete(False)
         return

     ciel.log(
         "Stream-fetch %s: got advertisment: bytes %d done %s" %
         (self.ref.id, bytes, done), "CURL_FETCH", logging.DEBUG)
     if self.latest_advertisment <= bytes:
         self.latest_advertisment = bytes
     else:
         # A shorter length than one already seen is reported, not applied.
         ciel.log(
             "Stream-fetch %s: intriguing anomaly: advert for %d bytes; currently have %d. Probable reordering in the network"
             % (self.ref.id, bytes, self.latest_advertisment),
             "CURL_FETCH", logging.WARNING)
     if self.remote_done and not done:
         ciel.log(
             "Stream-fetch %s: intriguing anomaly: advert said not-done, but we are. Probable reordering in the network"
             % self.ref.id, "CURL_FETCH", logging.WARNING)
     # Once the remote has been seen as done it stays done.
     self.remote_done = self.remote_done or done
     if self.current_data_fetch is None:
         self.check_complete()
Esempio n. 11
0
 def cancel(self):
     """Cancel this stream fetch and finish it via ``complete(False)``.

     Sets ``cancelled`` first, then cancels the in-flight data fetch if
     there is one.
     """
     ciel.log("Stream-fetch %s: cancelling" % self.ref.id, "CURL_FETCH",
              logging.DEBUG)
     self.cancelled = True
     in_flight = self.current_data_fetch
     if in_flight is not None:
         in_flight.cancel()
     self.complete(False)
Esempio n. 12
0
def allinone_main(options, args):
    """Run the script named by ``args[0]`` in one process and print its result.

    ``args[1]``, when present, is the run id echoed in the progress lines
    (default 'allinone').  The block store lives in ``options.blockstore``
    if that is set; otherwise a temporary directory is created and written
    back to ``options.blockstore``.  Failures while running the job are
    logged rather than raised.
    """
    ciel.log = CielLogger()

    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else 'allinone'

    # Create a temporary directory only when no block store was configured,
    # so that no stray directory is left behind and the logged path is the
    # one actually used.
    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv('TEMP', default='/tmp/sw-files-'))
        options.blockstore = base_dir
    ciel.log('Writing block store files to %s' % base_dir, 'ALLINONE', logging.INFO)

    block_store = BlockStore(ciel.engine, 'localhost', 8000, base_dir, True)

    initial_task_descriptor, cont_ref = build_initial_task_descriptor(script_filename, block_store, 'root', 'root_cont', 'root_output')

    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)

    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)

    try:
        print('%s SUBMITTED_JOB %s' % (run_id, now_as_timestamp()))
        result = task_runner.run()
        print('%s GOT_RESULT %s' % (run_id, now_as_timestamp()))
        print(block_store.retrieve_object_for_ref(result, 'json'))
    except Exception as e:
        # Still best-effort (nothing propagates to the caller), but the
        # failure is recorded instead of vanishing silently.
        ciel.log('Job %s failed: %r' % (run_id, e), 'ALLINONE', logging.ERROR)
Esempio n. 13
0
 def open_output(self,
                 index,
                 may_pipe=False,
                 may_stream=False,
                 make_local_sweetheart=False,
                 can_smart_subscribe=False,
                 fd_socket_name=None):
     """Open expected output number ``index`` and say how to write to it.

     Raises Exception for ``may_pipe`` without ``may_stream`` and for an
     index that is already open.  An OngoingOutput is created, registered
     in ``self.ongoing_outputs`` and with the context manager, and, when
     ``may_stream`` is set, its stream ref is prepublished.

     Returns ``({"sending_fd": True}, fd)`` when the output hands back a
     file descriptor, else ``({"sending_fd": False, "filename": name}, None)``.
     """
     if may_pipe and not may_stream:
         raise Exception(
             "Insane parameters: may_stream=False and may_pipe=True may well lead to deadlock"
         )
     if index in self.ongoing_outputs:
         raise Exception("Tried to open output %d which was already open" %
                         index)
     if not sendmsg_enabled:
         ciel.log("Not using FDs directly: module 'sendmsg' not available",
                  "EXEC", logging.DEBUG)
         fd_socket_name = None

     name_for_output = self.expected_outputs[index]
     accepts_fd = fd_socket_name is not None
     output_ctx = OngoingOutput(name_for_output, index, can_smart_subscribe,
                                may_pipe, make_local_sweetheart,
                                accepts_fd, self)
     self.ongoing_outputs[index] = output_ctx
     self.context_manager.add_context(output_ctx)
     if may_stream:
         self.task_record.prepublish_refs([output_ctx.get_stream_ref()])

     target, is_fd = output_ctx.get_filename_or_fd()
     if not is_fd:
         return ({"sending_fd": False, "filename": target}, None)
     return ({"sending_fd": True}, target)
Esempio n. 14
0
 def cancel(self):
     """Cancel this fetch and report failure to the callbacks.

     Sets ``cancelled``, cancels the curl fetch if one exists, closes
     ``self.fp`` and finally calls ``callbacks.result(False)``.
     """
     ciel.log("Fetch %s: cancelling" % self.ref.id, "CURL_FETCH", logging.INFO)
     self.cancelled = True
     active_transfer = self.curl_fetch
     if active_transfer is not None:
         active_transfer.cancel()
     self.fp.close()
     self.callbacks.result(False)
Esempio n. 15
0
 def thread_main(self):
     """Connect to the remote end over TCP and request the referenced data.

     Opens a socket to ``(otherend_hostname, ref.socket_port)``, sends one
     request line and reads one response line.  If the response contains
     "GO", a duplicate of the socket's file descriptor is passed to
     ``fetch_ctx.set_fd``; otherwise the fetch is reported as failed via
     ``fetch_ctx.result(False)``.  Any exception is logged and also
     reported as a failure.
     """
     try:
         with self.lock:
             self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             # While this is set, the except handler below closes the socket.
             self.should_close = True
         ciel.log("Connecting %s:%s" % (self.otherend_hostname, self.ref.socket_port), "TCP_FETCH", logging.DEBUG)
         subscribe_remote_output_nopost(self.ref.id, self)
         self.sock.connect((self.otherend_hostname, self.ref.socket_port))
         # Request line: reference id, our own netloc and the chunk size.
         self.sock.sendall("%s %s %d\n" % (self.ref.id, get_own_netloc(), self.chunk_size))
         ciel.log("%s:%s connected: requesting %s (chunk size %d)" % (self.otherend_hostname, self.ref.socket_port, self.ref.id, self.chunk_size), "TCP_FETCH", logging.DEBUG)
         # Unbuffered, presumably so no payload bytes beyond the response
         # line are consumed by the file object -- confirm.
         fp = self.sock.makefile("r", bufsize=0)
         response = fp.readline().strip()
         fp.close()
         with self.lock:
             # From here on both branches close the socket themselves.
             self.should_close = False
             if response.find("GO") != -1:
                 ciel.log("TCP-fetch %s: transfer started" % self.ref.id, "TCP_FETCH", logging.DEBUG)
                 # Hand over a duplicate descriptor, then close our socket object.
                 new_fd = os.dup(self.sock.fileno())
                 self.sock.close()
                 self.fetch_ctx.set_fd(new_fd, True)
             else:
                 ciel.log("TCP-fetch %s: request failed: other end said '%s'" % (self.ref.id, response), "TCP_FETCH", logging.WARNING)
                 unsubscribe_remote_output_nopost(self.ref.id)
                 self.done = True
                 self.sock.close()
                 self.fetch_ctx.result(False)
     except Exception as e:
         unsubscribe_remote_output_nopost(self.ref.id)
         ciel.log("TCP-fetch %s: failed due to exception %s" % (self.ref.id, repr(e)), "TCP_FETCH", logging.ERROR)
         with self.lock:
             if self.should_close:
                 self.sock.close()
             self.done = True
             self.should_close = False
         self.fetch_ctx.result(False)
Esempio n. 16
0
def receive_stream_advertisment(id, **args):
    """Forward an advertisement to the subscription registered for ``id``.

    The lookup and the call to ``advertisment(**args)`` both happen while
    holding ``module_lock``.  A KeyError (including an unknown ``id``) is
    logged as a warning rather than raised.
    """
    try:
        with module_lock:
            subscription = remote_stat_subscriptions[id]
            subscription.advertisment(**args)
    except KeyError:
        ciel.log("Got advertisment for %s which is not an ongoing stream" % id,
                 "REMOTE_STAT", logging.WARNING)
Esempio n. 17
0
    def create_job_for_task(self, task_descriptor, job_options, job_id=None):
        """
        Turn a task descriptor into a Job and add it to this pool.

        A job id is allocated unless one is supplied.  The descriptor is
        given the task id 'root:<job_id>' and, if it has none, a default
        'expected_outputs' list of ['<job_id>:job_output'].  Returns the
        new Job.  Everything runs while holding self._lock.
        """
        with self._lock:

            if job_id is None:
                job_id = self.allocate_job_id()

            task_descriptor['task_id'] = 'root:%s' % (job_id, )

            # TODO: Here is where we will set up the job journal, etc.
            job_dir = self.make_job_directory(job_id)

            # Supply the default output name only when none was provided.
            if 'expected_outputs' not in task_descriptor:
                task_descriptor['expected_outputs'] = ['%s:job_output' % job_id]

            root_task = build_taskpool_task_from_descriptor(task_descriptor, None)
            new_job = Job(job_id, root_task, job_dir, JOB_CREATED, self, job_options)
            root_task.job = new_job

            self.add_job(new_job)

            ciel.log('Added job: %s' % new_job.id, 'JOB_POOL', logging.INFO)

            return new_job
Esempio n. 18
0
def sync_retrieve_refs(refs, task_record, accept_string=False):
    """Start a fetch for every reference in ``refs`` and wait for all of them.

    One SynchronousTransfer is created per reference and passed to
    ``fetch_ref_async``; when ``accept_string`` is true its
    ``return_string`` method is supplied as the ``string_callback``.

    Returns the list of transfer contexts, in the order of ``refs``.
    Raises MissingInputException, carrying a dict of tombstone references
    keyed by ref id, if any transfer reports ``success`` as false.
    """
    ctxs = []

    for ref in refs:
        sync_transfer = SynchronousTransfer(ref, task_record)
        ciel.log("Synchronous fetch ref %s" % ref.id, "BLOCKSTORE",
                 logging.DEBUG)
        kwargs = {}
        if accept_string:
            kwargs["string_callback"] = sync_transfer.return_string
        fetch_ref_async(ref,
                        sync_transfer.result,
                        sync_transfer.reset,
                        sync_transfer.start_filename,
                        task_record=task_record,
                        **kwargs)
        ctxs.append(sync_transfer)

    for ctx in ctxs:
        ctx.wait()

    # Build a real list: filter() may return a lazy iterator, which has no
    # len() and is always truthy.
    failed_transfers = [ctx for ctx in ctxs if not ctx.success]
    if failed_transfers:
        raise MissingInputException(
            dict((ctx.ref.id,
                  SW2_TombstoneReference(ctx.ref.id, ctx.ref.location_hints))
                 for ctx in failed_transfers))
    return ctxs
Esempio n. 19
0
 def task_finished(self, task, time):
     """Account for one of this job's tasks finishing after ``time``.

     Decrements ``running_tasks`` and folds ``time`` into ``task_cost`` as
     an exponentially weighted moving average with weight ``EWMA_ALPHA``.
     ``task`` itself is not inspected.
     """
     self.running_tasks -= 1
     previous_cost = self.task_cost
     self.task_cost = EWMA_ALPHA * time + (1 - EWMA_ALPHA) * previous_cost
     summary = ('Job %s finished a task (now running %d, task cost now %f)' %
                (self.id, self.running_tasks, self.task_cost))
     ciel.log(summary, 'JOB', logging.DEBUG)
Esempio n. 20
0
 def complete(self, success):
     """Finish the stream fetch once, reporting ``success`` to the callbacks.

     Calls after the first are ignored: ``local_done`` guards the
     unsubscribe, file close and result callback.
     """
     if self.local_done:
         return
     self.local_done = True
     ciel.log("Stream-fetch %s: complete" % self.ref.id, "CURL_FETCH", logging.INFO)
     self.unsubscribe_remote_output()
     self.fp.close()
     self.callbacks.result(success)
Esempio n. 21
0
 def garbage_thread(self):
     """Reap idle cached processes until told to stop.

     Each pass kills, and removes from its executor's ``process_cache``,
     every process record not used for more than 30 seconds.  The thread
     then waits up to 60 seconds on ``gc_thread_stop``; once that event is
     set it kills every remaining cached process (logging any failure) and
     returns.
     """
     while True:
         now = datetime.now()
         with self.lock:
             for executor in self.soft_cache_executors:
                 dead_recs = []
                 for proc_rec in executor.process_cache:
                     idle = now - proc_rec.last_used_time
                     # Use the whole interval, not just its .seconds field:
                     # a record touched after `now` was sampled yields
                     # days=-1, seconds~86399, which would look "idle", and
                     # one idle for over a day would wrap back under 30.
                     idle_seconds = idle.days * 86400 + idle.seconds
                     if idle_seconds > 30:
                         proc_rec.kill()
                         dead_recs.append(proc_rec)
                 # Remove after the scan so the cache is not mutated while
                 # it is being iterated.
                 for dead_rec in dead_recs:
                     executor.process_cache.remove(dead_rec)
         self.gc_thread_stop.wait(60)
         if self.gc_thread_stop.isSet():
             with self.lock:
                 for executor in self.soft_cache_executors:
                     for proc_rec in executor.process_cache:
                         try:
                             proc_rec.kill()
                         except Exception as e:
                             ciel.log(
                                 "Failed to shut a process down (%s)" %
                                 repr(e), "PROCESSPOOL", logging.WARNING)
             ciel.log("Process pool garbage collector: terminating",
                      "PROCESSPOOL", logging.DEBUG)
             return
Esempio n. 22
0
    def create_job_for_task(self, task_descriptor, job_options, job_id=None):
        """
        Turn a task descriptor into a Job and add it to this pool.

        A job id is allocated unless one is supplied.  The descriptor is
        given the task id 'root:<job_id>' and, if it has none, a default
        'expected_outputs' list of ['<job_id>:job_output'].  Returns the
        new Job.  Everything runs while holding self._lock.
        """
        with self._lock:

            if job_id is None:
                job_id = self.allocate_job_id()

            task_descriptor['task_id'] = 'root:%s' % (job_id, )

            # TODO: Here is where we will set up the job journal, etc.
            job_dir = self.make_job_directory(job_id)

            # Supply the default output name only when none was provided.
            if 'expected_outputs' not in task_descriptor:
                task_descriptor['expected_outputs'] = ['%s:job_output' % job_id]

            root_task = build_taskpool_task_from_descriptor(task_descriptor, None)
            new_job = Job(job_id, root_task, job_dir, JOB_CREATED, self, job_options)
            root_task.job = new_job

            self.add_job(new_job)

            ciel.log('Added job: %s' % new_job.id, 'JOB_POOL', logging.INFO)

            return new_job
Esempio n. 23
0
    def run(self):
        """Run the job to completion with a pool of worker processes.

        Publishes the initial continuation ref and spawns the initial
        task, creates and starts ``num_workers`` worker processes plus a
        response-handler thread, then blocks on ``job_output.join()``.
        Afterwards every worker is sent THREAD_TERMINATOR, the response
        handler is told to stop, and all are joined.  Returns the value
        produced by ``job_output.join()``.
        """
        self.task_graph.publish(self.initial_cont_ref, None)
        self.task_graph.spawn(self.initial_task, None)

        self.is_running = True
        self.workers = []

        ciel.log('Starting %d worker threads.' % self.num_workers, 'TASKRUNNER', logging.INFO)
        for _ in range(self.num_workers):
            try:
                worker_args = (self.options.blockstore, self.task_queue, self.response_queue)
                new_worker = multiprocessing.Process(target=worker_process_main, args=worker_args)
                self.workers.append(new_worker)
            except:
                # A worker that cannot be created is reported and skipped.
                print(sys.exc_info())

        response_handler_thread = threading.Thread(target=self.response_handler_thread_main)
        response_handler_thread.start()

        ciel.log('Starting %d worker threads.' % self.num_workers, 'TASKRUNNER', logging.INFO)
        for worker in self.workers:
            worker.start()

        result = self.job_output.join()

        # Shut down: one terminator per worker, then stop the handler.
        self.is_running = False
        for _ in self.workers:
            self.task_queue.put(THREAD_TERMINATOR)
        self.response_queue.put((ACTION_STOP, None))
        response_handler_thread.join()
        for worker in self.workers:
            worker.join()

        return result
Esempio n. 24
0
 def notify_worker_failed(self, worker):
     """Take back all work this job had given to a failed worker.

     The worker's state is removed from ``self.workers``.  Each task
     assigned to it is detached from the worker and passed to
     ``investigate_task_failure`` with a 'WORKER_FAILED' reason; each task
     still queued for it is put back on ``self.runnable_queue``.  Finally
     ``schedule()`` is called.  A KeyError raised anywhere in the process
     (including an unknown worker) is logged and swallowed.
     """
     with self._lock:
         try:
             worker_state = self.workers.pop(worker)
             ciel.log(
                 'Reassigning tasks from failed worker %s for job %s' %
                 (worker.id, self.id), 'JOB', logging.WARNING)

             for task_group in worker_state.assigned_tasks.values():
                 for lost_task in task_group:
                     lost_task.remove_worker(worker)
                     self.investigate_task_failure(
                         lost_task, ('WORKER_FAILED', None, {}))

             for scheduling_class in worker_state.queues:
                 # Drain this class's queue; None signals that it is empty.
                 pending = worker_state.pop_task_from_queue(scheduling_class)
                 while pending is not None:
                     self.runnable_queue.put(pending)
                     pending = worker_state.pop_task_from_queue(
                         scheduling_class)

             self.schedule()
         except KeyError:
             ciel.log('Weird keyerror coming out of notify_worker_failed',
                      'JOB', logging.WARNING, True)
Esempio n. 25
0
 def rollback(self):
     """Abandon this output: mark it closed and failed, and notify subscribers.

     Does nothing if the output is already closed.  Otherwise the producer
     is removed from ``streaming_producers``, the FIFO (if any) is opened
     and closed once and then deleted, the file watch is cancelled, the
     cat process is killed, and every subscriber receives
     ``result(False)``.  The FIFO and process clean-up steps are
     best-effort: their failures are ignored.
     """
     if self.closed:
         return
     # Log this output's reference id; a bare ``id`` here would format the
     # builtin function instead.
     ciel.log("Rollback output %s" % self.refid, 'BLOCKSTORE', logging.WARNING)
     del streaming_producers[self.refid]
     with self.lock:
         self.closed = True
         self.succeeded = False
     if self.fifo_name is not None:
         try:
             # Dismiss anyone waiting on this pipe
             fd = os.open(self.fifo_name, os.O_NONBLOCK | os.O_WRONLY)
             os.close(fd)
         except Exception:
             pass
         try:
             os.remove(self.fifo_name)
         except Exception:
             pass
     if self.file_watch is not None:
         self.file_watch.cancel()
     if self.cat_proc is not None:
         try:
             self.cat_proc.kill()
         except Exception:
             pass
     for subscriber in self.subscriptions:
         subscriber.result(False)
Esempio n. 26
0
def allinone_main(options, args):
    """Run the script named by ``args[0]`` in one process and print its result.

    ``args[1]``, when present, is the run id echoed in the progress lines
    (default "allinone").  The block store lives in ``options.blockstore``
    if that is set; otherwise a temporary directory is created and written
    back to ``options.blockstore``.  Failures while running the job are
    logged rather than raised.
    """
    ciel.log = CielLogger()

    script_filename = args[0]
    run_id = args[1] if len(args) > 1 else "allinone"

    # Create a temporary directory only when no block store was configured,
    # so that no stray directory is left behind and the logged path is the
    # one actually used.
    if options.blockstore is not None:
        base_dir = options.blockstore
    else:
        base_dir = tempfile.mkdtemp(prefix=os.getenv("TEMP", default="/tmp/sw-files-"))
        options.blockstore = base_dir
    ciel.log("Writing block store files to %s" % base_dir, "ALLINONE", logging.INFO)

    block_store = BlockStore(ciel.engine, "localhost", 8000, base_dir, True)

    initial_task_descriptor, cont_ref = build_initial_task_descriptor(
        script_filename, block_store, "root", "root_cont", "root_output"
    )

    initial_task_object = build_taskpool_task_from_descriptor(initial_task_descriptor, None)

    task_runner = TaskRunner(initial_task_object, cont_ref, block_store, options)

    try:
        print("%s SUBMITTED_JOB %s" % (run_id, now_as_timestamp()))
        result = task_runner.run()
        print("%s GOT_RESULT %s" % (run_id, now_as_timestamp()))
        print(block_store.retrieve_object_for_ref(result, "json", None))
    except Exception as e:
        # Still best-effort (nothing propagates to the caller), but the
        # failure is recorded instead of vanishing silently.
        ciel.log("Job %s failed: %r" % (run_id, e), "ALLINONE", logging.ERROR)
Esempio n. 27
0
    def get_random_worker_with_capacity_weight(self, scheduling_class):
        """
        Pick a worker at random, weighting each candidate by its capacity
        in the given scheduling class; current load is not consulted.
        Falls back to the '*' class when the requested class is unknown,
        and returns None when the class has a total capacity of 0.
        """
        with self._lock:
            try:
                pool = (self.scheduling_class_capacities[scheduling_class],
                        self.scheduling_class_total_capacities[scheduling_class])
            except KeyError:
                scheduling_class = '*'
                pool = (self.scheduling_class_capacities['*'],
                        self.scheduling_class_total_capacities['*'])
            candidates, total_capacity = pool

            if total_capacity == 0:
                return None

            selected_slot = random.randrange(total_capacity)

            # Walk the cumulative capacity until it passes the chosen slot.
            slots_seen = 0
            for worker, capacity in candidates:
                slots_seen += capacity
                if selected_slot < slots_seen:
                    return worker

            # XXX sos22 this is actually really quite bad; I think it
            # wants to be an abort().
            ciel.log('Ran out of workers in capacity-weighted selection class=%s selected=%d total=%d' % (scheduling_class, selected_slot, total_capacity), 'WORKER_POOL', logging.ERROR)
Esempio n. 28
0
    def notify_completed(self):
        """Called by LocalJobOutput.notify_ref_table_updated() when the taskset is complete."""
        ciel.log.error('Taskset complete', 'TASKEXEC', logging.DEBUG)

        # Release this task set, which may allow the JobManager to delete the job.
        self.job_manager.taskset_completed(self)

        if self.aborted:
            return

        # Send a task report back to the master: one
        # (task_id, success, details) entry per task record.
        report_data = []
        for tr in self.task_records:
            task_id = tr.task_descriptor['task_id']
            if tr.success:
                details = (tr.spawned_tasks, tr.published_refs,
                           tr.get_profiling())
            else:
                ciel.log(
                    'Appending failure to report for task %s' % task_id,
                    'TASKEXEC', logging.DEBUG)
                details = (tr.failure_reason, tr.failure_details,
                           tr.failure_bindings)
            report_data.append((task_id, tr.success, details))

        ciel.stopwatch.stop("worker_task")
        self.master_proxy.report_tasks(self.job.id,
                                       self.initial_td['task_id'],
                                       report_data)
Esempio n. 29
0
    def thread_main(self):
        """Consume queued log entries and dispatch each one to its handler.

        Entries are sequences whose first element selects the action:
        'U' adds a standby URL, 'P' publishes refs, 'W' adds a worker,
        'J' adds a job and 'D' adds data.  The method returns when
        ``THREAD_TERMINATOR`` is dequeued, or when ``is_running`` is found
        to be false right after a dequeue.  Handler failures and
        unrecognised entries are logged and the loop carries on.
        """
        # NOTE(review): if is_running turns false while an entry is being
        # handled, the inner loop exits and this outer loop spins without
        # blocking -- confirm whether that is intended.
        while True:

            while self.is_running:

                try:
                    log_entry = self.queue.get(block=True)
                except Empty:
                    # Nothing was dequeued, so there is nothing to dispatch.
                    continue

                if not self.is_running or log_entry is THREAD_TERMINATOR:
                    return

                try:
                    kind = log_entry[0]
                    if kind == 'U':
                        self.standby_urls.add(log_entry[1])
                    elif kind == 'P':
                        self.do_publish_refs(log_entry[1], log_entry[2])
                    elif kind == 'W':
                        self.do_add_worker(log_entry[1])
                    elif kind == 'J':
                        self.do_add_job(log_entry[1], log_entry[2])
                    elif kind == 'D':
                        self.do_add_data(log_entry[1], log_entry[2])
                    else:
                        # An explicit error: a bare ``raise`` has no active
                        # exception to re-raise at this point.
                        raise ValueError('Unknown log entry type %r' % (kind,))
                except Exception:
                    ciel.log('Error passing log to backup master.', 'BACKUP_SENDER', logging.WARN, True)
Esempio n. 30
0
 def create_job_for_task(self, task_descriptor, job_options, job_id=None):
     """
     Turn a task descriptor into a Job and add it to this pool.

     A job id is allocated unless one is supplied.  The descriptor is
     given the task id 'root:<job_id>' and, if it has none, a default
     'expected_outputs' list of ['<job_id>:job_output'].  Returns the
     new Job.  Everything runs while holding self._lock.
     """
     with self._lock:

         if job_id is None:
             job_id = self.allocate_job_id()

         task_descriptor['task_id'] = 'root:%s' % (job_id, )

         # TODO: Here is where we will set up the job journal, etc.
         job_dir = self.make_job_directory(job_id)

         # Supply the default output name only when none was provided.
         if 'expected_outputs' not in task_descriptor:
             task_descriptor['expected_outputs'] = ['%s:job_output' % job_id]

         root_task = build_taskpool_task_from_descriptor(task_descriptor, None)
         new_job = Job(job_id, root_task, job_dir, JOB_CREATED, self, job_options)
         root_task.job = new_job

         self.add_job(new_job)

         ciel.log('Added job: %s' % new_job.id, 'JOB_POOL', logging.INFO)

         return new_job
Esempio n. 31
0
def unsubscribe_output(otherend_netloc, id):
    """Cancel the remote stream subscription registered for (id, otherend_netloc).

    Runs under module_lock.  A KeyError raised while looking up or cancelling
    the subscriber is logged as a warning and otherwise ignored.
    """
    subscriber_key = (id, otherend_netloc)
    with module_lock:
        try:
            subscriber = remote_stream_subscribers[subscriber_key]
            subscriber.cancel()
            message = "%s unsubscribed from %s" % (otherend_netloc, id)
            ciel.log(message, "BLOCKSTORE", logging.DEBUG)
        except KeyError:
            message = "Ignored unsubscribe request for unknown block %s" % id
            ciel.log(message, "BLOCKSTORE", logging.WARNING)
Esempio n. 32
0
 def rollback(self):
     """Abandon this output: mark it closed and failed, then release resources.

     Does nothing when the output is already closed.  Otherwise it removes
     the entry for self.refid from streaming_producers, sets closed/succeeded
     under self.lock, opens-and-closes then deletes the FIFO (if any),
     cancels the file watch (if any), kills the cat process (if any) and
     finally calls result(False) on every subscriber.

     The FIFO and process teardown steps are best-effort: their failures are
     deliberately ignored so the remaining steps still run.
     """
     if self.closed:
         return

     # The message must name this output's ref id; the bare name `id` here
     # would interpolate the builtin function instead.
     ciel.log("Rollback output %s" % self.refid, 'BLOCKSTORE', logging.WARNING)
     del streaming_producers[self.refid]
     with self.lock:
         self.closed = True
         self.succeeded = False

     if self.fifo_name is not None:
         try:
             # Dismiss anyone waiting on this pipe
             fd = os.open(self.fifo_name, os.O_NONBLOCK | os.O_WRONLY)
             os.close(fd)
         except Exception:
             # Best effort only (e.g. nobody has the pipe open for reading).
             pass
         try:
             os.remove(self.fifo_name)
         except Exception:
             # Best effort only: the FIFO may already be gone.
             pass

     if self.file_watch is not None:
         self.file_watch.cancel()

     if self.cat_proc is not None:
         try:
             self.cat_proc.kill()
         except Exception:
             # Best effort only: the process may already have exited.
             pass

     for subscriber in self.subscriptions:
         subscriber.result(False)
Esempio n. 33
0
    def notify_completed(self):
        """Report the finished taskset to the master (unless aborted) and release it.

        Called by LocalJobOutput.notify_ref_table_updated() when the taskset
        is complete.  Each entry of the report is
        (task_id, success, details), where details is
        (spawned_tasks, published_refs, profiling) for a successful task
        record and (failure_reason, failure_details, failure_bindings)
        otherwise.
        """
        ciel.log.error("Taskset complete", "TASKEXEC", logging.INFO)

        if not self.aborted:
            # Send a task report back to the master.
            report_data = []
            for record in self.task_records:
                if record.success:
                    details = (record.spawned_tasks, record.published_refs, record.get_profiling())
                else:
                    ciel.log(
                        "Appending failure to report for task %s" % record.task_descriptor["task_id"],
                        "TASKEXEC",
                        logging.INFO,
                    )
                    details = (record.failure_reason, record.failure_details, record.failure_bindings)
                entry = (record.task_descriptor["task_id"], record.success, details)
                report_data.append(entry)
            self.master_proxy.report_tasks(self.job.id, self.initial_td["task_id"], report_data)

        # Release this task set, which may allow the JobManager to delete the job.
        self.job_manager.taskset_completed(self)
Esempio n. 34
0
def subscribe_result(refid, success, url):
    """Forward a subscribe outcome to the remote-stat subscription for refid.

    The lookup and the forwarded call both happen under module_lock.  A
    KeyError is logged as a warning and otherwise ignored.
    """
    try:
        with module_lock:
            subscription = remote_stat_subscriptions[refid]
            subscription.subscribe_result(success, url)
    except KeyError:
        message = "Subscribe-result for %s ignored as no longer subscribed" % url
        ciel.log(message, "REMOTE_STAT", logging.WARNING)
Esempio n. 35
0
 def __exit__(self, exnt, exnv, exntb):
     """Close on a clean exit, roll back on an exception.

     Nothing is done if the context is already closed.  Always returns
     False, so an in-flight exception is never suppressed.
     """
     if self.closed:
         return False
     if exnt is None:
         self.close()
         return False
     message = "FileOutputContext %s destroyed due to exception %s: rolling back" % (self.refid, repr(exnv))
     ciel.log(message, "BLOCKSTORE", logging.WARNING)
     self.rollback()
     return False
Esempio n. 36
0
 def __exit__(self, exnt, exnv, exnbt):
     """Log how the block exited, then exit every active child context.

     Each context in self.active_contexts receives the same exception
     triple.  Always returns False, so exceptions are never suppressed.
     """
     if exnt is None:
         message = "Context manager for %s exiting cleanly" % self.description
         severity = logging.DEBUG
     else:
         message = "Context manager for %s exiting with exception %s" % (self.description, repr(exnv))
         severity = logging.WARNING
     ciel.log(message, "EXEC", severity)

     for child_ctx in self.active_contexts:
         child_ctx.__exit__(exnt, exnv, exnbt)
     return False
Esempio n. 37
0
    def __init__(self, bus, name, queue_manager, num_cores=48):
        """Store the pool's collaborators and start in the not-running state.

        num_cores defaults to 48.  A log line is emitted once the attributes
        have been set.
        """
        self.is_running = False
        self.num_cores = num_cores
        self.queue_manager = queue_manager
        self.name = name
        self.bus = bus

        ciel.log("SCCCorePool initializing", "SCC", logging.INFO)
Esempio n. 38
0
 def complete(self, success):
     """Finish the stream fetch once; later calls are no-ops.

     On the first call: sets local_done, logs, unsubscribes from the remote
     output, closes self.fp and reports `success` to self.callbacks.
     """
     if self.local_done:
         return
     self.local_done = True
     message = "Stream-fetch %s: complete" % self.ref.id
     ciel.log(message, "CURL_FETCH", logging.DEBUG)
     self.unsubscribe_remote_output()
     self.fp.close()
     self.callbacks.result(success)
Esempio n. 39
0
 def cleanup(self):
     """Close both process FIFO descriptors, logging and ignoring failures.

     Each descriptor is closed in its own try block so that a failure on
     one does not leave the other open.  Descriptors that are None are
     skipped.
     """
     for fifo_fd in (self.from_process_fifo, self.to_process_fifo):
         if fifo_fd is None:
             continue
         try:
             os.close(fifo_fd)
         except Exception:
             # Cleanup is best-effort: report the problem and carry on.
             ciel.log('Error cleaning up process %s, ignoring' % self.id, 'PROCESS', logging.WARN)
Esempio n. 40
0
 def task_finished(self, task, time):
     """Account for one finished task and fold its duration into task_cost.

     Decrements running_tasks and updates task_cost as an exponentially
     weighted moving average with weight EWMA_ALPHA on the new sample
     `time`.  The `task` argument is not used here.
     """
     self.running_tasks -= 1
     sample_term = EWMA_ALPHA * time
     history_term = (1 - EWMA_ALPHA) * self.task_cost
     self.task_cost = sample_term + history_term
     message = "Job %s finished a task (now running %d, task cost now %f)" % (
         self.id, self.running_tasks, self.task_cost)
     ciel.log(message, "JOB", logging.INFO)
Esempio n. 41
0
File: proc.py Progetto: jepst/ciel
 def publish_fetched_ref(self, fetch):
     """Publish the completed reference of a fetch on the task record.

     If the fetch has no completed reference, only a debug line is logged.
     When fetch.make_sweetheart is set, the reference is first converted
     with SW2_SweetheartReference.from_concrete() using this node's netloc.
     """
     completed_ref = fetch.get_completed_ref()
     if completed_ref is None:
         message = "Cancelling async fetch %s (chunk %d)" % (fetch.ref.id, fetch.chunk_size)
         ciel.log(message, "EXEC", logging.DEBUG)
         return
     if fetch.make_sweetheart:
         completed_ref = SW2_SweetheartReference.from_concrete(completed_ref, get_own_netloc())
     self.task_record.publish_ref(completed_ref)
Esempio n. 42
0
 def consider_next_fetch(self):
     """Start the next chunk fetch if worthwhile, otherwise pause.

     A fetch is started when the remote side is done, or when the bytes
     advertised beyond what earlier fetches downloaded exceed the current
     chunk size.  Otherwise the pause is logged and current_data_fetch is
     cleared.
     """
     # `or` short-circuits, so the byte arithmetic is skipped once remote_done.
     should_fetch = self.remote_done or (
         self.latest_advertisment - self.previous_fetches_bytes_downloaded
         > self.current_chunk_size)
     if should_fetch:
         self.start_next_fetch()
         return
     message = "Stream-fetch %s: paused (remote has %d, I have %d)" % (
         self.ref.id, self.latest_advertisment, self.previous_fetches_bytes_downloaded)
     ciel.log(message, "CURL_FETCH", logging.INFO)
     self.current_data_fetch = None
Esempio n. 43
0
 def _start(self):
     """Attach this client to the HTTP transfer for self.ref, starting one if needed.

     After the call self.fetch is the entry in active_http_transfers for
     the ref id, and the fetch client has been given that transfer's
     block-store filename.
     """
     ref_id = self.ref.id
     if ref_id not in active_http_transfers:
         self.start_http_fetch()
     else:
         ciel.log("Joining existing fetch for ref %s" % self.ref, "BLOCKSTORE", logging.INFO)
     # Looked up twice on purpose: add_listener runs before self.fetch is bound.
     active_http_transfers[self.ref.id].add_listener(self.fetch_client)
     self.fetch = active_http_transfers[self.ref.id]
     self.fetch_client.set_filename(self.fetch.bs_ctx.filename, False)
Esempio n. 44
0
 def subscribe_result(self, success, _):
     """Handle the outcome of subscribing to remote adverts.

     Success needs no action.  On failure the stream is marked as
     unsubscribed and remote-failed, and is completed with False right away
     when no data fetch is currently in progress.
     """
     if success:
         return
     message = ("Stream-fetch %s: failed to subscribe to remote adverts. Abandoning stream."
                % self.ref.id)
     ciel.log(message, "CURL_FETCH", logging.INFO)
     self.subscribed_to_remote_adverts = False
     self.remote_failed = True
     if self.current_data_fetch is None:
         self.complete(False)
Esempio n. 45
0
 def __init__(self, port):
     """Open a non-blocking TCP listening socket on 0.0.0.0:<port>.

     The socket is created with SO_REUSEADDR and a listen backlog of 5, and
     is stored as self.aux_listen_socket; the port as self.aux_port.
     """
     self.aux_port = port
     ciel.log("Listening for auxiliary connections on port %d" % port, "TCP_FETCH", logging.DEBUG)
     listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     # Bind the attribute before configuring, so it exists even if a later step raises.
     self.aux_listen_socket = listener
     listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
     listener.bind(("0.0.0.0", port))
     listener.listen(5)
     listener.setblocking(False)
Esempio n. 46
0
 def cancel(self):
     """Cancel the fetch and report failure to the callbacks.

     Sets self.cancelled, cancels the underlying curl fetch when there is
     one, closes self.fp and calls self.callbacks.result(False).
     """
     message = "Fetch %s: cancelling" % self.ref.id
     ciel.log(message, "CURL_FETCH", logging.DEBUG)
     self.cancelled = True
     active_fetch = self.curl_fetch
     if active_fetch is not None:
         active_fetch.cancel()
     self.fp.close()
     self.callbacks.result(False)
Esempio n. 47
0
 def cleanup(self):
     """Close both process FIFOs and remove the FIFO directory, best-effort.

     Every step runs in its own try block so that one failure does not skip
     the remaining steps (in particular, the directory is still removed
     when closing a FIFO fails).  Failures are logged and ignored.  FIFOs
     that are None are skipped.
     """
     for fifo in (self.from_process_fifo, self.to_process_fifo):
         if fifo is None:
             continue
         try:
             fifo.close()
         except Exception:
             # Trailing True is passed through to ciel.log unchanged
             # (presumably a "log traceback" flag -- confirm).
             ciel.log('Error cleaning up process %s, ignoring' % self.id, 'PROCESS', logging.WARN, True)
     try:
         shutil.rmtree(self.fifos_dir)
     except Exception:
         ciel.log('Error cleaning up process %s, ignoring' % self.id, 'PROCESS', logging.WARN, True)
Esempio n. 48
0
 def soft_cache_process(self, proc_rec, exec_cls, soft_cache_keys):
     """Put a process record into exec_cls.process_cache and mark it free.

     Under self.lock: adds proc_rec to the cache, stamps last_used_time
     with the current time, and resets soft_cache_refs to the union of the
     ref-id collections found in soft_cache_keys (pairs of (refids, tag);
     the tag is not used here).
     """
     with self.lock:
         ciel.log("Caching process %s" % proc_rec.id, "PROCESSPOOL", logging.DEBUG)
         exec_cls.process_cache.add(proc_rec)
         proc_rec.is_free = True
         proc_rec.last_used_time = datetime.now()
         proc_rec.soft_cache_refs = set()
         for key_refids, _tag in soft_cache_keys:
             proc_rec.soft_cache_refs.update(key_refids)
Esempio n. 49
0
 def set_state(self, state):
     """Record a state-change event, store the new state and log it.

     The log line carries the job id, the state name and the time of the
     last history entry as seconds (with microsecond fraction) computed
     via time.mktime().
     """
     self.record_event(JOB_STATE_NAMES[state])
     self.state = state
     # Timestamp of the most recent history entry.
     evt_time = self.history[-1][0]
     evt_seconds = time.mktime(evt_time.timetuple()) + evt_time.microsecond / 1e6
     message = '%s %s @ %f' % (self.id, JOB_STATE_NAMES[self.state], evt_seconds)
     ciel.log(message, 'JOB', logging.INFO)
Esempio n. 50
0
 def task_runnable(self, task):
     """Queue a runnable task for scheduling if its job is active.

     When the job is not in JOB_ACTIVE the task is left untouched and a
     warning naming the job's current state is logged instead.
     """
     if self.job.state != JOB_ACTIVE:
         message = ('Task %s became runnable while job %s not active (%s): ignoring'
                    % (task.task_id, self.job.id, JOB_STATE_NAMES[self.job.state]))
         ciel.log(message, 'JOBTASKGRAPH', logging.WARN)
         return
     task.set_state(TASK_QUEUED)
     self.scheduler_queue.put(task)
Esempio n. 51
0
 def cleanup(self):
     """Close both process FIFO descriptors, logging and ignoring failures.

     Each descriptor is closed in its own try block so that a failure on
     one does not leave the other open.  Descriptors that are None are
     skipped.
     """
     for fifo_fd in (self.from_process_fifo, self.to_process_fifo):
         if fifo_fd is None:
             continue
         try:
             os.close(fifo_fd)
         except Exception:
             # Cleanup is best-effort: report the problem and carry on.
             ciel.log('Error cleaning up process %s, ignoring' % self.id,
                      'PROCESS', logging.WARN)
Esempio n. 52
0
 def subscribe_result(self, success, _):
     """Handle the outcome of subscribing to remote adverts.

     Success needs no action.  On failure the stream is marked as
     unsubscribed and remote-failed, and is completed with False right away
     when no data fetch is currently in progress.
     """
     if success:
         return
     message = ("Stream-fetch %s: failed to subscribe to remote adverts. Abandoning stream."
                % self.ref.id)
     ciel.log(message, "CURL_FETCH", logging.DEBUG)
     self.subscribed_to_remote_adverts = False
     self.remote_failed = True
     if self.current_data_fetch is None:
         self.complete(False)
Esempio n. 53
0
 def _start(self):
     """Attach this client to the HTTP transfer for self.ref, starting one if needed.

     After the call self.fetch is the entry in active_http_transfers for
     the ref id, and the fetch client has been given that transfer's
     block-store filename.
     """
     ref_id = self.ref.id
     if ref_id not in active_http_transfers:
         self.start_http_fetch()
     else:
         ciel.log("Joining existing fetch for ref %s" % self.ref,
                  "BLOCKSTORE", logging.DEBUG)
     # Looked up twice on purpose: add_listener runs before self.fetch is bound.
     active_http_transfers[self.ref.id].add_listener(self.fetch_client)
     self.fetch = active_http_transfers[self.ref.id]
     self.fetch_client.set_filename(self.fetch.bs_ctx.filename, False)
Esempio n. 54
0
def unsubscribe_output(otherend_netloc, id):
    """Cancel the remote stream subscription registered for (id, otherend_netloc).

    Runs under module_lock.  A KeyError raised while looking up or cancelling
    the subscriber is logged as a warning and otherwise ignored.
    """
    subscriber_key = (id, otherend_netloc)
    with module_lock:
        try:
            subscriber = remote_stream_subscribers[subscriber_key]
            subscriber.cancel()
            message = "%s unsubscribed from %s" % (otherend_netloc, id)
            ciel.log(message, "BLOCKSTORE", logging.DEBUG)
        except KeyError:
            message = "Ignored unsubscribe request for unknown block %s" % id
            ciel.log(message, "BLOCKSTORE", logging.WARNING)
Esempio n. 55
0
 def soft_cache_process(self, proc_rec, exec_cls, soft_cache_keys):
     """Put a process record into exec_cls.process_cache and mark it free.

     Under self.lock: adds proc_rec to the cache, stamps last_used_time
     with the current time, and resets soft_cache_refs to the union of the
     ref-id collections found in soft_cache_keys (pairs of (refids, tag);
     the tag is not used here).
     """
     with self.lock:
         message = "Caching process %s" % proc_rec.id
         ciel.log(message, "PROCESSPOOL", logging.DEBUG)
         exec_cls.process_cache.add(proc_rec)
         proc_rec.is_free = True
         proc_rec.last_used_time = datetime.now()
         proc_rec.soft_cache_refs = set()
         for key_refids, _tag in soft_cache_keys:
             proc_rec.soft_cache_refs.update(key_refids)
Esempio n. 56
0
 def __exit__(self, exnt, exnv, exntb):
     """Close on a clean exit, roll back on an exception.

     Nothing is done if the context is already closed.  Always returns
     False, so an in-flight exception is never suppressed.
     """
     if self.closed:
         return False
     if exnt is None:
         self.close()
         return False
     message = ("FileOutputContext %s destroyed due to exception %s: rolling back"
                % (self.refid, repr(exnv)))
     ciel.log(message, "BLOCKSTORE", logging.WARNING)
     self.rollback()
     return False
Esempio n. 57
0
 def cleanup(self):
     """Close both process FIFOs and remove the FIFO directory, best-effort.

     Every step runs in its own try block so that one failure does not skip
     the remaining steps (in particular, the directory is still removed
     when closing a FIFO fails).  Failures are logged and ignored.  FIFOs
     that are None are skipped.
     """
     for fifo in (self.from_process_fifo, self.to_process_fifo):
         if fifo is None:
             continue
         try:
             fifo.close()
         except Exception:
             # Trailing True is passed through to ciel.log unchanged
             # (presumably a "log traceback" flag -- confirm).
             ciel.log('Error cleaning up process %s, ignoring' % self.id,
                      'PROCESS', logging.WARN, True)
     try:
         shutil.rmtree(self.fifos_dir)
     except Exception:
         ciel.log('Error cleaning up process %s, ignoring' % self.id,
                  'PROCESS', logging.WARN, True)
Esempio n. 58
0
 def _check_completion(self):
     """Return True only when self.success is exactly True.

     An exact False logs a warning about the failed fetch; an exact True
     logs that the file is used directly.  Any other value (e.g. None)
     returns False without logging.
     """
     if self.success is False:
         message = "Fetch for %s failed" % self.ref
         ciel.log(message, "EXEC", logging.WARNING)
         return False
     if self.success is True:
         message = "Fetch for %s completed; using file directly" % self.ref
         ciel.log(message, "EXEC", logging.DEBUG)
         return True
     return False