def _finished(job_id, f):
    # Callback fired when the sky-share future for job_id completes.
    # NOTE(review): this chunk duplicates the closure defined inside
    # _share_loop; it references self / in_progress / schedule_retry from an
    # enclosing scope — confirm which copy is the live one.
    T('enter_sky_share_finished %d' % job_id)
    job = self.running[job_id]
    in_progress.remove(job)
    try:
        with Timing('sky_share_future_get %d' % job_id):
            torrent_id = f.get()
    #except ???Error as e: # TODO
        #pass
    except Exception as e:
        # Fix: log message typo ("faled" -> "failed").
        logging.warning('sky share for %s failed: %s' % (job, e))
        # TODO better checks or collect STDERR
        # If any input file vanished, the share can never succeed: fail the
        # promise with ENOENT instead of retrying forever.
        for fname in job.files:
            if not os.path.exists(job.directory + '/' + fname):
                # Fix: report the specific missing file, not the whole list.
                msg = 'Failed to share %s/%s: %s' % (job.directory, fname, e)
                self._set_promise_error(job, OSError(errno.ENOENT, msg))
                return
        schedule_retry(job)
        return
    logging.debug('sky share successfully done for %s: %s' % (job, torrent_id))
    with self.lock:
        job.torrent_id = torrent_id
        self.upload_queue.append(job)
        self.upload_queue_not_empty.notify()
def _sandbox_wait2_loop(self):
    # Worker loop: for jobs whose Sandbox upload task has finished, fetch the
    # resource id that Sandbox assigned and resolve the job's promise with it.
    while not self.should_stop:
        T('begin_wait2_loop')
        with self.lock:
            # Sleep until there is a job queued or we are asked to stop.
            while not(self.should_stop or self.wait2_queue):
                with Timing('wait2_queue_wait'):
                    self.wait2_queue_not_empty.wait()
            if self.should_stop: # TODO
                return
            job = self.wait2_queue.popleft()
        # The Sandbox RPC is made outside the lock.
        try:
            with Timing('sbx_wait2_fetch_task_resource_id'):
                resource_id = self._try_fetch_upload_task_resource_id(job)
        except Exception as e:
            # Treated as non-retriable: fail the job's promise.
            logging.warning("Failed to get resource %s from task %d: %s" \
                % (job.resource_type, job.upload_task_id, e))
            self._set_promise_error(job)
            del job
            continue
        if not resource_id:
            # Resource not visible yet — re-poll this task after a delay.
            self._schedule_retry(job, self.Action.FETCH_RESOURCE_ID, delay=3.0)
        else:
            logging.debug('Done with %s, resource_id = %d' % (job, resource_id))
            self._set_promise_value(job, resource_id)
            del job
def _upload_loop(self):
    # Worker loop: for each shared job, create a Sandbox upload task and hand
    # the job over to the wait1 loop, which polls the task's status.
    while not self.should_stop:
        T('begin_upload_loop')
        with self.lock:
            # Sleep until there is a job queued or we are asked to stop.
            while not(self.should_stop or self.upload_queue):
                self.upload_queue_not_empty.wait()
            if self.should_stop: # TODO
                return
            job = self.upload_queue.popleft()
        #logging.debug('Uploading to Sandbox %s' % job)
        try:
            with Timing('sbx_create_upload_task for %d' % job.id):
                task = self._try_create_upload_task(job)
        except Exception:
            # Consistency fix: the sibling loops log their failures; this one
            # previously failed the promise with no trace in the log.
            logging.exception('Failed to create upload task for %s' % job)
            self._set_promise_error(job)
            continue
        if not task:
            # No task created (but no exception either) — retry later.
            self._schedule_retry(job, self.Action.CREATE_UPLOAD_TASK, delay=10.0)
            continue
        job.upload_task_id = task.id
        logging.debug('upload_task_id=%d for %s' % (job.upload_task_id, job))
        with self.lock:
            self.wait1_queue.append(job)
            self.wait1_queue_not_empty.notify()
        del task
        del job
def try_log_descriptors():
    """Best-effort debugging aid: log this process's open descriptors via lsof."""
    try:
        import subprocess
        files = subprocess.check_output(['lsof', '-p', str(os.getpid())])
    except Exception:
        # Fix: was a bare `except:`, which would also swallow SystemExit and
        # KeyboardInterrupt. Still deliberately best-effort: lsof may be absent.
        logging.exception('Failed to dump lsof')
    else:
        # NOTE(review): on Python 3 check_output returns bytes, which would make
        # this concatenation raise; this codebase appears to target Python 2
        # (str output) — confirm before porting.
        logging.debug('lsof\n' + files)
def guess_my_host(peer_addr, timeout):
    """Return the local IP address used to reach peer_addr, or None.

    peer_addr -- (host, port) tuple, as accepted by socket.create_connection.
    timeout   -- connect timeout in seconds.

    Connects to the peer and reads the local end of the socket; returns None
    if the connection cannot be established within the timeout.
    """
    try:
        s = socket.create_connection(peer_addr, timeout=timeout)
    except socket.error as e:
        logging.debug('Can\'t connect to %s: %s' % (peer_addr, e))
        return
    # Fix: the original leaked the socket; close it once we have the address.
    try:
        return s.getsockname()[0]
    finally:
        s.close()
def share(self, *args, **kwargs):
    """Create a share job, enqueue it, and return a future for its result.

    All positional/keyword arguments are forwarded to self.Job.
    """
    new_job = self.Job(*args, **kwargs)
    with self.lock:
        # Assign a unique id under the lock, register and enqueue the job.
        new_job.id = self.next_job_id
        self.next_job_id += 1
        self.running[new_job.id] = new_job
        self.share_queue.append(new_job)
        self.share_queue_not_empty.notify()
    logging.debug('New %s' % new_job)
    return new_job.promise.to_future()
def _the_loop(self):
    # Sender loop: waits (under self._lock) for a pending update, sends it
    # outside the lock, and retries retriable failures after _RETRY_DELAY.
    # Exits when the final update is sent or the stop deadline passes.
    next_try_min_time = 0
    while True:
        with self._lock:
            while True:
                now = time.time()
                if self._should_stop_max_time:
                    # Hard deadline: give up if it has already passed, or if
                    # the earliest allowed retry would start after it anyway.
                    if now > self._should_stop_max_time \
                        or next_try_min_time > self._should_stop_max_time:
                        return
                if self._pending_update:
                    deadline = next_try_min_time
                    if now > deadline:
                        break # have an update and the retry backoff elapsed
                else:
                    deadline = None # nothing pending: wait indefinitely
                self._changed.wait(deadline - now if deadline is not None else None)
            # Claim the pending update while still holding the lock.
            update, is_final = self._pending_update
            self._pending_update = None
        logging.debug('sending_update: %s' % ((update, is_final),))
        try:
            # The actual send happens outside the lock.
            self._send_update(update, is_final)
        except self.RetriableError:
            logging.exception('Failed to send update')
            with self._lock:
                # Put the update back unless a newer one superseded it meanwhile.
                if not self._pending_update:
                    self._pending_update = (update, is_final)
            next_try_min_time = time.time() + self._RETRY_DELAY
        else:
            if is_final:
                return
def sky_share(subproc, directory, files):
    # Launch `sky share -d <directory> <files...>` asynchronously; return a
    # future that resolves to the published torrent id ('rbtorrent:...').
    # NOTE(review): `subproc` is a project-local subprocess service; presumably
    # stdout=out.name redirects the child's stdout into that file — confirm.
    out = NamedTemporaryFile('w')
    argv = ['sky', 'share', '-d', directory] + files
    logging.debug(argv)
    # FIXME collect STDERR
    try:
        p = subproc.Popen(argv, stdout=out.name)
    except:
        # Re-raise the original error, but make sure the temp file is
        # cleaned up first (best-effort: cleanup errors are swallowed).
        try:
            raise
        finally:
            try:
                out.__exit__(None, None, None)
            except:
                pass
    def finalize(exit_code):
        # Runs once the process's returncode future resolves: validate the
        # exit status, then read the torrent id from the captured stdout.
        try:
            status = exit_code.get()
            if status:
                raise RuntimeError("sky share failed with exit status: %d" % status)
            with open(out.name) as in_:
                id = in_.readline().rstrip('\n')
            if not id.startswith('rbtorrent:'):
                raise RuntimeError('Malformed output of sky share: %s' % id[:100])
            return id
        finally:
            # Always dispose of the temp file, success or failure.
            try:
                out.__exit__(None, None, None)
            except:
                pass
    return wrap_future(p.get_returncode_future(), finalize)
def _sandbox_wait1_loop(self):
    """Poll Sandbox for the status of submitted upload tasks.

    New jobs arrive on self.wait1_queue; their task ids are polled roughly
    every poll_interval seconds.  Successful tasks are handed to the wait2
    loop, failed/lost ones get a new upload task scheduled, and unknown
    statuses fail the job's promise.
    """
    poll_interval = 3.0
    in_progress = {} # upload_task_id -> job
    # Statuses that mean "still in flight": keep polling, take no action.
    noop_sandbox_statuses = {
        'DRAFT', 'ENQUEUING', 'ENQUEUED', 'PREPARING', 'EXECUTING',
        'TEMPORARY', 'FINISHING', 'STOPPING',
        'WAIT_RES', 'WAIT_TASK', 'WAIT_TIME',
    }
    next_poll_time = time.time()
    while not self.should_stop:
        T('begin_wait1_loop')
        with self.lock:
            timeout = None
            if in_progress:
                # Wake up no later than the next scheduled poll.
                timeout = max(0.0, next_poll_time - time.time())
            T('before_before_wait1_queue_not_empty_sleep %s' \
                % ((timeout, self.should_stop, len(self.wait1_queue)),))
            if (timeout is None or timeout) and not(self.should_stop or self.wait1_queue):
                T('before_wait1_queue_not_empty_sleep %s' % timeout)
                self.wait1_queue_not_empty.wait(timeout)
            if self.should_stop: # TODO
                return
            job = None
            if self.wait1_queue:
                job = self.wait1_queue.popleft()
                in_progress[job.upload_task_id] = job
                del job
        if time.time() < next_poll_time:
            logging.debug('continue_wait1_sleep')
            continue
        try:
            with Timing('sbx_list_task_statuses'):
                statuses = self.sandbox.list_task_statuses(list(in_progress.keys()))
        except Exception as e:
            logging.warning("Failed to get sandbox tasks' statuses: %s" % e)
            continue
        finally:
            # Schedule the next poll whether this one succeeded or not.
            next_poll_time = max(next_poll_time + poll_interval, time.time())
            T('wait1_next_poll_time=%s' % next_poll_time)
        logging.debug("Task statuses: %s" % statuses) # TODO Comment out
        done = []
        # Fix: iterate over a snapshot of the keys — the original iterated
        # in_progress.keys() while popping entries from the dict, which raises
        # RuntimeError ("changed size during iteration") on Python 3.
        for task_id in list(in_progress.keys()):
            status = statuses.get(task_id)
            if status in noop_sandbox_statuses:
                continue
            job = in_progress.pop(task_id)
            if status == 'SUCCESS':
                done.append(job)
                logging.debug('Upload task=%d in SUCCESS for %s' % (task_id, job))
            elif status in ['FAILURE', 'EXCEPTION'] or status is None:
                # A missing status (None) is treated like a failed task.
                logging.warning("Task %d in FAILURE. Will create new task" % task_id)
                self._schedule_retry(job, self.Action.CREATE_UPLOAD_TASK, delay=5.0)
            else:
                logging.error("Unknown task status %s for task=%d, %s"
                    % (status, task_id, job))
                self._set_promise_error(job, RuntimeError(
                    "Unknown task status %s for task_id=%d" % (status, task_id)))
        T('after_process_all_wait1_statuses')
        with self.lock:
            self.wait2_queue.extend(done)
            self.wait2_queue_not_empty.notify()
def _share_loop(self):
    """Worker loop: run `sky share` for queued jobs.

    Jobs arrive on self.share_queue; a sky-share subprocess is started for
    each, and the _finished callback (run when its future resolves) either
    hands the job to the upload loop, schedules a retry, or fails its promise.
    """
    in_progress = set()

    def schedule_retry(job):
        self._schedule_retry(job, self.Action.SHARE_FILE, delay=10.0)

    def _finished(job_id, f):
        # Callback fired when the sky-share future for job_id completes.
        T('enter_sky_share_finished %d' % job_id)
        job = self.running[job_id]
        in_progress.remove(job)
        try:
            with Timing('sky_share_future_get %d' % job_id):
                torrent_id = f.get()
        #except ???Error as e: # TODO
            #pass
        except Exception as e:
            # Fix: log message typo ("faled" -> "failed").
            logging.warning('sky share for %s failed: %s' % (job, e))
            # TODO better checks or collect STDERR
            # If an input file vanished, the share can never succeed: fail
            # the promise with ENOENT instead of retrying forever.
            for fname in job.files:
                if not os.path.exists(job.directory + '/' + fname):
                    # Fix: report the specific missing file, not the whole list.
                    msg = 'Failed to share %s/%s: %s' % (job.directory, fname, e)
                    self._set_promise_error(job, OSError(errno.ENOENT, msg))
                    return
            schedule_retry(job)
            return
        logging.debug('sky share successfully done for %s: %s' % (job, torrent_id))
        with self.lock:
            job.torrent_id = torrent_id
            self.upload_queue.append(job)
            self.upload_queue_not_empty.notify()

    while not self.should_stop: # TODO
        T('begin_share_loop')
        with self.lock:
            while not(self.should_stop or self.share_queue):
                self.share_queue_not_empty.wait()
            if self.should_stop: # TODO
                return
            job = self.share_queue.popleft()
            in_progress.add(job)
        logging.debug('Run sky share for %s' % job)
        try:
            with Timing('sky_share_future %d' % job.id):
                # ~4ms (we wait pid from subprocsrv)
                torrent_id = sky_share(self.subproc, job.directory, job.files)
        except Exception:
            # Fix: was a bare `except:` that also caught SystemExit and
            # KeyboardInterrupt; keep the best-effort retry behaviour.
            logging.exception('') # TODO
            in_progress.remove(job)
            schedule_retry(job)
            del job
            continue
        torrent_id.subscribe(lambda f, job_id=job.id: _finished(job_id, f))
        del torrent_id
        del job
raise RuntimeError("Failed to send data to rem server: %s" % e.faultString) else: # FIXME Actually if isinstance(e, xmlrpclib.Fault) then not retriable # but not fatal as WrongTaskIdError raise RemNotifier.RetriableError(str(e)) rem_notifier = RemNotifier(send_update) #rem_notifier.send_update(pck.produce_rem_update_message()) # FIXME # TODO _create_rpc_server may throw errno.EADDRINUSE rpc_server = _create_rpc_server(pck, opts) try_log_descriptors() logging.debug('rpc_server.server_address = %s' % (rpc_server.server_address,)) my_host = try_guess_my_host(parse_network_address(opts.rem_server_addr), timeout=3.0) logging.debug('guessed host = %s' % my_host) rpc_server_addr = ( my_host or os.uname()[1], rpc_server.server_address[1] ) reset_tries = False if opts.resume_params: resume_params = json.loads(opts.resume_params) if resume_params.get('use_dummy_jobs', False):