def _update_train_data_for_next_iteration(self, task_declaration: TaskDeclaration):
    """Switch a verified task declaration over to its next training iteration.

    The declaration must be in the ``VERIFY_IN_PROGRESS`` state. The method
    increments ``current_iteration``, resets ``current_iteration_retry`` to 0
    and recomputes ``progress`` as a percentage of ``epochs``. Every task
    assignment in the ``FINISHED`` state gets its train data re-keyed with the
    worker's ``enc_key`` and is put back into ``TRAINING``. Finally the
    declaration is saved in the ``EPOCH_IN_PROGRESS`` state.

    :param task_declaration: declaration to advance; modified and saved.
    :raises AssertionError: if the declaration is not in ``VERIFY_IN_PROGRESS``
        or the number of re-armed assignments differs from
        ``workers_requested``.
    """
    assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

    task_declaration.current_iteration += 1
    task_declaration.current_iteration_retry = 0

    # Epochs covered once the new iteration number is reached, as a share of
    # the total number of epochs.
    epochs_reached = (
        task_declaration.current_iteration * task_declaration.epochs_in_iteration
    )
    task_declaration.progress = epochs_reached * 100 / task_declaration.epochs

    finished_assignments = task_declaration.get_task_assignments(
        states=(TaskAssignment.State.FINISHED,)
    )

    rearmed = 0
    for rearmed, assignment in enumerate(finished_assignments, start=1):
        data = assignment.train_data
        # Encrypt the train data with the worker's key so the worker can read it.
        data.set_encryption_key(assignment.worker.enc_key)
        data.save()

        assignment.state = TaskAssignment.State.TRAINING
        assignment.save()

    # Every requested worker must have had a finished assignment to re-arm.
    assert task_declaration.workers_requested == rearmed

    task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
    task_declaration.save()
def _process_verify_in_progress(self, task_declaration: TaskDeclaration):
    """Advance a task declaration that is in the ``VERIFY_IN_PROGRESS`` state.

    The ``VERIFYING`` and ``FINISHED`` verification assignments are inspected
    first: a ``VERIFYING`` assignment whose iteration is finished is promoted
    to ``FINISHED``; one that is still verifying and whose verification result
    was last modified more than ``settings.WAIT_VERIFY_TIMEOUT`` seconds ago
    is marked ``TIMEOUT``.

    Then the first matching action below is taken and the method returns:

    1. at least one assignment timed out: ``verifiers_needed`` is increased by
       the number of timeouts and the task is republished for verification;
    2. at least one finished verification reported an error: the declaration
       is saved as ``FAILED``;
    3. fewer error-free finished verifications than ``verifiers_requested``:
       nothing is changed, verification is still running;
    4. the verification results name fake workers: they are rejected and the
       task is republished for training;
    5. this was not the last iteration: the next iteration is prepared;
    6. otherwise the declaration is saved as ``COMPLETED`` with progress 100.

    :param task_declaration: declaration to process; may be modified and saved.
    :raises AssertionError: if the declaration is not in ``VERIFY_IN_PROGRESS``.
    """
    assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

    verification_assignments = task_declaration.get_verification_assignments(
        states=(
            VerificationAssignment.State.VERIFYING,
            VerificationAssignment.State.FINISHED,
        )
    )

    failed = False
    finished_verification_assignments = []
    count_timeout = 0

    # All per-assignment saves below happen inside the async_commit() context.
    with async_commit():
        for va in verification_assignments:
            if (va.state == VerificationAssignment.State.VERIFYING
                    and va.iteration_is_finished):
                va.state = VerificationAssignment.State.FINISHED
                va.save()

            if va.state == VerificationAssignment.State.FINISHED:
                if va.verification_result.error:
                    failed = True
                else:
                    finished_verification_assignments.append(va)
                continue

            # Still verifying: check how long ago the result was last touched.
            modified_at = va.verification_result.modified_at
            # Use an aware UTC "now" so the difference is correct for any
            # offset carried by modified_at. Stamping the UTC wall clock with
            # modified_at's tzinfo would skew the result by that offset.
            now = datetime.datetime.now(datetime.timezone.utc)
            if modified_at.utcoffset() is None:
                # Naive timestamps are interpreted as UTC; naive and aware
                # datetimes cannot be subtracted from each other.
                now = now.replace(tzinfo=None)

            if (now - modified_at).total_seconds() > settings.WAIT_VERIFY_TIMEOUT:
                va.state = VerificationAssignment.State.TIMEOUT
                va.save()
                logger.info('Timeout of waiting for {}'.format(va))
                count_timeout += 1

    if count_timeout:
        # Ask for one replacement verifier per timed-out assignment.
        task_declaration.verifiers_needed += count_timeout
        self._republish_for_verify(task_declaration)
        return

    if failed:
        logger.info('{} is failed'.format(task_declaration))
        task_declaration.state = TaskDeclaration.State.FAILED
        task_declaration.save()
        return

    if len(finished_verification_assignments) < task_declaration.verifiers_requested:
        # Verification is not ready yet.
        logger.info('Wait for finish of verification for {} iteration {}'.format(
            task_declaration, task_declaration.current_iteration))
        return

    fake_workers = self._parse_verification_results(
        task_declaration, finished_verification_assignments)
    if fake_workers:
        logger.info('Fake workers detected')
        for worker_id, count_detections in fake_workers.items():
            logger.info('Fake worker_id: {}, count detections: {}'.format(worker_id, count_detections))
        fake_worker_ids = list(fake_workers)
        self._reject_fake_workers(task_declaration, fake_worker_ids)
        self._republish_for_train(task_declaration)
        return

    if not task_declaration.last_iteration:
        self._update_train_data_for_next_iteration(task_declaration)
        return

    task_declaration.progress = 100.0
    task_declaration.state = TaskDeclaration.State.COMPLETED
    task_declaration.save()
    logger.info('{} is finished tflops: {} estimated: {}'.format(
        task_declaration, task_declaration.tflops, task_declaration.estimated_tflops))