def _parse_verification_results(self, task_declaration: TaskDeclaration,
                                finished_verification_assignments: List[VerificationAssignment]):
    fake_workers = {}
    for va in finished_verification_assignments:
        assert va.verification_result.error is None
        for result in va.verification_result.result:
            if result['is_fake']:
                try:
                    fake_workers[result['worker_id']] += 1
                except KeyError:
                    fake_workers[result['worker_id']] = 1

        task_declaration.tflops += va.verification_result.tflops

        # TODO: what should happen when there are many verifiers?
        if va.verification_result.weights_ipfs:
            # if weights_ipfs is None, then fake workers are present
            task_declaration.weights_ipfs = va.verification_result.weights_ipfs
            if task_declaration.last_iteration:
                task_declaration.loss = va.verification_result.loss
                task_declaration.accuracy = va.verification_result.accuracy
                logger.info('Copy summarization for {}, loss: {}, accuracy: {}'.format(
                    task_declaration, task_declaration.loss, task_declaration.accuracy))
            else:
                logger.info('Copy summarization for {}'.format(task_declaration))

    return fake_workers
def _republish_for_estimation(self, task_declaration: TaskDeclaration):
    assert task_declaration.estimators_needed > 0

    task_declaration.state = TaskDeclaration.State.ESTIMATE_IS_REQUIRED
    task_declaration.save()

    for ea in task_declaration.get_estimation_assignments(states=(EstimationAssignment.State.REJECTED,)):
        ea.state = EstimationAssignment.State.REASSIGN
        # return ownership back to the estimator
        ea.save(recipients=ea.estimator.address)
def _reassign_train_data(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT_TRAIN

    task_assignments = task_declaration.get_task_assignments(
        states=(
            TaskAssignment.State.ACCEPTED,
            TaskAssignment.State.TIMEOUT,
            TaskAssignment.State.FAKE_RESULTS
        )
    )

    # split by state
    accepted_task_assignments = []
    failed_task_assignments = []
    for ta in task_assignments:
        if ta.state == TaskAssignment.State.ACCEPTED:
            accepted_task_assignments.append(ta)
        elif ta.state in (TaskAssignment.State.TIMEOUT, TaskAssignment.State.FAKE_RESULTS):
            failed_task_assignments.append(ta)
        else:
            assert False, 'Check query!'

    assert len(failed_task_assignments) == len(accepted_task_assignments)

    # reassign train data of the failed assignments to the newly accepted ones
    for index, ta in enumerate(accepted_task_assignments):
        failed_ta = failed_task_assignments[index]
        # retrieve the train data, which the producer is able to decrypt
        train_data = TrainData.get_with_initial_data(
            asset_id=failed_ta.train_data_id,
            db=self.db,
            encryption=self.encryption
        )
        train_data.task_assignment_id = ta.asset_id
        # share the data with the new worker
        train_data.set_encryption_key(ta.worker.enc_key)
        train_data.save()

        ta.train_data_id = train_data.asset_id
        ta.state = TaskAssignment.State.TRAINING
        ta.save()

        failed_ta.state = TaskAssignment.State.FORGOTTEN
        failed_ta.save()

    task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
    task_declaration.save()
def _process_epoch_in_progress(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.EPOCH_IN_PROGRESS

    task_assignments = task_declaration.get_task_assignments(
        states=(
            TaskAssignment.State.TRAINING,
            TaskAssignment.State.FINISHED,
        )
    )

    failed = False
    finished_task_assignments = []
    count_timeout = 0
    with async_commit():
        for ta in task_assignments:
            if ta.state == TaskAssignment.State.TRAINING:
                if ta.iteration_is_finished:
                    ta.state = TaskAssignment.State.FINISHED
                    ta.save()

            if ta.state == TaskAssignment.State.FINISHED:
                if ta.train_result.error:
                    failed = True
                else:
                    finished_task_assignments.append(ta)
                continue

            # still training: check for timeout
            train_timeout = settings.WAIT_TRAIN_TIMEOUT
            # make `now` comparable with the tz-aware modified_at timestamp
            now = datetime.datetime.utcnow().replace(tzinfo=ta.train_result.modified_at.tzinfo)
            if (now - ta.train_result.modified_at).total_seconds() > train_timeout:
                ta.state = TaskAssignment.State.TIMEOUT
                ta.save()
                logger.info('Timeout of waiting for {}'.format(ta))
                count_timeout += 1

    if failed:
        logger.info('{} is failed'.format(task_declaration))
        task_declaration.state = TaskDeclaration.State.FAILED
        task_declaration.save()
        return

    if count_timeout:
        task_declaration.workers_needed += count_timeout
        self._republish_for_train(task_declaration)
        return

    if len(finished_task_assignments) < task_declaration.workers_requested:
        logger.info('Waiting for training to finish for {} iteration {}'.format(
            task_declaration, task_declaration.current_iteration))
        return

    if task_declaration.current_iteration > 1:
        self._save_loss_and_accuracy(task_declaration, finished_task_assignments)

    self._assign_verification_data(task_declaration, finished_task_assignments)
def monitor_task(asset_id, producer):
    task_declaration = TaskDeclaration.get(asset_id, db=producer.db, encryption=producer.encryption)
    logger.info('{} state {}'.format(task_declaration, task_declaration.state))

    while task_declaration.state != TaskDeclaration.State.FAILED:
        print_task_declaration(task_declaration)

        time.sleep(3)
        task_declaration = TaskDeclaration.get(asset_id, db=producer.db, encryption=producer.encryption)
        if task_declaration.state == TaskDeclaration.State.COMPLETED:
            print_task_declaration(task_declaration)
            break
def _reassign_verification_data(self, task_declaration: TaskDeclaration):
    verification_assignments = task_declaration.get_verification_assignments(
        states=(VerificationAssignment.State.ACCEPTED, VerificationAssignment.State.TIMEOUT)
    )

    # split into accepted and overdue
    accepted_verification_assignments = []
    timeout_verification_assignments = []
    for va in verification_assignments:
        if va.state == VerificationAssignment.State.ACCEPTED:
            accepted_verification_assignments.append(va)
        elif va.state == VerificationAssignment.State.TIMEOUT:
            timeout_verification_assignments.append(va)
        else:
            assert False, 'Check query!'

    assert len(accepted_verification_assignments) == len(timeout_verification_assignments)

    train_results = [
        {
            'worker_id': ta.worker_id,
            'result': ta.train_result.weights_ipfs
        }
        for ta in task_declaration.get_task_assignments(states=(TaskAssignment.State.FINISHED,))
    ]

    for index, va in enumerate(accepted_verification_assignments):
        assert va.verification_data_id is None
        verification_data = VerificationData.create(
            verification_assignment_id=va.asset_id,
            # share data with the verifier
            public_key=va.verifier.enc_key,
            test_dir_ipfs=task_declaration.dataset.test_dir_ipfs,
            model_code_ipfs=task_declaration.train_model.code_ipfs,
            train_results=train_results,
            db=self.db,
            encryption=self.encryption
        )

        va.verification_data_id = verification_data.asset_id
        va.state = VerificationAssignment.State.VERIFYING
        va.save()

        failed_va = timeout_verification_assignments[index]
        failed_va.state = VerificationAssignment.State.FORGOTTEN
        failed_va.save()

    task_declaration.state = TaskDeclaration.State.VERIFY_IN_PROGRESS
    task_declaration.save()
def _process_estimated(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.ESTIMATED

    # wait until the job is issued and has enough balance
    if task_declaration.balance_in_wei >= task_declaration.train_cost_in_wei:
        task_declaration.state = TaskDeclaration.State.DEPLOYMENT
        task_declaration.save()
    else:
        if task_declaration.issued:
            logger.info('Deposit for {} is required, balance: {:.5f} train cost: {:.5f}'.format(
                task_declaration, task_declaration.balance, task_declaration.train_cost))
        else:
            logger.info('Issue for {} is required, train cost: {:.5f}'.format(
                task_declaration, task_declaration.train_cost))
def _republish_for_verify(self, task_declaration: TaskDeclaration):
    assert task_declaration.verifiers_needed > 0

    task_declaration.state = TaskDeclaration.State.DEPLOYMENT_VERIFICATION
    task_declaration.save()

    verification_assignments = task_declaration.get_verification_assignments(
        states=(VerificationAssignment.State.REJECTED,)
    )

    for va in verification_assignments:
        va.state = VerificationAssignment.State.REASSIGN
        # return ownership back to the verifier
        va.save(recipients=va.verifier.address)
def _republish_for_train(self, task_declaration: TaskDeclaration):
    assert task_declaration.workers_needed > 0

    task_declaration.state = TaskDeclaration.State.DEPLOYMENT_TRAIN
    task_declaration.current_iteration_retry += 1
    task_declaration.save()

    task_assignments = task_declaration.get_task_assignments(
        states=(TaskAssignment.State.REJECTED,)
    )

    for ta in task_assignments:
        ta.state = TaskAssignment.State.REASSIGN
        # return ownership back to the worker
        ta.save(recipients=ta.worker.address)
def _save_loss_and_accuracy(self, task_declaration: TaskDeclaration, finished_task_assignments):
    assert task_declaration.current_iteration > 1
    loss = []
    accuracy = []

    # collect loss and accuracy for the previous iteration
    iteration = str(task_declaration.current_iteration - 1)
    for ta in finished_task_assignments:
        if ta.train_result.eval_results is not None and ta.train_result.eval_results.get(iteration):
            loss.append(ta.train_result.eval_results[iteration]['loss'])
            accuracy.append(ta.train_result.eval_results[iteration]['accuracy'])

    assert len(loss) and len(accuracy)
    task_declaration.loss = sum(loss) / len(loss)
    task_declaration.accuracy = sum(accuracy) / len(accuracy)

    logger.info('Saved averages for iteration {}: loss {}, accuracy {}'.format(
        iteration, task_declaration.loss, task_declaration.accuracy))
def train_task(self, asset_id):
    while True:
        task_declaration = TaskDeclaration.get(asset_id, db=self.db, encryption=self.encryption)
        if task_declaration.in_finished_state:
            break

        self._process_task_declaration(task_declaration)
        time.sleep(settings.PRODUCER_PROCESS_INTERVAL)
def _process_deployment(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT

    with async_commit():
        save = False

        for ta in task_declaration.get_task_assignments(states=(TaskAssignment.State.READY,)):
            if self._is_task_assignment_allowed(task_declaration, ta):
                ta.state = TaskAssignment.State.ACCEPTED
                ta.save()

                task_declaration.workers_needed -= 1
                save = True
            else:
                ta.state = TaskAssignment.State.REJECTED
                ta.save()

        for va in task_declaration.get_verification_assignments(states=(VerificationAssignment.State.READY,)):
            if self._is_verification_assignment_allowed(task_declaration, va):
                va.state = VerificationAssignment.State.ACCEPTED
                va.save()

                task_declaration.verifiers_needed -= 1
                save = True
            else:
                va.state = VerificationAssignment.State.REJECTED
                va.save()

        # save only if there were changes
        if save:
            task_declaration.save()

    ready_to_start = task_declaration.workers_needed == 0 and task_declaration.verifiers_needed == 0
    logger.info('{} ready: {} workers_needed: {} verifiers_needed: {}'.format(
        task_declaration, ready_to_start, task_declaration.workers_needed, task_declaration.verifiers_needed))

    if ready_to_start:
        self._assign_initial_train_data(task_declaration)
        return

    if not save:
        # recheck how many workers and verifiers have actually accepted
        accepted_workers_count = len(task_declaration.get_task_assignments(
            states=(TaskAssignment.State.ACCEPTED,)))
        accepted_verifiers_count = len(task_declaration.get_verification_assignments(
            states=(VerificationAssignment.State.ACCEPTED,)))

        if accepted_workers_count == task_declaration.workers_requested \
                and accepted_verifiers_count == task_declaration.verifiers_requested:
            logger.info('All performers are accepted, start train')
            task_declaration.workers_needed = 0
            task_declaration.verifiers_needed = 0
            self._assign_initial_train_data(task_declaration)
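# NOTE: _is_task_assignment_allowed and _is_verification_assignment_allowed are
# referenced above but are not part of this section. The function below is a
# minimal, hypothetical sketch of such a check, assuming only the
# `workers_needed` counter and `worker_id` fields used elsewhere in this class;
# the real acceptance policy (reputation, balance checks, etc.) may differ.
def _is_task_assignment_allowed_sketch(self, task_declaration: TaskDeclaration, task_assignment: TaskAssignment):
    # stop accepting once the requested number of workers has been reached
    if task_declaration.workers_needed <= 0:
        return False

    # reject a worker that already holds an accepted assignment for this task
    accepted = task_declaration.get_task_assignments(states=(TaskAssignment.State.ACCEPTED,))
    return all(ta.worker_id != task_assignment.worker_id for ta in accepted)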
def train_remote(train_ipfs, test_ipfs, args):
    logger.info('Start remote train')
    producer = load_producer()

    logger.info('Generate initial model weights')
    model = Model.load_model(path=args.path)

    initial_weights_path = '/tmp/tatau_initial_weights'
    model.save_weights(initial_weights_path)

    ipfs = IPFS()
    logger.info('Upload weights to IPFS')
    initial_weights_file = ipfs.add_file(initial_weights_path)

    os.unlink(initial_weights_path)

    dataset_name = os.path.basename(args.name)

    dataset = Dataset.create(
        db=producer.db,
        encryption=producer.encryption,
        name=dataset_name,
        train_dir_ipfs=train_ipfs,
        test_dir_ipfs=test_ipfs
    )
    logger.debug('Dataset created: {}'.format(dataset))

    logger.info('Create model')
    train_model = TrainModel.upload_and_create(
        name=args.name,
        code_path=args.path,
        db=producer.db,
        encryption=producer.encryption
    )
    logger.debug('Model created: {}'.format(train_model))

    logger.info('Create train job')
    task = TaskDeclaration.create(
        producer_id=producer.asset_id,
        dataset_id=dataset.asset_id,
        train_model_id=train_model.asset_id,
        workers_needed=args.workers,
        verifiers_needed=args.verifiers,
        batch_size=args.batch,
        epochs=args.epochs,
        weights_ipfs=initial_weights_file.multihash,
        db=producer.db,
        encryption=producer.encryption,
        epochs_in_iteration=args.epochs_in_iteration
    )
    logger.debug('Train job created: {}'.format(task))
def _assign_verification_data(self, task_declaration: TaskDeclaration,
                              task_assignments: List[TaskAssignment]):
    train_results = []
    for ta in task_assignments:
        train_results.append({
            'worker_id': ta.worker_id,
            'result': ta.train_result.weights_ipfs
        })
        task_declaration.tflops += ta.train_result.tflops

    for verification_assignment in task_declaration.get_verification_assignments(
            states=(VerificationAssignment.State.ACCEPTED, VerificationAssignment.State.FINISHED)):

        if verification_assignment.state == VerificationAssignment.State.ACCEPTED:
            assert verification_assignment.verification_data_id is None
            verification_data = VerificationData.create(
                verification_assignment_id=verification_assignment.asset_id,
                # share data with the verifier
                public_key=verification_assignment.verifier.enc_key,
                test_dir_ipfs=task_declaration.dataset.test_dir_ipfs,
                model_code_ipfs=task_declaration.train_model.code_ipfs,
                train_results=train_results,
                db=self.db,
                encryption=self.encryption
            )

            verification_assignment.verification_data_id = verification_data.asset_id
            verification_assignment.state = VerificationAssignment.State.VERIFYING
            verification_assignment.save()
            continue

        if verification_assignment.state == VerificationAssignment.State.FINISHED:
            # reuse the existing verification data for the next iteration
            verification_data = verification_assignment.verification_data
            verification_data.train_results = train_results
            verification_data.save()

            verification_assignment.state = VerificationAssignment.State.VERIFYING
            verification_assignment.save()
            continue

    task_declaration.state = TaskDeclaration.State.VERIFY_IN_PROGRESS
    task_declaration.save()
def process_tasks(self):
    while True:
        try:
            for task_declaration in TaskDeclaration.enumerate(db=self.db, encryption=self.encryption):
                if task_declaration.in_finished_state:
                    continue

                self._process_task_declaration(task_declaration)

            time.sleep(settings.PRODUCER_PROCESS_INTERVAL)
        except Exception as e:
            logger.exception(e)
def _process_estimate_is_required(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.ESTIMATE_IS_REQUIRED

    with async_commit():
        save = False

        for ea in task_declaration.get_estimation_assignments(states=(EstimationAssignment.State.READY,)):
            if self._is_estimation_assignment_allowed(task_declaration, ea):
                ea.state = EstimationAssignment.State.ACCEPTED
                ea.save()

                task_declaration.estimators_needed -= 1
                save = True
            else:
                ea.state = EstimationAssignment.State.REJECTED
                ea.save()

        # save only if there were changes
        if save:
            task_declaration.save()

    if task_declaration.estimators_needed == 0:
        # changes are saved inside _assign_estimate_data
        self._assign_estimate_data(task_declaration)
def _process_task_declarations(self):
    task_declarations = TaskDeclaration.enumerate(
        created_by_user=False, db=self.db, encryption=self.encryption)

    for task_declaration in task_declarations:
        try:
            self._process_task_declaration(task_declaration)
        except requests.exceptions.ConnectionError as ex:
            # hide connection errors to parity from sentry
            parity_ports = [
                settings.PARITY_JSONRPC_PORT,
                settings.PARITY_WEBSOCKET_PORT
            ]
            if ex.args[0].pool.port in parity_ports and ex.args[0].pool.host == settings.PARITY_HOST:
                logger.info(ex)
            else:
                raise
        except Exception as ex:
            logger.exception(ex)
def _process_estimate_is_in_progress(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.ESTIMATE_IS_IN_PROGRESS

    estimation_assignments = task_declaration.get_estimation_assignments(
        states=(
            EstimationAssignment.State.ESTIMATING,
            EstimationAssignment.State.FINISHED
        )
    )

    finished_assignments = []
    count_timeout = 0
    with async_commit():
        for ea in estimation_assignments:
            if ea.state == EstimationAssignment.State.ESTIMATING:
                if ea.estimation_result.state == EstimationResult.State.FINISHED:
                    ea.state = EstimationAssignment.State.FINISHED
                    ea.save()
                else:
                    estimate_timeout = settings.WAIT_ESTIMATE_TIMEOUT
                    # make `now` comparable with the tz-aware modified_at timestamp
                    now = datetime.datetime.utcnow().replace(tzinfo=ea.estimation_result.modified_at.tzinfo)
                    if (now - ea.estimation_result.modified_at).total_seconds() > estimate_timeout:
                        ea.state = EstimationAssignment.State.TIMEOUT
                        ea.save()
                        logger.info('Timeout of waiting for {}'.format(ea))
                        count_timeout += 1

            if ea.state == EstimationAssignment.State.FINISHED:
                finished_assignments.append(ea)

    if count_timeout:
        task_declaration.estimators_needed += count_timeout
        self._republish_for_estimation(task_declaration)
        return

    if len(finished_assignments) == task_declaration.estimators_requested:
        task_declaration.state = TaskDeclaration.State.ESTIMATED
        task_declaration.estimated_tflops, failed = Estimator.estimate(task_declaration, finished_assignments)
        if failed:
            logger.info('{} is failed'.format(task_declaration))
            task_declaration.state = TaskDeclaration.State.FAILED

        task_declaration.save()
        return

    logger.info('Waiting for estimation to finish for {}, finished: {}, requested: {}'.format(
        task_declaration, len(finished_assignments), task_declaration.estimators_requested))
def _update_train_data_for_next_iteration(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

    task_declaration.current_iteration += 1
    task_declaration.current_iteration_retry = 0
    task_declaration.progress = (
        task_declaration.current_iteration * task_declaration.epochs_in_iteration * 100
        / task_declaration.epochs)

    count_ta = 0
    for ta in task_declaration.get_task_assignments(states=(TaskAssignment.State.FINISHED,)):
        train_data = ta.train_data
        # share the data with the worker
        train_data.set_encryption_key(ta.worker.enc_key)
        train_data.save()

        ta.state = TaskAssignment.State.TRAINING
        ta.save()
        count_ta += 1

    assert task_declaration.workers_requested == count_ta

    task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
    task_declaration.save()
def task_declaration(self):
    from tatau_core.models import TaskDeclaration
    return TaskDeclaration.get(self.task_declaration_id, db=self.db, encryption=self.encryption)
def _assign_initial_train_data(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT

    # start of training
    task_declaration.current_iteration += 1
    task_declaration.current_iteration_retry = 0

    accepted_task_assignments = task_declaration.get_task_assignments(states=(TaskAssignment.State.ACCEPTED,))
    count_ta = 0

    train_dirs_ipfs, _ = Directory(multihash=task_declaration.dataset.train_dir_ipfs).ls()
    test_dirs_ipfs, _ = Directory(multihash=task_declaration.dataset.test_dir_ipfs).ls()

    all_train_chunks_ipfs = self._chunk_it(
        iterable=[x.multihash for x in train_dirs_ipfs],
        count=task_declaration.workers_requested
    )
    assert len(all_train_chunks_ipfs) == task_declaration.workers_requested

    all_test_chunks_ipfs = self._chunk_it(
        iterable=[x.multihash for x in test_dirs_ipfs],
        count=task_declaration.workers_requested
    )
    assert len(all_test_chunks_ipfs) == task_declaration.workers_requested

    list_td_ta = []
    with async_commit():
        # create TrainData for each accepted assignment
        for index, task_assignment in enumerate(accepted_task_assignments):
            train_chunks_ipfs = all_train_chunks_ipfs[index]
            test_chunks_ipfs = all_test_chunks_ipfs[index]
            train_data = TrainData.create(
                model_code_ipfs=task_declaration.train_model.code_ipfs,
                train_chunks_ipfs=train_chunks_ipfs,
                test_chunks_ipfs=test_chunks_ipfs,
                data_index=index,
                db=self.db,
                encryption=self.encryption
            )
            list_td_ta.append((train_data, task_assignment))
            logger.debug('Created {}, train chunks: {}, count: {}, test chunks: {}, count: {}'.format(
                train_data, train_chunks_ipfs, len(train_chunks_ipfs), test_chunks_ipfs, len(test_chunks_ipfs)))
            count_ta += 1

    assert task_declaration.workers_requested == count_ta

    with async_commit():
        # share the data with each worker
        for train_data, task_assignment in list_td_ta:
            train_data.task_assignment_id = task_assignment.asset_id
            train_data.set_encryption_key(task_assignment.worker.enc_key)
            train_data.save()

            task_assignment.train_data_id = train_data.asset_id
            task_assignment.state = TaskAssignment.State.TRAINING
            task_assignment.save()

    task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
    task_declaration.save()
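# NOTE: _chunk_it is used above but not defined in this section. The function
# below is a minimal sketch under the assumption that it splits `iterable` into
# exactly `count` chunks whose sizes differ by at most one element, which is
# what the asserts above require; the actual distribution strategy may differ.
@staticmethod
def _chunk_it_sketch(iterable, count):
    chunks = [[] for _ in range(count)]
    for index, item in enumerate(iterable):
        # round-robin assignment keeps the chunk sizes balanced
        chunks[index % count].append(item)
    return chunks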
def main():
    parser = argparse.ArgumentParser(description='Produce Task')

    parser.add_argument('-c', '--command', required=True, metavar='KEY',
                        help='add|stop|cancel|issue|deposit|monitor')
    parser.add_argument('-k', '--key', default='producer', metavar='KEY', help='RSA key name')
    parser.add_argument('-n', '--name', default='mnist_mlp', metavar='NAME', help='model name')
    parser.add_argument('-p', '--path', default='examples/torch/mnist/cnn.py', metavar='PATH', help='model path')
    parser.add_argument('-train', '--dataset_train', default='QmP9KUr8Y6HxNoBNM8zakxC65diYWHG2VBRhPHYnT5uWZT',
                        metavar='dataset_train', help='train dataset dir')
    parser.add_argument('-test', '--dataset_test', default='QmRL93gvYRypqWs1wpzR8S6kvoGPxeP12v8RbTAJWDsQaK',
                        metavar='dataset_test', help='test dataset dir')
    parser.add_argument('-w', '--workers', default=1, type=int, metavar='WORKERS', help='workers count')
    parser.add_argument('-v', '--verifiers', default=1, type=int, metavar='VERIFIERS', help='verifiers count')
    parser.add_argument('-b', '--batch', default=128, type=int, metavar='BATCH_SIZE', help='batch size')
    parser.add_argument('-e', '--epochs', default=3, type=int, metavar='EPOCHS', help='epochs')
    parser.add_argument('-ei', '--epochs_in_iteration', default=1, type=int,
                        metavar='EPOCHS_IN_ITERATION', help='epochs in iteration')
    parser.add_argument('-l', '--local', default=0, type=int, metavar='LOCAL', help='train model locally')
    parser.add_argument('-t', '--task', default=None, type=str, metavar='TASK_ID',
                        help='task declaration asset id')
    parser.add_argument('-eth', '--eth', default=None, type=float, metavar='ETH',
                        help='ETH for deposit or issue')

    args = parser.parse_args()

    if args.command == 'add':
        if args.local:
            train_local(
                train_dir=args.dataset_train,
                test_dir=args.dataset_test,
                model_path=args.path,
                batch_size=args.batch,
                epochs=args.epochs
            )
        else:
            train_remote(
                train_ipfs=args.dataset_train,
                test_ipfs=args.dataset_test,
                args=args
            )
        return

    producer = load_producer()

    if not args.task:
        print('task is not specified, arg: -t')
        return

    if args.command == 'cancel':
        td = TaskDeclaration.get(args.task, db=producer.db, encryption=producer.encryption)
        td.state = TaskDeclaration.State.FAILED
        td.save()
        print('Canceled {}'.format(td))
        return

    if args.command == 'stop':
        td = TaskDeclaration.get(args.task, db=producer.db, encryption=producer.encryption)
        td.state = TaskDeclaration.State.COMPLETED
        td.save()
        print('Stopped {}'.format(td))
        return

    if args.command == 'monitor':
        monitor_task(args.task, producer)
        return

    if args.command == 'issue':
        if not args.eth:
            print('balance is not specified, arg: --eth')
            return

        encrypted_key, password = load_wallet_credentials(account_address_var_name='PRODUCER_ACCOUNT_ADDRESS')
        NodeContractInfo.configure(encrypted_key, password)
        task_declaration = TaskDeclaration.get(args.task, db=producer.db, encryption=producer.encryption)
        poa_wrapper.issue_job(task_declaration, args.eth)
        return

    if args.command == 'deposit':
        if not args.eth:
            print('balance is not specified, arg: --eth')
            return

        encrypted_key, password = load_wallet_credentials(account_address_var_name='PRODUCER_ACCOUNT_ADDRESS')
        NodeContractInfo.configure(encrypted_key, password)
        task_declaration = TaskDeclaration.get(args.task, db=producer.db, encryption=producer.encryption)
        poa_wrapper.deposit(task_declaration, args.eth)
        return
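# Example invocations of this CLI, based on the arguments defined above. The
# script name is a placeholder and the task ids must be real asset ids:
#
#   python producer.py -c add -w 2 -v 1 -e 3
#   python producer.py -c monitor -t <task_declaration_asset_id>
#   python producer.py -c issue -t <task_declaration_asset_id> -eth 0.5
#   python producer.py -c deposit -t <task_declaration_asset_id> -eth 0.1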
def _assign_estimate_data(self, task_declaration: TaskDeclaration):
    estimation_assignments = task_declaration.get_estimation_assignments(
        states=(EstimationAssignment.State.ACCEPTED, EstimationAssignment.State.TIMEOUT)
    )

    # split into accepted and overdue
    accepted_estimation_assignments = []
    timeout_estimation_assignments = []
    for ea in estimation_assignments:
        if ea.state == EstimationAssignment.State.ACCEPTED:
            accepted_estimation_assignments.append(ea)
        elif ea.state == EstimationAssignment.State.TIMEOUT:
            timeout_estimation_assignments.append(ea)
        else:
            assert False, 'Check query!'

    if len(timeout_estimation_assignments):
        # this is a reassignment
        assert len(timeout_estimation_assignments) == len(accepted_estimation_assignments)
        for index, ea in enumerate(accepted_estimation_assignments):
            timeout_ea = timeout_estimation_assignments[index]
            # retrieve the estimation data, which the producer is able to decrypt
            estimation_data = EstimationData.get_with_initial_data(
                asset_id=timeout_ea.estimation_data_id,
                db=self.db,
                encryption=self.encryption
            )
            estimation_data.estimation_assignment_id = ea.asset_id
            # share the data with the new estimator
            estimation_data.set_encryption_key(ea.estimator.enc_key)
            estimation_data.save()

            ea.estimation_data_id = estimation_data.asset_id
            ea.state = EstimationAssignment.State.ESTIMATING
            ea.save()

            timeout_ea.state = EstimationAssignment.State.FORGOTTEN
            timeout_ea.save()
    else:
        estimation_data_params = Estimator.get_data_for_estimate(task_declaration)
        for ea in accepted_estimation_assignments:
            # create the initial state with encrypted data which the producer will be able to decrypt
            estimation_data = EstimationData.create(
                db=self.db,
                encryption=self.encryption,
                **estimation_data_params
            )

            # share the data with the estimator
            estimation_data.estimation_assignment_id = ea.asset_id
            estimation_data.set_encryption_key(ea.estimator.enc_key)
            estimation_data.save()

            ea.estimation_data_id = estimation_data.asset_id
            ea.state = EstimationAssignment.State.ESTIMATING
            ea.save()

    task_declaration.state = TaskDeclaration.State.ESTIMATE_IS_IN_PROGRESS
    task_declaration.save()
def _process_verify_in_progress(self, task_declaration: TaskDeclaration):
    assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

    verification_assignments = task_declaration.get_verification_assignments(
        states=(
            VerificationAssignment.State.VERIFYING,
            VerificationAssignment.State.FINISHED
        )
    )

    failed = False
    finished_verification_assignments = []
    count_timeout = 0
    with async_commit():
        for va in verification_assignments:
            if va.state == VerificationAssignment.State.VERIFYING:
                if va.iteration_is_finished:
                    va.state = VerificationAssignment.State.FINISHED
                    va.save()

            if va.state == VerificationAssignment.State.FINISHED:
                if va.verification_result.error:
                    failed = True
                else:
                    finished_verification_assignments.append(va)
                continue

            # still verifying: check for timeout
            verify_timeout = settings.WAIT_VERIFY_TIMEOUT
            now = datetime.datetime.utcnow().replace(tzinfo=va.verification_result.modified_at.tzinfo)
            if (now - va.verification_result.modified_at).total_seconds() > verify_timeout:
                va.state = VerificationAssignment.State.TIMEOUT
                va.save()
                logger.info('Timeout of waiting for {}'.format(va))
                count_timeout += 1

    if count_timeout:
        task_declaration.verifiers_needed += count_timeout
        self._republish_for_verify(task_declaration)
        return

    if failed:
        logger.info('{} is failed'.format(task_declaration))
        task_declaration.state = TaskDeclaration.State.FAILED
        task_declaration.save()
        return

    if len(finished_verification_assignments) < task_declaration.verifiers_requested:
        # verification is not finished yet
        logger.info('Waiting for verification to finish for {} iteration {}'.format(
            task_declaration, task_declaration.current_iteration))
        return

    fake_workers = self._parse_verification_results(
        task_declaration, finished_verification_assignments)

    if fake_workers:
        logger.info('Fake workers detected')
        fake_worker_ids = []
        for worker_id, count_detections in fake_workers.items():
            logger.info('Fake worker_id: {}, count detections: {}'.format(worker_id, count_detections))
            fake_worker_ids.append(worker_id)

        self._reject_fake_workers(task_declaration, fake_worker_ids)
        self._republish_for_train(task_declaration)
        return

    if not task_declaration.last_iteration:
        self._update_train_data_for_next_iteration(task_declaration)
        return

    task_declaration.progress = 100.0
    task_declaration.state = TaskDeclaration.State.COMPLETED
    task_declaration.save()

    logger.info('{} is finished tflops: {} estimated: {}'.format(
        task_declaration, task_declaration.tflops, task_declaration.estimated_tflops))
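# NOTE: _reject_fake_workers is called above but not defined in this section.
# A hedged sketch of the expected behaviour, assuming fake workers are moved to
# the FAKE_RESULTS state that _reassign_train_data consumes; the real
# implementation may also penalize the offending workers.
def _reject_fake_workers_sketch(self, task_declaration: TaskDeclaration, fake_worker_ids):
    for ta in task_declaration.get_task_assignments(states=(TaskAssignment.State.FINISHED,)):
        if ta.worker_id in fake_worker_ids:
            ta.state = TaskAssignment.State.FAKE_RESULTS
            ta.save()
            # a replacement worker is now needed; _republish_for_train asserts
            # workers_needed > 0 before republishing
            task_declaration.workers_needed += 1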