def create(cls, data, metadata, recipients, db):
    """Create a new asset in BigchainDB.

    Prepares and fulfills a CREATE transaction signed with the node's
    keypair, then sends it either asynchronously (collected by the
    active ``async_commit`` context) or with a blocking commit.

    Args:
        data: asset payload; must contain an ``'asset_name'`` key (used in logging).
        metadata: mutable metadata attached to the transaction.
        recipients: public key(s) that receive the created output.
        db: DB wrapper exposing ``bdb`` (BigchainDB driver) and ``kp`` (keypair).

    Returns:
        Tuple ``(asset, created)`` — ``created`` is False when an asset
        with the same transaction id already exists on chain (in which
        case the existing asset is updated instead).
    """
    prepared_create_tx = db.bdb.transactions.prepare(
        operation='CREATE',
        signers=db.kp.public_key,
        asset={'data': data},
        recipients=recipients,
        metadata=metadata)

    fulfilled_create_tx = db.bdb.transactions.fulfill(
        transaction=prepared_create_tx,
        private_keys=db.kp.private_key)

    logger.debug('Fulfill CREATE tx {} for asset {}'.format(
        fulfilled_create_tx['id'], data['asset_name']))

    asset_id = fulfilled_create_tx['id']

    # check is asset already created: a CREATE tx id is a hash of its
    # contents, so an identical asset yields the same id.
    logger.debug("Check is asset already created: {}".format(asset_id))
    txs = db.bdb.transactions.get(asset_id=asset_id)
    if txs:
        logger.debug("Asset already exists: {}".format(asset_id))
        asset = cls(asset_id=asset_id, transactions=txs, db=db)
        asset._update_if_were_changes(metadata, recipients)
        return asset, False

    # local import avoids a circular dependency with tatau_core.db.db
    from tatau_core.db.db import async_commit
    ac = async_commit()
    # BUG FIX: 'ac. async' was a SyntaxError — 'async' is a reserved
    # keyword since Python 3.7, so the attribute must be read via getattr.
    if getattr(ac, 'async'):
        db.bdb.transactions.send_async(fulfilled_create_tx)
        ac.add_tx_id(fulfilled_create_tx['id'])
    else:
        db.bdb.transactions.send_commit(fulfilled_create_tx)

    return cls(asset_id=fulfilled_create_tx['id'], transactions=[fulfilled_create_tx], db=db), True
def _process_epoch_in_progress(self, task_declaration: TaskDeclaration):
    """Advance a task whose training epoch is running.

    Promotes TRAINING assignments whose iteration completed, times out
    stale ones, and then either fails the task, republishes it for the
    timed-out workers, keeps waiting, or hands the finished results over
    to verification.
    """
    assert task_declaration.state == TaskDeclaration.State.EPOCH_IN_PROGRESS

    assignments = task_declaration.get_task_assignments(
        states=(
            TaskAssignment.State.TRAINING,
            TaskAssignment.State.FINISHED,
        )
    )

    has_error = False
    completed = []
    timed_out = 0

    with async_commit():
        for assignment in assignments:
            # Promote a training assignment whose iteration just finished.
            if assignment.state == TaskAssignment.State.TRAINING \
                    and assignment.iteration_is_finished:
                assignment.state = TaskAssignment.State.FINISHED
                assignment.save()

            if assignment.state == TaskAssignment.State.FINISHED:
                if assignment.train_result.error:
                    has_error = True
                else:
                    completed.append(assignment)
                continue

            # Still training: check whether the worker went silent too long.
            now = datetime.datetime.utcnow().replace(
                tzinfo=assignment.train_result.modified_at.tzinfo)
            idle_seconds = (now - assignment.train_result.modified_at).total_seconds()
            if idle_seconds > settings.WAIT_TRAIN_TIMEOUT:
                assignment.state = TaskAssignment.State.TIMEOUT
                assignment.save()
                logger.info('Timeout of waiting for {}'.format(assignment))
                timed_out += 1

    if has_error:
        logger.info('{} is failed'.format(task_declaration))
        task_declaration.state = TaskDeclaration.State.FAILED
        task_declaration.save()
        return

    if timed_out:
        # Replace the timed-out workers before continuing.
        task_declaration.workers_needed += timed_out
        self._republish_for_train(task_declaration)
        return

    if len(completed) < task_declaration.workers_requested:
        logger.info('Wait for finish of training for {} iteration {}'.format(
            task_declaration, task_declaration.current_iteration))
        return

    # Iteration 1 has no previous metrics to record.
    if task_declaration.current_iteration > 1:
        self._save_loss_and_accuracy(task_declaration, completed)

    self._assign_verification_data(task_declaration, completed)
def _process_deployment(self, task_declaration: TaskDeclaration):
    """Accept or reject READY workers/verifiers for a task in DEPLOYMENT.

    Decrements the corresponding ``*_needed`` counters for accepted
    performers and starts initial training once both counters reach
    zero.  If nothing changed this pass, re-counts accepted performers
    as a safety net and starts training when all requested slots are
    already filled.
    """
    assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT

    changed = False
    with async_commit():
        for assignment in task_declaration.get_task_assignments(
                states=(TaskAssignment.State.READY,)):
            if self._is_task_assignment_allowed(task_declaration, assignment):
                assignment.state = TaskAssignment.State.ACCEPTED
                task_declaration.workers_needed -= 1
                changed = True
            else:
                assignment.state = TaskAssignment.State.REJECTED
            assignment.save()

        for verification in task_declaration.get_verification_assignments(
                states=(VerificationAssignment.State.READY,)):
            if self._is_verification_assignment_allowed(task_declaration, verification):
                verification.state = VerificationAssignment.State.ACCEPTED
                task_declaration.verifiers_needed -= 1
                changed = True
            else:
                verification.state = VerificationAssignment.State.REJECTED
            verification.save()

        # save if were changes
        if changed:
            task_declaration.save()

    ready_to_start = task_declaration.workers_needed == 0 \
        and task_declaration.verifiers_needed == 0
    logger.info('{} ready: {} workers_needed: {} verifiers_needed: {}'.format(
        task_declaration, ready_to_start,
        task_declaration.workers_needed, task_declaration.verifiers_needed))

    if ready_to_start:
        self._assign_initial_train_data(task_declaration)
        return

    if not changed:
        # recheck how many workers and verifiers really accepted
        accepted_workers_count = len(task_declaration.get_task_assignments(
            states=(TaskAssignment.State.ACCEPTED,)))
        accepted_verifiers_count = len(task_declaration.get_verification_assignments(
            states=(VerificationAssignment.State.ACCEPTED,)))
        if accepted_workers_count == task_declaration.workers_requested \
                and accepted_verifiers_count == task_declaration.verifiers_requested:
            logger.info('All performers are accepted, start train')
            task_declaration.workers_needed = 0
            task_declaration.verifiers_needed = 0
            self._assign_initial_train_data(task_declaration)
def _process_estimate_is_in_progress(self, task_declaration: TaskDeclaration):
    """Advance a task whose cost estimation is running.

    Promotes ESTIMATING assignments whose result is ready, times out
    stale ones (republishing for replacements), and when all requested
    estimators have finished, aggregates the estimate and transitions
    the declaration to ESTIMATED (or FAILED).
    """
    assert task_declaration.state == TaskDeclaration.State.ESTIMATE_IS_IN_PROGRESS

    estimation_assignments = task_declaration.get_estimation_assignments(
        states=(
            EstimationAssignment.State.ESTIMATING,
            EstimationAssignment.State.FINISHED
        )
    )

    finished_assignments = []
    count_timeout = 0
    with async_commit():
        for ea in estimation_assignments:
            if ea.state == EstimationAssignment.State.ESTIMATING:
                if ea.estimation_result.state == EstimationResult.State.FINISHED:
                    ea.state = EstimationAssignment.State.FINISHED
                    ea.save()
                else:
                    # Still estimating: check for a stale result.
                    estimate_timeout = settings.WAIT_ESTIMATE_TIMEOUT
                    now = datetime.datetime.utcnow().replace(
                        tzinfo=ea.estimation_result.modified_at.tzinfo)
                    if (now - ea.estimation_result.modified_at).total_seconds() > estimate_timeout:
                        ea.state = EstimationAssignment.State.TIMEOUT
                        ea.save()
                        logger.info('Timeout of waiting for {}'.format(ea))
                        count_timeout += 1

            if ea.state == EstimationAssignment.State.FINISHED:
                finished_assignments.append(ea)

    if count_timeout:
        # Replace the timed-out estimators before continuing.
        task_declaration.estimators_needed += count_timeout
        self._republish_for_estimation(task_declaration)
        return

    if len(finished_assignments) == task_declaration.estimators_requested:
        task_declaration.state = TaskDeclaration.State.ESTIMATED
        task_declaration.estimated_tflops, failed = Estimator.estimate(
            task_declaration, finished_assignments)
        if failed:
            logger.info('{} is failed'.format(task_declaration))
            task_declaration.state = TaskDeclaration.State.FAILED
        # BUG FIX: previously only the FAILED branch called save() and
        # returned; on success the ESTIMATED state was never persisted
        # and the "Wait of finish" message was logged anyway.
        task_declaration.save()
        return

    logger.info('Wait of finish for estimation {}, finished: {}, requested: {}'.format(
        task_declaration, len(finished_assignments), task_declaration.estimators_requested
    ))
def save(self, metadata, recipients):
    """Persist asset changes as a TRANSFER transaction.

    Spends output 0 of the asset's last transaction, transferring it to
    ``recipients`` (or back to our own public key) with the new
    ``metadata`` attached, then appends the fulfilled transaction to the
    local transaction list.

    Blocks (polling once per second) until the previous transaction is
    included in a block, since a TRANSFER cannot spend an uncommitted
    output.
    """
    previous_tx = self.last_tx

    # we cant create tx if previous tx was not committed
    while not self.db.bdb.blocks.get(txid=previous_tx['id']):
        logger.debug('Previous tx is not committed, waiting...')
        time.sleep(1)

    output_index = 0
    output = previous_tx['outputs'][output_index]
    transfer_input = {
        'fulfillment': output['condition']['details'],
        'fulfills': {
            'output_index': output_index,
            'transaction_id': previous_tx['id'],
        },
        'owners_before': output['public_keys'],
    }

    prepared_transfer_tx = self.db.bdb.transactions.prepare(
        operation='TRANSFER',
        asset={'id': self.asset_id},
        inputs=transfer_input,
        recipients=recipients or self.db.kp.public_key,
        metadata=metadata,
    )
    fulfilled_transfer_tx = self.db.bdb.transactions.fulfill(
        prepared_transfer_tx,
        private_keys=self.db.kp.private_key,
    )
    logger.debug('Fulfill TRANSFER tx {} for asset {}'.format(
        fulfilled_transfer_tx['id'], self.data['asset_name']))

    # local import avoids a circular dependency with tatau_core.db.db
    from tatau_core.db.db import async_commit
    ac = async_commit()
    # BUG FIX: 'ac. async' was a SyntaxError — 'async' is a reserved
    # keyword since Python 3.7, so the attribute must be read via getattr.
    if getattr(ac, 'async'):
        self.db.bdb.transactions.send_async(fulfilled_transfer_tx)
        ac.add_tx_id(fulfilled_transfer_tx['id'])
    else:
        self.db.bdb.transactions.send_commit(fulfilled_transfer_tx)

    self._transactions.append(fulfilled_transfer_tx)
def _process_estimate_is_required(self, task_declaration: TaskDeclaration):
    """Accept or reject READY estimators for a task awaiting estimation.

    Each accepted estimator decrements ``estimators_needed``; when the
    counter reaches zero the estimation data is assigned (which persists
    any remaining changes).
    """
    assert task_declaration.state == TaskDeclaration.State.ESTIMATE_IS_REQUIRED

    with async_commit():
        changed = False
        ready_estimators = task_declaration.get_estimation_assignments(
            states=(EstimationAssignment.State.READY,))
        for estimation in ready_estimators:
            if self._is_estimation_assignment_allowed(task_declaration, estimation):
                estimation.state = EstimationAssignment.State.ACCEPTED
                task_declaration.estimators_needed -= 1
                changed = True
            else:
                estimation.state = EstimationAssignment.State.REJECTED
            estimation.save()

        # save changes
        if changed:
            task_declaration.save()

        if task_declaration.estimators_needed == 0:
            # in assign changes will be saved
            self._assign_estimate_data(task_declaration)
def _process_verify_in_progress(self, task_declaration: TaskDeclaration):
    """Advance a task whose verification round is running.

    Promotes VERIFYING assignments whose iteration finished, times out
    stale ones, and then (in order): republishes for timed-out
    verifiers, fails the task on a verification error, waits for the
    remaining verifiers, handles detected fake workers, starts the next
    iteration, or completes the task.
    """
    assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

    verification_assignments = task_declaration.get_verification_assignments(
        states=(
            VerificationAssignment.State.VERIFYING,
            VerificationAssignment.State.FINISHED
        )
    )

    failed = False
    finished_verification_assignments = []
    count_timeout = 0
    with async_commit():
        for va in verification_assignments:
            # Promote a verifying assignment whose iteration just finished.
            if va.state == VerificationAssignment.State.VERIFYING:
                if va.iteration_is_finished:
                    va.state = VerificationAssignment.State.FINISHED
                    va.save()

            # Collect results (including assignments promoted above).
            if va.state == VerificationAssignment.State.FINISHED:
                if va.verification_result.error:
                    failed = True
                else:
                    finished_verification_assignments.append(va)
                continue

            # Still verifying: time out if the result went stale.
            verify_timeout = settings.WAIT_VERIFY_TIMEOUT
            now = datetime.datetime.utcnow().replace(tzinfo=va.verification_result.modified_at.tzinfo)
            if (now - va.verification_result.modified_at).total_seconds() > verify_timeout:
                va.state = VerificationAssignment.State.TIMEOUT
                va.save()
                logger.info('Timeout of waiting for {}'.format(va))
                count_timeout += 1

    # NOTE: unlike the training path, timeouts are handled BEFORE errors here.
    if count_timeout:
        task_declaration.verifiers_needed += count_timeout
        self._republish_for_verify(task_declaration)
        return

    if failed:
        logger.info('{} is failed'.format(task_declaration))
        task_declaration.state = TaskDeclaration.State.FAILED
        task_declaration.save()
        return

    if len(finished_verification_assignments) < task_declaration.verifiers_requested:
        # verification is not ready
        logger.info('Wait for finish of verification for {} iteration {}'.format(
            task_declaration, task_declaration.current_iteration))
        return

    fake_workers = self._parse_verification_results(
        task_declaration, finished_verification_assignments)
    if fake_workers:
        # Kick out the detected fakes and republish their slots for training.
        logger.info('Fake workers detected')
        fake_worker_ids = []
        for worker_id, count_detections in fake_workers.items():
            logger.info('Fake worker_id: {}, count detections: {}'.format(worker_id, count_detections))
            fake_worker_ids.append(worker_id)
        self._reject_fake_workers(task_declaration, fake_worker_ids)
        self._republish_for_train(task_declaration)
        return

    if not task_declaration.last_iteration:
        # More iterations to go: distribute the next round of train data.
        self._update_train_data_for_next_iteration(task_declaration)
        return

    # Final iteration verified cleanly: the task is done.
    task_declaration.progress = 100.0
    task_declaration.state = TaskDeclaration.State.COMPLETED
    task_declaration.save()
    logger.info('{} is finished tflops: {} estimated: {}'.format(
        task_declaration, task_declaration.tflops, task_declaration.estimated_tflops))
def _assign_initial_train_data(self, task_declaration: TaskDeclaration):
    """Split the dataset across accepted workers and start the first epoch.

    Lists the train/test IPFS directories, chunks them evenly across the
    requested workers, creates one encrypted TrainData asset per worker,
    shares it, and transitions the declaration to EPOCH_IN_PROGRESS.
    """
    assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT

    # start of train
    task_declaration.current_iteration += 1
    task_declaration.current_iteration_retry = 0

    accepted_task_assignment = task_declaration.get_task_assignments(states=(TaskAssignment.State.ACCEPTED,))
    count_ta = 0

    # ls() returns (subdirectories, files); only the subdirectory hashes are used.
    train_dirs_ipfs, files = Directory(multihash=task_declaration.dataset.train_dir_ipfs).ls()
    test_dirs_ipfs, files = Directory(multihash=task_declaration.dataset.test_dir_ipfs).ls()

    # One chunk of train/test directories per requested worker.
    all_train_chunks_ipfs = self._chunk_it(
        iterable=[x.multihash for x in train_dirs_ipfs],
        count=task_declaration.workers_requested
    )
    assert len(all_train_chunks_ipfs) == task_declaration.workers_requested

    all_test_chunks_ipfs = self._chunk_it(
        iterable=[x.multihash for x in test_dirs_ipfs],
        count=task_declaration.workers_requested
    )
    assert len(all_test_chunks_ipfs) == task_declaration.workers_requested

    list_td_ta = []
    with async_commit():
        # create TrainData
        for index, task_assignment in enumerate(accepted_task_assignment):
            train_chunks_ipfs = all_train_chunks_ipfs[index]
            test_chunks_ipfs = all_test_chunks_ipfs[index]

            train_data = TrainData.create(
                model_code_ipfs=task_declaration.train_model.code_ipfs,
                train_chunks_ipfs=train_chunks_ipfs,
                test_chunks_ipfs=test_chunks_ipfs,
                data_index=index,
                db=self.db,
                encryption=self.encryption
            )
            # Pair each TrainData with its assignment; sharing happens in
            # the second commit batch below.
            list_td_ta.append((train_data, task_assignment))

            logger.debug('Created {}, train chunks: {}, count:{}, test chunks: {}, count:{}'.format(
                train_data, train_chunks_ipfs, len(train_chunks_ipfs),
                test_chunks_ipfs, len(test_chunks_ipfs)))
            count_ta += 1

    # Every requested worker must have received exactly one chunk pair.
    assert task_declaration.workers_requested == count_ta

    with async_commit():
        # share to worker
        for train_data, task_assignment in list_td_ta:
            train_data.task_assignment_id = task_assignment.asset_id
            # Re-encrypt the train data for the worker's public key.
            train_data.set_encryption_key(task_assignment.worker.enc_key)
            train_data.save()

            task_assignment.train_data_id = train_data.asset_id
            task_assignment.state = TaskAssignment.State.TRAINING
            task_assignment.save()

        task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
        task_declaration.save()