import csv
import logging
import os
import shutil
import tempfile
from io import StringIO

import numpy as np

# NOTE: models, fields, cached_property, IPFS, FileDownloader and the
# assignment models (TaskAssignment, EstimationAssignment,
# VerificationAssignment) are provided by other modules of this package;
# their exact import paths are not shown here.

logger = logging.getLogger(__name__)


class TrainModel(models.Model):
    name = fields.EncryptedCharField(immutable=True)
    code_ipfs = fields.EncryptedCharField(immutable=True)

    @classmethod
    def upload_and_create(cls, code_path, **kwargs):
        code_ipfs = IPFS().add_file(code_path).multihash
        return cls.create(code_ipfs=code_ipfs, **kwargs)
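# Usage sketch (hypothetical): a producer uploads its training code and
# registers the model in one step. The path and name below are made-up
# illustrations; only the resulting IPFS multihash is stored on the asset.
#
#   train_model = TrainModel.upload_and_create(
#       code_path='/path/to/train_code.py',
#       name='mnist-cnn',
#   )
#   print(train_model.code_ipfs)  # multihash of the uploaded code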
class TrainData(models.Model):
    # owned by the producer only; shared with workers
    data_index = fields.IntegerField(immutable=True)
    # these fields may be encrypted differently for each worker
    model_code_ipfs = fields.EncryptedCharField()
    train_chunks_ipfs = fields.EncryptedJsonField()
    # data for evaluation
    test_chunks_ipfs = fields.EncryptedJsonField()
    task_assignment_id = fields.CharField(null=True, initial=None)

    @cached_property
    def task_assignment(self):
        return TaskAssignment.get(
            asset_id=self.task_assignment_id, db=self.db, encryption=self.encryption)

    @cached_property
    def current_iteration(self):
        return self.task_assignment.task_declaration.current_iteration

    @cached_property
    def weights_ipfs(self):
        return self.task_assignment.task_declaration.weights_ipfs

    @cached_property
    def epochs(self):
        return self.task_assignment.task_declaration.epochs_in_current_iteration

    @cached_property
    def batch_size(self):
        return self.task_assignment.task_declaration.batch_size
class EstimationData(models.Model):
    # this data may be encrypted differently for each estimator
    chunk_ipfs = fields.EncryptedCharField()
    model_code_ipfs = fields.EncryptedCharField()
    estimation_assignment_id = fields.CharField(null=True, initial=None)

    @cached_property
    def estimation_assignment(self):
        return EstimationAssignment.get(
            self.estimation_assignment_id, db=self.db, encryption=self.encryption)

    @cached_property
    def weights_ipfs(self):
        return self.estimation_assignment.task_declaration.weights_ipfs

    @cached_property
    def batch_size(self):
        return self.estimation_assignment.task_declaration.batch_size
class VerificationData(models.Model):
    # owned by the producer only; shared with the verifier
    test_dir_ipfs = fields.EncryptedCharField(immutable=True)
    model_code_ipfs = fields.EncryptedCharField(immutable=True)
    verification_assignment_id = fields.CharField()
    train_results = fields.EncryptedJsonField()

    @cached_property
    def verification_assignment(self):
        return VerificationAssignment.get(
            asset_id=self.verification_assignment_id, db=self.db, encryption=self.encryption)

    @cached_property
    def current_iteration(self):
        return self.verification_assignment.task_declaration.current_iteration

    @cached_property
    def current_iteration_retry(self):
        return self.verification_assignment.task_declaration.current_iteration_retry
class EstimationResult(models.Model):
    # owned by the estimator only; shared with the producer
    class State:
        INITIAL = 'initial'
        IN_PROGRESS = 'in progress'
        FINISHED = 'finished'

    estimation_assignment_id = fields.CharField(immutable=True)
    state = fields.CharField(initial=State.INITIAL)
    tflops = fields.FloatField(initial=0.0)
    progress = fields.FloatField(initial=0.0)
    error = fields.EncryptedCharField(null=True, initial=None)
class VerificationResult(models.Model):
    # owned by the verifier only; shared with the producer
    class State:
        INITIAL = 'initial'
        IN_PROGRESS = 'in progress'
        VERIFICATION_FINISHED = 'verification is finished'
        FINISHED = 'finished'

    verification_assignment_id = fields.CharField(immutable=True)
    state = fields.CharField(initial=State.INITIAL)
    progress = fields.FloatField(initial=0.0)
    tflops = fields.FloatField(initial=0.0)
    current_iteration = fields.IntegerField(initial=0)
    current_iteration_retry = fields.IntegerField(initial=0)

    # results should be public
    result = fields.JsonField(required=False)
    weights_ipfs = fields.CharField(required=False)
    loss = fields.FloatField(required=False)
    accuracy = fields.FloatField(required=False)
    error = fields.EncryptedCharField(required=False)

    def clean(self):
        self.progress = 0.0
        self.tflops = 0.0
        self.result = None
        # remove the summarized weights of the previous iteration from IPFS storage
        if self.weights_ipfs is not None:
            IPFS().remove_from_storage(self.weights_ipfs)
            self.weights_ipfs = None
        self.loss = 0.0
        self.accuracy = 0.0

    @cached_property
    def verification_assignment(self):
        return VerificationAssignment.get(
            self.verification_assignment_id, db=self.db, encryption=self.encryption)
class TrainResult(models.Model):
    # owned by the worker only; shared with the producer
    class State:
        INITIAL = 'initial'
        IN_PROGRESS = 'in progress'
        FINISHED = 'finished'

    task_assignment_id = fields.CharField(immutable=True)
    state = fields.CharField(initial=State.INITIAL)
    progress = fields.FloatField(initial=0.0)
    tflops = fields.FloatField(initial=0.0)
    current_iteration = fields.IntegerField(initial=0)
    weights_ipfs = fields.CharField(required=False)
    error = fields.EncryptedCharField(required=False)
    loss = fields.FloatField(required=False)
    accuracy = fields.FloatField(required=False)
    train_history = fields.JsonField(required=False)

    @cached_property
    def task_assignment(self):
        return TaskAssignment.get(
            self.task_assignment_id, db=self.db, encryption=self.encryption)

    def clean(self):
        self.progress = 0.0
        self.tflops = 0.0
        # remove the weights of the previous iteration from IPFS storage
        if self.weights_ipfs is not None:
            IPFS().remove_from_storage(self.weights_ipfs)
            self.weights_ipfs = None
        self.error = None
        self.loss = 0.0
        self.accuracy = 0.0
        self.train_history = None
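# Lifecycle sketch (hypothetical): a worker is expected to move its TrainResult
# through State.INITIAL -> State.IN_PROGRESS -> State.FINISHED, calling clean()
# to reset per-iteration fields when a new iteration starts. The save() call
# below assumes the usual persistence convention of this models.Model base.
#
#   train_result.clean()
#   train_result.state = TrainResult.State.IN_PROGRESS
#   train_result.save()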
class Dataset(models.Model):
    name = fields.EncryptedCharField(immutable=True)
    train_dir_ipfs = fields.EncryptedCharField(immutable=True)
    test_dir_ipfs = fields.EncryptedCharField(immutable=True)

    @classmethod
    def upload_and_create(cls, train_dir, test_dir, **kwargs):
        logger.info('Creating dataset')
        ipfs = IPFS()
        kwargs['test_dir_ipfs'] = ipfs.add_dir(test_dir).multihash
        kwargs['train_dir_ipfs'] = ipfs.add_dir(train_dir).multihash
        return cls.create(**kwargs)

    @staticmethod
    def _download_to_dir(csv_text, target_dir, train_part=True):
        # every CSV row is a pair of URLs: the x file and the y file
        urls = []
        for row in csv.reader(StringIO(csv_text), delimiter=',', quotechar='"'):
            urls.append({'x_url': row[0], 'y_url': row[1]})

        if train_part:
            name_format = '_train_{{:0{}d}}'.format(len(str(len(urls))) + 1)
        else:
            name_format = '_test_{{:0{}d}}'.format(len(str(len(urls))) + 1)

        download_list = []
        for index, u in enumerate(urls):
            download_list += [
                FileDownloader.Params(
                    url=u['x_url'],
                    target_path=os.path.join(target_dir, 'x' + name_format.format(index))),
                FileDownloader.Params(
                    url=u['y_url'],
                    target_path=os.path.join(target_dir, 'y' + name_format.format(index)))
            ]

        FileDownloader.download_all(download_list)

    @staticmethod
    def parse_csv_and_upload_to_ipfs(csv_text, train_part):
        target_dir = tempfile.mkdtemp()
        try:
            Dataset._download_to_dir(csv_text, target_dir, train_part)
            ipfs = IPFS()
            return ipfs.add_dir(target_dir).multihash
        finally:
            shutil.rmtree(target_dir)

    @classmethod
    def create_from_csv(cls, train_csv_text, test_csv_text, **kwargs):
        kwargs['train_dir_ipfs'] = Dataset.parse_csv_and_upload_to_ipfs(
            train_csv_text, train_part=True)
        logger.info('Train part is uploaded: {}'.format(kwargs['train_dir_ipfs']))

        kwargs['test_dir_ipfs'] = Dataset.parse_csv_and_upload_to_ipfs(
            test_csv_text, train_part=False)
        logger.info('Test part is uploaded: {}'.format(kwargs['test_dir_ipfs']))

        return cls.create(**kwargs)

    @classmethod
    def _split_files(cls, x_path, y_path, minibatch_size, target_dir):
        x_train = np.load(x_path)
        y_train = np.load(y_path)

        batches = int(len(x_train) / minibatch_size)
        logger.info('Split dataset to {} batches'.format(batches))
        name_format = '{{:0{}d}}'.format(len(str(batches)) + 1)
        for batch_idx in range(batches):
            start_idx = batch_idx * minibatch_size
            end_idx = start_idx + minibatch_size
            x_batch = x_train[start_idx:end_idx]
            y_batch = y_train[start_idx:end_idx]

            chunk_dir = os.path.join(target_dir, 'chunk_' + name_format.format(batch_idx))
            os.mkdir(chunk_dir)

            # save this chunk's x and y arrays
            np.save(os.path.join(chunk_dir, 'x'), x_batch)
            np.save(os.path.join(chunk_dir, 'y'), y_batch)

    @classmethod
    def download_and_create(cls, x_train_url, y_train_url, x_test_url, y_test_url,
                            minibatch_size, **kwargs):
        logger.info('Creating dataset')
        train_download_target_dir = tempfile.mkdtemp()
        test_target_dir = tempfile.mkdtemp()
        train_dir = tempfile.mkdtemp()
        test_dir = tempfile.mkdtemp()
        try:
            x_train_path = os.path.join(train_download_target_dir, 'x_train')
            y_train_path = os.path.join(train_download_target_dir, 'y_train')
            x_test_path = os.path.join(test_target_dir, 'x_test')
            y_test_path = os.path.join(test_target_dir, 'y_test')

            download_list = [
                FileDownloader.Params(url=x_train_url, target_path=x_train_path),
                FileDownloader.Params(url=y_train_url, target_path=y_train_path),
                FileDownloader.Params(url=x_test_url, target_path=x_test_path),
                FileDownloader.Params(url=y_test_url, target_path=y_test_path),
            ]

            FileDownloader.download_all(download_list)
            ipfs = IPFS()

            cls._split_files(x_path=x_test_path, y_path=y_test_path,
                             minibatch_size=minibatch_size, target_dir=test_dir)
            kwargs['test_dir_ipfs'] = ipfs.add_dir(test_dir, recursive=True).multihash
            logger.info('Test part is uploaded: {}'.format(kwargs['test_dir_ipfs']))

            cls._split_files(x_path=x_train_path, y_path=y_train_path,
                             minibatch_size=minibatch_size, target_dir=train_dir)
            kwargs['train_dir_ipfs'] = ipfs.add_dir(train_dir, recursive=True).multihash
            logger.info('Train part is uploaded: {}'.format(kwargs['train_dir_ipfs']))

            return cls.create(**kwargs)
        finally:
            shutil.rmtree(test_target_dir)
            shutil.rmtree(train_download_target_dir)
            shutil.rmtree(train_dir)
            shutil.rmtree(test_dir)
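# Usage sketch (hypothetical): the URLs below are placeholders and must each
# point to a .npy array; the x and y arrays are expected to have equal length,
# since _split_files slices them into chunk directories of minibatch_size
# samples before the upload to IPFS.
#
#   dataset = Dataset.download_and_create(
#       x_train_url='https://example.com/x_train.npy',
#       y_train_url='https://example.com/y_train.npy',
#       x_test_url='https://example.com/x_test.npy',
#       y_test_url='https://example.com/y_test.npy',
#       minibatch_size=1000,
#       name='mnist',
#   )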