Example #1
0
    def _update_train_data_for_next_iteration(self, task_declaration: TaskDeclaration):
        """Move a verified task declaration on to its next training iteration.

        The declaration must currently be in ``VERIFY_IN_PROGRESS``. The
        iteration counter is incremented, the retry counter is reset and
        ``progress`` is recomputed. Every assignment in the ``FINISHED`` state
        then has its train data re-keyed with the worker's encryption key and
        saved, and the assignment itself is switched back to ``TRAINING``.
        Finally the declaration is saved in ``EPOCH_IN_PROGRESS``.

        Args:
            task_declaration: the declaration to advance; it is mutated and
                saved in place.

        Raises:
            AssertionError: if the declaration is not in
                ``VERIFY_IN_PROGRESS``, or if the number of assignments put
                back into training differs from ``workers_requested``.
        """
        assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS

        task_declaration.current_iteration += 1
        task_declaration.current_iteration_retry = 0

        # Epochs covered up to and including the new iteration, expressed
        # relative to the total epoch count and scaled by 100.
        epochs_reached = task_declaration.current_iteration * task_declaration.epochs_in_iteration
        task_declaration.progress = epochs_reached * 100 / task_declaration.epochs

        # Stays 0 when there is no finished assignment at all.
        resumed = 0
        for resumed, assignment in enumerate(
                task_declaration.get_task_assignments(states=(TaskAssignment.State.FINISHED,)),
                start=1):
            # share data to worker: re-key with the worker's key, then persist
            shared_data = assignment.train_data
            shared_data.set_encryption_key(assignment.worker.enc_key)
            shared_data.save()

            assignment.state = TaskAssignment.State.TRAINING
            assignment.save()

        # Every requested worker is expected to have resumed training.
        assert task_declaration.workers_requested == resumed

        task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
        task_declaration.save()
Example #2
0
    def _assign_initial_train_data(self, task_declaration: TaskDeclaration):
        """Create the first train data for every accepted worker and start training.

        The declaration must be in ``DEPLOYMENT``. The entries of the dataset's
        train and test directories are split with ``self._chunk_it`` into
        ``workers_requested`` chunks each. One ``TrainData`` is created per
        accepted assignment, using the chunk at the assignment's position.
        Each ``TrainData`` is then linked to its assignment and re-keyed with
        the worker's encryption key. The assignments are moved to ``TRAINING``
        and the declaration is saved in ``EPOCH_IN_PROGRESS``.

        The number of accepted assignments is checked before any ``TrainData``
        is created, so a mismatch cannot leave partially created assets behind.

        Args:
            task_declaration: the declaration to start; it is mutated and
                saved in place.

        Raises:
            AssertionError: if the declaration is not in ``DEPLOYMENT``, if
                chunking does not yield ``workers_requested`` chunks, or if the
                number of accepted assignments differs from
                ``workers_requested``.
        """
        assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT
        # start of train
        task_declaration.current_iteration += 1
        task_declaration.current_iteration_retry = 0

        # Materialised so the assignments can be counted before anything is created.
        accepted_task_assignment = list(
            task_declaration.get_task_assignments(states=(TaskAssignment.State.ACCEPTED,)))

        # ls() returns a pair; only the first element (entries exposing
        # ``multihash``) is needed here.
        train_dirs_ipfs, _ = Directory(multihash=task_declaration.dataset.train_dir_ipfs).ls()
        test_dirs_ipfs, _ = Directory(multihash=task_declaration.dataset.test_dir_ipfs).ls()

        all_train_chunks_ipfs = self._chunk_it(
            iterable=[x.multihash for x in train_dirs_ipfs],
            count=task_declaration.workers_requested
        )

        assert len(all_train_chunks_ipfs) == task_declaration.workers_requested

        all_test_chunks_ipfs = self._chunk_it(
            iterable=[x.multihash for x in test_dirs_ipfs],
            count=task_declaration.workers_requested
        )

        assert len(all_test_chunks_ipfs) == task_declaration.workers_requested

        # Fail before creating any asset: each accepted assignment needs
        # exactly one train chunk and one test chunk.
        assert task_declaration.workers_requested == len(accepted_task_assignment), (
            'accepted task assignments: {}, workers requested: {}'.format(
                len(accepted_task_assignment), task_declaration.workers_requested))

        list_td_ta = []
        with async_commit():
            # create TrainData
            for index, task_assignment in enumerate(accepted_task_assignment):
                train_chunks_ipfs = all_train_chunks_ipfs[index]
                test_chunks_ipfs = all_test_chunks_ipfs[index]

                train_data = TrainData.create(
                    model_code_ipfs=task_declaration.train_model.code_ipfs,
                    train_chunks_ipfs=train_chunks_ipfs,
                    test_chunks_ipfs=test_chunks_ipfs,
                    data_index=index,
                    db=self.db,
                    encryption=self.encryption
                )

                list_td_ta.append((train_data, task_assignment))
                logger.debug('Created {}, train chunks: {}, count:{}, test chunks: {}, count:{}'.format(
                    train_data, train_chunks_ipfs, len(train_chunks_ipfs), test_chunks_ipfs, len(test_chunks_ipfs)))

        with async_commit():
            # share to worker
            for train_data, task_assignment in list_td_ta:
                # Link both assets to each other and re-key the data with the
                # worker's encryption key before persisting.
                train_data.task_assignment_id = task_assignment.asset_id
                train_data.set_encryption_key(task_assignment.worker.enc_key)
                train_data.save()

                task_assignment.train_data_id = train_data.asset_id
                task_assignment.state = TaskAssignment.State.TRAINING
                task_assignment.save()

            task_declaration.state = TaskDeclaration.State.EPOCH_IN_PROGRESS
            task_declaration.save()