Ejemplo n.º 1
0
    def _process_deployment(self, task_declaration: TaskDeclaration):
        """Accept or reject pending performers of a task in DEPLOYMENT state.

        Every READY task assignment and verification assignment is checked
        with the corresponding ``_is_*_allowed`` helper and marked ACCEPTED
        or REJECTED. Each acceptance decrements the matching ``*_needed``
        counter on the declaration. When both counters reach zero the
        initial train data is assigned. If nothing changed in this pass,
        the ACCEPTED assignments are counted again and training is started
        when they match the requested numbers.
        """
        assert task_declaration.state == TaskDeclaration.State.DEPLOYMENT

        with async_commit():
            declaration_changed = False

            ready_tasks = task_declaration.get_task_assignments(states=(TaskAssignment.State.READY,))
            for task_assignment in ready_tasks:
                allowed = self._is_task_assignment_allowed(task_declaration, task_assignment)
                task_assignment.state = (
                    TaskAssignment.State.ACCEPTED if allowed else TaskAssignment.State.REJECTED
                )
                task_assignment.save()
                if allowed:
                    task_declaration.workers_needed -= 1
                    declaration_changed = True

            ready_verifications = task_declaration.get_verification_assignments(
                states=(VerificationAssignment.State.READY,))
            for verification_assignment in ready_verifications:
                allowed = self._is_verification_assignment_allowed(task_declaration, verification_assignment)
                verification_assignment.state = (
                    VerificationAssignment.State.ACCEPTED if allowed else VerificationAssignment.State.REJECTED
                )
                verification_assignment.save()
                if allowed:
                    task_declaration.verifiers_needed -= 1
                    declaration_changed = True

            # persist the declaration only when one of the counters moved
            if declaration_changed:
                task_declaration.save()

        ready_to_start = task_declaration.workers_needed == 0 and task_declaration.verifiers_needed == 0
        summary = '{} ready: {} workers_needed: {} verifiers_needed: {}'.format(
            task_declaration, ready_to_start, task_declaration.workers_needed, task_declaration.verifiers_needed)
        logger.info(summary)

        if ready_to_start:
            self._assign_initial_train_data(task_declaration)
            return

        if declaration_changed:
            return

        # Nothing changed in this pass: recount the performers that are
        # actually in ACCEPTED state instead of trusting the counters.
        accepted_workers = task_declaration.get_task_assignments(
            states=(TaskAssignment.State.ACCEPTED,))
        accepted_verifiers = task_declaration.get_verification_assignments(
            states=(VerificationAssignment.State.ACCEPTED,))
        accepted_workers_count = len(accepted_workers)
        accepted_verifiers_count = len(accepted_verifiers)

        if accepted_workers_count != task_declaration.workers_requested:
            return
        if accepted_verifiers_count != task_declaration.verifiers_requested:
            return

        logger.info('All performers are accepted, start train')
        task_declaration.workers_needed = 0
        task_declaration.verifiers_needed = 0
        self._assign_initial_train_data(task_declaration)
Ejemplo n.º 2
0
    def _reassign_verification_data(self, task_declaration: TaskDeclaration):
        """Hand verification work from timed-out verifiers to newly accepted ones.

        Fetches the ACCEPTED and TIMEOUT verification assignments of the
        declaration and pairs them one-to-one. For each pair a fresh
        ``VerificationData`` is created for the accepted assignment, which is
        moved to VERIFYING, and the timed-out assignment is marked FORGOTTEN.
        Finally the declaration is switched to VERIFY_IN_PROGRESS and saved.

        Raises:
            AssertionError: if the query returns an unexpected state, if the
                number of accepted and timed-out assignments differs, or if an
                accepted assignment already has verification data attached.
        """
        verification_assignments = task_declaration.get_verification_assignments(
            states=(VerificationAssignment.State.ACCEPTED, VerificationAssignment.State.TIMEOUT)
        )

        # split accepted and overdue
        accepted_verification_assignments = []
        timeout_verification_assignments = []
        for va in verification_assignments:
            if va.state == VerificationAssignment.State.ACCEPTED:
                accepted_verification_assignments.append(va)
            elif va.state == VerificationAssignment.State.TIMEOUT:
                timeout_verification_assignments.append(va)
            else:
                # The original `assert False and '...'` dropped the message,
                # because `False and '...'` evaluates to just `False`.
                assert False, 'Check query!'

        # every timed-out verifier must have exactly one replacement
        assert len(accepted_verification_assignments) == len(timeout_verification_assignments)

        # the same set of finished train results is shared with every verifier
        train_results = [
            {
                'worker_id': ta.worker_id,
                'result': ta.train_result.weights_ipfs
            }
            for ta in task_declaration.get_task_assignments(states=(TaskAssignment.State.FINISHED,))
        ]

        for va, failed_va in zip(accepted_verification_assignments, timeout_verification_assignments):
            assert va.verification_data_id is None
            verification_data = VerificationData.create(
                verification_assignment_id=va.asset_id,
                # share data with verifier
                public_key=va.verifier.enc_key,
                test_dir_ipfs=task_declaration.dataset.test_dir_ipfs,
                model_code_ipfs=task_declaration.train_model.code_ipfs,
                train_results=train_results,
                db=self.db,
                encryption=self.encryption
            )

            va.verification_data_id = verification_data.asset_id
            va.state = VerificationAssignment.State.VERIFYING
            va.save()

            # the paired timed-out assignment is retired
            failed_va.state = VerificationAssignment.State.FORGOTTEN
            failed_va.save()

        task_declaration.state = TaskDeclaration.State.VERIFY_IN_PROGRESS
        task_declaration.save()
Ejemplo n.º 3
0
    def _republish_for_verify(self, task_declaration: TaskDeclaration):
        """Move the declaration to DEPLOYMENT_VERIFICATION and reopen rejected verifiers.

        Requires ``task_declaration.verifiers_needed`` to be positive. After
        the declaration is saved in its new state, every REJECTED
        verification assignment is switched to REASSIGN and saved with the
        verifier's address as recipient.
        """
        assert task_declaration.verifiers_needed > 0

        task_declaration.state = TaskDeclaration.State.DEPLOYMENT_VERIFICATION
        task_declaration.save()

        rejected_assignments = task_declaration.get_verification_assignments(
            states=(VerificationAssignment.State.REJECTED,)
        )
        for rejected in rejected_assignments:
            rejected.state = VerificationAssignment.State.REASSIGN
            # give ownership back to the verifier
            owner_address = rejected.verifier.address
            rejected.save(recipients=owner_address)
Ejemplo n.º 4
0
    def _assign_verification_data(self, task_declaration: TaskDeclaration, task_assignments: ListTaskAssignments):
        """Publish train results to verifiers and start the verification phase.

        Collects ``worker_id`` / ``weights_ipfs`` pairs from the given task
        assignments and adds each assignment's tflops to the declaration.
        ACCEPTED verification assignments receive a newly created
        ``VerificationData``; FINISHED ones have their existing verification
        data updated with the new train results. Both kinds are moved to
        VERIFYING, then the declaration is saved as VERIFY_IN_PROGRESS.
        """
        train_results = []
        for task_assignment in task_assignments:
            entry = {
                'worker_id': task_assignment.worker_id,
                'result': task_assignment.train_result.weights_ipfs
            }
            train_results.append(entry)
            task_declaration.tflops += task_assignment.train_result.tflops

        wanted_states = (VerificationAssignment.State.ACCEPTED, VerificationAssignment.State.FINISHED)
        for va in task_declaration.get_verification_assignments(states=wanted_states):
            if va.state == VerificationAssignment.State.ACCEPTED:
                # a fresh verifier must not have data attached yet
                assert va.verification_data_id is None
                created = VerificationData.create(
                    verification_assignment_id=va.asset_id,
                    # share data with verifier
                    public_key=va.verifier.enc_key,
                    test_dir_ipfs=task_declaration.dataset.test_dir_ipfs,
                    model_code_ipfs=task_declaration.train_model.code_ipfs,
                    train_results=train_results,
                    db=self.db,
                    encryption=self.encryption
                )
                va.verification_data_id = created.asset_id
            elif va.state == VerificationAssignment.State.FINISHED:
                # reuse the existing verification data, replacing only its train results
                existing = va.verification_data
                existing.train_results = train_results
                existing.save()
            else:
                continue

            va.state = VerificationAssignment.State.VERIFYING
            va.save()

        task_declaration.state = TaskDeclaration.State.VERIFY_IN_PROGRESS
        task_declaration.save()
Ejemplo n.º 5
0
    def _process_verify_in_progress(self, task_declaration: TaskDeclaration):
        """Advance a declaration that is in VERIFY_IN_PROGRESS state.

        Walks over the VERIFYING and FINISHED verification assignments:

        * a VERIFYING assignment whose iteration is finished becomes FINISHED;
        * a FINISHED assignment either marks the task as failed (its
          verification result carries an error) or is collected as finished;
        * a still-VERIFYING assignment whose verification result has not been
          modified for more than ``settings.WAIT_VERIFY_TIMEOUT`` seconds is
          marked TIMEOUT.

        Outcomes are then handled in this priority order: timeouts (republish
        for verification), failure (declaration becomes FAILED), not enough
        finished verifiers (wait), fake workers detected (reject them and
        republish for training), more iterations left (prepare the next
        iteration), otherwise the declaration is COMPLETED.
        """
        assert task_declaration.state == TaskDeclaration.State.VERIFY_IN_PROGRESS
        verification_assignments = task_declaration.get_verification_assignments(
            states=(
                VerificationAssignment.State.VERIFYING,
                VerificationAssignment.State.FINISHED
            )
        )

        failed = False
        finished_verification_assignments = []
        count_timeout = 0
        with async_commit():
            for va in verification_assignments:
                if va.state == VerificationAssignment.State.VERIFYING:
                    if va.iteration_is_finished:
                        va.state = VerificationAssignment.State.FINISHED
                        va.save()

                # not an elif: an assignment promoted just above is handled here too
                if va.state == VerificationAssignment.State.FINISHED:
                    if va.verification_result.error:
                        failed = True
                    else:
                        finished_verification_assignments.append(va)
                    continue

                # still verifying: check how long ago the result was last touched
                verify_timeout = settings.WAIT_VERIFY_TIMEOUT
                modified_at = va.verification_result.modified_at
                # Use a real aware "now" instead of stamping the UTC wall clock
                # with modified_at's tzinfo, which is only right when that
                # tzinfo is UTC.
                now = datetime.datetime.now(datetime.timezone.utc)
                if modified_at.tzinfo is None:
                    # naive timestamps are treated as UTC, as before
                    now = now.replace(tzinfo=None)
                if (now - modified_at).total_seconds() > verify_timeout:
                    va.state = VerificationAssignment.State.TIMEOUT
                    va.save()

                    logger.info('Timeout of waiting for {}'.format(va))
                    count_timeout += 1

        if count_timeout:
            # timed-out verifiers have to be replaced before anything else
            task_declaration.verifiers_needed += count_timeout
            self._republish_for_verify(task_declaration)
            return

        if failed:
            logger.info('{} is failed'.format(task_declaration))
            task_declaration.state = TaskDeclaration.State.FAILED
            task_declaration.save()
            return

        if len(finished_verification_assignments) < task_declaration.verifiers_requested:
            # verification is not ready
            logger.info('Wait for finish of verification for {} iteration {}'.format(
                task_declaration, task_declaration.current_iteration))
            return

        # mapping of worker_id -> number of detections, consumed via .items() below
        fake_workers = self._parse_verification_results(
            task_declaration, finished_verification_assignments)

        if fake_workers:
            logger.info('Fake workers detected')
            fake_worker_ids = []
            for worker_id, count_detections in fake_workers.items():
                logger.info('Fake worker_id: {}, count detections: {}'.format(worker_id, count_detections))
                fake_worker_ids.append(worker_id)
            self._reject_fake_workers(task_declaration, fake_worker_ids)
            self._republish_for_train(task_declaration)
            return

        if not task_declaration.last_iteration:
            self._update_train_data_for_next_iteration(task_declaration)
            return

        task_declaration.progress = 100.0
        task_declaration.state = TaskDeclaration.State.COMPLETED
        task_declaration.save()
        logger.info('{} is finished tflops: {} estimated: {}'.format(
            task_declaration, task_declaration.tflops, task_declaration.estimated_tflops))