Example 1
def _migrate_tasks(project_path, project):
    """ Migrate tasks from json file to database objects"""
    tasks_path = project_path / 'tasks.json'
    with io.open(os.path.abspath(tasks_path)) as t:
        tasks_data = json.load(t)
        for task_id, task_data in tasks_data.items():
            task = Task.objects.create(data=task_data.get('data', {}),
                                       project=project)

            # migrate annotations
            annotations_path = project_path / 'completions' / '{}.json'.format(
                task_id)
            if annotations_path.exists():
                with io.open(os.path.abspath(annotations_path)) as c:
                    annotations_data = json.load(c)
                    for annotation in annotations_data['completions']:
                        task_annotation = Annotation(
                            result=annotation['result'],
                            task=task,
                            lead_time=annotation['lead_time'],
                            was_cancelled=annotation.get(
                                'was_cancelled', False),
                            completed_by=project.created_by,
                        )
                        with suppress_autotime(task_annotation,
                                               ['created_at']):
                            task_annotation.created_at = datetime.datetime.fromtimestamp(
                                annotation['created_at'],
                                tz=datetime.datetime.now().astimezone().tzinfo)
                            task_annotation.save()

            # migrate predictions
            predictions_path = project_path / 'predictions' / '{}.json'.format(
                task_id)
            if predictions_path.exists():
                with io.open(os.path.abspath(predictions_path)) as c:
                    predictions_data = json.load(c)
                    for prediction in predictions_data['predictions']:
                        task_prediction = Prediction(
                            result=prediction['result'], task=task)
                        with suppress_autotime(task_prediction,
                                               ['created_at']):
                            task_prediction.created_at = datetime.datetime.fromtimestamp(
                                prediction['created_at'],
                                tz=datetime.datetime.now().astimezone().tzinfo)
                            task_prediction.save()
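
The helper above walks a legacy on-disk project export: tasks.json holds the task data, while completions/<task_id>.json and predictions/<task_id>.json hold per-task annotations and predictions. A minimal invocation sketch follows, assuming a pathlib.Path pointing at such a directory and an already created Project row; the directory location and the project lookup by title are assumptions, not taken from the example.

from pathlib import Path

# Hypothetical call site: every directory under legacy_root that contains a
# tasks.json is migrated into a matching Project. Project and _migrate_tasks
# are assumed to be importable from the surrounding module.
legacy_root = Path.home() / 'label-studio' / 'projects'   # assumed location
for project_dir in legacy_root.iterdir():
    if not (project_dir / 'tasks.json').exists():
        continue
    project = Project.objects.get(title=project_dir.name)  # assumed lookup strategy
    _migrate_tasks(project_dir, project)
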
Example 2
    def create(self, request, *args, **kwargs):
        # check project permissions
        project = self.get_object()
        tasks_ids = set(Task.objects.filter(project=project).values_list('id', flat=True))
        logger.debug(f'Importing {len(self.request.data)} predictions to project {project} with {len(tasks_ids)} tasks')
        predictions = []
        for item in self.request.data:
            if item.get('task') not in tasks_ids:
                raise LabelStudioValidationErrorSentryIgnored(
                    f'{item} contains invalid "task" field: corresponding task ID couldn\'t be retrieved '
                    f'from project {project} tasks')
            predictions.append(Prediction(
                task_id=item['task'],
                result=Prediction.prepare_prediction_result(item.get('result'), project),
                score=item.get('score'),
                model_version=item.get('model_version', 'undefined')
            ))
        predictions_obj = Prediction.objects.bulk_create(predictions, batch_size=settings.BATCH_SIZE)

        return Response({'created': len(predictions_obj)}, status=status.HTTP_201_CREATED)
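
This view expects the request body to be a JSON list, where each item references an existing task of the project by id and carries the prediction payload. A client-side sketch of such a request is shown below; the URL, project id and token are placeholders, and only the payload shape follows from the handler above.

import requests

url = 'https://label-studio.example.com/api/projects/42/import/predictions'  # placeholder URL
payload = [
    {
        'task': 101,                          # must be an id of a task in the project
        'result': [{'from_name': 'label', 'to_name': 'text', 'type': 'choices',
                    'value': {'choices': ['Positive']}}],
        'score': 0.87,
        'model_version': 'baseline-v1',
    },
]
response = requests.post(url, json=payload,
                         headers={'Authorization': 'Token <your-token>'})
response.raise_for_status()
print(response.json())                        # e.g. {'created': 1}
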
Example 3
    def create(self, validated_data):
        """ Create Tasks and Annotations in bulk
        """
        db_tasks, db_annotations, db_predictions, validated_tasks = [], [], [], validated_data
        logging.info(
            f'Try to serialize tasks with annotations, data len = {len(validated_data)}'
        )
        user = self.context.get('user', None)
        project = self.context.get('project')

        # to be sure we add tasks with annotations at the same time
        with transaction.atomic():

            # extract annotations and predictions
            task_annotations, task_predictions = [], []
            for task in validated_tasks:
                task_annotations.append(task.pop('annotations', []))
                task_predictions.append(task.pop('predictions', []))

            # check annotator permissions for completed by
            organization = user.active_organization \
                if not project.created_by.active_organization else project.created_by.active_organization
            project_user_ids = organization.members.values_list('user__id',
                                                                flat=True)
            annotator_ids = set()
            for annotations in task_annotations:
                for annotation in annotations:
                    annotator_ids.add(self.get_completed_by_id(annotation))

            for i in annotator_ids:
                if i not in project_user_ids and i is not None:
                    raise ValidationError(
                        f'Annotations with "completed_by"={i} are produced by an annotator '
                        f'who is not allowed on this project as an invited annotator or team member'
                    )

            # add tasks first
            for task in validated_tasks:
                t = Task(project=project,
                         data=task['data'],
                         meta=task.get('meta', {}),
                         overlap=project.maximum_annotations,
                         file_upload_id=task.get('file_upload_id'))
                db_tasks.append(t)

            if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
                self.db_tasks = []
                try:
                    last_task = Task.objects.latest('id')
                    current_id = last_task.id + 1
                except Task.DoesNotExist:
                    current_id = 1

                for task in db_tasks:
                    task.id = current_id
                    current_id += 1
                self.db_tasks = Task.objects.bulk_create(
                    db_tasks, batch_size=settings.BATCH_SIZE)
            else:
                self.db_tasks = Task.objects.bulk_create(
                    db_tasks, batch_size=settings.BATCH_SIZE)
            logging.info(
                f'Tasks serialization success, len = {len(self.db_tasks)}')

            # add annotations
            for i, annotations in enumerate(task_annotations):
                for annotation in annotations:
                    # support both "ground_truth" and "ground_truths" spellings
                    ground_truth = annotation.pop('ground_truth', True)
                    if 'ground_truths' in annotation:
                        ground_truth = annotation.pop('ground_truths', True)

                    # get user id
                    completed_by_id = self.get_completed_by_id(
                        annotation, default=user.id if user else None)
                    annotation.pop('completed_by', None)

                    db_annotations.append(
                        Annotation(task=self.db_tasks[i],
                                   ground_truth=ground_truth,
                                   completed_by_id=completed_by_id,
                                   result=annotation['result']))

            # add predictions
            last_model_version = None
            for i, predictions in enumerate(task_predictions):
                for prediction in predictions:
                    prediction_score = prediction.get('score')
                    if prediction_score is not None:
                        try:
                            prediction_score = float(prediction_score)
                        except ValueError as exc:
                            logger.error(
                                f'Can\'t upload prediction score: should be in float format. Reason: {exc}. '
                                f'Fallback to score=None',
                                exc_info=True)
                            prediction_score = None

                    last_model_version = prediction.get(
                        'model_version', 'undefined')
                    db_predictions.append(
                        Prediction(task=self.db_tasks[i],
                                   result=prediction['result'],
                                   score=prediction_score,
                                   model_version=last_model_version))

            # annotations: DB bulk create
            if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
                self.db_annotations = []
                try:
                    last_annotation = Annotation.objects.latest('id')
                    current_id = last_annotation.id + 1
                except Annotation.DoesNotExist:
                    current_id = 1

                for annotation in db_annotations:
                    annotation.id = current_id
                    current_id += 1
                self.db_annotations = Annotation.objects.bulk_create(
                    db_annotations, batch_size=settings.BATCH_SIZE)
            else:
                self.db_annotations = Annotation.objects.bulk_create(
                    db_annotations, batch_size=settings.BATCH_SIZE)
            logging.info(
                f'Annotations serialization success, len = {len(self.db_annotations)}'
            )

            # predictions: DB bulk create
            self.db_predictions = Prediction.objects.bulk_create(
                db_predictions, batch_size=settings.BATCH_SIZE)
            logging.info(
                f'Predictions serialization success, len = {len(self.db_predictions)}'
            )

            # renew project model version if it's empty
            if not project.model_version and last_model_version is not None:
                project.model_version = last_model_version
                project.save()

        return db_tasks
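
For reference, create() consumes a list of validated task dicts where "annotations" and "predictions" are optional nested lists; "completed_by" is resolved through get_completed_by_id() and checked against the organization members. A minimal shape of such input is sketched below; all values are illustrative, not taken from the source.

# Illustrative validated_data for the serializer above; only the shape matters here.
validated_data = [
    {
        'data': {'text': 'A sample document'},
        'meta': {},
        'annotations': [
            {'result': [], 'completed_by': 1, 'ground_truth': False},
        ],
        'predictions': [
            {'result': [], 'score': '0.42', 'model_version': 'v1'},
        ],
    },
]
# The serializer is expected to be instantiated with context={'user': ..., 'project': ...}
# before create(validated_data) is called.
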
Example 4
    def create(self, validated_data):
        """ Create Tasks and Annotations in bulk
        """
        db_tasks, db_annotations, db_predictions, validated_tasks = [], [], [], validated_data
        logging.info(
            f'Try to serialize tasks with annotations, data len = {len(validated_data)}'
        )
        user = self.context.get('user', None)
        project = self.context.get('project')

        organization = user.active_organization \
            if not project.created_by.active_organization else project.created_by.active_organization
        members_email_to_id = dict(
            organization.members.values_list('user__email', 'user__id'))
        members_ids = set(members_email_to_id.values())
        logger.debug(
            f"{len(members_email_to_id)} members found in organization {organization}"
        )

        # to be sure we add tasks with annotations at the same time
        with transaction.atomic():

            # extract annotations and predictions
            task_annotations, task_predictions = [], []
            for task in validated_tasks:
                annotations = task.pop('annotations', [])
                # insert a valid "completed_by_id" by existing member
                self._insert_valid_completed_by_id_or_raise(
                    annotations, members_email_to_id, members_ids, user
                    or project.created_by)
                predictions = task.pop('predictions', [])
                task_annotations.append(annotations)
                task_predictions.append(predictions)

            # add tasks first
            for task in validated_tasks:
                t = Task(project=project,
                         data=task['data'],
                         meta=task.get('meta', {}),
                         overlap=project.maximum_annotations,
                         file_upload_id=task.get('file_upload_id'))
                db_tasks.append(t)

            # deprecated meta warning
            if 'meta' in task:
                logger.warning(
                    'Your task data has a "meta" field which is deprecated and will be removed in the future'
                )

            if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
                self.db_tasks = []
                try:
                    last_task = Task.objects.latest('id')
                    current_id = last_task.id + 1
                except Task.DoesNotExist:
                    current_id = 1

                for task in db_tasks:
                    task.id = current_id
                    current_id += 1
                self.db_tasks = Task.objects.bulk_create(
                    db_tasks, batch_size=settings.BATCH_SIZE)
            else:
                self.db_tasks = Task.objects.bulk_create(
                    db_tasks, batch_size=settings.BATCH_SIZE)
            logging.info(
                f'Tasks serialization success, len = {len(self.db_tasks)}')

            # add annotations
            for i, annotations in enumerate(task_annotations):
                for annotation in annotations:
                    # extract "ground_truth" and "was_cancelled" flags
                    ground_truth = annotation.pop('ground_truth', True)
                    was_cancelled = annotation.pop('was_cancelled', False)

                    db_annotations.append(
                        Annotation(
                            task=self.db_tasks[i],
                            ground_truth=ground_truth,
                            was_cancelled=was_cancelled,
                            completed_by_id=annotation['completed_by_id'],
                            result=annotation['result']))

            # add predictions
            last_model_version = None
            for i, predictions in enumerate(task_predictions):
                for prediction in predictions:
                    prediction_score = prediction.get('score')
                    if prediction_score is not None:
                        try:
                            prediction_score = float(prediction_score)
                        except ValueError as exc:
                            logger.error(
                                f'Can\'t upload prediction score: should be in float format. Reason: {exc}. '
                                f'Fallback to score=None',
                                exc_info=True)
                            prediction_score = None

                    last_model_version = prediction.get(
                        'model_version', 'undefined')
                    db_predictions.append(
                        Prediction(task=self.db_tasks[i],
                                   result=prediction['result'],
                                   score=prediction_score,
                                   model_version=last_model_version))

            # annotations: DB bulk create
            if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
                self.db_annotations = []
                try:
                    last_annotation = Annotation.objects.latest('id')
                    current_id = last_annotation.id + 1
                except Annotation.DoesNotExist:
                    current_id = 1

                for annotation in db_annotations:
                    annotation.id = current_id
                    current_id += 1
                self.db_annotations = Annotation.objects.bulk_create(
                    db_annotations, batch_size=settings.BATCH_SIZE)
            else:
                self.db_annotations = Annotation.objects.bulk_create(
                    db_annotations, batch_size=settings.BATCH_SIZE)
            logging.info(
                f'Annotations serialization success, len = {len(self.db_annotations)}'
            )

            # predictions: DB bulk create
            self.db_predictions = Prediction.objects.bulk_create(
                db_predictions, batch_size=settings.BATCH_SIZE)
            logging.info(
                f'Predictions serialization success, len = {len(self.db_predictions)}'
            )

            # renew project model version if it's empty
            if not project.model_version and last_model_version is not None:
                project.model_version = last_model_version
                project.save()

        return db_tasks
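
The helper _insert_valid_completed_by_id_or_raise is referenced but not shown in this example. Based only on how it is called and on the fact that "completed_by_id" is read from each annotation afterwards, a plausible sketch of its contract could look like the following; this is an illustration written here, not the project's actual implementation, and the real code is a method on the serializer rather than a standalone function.

from rest_framework.exceptions import ValidationError  # assumed exception class


def _insert_valid_completed_by_id_or_raise(annotations, members_email_to_id, members_ids, default_user):
    """Illustrative sketch: resolve "completed_by" to an organization member id or fall back."""
    for annotation in annotations:
        completed_by = annotation.get('completed_by')
        if isinstance(completed_by, dict) and 'email' in completed_by:
            email = completed_by['email']
            if email not in members_email_to_id:
                raise ValidationError(f'Annotation "completed_by" email {email} is not an organization member')
            annotation['completed_by_id'] = members_email_to_id[email]
        elif isinstance(completed_by, int):
            if completed_by not in members_ids:
                raise ValidationError(f'Annotation "completed_by" id {completed_by} is not an organization member')
            annotation['completed_by_id'] = completed_by
        else:
            # no valid "completed_by" given: attribute the annotation to the importing user
            annotation['completed_by_id'] = default_user.id
        annotation.pop('completed_by', None)
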
Example 5
    def create(self, validated_data):
        """ Create Tasks and Annotations in bulk
        """
        db_tasks, db_annotations, db_predictions, validated_tasks = [], [], [], validated_data
        logging.info(f'Try to serialize tasks with annotations, data len = {len(validated_data)}')
        user = self.context.get('user', None)

        organization = user.active_organization \
            if not self.project.created_by.active_organization else self.project.created_by.active_organization
        members_email_to_id = dict(organization.members.values_list('user__email', 'user__id'))
        members_ids = set(members_email_to_id.values())
        logger.debug(f"{len(members_email_to_id)} members found in organization {organization}")

        # to be sure we add tasks with annotations at the same time
        with transaction.atomic():

            # extract annotations and predictions
            task_annotations, task_predictions = [], []
            for task in validated_tasks:
                annotations = task.pop('annotations', [])
                # insert a valid "completed_by_id" by existing member
                self._insert_valid_completed_by_id_or_raise(
                    annotations, members_email_to_id, members_ids, user or self.project.created_by)
                predictions = task.pop('predictions', [])
                task_annotations.append(annotations)
                task_predictions.append(predictions)

            # add tasks first
            max_overlap = self.project.maximum_annotations

            # identify max inner id
            tasks = Task.objects.filter(project=self.project)
            max_inner_id = (tasks.order_by("-inner_id")[0].inner_id + 1) if tasks else 1

            for i, task in enumerate(validated_tasks):
                t = Task(
                    project=self.project,
                    data=task['data'],
                    meta=task.get('meta', {}),
                    overlap=max_overlap,
                    is_labeled=len(task_annotations[i]) >= max_overlap,
                    file_upload_id=task.get('file_upload_id'),
                    inner_id=max_inner_id + i
                )
                db_tasks.append(t)

            if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
                self.db_tasks = []
                try:
                    last_task = Task.objects.latest('id')
                    current_id = last_task.id + 1
                except Task.DoesNotExist:
                    current_id = 1

                for task in db_tasks:
                    task.id = current_id
                    current_id += 1
                self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
            else:
                self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
            logging.info(f'Tasks serialization success, len = {len(self.db_tasks)}')

            # add annotations
            for i, annotations in enumerate(task_annotations):
                for annotation in annotations:
                    if not isinstance(annotation, dict):
                        continue
                        
                    # extract "ground_truth", "was_cancelled" and "lead_time" fields
                    ground_truth = annotation.pop('ground_truth', True)
                    was_cancelled = annotation.pop('was_cancelled', False)
                    lead_time = annotation.pop('lead_time', None)

                    db_annotations.append(Annotation(task=self.db_tasks[i],
                                                     ground_truth=ground_truth,
                                                     was_cancelled=was_cancelled,
                                                     completed_by_id=annotation['completed_by_id'],
                                                     result=annotation['result'],
                                                     lead_time=lead_time))

            # add predictions
            last_model_version = None
            for i, predictions in enumerate(task_predictions):
                for prediction in predictions:
                    if not isinstance(prediction, dict):
                        continue

                    # we need to call result normalizer here since "bulk_create" doesn't call save() method
                    result = Prediction.prepare_prediction_result(prediction['result'], self.project)
                    prediction_score = prediction.get('score')
                    if prediction_score is not None:
                        try:
                            prediction_score = float(prediction_score)
                        except ValueError as exc:
                            logger.error(
                                f'Can\'t upload prediction score: should be in float format. Reason: {exc}. '
                                f'Fallback to score=None', exc_info=True)
                            prediction_score = None

                    last_model_version = prediction.get('model_version', 'undefined')
                    db_predictions.append(Prediction(task=self.db_tasks[i],
                                                     result=result,
                                                     score=prediction_score,
                                                     model_version=last_model_version))

            # annotations: DB bulk create
            if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
                self.db_annotations = []
                try:
                    last_annotation = Annotation.objects.latest('id')
                    current_id = last_annotation.id + 1
                except Annotation.DoesNotExist:
                    current_id = 1

                for annotation in db_annotations:
                    annotation.id = current_id
                    current_id += 1
                self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
            else:
                self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
            logging.info(f'Annotations serialization success, len = {len(self.db_annotations)}')

            # predictions: DB bulk create
            self.db_predictions = Prediction.objects.bulk_create(db_predictions, batch_size=settings.BATCH_SIZE)
            logging.info(f'Predictions serialization success, len = {len(self.db_predictions)}')

            # renew project model version if it's empty
            if not self.project.model_version and last_model_version is not None:
                self.project.model_version = last_model_version
                self.project.save()

        self.post_process_annotations(self.db_annotations)
        return db_tasks
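
A note on the SQLite-specific branches that appear in the last three examples: on database backends where bulk_create() cannot return generated primary keys, Django does not populate the id attribute of the created objects, so the code reserves a contiguous id range up front and assigns ids manually before inserting. The pattern is shown in isolation below as a simplified sketch; the model class is passed in, and DJANGO_DB, DJANGO_DB_SQLITE and BATCH_SIZE are the same project-specific settings used above.

from django.conf import settings


def bulk_create_with_ids(model_cls, objs):
    """Simplified form of the SQLite branch used in the examples above.

    Pre-assigning ids keeps later code working (e.g. attaching annotations to
    self.db_tasks[i]) even when bulk_create() cannot report the generated keys.
    """
    if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:   # project-specific settings flag
        try:
            current_id = model_cls.objects.latest('id').id + 1
        except model_cls.DoesNotExist:
            current_id = 1
        for obj in objs:
            obj.id = current_id
            current_id += 1
    return model_cls.objects.bulk_create(objs, batch_size=settings.BATCH_SIZE)
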