def _migrate_tasks(project_path, project):
    """ Migrate tasks from json file to database objects"""
    tasks_path = project_path / 'tasks.json'
    with io.open(os.path.abspath(tasks_path)) as t:
        tasks_data = json.load(t)
        for task_id, task_data in tasks_data.items():
            task = Task.objects.create(data=task_data.get('data', {}), project=project)

            # migrate annotations
            annotations_path = project_path / 'completions' / '{}.json'.format(task_id)
            if annotations_path.exists():
                with io.open(os.path.abspath(annotations_path)) as c:
                    annotations_data = json.load(c)
                    for annotation in annotations_data['completions']:
                        task_annotation = Annotation(
                            result=annotation['result'],
                            task=task,
                            lead_time=annotation['lead_time'],
                            was_cancelled=annotation.get('was_cancelled', False),
                            completed_by=project.created_by,
                        )
                        with suppress_autotime(task_annotation, ['created_at']):
                            task_annotation.created_at = datetime.datetime.fromtimestamp(
                                annotation['created_at'],
                                tz=datetime.datetime.now().astimezone().tzinfo)
                            task_annotation.save()

            # migrate predictions
            predictions_path = project_path / 'predictions' / '{}.json'.format(task_id)
            if predictions_path.exists():
                with io.open(os.path.abspath(predictions_path)) as c:
                    predictions_data = json.load(c)
                    for prediction in predictions_data['predictions']:
                        task_prediction = Prediction(result=prediction['result'], task=task)
                        with suppress_autotime(task_prediction, ['created_at']):
                            task_prediction.created_at = datetime.datetime.fromtimestamp(
                                prediction['created_at'],
                                tz=datetime.datetime.now().astimezone().tzinfo)
                            task_prediction.save()
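# The migration above relies on a `suppress_autotime` context manager to backdate `created_at`
# despite Django's auto_now_add. Its implementation is not shown here; the sketch below is an
# assumption of how such a helper could look (temporarily clearing auto_now/auto_now_add on the
# named model fields), not the project's actual code. Note it mutates class-level field options,
# so it is not thread-safe and is only suitable for one-off migration scripts like the one above.
from contextlib import contextmanager

@contextmanager
def suppress_autotime(instance, field_names):
    saved = {}
    for field in instance._meta.local_fields:
        if field.name in field_names:
            # remember the original flags, then disable automatic timestamps
            saved[field.name] = (field.auto_now, field.auto_now_add)
            field.auto_now = False
            field.auto_now_add = False
    try:
        yield
    finally:
        for field in instance._meta.local_fields:
            if field.name in saved:
                field.auto_now, field.auto_now_add = saved[field.name]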
def create(self, request, *args, **kwargs):
    # check project permissions
    project = self.get_object()
    tasks_ids = set(Task.objects.filter(project=project).values_list('id', flat=True))
    logger.debug(f'Importing {len(self.request.data)} predictions to project {project} with {len(tasks_ids)} tasks')
    predictions = []
    for item in self.request.data:
        if item.get('task') not in tasks_ids:
            raise LabelStudioValidationErrorSentryIgnored(
                f'{item} contains invalid "task" field: corresponding task ID couldn\'t be retrieved '
                f'from project {project} tasks')

        predictions.append(Prediction(
            task_id=item['task'],
            result=Prediction.prepare_prediction_result(item.get('result'), project),
            score=item.get('score'),
            model_version=item.get('model_version', 'undefined')
        ))
    predictions_obj = Prediction.objects.bulk_create(predictions, batch_size=settings.BATCH_SIZE)
    return Response({'created': len(predictions_obj)}, status=status.HTTP_201_CREATED)
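# Illustrative request body for the prediction import endpoint above. The shape is an assumption
# derived only from the fields read in the loop ("task", "result", "score", "model_version");
# the "result" content shown is a generic example, not a payload mandated by this code:
#
# [
#     {
#         "task": 123,
#         "result": [{"from_name": "label", "to_name": "text", "type": "labels",
#                     "value": {"start": 0, "end": 4, "labels": ["PER"]}}],
#         "score": 0.95,
#         "model_version": "my_model_v1"
#     }
# ]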
def create(self, validated_data):
    """ Create Tasks and Annotations in bulk """
    db_tasks, db_annotations, db_predictions, validated_tasks = [], [], [], validated_data
    logging.info(f'Try to serialize tasks with annotations, data len = {len(validated_data)}')
    user = self.context.get('user', None)
    project = self.context.get('project')

    # to be sure we add tasks with annotations at the same time
    with transaction.atomic():

        # extract annotations and predictions
        task_annotations, task_predictions = [], []
        for task in validated_tasks:
            task_annotations.append(task.pop('annotations', []))
            task_predictions.append(task.pop('predictions', []))

        # check annotator permissions for completed_by
        organization = user.active_organization \
            if not project.created_by.active_organization else project.created_by.active_organization
        project_user_ids = organization.members.values_list('user__id', flat=True)
        annotator_ids = set()
        for annotations in task_annotations:
            for annotation in annotations:
                annotator_ids.add(self.get_completed_by_id(annotation))

        for i in annotator_ids:
            if i not in project_user_ids and i is not None:
                raise ValidationError(f'Annotations with "completed_by"={i} are produced by an annotator '
                                      f'who is not allowed for this project as invited annotator or team member')

        # add tasks first
        for task in validated_tasks:
            t = Task(project=project, data=task['data'], meta=task.get('meta', {}),
                     overlap=project.maximum_annotations, file_upload_id=task.get('file_upload_id'))
            db_tasks.append(t)

        # SQLite doesn't return auto-generated IDs from bulk_create, so assign them manually
        if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
            self.db_tasks = []
            try:
                last_task = Task.objects.latest('id')
                current_id = last_task.id + 1
            except Task.DoesNotExist:
                current_id = 1

            for task in db_tasks:
                task.id = current_id
                current_id += 1
            self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
        else:
            self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
        logging.info(f'Tasks serialization success, len = {len(self.db_tasks)}')

        # add annotations
        for i, annotations in enumerate(task_annotations):
            for annotation in annotations:
                # support both "ground_truths" and "ground_truth" keys
                ground_truth = annotation.pop('ground_truths', True)
                if 'ground_truth' in annotation:
                    ground_truth = annotation.pop('ground_truth', True)

                # get user id
                completed_by_id = self.get_completed_by_id(annotation, default=user.id if user else None)
                annotation.pop('completed_by', None)

                db_annotations.append(Annotation(task=self.db_tasks[i],
                                                 ground_truth=ground_truth,
                                                 completed_by_id=completed_by_id,
                                                 result=annotation['result']))

        # add predictions
        last_model_version = None
        for i, predictions in enumerate(task_predictions):
            for prediction in predictions:
                prediction_score = prediction.get('score')
                if prediction_score is not None:
                    try:
                        prediction_score = float(prediction_score)
                    except ValueError as exc:
                        logger.error(
                            f'Can\'t upload prediction score: should be in float format. Reason: {exc}. '
                            f'Fallback to score=None', exc_info=True)
                        prediction_score = None

                last_model_version = prediction.get('model_version', 'undefined')
                db_predictions.append(Prediction(task=self.db_tasks[i],
                                                 result=prediction['result'],
                                                 score=prediction_score,
                                                 model_version=last_model_version))

        # annotations: DB bulk create
        if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
            self.db_annotations = []
            try:
                last_annotation = Annotation.objects.latest('id')
                current_id = last_annotation.id + 1
            except Annotation.DoesNotExist:
                current_id = 1

            for annotation in db_annotations:
                annotation.id = current_id
                current_id += 1
            self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
        else:
            self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
        logging.info(f'Annotations serialization success, len = {len(self.db_annotations)}')

        # predictions: DB bulk create
        self.db_predictions = Prediction.objects.bulk_create(db_predictions, batch_size=settings.BATCH_SIZE)
        logging.info(f'Predictions serialization success, len = {len(self.db_predictions)}')

        # renew project model version if it's empty
        if not project.model_version and last_model_version is not None:
            project.model_version = last_model_version
            project.save()

    return db_tasks
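# The create() above calls self.get_completed_by_id(annotation, default=...), which is not included
# here. A minimal sketch under the assumption that "completed_by" arrives either as a bare user id
# or as a dict carrying an "id" key; this is an illustration, not the project's actual helper:
def get_completed_by_id(self, annotation, default=None):
    completed_by = annotation.get('completed_by')
    if isinstance(completed_by, dict) and 'id' in completed_by:
        return completed_by['id']
    if isinstance(completed_by, int):
        return completed_by
    # nothing usable supplied: fall back to the caller-provided default (e.g. the importing user)
    return default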
def create(self, validated_data):
    """ Create Tasks and Annotations in bulk """
    db_tasks, db_annotations, db_predictions, validated_tasks = [], [], [], validated_data
    logging.info(f'Try to serialize tasks with annotations, data len = {len(validated_data)}')
    user = self.context.get('user', None)
    project = self.context.get('project')
    organization = user.active_organization \
        if not project.created_by.active_organization else project.created_by.active_organization
    members_email_to_id = dict(organization.members.values_list('user__email', 'user__id'))
    members_ids = set(members_email_to_id.values())
    logger.debug(f"{len(members_email_to_id)} members found in organization {organization}")

    # to be sure we add tasks with annotations at the same time
    with transaction.atomic():

        # extract annotations and predictions
        task_annotations, task_predictions = [], []
        for task in validated_tasks:
            annotations = task.pop('annotations', [])
            # insert a valid "completed_by_id" by existing member
            self._insert_valid_completed_by_id_or_raise(
                annotations, members_email_to_id, members_ids, user or project.created_by)
            predictions = task.pop('predictions', [])
            task_annotations.append(annotations)
            task_predictions.append(predictions)

        # add tasks first
        for task in validated_tasks:
            t = Task(project=project, data=task['data'], meta=task.get('meta', {}),
                     overlap=project.maximum_annotations, file_upload_id=task.get('file_upload_id'))
            db_tasks.append(t)

            # deprecated meta warning
            if 'meta' in task:
                logger.warning('Your task data has a field "meta" which is deprecated and will be removed in the future')

        # SQLite doesn't return auto-generated IDs from bulk_create, so assign them manually
        if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
            self.db_tasks = []
            try:
                last_task = Task.objects.latest('id')
                current_id = last_task.id + 1
            except Task.DoesNotExist:
                current_id = 1

            for task in db_tasks:
                task.id = current_id
                current_id += 1
            self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
        else:
            self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
        logging.info(f'Tasks serialization success, len = {len(self.db_tasks)}')

        # add annotations
        for i, annotations in enumerate(task_annotations):
            for annotation in annotations:
                # support both "ground_truth" and "ground_truths" keys
                ground_truth = annotation.pop('ground_truth', True)
                was_cancelled = annotation.pop('was_cancelled', False)

                db_annotations.append(Annotation(task=self.db_tasks[i],
                                                 ground_truth=ground_truth,
                                                 was_cancelled=was_cancelled,
                                                 completed_by_id=annotation['completed_by_id'],
                                                 result=annotation['result']))

        # add predictions
        last_model_version = None
        for i, predictions in enumerate(task_predictions):
            for prediction in predictions:
                prediction_score = prediction.get('score')
                if prediction_score is not None:
                    try:
                        prediction_score = float(prediction_score)
                    except ValueError as exc:
                        logger.error(
                            f'Can\'t upload prediction score: should be in float format. Reason: {exc}. '
                            f'Fallback to score=None', exc_info=True)
                        prediction_score = None

                last_model_version = prediction.get('model_version', 'undefined')
                db_predictions.append(Prediction(task=self.db_tasks[i],
                                                 result=prediction['result'],
                                                 score=prediction_score,
                                                 model_version=last_model_version))

        # annotations: DB bulk create
        if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
            self.db_annotations = []
            try:
                last_annotation = Annotation.objects.latest('id')
                current_id = last_annotation.id + 1
            except Annotation.DoesNotExist:
                current_id = 1

            for annotation in db_annotations:
                annotation.id = current_id
                current_id += 1
            self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
        else:
            self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
        logging.info(f'Annotations serialization success, len = {len(self.db_annotations)}')

        # predictions: DB bulk create
        self.db_predictions = Prediction.objects.bulk_create(db_predictions, batch_size=settings.BATCH_SIZE)
        logging.info(f'Predictions serialization success, len = {len(self.db_predictions)}')

        # renew project model version if it's empty
        if not project.model_version and last_model_version is not None:
            project.model_version = last_model_version
            project.save()

    return db_tasks
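# The two later versions of create() delegate annotator validation to
# self._insert_valid_completed_by_id_or_raise(...), which is not included in this excerpt.
# Below is a hedged sketch of what such a helper might do, assuming "completed_by" arrives either
# as an {"email": ...} dict or as a bare user id, and that users outside the organization must be
# rejected with ValidationError. It is an illustration, not the project's actual implementation:
def _insert_valid_completed_by_id_or_raise(self, annotations, members_email_to_id, members_ids, default_user):
    for annotation in annotations:
        completed_by = annotation.get('completed_by')
        # "completed_by" given as a dict: map the email to an organization member id
        if isinstance(completed_by, dict):
            email = completed_by.get('email')
            if email not in members_email_to_id:
                raise ValidationError(f'Unknown annotator email {email}')
            annotation['completed_by_id'] = members_email_to_id[email]
        # "completed_by" given as a bare user id: it must belong to the organization
        elif isinstance(completed_by, int):
            if completed_by not in members_ids:
                raise ValidationError(f'Unknown annotator id {completed_by}')
            annotation['completed_by_id'] = completed_by
        # no "completed_by" at all: fall back to the importing user / project creator
        else:
            annotation['completed_by_id'] = default_user.id
        annotation.pop('completed_by', None)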
def create(self, validated_data):
    """ Create Tasks and Annotations in bulk """
    db_tasks, db_annotations, db_predictions, validated_tasks = [], [], [], validated_data
    logging.info(f'Try to serialize tasks with annotations, data len = {len(validated_data)}')
    user = self.context.get('user', None)
    organization = user.active_organization \
        if not self.project.created_by.active_organization else self.project.created_by.active_organization
    members_email_to_id = dict(organization.members.values_list('user__email', 'user__id'))
    members_ids = set(members_email_to_id.values())
    logger.debug(f"{len(members_email_to_id)} members found in organization {organization}")

    # to be sure we add tasks with annotations at the same time
    with transaction.atomic():

        # extract annotations and predictions
        task_annotations, task_predictions = [], []
        for task in validated_tasks:
            annotations = task.pop('annotations', [])
            # insert a valid "completed_by_id" by existing member
            self._insert_valid_completed_by_id_or_raise(
                annotations, members_email_to_id, members_ids, user or self.project.created_by)
            predictions = task.pop('predictions', [])
            task_annotations.append(annotations)
            task_predictions.append(predictions)

        # add tasks first
        max_overlap = self.project.maximum_annotations

        # identify max inner id
        tasks = Task.objects.filter(project=self.project)
        max_inner_id = (tasks.order_by("-inner_id")[0].inner_id + 1) if tasks else 1

        for i, task in enumerate(validated_tasks):
            t = Task(
                project=self.project,
                data=task['data'],
                meta=task.get('meta', {}),
                overlap=max_overlap,
                is_labeled=len(task_annotations[i]) >= max_overlap,
                file_upload_id=task.get('file_upload_id'),
                inner_id=max_inner_id + i
            )
            db_tasks.append(t)

        # SQLite doesn't return auto-generated IDs from bulk_create, so assign them manually
        if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
            self.db_tasks = []
            try:
                last_task = Task.objects.latest('id')
                current_id = last_task.id + 1
            except Task.DoesNotExist:
                current_id = 1

            for task in db_tasks:
                task.id = current_id
                current_id += 1
            self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
        else:
            self.db_tasks = Task.objects.bulk_create(db_tasks, batch_size=settings.BATCH_SIZE)
        logging.info(f'Tasks serialization success, len = {len(self.db_tasks)}')

        # add annotations
        for i, annotations in enumerate(task_annotations):
            for annotation in annotations:
                if not isinstance(annotation, dict):
                    continue

                # support both "ground_truth" and "ground_truths" keys
                ground_truth = annotation.pop('ground_truth', True)
                was_cancelled = annotation.pop('was_cancelled', False)
                lead_time = annotation.pop('lead_time', None)

                db_annotations.append(Annotation(task=self.db_tasks[i],
                                                 ground_truth=ground_truth,
                                                 was_cancelled=was_cancelled,
                                                 completed_by_id=annotation['completed_by_id'],
                                                 result=annotation['result'],
                                                 lead_time=lead_time))

        # add predictions
        last_model_version = None
        for i, predictions in enumerate(task_predictions):
            for prediction in predictions:
                if not isinstance(prediction, dict):
                    continue

                # we need to call the result normalizer here since "bulk_create" doesn't call save()
                result = Prediction.prepare_prediction_result(prediction['result'], self.project)
                prediction_score = prediction.get('score')
                if prediction_score is not None:
                    try:
                        prediction_score = float(prediction_score)
                    except ValueError as exc:
                        logger.error(
                            f'Can\'t upload prediction score: should be in float format. Reason: {exc}. '
                            f'Fallback to score=None', exc_info=True)
                        prediction_score = None

                last_model_version = prediction.get('model_version', 'undefined')
                db_predictions.append(Prediction(task=self.db_tasks[i],
                                                 result=result,
                                                 score=prediction_score,
                                                 model_version=last_model_version))

        # annotations: DB bulk create
        if settings.DJANGO_DB == settings.DJANGO_DB_SQLITE:
            self.db_annotations = []
            try:
                last_annotation = Annotation.objects.latest('id')
                current_id = last_annotation.id + 1
            except Annotation.DoesNotExist:
                current_id = 1

            for annotation in db_annotations:
                annotation.id = current_id
                current_id += 1
            self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
        else:
            self.db_annotations = Annotation.objects.bulk_create(db_annotations, batch_size=settings.BATCH_SIZE)
        logging.info(f'Annotations serialization success, len = {len(self.db_annotations)}')

        # predictions: DB bulk create
        self.db_predictions = Prediction.objects.bulk_create(db_predictions, batch_size=settings.BATCH_SIZE)
        logging.info(f'Predictions serialization success, len = {len(self.db_predictions)}')

        # renew project model version if it's empty
        if not self.project.model_version and last_model_version is not None:
            self.project.model_version = last_model_version
            self.project.save()

        self.post_process_annotations(self.db_annotations)
    return db_tasks
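# Typical way a bulk serializer like the versions above is driven from a DRF view. This is an
# assumed usage pattern only: the class name TaskSerializerBulk and the exact context keys are
# guesses based on the self.context lookups in the code, not confirmed by this excerpt.
#
# serializer = TaskSerializerBulk(data=tasks, many=True,
#                                 context={'user': request.user, 'project': project})
# serializer.is_valid(raise_exception=True)
# created_tasks = serializer.save()   # runs create() above inside transaction.atomic()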