Example #1
def _expand(results, key, queryset, serializer):
    from documentcloud.documents.tasks import solr_index

    ids = {r[key] for r in results if key in r}
    objs = queryset.filter(pk__in=ids)
    obj_dict = {obj.pk: serializer(obj).data for obj in objs}
    for result in results:
        # user and organization should always be available, re-index if they are not
        if key not in result:
            solr_index.delay(result["id"])
        else:
            result[key] = obj_dict.get(result[key])
    return results
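
A note on the pattern above: solr_index is imported inside the function body,
presumably to avoid a circular import between this module and
documentcloud.documents.tasks. A minimal, hypothetical usage sketch (the User
queryset and UserSerializer are illustrative stand-ins, not from the source):

# Hypothetical call: expand the "user" key of raw Solr results into
# serialized user objects; results missing the key get re-indexed.
results = [
    {"id": "doc-1", "user": 1},
    {"id": "doc-2", "user": 2},
    {"id": "doc-3"},  # no "user" key -> solr_index.delay("doc-3")
]
results = _expand(results, "user", User.objects.all(), UserSerializer)
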
Example #2
    def partial_update(self, request, pk=None, document_pk=None):
        document = self.get_object(edit=True)
        serializer = DataAddRemoveSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        if pk in document.data:
            document.data[pk].extend(serializer.data.get("values", []))
            document.data[pk] = [
                i
                for i in document.data[pk]
                if i not in serializer.data.get("remove", [])
            ]
        else:
            document.data[pk] = serializer.data.get("values", [])

        # remove duplicate values
        document.data[pk] = list(set(document.data[pk]))

        if not document.data[pk]:
            # remove key if all values are removed
            del document.data[pk]

        document.save()
        transaction.on_commit(
            lambda: solr_index.delay(document.pk, field_updates={f"data_{pk}": "set"})
        )
        return Response(document.data)
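
One caveat in the dedupe step above: list(set(...)) removes duplicates but
does not preserve the order of the remaining values. If ordering mattered, an
order-preserving alternative (a sketch, not what the source does) would be:

document.data[pk] = list(dict.fromkeys(document.data[pk]))  # dedupe, keep order
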
Example #3
    def perform_create(self, serializer):

        bulk = hasattr(serializer, "many") and serializer.many

        if bulk:
            file_urls = [d.pop("file_url", None) for d in serializer.validated_data]
            force_ocrs = [d.pop("force_ocr", False) for d in serializer.validated_data]
        else:
            file_urls = [serializer.validated_data.pop("file_url", None)]
            force_ocrs = [serializer.validated_data.pop("force_ocr", False)]

        documents = serializer.save(
            user=self.request.user, organization=self.request.user.organization
        )

        if not bulk:
            documents = [documents]

        for document, file_url, force_ocr in zip(documents, file_urls, force_ocrs):
            transaction.on_commit(lambda d=document: solr_index.delay(d.pk))
            if file_url is not None:
                transaction.on_commit(
                    lambda d=document, fu=file_url, fo=force_ocr: fetch_file_url.delay(
                        fu, d.pk, fo
                    )
                )
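
The d=document (and fu=file_url, fo=force_ocr) default arguments do real work
here: lambdas close over variables, not values, so without them every callback
registered in the loop would fire with the values from the last iteration. A
minimal illustration of the pitfall:

callbacks = [lambda: print(i) for i in range(3)]
for cb in callbacks:
    cb()  # prints 2, 2, 2 -- each lambda sees the final value of i

callbacks = [lambda i=i: print(i) for i in range(3)]
for cb in callbacks:
    cb()  # prints 0, 1, 2 -- the default argument captures each value
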
Example #4
    def create(self, request, *args, **kwargs):
        """Initiate asyncrhonous creation of entities"""
        # pylint: disable=unused-argument
        if not request.user.has_perm("documents.change_document", self.document):
            raise exceptions.PermissionDenied(
                "You do not have permission to edit this document"
            )

        with transaction.atomic():
            # We select for update here to lock the document between checking if it is
            # processing and starting the entity extraction to ensure another
            # thread does not start processing this document before we mark it as
            # processing
            document = Document.objects.select_for_update().get(pk=self.document.pk)

            if document.processing:
                return Response(
                    {"error": "Already processing"}, status=status.HTTP_400_BAD_REQUEST
                )

            if document.entities.exists():
                return Response(
                    {"error": "Entities already created"},
                    status=status.HTTP_400_BAD_REQUEST,
                )

            document.status = Status.readable
            document.save()
            transaction.on_commit(
                lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
            )

            transaction.on_commit(lambda: extract_entities.delay(self.document.pk))

        return Response("OK")
Example #5
    def create(self, request, *args, **kwargs):
        document = self.get_object()
        serializer = self.get_serializer(data={"data": request.data})
        serializer.is_valid(raise_exception=True)

        with transaction.atomic():
            # We select for update here to lock the document between checking if it is
            # processing and starting the page modification to ensure another
            # thread does not start processing this document before we mark it as
            # processing
            document = Document.objects.select_for_update().get(pk=document.pk)

            if document.processing:
                return Response(
                    {"error": "Already processing"}, status=status.HTTP_400_BAD_REQUEST
                )

            document.status = Status.pending
            document.save()
            transaction.on_commit(
                lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
            )

        modify.delay(
            document.pk,
            document.page_count,
            document.slug,
            document.access,
            serializer.data,
        )
        return Response(serializer.data, status=status.HTTP_201_CREATED)
Example #6
def _add_asset_url(results):
    from documentcloud.documents.tasks import solr_index

    for result in results:
        # access and status should always be available, re-index if they are not
        if "access" not in result or "status" not in result:
            solr_index.delay(result["id"])
            result["asset_url"] = settings.PRIVATE_ASSET_URL
        elif result["access"] == "public" and result["status"] in (
            "success",
            "readable",
        ):
            result["asset_url"] = settings.PUBLIC_ASSET_URL
        else:
            result["asset_url"] = settings.PRIVATE_ASSET_URL
    return results
Example #7
    def create(self, request, *args, **kwargs):
        with transaction.atomic():

            document = self.get_object()

            serializer = self.get_serializer(data=request.data, many=True)
            serializer.is_valid(raise_exception=True)

            if document.processing:
                return Response(
                    {"error": "Already processing"}, status=status.HTTP_400_BAD_REQUEST
                )

            document.status = Status.pending
            # we must invalidate the cache after a redaction
            document.cache_dirty = True
            document.save()
            transaction.on_commit(
                lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
            )

        redact.delay(
            document.pk,
            document.slug,
            document.access,
            Language.get_choice(document.language).ocr_code,
            serializer.data,
        )
        return Response(serializer.data, status=status.HTTP_201_CREATED)
Example #8
 def perform_destroy(self, instance):
     """Index the note changes in Solr"""
     super().perform_destroy(instance)
     transaction.on_commit(
         lambda: solr_index.delay(
             self.kwargs["document_pk"], field_updates={"notes": "set"}
         )
     )
Example #9
 def perform_update(self, serializer):
     """Index the note changes in Solr"""
     super().perform_update(serializer)
     transaction.on_commit(
         lambda: solr_index.delay(
             self.kwargs["document_pk"], field_updates={"notes": "set"}
         )
     )
Example #10
 def perform_create(self, serializer):
     """Specify the document
     Set the status of the document to error
     """
     serializer.save(document_id=self.document.pk)
     self.document.status = Status.error
     self.document.save()
     transaction.on_commit(
         lambda: solr_index.delay(self.document.pk, field_updates={"status": "set"})
     )
Example #11
 def perform_create(self, serializer):
     """Specify the document, user and organization"""
     serializer.save(
         document_id=self.kwargs["document_pk"],
         user=self.request.user,
         organization=self.request.user.organization,
     )
     transaction.on_commit(
         lambda: solr_index.delay(
             self.kwargs["document_pk"], field_updates={"notes": "set"}
         )
     )
Example #12
    def update(self, request, pk=None, document_pk=None):
        document = self.get_object(edit=True)
        serializer = self.serializer_class(data=request.data)
        serializer.is_valid(raise_exception=True)

        # remove duplicate values
        document.data[pk] = list(set(serializer.data["values"]))
        document.save()
        transaction.on_commit(
            lambda: solr_index.delay(document.pk, field_updates={f"data_{pk}": "set"})
        )
        return Response(document.data)
Example #13
    def destroy(self, request, pk=None, document_pk=None):
        document = self.get_object(edit=True)

        if pk in document.data:
            del document.data[pk]
            document.save()
            transaction.on_commit(
                lambda: solr_index.delay(
                    document.pk, field_updates={f"data_{pk}": "set"}
                )
            )

        return Response(status=status.HTTP_204_NO_CONTENT)
Example #14
 def _process(self, document, force_ocr):
     """Process a document after you have uploaded the file"""
     transaction.on_commit(
         lambda: process.delay(
             document.pk,
             document.slug,
             document.access,
             Language.get_choice(document.language).ocr_code,
             force_ocr,
             document.original_extension,
         )
     )
     transaction.on_commit(
         lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
     )
Example #15
 def cancel_process(self, request, pk=None):
     """Cancel processing for a document"""
     # pylint: disable=unused-argument
     document = self.get_object()
     if not document.processing:
         return Response(
             {"error": "Not processing"}, status=status.HTTP_400_BAD_REQUEST
         )
     with transaction.atomic():
         document.status = Status.error
         document.save()
         transaction.on_commit(
             lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
         )
         document.errors.create(message="Processing was cancelled")
         transaction.on_commit(lambda: process_cancel.delay(document.pk))
         return Response("OK", status=status.HTTP_200_OK)
Example #16
def extract_entities(document):
    """The public entry point to the module.
    The document should be set to Status.readable before this function
    is called on it.
    Mainly a wrapper with error handling to ensure the document doesn't get stuck
    in a processing state.
    """
    from documentcloud.documents.tasks import solr_index

    try:
        _extract_entities(document)
    finally:
        with transaction.atomic():
            document.status = Status.success
            document.save()
            transaction.on_commit(
                lambda: solr_index.delay(document.pk, field_updates={"status": "set"})
            )
        logger.info("Extracting entities for %s finished", document)
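
The try/finally is the load-bearing part of this wrapper: even if
_extract_entities raises, the document is moved out of its processing status,
so it cannot get stuck; the exception itself still propagates to the caller.
The generic shape of the pattern (do_work and DONE are placeholders):

try:
    do_work(obj)       # may raise
finally:
    obj.status = DONE  # runs on success and on failure alike
    obj.save()
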
Example #17
    def _update_solr(self, document, old_processing, old_data_keys, validated_data):
        """Update solr index after updating a document"""
        # update solr index
        if old_processing and document.status == Status.success:
            # if it was processed successfully, do a full index with text
            kwargs = {"index_text": True}
        elif old_processing:
            # if it is still processing or errored, it may not be indexed yet;
            # do a full index, without text, since the text has not been processed yet
            kwargs = {"index_text": False}
        else:
            # only update the fields that were updated
            # never try to update the id
            validated_data.pop("id", None)
            data = validated_data.pop("data", None)
            if data:
                # we want to update all data keys if data is set directly,
                # including old data keys which may have been removed
                all_keys = old_data_keys | data.keys()
                for key in all_keys:
                    validated_data[f"data_{key}"] = None
            kwargs = {"field_updates": {f: "set" for f in validated_data}}

        transaction.on_commit(lambda: solr_index.delay(document.pk, **kwargs))
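
field_updates maps Solr field names to update operations ("set" presumably
mirroring Solr's atomic-update "set"), so only the named fields are rewritten
instead of re-indexing the whole document. Assuming, hypothetically, that
validated_data were {"title": ..., "source": ...}, data were {"year": [...]},
and old_data_keys were {"year", "tag"}, the else branch would produce:

# Hypothetical result of the else branch above:
kwargs = {
    "field_updates": {
        "title": "set",
        "source": "set",
        "data_year": "set",
        "data_tag": "set",
    }
}
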
Example #18
 def save_model(self, request, obj, form, change):
     super().save_model(request, obj, form, change)
     transaction.on_commit(
         lambda: solr_index.delay(
             obj.pk, field_updates={f: "set" for f in form.changed_data}
         )
     )
Example #19
def post_process(document, modification_data):
    """Post process the notes and sections for the document as specified by
    modifications
    """
    from documentcloud.documents.tasks import solr_index

    # Remove entities (no matter what)
    document.entities.all().delete()

    # (document.id, old_page) -> [(new_page, rotation), ...]
    page_map = _build_page_map(document, modification_data["modifications"])

    # load all documents, notes and sections
    # prefetch all notes and sections
    documents = Document.objects.prefetch_related("notes", "sections").filter(
        id__in=[doc_id for doc_id, _page in page_map]
    )

    # map all notes and sections from involved documents to their correct places
    # the first occurrence of a note or section from the original document may be
    # moved instead of copied
    create_notes, update_notes, delete_notes = [], [], []
    create_sections, update_sections, delete_sections = [], [], []
    for source_document in documents:
        creates, updates, deletes = _process_page_objs(
            page_map,
            document,
            source_document,
            source_document.notes.all(),
            remove_note,
        )
        create_notes.extend(creates)
        update_notes.extend(updates)
        delete_notes.extend(deletes)

        creates, updates, deletes = _process_page_objs(
            page_map,
            document,
            source_document,
            source_document.sections.all(),
            remove_section,
        )
        create_sections.extend(creates)
        update_sections.extend(updates)
        delete_sections.extend(deletes)

    _commit_db(
        Note,
        ["page_number", "x1", "y1", "x2", "y2"],
        create_notes,
        update_notes,
        delete_notes,
    )
    _commit_db(
        Section, ["page_number"], create_sections, update_sections, delete_sections
    )

    document.status = Status.success
    document.page_spec = modification_data["pagespec"]
    if "filehash" in modification_data and modification_data["filehash"]:
        document.file_hash = modification_data["filehash"]
    document.save()

    transaction.on_commit(
        lambda: solr_index.delay(
            document.pk, field_updates={"status": "set", "page_count": "set"}
        )
    )