Example #1
from typing import List

def bulk_add_documents(
    elastic_search: ElasticSearcher,
    elastic_doc: ElasticDocument,
    index: str,
    chunk_size: int,
    field_data: List[dict],
    flatten_doc=False,
):
    new_docs = apply_custom_processing(elastic_search, flatten_doc)
    actions = apply_field_changes_generator(new_docs, index, field_data)
    # refresh="wait_for" makes the call return only once the new documents are visible to search.
    elastic_doc.bulk_add_generator(actions=actions,
                                   chunk_size=chunk_size,
                                   refresh="wait_for")
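A minimal usage sketch for the helper above; the ElasticSearcher/ElasticDocument constructor arguments and the field_data contents are illustrative assumptions, not taken from the example itself.

# Hypothetical call site; constructor arguments and field_data contents
# are assumptions for illustration only.
searcher = ElasticSearcher(indices=["source_index"])
writer = ElasticDocument(index="target_index")
bulk_add_documents(
    elastic_search=searcher,
    elastic_doc=writer,
    index="target_index",
    chunk_size=100,
    field_data=[{"field": "text"}],
    flatten_doc=False,
)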
Example #2
    def post(self, request, pk: int):
        ed = ElasticDocument(index=None)

        # Validate payload and project permissions.
        serializer: InsertDocumentsSerializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        project = get_object_or_404(Project, pk=pk)
        if not project.users.filter(pk=request.user.pk).exists():
            raise PermissionDenied("You do not have permissions for this project!")

        # Split the documents based on whether they target an allowed index, a forbidden one, or lack index details altogether.
        documents = serializer.validated_data["documents"]
        split_fields = serializer.validated_data["split_text_in_fields"]
        indices = project.get_indices()

        correct_actions, failed_actions, missing_actions = self._split_documents_per_index(allowed_indices=indices, documents=documents)
        missing_actions, index_name, has_new_index = self._normalize_missing_index_values(missing_actions, project.pk, indices)
        split_actions = self._split_text(correct_actions + missing_actions, split_fields)

        if has_new_index:
            ed.core.create_index(index_name)
            ed.core.add_texta_facts_mapping(index_name)
            index, is_created = Index.objects.get_or_create(name=index_name, is_open=True)
            project.indices.add(index)

        # Send the documents to Elasticsearch.
        success_count, errors = ed.bulk_add_generator(actions=split_actions, stats_only=False)
        return Response(
            {
                "successfully_indexed": success_count,
                "errors": errors,
                "failed_index_permissions": len(failed_actions)
            }
        )
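For orientation, a sketch of a request payload and the response shape this view produces; only the top-level keys "documents" and "split_text_in_fields" and the three response keys come from the code above, the per-document bodies are assumptions.

# Illustrative payload; the document bodies are assumptions, only the
# top-level keys are read from serializer.validated_data above.
payload = {
    "documents": [
        {"text": "first document body"},
        {"text": "second document body"},
    ],
    "split_text_in_fields": ["text"],
}
# A successful POST returns a response shaped like:
# {"successfully_indexed": 2, "errors": [], "failed_index_permissions": 0}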
Example #3
def bulk_add_documents(elastic_search: ElasticSearcher, elastic_doc: ElasticDocument, index: str, chunk_size: int, flatten_doc=False):
    new_docs = apply_elastic_search(elastic_search, flatten_doc)
    actions = annotator_bulk_generator(new_docs, index)
    # refresh="wait_for" makes the call return only once the new documents are visible to search.
    elastic_doc.bulk_add_generator(actions=actions, chunk_size=chunk_size, refresh="wait_for")
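Note on the refresh argument: passing refresh="wait_for" makes the bulk call block until Elasticsearch has refreshed the index and the new documents are visible to search, whereas refresh=False (the Elasticsearch default) returns immediately and leaves the documents to become searchable on the next periodic refresh.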