Example #1
def apply_mlp_on_es_docs(self, source_and_meta_docs: List[str], mlp_id: int):
    """
    Applies MLP on documents received by previous tasks and updates them in Elasticsearch.
    :param self: Reference to the Celery Task object of this task, courtesy of the bind parameter in the decorator.
    :param source_and_meta_docs: List of Elasticsearch document IDs to pull from Elasticsearch.
    :param mlp_id: ID of the MLPObject which contains progress.
    """
    mlp_object = get_mlp_object(mlp_id)

    task_object = mlp_object.task

    # Get the necessary fields.
    field_data: List[str] = json.loads(mlp_object.fields)
    if TEXTA_TAGS_KEY not in field_data:
        # Add in existing facts so that proper duplicate filtering would be applied.
        field_data.append(TEXTA_TAGS_KEY)

    analyzers: List[str] = json.loads(mlp_object.analyzers)

    # Retrieve the documents from Elasticsearch.
    document_wrapper = ElasticDocument(index=None)
    source_and_meta_docs = document_wrapper.get_bulk(
        doc_ids=source_and_meta_docs, fields=field_data)

    source_documents = [doc["_source"] for doc in source_and_meta_docs]
    mlp_docs = apply_mlp_on_documents(source_documents, analyzers, field_data,
                                      mlp_id)
    es_documents = unite_source_with_meta(source_and_meta_docs, mlp_docs)
    update_documents_in_es(es_documents)

    # Update progress
    task_object.update_progress_iter(len(source_and_meta_docs))
    return True
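
The helpers used above (get_mlp_object, apply_mlp_on_documents, unite_source_with_meta, update_documents_in_es) are project-internal. As a rough illustration of the final write-back step only, here is a minimal sketch of a bulk partial update using the official elasticsearch Python client (7.x style); the function name and signature are hypothetical and the project's real update_documents_in_es may work differently.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


def update_documents_in_es_sketch(es_documents, es_client=None):
    # Hypothetical stand-in for the project's update_documents_in_es() helper.
    # Each item is expected to carry its own "_index", "_id" and MLP-enriched "_source".
    es_client = es_client or Elasticsearch()
    actions = [
        {
            "_op_type": "update",
            "_index": doc["_index"],
            "_id": doc["_id"],
            "doc": doc["_source"],  # partial update with the enriched fields
        }
        for doc in es_documents
    ]
    bulk(es_client, actions)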
Example #2
    @staticmethod
    def get_significant_words(indices: List[str], fields: List[str], document_ids: List[str], stop_words: List = None, exclude=""):
        """
        This is a helper function to parse all the given fields and use the document_ids
        as input to make a significant_words aggregation.
        Args:
            exclude: Regex compatible string for which words to exclude, uses the exclude parameter of Elasticsearch aggregations.
            stop_words: Optional parameter to remove stopwords from the results.
            indices: Indices from which to perform the aggregation.
            fields: Fields from which to get the text content needed for the comparison.
            document_ids: IDs of the documents you want to use as baseline for the aggregation.

        Returns: List of dictionaries with the significant word and how many times it occurs in the documents.

        """
        ed = ElasticDocument("*")
        ea = ElasticAggregator(indices=indices)

        stop_words = StopWords._get_stop_words(custom_stop_words=stop_words)
        # Validate that those documents exist.
        validated_docs: List[dict] = ed.get_bulk(document_ids)
        if validated_docs:
            unique_ids = list({doc["_id"] for doc in validated_docs})
            significant_words = []
            for field in fields:
                sw = ea.get_significant_words(document_ids=unique_ids, field=field, stop_words=stop_words, exclude=exclude)
                significant_words += sw

            return significant_words
        else:
            return []
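
For context, an aggregation like the one ElasticAggregator.get_significant_words() performs can be expressed directly against Elasticsearch as a significant_terms aggregation whose foreground set is restricted to the baseline document IDs. The sketch below is an assumption about the underlying request, not the project's actual implementation; the client call and response parsing follow the standard elasticsearch Python client.

def significant_words_sketch(es_client, indices, field, doc_ids, stop_words=(), exclude=""):
    # Hypothetical illustration of the request ElasticAggregator.get_significant_words()
    # might build; the real helper may differ.
    aggregation = {"field": field}
    if exclude:
        aggregation["exclude"] = exclude  # regex of terms to leave out of the buckets
    body = {
        "size": 0,
        "query": {"ids": {"values": doc_ids}},  # baseline documents form the foreground set
        "aggs": {"significant_words": {"significant_terms": aggregation}},
    }
    response = es_client.search(index=",".join(indices), body=body)
    buckets = response["aggregations"]["significant_words"]["buckets"]
    stop_words = set(stop_words or [])
    return [
        {"key": bucket["key"], "count": bucket["doc_count"]}
        for bucket in buckets
        if bucket["key"] not in stop_words
    ]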
Example #3
    def retrieve(self, request, *args, **kwargs):
        # API v1 to v2 compliance
        if "clustering_pk" in self.kwargs:
            topic_analyzer_pk = self.kwargs["clustering_pk"]
        elif "topic_analyzer_pk" in self.kwargs:
            topic_analyzer_pk = self.kwargs["topic_analyzer_pk"]

        queryset = Cluster.objects.filter(
            clusteringresult__project__pk=self.kwargs["project_pk"],
            clusteringresult__pk=topic_analyzer_pk)
        cluster = get_object_or_404(queryset, pk=self.kwargs["pk"])

        doc_ids = json.loads(cluster.document_ids)
        fields = json.loads(cluster.fields)
        indices = json.loads(cluster.indices)
        significant_words = json.loads(cluster.significant_words)
        display_fields = json.loads(cluster.display_fields)

        if display_fields:
            fields += display_fields

        ed = ElasticDocument(index=",".join(indices))

        documents = ed.get_bulk(doc_ids, flatten=True)
        documents = documents if documents else []
        documents = [{
            "id": doc["_id"],
            "index": doc["_index"],
            "content": doc["_source"]
        } for doc in documents]

        formatted_cluster = {
            "id": cluster.pk,
            "intracluster_similarity": cluster.intracluster_similarity,
            "document_count": cluster.get_document_count(),
            "significant_words": significant_words,
            "documents": documents
        }
        return Response(formatted_cluster)
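
The flatten=True argument passed to get_bulk() presumably collapses nested _source dictionaries into dot-notation keys so that the listed fields line up with the flat field paths stored on the cluster. The recursive helper below is only a rough illustration of that idea and is not taken from the project.

def flatten_source(source: dict, parent_key: str = "", separator: str = ".") -> dict:
    # Hypothetical dot-notation flattening, e.g.
    # {"comment": {"text": "hi", "lang": "en"}} -> {"comment.text": "hi", "comment.lang": "en"}
    flat = {}
    for key, value in source.items():
        full_key = f"{parent_key}{separator}{key}" if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_source(value, full_key, separator))
        else:
            flat[full_key] = value
    return flat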
Example #4
    def update(self, request, *args, **kwargs):
        serializer = ClusterSerializer(data=request.data, partial=True)
        serializer.is_valid()

        cluster = Cluster.objects.get(pk=kwargs["pk"])
        clustering_object = ClusteringResult.objects.get(
            pk=kwargs["topic_analyzer_pk"])

        fields = json.loads(cluster.fields)
        stop_words = json.loads(clustering_object.stop_words)
        indices = json.loads(cluster.indices)

        if "document_ids" in serializer.validated_data:
            document_ids = serializer.validated_data["document_ids"]
            ed = ElasticDocument("*")

            # Validate that those documents exist.
            validated_docs = ed.get_bulk(document_ids)
            if validated_docs:
                unique_ids = list({doc["_id"] for doc in validated_docs})
                cluster.document_ids = json.dumps(unique_ids)

                sw = Cluster.get_significant_words(indices=indices,
                                                   fields=fields,
                                                   document_ids=unique_ids,
                                                   stop_words=stop_words)
                cluster.significant_words = json.dumps(sw)

                cluster_content = ClusterContent(
                    unique_ids,
                    vectors_filepath=clustering_object.vector_model.name)
                cluster.intracluster_similarity = cluster_content.get_intracluster_similarity()
            else:
                cluster.document_ids = json.dumps([])

        cluster.save()
        return Response({"message": "Cluster has been updated successfully!"})
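
Since this is the update() handler of a DRF viewset, it is typically reached with a PUT request (and, through partial_update, a PATCH) carrying the new document_ids. A minimal client-side sketch follows; the URL layout and token are assumptions, as the actual route names depend on the project's URL configuration.

import requests

url = "http://localhost:8000/api/v2/projects/1/topic_analyzer/2/clusters/7/"  # hypothetical route
payload = {"document_ids": ["doc-id-1", "doc-id-2"]}
response = requests.put(url, json=payload, headers={"Authorization": "Token <token>"})
print(response.json())  # {"message": "Cluster has been updated successfully!"}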
Example #5
    def add_documents(self, request, *args, **kwargs):
        serializer = ClusteringIdsSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        clustering_pk = ClusterViewSet.__handle_clustering_pk(kwargs)
        clustering_obj = ClusteringResult.objects.get(pk=clustering_pk)
        cluster_obj = clustering_obj.cluster_result.get(pk=kwargs["pk"])
        indices = clustering_obj.get_indices()
        stop_words = json.loads(clustering_obj.stop_words)
        fields = json.loads(clustering_obj.fields)

        ed = ElasticDocument(indices)

        # Get the full Elasticsearch documents with id, index, type and source values.
        existing_documents: List[dict] = ed.get_bulk(
            serializer.validated_data["ids"])
        existing_ids: List[str] = [document["_id"] for document in existing_documents]

        saved_documents = json.loads(cluster_obj.document_ids)
        unique_ids = list(set(existing_ids + saved_documents))
        cluster_obj.document_ids = json.dumps(unique_ids)

        # Get the texts of the newly added documents.
        new_documents = []
        phraser = None
        new_ids = [
            doc_id for doc_id in unique_ids if doc_id not in saved_documents
        ]
        if len(new_ids) > 0:
            indices = clustering_obj.get_indices()
            stop_words = json.loads(clustering_obj.stop_words)
            ignored_ids = json.loads(clustering_obj.ignored_ids)
            fields = json.loads(clustering_obj.fields)
            document_limit = clustering_obj.document_limit
            query = {"query": {"ids": {"values": new_ids}}}

            text_processor = TextProcessor(remove_stop_words=True,
                                           custom_stop_words=stop_words)
            elastic_search = ElasticSearcher(
                indices=indices,
                query=query,
                text_processor=text_processor,
                ignore_ids=set(ignored_ids),
                output=ElasticSearcher.OUT_TEXT_WITH_ID,
                field_data=fields,
                scroll_limit=document_limit)

            for doc_id, text in elastic_search:
                new_documents.append({"id": doc_id, "text": text})

            if clustering_obj.embedding:
                embedding = clustering_obj.embedding.get_embedding()
                embedding.load_django(clustering_obj.embedding)
                phraser = embedding.phraser
            else:
                phraser = None

        # Update the similarity score since the documents were changed.
        cc = ClusterContent(doc_ids=unique_ids,
                            vectors_filepath=clustering_obj.vector_model.path)
        cluster_obj.intracluster_similarity = float(
            cc.get_intracluster_similarity(new_documents=new_documents,
                                           phraser=phraser))

        # Update the significant words since the documents were changed.
        sw = Cluster.get_significant_words(indices=indices,
                                           fields=fields,
                                           document_ids=unique_ids,
                                           stop_words=stop_words)
        cluster_obj.significant_words = json.dumps(sw)

        cluster_obj.save()
        return Response({
            "message": f"{len(new_ids)} new document(s) successfully added to the cluster!"
        })
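
ClusterContent.get_intracluster_similarity() is project-internal; one plausible reading is the mean pairwise cosine similarity of the cluster's document vectors. The numpy sketch below illustrates that interpretation only and is not the project's implementation.

import numpy as np


def intracluster_similarity_sketch(vectors) -> float:
    # Hypothetical: average cosine similarity over all unique document pairs.
    matrix = np.asarray(vectors, dtype=float)
    if len(matrix) < 2:
        return 0.0
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    normalized = matrix / np.clip(norms, 1e-12, None)
    similarities = normalized @ normalized.T
    upper = similarities[np.triu_indices(len(matrix), k=1)]  # unique pairs, no self-similarity
    return float(upper.mean())


# Example with three 2-D vectors:
print(intracluster_similarity_sketch([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]))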