Example #1
    def post(self, request, pk: int, index: str, document_id: str):
        validate_index_and_project_perms(request, pk, index)
        serializer = self.get_serializer(data=request.data)
        if serializer.is_valid(raise_exception=True):
            ed = ElasticDocument(index)
            document = ed.get(document_id, fields=[TEXTA_TAGS_KEY])
            if not document:
                raise NotFound(
                    f"Could not find document with ID '{document_id}' from index '{index}'!"
                )

            document = document.get("_source")
            target_facts = serializer.validated_data.get("facts", [])
            existing_facts = document.get(TEXTA_TAGS_KEY, [])

            # Keep only the facts that are not matched by any of the given target facts.
            new_facts = []
            for existing_fact in existing_facts:
                if not any(fact.items() <= existing_fact.items() for fact in target_facts):
                    new_facts.append(existing_fact)

            document[TEXTA_TAGS_KEY] = new_facts
            ed.update(index, document_id, doc=document)
            return Response({
                "message":
                f"Removed given facts from document with the ID of {document_id}!"
            })
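The removal logic in Example #1 relies on comparing dict items views: a target fact matches an existing fact when every key/value pair of the target is contained in it. A minimal standalone sketch of that subset check (the fact field names below are only illustrative):

# dict.items() returns a set-like view, so `<=` acts as a subset test.
target_fact = {"fact": "ORG", "str_val": "TEXTA"}
existing_facts = [
    {"fact": "ORG", "str_val": "TEXTA", "doc_path": "text"},
    {"fact": "PER", "str_val": "Jane Doe", "doc_path": "text"},
]

kept = [f for f in existing_facts if not (target_fact.items() <= f.items())]
print(kept)  # Only the PER fact remains; the ORG fact matched the target and is dropped.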
Example #2
def apply_mlp_on_es_docs(self, source_and_meta_docs: List[str], mlp_id: int):
    """
    Applies MLP on documents received by previous tasks and updates them in Elasticsearch.
    :param self: Reference to the Celery Task object of this task, courtesy of the bind parameter in the decorator.
    :param source_and_meta_docs: List of Elasticsearch document IDs to pull from Elasticsearch.
    :param mlp_id: ID of the MLPObject which contains progress.
    """
    mlp_object = get_mlp_object(mlp_id)

    task_object = mlp_object.task

    # Get the necessary fields.
    field_data: List[str] = json.loads(mlp_object.fields)
    if TEXTA_TAGS_KEY not in field_data:
        # Add in existing facts so that proper duplicate filtering would be applied.
        field_data.append(TEXTA_TAGS_KEY)

    analyzers: List[str] = json.loads(mlp_object.analyzers)

    # Retrieve the documents from Elasticsearch.
    document_wrapper = ElasticDocument(index=None)
    source_and_meta_docs = document_wrapper.get_bulk(
        doc_ids=source_and_meta_docs, fields=field_data)

    source_documents = [doc["_source"] for doc in source_and_meta_docs]
    mlp_docs = apply_mlp_on_documents(source_documents, analyzers, field_data,
                                      mlp_id)
    es_documents = unite_source_with_meta(source_and_meta_docs, mlp_docs)
    update_documents_in_es(es_documents)

    # Update progress
    task_object.update_progress_iter(len(source_and_meta_docs))
    return True
Example #3
    def post(self, request, pk: int):
        ed = ElasticDocument(index=None)

        # Validate payload and project permissions.
        serializer: InsertDocumentsSerializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        project = get_object_or_404(Project, pk=pk)
        if project.users.filter(pk=request.user.pk).exists() is False:
            raise PermissionDenied("You do not have permissions for this project!")

        # Split the documents based on whether the user has access to their target index or the index information is missing entirely.
        documents = serializer.validated_data["documents"]
        split_fields = serializer.validated_data["split_text_in_fields"]
        indices = project.get_indices()

        correct_actions, failed_actions, missing_actions = self._split_documents_per_index(allowed_indices=indices, documents=documents)
        missing_actions, index_name, has_new_index = self._normalize_missing_index_values(missing_actions, project.pk, indices)
        split_actions = self._split_text(correct_actions + missing_actions, split_fields)

        if has_new_index:
            ed.core.create_index(index_name)
            ed.core.add_texta_facts_mapping(index_name)
            index, is_created = Index.objects.get_or_create(name=index_name, is_open=True)
            project.indices.add(index)

        # Send the documents to Elasticsearch.
        success_count, errors = ed.bulk_add_generator(actions=split_actions, stats_only=False)
        return Response(
            {
                "successfully_indexed": success_count,
                "errors": errors,
                "failed_index_permissions": len(failed_actions)
            }
        )
Example #4
 def get(self, request, pk: int, index: str, document_id: str):
     validate_index_and_project_perms(request, pk, index)
     ed = ElasticDocument(index)
     document = ed.get(document_id)
     if not document:
         raise NotFound(f"Could not find document with ID '{document_id}' from index '{index}'!")
     return Response(document)
Example #5
def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step(
            'Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m")

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
Example #6
    def get_significant_words(indices: List[str], fields: List[str], document_ids: List[str], stop_words: List = None, exclude=""):
        """
        This is a helper function to parse all the given fields and use the document_ids
        as input to make a significant_words aggregation.
        Args:
            exclude: Regex compatible string for which words to exclude, uses the exclude parameter of Elasticsearch aggregations.
            stop_words: Optional parameter to remove stopwords from the results.
            indices: Indices from which to perform the aggregation.
            fields: Fields from which to pull the text content needed for comparison.
            document_ids: IDs of the documents to use as the baseline for the aggregation.

        Returns: List of dictionaries with the significant word and how many times it occurs in the documents.

        """
        ed = ElasticDocument("*")
        ea = ElasticAggregator(indices=indices)

        stop_words = StopWords._get_stop_words(custom_stop_words=stop_words)
        # Validate that those documents exist.
        validated_docs: List[dict] = ed.get_bulk(document_ids)
        if validated_docs:
            unique_ids = list(set([index["_id"] for index in validated_docs]))
            significant_words = []
            for field in fields:
                sw = ea.get_significant_words(document_ids=unique_ids, field=field, stop_words=stop_words, exclude=exclude)
                significant_words += sw

            return significant_words
        else:
            return []
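A hedged usage sketch of this helper, mirroring the way Example #28 below calls it; the index, field and document IDs are placeholders:

significant_words = Cluster.get_significant_words(
    indices=["my_index"],              # placeholder index name
    fields=["text"],                   # placeholder text field
    document_ids=["doc-1", "doc-2"],   # placeholder baseline document IDs
    stop_words=["the", "and"],
    exclude="[0-9]+",                  # passed to the aggregation's exclude parameter
)
# Per the docstring, each returned entry pairs a significant word with its occurrence count.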
Example #7
def update_documents_in_es(documents: List[dict]):
    """
    Updates the documents inside Elasticsearch, either with the MLP results or the
    error messages.

    :param documents: Full Elasticsearch documents.
    """
    ed = ElasticDocument(index=None)
    ed.bulk_update(actions=documents)
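The documents handed to this function are already shaped as bulk update actions; Example #23 below shows how they are built. A minimal sketch of one such action, with placeholder index, ID and field values:

action = {
    "_id": "abc123",             # placeholder document ID
    "_index": "my_index",        # placeholder index name
    "_type": "_doc",
    "_op_type": "update",
    "doc": {"texta_facts": []},  # partial document containing the fields to overwrite
}
# update_documents_in_es([action]) then pushes it through ed.bulk_update(...).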
Example #8
    def delete(self, request, pk: int, index: str, document_id: str):
        validate_index_and_project_perms(request, pk, index)

        try:
            ed = ElasticDocument(index)
            document = ed.delete(doc_id=document_id)
            return Response(document)
        except texta_elastic.exceptions.NotFoundError:
            return Response(status=status.HTTP_404_NOT_FOUND)
Example #9
def tag_cluster(self, cluster_pk: int, clustering_object_pk: int, fact: dict):
    ed = ElasticDocument("")
    cluster = Cluster.objects.get(pk=cluster_pk)
    clustering_object = ClusteringResult.objects.get(pk=clustering_object_pk)
    doc_ids = json.loads(cluster.document_ids)
    ignored_ids = json.loads(clustering_object.ignored_ids)
    ed.add_fact_to_documents(fact=fact, doc_ids=doc_ids)
    clustering_object.ignored_ids = json.dumps(doc_ids + ignored_ids)
    clustering_object.save()
    return True
Example #10
    def patch(self, request, pk: int, index: str, document_id: str):
        validate_index_and_project_perms(request, pk, index)

        try:
            ed = ElasticDocument(index)
            document = ed.update(index=index, doc_id=document_id, doc=request.data)
            return Response(document)
        except elasticsearch.exceptions.RequestError as e:
            if e.error == "mapper_parsing_exception":  # TODO Extend the decorator with different variants of the request error instead.
                return Response(e.info["error"]["root_cause"], status=status.HTTP_400_BAD_REQUEST)
        except texta_elastic.exceptions.NotFoundError:
            return Response(status=status.HTTP_404_NOT_FOUND)
Example #11
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # retrieve fields
        field_data = fields

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data +
            ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size)
        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
Example #12
def bulk_add_documents(
    elastic_search: ElasticSearcher,
    elastic_doc: ElasticDocument,
    index: str,
    chunk_size: int,
    field_data: List[dict],
    flatten_doc=False,
):
    new_docs = apply_custom_processing(elastic_search, flatten_doc)
    actions = apply_field_changes_generator(new_docs, index, field_data)
    # Wait for the indexing to actualize before continuing, hence refresh="wait_for".
    elastic_doc.bulk_add_generator(actions=actions,
                                   chunk_size=chunk_size,
                                   refresh="wait_for")
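The actions consumed by bulk_add_generator are plain indexing actions for the Elasticsearch bulk helper; a hedged sketch of what one generated action could look like (the exact shape depends on apply_field_changes_generator, and the index and field names are placeholders):

action = {
    "_index": "reindexed_index",                 # placeholder target index
    "_source": {"renamed_field": "some value"},  # placeholder document body after the field changes
}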
Example #13
    def pull_document_by_id(self, request, pk=None, project_pk=None):
        annotator: Annotator = self.get_object()
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        ed = ElasticDocument(index=annotator.get_indices())
        document_id = serializer.validated_data["document_id"]
        document = ed.get(document_id)
        if document:
            document = self._process_document_output(document, annotator)
            return Response(document)
        else:
            return Response({"message": "No such document!"},
                            status=status.HTTP_404_NOT_FOUND)
Example #14
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields +
            ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Example #15
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=fields,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout=scroll_timeout)

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(generator=searcher,
                                           worker=worker_object,
                                           detect_lang=detect_lang,
                                           snowball_language=snowball_language,
                                           fields_to_parse=fields,
                                           analyzers=analyzers,
                                           tokenizer=tokenizer,
                                           strip_html=strip_html)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()

        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example #16
    def import_dataset(self) -> list:
        error_container = []
        # retrieve content from file
        success, file_content = self._get_file_content()
        file_content = file_content.dropna(how="all")

        # check if file was parsed
        if not success:
            error_container.append('unknown file type')
            return error_container

        # convert content to list of records (dicts)
        records = file_content.to_dict(orient='records')
        # set num_records
        self.num_records = len(records)
        # set total number of documents for progress
        if self.show_progress:
            self.show_progress.set_total(self.num_records)

        # add documents to ES
        es_doc = ElasticDocument(self.index)
        # create index
        es_doc.core.create_index(self.index)
        # add mapping for texta facts
        es_doc.core.add_texta_facts_mapping(self.index)
        # get records
        chunk_size = 500
        records = [{
            k: v
            for k, v in record.items() if pd.Series(v).notna().all()
        } for record in records]
        record_chunks = list(chunks(records, chunk_size))

        for documents in record_chunks:
            success, errors = es_doc.bulk_add(documents,
                                              chunk_size=chunk_size,
                                              stats_only=False,
                                              raise_on_error=False)
            self.num_records_success += success
            if self.show_progress:
                self.show_progress.update(success)

            for error in list(errors):
                message = error["index"]["error"]["reason"] if isinstance(
                    error, dict) else str(error)
                error_container.append(message)

        return error_container
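The dict comprehension above strips NaN-valued cells from each record before indexing; a small self-contained sketch of that filter:

import pandas as pd

record = {"title": "Report", "year": 2021, "summary": float("nan")}

# pd.Series(v).notna().all() is False when the value is NaN, so such keys are dropped.
cleaned = {k: v for k, v in record.items() if pd.Series(v).notna().all()}
print(cleaned)  # {'title': 'Report', 'year': 2021}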
Example #17
    def multitag_docs(self, request, pk=None, project_pk=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        # filter tagger groups present in project
        project_object = Project.objects.get(id=project_pk)
        regex_taggers_groups = RegexTaggerGroup.objects.filter(
            project=project_object)
        # filter again based on serializer
        if serializer.validated_data['tagger_groups']:
            regex_taggers_groups = regex_taggers_groups.filter(
                pk__in=serializer.validated_data['tagger_groups'])

        docs = serializer.validated_data["docs"]
        fields = serializer.validated_data["fields"]

        # apply taggers
        result = []
        for regex_tagger_group in regex_taggers_groups:
            matches = regex_tagger_group.tag_docs(fields, docs)
            result.extend(matches)

        result = ElasticDocument.remove_duplicate_facts(result)

        return Response(result, status=status.HTTP_200_OK)
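ElasticDocument.remove_duplicate_facts collapses identical fact dictionaries gathered from the different tagger groups. A rough, hedged equivalent of that kind of deduplication (not the library's actual implementation), for illustration:

import json

def deduplicate_facts(facts):
    # Serialise each fact with sorted keys so identical facts compare equal,
    # then keep only the first occurrence of each.
    seen, unique = set(), []
    for fact in facts:
        key = json.dumps(fact, sort_keys=True)
        if key not in seen:
            seen.add(key)
            unique.append(fact)
    return unique

facts = [{"fact": "ORG", "str_val": "TEXTA"}, {"fact": "ORG", "str_val": "TEXTA"}]
print(deduplicate_facts(facts))  # [{'fact': 'ORG', 'str_val': 'TEXTA'}]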
Example #18
 def _initialize_es(self, project_pk, text_processor, callback_progress,
                    prediction_to_match):
     # create es doc
     es_doc = ElasticDocument(self.feedback_index)
     # if no model objects, return nones for query and search
     if not self.model_object:
         return es_doc, None, None
     # create matching query
     query = Query()
     query.add_string_filter(query_string=self.model_object.MODEL_TYPE,
                             fields=["model_type"])
     if self.model_object:
         query.add_string_filter(query_string=str(self.model_object.pk),
                                 fields=["model_id"])
     if prediction_to_match:
         query.add_string_filter(query_string=prediction_to_match,
                                 fields=["correct_result"])
     # if no index, don't create searcher object
     if not self.check_index_exists():
         return es_doc, None, query.query
     # create es search
     es_search = ElasticSearcher(indices=self.feedback_index,
                                 query=query.query,
                                 text_processor=text_processor,
                                 output=ElasticSearcher.OUT_DOC_WITH_ID,
                                 callback_progress=callback_progress)
     # return objects
     return es_doc, es_search, query.query
Example #19
 def _get_sample_document(self, id_field: str, id_value: str, index: str):
     query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
     es = ElasticSearcher(query=query, output=ElasticSearcher.OUT_RAW)
     ed = ElasticDocument(index=index)
     response = es.search()["hits"]["hits"]
     document = response[0] if response else None
     return ed, document
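The Search/Q combination above compiles into a plain term query against the field's keyword sub-field. A short sketch of the resulting dictionary, assuming Search and Q come from the elasticsearch_dsl package (field name and value are placeholders):

from elasticsearch_dsl import Q, Search

query = Search().query(Q("term", **{"uuid.keyword": "abc-123"})).to_dict()
print(query)  # {'query': {'term': {'uuid.keyword': 'abc-123'}}}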
Example #20
    def setUp(self):
        self.user = create_test_user('user', '*****@*****.**', 'pw')
        self.new_test_index_name = f"ttk_test_query_tagger_{uuid.uuid4().hex[:5]}"
        self.project = project_creation("SearchQueryTaggerTestProject", self.new_test_index_name, self.user)
        self.project.users.add(self.user)
        self.url = reverse("v2:search_query_tagger-list", kwargs={"project_pk": self.project.pk})

        self.uuid = "adasda-5874856a-das4das98f5"
        self.document = {
            "Field_1": "This is sentence1. This is sentence2. This is sentence3. This is sentence4. This is sentence5.",
            "Field_2": "This is a different sentence.",
            "Field_3": "This is test data.",
            "uuid": self.uuid}

        self.ed = ElasticDocument(index=self.new_test_index_name)

        self.ed.add(self.document)
        self.client.login(username='******', password='******')
Example #21
    def count_indices(self, request, pk=None, project_pk=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid()

        indices = [{"name": name} for name in serializer.validated_data.get("indices", [])]
        serializer = IndexSerializer(data=indices, many=True)
        serializer.is_valid(raise_exception=True)

        project: Project = self.get_object()
        ed = ElasticDocument(None)

        indices = [index["name"] for index in indices]
        if indices:
            # We check for indices beforehand to prevent the default behaviour of picking all the indices in the project.
            indices = project.get_available_or_all_project_indices(indices)
            count = ed.count(indices=indices)
            return Response(count)
        else:
            return Response(0)
Example #22
    def post(self, request, project_pk: int):
        project: Project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)

        serializer = ProjectDocumentSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
        if not indices:
            raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

        doc_id = serializer.validated_data["doc_id"]
        if not doc_id:
            raise InvalidInputDocument(detail="No doc_id supplied")

        es = ElasticDocument(index=indices)
        results = es.get(doc_id)
        return Response(results, status=status.HTTP_200_OK)
Example #23
def process_mlp_actions(generator: ElasticSearcher, analyzers: List[str],
                        field_data: List[str], mlp_class: MLP, mlp_id: int):
    """
    ElasticSearcher returns a list of 100 RAW elastic documents.
    Since MLP needs a raw document to process, we need to memorize the index of the document in question
    so that we can later fetch its metadata for the Bulk generator.
    """
    counter = 0
    info_logger = logging.getLogger(INFO_LOGGER)

    info_logger.info(
        f"Starting the processing of indices for MLP worker with ID of {mlp_id}!"
    )
    for document_batch in generator:
        document_sources = [dict(hit["_source"]) for hit in document_batch]
        mlp_processed = mlp_class.process_docs(document_sources,
                                               analyzers=analyzers,
                                               doc_paths=field_data)

        for index, mlp_processed_document in enumerate(mlp_processed):
            original_elastic_document = document_batch[index]

            # Make sure that existing facts in the document and new ones don't overlap.
            original_facts = original_elastic_document["_source"].get(
                "texta_facts", [])
            new_facts = mlp_processed_document.get("texta_facts", [])
            total_facts = [fact for fact in original_facts + new_facts if fact]
            unique_facts = ElasticDocument.remove_duplicate_facts(total_facts)

            elastic_update_body = {
                "_id": original_elastic_document["_id"],
                "_index": original_elastic_document["_index"],
                "_type": original_elastic_document.get("_type", "_doc"),
                "_op_type": "update",
                "doc": {
                    **mlp_processed_document,
                    **{
                        "texta_facts": unique_facts
                    }
                }
            }

            yield elastic_update_body

            counter += 1
            progress = generator.callback_progress
            if counter % generator.scroll_size == 0:
                info_logger.info(
                    f"Progress on applying MLP for worker with id: {mlp_id} at {counter} out of {progress.n_total} documents!"
                )
            elif counter == progress.n_total:
                info_logger.info(
                    f"Finished applying MLP for worker with id: {mlp_id} at {counter}/{progress.n_total} documents!"
                )
Example #24
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=[field],
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="15m")

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher,
                                       field=field,
                                       worker_id=apply_worker_id,
                                       mlp_class=mlp)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example #25
def apply_summarizer_on_index(self, summarizer_id: int):
    summarizer_object = Summarizer.objects.get(pk=summarizer_id)
    task_object = summarizer_object.task
    try:
        load_sumy()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling summarizer')

        # Get the necessary fields.
        indices: List[str] = summarizer_object.get_indices()
        field_data: List[str] = json.loads(summarizer_object.fields)
        ratio_data: float = summarizer_object.ratio
        algorithm_data: List[str] = summarizer_object.algorithm

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(summarizer_object.query),
                                   indices=indices,
                                   field_data=field_data,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="30m")

        actions = process_actions(searcher,
                                  field_data,
                                  ratio_data,
                                  algorithm=algorithm_data,
                                  summarizer_class=sumy,
                                  summarizer_id=summarizer_id)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return summarizer_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Example #26
def update_generator(generator: ElasticSearcher,
                     ec: ElasticCore,
                     fields: List[str],
                     fact_name: str,
                     fact_value: str,
                     tagger_object: BertTaggerObject,
                     tagger: BertTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Applying BERT Tagger with ID {tagger_object.id} to batch {i + 1}..."
        )
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):

                    result = tagger_object.apply_loaded_tagger(
                        tagger, text, input_type="text", feedback=False)

                    # If tagger is binary and fact value is not specified by the user, use tagger description as fact value
                    if result["result"] in ["true", "false"]:
                        if not fact_value:
                            fact_value = tagger_object.description

                    # For multitag, use the prediction as fact value
                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(result, field, fact_name,
                                               fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
Example #27
    def retrieve(self, request, *args, **kwargs):
        # API v1 to v2 compliance
        if "clustering_pk" in self.kwargs:
            topic_analyzer_pk = self.kwargs["clustering_pk"]
        elif "topic_analyzer_pk" in self.kwargs:
            topic_analyzer_pk = self.kwargs["topic_analyzer_pk"]

        queryset = Cluster.objects.filter(
            clusteringresult__project__pk=self.kwargs["project_pk"],
            clusteringresult__pk=topic_analyzer_pk)
        cluster = get_object_or_404(queryset, pk=self.kwargs["pk"])

        doc_ids = json.loads(cluster.document_ids)
        fields = json.loads(cluster.fields)
        indices = json.loads(cluster.indices)
        significant_words = json.loads(cluster.significant_words)
        display_fields = json.loads(cluster.display_fields)

        if display_fields:
            fields += display_fields

        ed = ElasticDocument(index=",".join(indices))

        documents = ed.get_bulk(doc_ids, flatten=True)
        documents = documents if documents else []
        documents = [{
            "id": doc["_id"],
            "index": doc["_index"],
            "content": doc["_source"]
        } for doc in documents]

        formatted_cluster = {
            "id": cluster.pk,
            "intracluster_similarity": cluster.intracluster_similarity,
            "document_count": cluster.get_document_count(),
            "significant_words": significant_words,
            "documents": documents
        }
        return Response(formatted_cluster)
Example #28
    def update(self, request, *args, **kwargs):
        serializer = ClusterSerializer(data=request.data, partial=True)
        serializer.is_valid()

        cluster = Cluster.objects.get(pk=kwargs["pk"])
        clustering_object = ClusteringResult.objects.get(
            pk=kwargs["topic_analyzer_pk"])

        fields = json.loads(cluster.fields)
        stop_words = json.loads(clustering_object.stop_words)
        indices = json.loads(cluster.indices)

        if "document_ids" in serializer.validated_data:
            document_ids = serializer.validated_data["document_ids"]
            ed = ElasticDocument("*")

            # Validate that those documents exist.
            validated_docs = ed.get_bulk(document_ids)
            if validated_docs:
                unique_ids = list(
                    set([index["_id"] for index in validated_docs]))
                cluster.document_ids = json.dumps(unique_ids)

                sw = Cluster.get_significant_words(indices=indices,
                                                   fields=fields,
                                                   document_ids=unique_ids,
                                                   stop_words=stop_words)
                cluster.significant_words = json.dumps(sw)

                cluster_content = ClusterContent(
                    unique_ids,
                    vectors_filepath=clustering_object.vector_model.name)
                cluster.intracluster_similarity = cluster_content.get_intracluster_similarity(
                )
            else:
                cluster.document_ids = json.dumps([])

        cluster.save()
        return Response({"message": "Cluster has been updated successfully!"})
Example #29
    def skip_document(self, request, pk=None, project_pk=None):
        serializer: DocumentIDSerializer = self.get_serializer(
            data=request.data)
        serializer.is_valid(raise_exception=True)
        annotator: Annotator = self.get_object()

        ed = ElasticDocument(index=annotator.get_indices())
        document_id = serializer.validated_data["document_id"]
        document = ed.get(document_id)
        texta_annotations = document["_source"].get("texta_annotator", [])

        processed_timestamp = None
        if texta_annotations:
            for texta_annotation in texta_annotations:
                processed_timestamp = texta_annotation.get(
                    "processed_timestamp_utc", None)

                if processed_timestamp:
                    return Response({
                        "detail":
                        f"Document with ID: {serializer.validated_data['document_id']} is already annotated"
                    })

        annotator.skip_document(serializer.validated_data["document_id"],
                                serializer.validated_data["index"],
                                user=request.user)
        return Response({
            "detail":
            f"Skipped document with ID: {serializer.validated_data['document_id']}"
        })
Example #30
    def setUp(self):
        self.user = create_test_user('user', '*****@*****.**', 'pw')
        self.index_uuid = uuid.uuid4().hex[:5]
        self.new_test_index_name = f"ttk_test_fields_tagger_{self.index_uuid}"

        self.ed = ElasticDocument(index=self.new_test_index_name)
        self.ed.core.es.indices.create(index=self.new_test_index_name, ignore=[400, 404])

        self.project = project_creation("SearchFieldsTaggerTestProject", self.new_test_index_name, self.user)
        self.project.users.add(self.user)
        self.url = reverse(f"{VERSION_NAMESPACE}:search_fields_tagger-list", kwargs={"project_pk": self.project.pk})

        self.uuid = uuid.uuid4().hex[:10]
        self.document = {
            "Field_1": "This is sentence1. This is sentence2. This is sentence3. This is sentence4. This is sentence5.",
            "Field_2": "This is a different sentence.",
            "Field_3": "This is test data.",
            "newline_break": "olgu\nõnnistatud\npüha\nkäsikranaat",
            "array_break": ["olgu", "õnnistatud", "püha", "käsikranaat"],
            "uuid": self.uuid
        }

        self.ed.add(self.document)
        self.client.login(username='******', password='******')