Example 1
    def multitag_docs(self, request, pk=None, project_pk=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        # filter tagger groups present in project
        project_object = Project.objects.get(id=project_pk)
        regex_taggers_groups = RegexTaggerGroup.objects.filter(
            project=project_object)
        # filter again based on serializer
        if serializer.validated_data['tagger_groups']:
            regex_taggers_groups = regex_taggers_groups.filter(
                pk__in=serializer.validated_data['tagger_groups'])

        docs = serializer.validated_data["docs"]
        fields = serializer.validated_data["fields"]

        # apply taggers
        result = []
        for regex_tagger_group in regex_taggers_groups:
            matches = regex_tagger_group.tag_docs(fields, docs)
            result.extend(matches)

        result = ElasticDocument.remove_duplicate_facts(result)

        return Response(result, status=status.HTTP_200_OK)
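The view above depends on a serializer exposing `tagger_groups`, `docs`, and `fields` keys. A minimal DRF sketch consistent with the `validated_data` lookups in the code (the serializer name and field types are assumptions, not the project's actual definitions):

from rest_framework import serializers

# Hypothetical serializer: the field names mirror the validated_data keys used
# above, but the exact types and validation rules are assumptions.
class MultitagDocsSerializer(serializers.Serializer):
    tagger_groups = serializers.ListField(
        child=serializers.IntegerField(), default=list)
    docs = serializers.ListField(child=serializers.DictField())
    fields = serializers.ListField(child=serializers.CharField())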
Example 2
def process_mlp_actions(generator: ElasticSearcher, analyzers: List[str],
                        field_data: List[str], mlp_class: MLP, mlp_id: int):
    """
    ElasticSearcher returns a list of 100 raw Elastic documents.
    Since MLP needs a raw document to process, we need to memorize the index of each document
    so that we can later fetch its metadata for the Bulk generator.
    """
    counter = 0
    info_logger = logging.getLogger(INFO_LOGGER)

    info_logger.info(
        f"Starting the processing of indices for MLP worker with ID of {mlp_id}!"
    )
    for document_batch in generator:
        document_sources = [dict(hit["_source"]) for hit in document_batch]
        mlp_processed = mlp_class.process_docs(document_sources,
                                               analyzers=analyzers,
                                               doc_paths=field_data)

        for index, mlp_processed_document in enumerate(mlp_processed):
            original_elastic_document = document_batch[index]

            # Make sure that existing facts in the document and new ones don't overlap.
            original_facts = original_elastic_document["_source"].get(
                "texta_facts", [])
            new_facts = mlp_processed_document.get("texta_facts", [])
            total_facts = [fact for fact in original_facts + new_facts if fact]
            unique_facts = ElasticDocument.remove_duplicate_facts(total_facts)

            elastic_update_body = {
                "_id": original_elastic_document["_id"],
                "_index": original_elastic_document["_index"],
                "_type": original_elastic_document.get("_type", "_doc"),
                "_op_type": "update",
                "doc": {
                    **mlp_processed_document,
                    **{
                        "texta_facts": unique_facts
                    }
                }
            }

            yield elastic_update_body

            counter += 1
            progress = generator.callback_progress
            if counter % generator.scroll_size == 0:
                info_logger.info(
                    f"Progress on applying MLP for worker with id: {mlp_id} at {counter} out of {progress.n_total} documents!"
                )
            elif counter == progress.n_total:
                info_logger.info(
                    f"Finished applying MLP for worker with id: {mlp_id} at {counter}/{progress.n_total} documents!"
                )
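The dictionaries yielded above are standard bulk update actions (`_op_type` of `update` plus a `doc` body), so a generator like this is typically handed to the elasticsearch-py bulk helpers. A minimal consumption sketch, assuming an `Elasticsearch` client and reusing the arguments from the function above (the connection details are an assumption):

import logging

from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch("http://localhost:9200")  # assumed connection details

# Feed the update actions into Elasticsearch; streaming_bulk yields an
# (ok, item) tuple per action so individual failures can be logged.
actions = process_mlp_actions(generator, analyzers, field_data, mlp_class, mlp_id)
for ok, item in streaming_bulk(es, actions, chunk_size=100, raise_on_error=False):
    if not ok:
        logging.getLogger(INFO_LOGGER).error(f"Bulk update failed: {item}")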
Example 3
def update_generator(generator: ElasticSearcher,
                     ec: ElasticCore,
                     fields: List[str],
                     fact_name: str,
                     fact_value: str,
                     tagger_object: BertTaggerObject,
                     tagger: BertTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Appyling BERT Tagger with ID {tagger_object.id} to batch {i + 1}..."
        )
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):

                    result = tagger_object.apply_loaded_tagger(
                        tagger, text, input_type="text", feedback=False)

                    # If tagger is binary and fact value is not specified by the user, use tagger description as fact value
                    if result["result"] in ["true", "false"]:
                        if not fact_value:
                            fact_value = tagger_object.description

                    # For multitag, use the prediction as fact value
                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(result, field, fact_name,
                                               fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
Example 4
def update_search_query_generator(generator: ElasticSearcher, ec: ElasticCore,
                                  fields: List[str], fact_name: str,
                                  fact_value: str,
                                  tagger_object: SearchQueryTagger):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Appyling Search Query Tagger with ID {tagger_object.id}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):

                    result = {
                        'tagger_id': tagger_object.id,
                        'result': tagger_object.fact_name
                    }

                    if result["result"]:
                        if not fact_value:
                            fact_value = tagger_object.description

                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(field, fact_name, fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                },
                "retry_on_conflict": 3
            }
Example 5
    def tag_docs(self, fields: List[str], docs: List[dict]):
        # apply tagger
        for doc in docs:
            for field in fields:
                flattened_doc = ElasticCore(
                    check_connection=False).flatten(doc)
                text = flattened_doc.get(field, None)
                matches_as_facts = self.match_texts([text],
                                                    as_texta_facts=True,
                                                    field=field)
                for fact in matches_as_facts:
                    fact.update(fact=self.description)

                pre_existing_facts = doc.get(TEXTA_TAGS_KEY, [])
                filtered_facts = ElasticDocument.remove_duplicate_facts(
                    pre_existing_facts + matches_as_facts)
                doc[TEXTA_TAGS_KEY] = filtered_facts

        return docs
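A hypothetical invocation of `tag_docs`, showing the expected shapes of the `fields` and `docs` arguments (the tagger group instance, document content, and field name below are illustrative only):

# Hypothetical usage: tagger_group is assumed to be a loaded RegexTaggerGroup
# instance; the document and field name are made up for illustration.
docs = [{"comment": "Server crashed again after the update.", "texta_facts": []}]
tagged = tagger_group.tag_docs(fields=["comment"], docs=docs)
# Each returned doc now carries its deduplicated matches under TEXTA_TAGS_KEY.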
Example 6
def update_search_fields_generator(
        generator: ElasticSearcher, ec: ElasticCore, fields: List[str],
        fact_name: str, search_field_tagger_object: SearchFieldsTagger,
        use_breakup: bool, breakup_character: str):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Applying Search Fields Tagger with ID {search_field_tagger_object.id}..."
        )
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                field_content = flat_hit.get(field, None)
                processed_content = handle_field_content(
                    field_content, breakup_character, use_breakup)

                for content in processed_content:
                    new_facts = to_texta_facts(field,
                                               fact_name,
                                               fact_value=content)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
Example 7
    def post(self, request, pk: int, index: str, document_id: str):
        validate_index_and_project_perms(request, pk, index)
        serializer = self.get_serializer(data=request.data)
        if serializer.is_valid(raise_exception=True):
            ed = ElasticDocument(index)
            document = ed.get(document_id, fields=[TEXTA_TAGS_KEY])
            if not document:
                raise NotFound(
                    f"Could not find document with ID '{document_id}' from index '{index}'!"
                )

            document = document.get("_source")
            facts = serializer.validated_data.get("facts", [])
            existing_facts = document.get(TEXTA_TAGS_KEY, [])
            new_facts = ed.remove_duplicate_facts(facts + existing_facts)

            document[TEXTA_TAGS_KEY] = new_facts
            ed.update(index, document_id, doc=document)
            return Response({
                "message":
                f"Added given facts from document with the ID of {document_id}!"
            })
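A sketch of calling this endpoint with the `requests` library. The URL path, token, and fact payload are assumptions inferred from the view signature (`pk`, `index`, `document_id`) and the `facts` key it reads, not the project's documented API:

import requests

# Hypothetical endpoint path and fact payload; only the "facts" key is taken
# from the view above, everything else is illustrative.
url = "http://localhost:8000/api/projects/1/elastic/documents/my_index/doc_42/facts/"
payload = {"facts": [{"fact": "TOPIC", "str_val": "hardware", "doc_path": "comment"}]}
response = requests.post(url, json=payload, headers={"Authorization": "Token <token>"})
print(response.json())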
Example 8
def update_generator(keyword_detector: RakunDetectorWrapper,
                     generator: ElasticSearcher, ec: ElasticCore,
                     fields: List[str], rakun_extractor_object: RakunExtractor,
                     fact_name: str, fact_value: str, add_spans: bool):
    for scroll_batch in generator:
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    results = rakun_extractor_object.get_rakun_keywords(
                        keyword_detector=keyword_detector,
                        texts=[text],
                        field_path=field,
                        fact_name=fact_name,
                        fact_value=fact_value,
                        add_spans=add_spans)
                    existing_facts.extend(results)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
Example 9
def update_generator(generator: ElasticSearcher,
                     ec: ElasticCore,
                     mlp_fields: List[str],
                     label_suffix: str,
                     object_id: int,
                     extractor: CRFExtractor = None):
    """
    Tags & updates documents in ES.
    """
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Appyling CRFExtractor with ID {object_id} to batch {i + 1}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            existing_facts = hit.get("texta_facts", [])
            for mlp_field in mlp_fields:
                new_facts = extractor.tag(
                    hit, field_name=mlp_field,
                    label_suffix=label_suffix)["texta_facts"]
                if new_facts:
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
Example 10
def update_generator(generator: ElasticSearcher,
                     ec: ElasticCore,
                     fields: List[str],
                     fact_name: str,
                     fact_value: str,
                     max_tags: int,
                     object_id: int,
                     object_type: str,
                     tagger_object: Union[Tagger, TaggerGroup],
                     object_args: Dict,
                     tagger: TextTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(
            f"Appyling {object_type} with ID {object_id} to batch {i + 1}...")
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    if object_type == "tagger":
                        result = tagger_object.apply_loaded_tagger(
                            tagger, text, input_type="text", feedback=None)
                        if result:
                            tags = [result]
                        else:
                            tags = []
                    else:
                        # update text and tags with MLP
                        combined_texts, ner_tags = get_mlp(
                            object_id, [text],
                            lemmatize=object_args["lemmatize"],
                            use_ner=object_args["use_ner"])
                        # retrieve tag candidates
                        tag_candidates = get_tag_candidates(
                            object_id, [text],
                            ignore_tags=ner_tags,
                            n_similar_docs=object_args["n_similar_docs"],
                            max_candidates=object_args["n_candidate_tags"])
                        # get tags (sorted by probability in descending order)
                        tagger_group_tags = apply_tagger_group(
                            object_id,
                            text,
                            tag_candidates,
                            request=None,
                            input_type='text',
                            lemmatize=object_args["lemmatize"],
                            feedback=False,
                            use_async=False)
                        # take only `max_tags` first tags
                        tags = ner_tags + tagger_group_tags[:max_tags]

                    new_facts = to_texta_fact(tags, field, fact_name,
                                              fact_value)
                    if new_facts:
                        existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(
                    existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {
                    "doc": {
                        "texta_facts": existing_facts
                    }
                }
            }
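Every example above funnels its facts through `ElasticDocument.remove_duplicate_facts` before writing back. A minimal sketch of what such a deduplication helper might look like, assuming facts are plain dicts; this illustrates the pattern, not the library's actual implementation:

import json
from typing import List

def remove_duplicate_facts(facts: List[dict]) -> List[dict]:
    """Drop exact duplicate facts while preserving the original order."""
    seen = set()
    unique = []
    for fact in facts:
        # Serialize with sorted keys so dicts with the same content but
        # different key order hash identically.
        key = json.dumps(fact, sort_keys=True)
        if key not in seen:
            seen.add(key)
            unique.append(fact)
    return unique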