def multitag_docs(self, request, pk=None, project_pk=None):
    serializer = self.get_serializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    # Filter tagger groups present in the project.
    project_object = Project.objects.get(id=project_pk)
    regex_taggers_groups = RegexTaggerGroup.objects.filter(project=project_object)

    # Filter again based on the serializer input.
    if serializer.validated_data['tagger_groups']:
        regex_taggers_groups = regex_taggers_groups.filter(pk__in=serializer.validated_data['tagger_groups'])

    docs = serializer.validated_data["docs"]
    fields = serializer.validated_data["fields"]

    # Apply the taggers.
    result = []
    for regex_tagger_group in regex_taggers_groups:
        matches = regex_tagger_group.tag_docs(fields, docs)
        result.extend(matches)

    result = ElasticDocument.remove_duplicate_facts(result)
    return Response(result, status=status.HTTP_200_OK)
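# A minimal sketch of the request body this action expects, assuming the
# serializer validates "tagger_groups", "docs" and "fields" as used above.
# The IDs and document content below are hypothetical:
#
#   {
#       "tagger_groups": [1, 2],                          # optional: restrict to these RegexTaggerGroup IDs
#       "docs": [{"text": "Barack Obama visited Tallinn."}],
#       "fields": ["text"]                                # document paths the taggers are matched against
#   }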
def process_mlp_actions(generator: ElasticSearcher, analyzers: List[str], field_data: List[str], mlp_class: MLP, mlp_id: int):
    """
    ElasticSearcher returns a list of 100 RAW Elastic documents.
    Since MLP needs a raw document to process, we need to memorize the index of the
    document in question so that we can later fetch its metadata for the Bulk generator.
    """
    counter = 0
    info_logger = logging.getLogger(INFO_LOGGER)
    info_logger.info(f"Starting the processing of indices for MLP worker with ID of {mlp_id}!")

    for document_batch in generator:
        document_sources = [dict(hit["_source"]) for hit in document_batch]
        mlp_processed = mlp_class.process_docs(document_sources, analyzers=analyzers, doc_paths=field_data)

        for index, mlp_processed_document in enumerate(mlp_processed):
            original_elastic_document = document_batch[index]

            # Make sure that existing facts in the document and new ones don't overlap.
            original_facts = original_elastic_document["_source"].get("texta_facts", [])
            new_facts = mlp_processed_document.get("texta_facts", [])
            total_facts = [fact for fact in original_facts + new_facts if fact]
            unique_facts = ElasticDocument.remove_duplicate_facts(total_facts)

            elastic_update_body = {
                "_id": original_elastic_document["_id"],
                "_index": original_elastic_document["_index"],
                "_type": original_elastic_document.get("_type", "_doc"),
                "_op_type": "update",
                "doc": {**mlp_processed_document, **{"texta_facts": unique_facts}}
            }

            yield elastic_update_body

            counter += 1
            progress = generator.callback_progress
            if counter % generator.scroll_size == 0:
                info_logger.info(f"Progress on applying MLP for worker with ID {mlp_id}: {counter} out of {progress.n_total} documents!")
            elif counter == progress.n_total:
                info_logger.info(f"Finished applying MLP for worker with ID {mlp_id}: {counter}/{progress.n_total} documents!")
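# A minimal consumption sketch, assuming an elasticsearch-py client is available.
# The connection URL and chunk size are illustrative assumptions; in production
# the generator would be handed to the project's own bulk wrapper instead.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


def run_mlp_worker_example(generator, analyzers, field_data, mlp, mlp_id):
    es = Elasticsearch("http://localhost:9200")  # assumed connection URL
    actions = process_mlp_actions(generator, analyzers, field_data, mlp, mlp_id)
    # bulk() consumes the yielded update actions lazily, batch by batch, and
    # returns the success count plus a list of per-document errors.
    success_count, errors = bulk(client=es, actions=actions, chunk_size=100, raise_on_error=False)
    return success_count, errors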
def update_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, fact_value: str, tagger_object: BertTaggerObject, tagger: BertTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(f"Applying BERT Tagger with ID {tagger_object.id} to batch {i + 1}...")

        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    result = tagger_object.apply_loaded_tagger(tagger, text, input_type="text", feedback=False)

                    # If the tagger is binary and the fact value is not specified by the user, use the tagger description as the fact value.
                    if result["result"] in ["true", "false"]:
                        if not fact_value:
                            fact_value = tagger_object.description
                    # For multitag, use the prediction as the fact value.
                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(result, field, fact_name, fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }
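# Note on the action shape above: unlike process_mlp_actions(), which puts the
# partial update under a top-level "doc" key, this generator nests it under
# "_source". Both forms should work with the elasticsearch-py bulk helpers,
# which use the "_source" value as the request body when one is present and
# the leftover keys otherwise -- an assumption based on the helpers'
# expand_action() behaviour, so verify against the client version in use.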
def update_search_query_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, fact_value: str, tagger_object: SearchQueryTagger):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(f"Applying Search Query Tagger with ID {tagger_object.id}...")

        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    result = {
                        'tagger_id': tagger_object.id,
                        'result': tagger_object.fact_name
                    }
                    if result["result"]:
                        if not fact_value:
                            fact_value = tagger_object.description
                    else:
                        fact_value = result["result"]

                    new_facts = to_texta_facts(field, fact_name, fact_value)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}},
                "retry_on_conflict": 3
            }
def tag_docs(self, fields: List[str], docs: List[dict]):
    # Apply the tagger to every requested field of every document.
    # The ElasticCore instance is created once, instead of per field, since it is only used for flattening.
    ec = ElasticCore(check_connection=False)
    for doc in docs:
        for field in fields:
            flattened_doc = ec.flatten(doc)
            text = flattened_doc.get(field, None)
            matches_as_facts = self.match_texts([text], as_texta_facts=True, field=field)
            for fact in matches_as_facts:
                fact.update(fact=self.description)

            pre_existing_facts = doc.get(TEXTA_TAGS_KEY, [])
            filtered_facts = ElasticDocument.remove_duplicate_facts(pre_existing_facts + matches_as_facts)
            doc[TEXTA_TAGS_KEY] = filtered_facts

    return docs
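# A minimal usage sketch for tag_docs(), assuming `tagger` is a saved model
# instance exposing the method above; the document and field name are hypothetical.
def tag_docs_example(tagger):
    docs = [{"comment": "Free prize, call now!"}]
    tagged = tagger.tag_docs(fields=["comment"], docs=docs)
    # Each returned document now carries its matches under the texta_facts key.
    return tagged[0].get(TEXTA_TAGS_KEY, [])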
def update_search_fields_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, search_field_tagger_object: SearchFieldsTagger, use_breakup: bool, breakup_character: str):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(f"Applying Search Fields Tagger with ID {search_field_tagger_object.id}...")

        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                field_content = flat_hit.get(field, None)
                processed_content = handle_field_content(field_content, breakup_character, use_breakup)

                for content in processed_content:
                    new_facts = to_texta_facts(field, fact_name, fact_value=content)
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }
def post(self, request, pk: int, index: str, document_id: str):
    validate_index_and_project_perms(request, pk, index)
    serializer = self.get_serializer(data=request.data)

    if serializer.is_valid(raise_exception=True):
        ed = ElasticDocument(index)
        document = ed.get(document_id, fields=[TEXTA_TAGS_KEY])
        if not document:
            raise NotFound(f"Could not find document with ID '{document_id}' from index '{index}'!")

        document = document.get("_source")
        facts = serializer.validated_data.get("facts", [])
        existing_facts = document.get(TEXTA_TAGS_KEY, [])
        new_facts = ed.remove_duplicate_facts(facts + existing_facts)
        document[TEXTA_TAGS_KEY] = new_facts

        ed.update(index, document_id, doc=document)
        return Response({"message": f"Added the given facts to the document with the ID of {document_id}!"})
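# A minimal client-side sketch for this view. The route, token and fact payload
# are hypothetical assumptions; only the "facts" key mirrors the serializer above.
import requests


def add_facts_example():
    url = "http://localhost:8000/api/v2/projects/1/elastic/documents/my_index/doc_id_123/add_facts/"  # assumed route
    facts = [{"fact": "PERSON", "str_val": "Barack Obama", "doc_path": "text", "spans": "[[0, 12]]"}]  # assumed fact format
    response = requests.post(url, json={"facts": facts}, headers={"Authorization": "Token <token>"})
    return response.json()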
def update_generator(keyword_detector: RakunDetectorWrapper, generator: ElasticSearcher, ec: ElasticCore, fields: List[str], rakun_extractor_object: RakunExtractor, fact_name: str, fact_value: str, add_spans: bool):
    for scroll_batch in generator:
        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    results = rakun_extractor_object.get_rakun_keywords(keyword_detector=keyword_detector, texts=[text], field_path=field, fact_name=fact_name, fact_value=fact_value, add_spans=add_spans)
                    existing_facts.extend(results)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }
def update_generator(generator: ElasticSearcher, ec: ElasticCore, mlp_fields: List[str], label_suffix: str, object_id: int, extractor: CRFExtractor = None):
    """
    Tags & updates documents in ES.
    """
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(f"Applying CRFExtractor with ID {object_id} to batch {i + 1}...")

        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            existing_facts = hit.get("texta_facts", [])

            for mlp_field in mlp_fields:
                new_facts = extractor.tag(hit, field_name=mlp_field, label_suffix=label_suffix)["texta_facts"]
                if new_facts:
                    existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }
def update_generator(generator: ElasticSearcher, ec: ElasticCore, fields: List[str], fact_name: str, fact_value: str, max_tags: int, object_id: int, object_type: str, tagger_object: Union[Tagger, TaggerGroup], object_args: Dict, tagger: TextTagger = None):
    for i, scroll_batch in enumerate(generator):
        logging.getLogger(INFO_LOGGER).info(f"Applying {object_type} with ID {object_id} to batch {i + 1}...")

        for raw_doc in scroll_batch:
            hit = raw_doc["_source"]
            flat_hit = ec.flatten(hit)
            existing_facts = hit.get("texta_facts", [])

            for field in fields:
                text = flat_hit.get(field, None)
                if text and isinstance(text, str):
                    if object_type == "tagger":
                        result = tagger_object.apply_loaded_tagger(tagger, text, input_type="text", feedback=None)
                        if result:
                            tags = [result]
                        else:
                            tags = []
                    else:
                        # Update the text and tags with MLP.
                        combined_texts, ner_tags = get_mlp(object_id, [text], lemmatize=object_args["lemmatize"], use_ner=object_args["use_ner"])
                        # Retrieve tag candidates.
                        tag_candidates = get_tag_candidates(object_id, [text], ignore_tags=ner_tags, n_similar_docs=object_args["n_similar_docs"], max_candidates=object_args["n_candidate_tags"])
                        # Get tags (sorted by probability in descending order).
                        tagger_group_tags = apply_tagger_group(object_id, text, tag_candidates, request=None, input_type='text', lemmatize=object_args["lemmatize"], feedback=False, use_async=False)
                        # Take only the first `max_tags` tags.
                        tags = ner_tags + tagger_group_tags[:max_tags]

                    new_facts = to_texta_fact(tags, field, fact_name, fact_value)
                    if new_facts:
                        existing_facts.extend(new_facts)

            if existing_facts:
                # Remove duplicates to avoid adding the same facts with repetitive use.
                existing_facts = ElasticDocument.remove_duplicate_facts(existing_facts)

            yield {
                "_index": raw_doc["_index"],
                "_id": raw_doc["_id"],
                "_type": raw_doc.get("_type", "_doc"),
                "_op_type": "update",
                "_source": {"doc": {"texta_facts": existing_facts}}
            }