def add_facts_mapping(self, request, pk=None, project_pk=None):
    es_core = ElasticCore()
    index = Index.objects.get(pk=pk)
    if index.is_open:
        es_core.add_texta_facts_mapping(index.name)
        return Response({"message": f"Added the Texta Facts mapping for: {index.name}"})
    else:
        return Response(
            {"message": f"Index {index.name} is closed, could not add the mapping!"},
            status=status.HTTP_400_BAD_REQUEST
        )
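# A minimal usage sketch for the action above, assuming it is exposed as a DRF @action on a
# project-nested index viewset (the signature with pk/project_pk suggests this). The URL path,
# IDs and token below are illustrative assumptions, not taken from the source.
import requests

response = requests.post(
    "http://localhost:8000/api/v2/projects/1/elastic/index/5/add_facts_mapping/",
    headers={"Authorization": "Token <your-token>"},
)
# Expect 200 with a confirmation message, or 400 if the index is closed.
print(response.status_code, response.json())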
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index."""
    try:
        tagger_object = TorchTaggerObject.objects.get(pk=object_id)
        tagger = tagger_object.load_tagger()
        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
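# A hedged dispatch sketch: this assumes apply_tagger_to_index is registered as a Celery task,
# which the snippet above does not show. The argument values and the queue name are purely
# illustrative; only the keyword names mirror the function signature.
apply_tagger_to_index.apply_async(
    kwargs={
        "object_id": tagger_object.pk,
        "indices": ["my_index"],
        "fields": ["text"],
        "fact_name": "TOPIC",
        "fact_value": "sports",
        "query": {"query": {"match_all": {}}},
        "bulk_size": 100,
        "max_chunk_bytes": 104857600,
        "es_timeout": 10,
    },
    queue="long_term_tasks",  # Illustrative queue name.
)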
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size
        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str], fields: List[str], query: dict, es_timeout: int, bulk_size: int, fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!")
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Retrieve the fields to process.
        field_data = fields

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size
        )

        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
def reindex_test_dataset(query: dict = None, from_index: Optional[str] = None, hex_size=20) -> str:
    """
    Reindexes the master test dataset into isolated pieces.
    :param from_index: Index from which to reindex.
    :param query: Query you want to limit the reindex to.
    :param hex_size: How many random characters should there be in the new index's name.
    :return: Name of the newly generated index.
    """
    from texta_elastic.core import ElasticCore
    from toolkit.test_settings import TEST_INDEX

    from_index = from_index if from_index else TEST_INDEX

    ec = ElasticCore()
    new_test_index_name = f"ttk_test_{uuid.uuid4().hex[:hex_size]}"
    ec.create_index(index=new_test_index_name)
    ec.add_texta_facts_mapping(new_test_index_name)

    from_scan = elasticsearch_dsl.Search() if query is None else elasticsearch_dsl.Search.from_dict(query)
    from_scan = from_scan.index(from_index).using(ec.es)
    from_scan = from_scan.scan()

    def doc_actions(generator):
        for document in generator:
            yield {
                "_index": new_test_index_name,
                "_type": "_doc",
                "_source": document.to_dict(),
                "retry_on_conflict": 3
            }

    actions = doc_actions(from_scan)

    from elasticsearch.helpers import bulk
    bulk(actions=actions, client=ec.es, refresh="wait_for")
    return new_test_index_name
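# A minimal usage sketch for the helper above: each test class reindexes its own throwaway copy
# of the master dataset and removes it afterwards. The test class name is illustrative, and the
# delete_index() call on ElasticCore is assumed to accept elasticsearch-py style keyword arguments.
from django.test import TestCase
from texta_elastic.core import ElasticCore


class TaggerIsolationTestCase(TestCase):

    def setUp(self):
        # Fresh, isolated copy of the master test index for this test run.
        self.test_index_name = reindex_test_dataset()

    def tearDown(self):
        # Clean up the throwaway index so repeated test runs do not accumulate indices.
        ElasticCore().delete_index(index=self.test_index_name, ignore=[400, 404])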
def apply_crf_extractor_to_index(object_id: int, indices: List[str], mlp_fields: List[str], label_suffix: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """
    Applies Extractor to ES index.
    """
    try:
        # Load the model.
        crf_object = CRFExtractorObject.objects.get(pk=object_id)
        extractor = crf_object.load_extractor()

        # Progress tracking.
        progress = ShowProgress(crf_object.task)

        # Add the fact field to the mapping if it is missing.
        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        # Search.
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        # Create the update actions.
        actions = update_generator(
            generator=searcher,
            ec=ec,
            mlp_fields=mlp_fields,
            label_suffix=label_suffix,
            object_id=object_id,
            extractor=extractor
        )

        # Perform the updates.
        try:
            # As the actions already define their indices, there is no need to pass one here (None).
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)

        # All done.
        crf_object.task.complete()
        return True

    except Exception as e:
        crf_object.task.handle_failed_task(e)
        raise e