def _get_sample_document(self, id_field: str, id_value: str, index: str):
    query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
    es = ElasticSearcher(query=query, output=ElasticSearcher.OUT_RAW)
    ed = ElasticDocument(index=index)
    response = es.search()["hits"]["hits"]
    document = response[0] if response else None
    return ed, document
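# Hedged illustration (not project code): a raw Elasticsearch hit, which is what the OUT_RAW
# output above yields, typically looks like the dict below, so `document["_source"]` would hold
# the stored fields of the sampled document. The index name, ID and fields are made up.
sample_raw_hit = {
    "_index": "my_index",                                      # hypothetical index name
    "_id": "abc-123",                                           # document ID
    "_score": 1.0,
    "_source": {"uuid": "abc-123", "text": "example content"}   # stored document fields
}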
def more_like_cluster(self, request, *args, **kwargs):
    serializer = ElasticMoreLikeThisSerializer(data=request.data, partial=True)
    serializer.is_valid(raise_exception=True)

    clustering_pk = ClusterViewSet.__handle_clustering_pk(kwargs)
    clustering_object = ClusteringResult.objects.get(pk=clustering_pk)
    cluster = Cluster.objects.get(pk=kwargs["pk"])

    indices = clustering_object.get_indices()
    doc_ids = json.loads(cluster.document_ids)
    ignored_ids = json.loads(clustering_object.ignored_ids)
    fields = json.loads(clustering_object.fields)
    document_ids = [{"_id": doc_id} for doc_id in doc_ids]

    serializer.validated_data.pop("indices", None)
    serializer.validated_data.pop("like", None)
    serializer.validated_data.pop("fields", None)

    es = ElasticSearcher(indices=indices)
    result = es.more_like_this(
        indices=indices,
        mlt_fields=fields,
        like=document_ids,
        exclude=ignored_ids,
        flatten=True,
        **serializer.validated_data
    )
    return Response(result, status=status.HTTP_200_OK)
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index."""
    try:
        tagger_object = TorchTaggerObject.objects.get(pk=object_id)
        tagger = tagger_object.load_tagger()

        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
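# Hedged sketch: the exact dictionaries yielded by update_generator are internal to the project,
# but actions consumed by elasticsearch's streaming_bulk helper for partial updates generally
# take the shape below (index name, ID and fields here are illustrative only).
example_update_action = {
    "_op_type": "update",         # tells the bulk helper to issue a partial update
    "_index": "my_index",         # hypothetical target index
    "_id": "abc-123",             # ID of the document being updated
    "doc": {"texta_facts": []}    # fields to merge into the existing document
}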
def check_positive_doc_count(self):
    # Current reindexing tests require an approx. 2-second delay.
    sleep(5)
    count_new_documents = ElasticSearcher(indices=TEST_INDEX_REINDEX).count()
    print_output("Bulk add doc count", count_new_documents)
    assert count_new_documents > 0
def test_that_changing_field_names_works(self):
    payload = {
        "description": "RenameFieldName",
        "new_index": self.new_index_name,
        "fields": [TEST_FIELD],
        "field_type": [{"path": TEST_FIELD, "new_path_name": TEST_FIELD_RENAMED, "field_type": "text"}],
        "indices": [self.test_index_name],
        "add_facts_mapping": True
    }

    # Reindex the test index into a new one.
    url = reverse("v2:reindexer-list", kwargs={"project_pk": self.project.pk})
    reindex_response = self.client.post(url, data=payload, format='json')
    print_output('test_that_changing_field_names_works:response.data', reindex_response.data)

    # Check that the fields have been changed.
    es = ElasticSearcher(indices=[self.new_index_name])
    for document in es:
        self.assertTrue(TEST_FIELD not in document)
        self.assertTrue(TEST_FIELD_RENAMED in document)

    # Manual clean-up.
    es.core.delete_index(self.new_index_name)
def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)
    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m"
        )

        ed = ElasticDocument(index=None)
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()
        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
def start_fact_delete_query_task(self, worker_id: int):
    """Counts the documents matching the query and initializes progress for the fact deletion task."""
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)
    try:
        logging.getLogger(INFO_LOGGER).info(f"Celery: Starting task for deleting facts by query for worker with ID: {worker_object.pk}")

        # Initialize progress.
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # Create a searcher object for scrolling the documents.
        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=worker_object.get_indices(),
            output=ElasticSearcher.OUT_DOC,
            callback_progress=show_progress,
            scroll_size=worker_object.scroll_size,
            field_data=["texta_facts"]
        )

        count = searcher.count()
        show_progress.update_step(f'Deleting facts from {count} documents')
        show_progress.update_view(0)
        worker_object.task.set_total(count)
        return True

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
def _initialize_es(self, project_pk, text_processor, callback_progress, prediction_to_match):
    # Create the Elasticsearch document object.
    es_doc = ElasticDocument(self.feedback_index)

    # If there is no model object, return None for the query and search.
    if not self.model_object:
        return es_doc, None, None

    # Create the matching query.
    query = Query()
    query.add_string_filter(query_string=self.model_object.MODEL_TYPE, fields=["model_type"])
    if self.model_object:
        query.add_string_filter(query_string=str(self.model_object.pk), fields=["model_id"])
    if prediction_to_match:
        query.add_string_filter(query_string=prediction_to_match, fields=["correct_result"])

    # If the index does not exist, don't create the searcher object.
    if not self.check_index_exists():
        return es_doc, None, query.query

    # Create the Elasticsearch searcher.
    es_search = ElasticSearcher(
        indices=self.feedback_index,
        query=query.query,
        text_processor=text_processor,
        output=ElasticSearcher.OUT_DOC_WITH_ID,
        callback_progress=callback_progress
    )
    # Return the objects.
    return es_doc, es_search, query.query
def test_that_split_index_with_nested_field_still_has_nested_field(self):
    payload = {
        "description": "Random index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "random",
        "test_size": 20
    }
    response = self.client.post(self.url, data=payload, format="json")
    print_output('test_that_split_index_with_nested_field_still_has_nested_field:response.data', response.data)

    at_least_once = False
    es = ElasticSearcher(indices=[INDEX_SPLITTING_TRAIN_INDEX, INDEX_SPLITTING_TEST_INDEX], field_data=[TEST_INDEX_OBJECT_FIELD], flatten=False)
    for item in es:
        data = item.get(TEST_INDEX_OBJECT_FIELD, None)
        if data:
            self.assertTrue(isinstance(data, dict))
            at_least_once = True

    self.assertTrue(at_least_once)
def test_index_processing(self):
    query_string = "inimene"
    payload = {
        "description": "TestingIndexProcessing",
        "fields": [TEST_FIELD],
        "query": json.dumps({'query': {'match': {'comment_content_lemmas': query_string}}}, ensure_ascii=False)
    }
    response = self.client.post(self.url, data=payload, format="json")
    print_output("test_index_processing:response.data", response.data)

    # Check if MLP was applied to the documents properly.
    s = ElasticSearcher(indices=[self.test_index_name], output=ElasticSearcher.OUT_DOC, query=json.loads(payload["query"]))
    for hit in s:
        self._assert_mlp_contents(hit, TEST_FIELD)
def _get_split_documents_by_id(self, id_field, id_value, text_field):
    documents = []
    query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
    es = ElasticSearcher(query=query, field_data=[id_field, text_field], output=ElasticSearcher.OUT_RAW)
    for hit in es:
        for document in hit:
            documents.append(document)
    return documents
def tag_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch."""
    # Get the tagger group object.
    tagger_object: RegexTaggerGroup = self.get_object()

    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    project_object = Project.objects.get(pk=project_pk)
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = project_object.get_available_or_all_project_indices(indices)

    # Retrieve the tagger fields.
    fields = serializer.validated_data["fields"]

    if not ElasticCore().check_if_indices_exist(tagger_object.project.get_indices()):
        return Response({'error': f'One or more indices from {list(tagger_object.project.get_indices())} do not exist'}, status=status.HTTP_400_BAD_REQUEST)

    # Retrieve a random document.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

    # Apply the tagger.
    results = {
        "tagger_group_id": tagger_object.pk,
        "tagger_group_tag": tagger_object.description,
        "result": False,
        "matches": [],
        "document": flattened_doc
    }
    final_matches = []
    for field in fields:
        text = flattened_doc.get(field, None)
        results["document"][field] = text
        matches = tagger_object.match_texts([text], as_texta_facts=True, field=field)
        if matches:
            final_matches.extend(matches)
            results["result"] = True

    results["matches"] = final_matches
    return Response(results, status=status.HTTP_200_OK)
def test_create_splitter_object_and_task_signal(self):
    payload = {
        "description": "Random index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "random",
        "test_size": 20
    }
    response = self.client.post(self.url, json.dumps(payload), content_type='application/json')
    print_output('test_create_splitter_object_and_task_signal:response.data', response.data)

    splitter_obj = IndexSplitter.objects.get(id=response.data['id'])
    print_output("indices:", splitter_obj.get_indices())

    # Check if the IndexSplitter object gets created.
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    # Check if the Task gets created.
    self.assertTrue(splitter_obj.task is not None)
    print_output("status of IndexSplitter's Task object", splitter_obj.task.status)
    # Check if the Task gets completed.
    self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)

    sleep(5)

    original_count = ElasticSearcher(indices=self.test_index_name).count()
    test_count = ElasticSearcher(indices=INDEX_SPLITTING_TEST_INDEX).count()
    train_count = ElasticSearcher(indices=INDEX_SPLITTING_TRAIN_INDEX).count()

    print_output('original_count, test_count, train_count', [original_count, test_count, train_count])
def __add_meta_to_original_index(indices: List[str], index_fields: List[str], show_progress: ShowProgress, query: dict, scroll_size: int, elastic_wrapper: ElasticCore):
    index_elastic_search = ElasticSearcher(
        indices=indices,
        field_data=index_fields,
        callback_progress=show_progress,
        query=query,
        output=ElasticSearcher.OUT_RAW,
        scroll_size=scroll_size
    )
    index_actions = add_doc_uuid(generator=index_elastic_search)
    for success, info in streaming_bulk(client=elastic_wrapper.es, actions=index_actions, refresh="wait_for", chunk_size=scroll_size, max_retries=3):
        if not success:
            logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size
        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def post(self, request, project_pk: int):
    """Executes **raw** Elasticsearch query on all project indices."""
    project = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project)

    serializer = ProjectSearchByQuerySerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])
    if not indices:
        raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

    if serializer.validated_data["output_type"]:
        es = ElasticSearcher(indices=indices, output=serializer.validated_data["output_type"])
    else:
        es = ElasticSearcher(indices=indices, output=ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS)

    es.update_query(serializer.validated_data["query"])
    results = es.search()
    return Response(results, status=status.HTTP_200_OK)
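# Hedged sketch of how the output constants used in this module change what ElasticSearcher
# returns, judging only from their usage elsewhere in these snippets: OUT_RAW yields full hits
# with "_id"/"_source", OUT_DOC yields flattened documents, and OUT_META is used where only
# document metadata such as IDs is needed. The index name below is hypothetical.
raw_searcher = ElasticSearcher(indices=["my_index"], output=ElasticSearcher.OUT_RAW)
doc_searcher = ElasticSearcher(indices=["my_index"], output=ElasticSearcher.OUT_DOC)
meta_searcher = ElasticSearcher(indices=["my_index"], output=ElasticSearcher.OUT_META)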
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str], fields: List[str], query: dict, es_timeout: int, bulk_size: int, fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!")
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Retrieve the fields.
        field_data = fields

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size
        )

        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
def start_mlp_worker(self, mlp_id: int):
    """Scrolls the document ID-s and passes them to MLP worker."""
    mlp_object = MLPWorker.objects.get(pk=mlp_id)
    try:
        logging.getLogger(INFO_LOGGER).info(f"Applying mlp on the index for MLP Task ID: {mlp_id}")

        # Initialize progress.
        show_progress = ShowProgress(mlp_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # Get the necessary fields.
        indices: List[str] = mlp_object.get_indices()
        es_scroll_size = mlp_object.es_scroll_size
        es_timeout = mlp_object.es_timeout

        # Create a searcher object for scrolling the IDs.
        searcher = ElasticSearcher(
            query=json.loads(mlp_object.query),
            indices=indices,
            output=ElasticSearcher.OUT_META,
            callback_progress=show_progress,
            scroll_size=es_scroll_size,
            scroll_timeout=f"{es_timeout}m"
        )

        # Add the texta_facts mapping to the indices if needed.
        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        doc_chunks = list(chunks_iter(searcher, MLP_BATCH_SIZE))

        # Update progress.
        show_progress.update_step(f'Applying MLP to {len(doc_chunks)} documents')
        show_progress.update_view(0)
        mlp_object.task.set_total(searcher.count())
        mlp_object.task.update_status(Task.STATUS_RUNNING)

        # Pass the document IDs to the next task.
        chain = group(apply_mlp_on_es_docs.s([doc["_id"] for doc in meta_chunk], mlp_id) for meta_chunk in doc_chunks) | end_mlp_task.si(mlp_id)
        chain.delay()
        return True

    except Exception as e:
        mlp_object.task.handle_failed_task(e)
        raise
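# chunks_iter is a project helper; this is only a plausible stand-in showing the batching
# behaviour the task above relies on (splitting a scroll of document metadata into
# MLP_BATCH_SIZE-sized lists), not the actual implementation.
from itertools import islice
from typing import Iterable, Iterator, List


def chunks_iter_sketch(iterable: Iterable, size: int) -> Iterator[List]:
    """Yield consecutive lists of at most `size` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk


assert list(chunks_iter_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]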
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply the analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=fields,
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=scroll_timeout
        )

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(
            generator=searcher,
            worker=worker_object,
            detect_lang=detect_lang,
            snowball_language=snowball_language,
            fields_to_parse=fields,
            analyzers=analyzers,
            tokenizer=tokenizer,
            strip_html=strip_html
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()
        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def test_create_random_split(self):
    payload = {
        "description": "Random index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "random",
        "test_size": 20
    }
    response = self.client.post(self.url, data=payload)
    print_output('test_create_random_split:response.data', response.data)

    splitter_obj = IndexSplitter.objects.get(id=response.data['id'])

    # Assert that the Task gets completed.
    self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)
    print_output("Task status", splitter_obj.task.status)

    sleep(5)

    original_count = ElasticSearcher(indices=self.test_index_name).count()
    test_count = ElasticSearcher(indices=INDEX_SPLITTING_TEST_INDEX).count()
    train_count = ElasticSearcher(indices=INDEX_SPLITTING_TRAIN_INDEX).count()

    print_output('original_count, test_count, train_count', [original_count, test_count, train_count])

    # To avoid any inconsistencies caused by rounding, assume the sizes fall between small limits.
    self.assertTrue(self.is_between_limits(test_count, original_count, 0.2))
    self.assertTrue(self.is_between_limits(train_count, original_count, 0.8))
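# is_between_limits is a test helper defined elsewhere in the project; a plausible sketch of
# the check it performs (the tolerance value is an assumption) is that the split count lands
# close to the expected share of the original index.
def is_between_limits_sketch(count: int, total: int, ratio: float, tolerance: float = 0.05) -> bool:
    """Return True if `count` is within `tolerance * total` of `total * ratio`."""
    return abs(count - total * ratio) <= total * tolerance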
def get_tag_candidates(tagger_group_id: int, text: str, ignore_tags: List[str] = [], n_similar_docs: int = 10, max_candidates: int = 10):
    """
    Finds frequent tags from documents similar to the input document.
    Returns an empty list if the hybrid option is false.
    """
    hybrid_tagger_object = TaggerGroup.objects.get(pk=tagger_group_id)
    field_paths = json.loads(hybrid_tagger_object.taggers.first().fields)
    indices = hybrid_tagger_object.get_indices()
    logging.getLogger(INFO_LOGGER).info(f"[Get Tag Candidates] Selecting from following indices: {indices}.")

    ignore_tags = {tag["tag"]: True for tag in ignore_tags}

    # Create the query.
    query = Query()
    query.add_mlt(field_paths, text)

    # Create the Searcher object for MLT.
    es_s = ElasticSearcher(indices=indices, query=query.query)
    logging.getLogger(INFO_LOGGER).info(f"[Get Tag Candidates] Trying to retrieve {n_similar_docs} documents from Elastic...")
    docs = es_s.search(size=n_similar_docs)
    logging.getLogger(INFO_LOGGER).info(f"[Get Tag Candidates] Successfully retrieved {len(docs)} documents from Elastic.")

    # Dict for tag candidates from Elastic.
    tag_candidates = {}

    # Retrieve tags from the Elastic response.
    for doc in docs:
        if "texta_facts" in doc:
            for fact in doc["texta_facts"]:
                if fact["fact"] == hybrid_tagger_object.fact_name:
                    fact_val = fact["str_val"]
                    if fact_val not in ignore_tags:
                        if fact_val not in tag_candidates:
                            tag_candidates[fact_val] = 0
                        tag_candidates[fact_val] += 1

    # Sort and limit the candidates.
    tag_candidates = [item[0] for item in sorted(tag_candidates.items(), key=lambda k: k[1], reverse=True)][:max_candidates]
    logging.getLogger(INFO_LOGGER).info(f"[Get Tag Candidates] Retrieved {len(tag_candidates)} tag candidates.")
    return tag_candidates
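# Standalone illustration of the candidate ranking step above: count how often each fact value
# occurs, then keep the most frequent values up to max_candidates. The counts are made up for
# the example.
tag_counts = {"politics": 3, "sports": 1, "culture": 2}
max_candidates = 2
ranked = [tag for tag, _ in sorted(tag_counts.items(), key=lambda kv: kv[1], reverse=True)][:max_candidates]
assert ranked == ["politics", "culture"]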
def tag_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch."""
    # Get the tagger object and check that its model exists.
    tagger_object = self.get_object()
    if not tagger_object.model.path:
        raise NonExistantModelError()

    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = tagger_object.get_available_or_all_indices(indices)

    # Retrieve the tagger fields.
    tagger_fields = json.loads(tagger_object.fields)
    if not ElasticCore().check_if_indices_exist(indices):
        return Response({'error': f'One or more indices from {list(indices)} do not exist'}, status=status.HTTP_400_BAD_REQUEST)

    # Retrieve a random document.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]

    # Filter out the correct fields from the document.
    random_doc_filtered = {k: v for k, v in random_doc.items() if k in tagger_fields}

    # Apply the tagger.
    tagger_response = apply_tagger(tagger_object.id, random_doc_filtered, input_type='doc')
    response = {"document": random_doc, "prediction": tagger_response}
    return Response(response, status=status.HTTP_200_OK)
def test_applying_lang_detect_with_query(self):
    mlp_field = f"{TEST_FIELD}_mlp"
    query_string = "inimene"
    payload = {
        "description": "TestingIndexProcessing",
        "field": TEST_FIELD,
        "query": json.dumps({'query': {'match': {'comment_content_lemmas': query_string}}}, ensure_ascii=False)
    }
    response = self.client.post(self.url, data=payload, format="json")
    print_output("test_applying_lang_detect_with_query:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name], output=ElasticSearcher.OUT_DOC, query=json.loads(payload["query"]))
    for hit in s:
        if TEST_FIELD in hit:
            self.assertTrue(f"{mlp_field}.language.detected" in hit)
            lang_value = hit[f"{mlp_field}.language.detected"]
            self.assertTrue(lang_value == "et")
def apply_crf_extractor_to_index(object_id: int, indices: List[str], mlp_fields: List[str], label_suffix: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """
    Applies Extractor to ES index.
    """
    try:
        # Load the model.
        crf_object = CRFExtractorObject.objects.get(pk=object_id)
        extractor = crf_object.load_extractor()

        # Progress.
        progress = ShowProgress(crf_object.task)

        # Add the fact field mapping if it is missing.
        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        # Search.
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        # Create the update actions.
        actions = update_generator(
            generator=searcher,
            ec=ec,
            mlp_fields=mlp_fields,
            label_suffix=label_suffix,
            object_id=object_id,
            extractor=extractor
        )

        # Perform the updates.
        try:
            # As the indices are defined in the actions, there is no need to define them again (None).
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)

        # All done.
        crf_object.task.complete()
        return True

    except Exception as e:
        crf_object.task.handle_failed_task(e)
        raise e
def extract_from_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch."""
    # Get the Rakun object.
    rakun_object: RakunExtractor = RakunExtractor.objects.get(pk=pk)

    serializer = RakunExtractorRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    project_object = Project.objects.get(pk=project_pk)
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = project_object.get_available_or_all_project_indices(indices)

    # Retrieve the Rakun fields and the add_spans parameter.
    fields = serializer.validated_data["fields"]
    add_spans = serializer.validated_data["add_spans"]

    # Retrieve a random document.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

    # Apply Rakun.
    results = {
        "rakun_id": rakun_object.pk,
        "description": rakun_object.description,
        "result": False,
        "keywords": [],
        "document": flattened_doc
    }
    final_keywords = []
    keyword_detector = rakun_object.load_rakun_keyword_detector()
    for field in fields:
        text = flattened_doc.get(field, "")
        results["document"][field] = text
        keywords = rakun_object.get_rakun_keywords(
            keyword_detector=keyword_detector,
            texts=[text],
            field_path=field,
            fact_name=rakun_object.description,
            fact_value="",
            add_spans=add_spans
        )
        if keywords:
            final_keywords.extend(keywords)
            results["result"] = True

    results["keywords"] = final_keywords
    return Response(results, status=status.HTTP_200_OK)
def test_processing_with_just_tokenizer(self):
    payload = {
        "description": "hello there, kenobi.",
        "fields": [TEST_FIELD],
        "analyzers": ["tokenizer"],
        "indices": [{"name": self.test_index_name}],
        "query": json.dumps(TEST_QUERY, ensure_ascii=False)
    }
    response = self.client.post(self.list_url, data=payload, format="json")
    print_output("test_processing_with_just_tokenizer:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name], query=TEST_QUERY)
    for hit in s:
        new_field = f'{TEST_FIELD}_es.tokenized_text'
        self.assertTrue(new_field in hit)
        self.assertTrue(hit[new_field] != hit[TEST_FIELD])
def _get_negatives(self, size):
    self.show_progress.update_step("scrolling negative sample")
    self.show_progress.update_view(0)

    # Iterator for retrieving negative examples.
    negative_sample_iterator = ElasticSearcher(
        indices=self.indices,
        field_data=self.field_data,
        output=ElasticSearcher.OUT_DOC,
        callback_progress=self.show_progress,
        text_processor=self.text_processor,
        scroll_limit=int(size * float(self.tagger_object.negative_multiplier)),
        ignore_ids=self.ignore_ids,
    )
    # Convert the iterator to a list.
    negative_sample = list(negative_sample_iterator)

    # Join the document dicts into value strings if asked.
    if self.join_fields:
        negative_sample = self._join_fields(negative_sample)

    return negative_sample
def test_normal_process_application(self):
    payload = {
        "description": "hello there, kenobi.",
        "analyzers": ["stemmer"],
        "fields": [TEST_FIELD],
        "stemmer_lang": "estonian",
        "indices": [{"name": self.test_index_name}]
    }
    response = self.client.post(self.list_url, data=payload, format="json")
    print_output("test_normal_process_application:response.data", response.data)
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = ElasticSearcher(indices=[self.test_index_name])
    for hit in s:
        new_field = f'{TEST_FIELD}_es.stems'
        self.assertTrue(new_field in hit)
        self.assertTrue(hit[new_field] != hit[TEST_FIELD])
        break
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step('scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field
        scroll_size = 100

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[field],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout="15m"
        )

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher, field=field, worker_id=apply_worker_id, mlp_class=mlp)

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()
        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
def post(self, request, project_pk: int):
    """Simplified search interface for making Elasticsearch queries."""
    serializer = ProjectSimplifiedSearchSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    project_object = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project_object)

    project_indices = list(project_object.get_indices())
    project_fields = project_object.get_elastic_fields(path_list=True)

    # Test if the project has indices.
    if not project_indices:
        raise ProjectValidationFailed(detail="Project has no indices")

    # Test if the requested indices are valid.
    if serializer.validated_data['match_indices']:
        if not set(serializer.validated_data['match_indices']).issubset(set(project_indices)):
            raise ProjectValidationFailed(detail=f"Index names are not valid for this project. Allowed values are: {project_indices}")

    # Test if the requested fields are valid.
    if serializer.validated_data['match_fields']:
        if not set(serializer.validated_data['match_fields']).issubset(set(project_fields)):
            raise ProjectValidationFailed(detail=f"Field names are not valid for this project. Allowed values are: {project_fields}")

    es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC)
    q = Query(operator=serializer.validated_data['operator'])

    # If the input is a string, convert it to a list; if the format is unknown, return an error.
    match_text = serializer.validated_data['match_text']
    if isinstance(match_text, list):
        match_texts = [str(item) for item in match_text if item]
    elif isinstance(match_text, str):
        match_texts = [match_text]
    else:
        return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST)

    # Add the query filters.
    for item in match_texts:
        q.add_string_filter(item, match_type=serializer.validated_data["match_type"])

    # Update the query and retrieve the results.
    es.update_query(q.query)
    results = es.search(size=serializer.validated_data["size"])
    return Response(results, status=status.HTTP_200_OK)
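# Hedged sketch of the query-building pattern used above, reusing only calls that appear in
# these snippets (Query, add_string_filter, ElasticSearcher.update_query/search). The index
# name is hypothetical; the query string and field come from the test snippets in this file.
es = ElasticSearcher(indices=["my_index"], output=ElasticSearcher.OUT_DOC)
q = Query()
q.add_string_filter(query_string="inimene", fields=["comment_content_lemmas"])
es.update_query(q.query)
results = es.search(size=10)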