def tag_doc(self, request, pk=None, project_pk=None):
    serializer = RegexTaggerGroupTagDocumentSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    tagger_object: RegexTagger = self.get_object()

    input_document = serializer.validated_data["doc"]
    fields = serializer.validated_data["fields"]

    # Apply the tagger.
    results = {
        "tagger_id": tagger_object.pk,
        "tag": tagger_object.description,
        "result": False,
        "matches": []
    }
    final_matches = []
    for field in fields:
        flattened_doc = ElasticCore(check_connection=False).flatten(input_document)
        text = flattened_doc.get(field, None)
        matches = tagger_object.match_texts([text], as_texta_facts=False)

        if matches:
            for match in matches:
                match.update(field=field)
            final_matches.extend(matches)
            results["result"] = True

    results["matches"] = final_matches

    return Response(results, status=status.HTTP_200_OK)
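# A hedged example of the request payload this endpoint expects, inferred from the
# serializer's "doc" and "fields" keys; the document shape and field names below are
# illustrative only, not from the project's test data.
payload = {
    "doc": {"comment": {"text": "some raw text to match against"}},
    "fields": ["comment.text"]  # Dot-notation paths into the flattened document.
}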
def close_index(self, request, pk=None, project_pk=None):
    es_core = ElasticCore()
    index = Index.objects.get(pk=pk)
    es_core.close_index(index.name)
    index.is_open = False
    index.save()
    return Response({"message": f"Closed the index {index.name}"})
def tearDown(self) -> None:
    Tagger.objects.all().delete()
    ec = ElasticCore()
    res = ec.delete_index(self.test_index_copy)
    ec.delete_index(index=self.test_index_name, ignore=[400, 404])
    print_output(f"Delete apply_taggers test index {self.test_index_copy}", res)
def list(self, request, *args, **kwargs):
    ec = ElasticCore()
    response = super(IndexViewSet, self).list(request, *args, **kwargs)

    # Get the paginated and sorted queryset results.
    data = response.data
    open_indices = [index for index in data if index["is_open"]]
    mappings = ec.es.indices.get_mapping()

    # Doing a stats request with no indices causes trouble.
    if open_indices:
        stats = ec.get_index_stats()

        # Update the paginated and sorted queryset results.
        for index in response.data:
            name = index["name"]
            is_open = index["is_open"]
            if is_open:
                has_texta_facts_mapping = self._check_for_facts(index_mappings=mappings, index_name=name)
                if name in stats:
                    index.update(**stats[name], has_validated_facts=has_texta_facts_mapping)
                else:
                    index.update(has_validated_facts=False)
            else:
                # For the sake of courtesy on the front-end, make closed indices' values zero.
                index.update(size=0, doc_count=0, has_validated_facts=False)

    return response
def get_field_choices():
    es = ElasticCore()
    if es.connection:
        return [(a, '{0} - {1}'.format(a['index'], a['path'])) for a in es.get_fields()]
    else:
        return []
def tearDown(self) -> None:
    ec = ElasticCore()
    ec.delete_index(index=self.test_index_name, ignore=[400, 404])
    print_output(f"Delete [Rakun Extractor] test index {self.test_index_name}", None)
    Embedding.objects.all().delete()
    ElasticCore().delete_index(index=self.test_index_name, ignore=[400, 404])
    print_output("Delete Rakun FASTTEXT Embeddings", None)
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index."""
    # Fetch the tagger object outside the try block so it is defined for the error handler.
    tagger_object = TorchTaggerObject.objects.get(pk=object_id)
    try:
        tagger = tagger_object.load_tagger()
        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
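# For reference, streaming_bulk() consumes an iterable of plain action dicts. A minimal
# sketch of what an update_generator along these lines could yield is shown below; the
# hit layout follows the searcher's OUT_RAW output and the fact shape is assumed, so this
# is illustrative, not the project's actual implementation.
def example_update_generator(generator, fact_name, fact_value):
    for scroll_batch in generator:
        for hit in scroll_batch:
            source = hit["_source"]
            facts = source.get("texta_facts", [])
            facts.append({"fact": fact_name, "str_val": fact_value})  # Add the new fact on top of existing ones.
            yield {
                "_op_type": "update",  # Partial update instead of reindexing the whole document.
                "_index": hit["_index"],
                "_id": hit["_id"],
                "doc": {"texta_facts": facts}
            }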
def setUp(self):
    self.test_index_name = reindex_test_dataset()
    self.user = create_test_user('first_user', '*****@*****.**', 'pw')
    self.project = project_creation("DocumentImporterAPI", self.test_index_name, self.user)
    self.validation_project = project_creation("validation_project", "random_index_name", self.user)

    self.document_id = random.randint(10000000, 90000000)
    self.uuid = uuid.uuid1()
    self.source = {"hello": "world", "uuid": self.uuid}
    self.document = {
        "_index": self.test_index_name,
        "_id": self.document_id,
        "_source": self.source
    }

    self.target_field_random_key = uuid.uuid1()
    self.target_field = f"{self.target_field_random_key}_court_case"
    self.ec = ElasticCore()

    self.client.login(username='******', password='******')
    self._check_inserting_documents()
def skip_for_es6():
    ec = ElasticCore()
    major, minor, patch = ec.get_version()
    # Skip whenever the cluster is not running Elasticsearch 7.
    return major != 7
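# A hedged usage sketch: since skip_for_es6() returns True on non-ES7 clusters, it slots
# naturally into unittest's skipIf decorator. The test class and method names below are
# illustrative only.
import unittest

class Elasticsearch7OnlyTests(unittest.TestCase):

    @unittest.skipIf(skip_for_es6(), "Requires an Elasticsearch 7 cluster.")
    def test_es7_specific_behaviour(self):
        ...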
def test_project_creation_using_locked_indices(self):
    """
    There was a bug case that warrants this test. If any of the indices in
    Elasticsearch are locked for whatever reason, project creation used to fail
    because it contained lazy index creation, which ignored existing indices and
    thus triggered FORBIDDEN_ACCESS errors in Elasticsearch.
    """
    index_names = ["locked_index_{}".format(i) for i in range(1, 5)]
    for index in index_names:
        self.__create_locked_index(index)

    response = self.client.post(
        reverse(f"{VERSION_NAMESPACE}:project-list"),
        format="json",
        data={
            "title": "faulty_project",
            "indices_write": index_names,
            "users_write": [self.admin.username]
        }
    )

    ec = ElasticCore()
    for index in index_names:
        ec.delete_index(index)

    self.assertTrue(response.status_code == status.HTTP_201_CREATED)
def destroy(self, request, pk=None, **kwargs):
    with transaction.atomic():
        index_name = Index.objects.get(pk=pk).name
        es = ElasticCore()
        es.delete_index(index_name)
        Index.objects.filter(pk=pk).delete()
        return Response({"message": f"Deleted index {index_name} from Elasticsearch!"})
def update_project_indices(self, serializer, project_obj):
    """Add the new_index included in the request to the relevant project object."""
    index_to_add = serializer.validated_data['new_index']
    from texta_elastic.core import ElasticCore
    ec = ElasticCore()
    ec.create_index(index_to_add)
    # get_or_create returns (object, created); the second value is a creation flag, not the open/closed state.
    index, is_created = Index.objects.get_or_create(name=index_to_add)
    project_obj.indices.add(index)
    project_obj.save()
def open_index(self, request, pk=None, project_pk=None):
    es_core = ElasticCore()
    index = Index.objects.get(pk=pk)
    es_core.open_index(index.name)
    if not index.is_open:
        index.is_open = True
        index.save()
    return Response({"message": f"Opened the index {index.name}"})
def _flatten_document(self, document):
    ec = ElasticCore()
    source = document.get("_source")
    # Skip the annotator meta when flattening and then attach it back.
    annotator_meta = source.pop(TEXTA_ANNOTATOR_KEY)
    flattened_source = ec.flatten(source)
    flattened_source[TEXTA_ANNOTATOR_KEY] = annotator_meta
    document["_source"] = flattened_source
    return document
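# A minimal sketch of the dot-notation flattening that ElasticCore.flatten() is assumed
# to perform here (based on the flattened_doc.get(field) lookups elsewhere in this code);
# the helper below is illustrative, not the library's implementation.
def flatten_sketch(doc, prefix=""):
    flat = {}
    for key, value in doc.items():
        path = f"{prefix}.{key}" if prefix else key
        if isinstance(value, dict):
            flat.update(flatten_sketch(value, path))  # Recurse into nested objects.
        else:
            flat[path] = value
    return flat

# flatten_sketch({"author": {"name": "Jane", "stats": {"posts": 3}}})
# -> {"author.name": "Jane", "author.stats.posts": 3}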
def setUp(self):
    self.test_index_name = reindex_test_dataset()
    self.ec = ElasticCore()
    self.user = create_test_user('mlpUser', '*****@*****.**', 'pw')
    self.project = project_creation("mlpTestProject", self.test_index_name, self.user)
    self.project.users.add(self.user)
    self.client.login(username='******', password='******')
    self.url = reverse(f"{VERSION_NAMESPACE}:mlp_index-list", kwargs={"project_pk": self.project.pk})
def get_field_choices(): """ Retrieves field options from ES. """ es = ElasticCore() if es.connection: return [(a, '{0} - {1}'.format(a['index'], a['path'])) for a in es.get_fields()] else: return []
def tag_random_doc(self, request, pk=None, project_pk=None):
    """Returns a prediction for a random document in Elasticsearch."""
    # Get the tagger object.
    tagger_object: RegexTaggerGroup = self.get_object()

    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)

    project_object = Project.objects.get(pk=project_pk)
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = project_object.get_available_or_all_project_indices(indices)

    # Retrieve the tagger fields.
    fields = serializer.validated_data["fields"]

    if not ElasticCore().check_if_indices_exist(tagger_object.project.get_indices()):
        return Response(
            {'error': f'One or more indices from {list(tagger_object.project.get_indices())} do not exist'},
            status=status.HTTP_400_BAD_REQUEST
        )

    # Retrieve a random document.
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

    # Apply the tagger.
    results = {
        "tagger_group_id": tagger_object.pk,
        "tagger_group_tag": tagger_object.description,
        "result": False,
        "matches": [],
        "document": flattened_doc
    }
    final_matches = []
    for field in fields:
        text = flattened_doc.get(field, None)
        results["document"][field] = text
        matches = tagger_object.match_texts([text], as_texta_facts=True, field=field)

        if matches:
            final_matches.extend(matches)
            results["result"] = True

    results["matches"] = final_matches

    return Response(results, status=status.HTTP_200_OK)
def check_for_existence(value):
    ec = ElasticCore()
    index = parse_index_input(value)
    in_elastic = ec.check_if_indices_exist(indices=[index])
    if in_elastic:
        # This line helps keep the database and Elasticsearch in sync.
        index, is_created = Index.objects.get_or_create(name=index)
    else:
        # Delete any loose Index objects, just in case, before reporting the failure.
        Index.objects.filter(name=index).delete()
        raise NoIndexExists(f"Could not access index '{index}'")
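# A hedged usage sketch: a function that raises on bad input fits the shape of DRF's
# field-level validators. The serializer below is hypothetical, and it assumes
# NoIndexExists is (or wraps) a rest_framework ValidationError so DRF renders it as a
# 400 response.
from rest_framework import serializers

class IndexInputSerializer(serializers.Serializer):
    index = serializers.CharField(validators=[check_for_existence])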
def add_texta_meta_mapping(indices: List[str]):
    """
    Adds the mapping for texta_meta.
    :param indices: Which indices to target for the schemas.
    :return:
    """
    from texta_elastic.core import ElasticCore
    ec = ElasticCore()
    for index in indices:
        ec.add_texta_meta_mapping(index)
def add_annotation_mapping(indices: List[str]):
    """
    Adds the mapping for the annotator into indices to ensure smooth sailing.
    :param indices: Which indices to target for the schemas.
    :return:
    """
    from texta_elastic.core import ElasticCore
    ec = ElasticCore()
    for index in indices:
        ec.add_annotator_mapping(index)
def setUp(self) -> None:
    self.client.login(username="******", password="******")
    self.ec = ElasticCore()
    self.ids = []
    self.index_names = ["test_for_index_endpoint_1", "test_for_index_endpoint_2"]
    for index_name in self.index_names:
        index, is_created = Index.objects.get_or_create(name=index_name)
        self.ec.es.indices.create(index=index_name, ignore=[400, 404])
        self.ids.append(index.pk)
def add_indices(self, request, pk=None, project_pk=None):
    project: Project = self.get_object()
    serializer = self.get_serializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    indices = [index.name for index in serializer.validated_data["indices"]]

    ec = ElasticCore()
    exists = ec.check_if_indices_exist(indices)
    if exists and indices:
        for index_name in indices:
            index, is_created = Index.objects.get_or_create(name=index_name)
            project.indices.add(index)
        return Response({"detail": f"Added indices '{str(indices)}' to the project!"})
    else:
        raise ValidationError(f"Could not validate indices '{str(indices)}'")
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str], fields: List[str], query: dict, es_timeout: int, bulk_size: int, fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!")
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # Retrieve the fields.
        field_data = fields

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size
        )

        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take the first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
def __init__(self, project_pk, model_object=None, text_processor=None, callback_progress=None, prediction_to_match=None, es_prefix=get_core_setting("TEXTA_ES_PREFIX"), deploy_key=getattr(settings, "DEPLOY_KEY")):
    self.es_core = ElasticCore()
    self.project_pk = project_pk
    self.feedback_index = f"{es_prefix}texta-{deploy_key}-feedback-project-{project_pk}"
    self.model_object = model_object
    self.es_doc, self.es_search, self.query = self._initialize_es(project_pk, text_processor, callback_progress, prediction_to_match)
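# A hedged worked example of the feedback index name computed above; the prefix,
# deploy key and project pk values are illustrative only.
es_prefix, deploy_key, project_pk = "", 1234, 5
assert f"{es_prefix}texta-{deploy_key}-feedback-project-{project_pk}" == "texta-1234-feedback-project-5"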
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size
        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        [ec.add_texta_facts_mapping(index) for index in indices]

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size
        )

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character
        )

        # Send the data towards Elasticsearch.
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
def setUp(self) -> None:
    self.test_index_name = reindex_test_dataset()
    self.user = create_test_user('Owner', '*****@*****.**', 'pw')
    self.unauthorized_user = create_test_user('unauthorized', '*****@*****.**', 'pw')
    self.file_name = "d41d8cd98f00b204e9800998ecf8427e.txt"
    self.project = project_creation("test_doc_parser", index_title=None, author=self.user)
    self.project.users.add(self.user)
    self.unauth_project = project_creation("unauth_project", index_title=None, author=self.user)
    self.file = SimpleUploadedFile("text.txt", b"file_content", content_type="text/html")
    self.client.login(username='******', password='******')
    self._basic_pipeline_functionality()
    self.file_path = self._get_file_path()
    self.ec = ElasticCore()
def check_and_create(indices: str):
    from texta_elastic.core import ElasticCore
    ec = ElasticCore()
    # Accept both a list of index names and a comma-separated string.
    if isinstance(indices, str):
        indices = indices.split(",")
    for index in indices:
        does_exist = ec.check_if_indices_exist([index])
        if does_exist:
            Index.objects.get_or_create(name=index)
        else:
            Index.objects.filter(name=index).delete()
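# A hedged usage sketch of the two input shapes check_and_create() accepts; the index
# names are illustrative and the calls assume a reachable Elasticsearch cluster.
check_and_create("texta_articles,texta_comments")        # Comma-separated string.
check_and_create(["texta_articles", "texta_comments"])   # Already a list.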
def create(self, request, **kwargs):
    data = IndexSerializer(data=request.data)
    data.is_valid(raise_exception=True)

    es = ElasticCore()
    index_name = data.validated_data["name"]
    is_open = data.validated_data["is_open"]
    description = data.validated_data["description"]
    added_by = data.validated_data["added_by"]
    test = data.validated_data["test"]
    source = data.validated_data["source"]
    client = data.validated_data["client"]
    domain = data.validated_data["domain"]

    # Using get_or_create to avoid unique name constraints on creation.
    if es.check_if_indices_exist([index_name]):
        # Even if the index already exists in Elasticsearch, create the database object just in case.
        index, is_created = Index.objects.get_or_create(name=index_name)
        if is_created:
            utc_time = es.get_index_creation_date(index_name)
            index.is_open = is_open
            index.description = description
            index.added_by = added_by
            index.test = test
            index.source = source
            index.client = client
            index.domain = domain
            index.created_at = utc_time
            index.save()
        raise ElasticIndexAlreadyExists()
    else:
        es.create_index(index=index_name)
        if not is_open:
            es.close_index(index_name)
        index, is_created = Index.objects.get_or_create(name=index_name)
        if is_created:
            utc_time = es.get_index_creation_date(index_name)
            index.is_open = is_open
            index.description = description
            index.added_by = added_by
            index.test = test
            index.source = source
            index.client = client
            index.domain = domain
            index.created_at = utc_time
            index.save()
        return Response({"message": f"Added index {index_name} into Elasticsearch!"}, status=status.HTTP_201_CREATED)
def setUp(self):
    # Owner of the project.
    self.test_index_name = reindex_test_dataset()
    self.secondary_index = reindex_test_dataset()
    self.index, is_created = Index.objects.get_or_create(name=self.secondary_index)

    self.user = create_test_user('annotator', '*****@*****.**', 'pw')
    self.user2 = create_test_user('annotator2', '*****@*****.**', 'pw2')
    self.project = project_creation("entityTestProject", self.test_index_name, self.user)
    self.project.indices.add(self.index)
    self.project.users.add(self.user)
    self.project.users.add(self.user2)

    self.client.login(username='******', password='******')
    self.ec = ElasticCore()

    self.list_view_url = reverse("v2:annotator-list", kwargs={"project_pk": self.project.pk})
    self.annotator = self._create_annotator()
    self.pull_document_url = reverse("v2:annotator-pull-document", kwargs={"project_pk": self.project.pk, "pk": self.annotator["id"]})
def test_applying_the_regex_tagger_group_to_the_index(self):
    ec = ElasticCore()
    tg_description = "toxic"
    tg_id, tagger_ids = self.__create_tagger_group(
        tg_description,
        (("racism", ["juut", "neeger", "tõmmu"]), ("hate", ["pederast", "debiilik"]))
    )
    url = reverse(f"{VERSION_NAMESPACE}:regex_tagger_group-apply-tagger-group", kwargs={"project_pk": self.project.pk, "pk": tg_id})
    response = self.client.post(url, {
        "description": "Test Run",
        "fields": [TEST_FIELD],
        "indices": [{"name": self.test_index_name}]
    })
    self.assertTrue(response.status_code == status.HTTP_201_CREATED)

    s = elasticsearch_dsl.Search(index=self.test_index_name, using=ec.es)
    has_group_fact = False
    for hit in s.scan():
        facts = hit.to_dict().get("texta_facts", [])
        for fact in facts:
            if fact["fact"] == tg_description:
                has_group_fact = True
    self.assertTrue(has_group_fact)
    print_output('test_applying_the_regex_tagger_group_to_the_index:response.data', response.data)