def get_field_choices():
    """Return Elasticsearch field choice tuples for form/serializer use.

    Each choice pairs the raw field dict with a human-readable
    "<index> - <path>" label. Returns an empty list when no
    Elasticsearch connection is available.
    """
    es = ElasticCore()
    # Guard clause: without a live connection there is nothing to enumerate.
    if not es.connection:
        return []
    return [(field, f"{field['index']} - {field['path']}") for field in es.get_fields()]
def get_field_choices():
    """Collect field options from Elasticsearch.

    Produces a list of (field_dict, "index - path") tuples; an empty
    list is returned when the Elasticsearch connection is unavailable.
    """
    core = ElasticCore()
    if not core.connection:
        return []
    choices = []
    for field in core.get_fields():
        label = '{0} - {1}'.format(field['index'], field['path'])
        choices.append((field, label))
    return choices
class ReindexerViewTests(APITransactionTestCase):
    """Integration tests for the Reindexer API endpoint.

    Exercises the full reindexing flow against a live Elasticsearch
    instance: payload validation, task creation via signal, index
    creation/removal, project-index bookkeeping and field renaming.
    """

    def setUp(self):
        """ user needs to be admin, because of changed indices permissions """
        # Fresh test index per test run; removed again in tearDown.
        self.test_index_name = reindex_test_dataset()
        self.default_password = '******'
        self.default_username = '******'
        self.user = create_test_user(self.default_username, '*****@*****.**', self.default_password)
        # create admin to test indices removal from project
        self.admin = create_test_user(name='admin', password='******')
        self.admin.is_superuser = True
        self.admin.save()
        self.project = project_creation("ReindexerTestProject", self.test_index_name, self.user)
        self.project.users.add(self.user)
        self.ec = ElasticCore()
        # Tests run as the regular project user unless a check needs admin rights.
        self.client.login(username=self.default_username, password=self.default_password)
        self.new_index_name = f"{TEST_FIELD}_2"

    def tearDown(self) -> None:
        # ignore=[400, 404]: deletion is best-effort; missing index is fine.
        self.ec.delete_index(index=self.test_index_name, ignore=[400, 404])

    def test_run(self):
        """Run the reindexer endpoint against a battery of payloads.

        First checks that invalid new-index names are rejected, then runs
        the full create-task flow for each payload variant below.
        """
        existing_new_index_payload = {
            "description": "TestWrongField",
            "indices": [self.test_index_name],
            "new_index": REINDEXER_TEST_INDEX,  # index created for test purposes
        }
        wrong_fields_payload = {
            "description": "TestWrongField",
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
            "fields": ['12345'],  # nonexistent field path
        }
        wrong_indices_payload = {
            "description": "TestWrongIndex",
            "indices": ["Wrong_Index"],
            "new_index": TEST_INDEX_REINDEX,
        }
        pick_fields_payload = {
            "description": "TestManyReindexerFields",
            # this has a problem with possible name duplicates
            "fields": [TEST_FIELD, 'comment_content_clean.text', 'texta_facts'],
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
        }
        # duplicate name problem?
        # if you want to actually test it, add an index to indices and project indices
        join_indices_fields_payload = {
            "description": "TestReindexerJoinFields",
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
        }
        test_query_payload = {
            "description": "TestQueryFiltering",
            "scroll_size": 100,
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
            "query": json.dumps(TEST_QUERY)
        }
        random_docs_payload = {
            "description": "TestReindexerRandomFields",
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
            "random_size": 500,
        }
        update_field_type_payload = {
            "description": "TestReindexerUpdateFieldType",
            "fields": [],
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
            # Each entry changes both the mapping type and the field name.
            "field_type": [
                {
                    "path": "comment_subject",
                    "field_type": "long",
                    "new_path_name": "CHANGED_NAME"
                },
                {
                    "path": "comment_content_lemmas",
                    "field_type": "fact",
                    "new_path_name": "CHANGED_TOO"
                },
                {
                    "path": "comment_content_clean.stats.text_length",
                    "field_type": "boolean",
                    "new_path_name": "CHANGED_AS_WELL"
                },
            ],
        }
        # NOTE(review): the loop variable reuses a constant-style name; each
        # REINDEXER_VALIDATION_TEST_INDEX_* is presumably an invalid index name — verify.
        for REINDEXER_VALIDATION_TEST_INDEX in (
                REINDEXER_VALIDATION_TEST_INDEX_1,
                REINDEXER_VALIDATION_TEST_INDEX_2,
                REINDEXER_VALIDATION_TEST_INDEX_3,
                REINDEXER_VALIDATION_TEST_INDEX_4,
                REINDEXER_VALIDATION_TEST_INDEX_5,
                REINDEXER_VALIDATION_TEST_INDEX_6):
            new_index_validation_payload = {
                "description": "TestNewIndexValidation",
                "indices": [self.test_index_name],
                "new_index": REINDEXER_VALIDATION_TEST_INDEX
            }
            url = f'{TEST_VERSION_PREFIX}/projects/{self.project.id}/elastic/reindexer/'
            self.check_new_index_validation(url, new_index_validation_payload)
        for payload in (
                existing_new_index_payload,
                wrong_indices_payload,
                wrong_fields_payload,
                pick_fields_payload,
                join_indices_fields_payload,
                test_query_payload,
                random_docs_payload,
                update_field_type_payload,
        ):
            url = f'{TEST_VERSION_PREFIX}/projects/{self.project.id}/elastic/reindexer/'
            self.run_create_reindexer_task_signal(self.project, url, payload)

    def run_create_reindexer_task_signal(self, project, url, payload, overwrite=False):
        """ Tests the endpoint for a new Reindexer task, and if a new Task gets created via the signal checks if new_index was removed """
        # Ensure a clean slate: the target index may linger from a previous payload.
        try:
            self.ec.delete_index(TEST_INDEX_REINDEX)
        except:
            print(f'{TEST_INDEX_REINDEX} was not deleted')
        response = self.client.post(url, payload, format='json')
        print_output('run_create_reindexer_task_signal:response.data', response.data)
        self.check_update_forbidden(url, payload)
        self.is_new_index_created_if_yes_remove(response, payload, project)
        self.is_reindexed_index_added_to_project_if_yes_remove(
            response, payload['new_index'], project)
        # The helpers above must have cleaned the reindexed index up again.
        assert TEST_INDEX_REINDEX not in ElasticCore().get_indices()

    def check_new_index_validation(self, url, new_index_validation_payload):
        """Expect a 400 with the `invalid_index_name` error code for a bad index name."""
        response = self.client.post(url, new_index_validation_payload, format='json')
        print_output('new_index_validation:response.data', response.data)
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertEqual(response.data["detail"].code, "invalid_index_name")

    def is_new_index_created_if_yes_remove(self, response, payload, project):
        """ Check if new_index gets created Check if new_index gets re-indexed and completed remove test new_index """
        # Invalid payloads (bad indices/fields) must come back as 400s.
        if project.get_indices() is None or response.exception:
            self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        else:
            self.assertEqual(response.status_code, status.HTTP_201_CREATED)
            created_reindexer = Reindexer.objects.get(id=response.data['id'])
            print_output("Re-index task status: ", created_reindexer.task.status)
            self.assertEqual(created_reindexer.task.status, Task.STATUS_COMPLETED)
            # self.check_positive_doc_count()
            new_index = response.data['new_index']
            # Clean up the freshly created index so the next payload starts clean.
            delete_response = self.ec.delete_index(new_index)
            print_output("Reindexer Test index remove status", delete_response)

    def is_reindexed_index_added_to_project_if_yes_remove(self, response, new_index, project):
        # project resource user is not supposed to have indices remove permission, so use admin
        self.client.login(username='******', password='******')
        url = f'{TEST_VERSION_PREFIX}/projects/{project.id}/'
        check = self.client.get(url, format='json')
        if response.status_code == 201:
            # On success the new index must be attached to the project...
            assert new_index in [
                index["name"] for index in check.data['indices']
            ]
            print_output('Re-indexed index added to project', check.data)
            index_pk = Index.objects.get(name=new_index).pk
            # ...and must be removable through the remove-indices endpoint.
            remove_index_url = reverse(
                f"{VERSION_NAMESPACE}:project-remove-indices",
                kwargs={"pk": self.project.pk})
            remove_response = self.client.post(remove_index_url,
                                               {"indices": [index_pk]},
                                               format='json')
            print_output("Re-indexed index removed from project",
                         remove_response.status_code)
            self.delete_reindexing_task(project, response)
        if response.status_code == 400:
            print_output('Re-indexed index not added to project', check.data)
            check = self.client.get(url, format='json')
            # A rejected payload must not have attached the index.
            assert new_index not in [
                index["name"] for index in check.data['indices']
            ]
        # Log in with project user again
        self.client.login(username=self.default_username,
                          password=self.default_password)

    def validate_fields(self, project, payload):
        """Return True when every payload field exists in the project's indices."""
        project_fields = self.ec.get_fields(project.get_indices())
        project_field_paths = [field["path"] for field in project_fields]
        for field in payload['fields']:
            if field not in project_field_paths:
                return False
        return True

    def validate_indices(self, project, payload):
        """Return True when every payload index belongs to the project."""
        for index in payload['indices']:
            if index not in project.get_indices():
                return False
        return True

    def check_positive_doc_count(self):
        # current reindexing tests require approx 2 seconds delay
        sleep(5)
        count_new_documents = ElasticSearcher(indices=TEST_INDEX_REINDEX).count()
        print_output("Bulk add doc count", count_new_documents)
        assert count_new_documents > 0

    def check_update_forbidden(self, url, payload):
        """The reindexer list endpoint must reject PUT and PATCH with 405."""
        put_response = self.client.put(url, payload, format='json')
        patch_response = self.client.patch(url, payload, format='json')
        print_output("put_response.data", put_response.data)
        print_output("patch_response.data", patch_response.data)
        self.assertEqual(put_response.status_code,
                         status.HTTP_405_METHOD_NOT_ALLOWED)
        self.assertEqual(patch_response.status_code,
                         status.HTTP_405_METHOD_NOT_ALLOWED)

    def delete_reindexing_task(self, project, response):
        """ test delete reindex task """
        task_url = response.data['url']
        get_response = self.client.get(task_url)
        self.assertEqual(get_response.status_code, status.HTTP_200_OK)
        delete_response = self.client.delete(task_url, format='json')
        self.assertEqual(delete_response.status_code, status.HTTP_204_NO_CONTENT)
        # After deletion the task resource must be gone.
        get_response = self.client.get(task_url)
        self.assertEqual(get_response.status_code, status.HTTP_404_NOT_FOUND)

    def test_that_changing_field_names_works(self):
        """Reindexing with a `new_path_name` must rename the field in every document."""
        payload = {
            "description": "RenameFieldName",
            "new_index": self.new_index_name,
            "fields": [TEST_FIELD],
            "field_type": [{
                "path": TEST_FIELD,
                "new_path_name": TEST_FIELD_RENAMED,
                "field_type": "text"
            }],
            "indices": [self.test_index_name],
            "add_facts_mapping": True
        }
        # Reindex the test index into a new one.
        url = reverse("v2:reindexer-list", kwargs={"project_pk": self.project.pk})
        reindex_response = self.client.post(url, data=payload, format='json')
        print_output('test_that_changing_field_names_works:response.data',
                     reindex_response.data)
        # Check that the fields have been changed.
        es = ElasticSearcher(indices=[self.new_index_name])
        for document in es:
            self.assertTrue(TEST_FIELD not in document)
            self.assertTrue(TEST_FIELD_RENAMED in document)
        # Manual clean up.
        es.core.delete_index(self.new_index_name)
def annotator_task(self, annotator_task_id):
    """Fan an Annotator task out into per-user annotation indices.

    Loads the parent Annotator object, then for every assigned user creates
    a child Annotator plus one new Elasticsearch index per source index
    (named "<index>_<user_pk>_<task_id>"), copies the selected fields into
    it and groups the children under an AnnotatorGroup. On any failure the
    task is marked failed and the exception re-raised; returns True on
    success.

    NOTE(review): presumably a Celery task with bind=True (`self` is the
    task instance) — confirm against the task registration.
    """
    annotator_obj = Annotator.objects.get(pk=annotator_task_id)
    annotator_group_children = []
    indices = annotator_obj.get_indices()
    users = [user.pk for user in annotator_obj.annotator_users.all()]
    task_object = annotator_obj.task
    annotator_fields = json.loads(annotator_obj.fields)
    # NOTE(review): `all_fields` aliases `annotator_fields` — the appends
    # below mutate both names; presumably intentional, verify.
    all_fields = annotator_fields
    all_fields.append("texta_meta.document_uuid")
    if annotator_obj.annotation_type == 'entity':
        all_fields.append("texta_facts")
        all_fields.append(texta_mlp.settings.META_KEY)  # Include MLP Meta key here so it would be pulled from Elasticsearch.
    project_obj = Project.objects.get(id=annotator_obj.project_id)
    new_field_type = get_selected_fields(indices, annotator_fields)
    field_type = add_field_type(new_field_type)
    add_facts_mapping = annotator_obj.add_facts_mapping
    scroll_size = 100
    new_indices = []
    new_annotators = []
    # Precompute the per-user target index names: one per source index.
    for user in users:
        annotating_user = User.objects.get(pk=user)
        new_annotators.append(annotating_user.pk)
        for index in indices:
            new_indices.append(f"{index}_{user}_{annotator_obj.task_id}")
    query = annotator_obj.query
    logging.getLogger(INFO_LOGGER).info(f"Starting task annotator with Task ID {annotator_obj.task_id}.")
    try:
        ec = ElasticCore()
        index_fields = ec.get_fields(indices)
        index_fields = [index_field["path"] for index_field in index_fields]
        # ElasticSearcher seems to be broken when handling scrolls with only the main field in its field_data instead of all of them in dot notation.
        # Hence this ugly hack is needed if I want to include the MLP meta field inside the output.
        for annotator_field in json.loads(annotator_obj.fields):
            for index_field in index_fields:
                # Strip the "_mlp."-suffixed part so MLP sub-fields match their base field.
                stripped_mlp_field = annotator_field.split("_mlp.")[0] if "_mlp." in annotator_field else annotator_field
                if texta_mlp.settings.META_KEY in index_field and stripped_mlp_field in index_field:
                    all_fields.append(index_field)
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)
        # NOTE(review): double-underscore name — inside a class this call is
        # name-mangled; confirm the helper resolves where this is defined.
        __add_meta_to_original_index(indices, index_fields, show_progress, query, scroll_size, ec)
        for new_annotator in new_annotators:
            # Child annotator: copies the parent's configuration, owned by one user.
            new_annotator_obj = Annotator.objects.create(
                annotator_uid=f"{annotator_obj.description}_{new_annotator}_{annotator_obj.task_id}",
                description=f"{annotator_obj.description}",
                author=annotator_obj.author,
                project=annotator_obj.project,
                total=annotator_obj.total,
                fields=annotator_obj.fields,
                add_facts_mapping=add_facts_mapping,
                annotation_type=annotator_obj.annotation_type,
                binary_configuration=annotator_obj.binary_configuration,
                multilabel_configuration=annotator_obj.multilabel_configuration,
                entity_configuration=annotator_obj.entity_configuration,
            )
            new_annotator_obj.annotator_users.add(new_annotator)
            for new_index in new_indices:
                logging.getLogger(INFO_LOGGER).info(f"New Index check {new_index} for user {new_annotator}")
                logging.getLogger(INFO_LOGGER).info(f"Index object {indices}")
                for index in indices:
                    # Only process the indices that belong to this particular user.
                    if new_index == f"{index}_{new_annotator}_{annotator_obj.task_id}":
                        elastic_search = ElasticSearcher(indices=indices, field_data=all_fields, callback_progress=show_progress, query=query, scroll_size=scroll_size)
                        elastic_doc = ElasticDocument(new_index)
                        logging.getLogger(INFO_LOGGER).info(f"Updating index schema for index {new_index}")
                        ''' the operations that don't require a mapping update have been completed '''
                        schema_input = update_field_types(indices, all_fields, field_type, flatten_doc=False)
                        updated_schema = update_mapping(schema_input, new_index, add_facts_mapping, add_texta_meta_mapping=True)
                        logging.getLogger(INFO_LOGGER).info(f"Creating new index {new_index} for user {new_annotator}")
                        # create new_index
                        create_index_res = ElasticCore().create_index(new_index, updated_schema)
                        index_model, is_created = Index.objects.get_or_create(name=new_index)
                        project_obj.indices.add(index_model)
                        # The user pk is the middle component of "<index>_<user>_<task_id>".
                        index_user = index_model.name.rsplit('_', 2)[1]
                        if str(index_user) == str(new_annotator):
                            new_annotator_obj.indices.add(index_model)
                        logging.getLogger(INFO_LOGGER).info("Indexing documents.")
                        # set new_index name as mapping name
                        bulk_add_documents(elastic_search, elastic_doc, index=new_index, chunk_size=scroll_size, flatten_doc=False)
            new_annotator_obj.save()
            annotator_group_children.append(new_annotator_obj.id)
            logging.getLogger(INFO_LOGGER).info(f"Saving new annotator object ID {new_annotator_obj.id}")
            new_annotator_obj.add_annotation_mapping(new_indices)
            new_annotator_obj.add_texta_meta_mapping(new_indices)
        # Users are now attached to child annotators; detach them from the parent.
        annotator_obj.annotator_users.clear()
        annotator_obj.save()
        annotator_group, is_created = AnnotatorGroup.objects.get_or_create(project=annotator_obj.project,
                                                                           parent=annotator_obj)
        annotator_group.children.add(*annotator_group_children)
        # declare the job done
        task_object.complete()
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
    logging.getLogger(INFO_LOGGER).info(f"Annotator with Task ID {annotator_obj.task_id} successfully completed.")
    return True