Exemple #1
0
def get_field_choices():
    """Build ``(field, label)`` choice pairs from the fields ElasticCore reports.

    Each label has the form ``"<index> - <path>"``. An empty list is returned
    when the ElasticCore instance has no (truthy) connection.
    """
    core = ElasticCore()
    if not core.connection:
        return []

    choices = []
    for field in core.get_fields():
        label = '{0} - {1}'.format(field['index'], field['path'])
        choices.append((field, label))
    return choices
Exemple #2
0
def get_field_choices():
    """
    Retrieves field options from ES.

    Returns a list of ``(field, "<index> - <path>")`` tuples, or an empty
    list when the ElasticCore connection is falsy.
    """
    elastic = ElasticCore()
    available_fields = elastic.get_fields() if elastic.connection else []
    return [
        (field, '{0} - {1}'.format(field['index'], field['path']))
        for field in available_fields
    ]
Exemple #3
0
class ReindexerViewTests(APITransactionTestCase):
    """API tests for the project-scoped Reindexer endpoint.

    The tests POST valid and invalid payloads to the reindexer endpoint and
    check the returned status codes, the status of the created task, and that
    the indices created along the way are removed again.
    """

    def setUp(self):
        """Create the test index, a regular user, a superuser and a project.

        The regular user owns the project and is the one logged in by default;
        the superuser is needed because removing indices from a project
        requires elevated permissions.
        """
        self.test_index_name = reindex_test_dataset()
        self.default_password = '******'
        self.default_username = '******'
        self.user = create_test_user(self.default_username, '*****@*****.**',
                                     self.default_password)

        # create admin to test indices removal from project
        self.admin = create_test_user(name='admin', password='******')
        self.admin.is_superuser = True
        self.admin.save()
        self.project = project_creation("ReindexerTestProject",
                                        self.test_index_name, self.user)
        self.project.users.add(self.user)
        self.ec = ElasticCore()
        self.client.login(username=self.default_username,
                          password=self.default_password)

        self.new_index_name = f"{TEST_FIELD}_2"

    def tearDown(self) -> None:
        """Delete the source test index, ignoring 400/404 responses."""
        self.ec.delete_index(index=self.test_index_name, ignore=[400, 404])

    def test_run(self):
        """Run the index-name validation checks and every reindexing payload."""
        existing_new_index_payload = {
            "description": "TestWrongField",
            "indices": [self.test_index_name],
            "new_index":
            REINDEXER_TEST_INDEX,  # index created for test purposes
        }
        wrong_fields_payload = {
            "description": "TestWrongField",
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
            "fields": ['12345'],
        }
        wrong_indices_payload = {
            "description": "TestWrongIndex",
            "indices": ["Wrong_Index"],
            "new_index": TEST_INDEX_REINDEX,
        }
        pick_fields_payload = {
            "description": "TestManyReindexerFields",
            # this has a problem with possible name duplicates
            "fields":
            [TEST_FIELD, 'comment_content_clean.text', 'texta_facts'],
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
        }
        # duplicate name problem?
        # if you want to actually test it, add an index to indices and project indices
        join_indices_fields_payload = {
            "description": "TestReindexerJoinFields",
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
        }
        test_query_payload = {
            "description": "TestQueryFiltering",
            "scroll_size": 100,
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
            "query": json.dumps(TEST_QUERY)
        }
        random_docs_payload = {
            "description": "TestReindexerRandomFields",
            "indices": [self.test_index_name],
            "new_index": TEST_INDEX_REINDEX,
            "random_size": 500,
        }

        update_field_type_payload = {
            "description":
            "TestReindexerUpdateFieldType",
            "fields": [],
            "indices": [self.test_index_name],
            "new_index":
            TEST_INDEX_REINDEX,
            "field_type": [
                {
                    "path": "comment_subject",
                    "field_type": "long",
                    "new_path_name": "CHANGED_NAME"
                },
                {
                    "path": "comment_content_lemmas",
                    "field_type": "fact",
                    "new_path_name": "CHANGED_TOO"
                },
                {
                    "path": "comment_content_clean.stats.text_length",
                    "field_type": "boolean",
                    "new_path_name": "CHANGED_AS_WELL"
                },
            ],
        }

        # The endpoint URL only depends on the project, so build it once.
        url = f'{TEST_VERSION_PREFIX}/projects/{self.project.id}/elastic/reindexer/'

        # Every one of these index names is expected to be rejected.
        for invalid_index_name in (
                REINDEXER_VALIDATION_TEST_INDEX_1,
                REINDEXER_VALIDATION_TEST_INDEX_2,
                REINDEXER_VALIDATION_TEST_INDEX_3,
                REINDEXER_VALIDATION_TEST_INDEX_4,
                REINDEXER_VALIDATION_TEST_INDEX_5,
                REINDEXER_VALIDATION_TEST_INDEX_6):
            new_index_validation_payload = {
                "description": "TestNewIndexValidation",
                "indices": [self.test_index_name],
                "new_index": invalid_index_name
            }
            self.check_new_index_validation(url, new_index_validation_payload)

        for payload in (
                existing_new_index_payload,
                wrong_indices_payload,
                wrong_fields_payload,
                pick_fields_payload,
                join_indices_fields_payload,
                test_query_payload,
                random_docs_payload,
                update_field_type_payload,
        ):
            self.run_create_reindexer_task_signal(self.project, url, payload)

    def run_create_reindexer_task_signal(self,
                                         project,
                                         url,
                                         payload,
                                         overwrite=False):
        """ Tests the endpoint for a new Reindexer task, and if a new Task gets created via the signal
           checks if new_index was removed

        ``overwrite`` is accepted for backward compatibility and is not used.
        """
        try:
            self.ec.delete_index(TEST_INDEX_REINDEX)
        except Exception:
            # Best-effort pre-clean: the index may simply not exist yet.
            print(f'{TEST_INDEX_REINDEX} was not deleted')
        response = self.client.post(url, payload, format='json')
        print_output('run_create_reindexer_task_signal:response.data',
                     response.data)
        self.check_update_forbidden(url, payload)
        self.is_new_index_created_if_yes_remove(response, payload, project)
        self.is_reindexed_index_added_to_project_if_yes_remove(
            response, payload['new_index'], project)
        self.assertNotIn(TEST_INDEX_REINDEX, ElasticCore().get_indices())

    def check_new_index_validation(self, url, new_index_validation_payload):
        """Assert that the payload is rejected with an ``invalid_index_name`` error."""
        response = self.client.post(url,
                                    new_index_validation_payload,
                                    format='json')
        print_output('new_index_validation:response.data', response.data)
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertEqual(response.data["detail"].code, "invalid_index_name")

    def is_new_index_created_if_yes_remove(self, response, payload, project):
        """ Check if new_index gets created
            Check if new_index gets re-indexed and completed
            remove test new_index """
        if project.get_indices() is None or response.exception:
            self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        else:
            self.assertEqual(response.status_code, status.HTTP_201_CREATED)
            created_reindexer = Reindexer.objects.get(id=response.data['id'])
            print_output("Re-index task status: ",
                         created_reindexer.task.status)
            self.assertEqual(created_reindexer.task.status,
                             Task.STATUS_COMPLETED)
            # self.check_positive_doc_count()
            new_index = response.data['new_index']
            delete_response = self.ec.delete_index(new_index)
            print_output("Reindexer Test index remove status", delete_response)

    def is_reindexed_index_added_to_project_if_yes_remove(
            self, response, new_index, project):
        """Check project membership of ``new_index`` and detach it again.

        On a 201 response the new index must be listed on the project; it is
        then removed from the project and the reindexing task is deleted. In
        every case the index must not be listed on the project afterwards.
        """
        # project resource user is not supposed to have indices remove permission, so use admin
        # NOTE(review): these credentials are expected to match the admin
        # created in setUp - confirm.
        self.client.login(username='******', password='******')
        url = f'{TEST_VERSION_PREFIX}/projects/{project.id}/'
        check = self.client.get(url, format='json')
        if response.status_code == status.HTTP_201_CREATED:
            project_index_names = [
                index["name"] for index in check.data['indices']
            ]
            self.assertIn(new_index, project_index_names)
            print_output('Re-indexed index added to project', check.data)
            index_pk = Index.objects.get(name=new_index).pk
            remove_index_url = reverse(
                f"{VERSION_NAMESPACE}:project-remove-indices",
                kwargs={"pk": project.pk})
            remove_response = self.client.post(remove_index_url,
                                               {"indices": [index_pk]},
                                               format='json')
            print_output("Re-indexed index removed from project",
                         remove_response.status_code)
            self.delete_reindexing_task(project, response)
        elif response.status_code == status.HTTP_400_BAD_REQUEST:
            print_output('Re-indexed index not added to project', check.data)

        # Re-fetch the project: the index must be gone in every scenario.
        check = self.client.get(url, format='json')
        remaining_index_names = [
            index["name"] for index in check.data['indices']
        ]
        self.assertNotIn(new_index, remaining_index_names)
        # Log in with project user again
        self.client.login(username=self.default_username,
                          password=self.default_password)

    def validate_fields(self, project, payload):
        """Return True when every payload field is a field path of the project's indices."""
        project_fields = self.ec.get_fields(project.get_indices())
        project_field_paths = [field["path"] for field in project_fields]
        return all(field in project_field_paths for field in payload['fields'])

    def validate_indices(self, project, payload):
        """Return True when every payload index belongs to the project."""
        payload_indices = payload['indices']
        if not payload_indices:
            return True
        # Fetch the project's indices once instead of once per payload index.
        project_indices = project.get_indices()
        return all(index in project_indices for index in payload_indices)

    def check_positive_doc_count(self):
        """Assert that the reindexed test index holds at least one document."""
        # Wait before counting so the reindexed documents have time to show up.
        sleep(5)
        count_new_documents = ElasticSearcher(
            indices=TEST_INDEX_REINDEX).count()
        print_output("Bulk add doc count", count_new_documents)
        self.assertGreater(count_new_documents, 0)

    def check_update_forbidden(self, url, payload):
        """Assert that PUT and PATCH on the endpoint answer 405."""
        put_response = self.client.put(url, payload, format='json')
        patch_response = self.client.patch(url, payload, format='json')
        print_output("put_response.data", put_response.data)
        print_output("patch_response.data", patch_response.data)
        self.assertEqual(put_response.status_code,
                         status.HTTP_405_METHOD_NOT_ALLOWED)
        self.assertEqual(patch_response.status_code,
                         status.HTTP_405_METHOD_NOT_ALLOWED)

    def delete_reindexing_task(self, project, response):
        """ test delete reindex task """
        task_url = response.data['url']
        get_response = self.client.get(task_url)
        self.assertEqual(get_response.status_code, status.HTTP_200_OK)
        delete_response = self.client.delete(task_url, format='json')
        self.assertEqual(delete_response.status_code,
                         status.HTTP_204_NO_CONTENT)
        get_response = self.client.get(task_url)
        self.assertEqual(get_response.status_code, status.HTTP_404_NOT_FOUND)

    def test_that_changing_field_names_works(self):
        """Reindex with a renamed field and check the documents carry the new name."""
        payload = {
            "description":
            "RenameFieldName",
            "new_index":
            self.new_index_name,
            "fields": [TEST_FIELD],
            "field_type": [{
                "path": TEST_FIELD,
                "new_path_name": TEST_FIELD_RENAMED,
                "field_type": "text"
            }],
            "indices": [self.test_index_name],
            "add_facts_mapping":
            True
        }

        # Register the clean-up first so the new index is removed even when an
        # assertion below fails; 400/404 are ignored in case it never existed.
        self.addCleanup(self.ec.delete_index,
                        index=self.new_index_name,
                        ignore=[400, 404])

        # Reindex the test index into a new one.
        url = reverse("v2:reindexer-list",
                      kwargs={"project_pk": self.project.pk})
        reindex_response = self.client.post(url, data=payload, format='json')
        print_output('test_that_changing_field_names_works:response.data',
                     reindex_response.data)
        self.assertEqual(reindex_response.status_code,
                         status.HTTP_201_CREATED)

        # Check that the fields have been changed.
        es = ElasticSearcher(indices=[self.new_index_name])
        for document in es:
            self.assertNotIn(TEST_FIELD, document)
            self.assertIn(TEST_FIELD_RENAMED, document)
Exemple #4
0
def annotator_task(self, annotator_task_id):
    """Split a parent Annotator into one child Annotator per annotating user.

    For every user attached to the parent annotator and every source index, a
    new index named ``{index}_{user_pk}_{task_id}`` is created, added to the
    project and filled with the documents matching the annotator's query. One
    child ``Annotator`` is created per user and all children are attached to
    an ``AnnotatorGroup`` whose parent is the original annotator.

    Args:
        self: first positional argument, unused in the body (presumably the
            bound task instance; the decorator is not visible here).
        annotator_task_id: primary key of the parent ``Annotator``.

    Returns:
        True once the task has completed.

    Raises:
        ValueError: when the parent annotator has no annotating users, so no
            child annotator could be created.
        Exception: any error raised while processing is logged, recorded on
            the task, the task is marked as failed and the error is re-raised.
    """
    info_logger = logging.getLogger(INFO_LOGGER)

    annotator_obj = Annotator.objects.get(pk=annotator_task_id)
    annotator_group_children = []

    indices = annotator_obj.get_indices()
    users = [user.pk for user in annotator_obj.annotator_users.all()]

    task_object = annotator_obj.task
    annotator_fields = json.loads(annotator_obj.fields)
    # NOTE(review): all_fields is the same list object as annotator_fields, so
    # the appends below are also visible to get_selected_fields(). This is
    # kept as-is because changing it would alter the generated field types.
    all_fields = annotator_fields
    all_fields.append("texta_meta.document_uuid")

    if annotator_obj.annotation_type == 'entity':
        all_fields.append("texta_facts")
        all_fields.append(texta_mlp.settings.META_KEY)  # Include MLP Meta key here so it would be pulled from Elasticsearch.

    project_obj = Project.objects.get(id=annotator_obj.project_id)
    new_field_type = get_selected_fields(indices, annotator_fields)
    field_type = add_field_type(new_field_type)
    add_facts_mapping = annotator_obj.add_facts_mapping
    scroll_size = 100

    # The user pks were just read from the annotator, so they are used
    # directly instead of re-fetching every user only to read its pk again.
    new_annotators = list(users)
    new_indices = [
        f"{index}_{user}_{annotator_obj.task_id}"
        for user in users
        for index in indices
    ]

    query = annotator_obj.query

    info_logger.info(f"Starting task annotator with Task ID {annotator_obj.task_id}.")

    try:
        ec = ElasticCore()
        index_fields = ec.get_fields(indices)
        index_fields = [index_field["path"] for index_field in index_fields]

        # ElasticSearcher seems to be broken when handling scrolls with only the main field in its field_data instead of all of them in dot notation.
        # Hence this ugly hack is needed if I want to include the MLP meta field inside the output.
        # The fields are re-parsed here because annotator_fields was extended above.
        for annotator_field in json.loads(annotator_obj.fields):
            stripped_mlp_field = annotator_field.split("_mlp.")[0] if "_mlp." in annotator_field else annotator_field
            for index_field in index_fields:
                if texta_mlp.settings.META_KEY in index_field and stripped_mlp_field in index_field:
                    all_fields.append(index_field)

        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step("scrolling data")
        show_progress.update_view(0)

        __add_meta_to_original_index(indices, index_fields, show_progress, query, scroll_size, ec)

        # Stays None when there are no annotating users; checked after the loop.
        new_annotator_obj = None

        for new_annotator in new_annotators:
            new_annotator_obj = Annotator.objects.create(
                annotator_uid=f"{annotator_obj.description}_{new_annotator}_{annotator_obj.task_id}",
                description=f"{annotator_obj.description}",
                author=annotator_obj.author,
                project=annotator_obj.project,
                total=annotator_obj.total,
                fields=annotator_obj.fields,
                add_facts_mapping=add_facts_mapping,
                annotation_type=annotator_obj.annotation_type,
                binary_configuration=annotator_obj.binary_configuration,
                multilabel_configuration=annotator_obj.multilabel_configuration,
                entity_configuration=annotator_obj.entity_configuration,
            )
            new_annotator_obj.annotator_users.add(new_annotator)
            for new_index in new_indices:
                info_logger.info(f"New Index check {new_index} for user {new_annotator}")
                info_logger.info(f"Index object {indices}")

                for index in indices:
                    # Only handle the indices that belong to the current user.
                    if new_index != f"{index}_{new_annotator}_{annotator_obj.task_id}":
                        continue

                    elastic_search = ElasticSearcher(indices=indices, field_data=all_fields, callback_progress=show_progress, query=query, scroll_size=scroll_size)
                    elastic_doc = ElasticDocument(new_index)

                    info_logger.info(f"Updating index schema for index {new_index}")
                    # the operations that don't require a mapping update have been completed
                    schema_input = update_field_types(indices, all_fields, field_type, flatten_doc=False)
                    updated_schema = update_mapping(schema_input, new_index, add_facts_mapping, add_texta_meta_mapping=True)

                    info_logger.info(f"Creating new index {new_index} for user {new_annotator}")
                    # create new_index
                    ElasticCore().create_index(new_index, updated_schema)

                    index_model, _ = Index.objects.get_or_create(name=new_index)
                    project_obj.indices.add(index_model)
                    # The user pk is the second-to-last "_"-separated part of the index name.
                    index_user = index_model.name.rsplit('_', 2)[1]
                    if str(index_user) == str(new_annotator):
                        new_annotator_obj.indices.add(index_model)

                    info_logger.info("Indexing documents.")
                    # set new_index name as mapping name
                    bulk_add_documents(elastic_search, elastic_doc, index=new_index, chunk_size=scroll_size, flatten_doc=False)

            new_annotator_obj.save()
            annotator_group_children.append(new_annotator_obj.id)
            info_logger.info(f"Saving new annotator object ID {new_annotator_obj.id}")

        if new_annotator_obj is None:
            # Previously this surfaced as an opaque UnboundLocalError on the
            # mapping calls below; fail with an explicit message instead.
            raise ValueError(
                f"Annotator with Task ID {annotator_obj.task_id} has no annotating users; "
                "no child annotators were created."
            )

        new_annotator_obj.add_annotation_mapping(new_indices)
        new_annotator_obj.add_texta_meta_mapping(new_indices)

        annotator_obj.annotator_users.clear()
        annotator_obj.save()

        annotator_group, _ = AnnotatorGroup.objects.get_or_create(project=annotator_obj.project, parent=annotator_obj)
        annotator_group.children.add(*annotator_group_children)

        # declare the job done
        task_object.complete()

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        # Bare raise re-raises the active exception with its original traceback.
        raise

    info_logger.info(f"Annotator with Task ID {annotator_obj.task_id} successfully completed.")
    return True