Example #1
    def tag_doc(self, request, pk=None, project_pk=None):
        serializer = RegexTaggerGroupTagDocumentSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        tagger_object: RegexTagger = self.get_object()

        input_document = serializer.validated_data["doc"]
        fields = serializer.validated_data["fields"]

        # apply tagger
        results = {
            "tagger_id": tagger_object.pk,
            "tag": tagger_object.description,
            "result": False,
            "matches": []
        }
        # Flatten the document once; it does not change between fields.
        flattened_doc = ElasticCore(check_connection=False).flatten(input_document)

        final_matches = []
        for field in fields:
            text = flattened_doc.get(field, None)
            matches = tagger_object.match_texts([text], as_texta_facts=False)

            if matches:
                for match in matches:
                    match.update(field=field)
                final_matches.extend(matches)
                results["result"] = True

        results["matches"] = final_matches

        return Response(results, status=status.HTTP_200_OK)
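
A minimal sketch of exercising this endpoint through the DRF test client, with the payload keys "doc" and "fields" taken from the serializer above (the URL name and all identifiers are hypothetical):

    url = reverse("v2:regex_tagger-tag-doc", kwargs={"project_pk": project.pk, "pk": tagger.pk})
    response = self.client.post(url, {
        "doc": {"text": "some raw document text"},
        "fields": ["text"]
    }, format="json")
    # Expected response shape: {"tagger_id": ..., "tag": ..., "result": <bool>, "matches": [...]}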
Example #2
 def close_index(self, request, pk=None, project_pk=None):
     es_core = ElasticCore()
     index = Index.objects.get(pk=pk)
     es_core.close_index(index.name)
     index.is_open = False
     index.save()
     return Response({"message": f"Closed the index {index.name}"})
Example #3
 def tearDown(self) -> None:
     Tagger.objects.all().delete()
     ec = ElasticCore()
     res = ec.delete_index(self.test_index_copy)
     ec.delete_index(index=self.test_index_name, ignore=[400, 404])
     print_output(f"Delete apply_taggers test index {self.test_index_copy}",
                  res)
Example #4
    def list(self, request, *args, **kwargs):
        ec = ElasticCore()

        response = super(IndexViewSet, self).list(request, *args, **kwargs)

        data = response.data  # Get the paginated and sorted queryset results.
        open_indices = [index for index in data if index["is_open"]]
        mappings = ec.es.indices.get_mapping()

        # Doing a stats request with no indices causes trouble.
        if open_indices:
            stats = ec.get_index_stats()

            # Update the paginated and sorted queryset results.
            for index in response.data:
                name = index["name"]
                is_open = index["is_open"]
                if is_open:
                    has_texta_facts_mapping = self._check_for_facts(
                        index_mappings=mappings, index_name=name)
                    if name in stats:
                        index.update(
                            **stats[name],
                            has_validated_facts=has_texta_facts_mapping)
                    else:
                        index.update(has_validated_facts=False)
                else:
                    # For the sake of courtesy on the front-end, make closed indices values zero.
                    index.update(size=0,
                                 doc_count=0,
                                 has_validated_facts=False)

        return response
Example #5
def get_field_choices():
    es = ElasticCore()
    if es.connection:
        return [(a, '{0} - {1}'.format(a['index'], a['path']))
                for a in es.get_fields()]
    else:
        return []
Example #6
 def tearDown(self) -> None:
     ec = ElasticCore()
     ec.delete_index(index=self.test_index_name, ignore=[400, 404])
     print_output(f"Delete [Rakun Extractor] test index {self.test_index_name}", None)
     Embedding.objects.all().delete()
     print_output("Delete Rakun FASTTEXT Embeddings", None)
Example #7
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index."""
    # Fetch the tagger before the try block so it is always bound in the except clause.
    tagger_object = TorchTaggerObject.objects.get(pk=object_id)
    try:
        tagger = tagger_object.load_tagger()

        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
Example #8
    def setUp(self):
        self.test_index_name = reindex_test_dataset()
        self.user = create_test_user('first_user', '*****@*****.**', 'pw')
        self.project = project_creation("DocumentImporterAPI",
                                        self.test_index_name, self.user)

        self.validation_project = project_creation("validation_project",
                                                   "random_index_name",
                                                   self.user)

        self.document_id = random.randint(10000000, 90000000)
        self.uuid = uuid.uuid1()
        self.source = {"hello": "world", "uuid": self.uuid}
        self.document = {
            "_index": self.test_index_name,
            "_id": self.document_id,
            "_source": self.source
        }

        self.target_field_random_key = uuid.uuid1()
        self.target_field = f"{self.target_field_random_key}_court_case"
        self.ec = ElasticCore()

        self.client.login(username='******', password='******')
        self._check_inserting_documents()
Example #9
def skip_for_es6():
    """Returns True when the Elasticsearch major version is not 7, i.e. the test should be skipped."""
    ec = ElasticCore()
    major, minor, patch = ec.get_version()
    return major != 7
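
A sketch of how such a helper is typically consumed in the test suite (the decorator placement and skip message are illustrative):

    import unittest

    @unittest.skipIf(skip_for_es6(), "Only runs against Elasticsearch 7.")
    def test_es7_specific_behaviour(self):
        ...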
Example #10
    def test_project_creation_using_locked_indices(self):
        """
        There was a bug case that warrants this test case.
        In case any of the indices in Elasticsearch are locked for whatever reason,
        then project creation fails because it contains/contained  lazy index creation
        which ignores existing indices and thus created FORBIDDEN_ACCESS errors in Elasticsearch.
        """
        index_names = ["locked_index_{}".format(i) for i in range(1, 5)]
        for index in index_names:
            self.__create_locked_index(index)

        response = self.client.post(
            reverse(f"{VERSION_NAMESPACE}:project-list"),
            format="json",
            data={
                "title": "faulty_project",
                "indices_write": index_names,
                "users_write": [self.admin.username]
            })

        ec = ElasticCore()
        for index in index_names:
            ec.delete_index(index)

        self.assertTrue(response.status_code == status.HTTP_201_CREATED)
Example #11
 def destroy(self, request, pk=None, **kwargs):
     with transaction.atomic():
         index_name = Index.objects.get(pk=pk).name
         es = ElasticCore()
         es.delete_index(index_name)
         Index.objects.filter(pk=pk).delete()
         return Response(
             {"message": f"Deleted index {index_name} from Elasticsearch!"})
Example #12
 def update_project_indices(self, serializer, project_obj):
     """Add the new_index included in the request to the relevant project object."""
     index_to_add = serializer.validated_data['new_index']
     from texta_elastic.core import ElasticCore
     ec = ElasticCore()
     ec.create_index(index_to_add)
     index, is_created = Index.objects.get_or_create(name=index_to_add)
     project_obj.indices.add(index)
     project_obj.save()
Example #13
    def open_index(self, request, pk=None, project_pk=None):
        es_core = ElasticCore()
        index = Index.objects.get(pk=pk)
        es_core.open_index(index.name)
        if not index.is_open:
            index.is_open = True
            index.save()

        return Response({"message": f"Opened the index {index.name}"})
Example #14
 def _flatten_document(self, document):
     ec = ElasticCore()
     source = document.get("_source")
     # Skip the annotator meta when flattening and then attach it back.
     annotator_meta = source.pop(TEXTA_ANNOTATOR_KEY)
     flattened_source = ec.flatten(source)
     flattened_source[TEXTA_ANNOTATOR_KEY] = annotator_meta
     document["_source"] = flattened_source
     return document
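
For context, a minimal sketch of the flattening this relies on, assuming ElasticCore.flatten produces dot-notation keys (the flattened_doc.get(field) lookups elsewhere in these examples suggest as much; the sample document is illustrative):

    ec = ElasticCore()
    flat = ec.flatten({"comment": {"text": "hello", "meta": {"lang": "en"}}})
    # Presumably yields something like:
    # {"comment.text": "hello", "comment.meta.lang": "en"}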
Example #15
 def setUp(self):
     self.test_index_name = reindex_test_dataset()
     self.ec = ElasticCore()
     self.user = create_test_user('mlpUser', '*****@*****.**', 'pw')
     self.project = project_creation("mlpTestProject", self.test_index_name,
                                     self.user)
     self.project.users.add(self.user)
     self.client.login(username='******', password='******')
     self.url = reverse(f"{VERSION_NAMESPACE}:mlp_index-list",
                        kwargs={"project_pk": self.project.pk})
Example #16
def get_field_choices():
    """
    Retrieves field options from ES.
    """
    es = ElasticCore()
    if es.connection:
        return [(a, '{0} - {1}'.format(a['index'], a['path']))
                for a in es.get_fields()]
    else:
        return []
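
A sketch of consuming these choices; each tuple pairs the raw field descriptor with an "index - path" label (the printed values are illustrative):

    for value, label in get_field_choices():
        print(label)  # e.g. "my_index - comment.text"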
Example #17
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch."""
        # get tagger object
        tagger_object: RegexTaggerGroup = self.get_object()

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        project_object = Project.objects.get(pk=project_pk)
        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = project_object.get_available_or_all_project_indices(indices)

        # retrieve tagger fields
        fields = serializer.validated_data["fields"]
        if not ElasticCore().check_if_indices_exist(
                tagger_object.project.get_indices()):
            return Response(
                {
                    'error':
                    f'One or more indices from {list(tagger_object.project.get_indices())} do not exist'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]
        flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

        # apply tagger
        results = {
            "tagger_group_id": tagger_object.pk,
            "tagger_group_tag": tagger_object.description,
            "result": False,
            "matches": [],
            "document": flattened_doc
        }

        final_matches = []
        for field in fields:
            text = flattened_doc.get(field, None)
            results["document"][field] = text
            matches = tagger_object.match_texts([text],
                                                as_texta_facts=True,
                                                field=field)

            if matches:
                final_matches.extend(matches)
                results["result"] = True

        results["matches"] = final_matches

        return Response(results, status=status.HTTP_200_OK)
Example #18
def check_for_existence(value):
    ec = ElasticCore()
    index = parse_index_input(value)
    in_elastic = ec.check_if_indices_exist(indices=[index])
    if in_elastic:
        # This line helps keep the database and Elastic in sync.
        index, is_created = Index.objects.get_or_create(name=index)
    else:
        # We check for a loose Index object just in case and delete it.
        Index.objects.filter(name=index).delete()
        raise NoIndexExists(f"Could not access index '{index}'")
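
A sketch of how a validator like this is typically attached in a DRF serializer (the serializer and field names are hypothetical):

    class IndexInputSerializer(serializers.Serializer):
        # check_for_existence raises NoIndexExists for unknown indices.
        index = serializers.CharField(validators=[check_for_existence])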
Example #19
    def add_texta_meta_mapping(indices: List[str]):
        """
        Adds the mapping for texta_meta.
        :param indices: Which indices to target for the schemas.
        :return:
        """
        from texta_elastic.core import ElasticCore

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_meta_mapping(index)
Example #20
    def add_annotation_mapping(indices: List[str]):
        """
        Adds the mapping for the annotator into indices to ensure smooth sailing.
        :param indices: Which indices to target for the schemas.
        :return:
        """
        from texta_elastic.core import ElasticCore

        ec = ElasticCore()
        for index in indices:
            ec.add_annotator_mapping(index)
Example #21
    def setUp(self) -> None:
        self.client.login(username="******", password="******")
        self.ec = ElasticCore()
        self.ids = []
        self.index_names = [
            "test_for_index_endpoint_1", "test_for_index_endpoint_2"
        ]

        for index_name in self.index_names:
            index, is_created = Index.objects.get_or_create(name=index_name)
            self.ec.es.indices.create(index=index_name, ignore=[400, 404])
            self.ids.append(index.pk)
Example #22
 def add_indices(self, request, pk=None, project_pk=None):
     project: Project = self.get_object()
     serializer = self.get_serializer(data=request.data)
     serializer.is_valid(raise_exception=True)
     indices = [index.name for index in serializer.validated_data["indices"]]
     ec = ElasticCore()
     exists = ec.check_if_indices_exist(indices)
     if exists and indices:
         for index_name in indices:
             index, is_created = Index.objects.get_or_create(name=index_name)
             project.indices.add(index)
         return Response({"detail": f"Added indices '{str(indices)}' to the project!"})
     else:
         raise ValidationError(f"Could not validate indices '{str(indices)}'")
Example #23
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # retrieve fields
        field_data = fields

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size)
        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
Example #24
 def __init__(self,
              project_pk,
              model_object=None,
              text_processor=None,
              callback_progress=None,
              prediction_to_match=None,
              es_prefix=get_core_setting("TEXTA_ES_PREFIX"),
              deploy_key=getattr(settings, "DEPLOY_KEY")):
     self.es_core = ElasticCore()
     self.project_pk = project_pk
     self.feedback_index = f"{es_prefix}texta-{deploy_key}-feedback-project-{project_pk}"
     self.model_object = model_object
     self.es_doc, self.es_search, self.query = self._initialize_es(
         project_pk, text_processor, callback_progress, prediction_to_match)
Example #25
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
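
For orientation, a rough sketch of the action dictionaries such update generators presumably yield for bulk_update, assuming elasticsearch-py style bulk actions (the exact shape depends on the update_*_generator helpers, which are not shown here; all values are illustrative):

    action = {
        "_op_type": "update",
        "_index": "my_index",
        "_id": "some_document_id",
        "doc": {"texta_facts": [...]}  # partial document carrying the new facts
    }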
Example #26
    def setUp(self) -> None:
        self.test_index_name = reindex_test_dataset()
        self.user = create_test_user('Owner', '*****@*****.**', 'pw')
        self.unauthorized_user = create_test_user('unauthorized', '*****@*****.**', 'pw')
        self.file_name = "d41d8cd98f00b204e9800998ecf8427e.txt"

        self.project = project_creation("test_doc_parser", index_title=None, author=self.user)
        self.project.users.add(self.user)
        self.unauth_project = project_creation("unauth_project", index_title=None, author=self.user)

        self.file = SimpleUploadedFile("text.txt", b"file_content", content_type="text/html")
        self.client.login(username='******', password='******')
        self._basic_pipeline_functionality()
        self.file_path = self._get_file_path()
        self.ec = ElasticCore()
Example #27
    def check_and_create(indices: str):
        from texta_elastic.core import ElasticCore
        ec = ElasticCore()

        # Accept either a list of index names or a comma-separated string.
        if isinstance(indices, str):
            indices = indices.split(",")

        for index in indices:
            does_exist = ec.check_if_indices_exist([index])
            if does_exist:
                Index.objects.get_or_create(name=index)
            else:
                Index.objects.filter(name=index).delete()
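
A usage sketch; both input forms should behave the same under the branch above (the class it hangs off and the index names are illustrative):

    SomeModel.check_and_create("index_a,index_b")
    SomeModel.check_and_create(["index_a", "index_b"])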
Example #28
    def create(self, request, **kwargs):
        data = IndexSerializer(data=request.data)
        data.is_valid(raise_exception=True)

        es = ElasticCore()
        index_name = data.validated_data["name"]
        is_open = data.validated_data["is_open"]
        description = data.validated_data["description"]
        added_by = data.validated_data["added_by"]
        test = data.validated_data["test"]
        source = data.validated_data["source"]
        client = data.validated_data["client"]
        domain = data.validated_data["domain"]

        # Using get_or_create to avoid unique name constraints on creation.
        if es.check_if_indices_exist([index_name]):
            # Even if the index already exists, create the index object just in case.
            index, is_created = Index.objects.get_or_create(name=index_name)

            if is_created:
                utc_time = es.get_index_creation_date(index_name)
                index.is_open = is_open
                index.description = description
                index.added_by = added_by
                index.test = test
                index.source = source
                index.client = client
                index.domain = domain
                index.created_at = utc_time
            index.save()
            raise ElasticIndexAlreadyExists()

        else:
            es.create_index(index=index_name)
            if not is_open:
                es.close_index(index_name)

            index, is_created = Index.objects.get_or_create(name=index_name)
            if is_created:
                utc_time = es.get_index_creation_date(index_name)
                index.is_open = is_open
                index.description = description
                index.added_by = added_by
                index.test = test
                index.source = source
                index.client = client
                index.domain = domain
                index.created_at = utc_time
            index.save()

            return Response(
                {"message": f"Added index {index_name} into Elasticsearch!"},
                status=status.HTTP_201_CREATED)
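
A sketch of a request body this create view would accept, based on the serializer fields read above (the URL name and all values are illustrative):

    payload = {
        "name": "my_new_index",
        "is_open": True,
        "description": "Scratch index for experiments",
        "added_by": "admin",
        "test": False,
        "source": "crawler",
        "client": "internal",
        "domain": "news"
    }
    response = self.client.post(reverse(f"{VERSION_NAMESPACE}:index-list"), payload, format="json")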
Example #29
    def setUp(self):
        # Owner of the project
        self.test_index_name = reindex_test_dataset()
        self.secondary_index = reindex_test_dataset()
        self.index, is_created = Index.objects.get_or_create(
            name=self.secondary_index)
        self.user = create_test_user('annotator', '*****@*****.**', 'pw')
        self.user2 = create_test_user('annotator2', '*****@*****.**', 'pw2')
        self.project = project_creation("entityTestProject",
                                        self.test_index_name, self.user)
        self.project.indices.add(self.index)
        self.project.users.add(self.user)
        self.project.users.add(self.user2)

        self.client.login(username='******', password='******')
        self.ec = ElasticCore()

        self.list_view_url = reverse("v2:annotator-list",
                                     kwargs={"project_pk": self.project.pk})
        self.annotator = self._create_annotator()
        self.pull_document_url = reverse("v2:annotator-pull-document",
                                         kwargs={
                                             "project_pk": self.project.pk,
                                             "pk": self.annotator["id"]
                                         })

    def test_applying_the_regex_tagger_group_to_the_index(self):
        ec = ElasticCore()
        tg_description = "toxic"
        tg_id, tagger_ids = self.__create_tagger_group(
            tg_description, (("racism", ["juut", "neeger", "tõmmu"]),
                             ("hate", ["pederast", "debiilik"])))
        url = reverse(
            f"{VERSION_NAMESPACE}:regex_tagger_group-apply-tagger-group",
            kwargs={
                "project_pk": self.project.pk,
                "pk": tg_id
            })
        response = self.client.post(
            url, {
                "description": "Test Run",
                "fields": [TEST_FIELD],
                "indices": [{
                    "name": self.test_index_name
                }]
            })
        self.assertTrue(response.status_code == status.HTTP_201_CREATED)
        s = elasticsearch_dsl.Search(index=self.test_index_name, using=ec.es)
        has_group_fact = False
        for hit in s.scan():
            facts = hit.to_dict().get("texta_facts", [])
            for fact in facts:
                if fact["fact"] == tg_description:
                    has_group_fact = True

        self.assertTrue(has_group_fact)
        print_output(
            'test_applying_the_regex_tagger_group_to_the_index:response.data',
            response.data)