Example #1
 def _get_sample_document(self, id_field: str, id_value: str, index: str):
     query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
     es = ElasticSearcher(query=query, output=ElasticSearcher.OUT_RAW)
     ed = ElasticDocument(index=index)
     response = es.search()["hits"]["hits"]
     document = response[0] if response else None
     return ed, document
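A minimal usage sketch for the helper above (assuming it lives on a class with Elasticsearch access; the field name, ID value, and index are illustrative placeholders, and an update(index, doc_id, doc) helper is assumed on ElasticDocument):

# Hypothetical call; "uuid", "123" and "texts" are placeholders.
ed, document = self._get_sample_document(id_field="uuid", id_value="123", index="texts")
if document:
    # assumes ElasticDocument exposes an update(index, doc_id, doc) helper
    ed.update(index=document["_index"], doc_id=document["_id"], doc={"reviewed": True})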
Example #2
    def more_like_cluster(self, request, *args, **kwargs):
        serializer = ElasticMoreLikeThisSerializer(data=request.data,
                                                   partial=True)
        serializer.is_valid(raise_exception=True)

        clustering_pk = ClusterViewSet.__handle_clustering_pk(kwargs)
        clustering_object = ClusteringResult.objects.get(pk=clustering_pk)
        cluster = Cluster.objects.get(pk=kwargs["pk"])
        indices = clustering_object.get_indices()
        doc_ids = json.loads(cluster.document_ids)
        ignored_ids = json.loads(clustering_object.ignored_ids)

        fields = json.loads(clustering_object.fields)
        document_ids = [{"_id": doc_id} for doc_id in doc_ids]

        serializer.validated_data.pop("indices", None)
        serializer.validated_data.pop("like", None)
        serializer.validated_data.pop("fields", None)

        es = ElasticSearcher(indices=indices)
        result = es.more_like_this(indices=indices,
                                   mlt_fields=fields,
                                   like=document_ids,
                                   exclude=ignored_ids,
                                   flatten=True,
                                   **serializer.validated_data)

        return Response(result, status=status.HTTP_200_OK)
Example #3
def apply_tagger_to_index(object_id: int, indices: List[str], fields: List[str], fact_name: str, fact_value: str, query: dict, bulk_size: int, max_chunk_bytes: int, es_timeout: int):
    """Apply Torch Tagger to index."""
    tagger_object = TorchTaggerObject.objects.get(pk=object_id)  # fetched outside the try so the except block can safely reference it
    try:
        tagger = tagger_object.load_tagger()

        progress = ShowProgress(tagger_object.task)

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size
        )

        actions = update_generator(generator=searcher, ec=ec, fields=fields, fact_name=fact_name, fact_value=fact_value, tagger_object=tagger_object, tagger=tagger)
        for success, info in streaming_bulk(client=ec.es, actions=actions, refresh="wait_for", chunk_size=bulk_size, max_chunk_bytes=max_chunk_bytes, max_retries=3):
            if not success:
                logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))

        tagger_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        tagger_object.task.add_error(error_message)
        tagger_object.task.update_status(Task.STATUS_FAILED)
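For context, update_generator presumably yields standard elasticsearch.helpers.streaming_bulk update actions, shaped roughly like the sketch below (index, ID, and fact contents are placeholders):

# Illustrative bulk-update action; the real generator builds these from scrolled hits.
action = {
    "_op_type": "update",
    "_index": "my_index",    # placeholder
    "_id": "doc_id",         # placeholder
    "doc": {"texta_facts": [{"fact": "TAG", "str_val": "positive", "doc_path": "text"}]}
}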
Example #4
 def check_positive_doc_count(self):
     # reindexing needs a moment to complete before documents become searchable; 5 seconds is a safe margin
     sleep(5)
     count_new_documents = ElasticSearcher(
         indices=TEST_INDEX_REINDEX).count()
     print_output("Bulk add doc count", count_new_documents)
     assert count_new_documents > 0
Example #5
    def test_that_changing_field_names_works(self):
        payload = {
            "description": "RenameFieldName",
            "new_index": self.new_index_name,
            "fields": [TEST_FIELD],
            "field_type": [{
                "path": TEST_FIELD,
                "new_path_name": TEST_FIELD_RENAMED,
                "field_type": "text"
            }],
            "indices": [self.test_index_name],
            "add_facts_mapping": True
        }

        # Reindex the test index into a new one.
        url = reverse("v2:reindexer-list",
                      kwargs={"project_pk": self.project.pk})
        reindex_response = self.client.post(url, data=payload, format='json')
        print_output('test_that_changing_field_names_works:response.data',
                     reindex_response.data)

        # Check that the fields have been changed.
        es = ElasticSearcher(indices=[self.new_index_name])
        for document in es:
            self.assertTrue(TEST_FIELD not in document)
            self.assertTrue(TEST_FIELD_RENAMED in document)

        # Manual clean up.
        es.core.delete_index(self.new_index_name)
Example #6
def fact_delete_query_task(self, worker_id: int):
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step(
            'Scrolling through the indices to delete the facts.')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        target_facts = json.loads(worker_object.facts)
        scroll_size = worker_object.scroll_size

        searcher = ElasticSearcher(
            query=json.loads(worker_object.query),
            indices=indices,
            field_data=[TEXTA_TAGS_KEY],
            output=ElasticSearcher.OUT_RAW,
            callback_progress=show_progress,
            scroll_size=scroll_size,
            scroll_timeout=f"{worker_object.es_timeout}m")

        ed = ElasticDocument(index=None)  # the update actions carry their own index, so none is needed here
        actions = query_delete_actions_generator(searcher, target_facts)
        ed.bulk_update(actions)

        worker_object.task.complete()
        worker_object.save()

        return worker_id

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
Example #7
def start_fact_delete_query_task(self, worker_id: int):
    """
    Scrolls the document ID-s and passes them to MLP worker.
    """
    worker_object = DeleteFactsByQueryTask.objects.get(pk=worker_id)

    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Celery: Starting task for deleting facts by query, worker ID: {worker_object.pk}"
        )

        # init progress
        show_progress = ShowProgress(worker_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=worker_object.get_indices(),
                                   output=ElasticSearcher.OUT_DOC,
                                   callback_progress=show_progress,
                                   scroll_size=worker_object.scroll_size,
                                   field_data=["texta_facts"])

        count = searcher.count()

        show_progress.update_step(f'Deleting facts from {count} documents')
        show_progress.update_view(0)
        worker_object.task.set_total(count)
        return True

    except Exception as e:
        worker_object.task.handle_failed_task(e)
        raise e
Example #8
 def _initialize_es(self, project_pk, text_processor, callback_progress,
                    prediction_to_match):
     # create es doc
     es_doc = ElasticDocument(self.feedback_index)
     # if no model objects, return nones for query and search
     if not self.model_object:
         return es_doc, None, None
     # create matching query
     query = Query()
     query.add_string_filter(query_string=self.model_object.MODEL_TYPE,
                             fields=["model_type"])
     # model_object is guaranteed here by the early return above
     query.add_string_filter(query_string=str(self.model_object.pk),
                             fields=["model_id"])
     if prediction_to_match:
         query.add_string_filter(query_string=prediction_to_match,
                                 fields=["correct_result"])
     # if no index, don't create searcher object
     if not self.check_index_exists():
         return es_doc, None, query.query
     # create es search
     es_search = ElasticSearcher(indices=self.feedback_index,
                                 query=query.query,
                                 text_processor=text_processor,
                                 output=ElasticSearcher.OUT_DOC_WITH_ID,
                                 callback_progress=callback_progress)
     # return objects
     return es_doc, es_search, query.query
Example #9
    def test_that_split_index_with_nested_field_still_has_nested_field(self):
        payload = {
            "description": "Random index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "random",
            "test_size": 20
        }

        response = self.client.post(self.url, data=payload, format="json")
        print_output(
            'test_that_split_index_with_nested_field_still_has_nested_field:response.data',
            response.data)
        at_least_once = False
        es = ElasticSearcher(
            indices=[INDEX_SPLITTING_TRAIN_INDEX, INDEX_SPLITTING_TEST_INDEX],
            field_data=[TEST_INDEX_OBJECT_FIELD],
            flatten=False)
        for item in es:
            data = item.get(TEST_INDEX_OBJECT_FIELD, None)
            if data:
                self.assertTrue(isinstance(data, dict))
                at_least_once = True
        self.assertTrue(at_least_once)
Example #10
    def test_index_processing(self):
        query_string = "inimene"
        payload = {
            "description": "TestingIndexProcessing",
            "fields": [TEST_FIELD],
            "query": json.dumps(
                {"query": {"match": {"comment_content_lemmas": query_string}}},
                ensure_ascii=False)
        }

        response = self.client.post(self.url, data=payload, format="json")
        print_output("test_index_processing:response.data", response.data)

        # Check if MLP was applied to the documents properly.
        s = ElasticSearcher(indices=[self.test_index_name],
                            output=ElasticSearcher.OUT_DOC,
                            query=json.loads(payload["query"]))
        for hit in s:
            self._assert_mlp_contents(hit, TEST_FIELD)
Example #11
 def _get_split_documents_by_id(self, id_field, id_value, text_field):
     documents = []
     query = Search().query(Q("term", **{f"{id_field}.keyword": id_value})).to_dict()
     es = ElasticSearcher(query=query, field_data=[id_field, text_field], output=ElasticSearcher.OUT_RAW)
     for hit in es:
         for document in hit:
             documents.append(document)
     return documents
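With ElasticSearcher.OUT_RAW the iterator yields batches of raw Elasticsearch hits rather than single documents, which is why the loop above is nested; each inner document is shaped roughly like this sketch (names are placeholders):

# Rough shape of one raw hit in a scrolled batch.
document = {
    "_index": "texts",                         # placeholder
    "_id": "abc123",                           # placeholder
    "_source": {"uuid": "123", "text": "..."}  # only the requested field_data
}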
Example #12
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch."""
        # get tagger object
        tagger_object: RegexTaggerGroup = self.get_object()

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        project_object = Project.objects.get(pk=project_pk)
        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = project_object.get_available_or_all_project_indices(indices)

        # retrieve tagger fields
        fields = serializer.validated_data["fields"]
        if not ElasticCore().check_if_indices_exist(tagger_object.project.get_indices()):
            return Response(
                {"error": f"One or more indices from {list(tagger_object.project.get_indices())} do not exist"},
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]
        flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

        # apply tagger
        results = {
            "tagger_group_id": tagger_object.pk,
            "tagger_group_tag": tagger_object.description,
            "result": False,
            "matches": [],
            "document": flattened_doc
        }

        final_matches = []
        for field in fields:
            text = flattened_doc.get(field, None)
            results["document"][field] = text
            if text is None:
                continue  # skip fields the random document does not contain
            matches = tagger_object.match_texts([text],
                                                as_texta_facts=True,
                                                field=field)

            if matches:
                final_matches.extend(matches)
                results["result"] = True

        results["matches"] = final_matches

        return Response(results, status=status.HTTP_200_OK)
Example #13
    def test_create_splitter_object_and_task_signal(self):
        payload = {
            "description": "Random index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "random",
            "test_size": 20
        }

        response = self.client.post(self.url,
                                    json.dumps(payload),
                                    content_type='application/json')

        print_output(
            'test_create_splitter_object_and_task_signal:response.data',
            response.data)

        splitter_obj = IndexSplitter.objects.get(id=response.data['id'])
        print_output("indices:", splitter_obj.get_indices())
        # Check if IndexSplitter object gets created
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        # Check if Task gets created
        self.assertTrue(splitter_obj.task is not None)
        print_output("status of IndexSplitter's Task object",
                     splitter_obj.task.status)
        # Check if Task gets completed
        self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)

        sleep(5)

        original_count = ElasticSearcher(indices=self.test_index_name).count()
        test_count = ElasticSearcher(
            indices=INDEX_SPLITTING_TEST_INDEX).count()
        train_count = ElasticSearcher(
            indices=INDEX_SPLITTING_TRAIN_INDEX).count()

        print_output('original_count, test_count, train_count',
                     [original_count, test_count, train_count])
Example #14
def __add_meta_to_original_index(indices: List[str], index_fields: List[str], show_progress: ShowProgress, query: dict, scroll_size: int, elastic_wrapper: ElasticCore):
    index_elastic_search = ElasticSearcher(
        indices=indices,
        field_data=index_fields,
        callback_progress=show_progress,
        query=query,
        output=ElasticSearcher.OUT_RAW,
        scroll_size=scroll_size
    )
    index_actions = add_doc_uuid(generator=index_elastic_search)
    for success, info in streaming_bulk(client=elastic_wrapper.es, actions=index_actions, refresh="wait_for", chunk_size=scroll_size, max_retries=3):
        if not success:
            logging.getLogger(ERROR_LOGGER).exception(json.dumps(info))
Example #15
def apply_search_fields_tagger_on_index(object_id: int):
    """Apply Search Fields Tagger to index."""
    search_fields_tagger = SearchFieldsTagger.objects.get(pk=object_id)
    task_object = search_fields_tagger.task
    try:
        progress = ShowProgress(task_object)
        progress.update_step('scrolling search fields')

        # Get the necessary fields.
        indices: List[str] = search_fields_tagger.get_indices()
        fields: List[str] = json.loads(search_fields_tagger.fields)
        fact_name: str = search_fields_tagger.fact_name
        scroll_timeout = search_fields_tagger.es_timeout
        scroll_size = search_fields_tagger.bulk_size

        use_breakup = search_fields_tagger.use_breakup
        breakup_character = search_fields_tagger.breakup_character

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=json.loads(search_fields_tagger.query),
            output=ElasticSearcher.OUT_RAW,
            scroll_timeout=f"{scroll_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size)

        actions = update_search_fields_generator(
            generator=searcher,
            ec=ec,
            fields=fields,
            fact_name=fact_name,
            search_field_tagger_object=search_fields_tagger,
            use_breakup=use_breakup,
            breakup_character=breakup_character)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)
        return object_id

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        task_object.add_error(str(e))
        task_object.update_status(Task.STATUS_FAILED)
        raise e
Example #16
    def post(self, request, project_pk: int):
        """Executes **raw** Elasticsearch query on all project indices."""
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        serializer = ProjectSearchByQuerySerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = project.get_available_or_all_project_indices(serializer.validated_data["indices"])

        if not indices:
            raise ProjectValidationFailed(detail="No indices supplied and project has no indices")

        output = serializer.validated_data["output_type"] or ElasticSearcher.OUT_DOC_WITH_TOTAL_HL_AGGS
        es = ElasticSearcher(indices=indices, output=output)

        es.update_query(serializer.validated_data["query"])
        results = es.search()
        return Response(results, status=status.HTTP_200_OK)
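A hypothetical request body for this endpoint (the index name and query are placeholders; when output_type is empty the view falls back to OUT_DOC_WITH_TOTAL_HL_AGGS):

# Illustrative payload; any raw Elasticsearch query body is accepted.
payload = {
    "indices": ["my_index"],                # placeholder
    "query": {"query": {"match_all": {}}},
    "output_type": None
}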
Example #17
def apply_rakun_extractor_to_index(self, object_id: int, indices: List[str],
                                   fields: List[str], query: dict,
                                   es_timeout: int, bulk_size: int,
                                   fact_name: str, add_spans: bool):
    """Apply Rakun Keyword Extractor to index."""
    logging.getLogger(INFO_LOGGER).info(
        f"Starting task 'apply_rakun_extractor_to_index' with ID: {object_id}!"
    )
    rakun_extractor_object = RakunExtractor.objects.get(id=object_id)
    try:
        progress = ShowProgress(rakun_extractor_object.task)

        # retrieve fields
        field_data = fields

        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)

        searcher = ElasticSearcher(
            indices=indices,
            field_data=field_data + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            timeout=f"{es_timeout}m",
            output=ElasticSearcher.OUT_RAW,
            callback_progress=progress,
            scroll_size=bulk_size)
        keyword_detector = rakun_extractor_object.load_rakun_keyword_detector()
        actions = update_generator(
            keyword_detector=keyword_detector,
            generator=searcher,
            ec=ec,
            fields=field_data,
            rakun_extractor_object=rakun_extractor_object,
            fact_name=fact_name,
            fact_value="",
            add_spans=add_spans)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        rakun_extractor_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        rakun_extractor_object.task.add_error(error_message)
        rakun_extractor_object.task.update_status(Task.STATUS_FAILED)
Example #18
def start_mlp_worker(self, mlp_id: int):
    """
    Scrolls the document ID-s and passes them to MLP worker.
    """
    mlp_object = MLPWorker.objects.get(pk=mlp_id)

    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Applying mlp on the index for MLP Task ID: {mlp_id}")
        # init progress
        show_progress = ShowProgress(mlp_object.task, multiplier=1)
        show_progress.update_step('Scrolling document IDs')
        show_progress.update_view(0)
        # Get the necessary fields.
        indices: List[str] = mlp_object.get_indices()
        es_scroll_size = mlp_object.es_scroll_size
        es_timeout = mlp_object.es_timeout

        # create searcher object for scrolling ids
        searcher = ElasticSearcher(query=json.loads(mlp_object.query),
                                   indices=indices,
                                   output=ElasticSearcher.OUT_META,
                                   callback_progress=show_progress,
                                   scroll_size=es_scroll_size,
                                   scroll_timeout=f"{es_timeout}m")
        # add texta facts mappings to the indices if needed
        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        doc_chunks = list(chunks_iter(searcher, MLP_BATCH_SIZE))

        # update progress
        show_progress.update_step(
            f'Applying MLP to {len(doc_chunks)} document chunks')
        show_progress.update_view(0)

        mlp_object.task.set_total(searcher.count())
        mlp_object.task.update_status(Task.STATUS_RUNNING)

        # pass document id-s to the next task
        chain = group(
            apply_mlp_on_es_docs.s([doc["_id"] for doc in meta_chunk], mlp_id)
            for meta_chunk in doc_chunks) | end_mlp_task.si(mlp_id)
        chain.delay()
        return True

    except Exception as e:
        mlp_object.task.handle_failed_task(e)
        raise
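The group(...) | end_mlp_task.si(mlp_id) construction is Celery's chord pattern: the grouped apply_mlp_on_es_docs tasks run in parallel over the ID chunks, and end_mlp_task fires once all of them have finished (.si() creates an immutable signature, so the group's results are not passed into it).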
Example #19
def apply_analyzers_on_indices(self, worker_id: int):
    worker_object = ApplyESAnalyzerWorker.objects.get(pk=worker_id)
    task_object = worker_object.task
    try:
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply the analyzers')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        fields = json.loads(worker_object.fields)
        detect_lang = worker_object.detect_lang
        snowball_language = worker_object.stemmer_lang
        scroll_timeout = f"{worker_object.es_timeout}m"
        scroll_size = worker_object.bulk_size
        analyzers = json.loads(worker_object.analyzers)
        tokenizer = worker_object.tokenizer
        strip_html = worker_object.strip_html

        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=fields,
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout=scroll_timeout)

        task_object.set_total(searcher.count())

        actions = process_analyzer_actions(generator=searcher,
                                           worker=worker_object,
                                           detect_lang=detect_lang,
                                           snowball_language=snowball_language,
                                           fields_to_parse=fields,
                                           analyzers=analyzers,
                                           tokenizer=tokenizer,
                                           strip_html=strip_html)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        ed.bulk_update(actions=actions, chunk_size=scroll_size)

        worker_object.task.complete()

        return worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example #20
    def test_create_random_split(self):
        payload = {
            "description": "Random index splitting",
            "indices": [{
                "name": self.test_index_name
            }],
            "train_index": INDEX_SPLITTING_TRAIN_INDEX,
            "test_index": INDEX_SPLITTING_TEST_INDEX,
            "distribution": "random",
            "test_size": 20
        }

        response = self.client.post(self.url, data=payload)
        print_output('test_create_random_split:response.data', response.data)

        splitter_obj = IndexSplitter.objects.get(id=response.data['id'])

        # Assert the Task gets completed.
        self.assertEqual(splitter_obj.task.status, Task.STATUS_COMPLETED)
        print_output("Task status", splitter_obj.task.status)

        sleep(5)

        original_count = ElasticSearcher(indices=self.test_index_name).count()
        test_count = ElasticSearcher(
            indices=INDEX_SPLITTING_TEST_INDEX).count()
        train_count = ElasticSearcher(
            indices=INDEX_SPLITTING_TRAIN_INDEX).count()

        print_output('original_count, test_count, train_count',
                     [original_count, test_count, train_count])
        # To avoid any inconsistencies caused by rounding assume sizes are between small limits
        self.assertTrue(self.is_between_limits(test_count, original_count, 0.2))
        self.assertTrue(self.is_between_limits(train_count, original_count, 0.8))
Example #21
def get_tag_candidates(tagger_group_id: int,
                       text: str,
                       ignore_tags: Optional[List[dict]] = None,
                       n_similar_docs: int = 10,
                       max_candidates: int = 10):
    """
    Finds frequent tags from documents similar to input document.
    Returns empty list if hybrid option false.
    """
    hybrid_tagger_object = TaggerGroup.objects.get(pk=tagger_group_id)
    field_paths = json.loads(hybrid_tagger_object.taggers.first().fields)
    indices = hybrid_tagger_object.get_indices()
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Selecting from following indices: {indices}.")
    ignore_tags = {tag["tag"] for tag in (ignore_tags or [])}  # set of tag names to skip
    # create query
    query = Query()
    query.add_mlt(field_paths, text)
    # create Searcher object for MLT
    es_s = ElasticSearcher(indices=indices, query=query.query)
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Trying to retrieve {n_similar_docs} documents from Elastic..."
    )
    docs = es_s.search(size=n_similar_docs)
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Successfully retrieved {len(docs)} documents from Elastic."
    )
    # dict for tag candidates from elastic
    tag_candidates = {}
    # retrieve tags from elastic response
    for doc in docs:
        if "texta_facts" in doc:
            for fact in doc["texta_facts"]:
                if fact["fact"] == hybrid_tagger_object.fact_name:
                    fact_val = fact["str_val"]
                    if fact_val not in ignore_tags:
                        if fact_val not in tag_candidates:
                            tag_candidates[fact_val] = 0
                        tag_candidates[fact_val] += 1
    # sort and limit candidates
    tag_candidates = [
        item[0] for item in sorted(
            tag_candidates.items(), key=lambda k: k[1], reverse=True)
    ][:max_candidates]
    logging.getLogger(INFO_LOGGER).info(
        f"[Get Tag Candidates] Retrieved {len(tag_candidates)} tag candidates."
    )
    return tag_candidates
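query.add_mlt(field_paths, text) presumably builds a standard Elasticsearch more_like_this clause along these lines (a sketch, not the exact Query implementation):

# Rough shape of the resulting more_like_this query.
mlt_query = {
    "query": {
        "more_like_this": {
            "fields": ["text_mlp.lemmas"],  # placeholder field paths
            "like": "input document text"
        }
    }
}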
Example #22
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch."""
        # get tagger object
        tagger_object = self.get_object()
        # check that a trained model exists
        if not tagger_object.model.path:
            raise NonExistantModelError()

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = tagger_object.get_available_or_all_indices(indices)

        # retrieve tagger fields
        tagger_fields = json.loads(tagger_object.fields)
        if not ElasticCore().check_if_indices_exist(indices):
            return Response(
                {"error": f"One or more indices from {list(indices)} do not exist"},
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]

        # filter out correct fields from the document
        random_doc_filtered = {k: v for k, v in random_doc.items() if k in tagger_fields}

        # apply tagger
        tagger_response = apply_tagger(tagger_object.id,
                                       random_doc_filtered,
                                       input_type='doc')
        response = {"document": random_doc, "prediction": tagger_response}
        return Response(response, status=status.HTTP_200_OK)
Example #23
 def test_applying_lang_detect_with_query(self):
     mlp_field = f"{TEST_FIELD}_mlp"
     query_string = "inimene"
     payload = {
         "description": "TestingIndexProcessing",
         "field": TEST_FIELD,
         "query": json.dumps({'query': {'match': {'comment_content_lemmas': query_string}}}, ensure_ascii=False)
     }
     response = self.client.post(self.url, data=payload, format="json")
     print_output("test_applying_lang_detect_with_query:response.data", response.data)
     self.assertTrue(response.status_code == status.HTTP_201_CREATED)
     s = ElasticSearcher(indices=[self.test_index_name], output=ElasticSearcher.OUT_DOC, query=json.loads(payload["query"]))
     for hit in s:
         if TEST_FIELD in hit:
             self.assertTrue(f"{mlp_field}.language.detected" in hit)
             lang_value = hit[f"{mlp_field}.language.detected"]
             self.assertTrue(lang_value == "et")
Example #24
def apply_crf_extractor_to_index(object_id: int, indices: List[str],
                                 mlp_fields: List[str], label_suffix: str,
                                 query: dict, bulk_size: int,
                                 max_chunk_bytes: int, es_timeout: int):
    """
    Applies Extractor to ES index.
    """
    # load model
    crf_object = CRFExtractorObject.objects.get(pk=object_id)  # fetched outside the try so the except block can safely reference it
    try:
        extractor = crf_object.load_extractor()
        # progress
        progress = ShowProgress(crf_object.task)
        # add fact field if missing
        ec = ElasticCore()
        for index in indices:
            ec.add_texta_facts_mapping(index)
        # search
        searcher = ElasticSearcher(
            indices=indices,
            field_data=mlp_fields + ["texta_facts"],  # Get facts to add upon existing ones.
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=bulk_size)
        # create update actions
        actions = update_generator(generator=searcher,
                                   ec=ec,
                                   mlp_fields=mlp_fields,
                                   label_suffix=label_suffix,
                                   object_id=object_id,
                                   extractor=extractor)
        # perform updates
        try:
            # as we have defined indices in actions there is no need to do it again (None)
            ElasticDocument(None).bulk_update(actions)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
        # all done
        crf_object.task.complete()
        return True

    except Exception as e:
        crf_object.task.handle_failed_task(e)
        raise e
Example #25
    def extract_from_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch."""
        # get rakun object
        rakun_object: RakunExtractor = RakunExtractor.objects.get(pk=pk)

        serializer = RakunExtractorRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        project_object = Project.objects.get(pk=project_pk)
        indices = [index["name"] for index in serializer.validated_data["indices"]]
        indices = project_object.get_available_or_all_project_indices(indices)

        # retrieve rakun fields
        fields = serializer.validated_data["fields"]

        # retrieve param add_spans
        add_spans = serializer.validated_data["add_spans"]

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
        flattened_doc = ElasticCore(check_connection=False).flatten(random_doc)

        # apply rakun
        results = {
            "rakun_id": rakun_object.pk,
            "description": rakun_object.description,
            "result": False,
            "keywords": [],
            "document": flattened_doc
        }
        final_keywords = []
        keyword_detector = rakun_object.load_rakun_keyword_detector()
        for field in fields:
            text = flattened_doc.get(field, "")
            results["document"][field] = text
            keywords = rakun_object.get_rakun_keywords(
                keyword_detector=keyword_detector,
                texts=[text],
                field_path=field,
                fact_name=rakun_object.description,
                fact_value="",
                add_spans=add_spans)

            if keywords:
                final_keywords.extend(keywords)
                results["result"] = True

        results["keywords"] = final_keywords
        return Response(results, status=status.HTTP_200_OK)
Example #26
 def test_processing_with_just_tokenizer(self):
     payload = {
         "description": "hello there, kenobi.",
         "fields": [TEST_FIELD],
         "analyzers": ["tokenizer"],
         "indices": [{
             "name": self.test_index_name
         }],
         "query": json.dumps(TEST_QUERY, ensure_ascii=False)
     }
     response = self.client.post(self.list_url, data=payload, format="json")
     print_output("test_processing_with_just_tokenizer:response.data",
                  response.data)
     self.assertTrue(response.status_code == status.HTTP_201_CREATED)
     s = ElasticSearcher(indices=[self.test_index_name], query=TEST_QUERY)
     for hit in s:
         new_field = f'{TEST_FIELD}_es.tokenized_text'
         self.assertTrue(new_field in hit)
         self.assertTrue(hit[new_field] != hit[TEST_FIELD])
Example #27
 def _get_negatives(self, size):
     self.show_progress.update_step("scrolling negative sample")
     self.show_progress.update_view(0)
     # iterator for retrieving negative examples
     negative_sample_iterator = ElasticSearcher(
         indices=self.indices,
         field_data=self.field_data,
         output=ElasticSearcher.OUT_DOC,
         callback_progress=self.show_progress,
         text_processor=self.text_processor,
         scroll_limit=int(size *
                          float(self.tagger_object.negative_multiplier)),
         ignore_ids=self.ignore_ids,
     )
     # iterator to list
     negative_sample = list(negative_sample_iterator)
     # join each document dict's field values into a single string if requested
     if self.join_fields:
         negative_sample = self._join_fields(negative_sample)
     return negative_sample
Example #28
 def test_normal_process_application(self):
     payload = {
         "description": "hello there, kenobi.",
         "analyzers": ["stemmer"],
         "fields": [TEST_FIELD],
         "stemmer_lang": "estonian",
         "indices": [{
             "name": self.test_index_name
         }]
     }
     response = self.client.post(self.list_url, data=payload, format="json")
     print_output("test_normal_process_application:response.data",
                  response.data)
     self.assertTrue(response.status_code == status.HTTP_201_CREATED)
     s = ElasticSearcher(indices=[self.test_index_name])
     for hit in s:
         new_field = f'{TEST_FIELD}_es.stems'
         self.assertTrue(new_field in hit)
         self.assertTrue(hit[new_field] != hit[TEST_FIELD])
         break
Example #29
def apply_lang_on_indices(self, apply_worker_id: int):
    worker_object = ApplyLangWorker.objects.get(pk=apply_worker_id)
    task_object = worker_object.task
    try:
        load_mlp()
        show_progress = ShowProgress(task_object, multiplier=1)
        show_progress.update_step(
            'scrolling through the indices to apply lang')

        # Get the necessary fields.
        indices: List[str] = worker_object.get_indices()
        field = worker_object.field

        scroll_size = 100
        searcher = ElasticSearcher(query=json.loads(worker_object.query),
                                   indices=indices,
                                   field_data=[field],
                                   output=ElasticSearcher.OUT_RAW,
                                   callback_progress=show_progress,
                                   scroll_size=scroll_size,
                                   scroll_timeout="15m")

        for index in indices:
            searcher.core.add_texta_facts_mapping(index=index)

        actions = process_lang_actions(generator=searcher,
                                       field=field,
                                       worker_id=apply_worker_id,
                                       mlp_class=mlp)

        # Send the data towards Elasticsearch
        ed = ElasticDocument("_all")
        elastic_response = ed.bulk_update(actions=actions)

        worker_object.task.complete()

        return apply_worker_id

    except Exception as e:
        task_object.handle_failed_task(e)
        raise e
Example #30
    def post(self, request, project_pk: int):
        """Simplified search interface for making Elasticsearch queries."""
        serializer = ProjectSimplifiedSearchSerializer(data=request.data)
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        project_object = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project_object)
        project_indices = list(project_object.get_indices())
        project_fields = project_object.get_elastic_fields(path_list=True)
        # test if indices exist
        if not project_indices:
            raise ProjectValidationFailed(detail="Project has no indices")
        # test if indices are valid
        if serializer.validated_data['match_indices']:
            if not set(serializer.validated_data['match_indices']).issubset(set(project_indices)):
                raise ProjectValidationFailed(detail=f"Index names are not valid for this project. Allowed values are: {project_indices}")
        # test if fields are valid
        if serializer.validated_data['match_fields']:
            if not set(serializer.validated_data['match_fields']).issubset(set(project_fields)):
                raise ProjectValidationFailed(detail=f"Field names are not valid for this project. Allowed values are: {project_fields}")

        es = ElasticSearcher(indices=project_indices, output=ElasticSearcher.OUT_DOC)
        q = Query(operator=serializer.validated_data['operator'])
        # if input is string, convert to list
        # if unknown format, return error
        match_text = serializer.validated_data['match_text']
        if isinstance(match_text, list):
            match_texts = [str(item) for item in match_text if item]
        elif isinstance(match_text, str):
            match_texts = [match_text]
        else:
            return Response({'error': f'match text is in unknown format: {match_text}'}, status=status.HTTP_400_BAD_REQUEST)
        # add query filters
        for item in match_texts:
            q.add_string_filter(item, match_type=serializer.validated_data["match_type"])
        # update query
        es.update_query(q.query)
        # retrieve results
        results = es.search(size=serializer.validated_data["size"])
        return Response(results, status=status.HTTP_200_OK)
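A hypothetical request body for this simplified search (field and index names are placeholders; as handled above, match_text may be a single string or a list of strings):

# Illustrative payload; allowed match_type/operator values depend on the serializer.
payload = {
    "match_text": ["inimene", "loom"],
    "match_type": "phrase",
    "match_fields": ["comment_content"],  # placeholder
    "match_indices": [],                  # empty list skips the index validation above
    "operator": "should",
    "size": 10
}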