def get_significant_words(indices: List[str], fields: List[str], document_ids: List[str], stop_words: List = None, exclude=""):
    """
    Run a significant-words aggregation over each given field, using the
    given documents as the baseline set.

    Args:
        indices: Indices from which to perform the aggregation.
        fields: Fields holding the text content to aggregate over.
        document_ids: IDs of the documents used as the baseline for the aggregation.
        stop_words: Optional custom stop words; passed through StopWords._get_stop_words
            before being handed to the aggregator.
        exclude: Regex-compatible string of words to exclude; forwarded to the
            aggregator's ``exclude`` argument.

    Returns:
        The per-field aggregation results concatenated into one list, or an
        empty list when none of the requested documents could be fetched.
    """
    document_api = ElasticDocument("*")
    aggregator = ElasticAggregator(indices=indices)
    stop_words = StopWords._get_stop_words(custom_stop_words=stop_words)

    # Only documents that can actually be fetched take part in the aggregation.
    existing_docs: List[dict] = document_api.get_bulk(document_ids)
    if not existing_docs:
        return []

    unique_ids = list({doc["_id"] for doc in existing_docs})
    collected = []
    for field in fields:
        collected += aggregator.get_significant_words(
            document_ids=unique_ids,
            field=field,
            stop_words=stop_words,
            exclude=exclude,
        )
    return collected
def post(self, request, project_pk: int):
    """
    Aggregate fact keys and values from Elasticsearch for a project.

    Validates the request with ProjectFactAggregatorSerializer, resolves the
    requested indices against the project, and returns the result of
    ElasticAggregator.facts_abstract. Responds with an empty list when the
    project yields no usable indices.

    Raises:
        SerializerNotValid: when the request payload fails validation.
    """
    serializer = ProjectFactAggregatorSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    params = serializer.validated_data
    requested_indices = [index["name"] for index in params["indices"]]

    # Retrieve the project and make sure the caller may access it.
    project = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project)

    # Per the original note: yields all project indices when none (the default) is requested.
    project_indices = project.get_available_or_all_project_indices(requested_indices)
    if not project_indices:
        return Response([])

    # The query may arrive as a JSON string; decode it in that case.
    query = params["query"]
    if isinstance(query, str):
        query = json.loads(query)

    aggregator = ElasticAggregator(indices=project_indices, query=query)
    results = aggregator.facts_abstract(
        key_field=params["key_field"],
        value_field=params["value_field"],
        filter_by_key=params["filter_by_key"],
        size=params["max_count"],
    )
    return Response(results, status=status.HTTP_200_OK)
def validate_pos_label(data):
    """
    For Tagger, TorchTagger and BertTagger.

    Checks that a pos label is given, and is one of the fact values, whenever
    the selected fact has exactly two values under the selected query.

    Returns:
        The unchanged ``data`` mapping.

    Raises:
        ValidationError: when the fact is binary and ``pos_label`` is missing
            or is not one of the fact's values.
    """
    fact_name = data.get("fact_name")
    # Without a selected fact the pos label is irrelevant.
    if not fact_name:
        return data

    index_names = [index.get("name") for index in data.get("indices")]
    pos_label = data.get("pos_label")
    raw_query = data.get("query")

    try:
        # Query passed as a JSON string.
        query = json.loads(raw_query)
    except Exception:
        # Query passed in some other form (e.g. already a dict): use it as given.
        query = raw_query

    aggregator = ElasticAggregator(indices=index_names, query=query)
    fact_values = aggregator.facts(size=10, filter_by_fact_name=fact_name, include_values=True)

    # Only a fact with exactly two values makes the pos label mandatory.
    if len(fact_values) != 2:
        return data

    if not pos_label:
        raise ValidationError(f"The fact values corresponding to the selected query and fact '{fact_name}' are binary. You must specify param 'pos_label' for evaluation purposes. Allowed values for 'pos_label' are: {fact_values}")
    if pos_label not in fact_values:
        raise ValidationError(f"The specified pos label '{pos_label}' is NOT one of the fact values for fact '{fact_name}'. Please select an existing fact value. Allowed fact values are: {fact_values}")
    return data
def validate_fact(indices: List[str], query: dict, fact: str):
    """
    Check that the given fact exists in the selected indices.

    Returns:
        True when the fact-value distribution for ``fact`` is non-empty.

    Raises:
        ValidationError: when the distribution for ``fact`` is empty.
    """
    # The aggregator receives a copy, so the caller's query object is left alone.
    aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))
    distribution = aggregator.get_fact_values_distribution(
        fact,
        fact_name_size=choices.DEFAULT_MAX_FACT_AGGREGATION_SIZE,
    )
    if distribution:
        return True
    raise ValidationError(
        f"Fact '{fact}' not present in any of the selected indices ({indices})."
    )
def _get_tags(self, fact_name, min_count=50, max_count=None, query={}): """Finds possible tags for training by aggregating active project's indices.""" active_indices = self.tagger_object.get_indices() es_a = ElasticAggregator(indices=active_indices, query=query) # limit size to 10000 unique tags tag_values = es_a.facts(filter_by_fact_name=fact_name, min_count=min_count, max_count=max_count, size=10000) return tag_values
def post(self, request, project_pk: int):
    """
    Return existing fact names (and optionally values) from Elasticsearch.

    Validates the request with ProjectGetFactsSerializer, resolves the
    requested indices against the project and shapes the output of
    ElasticAggregator.facts according to the request flags. Responds with an
    empty list when the project yields no usable indices.

    Raises:
        SerializerNotValid: when the request payload fails validation.
    """
    serializer = ProjectGetFactsSerializer(data=request.data)
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)

    params = serializer.validated_data
    requested_indices = [index["name"] for index in params["indices"]]

    # Retrieve the project and make sure the caller may access it.
    project = get_object_or_404(Project, pk=project_pk)
    self.check_object_permissions(request, project)

    # Per the original note: yields all project indices when none (the default) is requested.
    project_indices = project.get_available_or_all_project_indices(requested_indices)
    if not project_indices:
        return Response([])

    vals_per_name = params['values_per_name']
    include_values = params['include_values']
    fact_name = params['fact_name']
    include_doc_path = params['include_doc_path']
    exclude_zero_spans = params['exclude_zero_spans']
    mlp_doc_path = params['mlp_doc_path']

    # With both set, the remaining options have no effect on the aggregation -
    # this behaviour might need to change at some point.
    restrict_to_mlp_path = bool(mlp_doc_path and exclude_zero_spans)

    aggregator = ElasticAggregator(indices=project_indices)
    if restrict_to_mlp_path:
        fact_map = aggregator.facts(
            size=1,
            include_values=True,
            include_doc_path=True,
            exclude_zero_spans=exclude_zero_spans,
        )
    else:
        fact_map = aggregator.facts(
            size=vals_per_name,
            include_values=include_values,
            filter_by_fact_name=fact_name,
            include_doc_path=include_doc_path,
            exclude_zero_spans=exclude_zero_spans,
        )

    if fact_name:
        fact_map_list = list(fact_map)
    elif restrict_to_mlp_path:
        # Keep only fact names whose first value has mlp_doc_path as the parent
        # of its doc_path and that have spans.
        # NB! Facts sharing a name but having different doc paths are not told apart here.
        fact_map_list = [
            name
            for name, values in fact_map.items()
            if values and values[0]["doc_path"].rsplit(".", 1)[0] == mlp_doc_path
        ]
    elif include_values:
        fact_map_list = [{'name': name, 'values': values} for name, values in fact_map.items()]
    else:
        fact_map_list = list(fact_map)

    return Response(fact_map_list, status=status.HTTP_200_OK)
def test_create_custom_split(self):
    """
    Split the test index with a custom per-label distribution and check the
    resulting test and train fact distributions.
    """
    custom_distribution = {"FUBAR": 10, "bar": 15}
    payload = {
        "description": "Original index splitting",
        "indices": [{
            "name": self.test_index_name
        }],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "custom",
        "fact": self.FACT,
        "custom_distribution": json.dumps(custom_distribution)
    }

    response = self.client.post(self.url, data=payload, format="json")
    print_output('test_create_custom_split:response.data', response.data)

    # The lookup is kept for its effect: it fails if no splitter with the returned id exists.
    IndexSplitter.objects.get(id=response.data['id'])

    # NOTE(review): this compares a constant with itself and can never fail.
    # It should check the splitter's own task status - confirm which attribute
    # holds the task before changing it.
    self.assertEqual(Task.STATUS_COMPLETED, Task.STATUS_COMPLETED)
    print_output("Task status", Task.STATUS_COMPLETED)

    sleep(5)

    original_distribution = ElasticAggregator(
        indices=self.test_index_name).get_fact_values_distribution(self.FACT)
    test_distribution = ElasticAggregator(
        indices=INDEX_SPLITTING_TEST_INDEX).get_fact_values_distribution(self.FACT)
    train_distribution = ElasticAggregator(
        indices=INDEX_SPLITTING_TRAIN_INDEX).get_fact_values_distribution(self.FACT)

    print_output(
        'original_dist, test_dist, train_dist',
        [original_distribution, test_distribution, train_distribution])

    # Requested labels: the test index holds the requested amount, capped by what exists.
    for label, quant in custom_distribution.items():
        self.assertEqual(test_distribution[label],
                         min(quant, original_distribution[label]))

    # Labels not requested: absent from the test index, fully present in the train index.
    for label in original_distribution:
        if label not in custom_distribution:
            self.assertNotIn(label, test_distribution)
            # assertEqual, not assertTrue: assertTrue's second argument is only
            # the failure message, so the counts were never compared before.
            self.assertEqual(original_distribution[label],
                             train_distribution[label])
def get_tags(self, fact_name, active_project, min_count=1000, max_count=None, indices=None):
    """
    Find possible tags for training by aggregating over indices.

    Uses ``indices`` when given; otherwise falls back to the indices of
    ``active_project``. Returns the result of ElasticAggregator.facts for
    ``fact_name`` with the given count bounds.
    """
    if indices is None:
        active_indices = list(active_project.get_indices())
    else:
        active_indices = indices

    aggregator = ElasticAggregator(indices=active_indices)
    # size caps the aggregation at 10000 unique tags
    return aggregator.facts(
        filter_by_fact_name=fact_name,
        min_count=min_count,
        max_count=max_count,
        size=10000,
    )
def run_apply_crf_to_index_with_specified_label_suffix(self):
    """Tests applying extractor to index with specified label suffix using apply_to_index endpoint."""
    test_tagger_id = self.test_crf_ids[0]
    url = f'{self.url}{test_tagger_id}/apply_to_index/'
    label_suffix = "CRF_TEST"
    payload = {
        "description": "apply crf test task",
        "mlp_fields": ["text_mlp"],
        "indices": [{
            "name": self.test_index_copy
        }],
        "label_suffix": label_suffix
    }
    response = self.client.post(url, payload, format='json')
    print_output('test_apply_crf_to_index:response.data', response.data)
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)

    tagger_object = CRFExtractor.objects.get(pk=test_tagger_id)
    # Wait until the task has finished. The task is re-read from the database
    # on every pass; the related object fetched on first access would
    # otherwise never show a status change.
    while tagger_object.task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_crf_to_index_with_specified_label_suffix: waiting for applying tagger task to finish, current status:',
            tagger_object.task.status)
        sleep(2)
        tagger_object.task.refresh_from_db()

    results = ElasticAggregator(
        indices=[self.test_index_copy]).get_fact_values_distribution(
            f"GPE_{label_suffix}")
    print_output(
        "test_apply_crf_to_index_with_specified_label_suffix:elastic aggerator results:",
        results)

    # The suffixed fact must have more than one distinct value.
    self.assertGreater(len(results), 1)
def validate_fact_value(indices: List[str], query: dict, fact: str, fact_value: str):
    """
    Check that the given fact value exists under the given fact.

    An empty ``fact_value`` is accepted without querying Elasticsearch.

    Returns:
        True when the value is empty or is among the aggregated fact values.

    Raises:
        ValidationError: when a non-empty ``fact_value`` is not among the
            values aggregated for ``fact``.
    """
    if fact_value:
        # The aggregator receives a copy, so the caller's query object is left alone.
        aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))
        known_values = aggregator.facts(
            size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
            filter_by_fact_name=fact,
            include_values=True,
        )
        if fact_value not in known_values:
            raise ValidationError(
                f"Fact value '{fact_value}' not in the list of fact values for fact '{fact}'."
            )
    return True
def _first_span_is_missing_or_zero(raw_spans: str) -> bool:
    """Return True when the JSON-encoded span list is empty or its first span is empty or (0, 0)."""
    spans = json.loads(raw_spans)
    # An empty span list is treated like a missing span instead of raising IndexError.
    if not spans or not spans[0]:
        return True
    return spans[0][0] == 0 and spans[0][1] == 0


def validate_evaluation_type(indices: List[str], query: dict, evaluation_type: str, true_fact: str, pred_fact: str, true_value: str, pred_value: str):
    """
    Check that the chosen facts (and values) fit the chosen evaluation type.

    - ``binary``: both ``true_value`` and ``pred_value`` must be given.
    - ``entity``: neither value may be given. When the span aggregation for a
      fact returns exactly one result, that result must contain a non-zero span.
    - any other type: nothing is checked.

    Returns:
        True when no rule is violated.

    Raises:
        ValidationError: when a rule above is violated.
    """
    if evaluation_type == "binary":
        if not true_value or not pred_value:
            raise ValidationError(
                "Please specify true and predicted values for evaluation type 'binary'."
            )

    # No value restriction is enforced for evaluation type 'multilabel'.

    elif evaluation_type == "entity":
        if true_value or pred_value:
            raise ValidationError(
                "Please leave true and predicted values unspeficied for evaluation type 'entity'."
            )

        # The aggregator receives a copy, so the caller's query object is left alone.
        ag = ElasticAggregator(indices=indices, query=deepcopy(query))
        true_fact_results = ag.facts_abstract(key_field="fact", value_field="spans", filter_by_key=true_fact, size=5)
        pred_fact_results = ag.facts_abstract(key_field="fact", value_field="spans", filter_by_key=pred_fact, size=5)

        # A single aggregated span value that is empty or zero means the fact has no usable spans.
        if len(true_fact_results) == 1 and _first_span_is_missing_or_zero(true_fact_results[0]):
            raise ValidationError(
                f"Did not find non-zero spans for selected true fact '{true_fact}'. Please make sure to use facts with existing spans for evaluation_type 'entity'."
            )

        if len(pred_fact_results) == 1 and _first_span_is_missing_or_zero(pred_fact_results[0]):
            raise ValidationError(
                f"Did not find non-zero spans for selected predicted fact '{pred_fact}'. Please make sure to use facts with existing spans for evaluation_type 'entity'."
            )

    return True
def test_create_equal_split(self):
    """
    Split the test index with the "equal" distribution and check that each
    label contributes at most 20 documents to the test index, with the
    remainder going to the train index.
    """
    request_body = {
        "description": "Original index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "equal",
        "test_size": 20,
        "fact": self.FACT
    }

    response = self.client.post(self.url, data=request_body)
    print_output('test_create_equal_split:response.data', response.data)

    # The lookup is kept for its effect: it fails if no splitter with the returned id exists.
    IndexSplitter.objects.get(id=response.data['id'])

    # NOTE(review): compares a constant with itself, so it cannot fail.
    self.assertEqual(Task.STATUS_COMPLETED, Task.STATUS_COMPLETED)
    print_output("Task status", Task.STATUS_COMPLETED)

    sleep(5)

    def distribution_of(index):
        return ElasticAggregator(indices=index).get_fact_values_distribution(self.FACT)

    original_distribution = distribution_of(self.test_index_name)
    test_distribution = distribution_of(INDEX_SPLITTING_TEST_INDEX)
    train_distribution = distribution_of(INDEX_SPLITTING_TRAIN_INDEX)

    print_output(
        'original_dist, test_dist, train_dist',
        [original_distribution, test_distribution, train_distribution])

    for label, quant in original_distribution.items():
        if quant <= 20:
            # Small classes end up entirely in the test index.
            self.assertEqual(test_distribution[label], quant)
            self.assertTrue(label not in train_distribution)
        else:
            self.assertEqual(test_distribution[label], 20)
            self.assertEqual(train_distribution[label], quant - 20)
def test_create_original_split_fact_value_given(self):
    """
    Split the test index with the "original" distribution for the fact value
    "FUBAR" and check the 20 % / 80 % share of that value in the test and
    train indices.
    """
    request_body = {
        "description": "Original index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "original",
        "test_size": 20,
        "fact": self.FACT,
        "str_val": "FUBAR"
    }

    response = self.client.post(self.url, data=request_body, format="json")
    print_output(
        'test_create_original_split_fact_value_given:response.data',
        response.data)

    # The lookup is kept for its effect: it fails if no splitter with the returned id exists.
    IndexSplitter.objects.get(id=response.data['id'])
    sleep(5)

    def distribution_of(index):
        return ElasticAggregator(indices=index).get_fact_values_distribution(self.FACT)

    original_distribution = distribution_of(self.test_index_name)
    test_distribution = distribution_of(INDEX_SPLITTING_TEST_INDEX)
    train_distribution = distribution_of(INDEX_SPLITTING_TRAIN_INDEX)

    print_output(
        'original_dist, test_dist, train_dist',
        [original_distribution, test_distribution, train_distribution])

    # Only the selected fact value is checked; nothing is asserted when it is absent.
    if "FUBAR" in original_distribution:
        quant = original_distribution["FUBAR"]
        self.assertTrue(
            self.is_between_limits(test_distribution["FUBAR"], quant, 0.2))
        self.assertTrue(
            self.is_between_limits(train_distribution["FUBAR"], quant, 0.8))
def _get_max_class_size(self) -> int: """Aggregates over values of the selected fact and returns the size of the largest class.""" max_class_size = 0 fact_name = self._get_fact_name() try: query = json.loads(self.tagger_object.query) except: query = self.tagger_object.query if fact_name: es_aggregator = ElasticAggregator(indices=self.indices, query=query) facts = es_aggregator.get_fact_values_distribution( fact_name=fact_name, fact_name_size=10, fact_value_size=10) logging.getLogger(INFO_LOGGER).info(f"Class frequencies: {facts}") max_class_size = max(facts.values()) return max_class_size
def validate_entity_facts(indices: List[str], query: dict, true_fact: str, pred_fact: str, doc_path: str):
    """
    Check that the facts chosen for entity evaluation meet the requirements.

    With ``doc_path`` given, both facts must have instances under that path.
    Without it, both facts must share the same set of doc paths and each must
    be tied to at most one path.

    Returns:
        True when all checks pass.

    Raises:
        ValidationError: when any of the checks above fails.
    """
    # The aggregator receives a copy, so the caller's query object is left alone.
    aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))

    def doc_paths_of(fact_name):
        return aggregator.facts_abstract(key_field="fact", value_field="doc_path", filter_by_key=fact_name)

    true_fact_doc_paths = doc_paths_of(true_fact)
    pred_fact_doc_paths = doc_paths_of(pred_fact)

    if doc_path:
        if doc_path not in true_fact_doc_paths:
            raise ValidationError(
                f"The selected true_fact ('{true_fact}') doesn't contain any instances corresponding to the selected field('{doc_path}')."
            )
        if doc_path not in pred_fact_doc_paths:
            raise ValidationError(
                f"The selected predicted_fact ('{pred_fact}') doesn't contain any instances corresponding to the selected field('{doc_path}')."
            )
        return True

    # No field given: the facts themselves must point at one common field.
    if set(true_fact_doc_paths) != set(pred_fact_doc_paths):
        raise ValidationError(
            f"The doc paths for true and predicted facts are different (true = {true_fact_doc_paths}; predicted = {pred_fact_doc_paths}). Please make sure you are evaluating facts based on the same fields."
        )
    if len(true_fact_doc_paths) > 1:
        raise ValidationError(
            f"Selected true fact ({true_fact}) is related to two or more fields {true_fact_doc_paths}, but the value for parameter 'field' isn't defined. Please define parameter 'field'."
        )
    if len(pred_fact_doc_paths) > 1:
        raise ValidationError(
            f"Selected predicted fact ({pred_fact}) is related to two or more fields {pred_fact_doc_paths}, but the value for parameter 'field' isn't defined. Please define parameter 'field'."
        )
    return True
def run_apply_tagger_group_to_index(self):
    """Tests applying tagger group to index using apply_to_index endpoint."""
    # Make sure reindexer task has finished. The task is re-read from the
    # database on every pass so that a status change can actually be seen.
    reindexer_task = self.reindexer_object.task
    while reindexer_task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_tagger_group_to_index: waiting for reindexer task to finish, current status:',
            reindexer_task.status)
        sleep(2)
        reindexer_task.refresh_from_db()

    url = f'{self.url}{self.test_imported_tagger_group_id}/apply_to_index/'

    payload = {
        "description": "apply tagger test task",
        "new_fact_name": self.new_fact_name,
        "indices": [{
            "name": self.test_index_copy
        }],
        "fields": [TEST_FIELD],
        "lemmatize": False,
        "n_similar_docs": 10,
        "n_candidate_tags": 10
    }
    response = self.client.post(url, payload, format='json')
    print_output('test_apply_tagger_group_to_index:response.data',
                 response.data)

    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    tagger_group_object = TaggerGroup.objects.get(
        pk=self.test_imported_tagger_group_id)

    # Wait until the task has finished, re-reading it on every pass.
    while tagger_group_object.task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_tagger_group_to_index: waiting for applying tagger task to finish, current status:',
            tagger_group_object.task.status)
        sleep(2)
        tagger_group_object.task.refresh_from_db()

    results = ElasticAggregator(
        indices=[self.test_index_copy]).get_fact_values_distribution(
            self.new_fact_name)
    print_output(
        "test_apply_tagger_group_to_index:elastic aggerator results:",
        results)

    # Check if at least one new fact is added
    self.assertGreaterEqual(len(results), 1)

    # Clean up: register the files of every tagger in the group for removal.
    # The group object fetched above is reused instead of querying it again.
    for tagger in tagger_group_object.taggers.all():
        self.add_cleanup_files(tagger.id)
def test_query_given(self):
    """
    Split the test index with a query and the fact value "bar", then check
    that only "bar" shows up in the resulting test and train indices.
    """
    request_body = {
        "description": "Original index splitting",
        "indices": [{"name": self.test_index_name}],
        "train_index": INDEX_SPLITTING_TRAIN_INDEX,
        "test_index": INDEX_SPLITTING_TEST_INDEX,
        "distribution": "original",
        "test_size": 20,
        "fact": self.FACT,
        "str_val": "bar",
        "query": json.dumps(TEST_QUERY)
    }

    response = self.client.post(self.url, data=request_body, format="json")
    print_output('test_query_given:response.data', response.data)

    def distribution_of(index):
        return ElasticAggregator(indices=index).get_fact_values_distribution(self.FACT)

    original_distribution = distribution_of(self.test_index_name)
    test_distribution = distribution_of(INDEX_SPLITTING_TEST_INDEX)
    train_distribution = distribution_of(INDEX_SPLITTING_TRAIN_INDEX)

    print_output(
        'original_dist, test_dist, train_dist',
        [original_distribution, test_distribution, train_distribution])

    # The selected value must be present in both resulting indices...
    self.assertTrue("bar" in test_distribution)
    self.assertTrue("bar" in train_distribution)
    # ...and the other values in neither of them.
    for other_value in ("foo", "FUBAR"):
        self.assertTrue(
            not (other_value in train_distribution or other_value in test_distribution))
def run_apply_crf_to_index(self):
    """Tests applying extractor to index using apply_to_index endpoint."""
    test_tagger_id = self.test_crf_ids[0]
    url = f'{self.url}{test_tagger_id}/apply_to_index/'
    payload = {
        "description": "apply crf test task",
        "mlp_fields": ["text_mlp"],
        "indices": [{
            "name": self.test_index_copy
        }],
    }
    response = self.client.post(url, payload, format='json')
    print_output('test_apply_crf_to_index:response.data', response.data)
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)

    tagger_object = CRFExtractor.objects.get(pk=test_tagger_id)
    # Wait until the task has finished. The task is re-read from the database
    # on every pass; the related object fetched on first access would
    # otherwise never show a status change.
    while tagger_object.task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_crf_to_index: waiting for applying tagger task to finish, current status:',
            tagger_object.task.status)
        sleep(2)
        tagger_object.task.refresh_from_db()

    results_old = ElasticAggregator(
        indices=[self.test_index_name]).get_fact_values_distribution("GPE")
    print_output(
        "test_apply_crf_to_index_before:elastic aggerator results:",
        results_old)
    results_new = ElasticAggregator(
        indices=[self.test_index_copy]).get_fact_values_distribution("GPE")
    print_output(
        "test_apply_crf_to_index_after:elastic aggerator results:",
        results_new)

    # The index the extractor was applied to must hold more of each of these
    # facts than the untouched index.
    for item in ["China", "Russia", "Iran"]:
        self.assertLess(results_old[item], results_new[item])
def run_test_apply_tagger_to_index(self):
    """Tests applying tagger to index using apply_to_index endpoint."""
    # Make sure reindexer task has finished. The task is re-read from the
    # database on every pass so that a status change can actually be seen.
    reindexer_task = self.reindexer_object.task
    while reindexer_task.status != Task.STATUS_COMPLETED:
        print_output('[Regex Tagger] test_apply_tagger_to_index: waiting for reindexer task to finish, current status:', reindexer_task.status)
        sleep(2)
        reindexer_task.refresh_from_db()

    # Create the regex tagger that will be applied to the index.
    tagger_payload = {
        "description": "LOLL",
        "lexicon": ["loll"],
        "counter_lexicon": ["päris"]
    }
    response = self.client.post(self.url, tagger_payload)
    print_output('[Regex Tagger] new regex tagger for applying on the index:response.data', response.data)
    created_id = response.data['id']
    self.tagger_id = created_id

    url = f'{self.url}{self.tagger_id}/apply_to_index/'
    payload = {
        "description": "apply tagger test task",
        "indices": [{"name": self.test_index_copy}],
        "fields": [TEST_FIELD]
    }
    response = self.client.post(url, payload, format='json')
    print_output('[Regex Tagger] test_apply_tagger_to_index:response.data', response.data)
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)

    tagger_object = RegexTagger.objects.get(pk=self.tagger_id)
    # Wait until the task has finished, re-reading it on every pass.
    while tagger_object.task.status != Task.STATUS_COMPLETED:
        print_output("tagger object:", tagger_object.to_json())
        print_output('[Regex Tagger] test_apply_tagger_to_index: waiting for applying tagger task to finish, current status:', tagger_object.task.status)
        sleep(2)
        tagger_object.task.refresh_from_db()

    results = ElasticAggregator(indices=[self.test_index_copy]).get_fact_values_distribution("LOLL")
    print_output("[Regex Tagger] test_apply_tagger_to_index:elastic aggerator results:", results)

    # Check if the expected number of new facts is added
    fact_value_1 = "loll"
    fact_value_2 = "lollikindel"
    n_fact_value_1 = 28
    n_fact_value_2 = 1

    self.assertIn(fact_value_1, results)
    self.assertIn(fact_value_2, results)
    self.assertEqual(results[fact_value_1], n_fact_value_1)
    self.assertEqual(results[fact_value_2], n_fact_value_2)
def run_apply_multiclass_tagger_to_index(self):
    """Tests applying multiclass BERT tagger to index using apply_to_index endpoint."""
    # Make sure reindexer task has finished. The task is re-read from the
    # database on every pass so that a status change can actually be seen.
    reindexer_task = self.reindexer_object.task
    while reindexer_task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_multiclass_bert_tagger_to_index: waiting for reindexer task to finish, current status:',
            reindexer_task.status)
        sleep(2)
        reindexer_task.refresh_from_db()

    url = f'{self.url}{self.test_imported_multiclass_gpu_tagger_id}/apply_to_index/'

    payload = {
        "description": "apply bert tagger to index test task",
        "new_fact_name": self.new_multiclass_fact_name,
        "new_fact_value": self.new_fact_value,
        "indices": [{
            "name": self.test_index_copy
        }],
        "fields": TEST_FIELD_CHOICE
    }
    response = self.client.post(url, payload, format='json')
    print_output(
        'test_apply_multiclass_bert_tagger_to_index:response.data',
        response.data)

    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    tagger_object = BertTaggerObject.objects.get(
        pk=self.test_imported_multiclass_gpu_tagger_id)

    # Wait until the task has finished, re-reading it on every pass.
    while tagger_object.task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_multiclass_bert_tagger_to_index: waiting for applying tagger task to finish, current status:',
            tagger_object.task.status)
        sleep(2)
        tagger_object.task.refresh_from_db()

    results = ElasticAggregator(
        indices=[self.test_index_copy]).get_fact_values_distribution(
            self.new_multiclass_fact_name)
    print_output(
        "test_apply_multiclass_bert_tagger_to_index:elastic aggerator results:",
        results)

    # Check if the expected facts and the expected number of them are added to the index
    expected_fact_value = "bar"
    expected_number_of_facts = 30
    self.assertIn(expected_fact_value, results)
    self.assertEqual(results[expected_fact_value], expected_number_of_facts)

    self.add_cleanup_files(self.test_imported_multiclass_gpu_tagger_id)
def run_apply_multiclass_tagger_to_index(self):
    """Tests applying multiclass tagger to index using apply_to_index endpoint."""
    # Make sure reindexer task has finished. The task is re-read from the
    # database on every pass so that a status change can actually be seen.
    reindexer_task = self.reindexer_object.task
    while reindexer_task.status != Task.STATUS_COMPLETED:
        print_output('test_apply_multiclass_tagger_to_index: waiting for reindexer task to finish, current status:', reindexer_task.status)
        sleep(2)
        reindexer_task.refresh_from_db()

    test_tagger_id = self.test_imported_multiclass_tagger_id
    url = f'{self.url}{test_tagger_id}/apply_to_index/'

    payload = {
        "description": "apply multiclass tagger test task",
        "new_fact_name": self.new_fact_name,
        "indices": [{"name": self.test_index_copy}],
        "fields": TEST_FIELD_CHOICE,
        "lemmatize": False
    }
    response = self.client.post(url, payload, format='json')
    print_output('test_apply_multiclass_tagger_to_index:response.data', response.data)

    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    tagger_object = Tagger.objects.get(pk=test_tagger_id)

    # Wait until the task has finished, re-reading it on every pass.
    while tagger_object.task.status != Task.STATUS_COMPLETED:
        print_output('test_apply_mutliclass_tagger_to_index: waiting for applying tagger task to finish, current status:', tagger_object.task.status)
        sleep(2)
        tagger_object.task.refresh_from_db()

    results = ElasticAggregator(indices=[self.test_index_copy]).get_fact_values_distribution(self.new_fact_name)
    print_output("test_apply_multiclass_tagger_to_index:elastic aggerator results:", results)

    # Check if applying the tagger results in at least 1 new fact
    self.assertGreaterEqual(len(results), 1)

    fact_value_1 = "bar"
    fact_value_2 = "foo"
    n_fact_value_1 = 18
    n_fact_value_2 = 12

    # Check if expected number of new facts is added to the index
    self.assertIn(fact_value_1, results)
    self.assertIn(fact_value_2, results)
    self.assertEqual(results[fact_value_1], n_fact_value_1)
    self.assertEqual(results[fact_value_2], n_fact_value_2)

    self.add_cleanup_files(test_tagger_id)
def run_apply_binary_tagger_to_index(self):
    """Tests applying binary torch tagger to index using apply_to_index endpoint."""
    # Make sure reindexer task has finished. The task is re-read from the
    # database on every pass so that a status change can actually be seen.
    reindexer_task = self.reindexer_object.task
    while reindexer_task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_binary_torch_tagger_to_index: waiting for reindexer task to finish, current status:',
            reindexer_task.status)
        sleep(2)
        reindexer_task.refresh_from_db()

    url = f'{self.url}{self.test_tagger_id}/apply_to_index/'

    payload = {
        "description": "apply torch tagger to index test task",
        "new_fact_name": self.new_fact_name,
        "new_fact_value": self.new_fact_value,
        "indices": [{
            "name": self.test_index_copy
        }],
        "fields": TEST_FIELD_CHOICE
    }
    response = self.client.post(url, payload, format='json')
    print_output('test_apply_binary_torch_tagger_to_index:response.data',
                 response.data)

    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    tagger_object = TorchTagger.objects.get(pk=self.test_tagger_id)

    # Wait until the task has finished, re-reading it on every pass.
    while tagger_object.task.status != Task.STATUS_COMPLETED:
        print_output(
            'test_apply_binary_torch_tagger_to_index: waiting for applying tagger task to finish, current status:',
            tagger_object.task.status)
        sleep(2)
        tagger_object.task.refresh_from_db()

    results = ElasticAggregator(
        indices=[self.test_index_copy]).get_fact_values_distribution(
            self.new_fact_name)
    print_output(
        "test_apply_binary_torch_tagger_to_index:elastic aggerator results:",
        results)

    # Check that more than 10 facts with the new fact value were added
    self.assertGreater(results[self.new_fact_value], 10)
def evaluate_entity_tags_task(object_id: int, indices: List[str], query: dict, es_timeout: int = 10, scroll_size: int = 100):
    """
    Run entity evaluation for the Evaluator with the given ID.

    Scrolls over the documents matching ``query``, scores the predicted fact
    against the true fact, stores a confusion-matrix plot on the evaluator and
    marks its task as completed.

    Args:
        object_id: Primary key of the Evaluator to run.
        indices: Indices to search.
        query: Query selecting the documents to evaluate.
        es_timeout: Timeout handed to the searcher, in minutes.
        scroll_size: Scroll size handed to the searcher; also used to compute
            the number of batches.

    Returns:
        True on success. When an error occurs after the evaluator was loaded,
        the error is logged and recorded on the evaluator's task, the task is
        set to failed, and None is returned.

    Raises:
        Exception: whatever was raised before the evaluator could be loaded
            is logged and re-raised, since there is no task to record it on.
    """
    # Bound before the try block so the error handler can tell whether the
    # evaluator lookup itself was what failed.
    evaluator_object = None
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting entity evaluator task for Evaluator with ID {object_id}."
        )

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact

        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically
        if not evaluator_object.field:
            # Aggregate on a copy so the searcher's query is left alone.
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(
                key_field="fact",
                value_field="doc_path",
                filter_by_key=true_fact)
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(indices=indices,
                                   field_data=[doc_path, "texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Get number of documents
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save model updates
        evaluator_object.save()

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(
            searcher, evaluator_object, true_fact, pred_fact, doc_path,
            token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        if evaluator_object is None:
            # Nothing to record the failure on; propagate the real error
            # instead of failing on an unbound name inside this handler.
            raise
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
def evaluate_tags_task(
        object_id: int,
        indices: List[str],
        query: dict,
        es_timeout: int = 10,
        scroll_size: int = 100):
    """
    Run the binary or multilabel tag evaluation for the Evaluator with the given ID.

    The evaluation is binary when both `true_fact_value` and
    `predicted_fact_value` are set on the Evaluator, otherwise multilabel.
    The function collects the label classes, checks whether there is enough
    memory to score all documents at once (falling back to scoring after
    each scroll if not), runs `scroll_and_score`, and stores the scores,
    confusion matrix and its plot on the Evaluator before marking its task
    as complete.

    Args:
        object_id: Primary key of the Evaluator object to process.
        indices: Indices passed to the ElasticSearcher / ElasticAggregator.
        query: Query selecting the documents to evaluate. A deep copy is
            handed to the aggregator; the searcher receives it as-is.
        es_timeout: Timeout value, passed to ElasticSearcher as
            `"<es_timeout>m"`.
        scroll_size: Scroll batch size for ElasticSearcher; also used to
            compute the number of batches.

    Returns:
        True on success. If any exception occurs it is logged, recorded on
        the Evaluator's task (when the Evaluator could be loaded) and the
        function returns None.
    """
    # Bound before the try-block so that the error handler can tell whether
    # the Evaluator was loaded at all; otherwise a failed lookup would raise
    # UnboundLocalError inside the handler and mask the real error.
    evaluator_object = None
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting evaluator task for Evaluator with ID {object_id}."
        )

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        # Retrieve facts and sklearn average function from the model
        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact
        true_fact_value = evaluator_object.true_fact_value
        pred_fact_value = evaluator_object.predicted_fact_value

        average = evaluator_object.average_function
        add_individual_results = evaluator_object.add_individual_results

        searcher = ElasticSearcher(
            indices=indices,
            field_data=["texta_facts"],
            query=query,
            output=ElasticSearcher.OUT_RAW,
            timeout=f"{es_timeout}m",
            callback_progress=progress,
            scroll_size=scroll_size,
        )

        # Binary
        if true_fact_value and pred_fact_value:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting binary evaluation. Comparing following fact and fact value pairs: TRUE: ({true_fact}: {true_fact_value}), PREDICTED: ({pred_fact}: {pred_fact_value})."
            )

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "binary"

            true_set = {true_fact_value, "other"}
            pred_set = {pred_fact_value, "other"}

            classes = ["other", true_fact_value]
            n_total_classes = len(classes)

        # Multilabel/multiclass
        else:
            logging.getLogger(INFO_LOGGER).info(
                f"Starting multilabel evaluation. Comparing facts TRUE: '{true_fact}', PRED: '{pred_fact}'."
            )

            # Make deepcopy of the query to avoid modifying Searcher's query.
            es_aggregator = ElasticAggregator(indices=indices, query=deepcopy(query))

            # Get all fact values corresponding to true and predicted facts to construct total set of labels
            # needed for confusion matrix, individual score calculations and memory imprint calculations
            true_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=true_fact,
            )
            pred_fact_values = es_aggregator.facts(
                size=choices.DEFAULT_MAX_AGGREGATION_SIZE,
                filter_by_fact_name=pred_fact,
            )

            true_set = set(true_fact_values)
            pred_set = set(pred_fact_values)

            classes = list(true_set.union(pred_set))
            n_total_classes = len(classes)

            # Add dummy classes for missing labels
            classes.extend([choices.MISSING_TRUE_LABEL, choices.MISSING_PRED_LABEL])

            # Set the evaluation type in the model
            evaluator_object.evaluation_type = "multilabel"

            # NOTE(review): the sort is applied to the multilabel classes only, leaving
            # the binary order as ["other", true_fact_value] - confirm this matches the
            # intended placement.
            # Order by the lowercased first character; slicing (instead of indexing)
            # keeps the same order for non-empty labels without failing on an empty one.
            classes.sort(key=lambda x: x[:1].lower())

        # Get number of documents in the query to estimate memory imprint
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Number of documents: {n_docs} | Number of classes: {len(classes)}"
        )

        # Get the memory buffer value from core variables
        core_memory_buffer_value_gb = get_core_setting("TEXTA_EVALUATOR_MEMORY_BUFFER_GB")

        # Calculate the value based on given ratio if the core variable is empty
        memory_buffer_gb = calculate_memory_buffer(
            memory_buffer=core_memory_buffer_value_gb,
            ratio=EVALUATOR_MEMORY_BUFFER_RATIO,
            unit="gb",
        )

        required_memory = get_memory_imprint(
            n_docs=n_docs,
            n_classes=len(classes),
            eval_type=evaluator_object.evaluation_type,
            unit="gb",
            int_size=64,
        )
        enough_memory = is_enough_memory_available(
            required_memory=required_memory,
            memory_buffer=memory_buffer_gb,
            unit="gb",
        )

        # Enable scoring after each scroll if there isn't enough memory
        # for calculating the scores for the whole set of documents at once.
        score_after_scroll = not enough_memory

        # If scoring after each scroll is enabled and scores are averaged after each scroll
        # the results for each averaging function besides `micro` are imprecise
        scores_imprecise = score_after_scroll and average != "micro"

        # Store document counts, labels' class counts and indicator if scores are imprecise
        evaluator_object.document_count = n_docs
        evaluator_object.n_true_classes = len(true_set)
        evaluator_object.n_predicted_classes = len(pred_set)
        evaluator_object.n_total_classes = n_total_classes
        evaluator_object.scores_imprecise = scores_imprecise
        evaluator_object.score_after_scroll = score_after_scroll

        # Save model updates
        evaluator_object.save()

        logging.getLogger(INFO_LOGGER).info(
            f"Enough available memory: {enough_memory} | Score after scroll: {score_after_scroll}"
        )

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        # Scroll and score tags
        scores, bin_scores = scroll_and_score(
            generator=searcher,
            evaluator_object=evaluator_object,
            true_fact=true_fact,
            pred_fact=pred_fact,
            true_fact_value=true_fact_value,
            pred_fact_value=pred_fact_value,
            classes=classes,
            average=average,
            score_after_scroll=score_after_scroll,
            n_batches=n_batches,
            add_individual_results=add_individual_results,
        )

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        confusion = scores["confusion_matrix"]
        confusion = np.asarray(confusion, dtype="int64")

        # NOTE(review): only the row/column pruning is treated as conditional here;
        # the matrix and plot are stored regardless of the class count - confirm.
        if len(classes) <= choices.DEFAULT_MAX_CONFUSION_CLASSES:
            # Delete empty rows and columns corresponding to missing pred/true labels from the confusion matrix
            confusion, classes = delete_empty_rows_and_cols(confusion, classes)

        scores["confusion_matrix"] = confusion.tolist()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        evaluator_object.plot.save(
            image_name,
            create_confusion_plot(scores["confusion_matrix"], classes),
            save=False,
        )
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        # Add final scores to the model
        evaluator_object.precision = scores["precision"]
        evaluator_object.recall = scores["recall"]
        evaluator_object.f1_score = scores["f1_score"]
        evaluator_object.accuracy = scores["accuracy"]
        evaluator_object.confusion_matrix = json.dumps(scores["confusion_matrix"])

        evaluator_object.individual_results = json.dumps(
            remove_not_found(bin_scores), ensure_ascii=False
        )
        evaluator_object.add_misclassified_examples = False

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        # Without a loaded Evaluator there is no task to record the failure on.
        if evaluator_object is not None:
            # Take first 100 characters in case the error message is massive.
            error_message = f"{str(e)[:100]}..."
            evaluator_object.task.add_error(error_message)
            evaluator_object.task.update_status(Task.STATUS_FAILED)