Example #1
    def post(self, request, project_pk: int):
        """
        Returns existing fact names and values from Elasticsearch.
        """
        serializer = ProjectFactAggregatorSerializer(data=request.data)

        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)

        indices = serializer.validated_data["indices"]
        indices = [index["name"] for index in indices]

        # retrieve and validate project indices
        project = get_object_or_404(Project, pk=project_pk)
        self.check_object_permissions(request, project)
        project_indices = project.get_available_or_all_project_indices(indices)  # Gives all if none, the default, is entered.

        if not project_indices:
            return Response([])

        key_field = serializer.validated_data["key_field"]
        value_field = serializer.validated_data["value_field"]
        filter_by_key = serializer.validated_data["filter_by_key"]
        max_count = serializer.validated_data["max_count"]
        query = serializer.validated_data["query"]

        if isinstance(query, str):
            query = json.loads(query)

        aggregator = ElasticAggregator(indices=project_indices, query=query)
        results = aggregator.facts_abstract(key_field=key_field, value_field=value_field, filter_by_key=filter_by_key, size=max_count)

        return Response(results, status=status.HTTP_200_OK)
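
A request body this endpoint would accept might look like the sketch below. The payload keys mirror the serializer fields read above (indices, key_field, value_field, filter_by_key, max_count, query); the URL, project id, index name, fact name and auth token are placeholders, not values from the source.

# Hypothetical client call; only the payload keys are taken from the view above,
# everything else (URL, ids, names, token) is a placeholder.
import json

import requests

payload = {
    "indices": [{"name": "dataset_index"}],   # same shape as validated_data["indices"]
    "key_field": "fact",
    "value_field": "doc_path",
    "filter_by_key": "PER",                   # hypothetical fact name
    "max_count": 30,
    "query": json.dumps({"query": {"match_all": {}}}),  # the view json.loads() string queries
}

response = requests.post(
    "http://localhost:8000/api/v1/projects/1/aggregate_facts/",  # placeholder URL
    json=payload,
    headers={"Authorization": "Token <token>"},
)
print(response.status_code, response.json())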
Example #2
def validate_evaluation_type(indices: List[str], query: dict,
                             evaluation_type: str, true_fact: str,
                             pred_fact: str, true_value: str, pred_value: str):
    """ Checks if the chosen facts (and values) are applicable for the chosen evaluation type.
    """

    if evaluation_type == "binary":
        if not true_value or not pred_value:
            raise ValidationError(
                "Please specify true and predicted values for evaluation type 'binary'."
            )
    # elif evaluation_type == "multilabel":
    #     if true_value or pred_value:
    #         raise ValidationError("Please leave true and predicted values unspecified for evaluation type 'multilabel'.")
    elif evaluation_type == "entity":
        if true_value or pred_value:
            raise ValidationError(
                "Please leave true and predicted values unspecified for evaluation type 'entity'."
            )

        ag = ElasticAggregator(indices=indices, query=deepcopy(query))

        true_fact_results = ag.facts_abstract(key_field="fact",
                                              value_field="spans",
                                              filter_by_key=true_fact,
                                              size=5)
        pred_fact_results = ag.facts_abstract(key_field="fact",
                                              value_field="spans",
                                              filter_by_key=pred_fact,
                                              size=5)

        if len(true_fact_results) == 1:
            spans = json.loads(true_fact_results[0])
            if not spans[0] or (spans[0][0] == 0 and spans[0][1] == 0):
                raise ValidationError(
                    f"Did not find non-zero spans for selected true fact '{true_fact}'. Please make sure to use facts with existing spans for evaluation_type 'entity'."
                )

        if len(pred_fact_results) == 1:
            spans = json.loads(pred_fact_results[0])
            if not spans[0] or (spans[0][0] == 0 and spans[0][1] == 0):
                raise ValidationError(
                    f"Did not find non-zero spans for selected predicted fact '{pred_fact}'. Please make sure to use facts with existing spans for evaluation_type 'entity'."
                )

    return True
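
A minimal sketch of calling this validator directly, assuming ValidationError is DRF's rest_framework.exceptions.ValidationError (consistent with the raise statements above); the index and fact names are hypothetical.

from rest_framework.exceptions import ValidationError  # assumed to match the validator's import

query = {"query": {"match_all": {}}}

# 'binary' requires both a true and a predicted value...
try:
    validate_evaluation_type(
        indices=["dataset_index"],     # hypothetical index
        query=query,
        evaluation_type="binary",
        true_fact="SENTIMENT",         # hypothetical fact names
        pred_fact="SENTIMENT_PRED",
        true_value="",
        pred_value="",
    )
except ValidationError as err:
    print(err.detail)

# ...while 'entity' requires the values to be left empty and both facts
# to carry non-zero spans in the selected indices.
validate_evaluation_type(
    indices=["dataset_index"],
    query=query,
    evaluation_type="entity",
    true_fact="PER",
    pred_fact="PER_PRED",
    true_value="",
    pred_value="",
)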
Example #3
def validate_entity_facts(indices: List[str], query: dict, true_fact: str,
                          pred_fact: str, doc_path: str):
    """ Check if facts chosen for entity evaluation follow all the necessary requirements. """

    ag = ElasticAggregator(indices=indices, query=deepcopy(query))

    true_fact_doc_paths = ag.facts_abstract(key_field="fact",
                                            value_field="doc_path",
                                            filter_by_key=true_fact)
    pred_fact_doc_paths = ag.facts_abstract(key_field="fact",
                                            value_field="doc_path",
                                            filter_by_key=pred_fact)

    if doc_path:
        if doc_path not in true_fact_doc_paths:
            raise ValidationError(
                f"The selected true_fact ('{true_fact}') doesn't contain any instances corresponding to the selected field ('{doc_path}')."
            )

        if doc_path not in pred_fact_doc_paths:
            raise ValidationError(
                f"The selected predicted_fact ('{pred_fact}') doesn't contain any instances corresponding to the selected field ('{doc_path}')."
            )

    if not doc_path:
        if set(true_fact_doc_paths) != set(pred_fact_doc_paths):
            raise ValidationError(
                f"The doc paths for true and predicted facts are different (true = {true_fact_doc_paths}; predicted = {pred_fact_doc_paths}). Please make sure you are evaluating facts based on the same fields."
            )

        if len(true_fact_doc_paths) > 1:
            raise ValidationError(
                f"Selected true fact ({true_fact}) is related to two or more fields {true_fact_doc_paths}, but the value for parameter 'field' isn't defined. Please define parameter 'field'."
            )

        if len(pred_fact_doc_paths) > 1:
            raise ValidationError(
                f"Selected predicted fact ({pred_fact}) is related to two or more fields {pred_fact_doc_paths}, but the value for parameter 'field' isn't defined. Please define parameter 'field'."
            )

    return True
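
The checks above are driven by the doc_path aggregation that facts_abstract returns for each fact; below is a short sketch of both call modes, with hypothetical index, fact and field names.

# With an explicit field, both facts must have instances on that field.
validate_entity_facts(
    indices=["dataset_index"],
    query={"query": {"match_all": {}}},
    true_fact="PER",
    pred_fact="PER_PRED",
    doc_path="text",                 # hypothetical field name
)

# Without a field, both facts must resolve to exactly one, identical doc path;
# otherwise one of the ValidationErrors above is raised.
validate_entity_facts(
    indices=["dataset_index"],
    query={"query": {"match_all": {}}},
    true_fact="PER",
    pred_fact="PER_PRED",
    doc_path="",
)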
Example #4
def evaluate_entity_tags_task(object_id: int,
                              indices: List[str],
                              query: dict,
                              es_timeout: int = 10,
                              scroll_size: int = 100):
    try:
        logging.getLogger(INFO_LOGGER).info(
            f"Starting entity evaluator task for Evaluator with ID {object_id}."
        )

        evaluator_object = Evaluator.objects.get(pk=object_id)
        progress = ShowProgress(evaluator_object.task, multiplier=1)

        true_fact = evaluator_object.true_fact
        pred_fact = evaluator_object.predicted_fact

        add_misclassified_examples = evaluator_object.add_misclassified_examples
        token_based = evaluator_object.token_based

        # If the user hasn't defined a field, retrieve it automatically
        if not evaluator_object.field:
            es_aggregator = ElasticAggregator(indices=indices,
                                              query=deepcopy(query))
            true_fact_doc_paths = es_aggregator.facts_abstract(
                key_field="fact",
                value_field="doc_path",
                filter_by_key=true_fact)
            doc_path = true_fact_doc_paths[0]
        else:
            doc_path = evaluator_object.field

        searcher = ElasticSearcher(indices=indices,
                                   field_data=[doc_path, "texta_facts"],
                                   query=query,
                                   output=ElasticSearcher.OUT_RAW,
                                   timeout=f"{es_timeout}m",
                                   callback_progress=progress,
                                   scroll_size=scroll_size)

        # Get number of documents
        n_docs = searcher.count()
        evaluator_object.task.total = n_docs
        evaluator_object.task.save()

        evaluator_object.document_count = n_docs
        evaluator_object.scores_imprecise = False
        evaluator_object.score_after_scroll = False
        evaluator_object.add_individual_results = False

        # Save model updates
        evaluator_object.save()

        # Get number of batches for the logger
        n_batches = math.ceil(n_docs / scroll_size)

        scores, misclassified = scroll_and_score_entity(
            searcher, evaluator_object, true_fact, pred_fact, doc_path,
            token_based, n_batches, add_misclassified_examples)

        logging.getLogger(INFO_LOGGER).info(f"Final scores: {scores}")

        for conn in connections.all():
            conn.close_if_unusable_or_obsolete()

        # Generate confusion matrix plot and save it
        image_name = f"{secrets.token_hex(15)}.png"
        classes = ["other", true_fact]
        evaluator_object.plot.save(image_name,
                                   create_confusion_plot(
                                       scores["confusion_matrix"], classes),
                                   save=False)
        image_path = pathlib.Path(MEDIA_URL) / image_name
        evaluator_object.plot.name = str(image_path)

        evaluator_object.save()
        evaluator_object.task.complete()
        return True

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        error_message = f"{str(e)[:100]}..."  # Take first 100 characters in case the error message is massive.
        evaluator_object.task.add_error(error_message)
        evaluator_object.task.update_status(Task.STATUS_FAILED)
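
In the project this function is presumably registered as a Celery task and dispatched asynchronously; the decorator is not visible in this snippet, so the sketch below only shows a synchronous call with placeholder arguments. Note also that evaluator_object is assigned inside the try block, so if Evaluator.objects.get itself fails, the except handler would hit an unbound name before it can mark the task as failed.

# Synchronous invocation for debugging; the Evaluator pk and index name are placeholders.
query = {"query": {"match_all": {}}}

evaluate_entity_tags_task(
    object_id=42,                  # placeholder Evaluator pk
    indices=["dataset_index"],     # placeholder index
    query=query,
    es_timeout=10,
    scroll_size=500,
)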