Example No. 1
def compute_ee_prf1(pred_batch_tags: list,
                    gold_batch_tags: list) -> Tuple[float, float, float]:
    """
    Compute precision, recall, f1 for event extraction.

    TODO: We temporarily consider only one event per event type to simplify processing.

    Given a document, suppose the gold answer contains two event instances:

        A1: event type 1, argument 1 = value 1, argument 2 = value 2, argument 3 = value 3, argument 4 = value 4
        A2: event type 1, argument 1 = value 1, argument 2 = value 6, argument 3 = value 7, argument 4 = value 8

        Suppose a participant submits three predictions:
        P1: event type 1, argument 1 = value 1, argument 2 = value 2, argument 3 = value 3, argument 4 = value 8
        P2: event type 1, argument 1 = value 1, argument 2 = value 2
        P3: event type 2, argument 1 = value 9, argument 2 = value 10

    During evaluation, each gold answer is matched, without replacement, to its most similar prediction, i.e. the
    one with the same event type and the largest number of identical arguments. In the example above, A1 and P1
    share the event type and 3 arguments, while A1 and P2 share the event type and 2 arguments, so the prediction
    most similar to A1 is P1, hitting 3 arguments. Because matching is without replacement, only P2 and P3 remain,
    and the prediction most similar to A2 is P2, hitting 1 argument. Once every gold answer has found its most
    similar prediction, Precision and Recall are computed as follows:


        Precision = (3 + 1) / (4 + 2 + 2), Recall = (3 + 1) / (4 + 4)


        Note: when searching for the most similar prediction, gold answers of the same event type are prioritised
        by their number of arguments; in the example above A1 and A2 share the event type, but A1 has more
        arguments, so A1 is matched first.

        Argument precision = number of predicted (event type, argument) pairs that match the annotation
                             / total number of predicted (event type, argument) pairs

        Argument recall = number of predicted (event type, argument) pairs that match the annotation
                          / total number of annotated (event type, argument) pairs

        Argument F1 = (2 * argument precision * argument recall) / (argument precision + argument recall)

    Args:
        pred_batch_tags (list): A batch of sentence tags from predictions.
        gold_batch_tags (list): A batch of sentence tags from gold annotations.
    """
    num_sentences = len(pred_batch_tags)
    true_pred, pred_samples, gold_samples = [], [], []
    for i in range(num_sentences):
        for event_id, _ in enumerate(pred_batch_tags[i]):
            pred_arguments = to_spans(
                pred_batch_tags[i][event_id],
                ["X"] * len(pred_batch_tags[i][event_id]),
                [1.0] * len(pred_batch_tags[i][event_id]))
            gold_arguments = to_spans(
                gold_batch_tags[i][event_id],
                ["X"] * len(gold_batch_tags[i][event_id]),
                [1.0] * len(gold_batch_tags[i][event_id]))
            pred = [str(argument) for argument in pred_arguments]
            gold = [str(argument) for argument in gold_arguments]
            pred_samples.extend(pred)
            gold_samples.extend(gold)
            true_pred.extend(list(set(pred).intersection(set(gold))))
    precision = safe_division(len(true_pred), len(pred_samples))
    recall = safe_division(len(true_pred), len(gold_samples))
    f1 = safe_division(2 * precision * recall, precision + recall)

    return (precision, recall, f1)
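
The matching procedure described in the docstring can be sanity-checked in isolation. The sketch below is not the repository's implementation (which works on per-token tag sequences via `to_spans`); it is a minimal, self-contained reproduction of the worked example above, with hypothetical event/argument tuples.

def greedy_argument_hits(gold_events, pred_events):
    # Each event is (event_type, set_of_arguments). Gold events with more
    # arguments are matched first; each prediction can be used only once
    # (matching without replacement), mirroring the docstring's procedure.
    pred_pool = list(pred_events)
    hits = 0
    for g_type, g_args in sorted(gold_events, key=lambda e: -len(e[1])):
        best_idx, best_overlap = None, -1
        for idx, (p_type, p_args) in enumerate(pred_pool):
            overlap = len(g_args & p_args) if p_type == g_type else -1
            if overlap > best_overlap:
                best_idx, best_overlap = idx, overlap
        if best_idx is not None:
            hits += best_overlap
            pred_pool.pop(best_idx)
    return hits


gold = [("T1", {"arg1=1", "arg2=2", "arg3=3", "arg4=4"}),
        ("T1", {"arg1=1", "arg2=6", "arg3=7", "arg4=8"})]
pred = [("T1", {"arg1=1", "arg2=2", "arg3=3", "arg4=8"}),
        ("T1", {"arg1=1", "arg2=2"}),
        ("T2", {"arg1=9", "arg2=10"})]
hits = greedy_argument_hits(gold, pred)            # 3 + 1 = 4
precision = hits / sum(len(a) for _, a in pred)    # 4 / (4 + 2 + 2) = 0.5
recall = hits / sum(len(a) for _, a in gold)       # 4 / (4 + 4) = 0.5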
Example No. 2
def compute_prf1(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
    """
    Compute precision, recall, f1.
        There are four types of predicted results in a confusion matrix:
            1) true positive (TP) eqv. with hit
            2) true negative (TN) eqv. with correct rejection
            3) false positive (FP) eqv. with false alarm, Type I error
            4) false negative (FN) eqv. with miss, Type II error

    We can also compute the evaluation metrics with the following equations:
        Precision = TP / (TP + FP)
        Recall = TP / (TP + FN)
        F1 = 2*TP / (2*TP + FP + FN)

    References:
        [1] https://en.wikipedia.org/wiki/Confusion_matrix

    Args:
        tp (int): The number of true positive samples.
        fp (int): The number of false positive samples.
        fn (int): The number of false negative samples.
    """
    precision = safe_division(tp, tp + fp)
    recall = safe_division(tp, tp + fn)
    f1 = safe_division(2 * tp, 2 * tp + fp + fn)

    return (precision, recall, f1)
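
A quick numeric check of the formulas above; `safe_div` is a stand-in re-implementation used only to keep the snippet self-contained (the repository's own helper is `safe_division`).

def safe_div(numerator, denominator):
    # Stand-in for safe_division: return 0 on a zero denominator.
    return numerator / denominator if denominator else 0


tp, fp, fn = 8, 2, 4
precision = safe_div(tp, tp + fp)        # 8 / 10 = 0.8
recall = safe_div(tp, tp + fn)           # 8 / 12 ≈ 0.667
f1 = safe_div(2 * tp, 2 * tp + fp + fn)  # 16 / 22 ≈ 0.727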
Example No. 3
def test_safe_division():
    """Test the function `safe_division`."""
    assert safe_division(1, 0) == 0
    assert safe_division(2, 0) == 0
    assert safe_division(2, 0.01) == 2 / 0.01
    assert safe_division(1, 1) == 1 / 1
    assert safe_division(1, 2) == 1 / 2
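
`safe_division` itself is not shown in this listing; a minimal implementation consistent with these assertions could look like the sketch below (an assumption about the helper, not the repository's code).

def safe_division(numerator, denominator):
    # Return 0 when the denominator is zero, otherwise the ordinary quotient,
    # matching the assertions in test_safe_division above.
    if denominator == 0:
        return 0
    return numerator / denominator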
Example No. 4
def sentence_distance(s1: str, s2: str, n: int = 3) -> float:
    r"""
    Given a pair of sentences, s1 and s2, represent their semantic distance as one minus the sum of the
    Jaccard indices between the sentences' character i-gram sets for i = 1..n:

        DIST(a, b) = 1 - \sum_{i=1}^{n} \frac{|ngram_{i}(a) \cap ngram_{i}(b)|}{|ngram_{i}(a) \cup ngram_{i}(b)|}

    Args:
        s1 (str): Sentence 1.
        s2 (str): Sentence 2.
        n (int): The maximum n-gram length.

    Reference:
        [1] https://www.aclweb.org/anthology/N18-3005/
        [2] Data Collection for a Production Dialogue System: A Clinc Perspective
    """
    assert isinstance(s1, str) and isinstance(s2, str)
    assert isinstance(n, int) and n > 0
    if s1 == s2:
        return 0.0
    prob_sum = 0.0
    while n > 0:
        set_1 = set([s1[i: i + n] for i in range(len(s1) - n + 1)])
        set_2 = set([s2[i: i + n] for i in range(len(s2) - n + 1)])
        inter_sets = set_1.intersection(set_2)
        union_sets = set_1.union(set_2)
        prob_sum += safe_division(len(inter_sets), len(union_sets))
        n -= 1

    return 1.0 - prob_sum
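
The n-grams here are character n-grams taken directly from the raw strings. The short standalone check below shows the bigram sets and the single Jaccard term that `sentence_distance` accumulates for n = 2.

s1, s2 = "abcd", "abce"
bigrams_1 = {s1[i:i + 2] for i in range(len(s1) - 1)}   # {"ab", "bc", "cd"}
bigrams_2 = {s2[i:i + 2] for i in range(len(s2) - 1)}   # {"ab", "bc", "ce"}
jaccard_2 = len(bigrams_1 & bigrams_2) / len(bigrams_1 | bigrams_2)  # 2 / 4 = 0.5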
Example No. 5
    def _test(self, loader: DataLoader) -> Dict[str, Any]:
        """
        Test a model and return the results.

        Args:
            loader (DataLoader): The dataloader to run evaluation on.
        """
        self._model.eval()
        batch_true_tags, batch_pred_tags = [], []
        test_loss = 0.0
        with torch.no_grad():
            for _, batch_group in enumerate(loader):
                true_tags, pred_tags = self._test_callback(
                    batch_group, self._model, self._out_adapter, self._config)
                batch_true_tags.extend(true_tags)
                batch_pred_tags.extend(pred_tags)
                loss = self._train_callback(batch_group, self._model,
                                            self._loss_fn, self._config)
                test_loss += loss.item()
        precision_score, recall_score, f1_score = compute_ner_prf1(
            batch_pred_tags, batch_true_tags)
        test_loss = safe_division(test_loss, len(loader))
        results = {
            "f1-score": f1_score,
            "precision-score": precision_score,
            "recall-score": recall_score,
            "test-loss": test_loss
        }

        return results
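
A hedged usage sketch of the returned dictionary; `trainer` and `dev_loader` are hypothetical names for an instance of this class and a DataLoader.

results = trainer._test(dev_loader)
print(f"F1 {results['f1-score']:.4f}  "
      f"P {results['precision-score']:.4f}  "
      f"R {results['recall-score']:.4f}  "
      f"loss {results['test-loss']:.4f}")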
Example No. 6
def span_distribution(dataset: List[dict]) -> Dict[str, Any]:
    """
    Calculate the sentence length distribution, span length distribution, label distribution, etc.
    The data format should be the same as in the following sample (each span also needs a "text" field,
    which is collected into the returned "entities" set):

        dataset = [
            {"text": "abcdefg", "spans": [{"start": 0, "end": 1, "label": "example", "text": "ab"}]}
        ]
    Args:
        dataset (List[dict]): Training set.
    """
    results = {
        "sentence": {
            "max": 0,
            "min": 0,
            "avg": 0,
            "dist": defaultdict(int)
        },
        "span": {
            "max": 0,
            "min": 0,
            "avg": 0,
            "dist": defaultdict(int)
        },
        "label": {
            "dist": defaultdict(int)
        },
        "entities": set()
    }
    for sample in dataset:
        results["sentence"]["dist"][len(sample["text"])] += 1
        for ins in sample["spans"]:
            results["span"]["dist"][ins["end"] - ins["start"] + 1] += 1
            results["label"]["dist"][ins["label"]] += 1
            results["entities"].add(ins["text"])
    for category in ["sentence", "span", "label"]:
        if category != "label":
            results[category]["avg"] = safe_division(
                sum([
                    length * count
                    for length, count in results[category]["dist"].items()
                ]), sum(results[category]["dist"].values()))
            lengths = list(results[category]["dist"].keys())
            if len(lengths) == 0:
                lengths += [0]
            results[category]["max"] = max(lengths)
            results[category]["min"] = min(lengths)
            results[category]["dist"] = sorted(list(
                results[category]["dist"].items()),
                                               key=lambda t: t[0])
        else:
            results[category]["dist"] = list(results[category]["dist"].items())
    results["entities"] = sorted(list(results["entities"]))

    return results
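
A toy call in the documented format, as a sketch that assumes `span_distribution` and its helpers (`safe_division`, `defaultdict`) are importable from this codebase.

dataset = [{
    "text": "abcdefg",
    "spans": [{"start": 0, "end": 1, "label": "example", "text": "ab"}],
}]
stats = span_distribution(dataset)
# stats["sentence"]["avg"] == 7.0, stats["span"]["dist"] == [(2, 1)],
# stats["label"]["dist"] == [("example", 1)], stats["entities"] == ["ab"]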
Example No. 7
def span_coverage_ratio(train_set: List[dict],
                        test_set: List[dict]) -> Union[float, int]:
    r"""
    Calculate the Span Coverage Ratio (SCR) of the test set over the training set. All data should follow the
    same format as the sample below:

        train/dev/test-set = [
            {"text": "abcdefg", "spans": [{"start": 0, "end": 1, "label": "example"}]}
        ]

    We use the following equation to calculate the Span Coverage Ratio, where \#(e^{tr,k}) and \#(e^{te,k}) denote
    the numbers of spans with label k in the training and test sets, and C^{tr} and C^{te} are the corresponding
    total span counts:

        p = \frac{1}{C^{te}} \sum_{k=1}^{K} \frac{\#(e^{tr,k})}{C^{tr}} \, \#(e^{te,k})

    Args:
        train_set (List[dict]): Training set.
        test_set (List[dict]): Test set.

    References:
        [1] Rethinking Generalization of Neural Models: A Named Entity Recognition Case Study
        [2] https://github.com/pfliu-nlp/Named-Entity-Recognition-NER-Papers
    """
    labels = set()
    train_count, test_count = defaultdict(int), defaultdict(int)
    for sample in train_set:
        for span in sample["spans"]:
            labels.add(span["label"])
            train_count[span["label"]] += 1
    for sample in test_set:
        for span in sample["spans"]:
            labels.add(span["label"])
            test_count[span["label"]] += 1
    p = 0.0
    n_train_spans, n_test_spans = sum(train_count.values()), sum(
        test_count.values())
    for label in labels:
        p += safe_division(train_count[label],
                           n_train_spans) * test_count[label]
    p = safe_division(p, n_test_spans)

    return p
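
A small worked example of the SCR formula: standalone arithmetic mirroring what span_coverage_ratio computes from the per-label span counts (the label names are made up).

train_count = {"PER": 6, "LOC": 4}   # C^{tr} = 10
test_count = {"PER": 3, "LOC": 1}    # C^{te} = 4
c_tr, c_te = sum(train_count.values()), sum(test_count.values())
scr = sum(train_count[k] / c_tr * test_count[k] for k in test_count) / c_te
# scr = (0.6 * 3 + 0.4 * 1) / 4 = 0.55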
Example No. 8
def compute_ner_prf1(pred_batch_tags: list, gold_batch_tags: list) -> Tuple[float, float, float]:
    """
    Compute precision, recall, f1 for named entity recognition.

    Args:
        pred_batch_tags (list): A batch of sentence tags from predictions.
        gold_batch_tags (list): A batch of sentence tags from gold annotations.
    """
    num_sentences = len(pred_batch_tags)
    true_pred, pred_samples, gold_samples = [], [], []
    for i in range(num_sentences):
        pred = [str(span) for span in to_spans(
            pred_batch_tags[i],
            ["X"] * len(pred_batch_tags[i]),
            [1.0] * len(pred_batch_tags[i]))]
        gold = [str(span) for span in to_spans(
            gold_batch_tags[i],
            ["X"] * len(gold_batch_tags[i]),
            [1.0] * len(gold_batch_tags[i]))]
        pred_samples.extend(pred)
        gold_samples.extend(gold)
        true_pred.extend(list(set(pred).intersection(set(gold))))
    precision = safe_division(len(true_pred), len(pred_samples))
    recall = safe_division(len(true_pred), len(gold_samples))
    f1 = safe_division(2*precision*recall, precision+recall)

    return (precision, recall, f1)
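
A hedged usage sketch; the exact tag scheme and the string form produced by `to_spans` depend on this codebase, so the values below are illustrative only.

pred_tags = [["B-PER", "I-PER", "O", "B-LOC"]]
gold_tags = [["B-PER", "I-PER", "O", "B-LOC"]]
p, r, f1 = compute_ner_prf1(pred_tags, gold_tags)  # expected (1.0, 1.0, 1.0)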
Example No. 9
def lexicon_distribution(gazetteer: Gazetteer,
                         dataset: List[dict]) -> Dict[str, Any]:
    """
    Given a dataset, compute matched lexicon distribution from a gazetteer.

    Args:
        gazetteer (Gazetteer): Gazetteer used to search matched lexicons.
        dataset (List[dict]): Dataset.
    """
    results = {
        "token": {
            "max": 0,
            "min": 0,
            "avg": 0,
            "dist": defaultdict(int)
        },
        "sentence": {
            "max": 0,
            "min": 0,
            "avg": 0,
            "dist": defaultdict(int)
        }
    }
    batch_tokens = [list(sample["text"]) for sample in dataset]
    batch_token_matched_lexicons = [[[] for _ in sample["text"]]
                                    for sample in dataset]
    batch_sentence_matched_lexicons = [[] for _ in dataset]
    for i, tokens in enumerate(batch_tokens):
        for j, _ in enumerate(tokens):
            matched_lexicons = gazetteer.search(tokens[j:])
            for lexicon in matched_lexicons:
                batch_sentence_matched_lexicons[i].append(lexicon)
                for k in range(j, j + len(lexicon)):
                    batch_token_matched_lexicons[i][k].append(lexicon)
    # sentence level
    for sentence_matched_lexicons in batch_sentence_matched_lexicons:
        results["sentence"]["dist"][len(sentence_matched_lexicons)] += 1
    # token level
    for batch_id, _ in enumerate(batch_token_matched_lexicons):
        for token_pos_id, _ in enumerate(
                batch_token_matched_lexicons[batch_id]):
            results["token"]["dist"][len(
                batch_token_matched_lexicons[batch_id][token_pos_id])] += 1

    for category in ["sentence", "token"]:
        results[category]["avg"] = safe_division(
            sum([
                number * count
                for number, count in results[category]["dist"].items()
            ]), sum(results[category]["dist"].values()))
        numbers = list(results[category]["dist"].keys())
        if len(numbers) == 0:
            numbers += [0]
        results[category]["max"] = max(numbers)
        results[category]["min"] = min(numbers)
        results[category]["dist"] = sorted(list(
            results[category]["dist"].items()),
                                           key=lambda t: t[0])

    # entity coverage ratio in gazetteer (gECR)
    total_count, matched_count = 0, 0
    for sample in dataset:
        for span in sample["spans"]:
            total_count += 1
            if gazetteer.exist(list(span["text"])):
                matched_count += 1
    results["gECR"] = safe_division(matched_count, total_count)

    return results
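
A usage sketch with a minimal stand-in gazetteer (an assumption for illustration only; the real Gazetteer class is not shown in this listing). The stand-in mimics the two methods used above: `search` returns lexicons that match a prefix of the remaining tokens, and `exist` checks full-lexicon membership.

class ToyGazetteer:
    def __init__(self, lexicons):
        self.lexicons = [list(lexicon) for lexicon in lexicons]

    def search(self, tokens):
        # All lexicons that start exactly at the current position.
        return [lexicon for lexicon in self.lexicons
                if list(tokens[:len(lexicon)]) == lexicon]

    def exist(self, tokens):
        return list(tokens) in self.lexicons


dataset = [{"text": "abcdefg",
            "spans": [{"start": 0, "end": 1, "label": "example", "text": "ab"}]}]
stats = lexicon_distribution(ToyGazetteer(["ab", "cde"]), dataset)
# stats["gECR"] == 1.0 because the only annotated span text "ab" is in the gazetteer.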