Example #1
  def get_all_fscores(self):
    n_correct = len(self.correct)
    true_entities = get_entities(self.y_true)
    pred_entities = get_entities(self.y_pred)
    n_true = len(true_entities)
    n_pred = len(pred_entities)

    # exact match: span and label must both be identical
    p = n_correct / n_pred if n_pred > 0 else 0
    r = n_correct / n_true if n_true > 0 else 0
    exact_f_score = 2 * p * r / (p + r) if p + r > 0 else 0

    # relaxed match: also credit predictions with the right label over an overlapping span
    p = (n_correct + len(self.right_label_over_span)) / n_pred if n_pred > 0 else 0
    r = (n_correct + len(self.right_label_over_span)) / n_true if n_true > 0 else 0
    relaxed_f_score = 2 * p * r / (p + r) if p + r > 0 else 0

    # user-experience match: credit partial overlaps with a fractional overlap score
    overlap_pred_score = self.get_overlap_score()
    p = (n_correct + overlap_pred_score) / n_pred if n_pred > 0 else 0
    r = (n_correct + overlap_pred_score) / n_true if n_true > 0 else 0
    user_exp_f_score = 2 * p * r / (p + r) if p + r > 0 else 0

    return exact_f_score, user_exp_f_score, relaxed_f_score
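A quick worked check of the arithmetic (all counts invented for illustration):

# invented counts: 10 gold spans, 8 predictions, 6 exact matches,
# plus 2 predictions with the right label over an overlapping span
n_true, n_pred, n_correct = 10, 8, 6
p, r = n_correct / n_pred, n_correct / n_true              # 0.75, 0.60
exact_f1 = 2 * p * r / (p + r)                             # ~0.667
p, r = (n_correct + 2) / n_pred, (n_correct + 2) / n_true  # 1.00, 0.80
relaxed_f1 = 2 * p * r / (p + r)                           # ~0.889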
Example #2
def get_error_types(y_true, y_pred):
  true_entities = get_entities(y_true)
  pred_entities = get_entities(y_pred)
  correct = set(true_entities) & set(pred_entities)
  true_entities_rest = set(true_entities) - correct
  pred_entities_rest = set(pred_entities) - correct

  right_label_overlapping_span = []
  wrong_label_overlapping_span = []
  wrong_label_right_span = []
  for true_entity in true_entities_rest:
    for pred_entity in pred_entities_rest:
      overlap = get_overlap(true_entity, pred_entity)
      if len(overlap) > 0:
        if true_entity[0] == pred_entity[0]:
          right_label_overlapping_span.append((true_entity, pred_entity))
        elif true_entity[1] == pred_entity[1] and true_entity[2] == pred_entity[2]:
          wrong_label_right_span.append((true_entity, pred_entity))
        else:
          wrong_label_overlapping_span.append((true_entity, pred_entity))

  # anything not matched above is a complete miss
  complete_false_positive = (pred_entities_rest
                             - set(item[1] for item in right_label_overlapping_span)
                             - set(item[1] for item in wrong_label_overlapping_span)
                             - set(item[1] for item in wrong_label_right_span))
  complete_false_negative = (true_entities_rest
                             - set(item[0] for item in right_label_overlapping_span)
                             - set(item[0] for item in wrong_label_overlapping_span)
                             - set(item[0] for item in wrong_label_right_span))
  return (correct, right_label_overlapping_span, wrong_label_overlapping_span,
          wrong_label_right_span, complete_false_positive, complete_false_negative)
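The helper get_overlap is not defined in this snippet; a minimal sketch consistent with how it is used here (entities are (type, start, end) tuples with inclusive ends):

def get_overlap(true_entity, pred_entity):
  # token positions covered by each span, ends inclusive
  true_positions = set(range(true_entity[1], true_entity[2] + 1))
  pred_positions = set(range(pred_entity[1], pred_entity[2] + 1))
  return true_positions & pred_positions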
Example #3
def recall_score_span(y_true, y_pred, average='micro', suffix=False):
    """Compute the recall.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import recall_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> recall_score(y_true, y_pred)
        0.50
    """
    true_entities = {(y, z) for x, y, z in get_entities(y_true, suffix)}
    pred_entities = {(y, z) for x, y, z in get_entities(y_pred, suffix)}

    nb_correct = len(true_entities & pred_entities)
    nb_true = len(true_entities)

    score = nb_correct / nb_true if nb_true > 0 else 0

    return score
Example #4
    def test_get_entities_with_non_NE_input(self):
        y_true = ['O', 'O', 'O', 'MISC', 'MISC', 'MISC', 'O', 'PER', 'PER']
        with self.assertWarns(UserWarning):
            get_entities(y_true)

        with self.assertWarns(UserWarning):
            get_entities(y_true, suffix=True)
Example #5
def evaluate_term_multi_token(test_data, all_preds_bio):

    y_pred = []
    for row in all_preds_bio:
        for tag in row:
            y_pred.append(tag)

    texts_idxes = []
    y_gold = []
    tokens_list = []
    for i, line in enumerate(test_data):
        for gold_bio, token in zip(line['tags'], line['tokens']):
            y_gold.append(gold_bio)
            tokens_list.append(token)
            texts_idxes.append(i)

    gold_entities = set(get_entities(y_gold))
    pred_entities = set(get_entities(y_pred))
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in gold_entities:
        d1[e[0]].add((e[1], e[2]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    err_analysis_list = []
    # iterate over the union of types so FPs of types absent from gold are kept
    for type_name in sorted(set(d1) | set(d2)):
        gold_entities = d1[type_name]
        pred_entities = d2[type_name]
        for pred_ent in pred_entities:
            if pred_ent in gold_entities:
                eval_type = 'TP'
            else:
                eval_type = 'FP'
            tokens = ' '.join(tokens_list[pred_ent[0]:pred_ent[-1] + 1])
            text = test_data[texts_idxes[pred_ent[0]]]['text']
            err_analysis_list.append({
                'Eval_type': eval_type,
                'Term': tokens,
                'Text': text
            })

        for gold_ent in gold_entities:
            if gold_ent not in pred_entities:
                eval_type = 'FN'
                tokens = ' '.join(tokens_list[gold_ent[0]:gold_ent[-1] + 1])
                text = test_data[texts_idxes[gold_ent[0]]]['text']
                err_analysis_list.append({
                    'Eval_type': eval_type,
                    'Term': tokens,
                    'Text': text
                })

    return err_analysis_list
Example #6
def run(file_name_list, mode):
    import datetime
    from tqdm import tqdm
    print(f"------------------start For {mode}----------------------")
    conlls = []
    all_del_sen = 0
    all_label_num = 0
    for file_name in sorted(file_name_list):
        txt_name = file_name
        if mode != "test":
            ann_name = file_name[:-4] + ".csv"
            preprocess = PreProcess(txt_name, ann_name)
            all_label_num += preprocess.ann.shape[0]
        else:
            preprocess = PreProcess(txt_name)
        conll, info = preprocess.brat2conll()
        all_del_sen += info["del_sen"]
        conlls.extend(conll)

    max_len = max([len(s) + 2 for s in conlls])

    label_num = 0
    for conll in conlls:
        label = [c[-1] for c in conll]
        label_num += len(get_entities(label))

    print(f'''
            total sentences       : {len(conlls)}
            deleted sentences     : {all_del_sen}
            max sentence length   : {max_len}
            original entity count : {all_label_num}
            current entity count  : {label_num}
        ''')
    # print the label distribution
    label_dict = {}
    for sentence in conlls:
        label = [s[-1] for s in sentence]
        for entity in get_entities(label):
            entity = entity[0]
            label_dict[entity] = label_dict.get(entity, 0) + 1

    if mode != "test":
        print("实体分布情况:")
        totle_num = 0
        for l in LABELS_LIST:
            if l not in label_dict:
                label_dict.update({l: 0})
        totle_num = sum(label_dict.values())
        for k, v in sorted(label_dict.items(), key=lambda x: x[0]):
            print(f"{k}:\t{v}\t{v/totle_num}")
        print(f"All: {totle_num}")

    print("-------------------END----------------------\n")
    return conlls
Example #7
def entity_visualization(texts: List[List[str]],
                         labels: List[List[str]],
                         output_fname='entity_texts.html'):
    texts_c = deepcopy(texts)
    texts_c = [item[:-1] for item in texts_c]
    entities = [get_entities(item) for item in labels]
    all_entities = list(
        set([sub_item[0] for item in entities for sub_item in item]))
    all_entities = [item for item in all_entities if item != 'O']
    nb_entities = len(all_entities)
    if nb_entities > len(ENTITY_COLOR):
        rest_nb_colors = nb_entities - len(ENTITY_COLOR)
        colors = ENTITY_COLOR + [
            '#' + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
            for _ in range(rest_nb_colors)
        ]
    else:
        colors = ENTITY_COLOR[:nb_entities]
    assert len(colors) == nb_entities
    entity_colors = {all_entities[i]: colors[i] for i in range(nb_entities)}

    with open(output_fname, 'w') as fout:
        for x, y in zip(texts_c, entities):
            fout.write(entity2html(x, y, entity_colors))
Example #8
    def restrict_entities(text, tag, pred_prob, threshold=0.85):
        """Return restricted entities according to tag sequence: only keep at most one entity for
        each entity type
        """
        group_entities = defaultdict(list)

        chunks = get_entities(tag)
        for chunk_type, chunk_start, chunk_end in chunks:
            chunk_end += 1
            score = float(np.average(pred_prob[chunk_start: chunk_end]))
            if score >= threshold:
                entity = ''.join(text[chunk_start: chunk_end])
                group_entities[chunk_type].append((entity, score, chunk_start, chunk_end))

        results = []
        for entity_type, group in group_entities.items():
            # keep the highest-scoring entity of each type
            entity = sorted(group, key=lambda x: x[1])[-1]
            results.append({
                'name': entity[0],
                'type': entity_type,
                'score': entity[1],
                'beginOffset': entity[2],
                'endOffset': entity[3]
            })
        return results
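A hypothetical call with character-level input (text, tags and probabilities all invented for illustration):

text = list("JohnlivesinParis")
tag = ['B-PER', 'I-PER', 'I-PER', 'I-PER'] + ['O'] * 7 \
      + ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC']
pred_prob = [0.99] * 4 + [0.90] * 7 + [0.95] * 5
restrict_entities(text, tag, pred_prob)
# -> [{'name': 'John', 'type': 'PER', 'score': 0.99, 'beginOffset': 0, 'endOffset': 4},
#     {'name': 'Paris', 'type': 'LOC', 'score': 0.95, 'beginOffset': 11, 'endOffset': 16}]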
Example #9
    def read_examples_from_file(self, file_path) -> List[InputExample]:
        guid_index = 1
        examples = []
        with open(file_path, encoding="utf-8") as f:
            words, labels = [], []
            metainfo = None
            for line in f:
                line = line.rstrip()
                if line.startswith("#\tpassage"):
                    metainfo = line
                elif line == "":
                    if words:
                        prods = get_entities(labels)
                        for etype, ss, se in prods:
                            # create prod-specific instance
                            assert etype == "arm_description"
                            inst_labels = ["O"] * len(words)
                            inst_labels[ss] = "B-arm_description"
                            inst_labels[ss + 1:se + 1] = ["I-arm_description"] * (se - ss)
                            examples.append(
                                InputExample(guid=f"{guid_index}",
                                             words=words,
                                             metainfo=metainfo,
                                             labels=inst_labels))
                            guid_index += 1
                        words, labels = [], []
                else:
                    cols = line.strip().split('\t')
                    words.append(cols[0])
                    labels.append(cols[1])

        return examples
Example #10
    def call(self, predictions, log_verbose=False):
        ''' main func entrypoint'''
        preds = predictions["preds"]
        output_index = predictions["output_index"]
        if output_index is None:
            res_file = self.config["solver"]["postproc"].get("res_file", "")
            label_path_file = self.config["data"]["task"]["label_vocab"]
        else:
            res_file = self.config["solver"]["postproc"][output_index].get(
                "res_file", "")
            label_path_file = self.config["data"]["task"]["label_vocab"][
                output_index]

        if res_file == "":
            logging.info(
                "Infer res not saved. You can check 'res_file' in your config."
            )
            return
        res_dir = os.path.dirname(res_file)
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        logging.info("Save inference result to: {}".format(res_file))

        preds = ids_to_sentences(preds, label_path_file)

        with open(res_file, "w", encoding="utf-8") as out_f:
            for pre in preds:
                entities = get_entities(pre)  # e.g. [('PER', 0, 1), ('LOC', 3, 3)]
                if not entities:
                    out_f.write("Null")
                else:
                    new_line = "\t".join(
                        " ".join(map(str, entity)) for entity in entities)
                    out_f.write(new_line)
                out_f.write("\n")
Example #11
def write_outputs_to_json(out_file: str,
                          examples: List[Example],
                          y_preds: List[TAG_SEQUENCE]) -> None:
    """Writes a JSON with prediction outputs.

    Args:
        out_file: path to an output file or '-' to use stdout.
        examples: list of Example instances with associated tokens.
        y_preds: list of predicted tag sequences for each example.
    """
    output = []
    for example, y_pred in zip(examples, y_preds):
        predicted_entities = []

        for entity in get_entities(y_pred):
            entity_class, start_token_ix, end_token_ix = entity
            start_char = example.doc_tokens[start_token_ix].offset
            end_token = example.doc_tokens[end_token_ix]
            end_char = end_token.offset + len(end_token)

            predicted_entities.append({
                'class': entity_class,
                'start_char': start_char,
                'end_char': end_char,
                'text': example.orig_text[start_char:end_char],
            })
        output.append({
            'doc_id': example.doc_id,
            'text': example.orig_text,
            'entities': predicted_entities,
        })

    with smart_open(out_file) as fd:
        json.dump(output, fd)
Beispiel #12
0
    def calc_char_offset(self, words, tags):
        """
        Examples:
            >>> words = ['EU', 'rejects', 'German', 'call']
            >>> tags = ['B-ORG', 'O', 'B-MISC', 'O']
            >>> entities = get_entities(tags)
            >>> entities
            [('ORG', 0, 0), ('MISC', 2, 2)]
            >>> self.calc_char_offset(words, tags)
            {
              'text': 'EU rejects German call',
              'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
            }
        """
        doc = ' '.join(words)
        j = {'text': doc, 'labels': []}
        pos = defaultdict(int)
        for label, start_offset, end_offset in get_entities(tags):
            entity = ' '.join(words[start_offset:end_offset + 1])
            # find the next occurrence of the entity text, past previous matches
            char_left = doc.index(entity, pos[entity])
            char_right = char_left + len(entity)
            span = [char_left, char_right, label]
            j['labels'].append(span)
            pos[entity] = char_right
        return j
Example #13
    def _build_response(self, split_text, tags, poss, segs=None, words=None):
        # avoid mutable default arguments
        segs = segs if segs is not None else []
        words = words if words is not None else []
        if self.basic_token == 'char':
            res = {
                'words': split_text,
                'pos': poss,
                'char_pos': poss,
                'char_word': words,
                'seg': segs,
                'entities': []
            }
        else:
            res = {'words': split_text, 'pos': poss, 'entities': []}
        chunks = get_entities(tags)
        for chunk_type, chunk_start, chunk_end in chunks:
            chunk = self.post_process_chunk(chunk_type, chunk_start, chunk_end,
                                            split_text, poss)
            if chunk is not None:
                entity = {
                    'text': chunk,
                    'type': chunk_type,
                    'beginOffset': chunk_start,
                    'endOffset': chunk_end
                }
                res['entities'].append(entity)
        return res
Example #14
def detailed_metrics(y_gold, y_pred):
    """Calculate the main classification metrics for every label type.

    Args:
        y_gold: 2d array. Ground truth (correct) target values.
        y_pred: 2d array. Estimated targets as returned by a classifier.

    Returns:
        type_metrics: dict of label types and their metrics.
        macro_avg: dict of weighted macro averages for all metrics across label types.
    """
    gold_entities = set(get_entities(y_gold))
    pred_entities = set(get_entities(y_pred))
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in gold_entities:
        d1[e[0]].add((e[1], e[2]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    metrics = {}
    ps, rs, f1s, s = [], [], [], []
    for type_name, gold_type_entities in d1.items():
        pred_type_entities = d2[type_name]
        nb_correct = len(gold_type_entities & pred_type_entities)
        nb_pred = len(pred_type_entities)
        nb_true = len(gold_type_entities)
        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        metrics[type_name.lower() + '_precision'] = round(p, 3)
        metrics[type_name.lower() + '_recall'] = round(r, 3)
        metrics[type_name.lower() + '_f1'] = round(f1, 3)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)
    macro_avg = {
        'macro_precision': round(np.average(ps, weights=s), 3),
        'macro_recall': round(np.average(rs, weights=s), 3),
        'macro_f1': round(np.average(f1s, weights=s), 3)
    }

    return metrics, macro_avg
Example #15
def get_metrics(y_true, y_pred, suffix=False):
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    type_name_list = []
    ps, rs, f1s, s = [], [], [], []
    for type_name, true_type_entities in d1.items():
        pred_type_entities = d2[type_name]
        nb_correct = len(true_type_entities & pred_type_entities)
        nb_pred = len(pred_type_entities)
        nb_true = len(true_type_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        type_name_list.append(type_name)
        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    # compute averages
    type_name_list.append('avg / total')
    ps.append(np.average(ps, weights=s))
    rs.append(np.average(rs, weights=s))
    f1s.append(np.average(f1s, weights=s))
    s.append(np.sum(s))

    df_metrics = pd.DataFrame({
        'type_name': type_name_list,
        'precision': ps,
        'recall': rs,
        'f1-score': f1s,
        'support': s
    })

    return df_metrics
Example #16
def get_tag_dict(sequence, tag_texts):
    words = sequence.split()
    entities = get_entities(tag_texts)

    slots = defaultdict(list)
    for slot, start_idx, end_idx in entities:
        slots[slot].append(" ".join(words[start_idx : end_idx + 1]))
    return dict(slots)
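A usage sketch (the slot label is invented):

>>> get_tag_dict("book a flight to new york",
...              ['O', 'O', 'O', 'O', 'B-city', 'I-city'])
{'city': ['new york']}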
Example #17
    def verifyTestDataBalanceCRF(self, y_test):
        """Check tag (class) balance of the test dataset."""
        from seqeval.metrics.sequence_labeling import get_entities
        lst = [ls for sublist in y_test for ls in sublist]
        tags = sorted({tg[0] for tg in get_entities(lst)})
        print('{}\t{}'.format(len(tags), tags))
Example #18
def precision_recall_f1_support_sequence_labelling(y_true, y_pred):
    """Compute precision, recall, f1 and support for sequence labelling tasks.

    For given gold (`y_true`) and predicted (`y_pred`) sequence labels, returns the precision,
    recall, f1 and support per label, and the macro and micro average of these scores across
    labels. Expects `y_true` and `y_pred` to be a sequence of IOB1/2, IOE1/2, or IOBES formatted
    labels.

    Args:
        y_true (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.
        y_pred (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.

    Returns:
        A dictionary of scores keyed by the labels in `y_true` where each score is a 4-tuple
        containing precision, recall, f1 and support. Additionally includes the keys
        'Macro avg' and 'Micro avg' containing the macro and micro averages across scores.
    """
    scores = {}
    # unique labels, not including the outside (O) tag
    labels = list(
        {tag.split('-')[-1]
         for tag in set(y_true) if tag != OUTSIDE})
    labels.sort()  # ensures labels are displayed in the same order across runs / partitions

    for label in labels:
        y_true_lab = [
            tag if tag.endswith(label) else OUTSIDE for tag in y_true
        ]
        y_pred_lab = [
            tag if tag.endswith(label) else OUTSIDE for tag in y_pred
        ]

        # TODO (John): Open a pull request to seqeval with a new function that returns all these
        # scores in one call. There is a lot of repeated computation here.
        precision = precision_score(y_true_lab, y_pred_lab)
        recall = recall_score(y_true_lab, y_pred_lab)
        f1 = f1_score(y_true_lab, y_pred_lab)
        support = len(set(get_entities(y_true_lab)))

        scores[label] = precision, recall, f1, support

    # Get macro and micro performance metrics averages
    macro_precision = mean([v[0] for v in scores.values()])
    macro_recall = mean([v[1] for v in scores.values()])
    macro_f1 = mean([v[2] for v in scores.values()])
    total_support = sum([v[3] for v in scores.values()])

    micro_precision = precision_score(y_true, y_pred)
    micro_recall = recall_score(y_true, y_pred)
    micro_f1 = f1_score(y_true, y_pred)

    scores['Macro avg'] = macro_precision, macro_recall, macro_f1, total_support
    scores['Micro avg'] = micro_precision, micro_recall, micro_f1, total_support

    return scores
Example #19
def decoding(text, tag_seq):
    assert len(text) == len(
        tag_seq), f"text len: {len(text)}, tag_seq len: {len(tag_seq)}"

    puncs = list(",.?;!,。?;!")
    splits = [idx for idx in range(len(text)) if text[idx] in puncs]

    prev = 0
    sub_texts, sub_tag_seqs = [], []
    for i, split in enumerate(splits):
        sub_tag_seqs.append(tag_seq[prev:split])
        sub_texts.append(text[prev:split])
        prev = split
    sub_tag_seqs.append(tag_seq[prev:])
    sub_texts.append(text[prev:])

    ents_list = []
    for sub_text, sub_tag_seq in zip(sub_texts, sub_tag_seqs):
        ents = get_entities(sub_tag_seq, suffix=False)
        ents_list.append((sub_text, ents))

    aps = []
    no_a_words = []
    for sub_text, ent_list in ents_list:
        sub_aps = []
        sub_no_a_words = []
        for ent in ent_list:
            ent_name, start, end = ent
            if ent_name == "Aspect":
                aspect = sub_text[start:end + 1]
                sub_aps.append([aspect])
                if len(sub_no_a_words) > 0:
                    sub_aps[-1].extend(sub_no_a_words)
                    sub_no_a_words.clear()
            elif ent_name == "Opinion":
                opinion = sub_text[start:end + 1]
                if len(sub_aps) > 0:
                    sub_aps[-1].append(opinion)
                else:
                    sub_no_a_words.append(opinion)

        if sub_aps:
            aps.extend(sub_aps)
            if len(no_a_words) > 0:
                aps[-1].extend(no_a_words)
                no_a_words.clear()
        elif sub_no_a_words:
            if len(aps) > 0:
                aps[-1].extend(sub_no_a_words)
            else:
                no_a_words.extend(sub_no_a_words)

    if no_a_words:
        no_a_words.insert(0, "None")
        aps.append(no_a_words)

    return aps
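A small worked example, assuming character-level Chinese input as the slicing suggests:

text = "服务很好"
tag_seq = ['B-Aspect', 'I-Aspect', 'B-Opinion', 'I-Opinion']
decoding(text, tag_seq)
# -> [['服务', '很好']]  (the aspect followed by its opinion)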
Example #20
def split_entity(label_sequence):
    """
    Extract entities from a label sequence.
        >>> label_sequence = [['O', 'B', 'O', 'B', 'I', 'B'], ['O', 'O', 'B']]
        >>> chunks = [('_', 1, 1), ('_', 3, 4), ('_', 5, 5), ('_', 9, 9)]
    :param label_sequence: list of label sequences.
    :return: list of (chunk_type, chunk_start, chunk_end).
    """
    return get_entities(label_sequence)
Example #21
def process_tokenized_sentence_document(doc: TokenizedSentenceDocument):
    sents = doc.sent_tokens
    metadata = doc.metadata

    logger.warning('Received document labeled %s with %d sentences' % (metadata, len(sents)))
    instances = []
    start_time = time()

    for token_list in sents:
        inst_str = create_instance_string(token_list)
        logger.debug('Instance string is %s' % (inst_str))
        instances.append(inst_str)

    dataset = TemporalDocumentDataset.from_instance_list(instances, app.state.tokenizer)
    logger.warning('Dataset is as follows: %s' % (str(dataset.features)))

    preproc_end = time()

    output = app.state.trainer.predict(test_dataset=dataset)

    timex_predictions = np.argmax(output.predictions[0], axis=2)
    
    timex_results = []
    event_results = []
    relation_results = []

    pred_end = time()
    
    for sent_ind in range(len(dataset)):
        tokens = app.state.tokenizer.convert_ids_to_tokens(dataset.features[sent_ind].input_ids)
        wpind_to_ind = {}
        timex_labels = []
        for token_ind in range(1, len(tokens)):
            if dataset[sent_ind].input_ids[token_ind] <= 2:
                break
            if tokens[token_ind].startswith('Ġ'):
                wpind_to_ind[token_ind] = len(wpind_to_ind)
                timex_labels.append(timex_label_list[timex_predictions[sent_ind][token_ind]])

        timex_entities = get_entities(timex_labels)
        logging.info("Extracted %d timex entities from the sentence" % (len(timex_entities)))
        timex_results.append([Timex(timeClass=label[0], begin=label[1], end=label[2])
                              for label in timex_entities])
        event_results.append([])
        relation_results.append([])


    results = TemporalResults(timexes=timex_results, events=event_results, relations=relation_results)

    postproc_end = time()

    preproc_time = preproc_end - start_time
    pred_time = pred_end - preproc_end
    postproc_time = postproc_end - pred_end

    logging.info("Pre-processing time: %f, processing time: %f, post-processing time %f" % (preproc_time, pred_time, postproc_time))

    return results
Example #22
def extract_gold_entities_multi_token(tokens, gold_bio_tags):

    gold_tags = set(get_entities(gold_bio_tags))
    gold_entities = []
    for gold_tag in gold_tags:
        entity_tokens = ' '.join(tokens[gold_tag[1]:gold_tag[2] + 1])
        gold_entities.append(
            [gold_tag[0], entity_tokens, gold_tag[1], gold_tag[2]])

    return gold_entities
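For instance (gold_tags is a set, so the output order may vary):

tokens = ['EU', 'rejects', 'German', 'call']
extract_gold_entities_multi_token(tokens, ['B-ORG', 'O', 'B-MISC', 'O'])
# -> [['ORG', 'EU', 0, 0], ['MISC', 'German', 2, 2]] in some order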
Example #23
    def _build_response(self, sent, tags, probs):
        words = self.tokenize(sent)
        res = {
            'words': words,
            'entities': [],
            'terms': [],
            'head_rels': []
        }
        tag_ner, tag_term, tag_rel = tags
        prob_ner, prob_term, prob_rel = probs
        chunks_ner = sequence_labeling.get_entities(tag_ner)
        chunks_term = sequence_labeling.get_entities(tag_term)

        for chunk_type, chunk_start, chunk_end in chunks_ner:
            chunk_end += 1
            entity = {
                'text': ' '.join(words[chunk_start: chunk_end]),
                'type': chunk_type,
                'score': float(np.average(prob_ner[chunk_start: chunk_end])),
                'beginOffset': chunk_start,
                'endOffset': chunk_end
            }
            res['entities'].append(entity)
        for chunk_type, chunk_start, chunk_end in chunks_term:
            chunk_end += 1
            term = {
                'text': ' '.join(words[chunk_start: chunk_end]),
                'type': chunk_type,
                # use the term probabilities here, not the NER ones
                'score': float(np.average(prob_term[chunk_start: chunk_end])),
                'beginOffset': chunk_start,
                'endOffset': chunk_end
            }
            res['terms'].append(term)
        for i, tag in enumerate(tag_rel):
            if tag:
                rel = {
                    'text': words[i],
                    'score': f"{round(prob_rel[i], 4)}",
                    'offset': i
                }
                res['head_rels'].append(rel)
        return res
Example #24
    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable
            an iterable which yields either str, unicode or file objects.
            y : iterable, label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        mentions = []
        mentions_char = []
        left_contexts = []
        right_contexts = []
        outputs = []

        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
        ngram_indices = []
        for sent_word_ids, sent_char_ids in zip(word_ids, char_ids):
            ngrams = self.generate_ngrams(sent_word_ids, n=4)
            ngram_indices.append(ngrams)
            for l, r in ngrams:
                # slice within the sentence, not across the whole corpus
                mentions.append(sent_word_ids[l:r])
                mentions_char.append(sent_char_ids[l:r])
                left_contexts.append(sent_word_ids[:l])
                right_contexts.append(sent_word_ids[r:])

        if y is not None:
            for ngram, labels in zip(ngram_indices, y):
                d = {(begin_offset, end_offset + 1): t
                     for t, begin_offset, end_offset in get_entities(labels)}
                for l, r in ngram:
                    if (l, r) in d:
                        outputs.append(self._label_vocab[d[(l, r)]])
                    else:
                        outputs.append(self._label_vocab)

        outputs = np.array(outputs)
        inputs = [
            np.array(left_contexts),
            np.array(mentions),
            np.array(mentions_char),
            np.array(right_contexts)
        ]

        if y is not None:
            return inputs, outputs
        else:
            return inputs
Example #25
def summary_data(tags):
    total_entity = []

    for sen_tag in tags:
        entities = get_entities(sen_tag)
        total_entity += [ele[0] for ele in entities]
    unique, counts = np.unique(total_entity, return_counts=True)

    print('Entities for training:\n', dict(zip(unique, counts)))
Example #26
    def calc_char_offset(cls, words, tags):
        doc = ' '.join(words)
        j = {'text': doc, 'labels': []}
        pos = defaultdict(int)
        for label, start_offset, end_offset in get_entities(tags):
            entity = ' '.join(words[start_offset:end_offset + 1])
            char_left = doc.index(entity, pos[entity])
            char_right = char_left + len(entity)
            span = [char_left, char_right, label]
            j['labels'].append(span)
            pos[entity] = char_right
        return j
Example #27
    def _build_response1(self, sent, tags, prob):
        words = self.tokenizer(sent)
        res = ""
        chunks = get_entities(tags)  # extracted but unused in this plain-text response
        for index, obj in enumerate(words):
            res = res + obj + "\t" + tags[index] + "\n"
            # start a new block after sentence-ending punctuation
            if "." in obj:
                res = res + "\n"
            if "ред" in obj:
                res = res + "\n"

        return res
Example #28
def extract_tp_actual_correct(y_true, y_pred, suffix, *args):
    entities_true = defaultdict(set)
    entities_pred = defaultdict(set)
    for type_name, start, end in get_entities(y_true, suffix):
        entities_true[type_name].add((start, end))
    for type_name, start, end in get_entities(y_pred, suffix):
        entities_pred[type_name].add((start, end))

    target_names = sorted(set(entities_true.keys()) | set(entities_pred.keys()))

    tp_sum = np.array([], dtype=np.int32)
    pred_sum = np.array([], dtype=np.int32)
    true_sum = np.array([], dtype=np.int32)
    for type_name in target_names:
        entities_true_type = entities_true.get(type_name, set())
        entities_pred_type = entities_pred.get(type_name, set())
        tp_sum = np.append(tp_sum, len(entities_true_type & entities_pred_type))
        pred_sum = np.append(pred_sum, len(entities_pred_type))
        true_sum = np.append(true_sum, len(entities_true_type))

    return pred_sum, tp_sum, true_sum
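A quick sanity check of the return shape:

y_true = [['B-PER', 'I-PER', 'O', 'B-LOC']]
y_pred = [['B-PER', 'I-PER', 'O', 'B-PER']]
pred_sum, tp_sum, true_sum = extract_tp_actual_correct(y_true, y_pred, False)
# types are sorted as ['LOC', 'PER'], so:
# pred_sum = [0, 2], tp_sum = [0, 1], true_sum = [1, 1]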
Example #29
def merge_col_from_tag(row, col, tag):
    """
    Merge items of the list in row[col] according to the spans in row[tag].
    col is the column holding a token list such as [this, is, three, dollar];
    tag is the column holding a tag list such as [O, O, B-TBNorm, I-TBNorm].
    Return: [this, is, three dollar]

    If row[col] is itself a tag column, it returns [O, O, B], the tag list
    matching the merged result above.
    """
    l = row[col].copy()
    if col in ['tag', 'tag_pred']:
        # iterate right-to-left so earlier indices stay valid while merging
        for tup in get_entities(row[tag])[::-1]:
            for _ in range(tup[1], tup[2] + 1):
                l.pop(tup[1])
            l.insert(tup[1], 'B')
    else:
        for tup in get_entities(row[tag])[::-1]:
            text = row[col][tup[1]:tup[2] + 1]
            for _ in range(tup[1], tup[2] + 1):
                l.pop(tup[1])
            l.insert(tup[1], ' '.join(text))
    return l
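A plain dict can stand in for the DataFrame row (values taken from the docstring):

row = {'token': ['this', 'is', 'three', 'dollar'],
       'tag': ['O', 'O', 'B-TBNorm', 'I-TBNorm']}
merge_col_from_tag(row, 'token', 'tag')  # ['this', 'is', 'three dollar']
merge_col_from_tag(row, 'tag', 'tag')    # ['O', 'O', 'B']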
Example #30
    def test_calc_char_offset(self):
        words = ['EU', 'rejects', 'German', 'call']
        tags = ['B-ORG', 'O', 'B-MISC', 'O']

        entities = get_entities(tags)
        actual = CoNLLParser.calc_char_offset(words, tags)

        self.assertEqual(entities, [('ORG', 0, 0), ('MISC', 2, 2)])

        self.assertEqual(
            actual, {
                'text': 'EU rejects German call',
                'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
            })