Example no. 1
def test_iob_to_biluo():
    good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
    good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
    bad_iob = ["O", "O", '"', "B-LOC", "I-LOC"]
    converted_biluo = iob_to_biluo(good_iob)
    assert good_biluo == converted_biluo
    with pytest.raises(ValueError):
        iob_to_biluo(bad_iob)
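The snippets on this page use spaCy's v2-era helpers; a minimal sketch of the imports they assume (note: in spaCy v3 these helpers moved to spacy.training, where spans_from_biluo_tags and offsets_from_biluo_tags were renamed biluo_tags_to_spans and biluo_tags_to_offsets):
import pytest
from spacy.gold import iob_to_biluo, offsets_from_biluo_tags, spans_from_biluo_tags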
Example no. 2
def test_issue2385():
    """Test that IOB tags are correctly converted to BILUO tags."""
    # fix bug in labels with a 'b' character
    tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
    assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
    # maintain support for iob1 format
    tags2 = ("I-ORG", "I-ORG", "B-ORG")
    assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
    # maintain support for iob2 format
    tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
    assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
Example no. 3
def spacy_doc_from_sentences(sentences: List[List[str]], labels: List[str],
                             nlp: Language) -> Doc:
    # Create initial doc
    all_tokens = list(chain.from_iterable(sentences))
    # Mark that every token is followed by space
    spaces = [True] * len(all_tokens)
    doc = Doc(nlp.vocab, words=all_tokens, spaces=spaces)

    # Set sentence boundaries
    tok_idx = 0
    for sentence in sentences:
        for sentence_idx in range(len(sentence)):
            # The first token of each sentence gets is_sent_start=True, all others False
            doc[tok_idx].is_sent_start = sentence_idx == 0
            tok_idx += 1

    if labels:
        if len(labels) != len(all_tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match number of tokens ({len(all_tokens)})"
            )

        # Create entities after converting IOB (actually BIO) to BILUO
        doc.ents = spans_from_biluo_tags(doc, iob_to_biluo(labels))

    return doc
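A minimal usage sketch for the helper above, with hypothetical sentences and BIO labels (assumes spacy_doc_from_sentences is importable as defined here and that a blank English pipeline suffices for the vocab):
import spacy

nlp = spacy.blank("en")
sentences = [["Alice", "lives", "in", "Paris"], ["She", "likes", "it"]]
labels = ["B-PERSON", "O", "O", "B-LOC", "O", "O", "O"]  # one BIO tag per token
doc = spacy_doc_from_sentences(sentences, labels, nlp)
print([(ent.text, ent.label_) for ent in doc.ents])
# roughly: [('Alice', 'PERSON'), ('Paris', 'LOC')]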
Example no. 4
    def process(self, line, intent_treshold_score=0.5):
        doc = self.nlp.make_doc(line)
        words_true = [w.text for w in doc]
        length = len(words_true)
        words_true += ['<EOS>']
        words = words_true + ['<PAD>'] * (50 - len(words_true))
        words = np.array(words)
        batch = [{'words': words, 'length': length}]
        decoder_prediction, intent, intent_score = self.model.test(batch)
        # batch only contains one element
        intent = intent[0]
        intent_score = intent_score[0]
        # get the part that corresponds to words (truncate PAD and EOS)
        decoder_prediction = decoder_prediction[:length, 0]
        #print(decoder_prediction, intent[0], intent_score)
        # clean up <EOS> and <PAD>
        decoder_prediction = [
            t if (t != '<EOS>' and t != '<PAD>') else 'O'
            for t in decoder_prediction
        ]
        biluo_tags = iob_to_biluo(decoder_prediction)
        entities_offsets = offsets_from_biluo_tags(doc, biluo_tags)
        entities = []
        for ent in entities_offsets:
            e_parts = ent[2].split('.')
            if len(e_parts) > 1:
                # role.type
                entity = {'role': e_parts[0], 'type': e_parts[1]}
            else:
                entity = {'role': None, 'type': e_parts[0]}
            value = line[ent[0]:ent[1]]
            entities.append({
                '_entity': entity['type'],
                'role': entity['role'],
                'value': value,
                '_body': value,
                '_start': ent[0],
                '_end': ent[1]
            })

        # now convert to the same format as wit.ai, applying the threshold
        if intent_score < intent_treshold_score:
            intent_result = None
        else:
            intent_result = {'confidence': str(intent_score), 'value': intent}

        entities_result = {}
        for ent in entities:
            if ent['role']:
                entities_result[ent['role']] = ent
            else:
                entities_result[ent['_entity']] = ent

        return intent_result, entities_result
Example no. 5
    def _sentence_to_spacy_annotations(self, tokens,
                                       tags) -> Tuple[str, Tuple]:
        sentence = " ".join(tokens)
        tags = iob_to_biluo(tags)

        doc = self.nlp(sentence)
        annotations = offsets_from_biluo_tags(doc, tags)
        annotations = [(begin, end, tag) for begin, end, tag in annotations
                       if len(tag) > 0]

        return sentence, annotations
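For reference, offsets_from_biluo_tags maps BILUO tags to character offsets; a small sketch of the output shape (hypothetical doc and tags):
import spacy
from spacy.gold import offsets_from_biluo_tags

nlp = spacy.blank("en")
doc = nlp("Alice visited Paris")
print(offsets_from_biluo_tags(doc, ["U-PERSON", "O", "U-LOC"]))
# roughly: [(0, 5, 'PERSON'), (14, 19, 'LOC')]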
Example no. 6
    def _sentence_to_spacy_annotations(self, tokens,
                                       tags) -> Tuple[str, Tuple]:
        sentence = " ".join(tokens)
        tags = iob_to_biluo(tags)

        doc = self.nlp(sentence)
        annotations = offsets_from_biluo_tags(doc, tags)
        # print(sentence)
        # print(tags)
        # print(annotations)

        return sentence, annotations
Example no. 7
def get_frame_elements_span(samples):
    """Returns a list of spans that contain gold frame elements"""
    result = []
    for s in samples:
        biluo = iob_to_biluo(s['slots_true'])
        entities = tags_to_entities(biluo)
        #print(entities)
        for e in entities:
            result.append({
                'sample_id': s['id'],
                'type': e[0],
                'start': e[1],
                'end': e[2]
            })
    return result
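A hypothetical input/output sketch for get_frame_elements_span (tags_to_entities returns inclusive token indices):
samples = [
    {"id": 1, "slots_true": ["O", "B-city", "I-city", "O"]},
    {"id": 2, "slots_true": ["B-date", "O"]},
]
print(get_frame_elements_span(samples))
# roughly: [{'sample_id': 1, 'type': 'city', 'start': 1, 'end': 2},
#           {'sample_id': 2, 'type': 'date', 'start': 0, 'end': 0}]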
Example no. 8
    def set_annotations(
        self, docs: Iterable[Doc], logits: torch.Tensor
    ) -> Iterable[Doc]:
        assert len(logits.shape) == 3  # (batch, length, nclass)
        id2label = self.labels

        for doc, logit in zip(docs, cast(Iterable, logits)):
            doc._.set("tokens_logit", logit)
            best_tags = get_best_tags(logit, id2label, self.k_beam)
            ents = [best_tags[a[0]] if len(a) else "O" for a in doc._.get(ATTRS.align)]
            biluo_ents = iob_to_biluo(ents)
            doc.ents = tuple(
                spacy.util.filter_spans(
                    doc.ents + tuple(spans_from_biluo_tags(doc, biluo_ents))
                )
            )
        return docs
Example no. 9
def read_examples(path):
    path = Path(path)
    with path.open() as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
            if not sent.strip():
                continue
            tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
                tokens.pop(0)
            words = []
            iob = []
            for token in tokens:
                if token.strip():
                    pieces = token.split()
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)
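A sketch of the CoNLL-style layout read_examples assumes (hypothetical file contents): sentences separated by blank lines, optional leading '#' comment lines, and whitespace-separated columns with the token in the second column and its IOB tag in the third.
sample = """# sent_id = 1
1 Alice B-PER
2 lives O
3 in O
4 Paris B-LOC

1 She O
2 smiled O"""
# each yielded item is (words, biluo_tags), e.g. the first sentence becomes
# (['Alice', 'lives', 'in', 'Paris'], ['U-PER', 'O', 'O', 'U-LOC'])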
Example no. 10
def sequence_iob_to_ents(iob_sequence):
    """From the sequence of IOB shaped (n_samples, seq_max_len) to label:start-end array"""
    #print(decoder_prediction, intent[0], intent_score)
    # clean up <EOS> and <PAD>
    result = []
    for line in iob_sequence:
        line = [
            t if (t != '<EOS>' and t != '<PAD>' and t != 0) else 'O'
            for t in line
        ]
        #print(line)
        line = iob_to_biluo(line)
        entities_offsets = tags_to_entities(line)
        # an entity is a tuple (label, start, end)
        entity_tuples = [(label, start, end)
                         for (label, start, end) in entities_offsets]
        result.append(entity_tuples)
    return result
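A minimal usage sketch with a hypothetical padded batch of IOB tags:
batch = [
    ["B-LOC", "I-LOC", "O", "<EOS>", "<PAD>"],
    ["O", "B-PER", "<EOS>", "<PAD>", "<PAD>"],
]
print(sequence_iob_to_ents(batch))
# roughly: [[('LOC', 0, 1)], [('PER', 1, 1)]]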
Example no. 11
    def __init__(self, fpath, tokenizer):
        """
        fpath: [train|valid|test].txt
        """
        ner_types = ['ORG', 'PER', 'LOC', 'MISC']
        self.tokenizer = tokenizer
        self.VOCAB = ['<PAD>', 'O'] + create_biluo_tag_from_ner_types(ner_types)[:-1]
        self.tag2idx = {tag: idx for idx, tag in enumerate(self.VOCAB)}
        self.idx2tag = {idx: tag for idx, tag in enumerate(self.VOCAB)}

        entries = open(fpath, 'r').read().strip().split("\n\n")
        sents, tags_li = [], []  # list of lists
        for entry in entries:
            words = [line.split()[0] for line in entry.splitlines()]
            tags = ([line.split()[-1] for line in entry.splitlines()])
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<PAD>"] + iob_to_biluo(tags) + ["<PAD>"])
        self.sents, self.tags_li = sents, tags_li
Example no. 12
def format_predictions_to_display(doc,
                                  predictions,
                                  probability_maps,
                                  pos=False):
    """Format predictions into spacy display formar."""
    bert_predictions = []
    iob_tags = []
    tags_formatted = []

    for prediction, probability_map in zip(predictions[0],
                                           probability_maps[0]):
        word = list(prediction.keys())[0]
        probas = probability_map[word]
        normalized_probas = list(softmax(np.mean(probas, axis=0)))
        bert_predictions.append(
            (word, prediction[word], np.max(normalized_probas)))
        if pos:
            iob_tags.append("I-" + prediction[word])
        else:
            iob_tags.append(prediction[word])

    biluo_tags = iob_to_biluo(iob_tags)
    tags = offsets_from_biluo_tags(doc, biluo_tags)

    for tag in tags:
        start_token = get_token_for_char(doc, tag[0])
        word_span = doc.text[tag[0]:tag[1]]
        length_of_span = len(word_span.split())
        if length_of_span == 1:
            probs = [bert_predictions[start_token][2]]
        else:
            probs = [
                item[2] for item in bert_predictions[start_token:start_token +
                                                     length_of_span]
            ]
        tags_formatted.append({
            "start": tag[0],
            "end": tag[1],
            "label": tag[2],
            "score": np.prod(probs)
        })
    return bert_predictions, tags_formatted
Example no. 13
def prediction_to_IOB(prediction, gt):
    """
    CONVERT PREDICTION AND GROUTH TRUTH TO BILOU SCHEMA

    Input:
        - prediction: spacy.Doc
        - gt : spacy.GoldParse
    
    Output:
        - list of list contains every token info in prediction.text
    """
    tag_iob = []
    for token in prediction:
        if (token.ent_type_ == ''):
            tag_iob.append(token.ent_iob_)
        else:
            tag_iob.append('-'.join([token.ent_iob_, token.ent_type_]))
    tokens = [token.text for token in prediction]

    tag_new_iob = iob_to_biluo(tag_iob)
    gt_NER = gt.ner
    return [[token, true_label, pred_iob]
            for token, true_label, pred_iob in zip(tokens, gt_NER, tag_new_iob)
            ]
Example no. 14
def test_issue2385_biluo(tags):
    """Test that BILUO-compatible tags aren't modified."""
    assert iob_to_biluo(tags) == list(tags)
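The bare test above takes a tags argument; in spaCy's test suite it is presumably driven by pytest parametrization, roughly like this:
@pytest.mark.parametrize(
    "tags",
    [
        ("B-ORG", "L-ORG"),
        ("B-PERSON", "I-PERSON", "L-PERSON"),
        ("U-BRAWLER", "U-BRAWLER"),
    ],
)
def test_issue2385_biluo(tags):
    """Test that BILUO-compatible tags aren't modified."""
    assert iob_to_biluo(tags) == list(tags)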
Example no. 15
def bio_to_biluo(tags: List[str]) -> List[str]:
    warnings.warn(f"Use spacy.gold.iob_to_biluo instead", DeprecationWarning)
    return iob_to_biluo(tags)
Example no. 16
class Cached(Provider):
    cache: Dict[str, Any]
    name = 'cached'
    known_schemas = {
        # these assume same tokenisation
        "bio":
        lambda doc, annotation: offsets_from_biluo_tags(
            iob_to_biluo(doc, annotation)),
        "bilou":
        offsets_from_biluo_tags,
        "offsets":
        OFFSETS,
        "list_of_clusters":
        convert_clusters_to_offsets,
        # these provide their own tokenisation

        # annotation: List[Tuple[str,str]]
        "list_of_tuples_bio_flat":
        lambda doc, annotation: get_offsets(doc.text, annotation),

        # annotation: List[List[Tuple[str,str]]]
        "list_of_tuples_bio_stacked":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, annotation),

        # annotation: Tuple[List[str],List[str]]
        "tuple_of_lists_flat":
        lambda doc, annotation: get_offsets(doc.text, zip(*annotation[:2])),

        # annotation: List[Tuple[List[str]], Tuple[List[str]]]
        "list_of_tuples_of_lists":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, ((w, l) for t in annotation for w, l in zip(*t[:2]))),

        # annotation: Tuple[List[List[str]], Tuple[List[List[str]]
        "tuple_of_lists_of_lists":
        lambda doc, annotation: get_offsets_from_sentences(
            doc.text, ((w, l) for ws, ls in zip(*annotation[:2])
                       for w, l in zip(ws, ls)))

        # TODO: BRAT
        # TODO: Pubmed
    }

    def __init__(self,
                 schema: Union[str, Callable[[Doc, Any],
                                             OffsetAnnotation]] = None,
                 getter=None,
                 path: str = None):
        self.cache = {}
        self.loaded = False
        if not schema:
            self.schema = OFFSETS
        elif schema in self.known_schemas:
            self.schema = Cached.known_schemas[schema]
        elif isinstance(schema, Callable):
            self.schema = schema
        else:
            self.schema = None
        self.getter = getter
        if path:
            self.load(path)

    @overrides
    def save(self, path: str):
        util.save_file(self.cache, path)

    # TODO: guess schema

    @overrides
    def load(self, path):
        self.cache = util.load_file(path)
        self.loaded = True

    @overrides
    def annotate_document(self, doc: Doc) -> OffsetAnnotation:
        if not self.loaded:
            raise ValueError("You forgot to load the cache!")
        annotations = self.cache.get(doc._.id, None)
        if annotations:
            if self.schema:
                if self.schema == OFFSETS:
                    return self.getter(
                        annotations) if self.getter else annotations
                else:
                    return self.schema(
                        doc,
                        self.getter(annotations)
                        if self.getter else annotations)
            else:
                logger.info(
                    f"no schema loaded for {self.__class__.__name__}, good luck!"
                )
                return annotations
Example no. 17
    def transform(self, X, *_):
        from nltk.corpus import stopwords
        from nltk.stem.snowball import SnowballStemmer
        stemmer = SnowballStemmer("english")

        import spacy
        from spacy.gold import iob_to_biluo
        nlp = spacy.load('en_core_web_md',
                         disable=['parser', 'tagger', 'textcat'])
        from spacy.attrs import ORTH
        nlp.tokenizer.add_special_case("I'm", [{ORTH: "I'm"}])
        nlp.vocab.add_flag(
            lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS,
            spacy.attrs.IS_STOP)

        english_stopwords = stopwords.words('english')
        english_stopwords.append("i'm")

        tokenized_corpus = []
        good_ents = ["PERSON", "GPE", "ORG", "LOC", "EVENT", "FAC"]
        continue_tags = ["B-", "I-"]
        end_tags = ["L-", "U-"]

        for text in X:
            toks = []
            iobs = [i.ent_iob_ for i in nlp(text)]
            biluos = list(iob_to_biluo(iobs))
            #Named entities variable
            ne = ""
            for index, tok in enumerate(nlp(text)):
                if biluos[index] in continue_tags and str(
                        tok.ent_type_) in good_ents:
                    #str(tok).split() != [] Checks if empty token
                    #For some reason tok.whitespace_ doesn't include double token entities
                    #like "JENNIFER LAWRENCE"
                    if not self._tag:
                        ne += " " + str(tok).lower()
                    elif self._tag and str(tok).split() != []:
                        #Entity is the beginning of an entity set
                        if biluos[index] == "B-":
                            if str(tok.ent_type_) != "PERSON":
                                ne += " &" + str(tok).lower()
                            elif str(tok.ent_type_) == "PERSON":
                                ne += " *" + str(tok).lower()
                        else:
                            if str(tok.ent_type_) != "PERSON":
                                ne += " " + str(tok).lower()
                            elif str(tok.ent_type_) == "PERSON":
                                ne += " " + str(tok).lower()
                elif biluos[index] in end_tags and str(
                        tok.ent_type_) in good_ents:
                    if not self._tag:
                        ne += " " + str(tok).lower()
                        toks.append(ne.lstrip())
                        ne = " "
                    elif self._tag and str(tok).split() != []:
                        #Entity is just a single unit
                        if biluos[index] == "U-":
                            if str(tok.ent_type_) != "PERSON":
                                ne += " &" + str(tok).lower()
                                toks.append(ne.lstrip())
                                ne = " "
                            elif str(tok.ent_type_) == "PERSON":
                                ne += " *" + str(tok).lower()
                                ne = ne.replace("*’m", "")
                                toks.append(ne.lstrip())
                                ne = " "
                        else:
                            ne += " " + str(tok).lower()
                            # so that possessive tags are not stored with the '’s'
                            ne = ne.replace("’s", "")
                            toks.append(ne.lstrip())
                            ne = " "
                #If token is just a boring old word
                else:
                    if not tok.is_punct and not tok.is_space and str(
                            tok).lower() not in english_stopwords:
                        toks.append(stemmer.stem(str(tok)))
            tokenized_corpus.append(toks)
        return tokenized_corpus
Example no. 18
    doc_toks = [tok.text for tok in doc]
    if not (doc_toks == sentences[i]):
        # doc_toks is usually shorter
        j = 0
        k = 0
        new_tags = []
        while j < len(sentences[i]):
            if sentences[i][j] == doc_toks[k]:
                new_tags.append(iobs[i][j])
                j += 1
                k += 1
            else:
                new_tags.append(iobs[i][j])
                k += 1
                j += 2
        tags = iob_to_biluo(new_tags)
    else:
        tags = iob_to_biluo(iobs[i])
    try:
        entities = offsets_from_biluo_tags(doc, tags)
        e = (detokenized_sent, entities)
        corpus.append(e)
    except Exception as err:
        print(err, detokenized_sent)
        continue

print(len(corpus))
corpus = [e for e in corpus if len(e[0]) > 0]
print(len(corpus))

with open("data/interim/corpus.p", "wb") as of: