def test_iob_to_biluo():
    good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
    good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
    bad_iob = ["O", "O", '"', "B-LOC", "I-LOC"]
    converted_biluo = iob_to_biluo(good_iob)
    assert good_biluo == converted_biluo
    with pytest.raises(ValueError):
        iob_to_biluo(bad_iob)
def test_issue2385():
    """Test that IOB tags are correctly converted to BILUO tags."""
    # fix bug in labels with a 'b' character
    tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
    assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
    # maintain support for IOB1 format
    tags2 = ("I-ORG", "I-ORG", "B-ORG")
    assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
    # maintain support for IOB2 format
    tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
    assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
def spacy_doc_from_sentences(sentences: List[List[str]], labels: List[str],
                             nlp: Language) -> Doc:
    # Create initial doc
    all_tokens = list(chain.from_iterable(sentences))
    # Mark that every token is followed by a space
    spaces = [True] * len(all_tokens)
    doc = Doc(nlp.vocab, words=all_tokens, spaces=spaces)
    # Set sentence boundaries
    tok_idx = 0
    for sentence in sentences:
        for sentence_idx in range(len(sentence)):
            # First token should have is_sent_start set to True, all others False
            doc[tok_idx].is_sent_start = sentence_idx == 0
            tok_idx += 1
    if labels:
        if len(labels) != len(all_tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match number of tokens ({len(all_tokens)})"
            )
        # Create entities after converting IOB (actually BIO) to BILUO
        doc.ents = spans_from_biluo_tags(doc, iob_to_biluo(labels))
    return doc
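For context, a minimal usage sketch (assuming the v2-era `spacy.gold` helpers imported by the snippet above; the sentences and BIO labels are invented for illustration):

import spacy

nlp = spacy.blank("en")
sentences = [["Paris", "is", "nice", "."], ["So", "is", "Berlin", "."]]
labels = ["B-LOC", "O", "O", "O", "O", "O", "B-LOC", "O"]
doc = spacy_doc_from_sentences(sentences, labels, nlp)
print([(ent.text, ent.label_) for ent in doc.ents])
# expected: [('Paris', 'LOC'), ('Berlin', 'LOC')]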
def process(self, line, intent_threshold_score=0.5):
    doc = self.nlp.make_doc(line)
    words_true = [w.text for w in doc]
    length = len(words_true)
    words_true += ['<EOS>']
    words = words_true + ['<PAD>'] * (50 - len(words_true))
    words = np.array(words)
    batch = [{'words': words, 'length': length}]
    decoder_prediction, intent, intent_score = self.model.test(batch)
    # batch only contains one element
    intent = intent[0]
    intent_score = intent_score[0]
    # get the part that corresponds to words (truncate PAD and EOS)
    decoder_prediction = decoder_prediction[:length, 0]
    # clean up <EOS> and <PAD>
    decoder_prediction = [
        t if (t != '<EOS>' and t != '<PAD>') else 'O'
        for t in decoder_prediction
    ]
    biluo_tags = iob_to_biluo(decoder_prediction)
    entities_offsets = offsets_from_biluo_tags(doc, biluo_tags)
    entities = []
    for ent in entities_offsets:
        e_parts = ent[2].split('.')
        if len(e_parts) > 1:
            # label looks like "role.type"
            entity = {'role': e_parts[0], 'type': e_parts[1]}
        else:
            entity = {'role': None, 'type': e_parts[0]}
        value = line[ent[0]:ent[1]]
        entities.append({
            '_entity': entity['type'],
            'role': entity['role'],
            'value': value,
            '_body': value,
            '_start': ent[0],
            '_end': ent[1]
        })
    # now convert to the same format as wit.ai, applying the threshold
    if intent_score < intent_threshold_score:
        intent_result = None
    else:
        intent_result = {'confidence': str(intent_score), 'value': intent}
    entities_result = {}
    for ent in entities:
        if ent['role']:
            entities_result[ent['role']] = ent
        else:
            entities_result[ent['_entity']] = ent
    return intent_result, entities_result
def _sentence_to_spacy_annotations(self, tokens, tags) -> Tuple[str, Tuple]:
    sentence = " ".join(tokens)
    tags = iob_to_biluo(tags)
    doc = self.nlp(sentence)
    annotations = offsets_from_biluo_tags(doc, tags)
    annotations = [(begin, end, tag) for begin, end, tag in annotations
                   if len(tag) > 0]
    return sentence, annotations
def _sentence_to_spacy_annotations(self, tokens, tags) -> Tuple[str, Tuple]:
    sentence = " ".join(tokens)
    tags = iob_to_biluo(tags)
    doc = self.nlp(sentence)
    annotations = offsets_from_biluo_tags(doc, tags)
    return sentence, annotations
def get_frame_elements_span(samples):
    """Return a list of spans that contain gold frame elements."""
    result = []
    for s in samples:
        biluo = iob_to_biluo(s['slots_true'])
        entities = tags_to_entities(biluo)
        for e in entities:
            result.append({
                'sample_id': s['id'],
                'type': e[0],
                'start': e[1],
                'end': e[2]
            })
    return result
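A quick illustrative call (the sample below is made up; `tags_to_entities` is spaCy's helper returning `(label, start_token, end_token)` tuples with an inclusive end index):

samples = [{'id': 0, 'slots_true': ['O', 'B-LOC', 'I-LOC', 'O']}]
print(get_frame_elements_span(samples))
# expected: [{'sample_id': 0, 'type': 'LOC', 'start': 1, 'end': 2}]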
def set_annotations(
    self, docs: Iterable[Doc], logits: torch.Tensor
) -> Iterable[Doc]:
    assert len(logits.shape) == 3  # (batch, length, nclass)
    id2label = self.labels
    for doc, logit in zip(docs, cast(Iterable, logits)):
        doc._.set("tokens_logit", logit)
        best_tags = get_best_tags(logit, id2label, self.k_beam)
        ents = [
            best_tags[a[0]] if len(a) else "O"
            for a in doc._.get(ATTRS.align)
        ]
        biluo_ents = iob_to_biluo(ents)
        doc.ents = tuple(
            spacy.util.filter_spans(
                doc.ents + tuple(spans_from_biluo_tags(doc, biluo_ents))
            )
        )
    return docs
def read_examples(path):
    path = Path(path)
    with path.open() as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
            if not sent.strip():
                continue
            tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
                tokens.pop(0)
            words = []
            iob = []
            for token in tokens:
                if token.strip():
                    pieces = token.split()
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)
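The reader expects blank-line-separated sentences, one token per line with at least three whitespace-separated columns (index, word, IOB tag), skipping leading `#` comment lines. A sketch of the input and a call (the filename and rows are invented):

# example input file:
#   # sent_id = 1
#   1   Paris   B-LOC
#   2   is      O
#   3   nice    O
for words, biluo in read_examples("train.iob"):
    print(words, biluo)
    # expected: ['Paris', 'is', 'nice'] ['U-LOC', 'O', 'O']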
def sequence_iob_to_ents(iob_sequence):
    """From a sequence of IOB tags shaped (n_samples, seq_max_len) to lists
    of (label, start, end) entity tuples."""
    result = []
    for line in iob_sequence:
        # clean up <EOS>, <PAD> and zero padding
        line = [
            t if (t != '<EOS>' and t != '<PAD>' and t != 0) else 'O'
            for t in line
        ]
        line = iob_to_biluo(line)
        # an entity is a tuple (label, start, end)
        entities_offsets = tags_to_entities(line)
        result.append(list(entities_offsets))
    return result
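A small illustrative call (tag values invented; assumes spaCy's `tags_to_entities` semantics of inclusive token offsets):

batch = [['B-LOC', 'I-LOC', 'O', '<PAD>'],
         ['O', 'B-PER', '<EOS>', '<PAD>']]
print(sequence_iob_to_ents(batch))
# expected: [[('LOC', 0, 1)], [('PER', 1, 1)]]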
def __init__(self, fpath, tokenizer):
    """
    fpath: [train|valid|test].txt
    """
    ner_types = ['ORG', 'PER', 'LOC', 'MISC']
    self.tokenizer = tokenizer
    self.VOCAB = ['<PAD>', 'O'] + create_biluo_tag_from_ner_types(ner_types)[:-1]
    self.tag2idx = {tag: idx for idx, tag in enumerate(self.VOCAB)}
    self.idx2tag = {idx: tag for idx, tag in enumerate(self.VOCAB)}
    with open(fpath, 'r') as f:
        entries = f.read().strip().split("\n\n")
    sents, tags_li = [], []  # list of lists
    for entry in entries:
        words = [line.split()[0] for line in entry.splitlines()]
        tags = [line.split()[-1] for line in entry.splitlines()]
        sents.append(["[CLS]"] + words + ["[SEP]"])
        tags_li.append(["<PAD>"] + iob_to_biluo(tags) + ["<PAD>"])
    self.sents, self.tags_li = sents, tags_li
def format_predictions_to_display(doc, predictions, probability_maps, pos=False):
    """Format predictions into spaCy displaCy format."""
    bert_predictions = []
    iob_tags = []
    tags_formatted = []
    for prediction, probability_map in zip(predictions[0], probability_maps[0]):
        word = list(prediction.keys())[0]
        probas = probability_map[word]
        normalized_probas = list(softmax(np.mean(probas, axis=0)))
        bert_predictions.append(
            (word, prediction[word], np.max(normalized_probas)))
        if pos:
            iob_tags.append("I-" + prediction[word])
        else:
            iob_tags.append(prediction[word])
    biluo_tags = iob_to_biluo(iob_tags)
    tags = offsets_from_biluo_tags(doc, biluo_tags)
    for tag in tags:
        start_token = get_token_for_char(doc, tag[0])
        word_span = doc.text[tag[0]:tag[1]]
        length_of_span = len(word_span.split())
        if length_of_span == 1:
            probs = [bert_predictions[start_token][2]]
        else:
            probs = [
                item[2] for item in
                bert_predictions[start_token:start_token + length_of_span]
            ]
        tags_formatted.append({
            "start": tag[0],
            "end": tag[1],
            "label": tag[2],
            "score": np.prod(probs)
        })
    return bert_predictions, tags_formatted
def prediction_to_IOB(prediction, gt):
    """
    Convert prediction and ground truth to the BILUO scheme.

    Input:
        - prediction: spacy.Doc
        - gt: spacy.GoldParse
    Output:
        - list of lists containing info for every token in prediction.text
    """
    tag_iob = []
    for token in prediction:
        if token.ent_type_ == '':
            tag_iob.append(token.ent_iob_)
        else:
            tag_iob.append('-'.join([token.ent_iob_, token.ent_type_]))
    tokens = [token.text for token in prediction]
    tag_new_iob = iob_to_biluo(tag_iob)
    gt_NER = gt.ner
    return [[token, true_label, pred_iob]
            for token, true_label, pred_iob in zip(tokens, gt_NER, tag_new_iob)]
@pytest.mark.parametrize(
    "tags",
    [
        # sample BILUO-valid sequences (illustrative)
        ("B-ORG", "L-ORG"),
        ("B-PERSON", "I-PERSON", "L-PERSON"),
        ("U-BRAWLER", "U-BRAWLER"),
    ],
)
def test_issue2385_biluo(tags):
    """Test that BILUO-compatible tags aren't modified."""
    assert iob_to_biluo(tags) == list(tags)
def bio_to_biluo(tags: List[str]) -> List[str]:
    warnings.warn("Use spacy.gold.iob_to_biluo instead", DeprecationWarning)
    return iob_to_biluo(tags)
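The shim still returns the converted tags while emitting the warning; a minimal sanity check:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert bio_to_biluo(["B-LOC", "I-LOC"]) == ["B-LOC", "L-LOC"]
    assert issubclass(caught[-1].category, DeprecationWarning)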
class Cached(Provider):
    cache: Dict[str, Any]
    name = 'cached'
    known_schemas = {
        # these assume the same tokenisation
        "bio": lambda doc, annotation: offsets_from_biluo_tags(
            doc, iob_to_biluo(annotation)),
        "bilou": offsets_from_biluo_tags,
        "offsets": OFFSETS,
        "list_of_clusters": convert_clusters_to_offsets,
        # these provide their own tokenisation
        # annotation: List[Tuple[str, str]]
        "list_of_tuples_bio_flat": lambda doc, annotation:
            get_offsets(doc.text, annotation),
        # annotation: List[List[Tuple[str, str]]]
        "list_of_tuples_bio_stacked": lambda doc, annotation:
            get_offsets_from_sentences(doc.text, annotation),
        # annotation: Tuple[List[str], List[str]]
        "tuple_of_lists_flat": lambda doc, annotation:
            get_offsets(doc.text, zip(*annotation[:2])),
        # annotation: List[Tuple[List[str], List[str]]]
        "list_of_tuples_of_lists": lambda doc, annotation:
            get_offsets_from_sentences(
                doc.text,
                ((w, l) for t in annotation for w, l in zip(*t[:2]))),
        # annotation: Tuple[List[List[str]], List[List[str]]]
        "tuple_of_lists_of_lists": lambda doc, annotation:
            get_offsets_from_sentences(
                doc.text,
                ((w, l) for ws, ls in zip(*annotation[:2])
                 for w, l in zip(ws, ls)))
        # TODO: BRAT
        # TODO: Pubmed
    }

    def __init__(self,
                 schema: Union[str, Callable[[Doc, Any], OffsetAnnotation]] = None,
                 getter=None,
                 path: str = None):
        self.cache = {}
        self.loaded = False
        if not schema:
            self.schema = OFFSETS
        elif schema in self.known_schemas:
            self.schema = Cached.known_schemas[schema]
        elif callable(schema):
            self.schema = schema
        else:
            self.schema = None
        self.getter = getter
        if path:
            self.load(path)

    @overrides
    def save(self, path: str):
        util.save_file(self.cache, path)

    # TODO: guess schema
    @overrides
    def load(self, path):
        self.cache = util.load_file(path)
        self.loaded = True

    @overrides
    def annotate_document(self, doc: Doc) -> OffsetAnnotation:
        if not self.loaded:
            raise ValueError("You forgot to load the cache!")
        annotations = self.cache.get(doc._.id, None)
        if annotations:
            if self.schema:
                if self.schema == OFFSETS:
                    return self.getter(annotations) if self.getter else annotations
                else:
                    return self.schema(
                        doc,
                        self.getter(annotations) if self.getter else annotations)
            else:
                logger.info(
                    f"no schema loaded for {self.__class__.__name__}, good luck!")
                return annotations
def transform(self, X, *_):
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    import spacy
    from spacy.gold import iob_to_biluo
    nlp = spacy.load('en_core_web_md', disable=['parser', 'tagger', 'textcat'])
    from spacy.attrs import ORTH
    nlp.tokenizer.add_special_case("I'm", [{ORTH: "I'm"}])
    nlp.vocab.add_flag(
        lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS,
        spacy.attrs.IS_STOP)
    english_stopwords = stopwords.words('english')
    english_stopwords.append("i'm")
    tokenized_corpus = []
    good_ents = ["PERSON", "GPE", "ORG", "LOC", "EVENT", "FAC"]
    continue_tags = ["B-", "I-"]
    end_tags = ["L-", "U-"]
    for text in X:
        toks = []
        iobs = [i.ent_iob_ for i in nlp(text)]
        biluos = list(iob_to_biluo(iobs))
        # accumulator for the current named entity
        ne = ""
        for index, tok in enumerate(nlp(text)):
            if biluos[index] in continue_tags and str(tok.ent_type_) in good_ents:
                # str(tok).split() != [] checks for an empty token.
                # For some reason tok.whitespace_ doesn't include double-token
                # entities like "JENNIFER LAWRENCE".
                if not self._tag:
                    ne += " " + str(tok).lower()
                elif self._tag and str(tok).split() != []:
                    # entity is the beginning of an entity set
                    if biluos[index] == "B-":
                        if str(tok.ent_type_) != "PERSON":
                            ne += " &" + str(tok).lower()
                        elif str(tok.ent_type_) == "PERSON":
                            ne += " *" + str(tok).lower()
                    else:
                        if str(tok.ent_type_) != "PERSON":
                            ne += " " + str(tok).lower()
                        elif str(tok.ent_type_) == "PERSON":
                            ne += " " + str(tok).lower()
            elif biluos[index] in end_tags and str(tok.ent_type_) in good_ents:
                if not self._tag:
                    ne += " " + str(tok).lower()
                    toks.append(ne.lstrip())
                    ne = " "
                elif self._tag and str(tok).split() != []:
                    # entity is just a single unit
                    if biluos[index] == "U-":
                        if str(tok.ent_type_) != "PERSON":
                            ne += " &" + str(tok).lower()
                            toks.append(ne.lstrip())
                            ne = " "
                        elif str(tok.ent_type_) == "PERSON":
                            ne += " *" + str(tok).lower()
                            ne = ne.replace("*’m", "")
                            toks.append(ne.lstrip())
                            ne = " "
                    else:
                        ne += " " + str(tok).lower()
                        # so that possessive tags are not stored with the '’s'
                        ne = ne.replace("’s", "")
                        toks.append(ne.lstrip())
                        ne = " "
            # if the token is just a boring old word
            else:
                if not tok.is_punct and not tok.is_space and str(
                        tok).lower() not in english_stopwords:
                    toks.append(stemmer.stem(str(tok)))
        tokenized_corpus.append(toks)
    return tokenized_corpus
doc_toks = [tok.text for tok in doc]
if not (doc_toks == sentences[i]):
    # doc_toks is usually shorter
    j = 0
    k = 0
    new_tags = []
    while j < len(sentences[i]):
        if sentences[i][j] == doc_toks[k]:
            new_tags.append(iobs[i][j])
            j += 1
            k += 1
        else:
            new_tags.append(iobs[i][j])
            k += 1
            j += 2
    tags = iob_to_biluo(new_tags)
else:
    tags = iob_to_biluo(iobs[i])
try:
    entities = offsets_from_biluo_tags(doc, tags)
    e = (detokenized_sent, entities)
    corpus.append(e)
except Exception as err:
    print(err, detokenized_sent)
    continue

print(len(corpus))
corpus = [e for e in corpus if len(e[0]) > 0]
print(len(corpus))
with open("data/interim/corpus.p", "wb") as of: