def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    Benchmark a DaCy NER model; an adaption of the spaCy benchmark
    that is compatible with spaCy v3.

    Running this requires:
        spacy >= 3.0.0
        spacy-transformers

    Args:
        dacy_model: name of the DaCy model to load and benchmark.
    """
    from spacy.tokens import Doc
    import dacy

    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    ner = nlp.get_pipe('ner')

    predictions = []
    start = time.time()
    for words in sentences_tokens:
        # Build a pre-tokenized Doc and run only the transformer + NER pipes.
        doc = ner(trf(Doc(nlp.vocab, words=words)))
        # Convert per-token annotations to BIO-style tags ("O" or "B-TYPE"/"I-TYPE").
        predictions.append([
            tok.ent_iob_ if tok.ent_iob_ == 'O'
            else tok.ent_iob_ + "-" + tok.ent_type_
            for tok in doc
        ])

    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)
    assert len(predictions) == num_sentences
    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    Benchmark a DaCy POS-tagging model; an adaption of the spaCy
    benchmark that is compatible with spaCy v3.

    Running this requires:
        spacy >= 3.0.0
        spacy-transformers

    Args:
        dacy_model: name of the DaCy model to load and benchmark.
    """
    import dacy
    from spacy.tokens import Doc

    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    tagger = nlp.get_pipe('tagger')

    start = time.time()
    tags_pred = []
    for sent in sentences_tokens:
        # Build a pre-tokenized Doc and run only the transformer + tagger pipes.
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = tagger(doc)
        tags_pred.append([tok.tag_ for tok in doc])

    # Fix: label the output with the model actually benchmarked
    # (previously printed the misleading '**Spacy model**').
    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)
    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens
    print(accuracy_report(tags_true, tags_pred), end="\n\n")
def benchmark_dacy_mdl(dacy_model="da_dacy_large_tft-0.0.0"):
    """
    Benchmark a DaCy dependency-parsing model; an adaption of the
    spaCy benchmark that is compatible with spaCy v3.

    Running this requires:
        spacy >= 3.0.0
        spacy-transformers

    Args:
        dacy_model: name of the DaCy model to load and benchmark.
    """
    def normalize_spacy_head(i, hd):
        # spaCy marks the root token by head == token itself; convert to the
        # convention used by the scorer (root -> 0, otherwise 1-indexed head).
        return 0 if i == hd else hd + 1

    from spacy.tokens import Doc
    import dacy

    nlp = dacy.load(dacy_model)
    trf = nlp.get_pipe('transformer')
    parser = nlp.get_pipe('parser')

    start = time.time()
    deps_pred = []
    for sent in sentences_tokens:
        # Build a pre-tokenized Doc and run only the transformer + parser pipes.
        doc = Doc(nlp.vocab, words=sent)
        doc = trf(doc)
        doc = parser(doc)
        deprels = []
        depheads = []
        for i, tok in enumerate(doc):
            deprels.append(tok.dep_.lower())
            depheads.append(normalize_spacy_head(i, tok.head.i))
        deps_pred.append([(r, h) for r, h in zip(deprels, depheads)])

    # Fix: label the output with the model actually benchmarked
    # (previously printed the misleading '**Spacy model**').
    print('DaCy ({}):'.format(dacy_model))
    print_speed_performance(start, num_sentences, num_tokens)
    assert len(deps_pred) == num_sentences
    assert sum(len(s) for s in deps_pred) == num_tokens
    print(dependency_report(deps_true, deps_pred))
def test_LIX():
    """Smoke test: register the custom LIX extension and read it off a Doc."""
    Doc.set_extension("LIX", getter=LIX_getter)
    pipeline = dacy.load("da_dacy_medium_tft-0.0.0")
    analysed = pipeline("Dette er en test tekst")
    # Accessing the extension attribute triggers the getter; raises if broken.
    analysed._.LIX
# Prefer GPU execution for spaCy if one is available; pick the torch device to match.
spacy.prefer_gpu()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# On Unix-like platforms, force the "fork" start method for multiprocessing.
# set_start_method raises RuntimeError if the context was already set, which
# is deliberately ignored here (best-effort configuration).
try:
    if platform == "linux" or platform == "linux2" or platform == "darwin":
        multiprocessing.set_start_method("fork")
except RuntimeError:
    pass
# elif platform == "win32":
#     multiprocessing.set_start_method("spawn")

######### DaCy multiprocessing hack START #########
# Hack to make DaCy multiprocessable for both spawn and fork (SpaCy 3.0 issue with pickle)
# NOTE(review): presumably set to 1 to avoid thread oversubscription across
# worker processes — confirm against the multiprocessing setup.
torch.set_num_threads(1)
num_cpus: int = int(os.cpu_count())  # type: ignore
# Module-level model so forked workers inherit it instead of pickling it.
ner_model = dacy.load("da_dacy_large_tft-0.0.0")

def worker(text: List[str]):  # type: ignore
    # Run the shared NER pipeline over a batch of texts and materialize the Docs.
    return list(ner_model.pipe(text, batch_size=len(text)))
######### DaCy multiprocessing hack END #########

class TextAnonymizer(object):
    """
    Object of a text corpus to apply masking function for anonymization

    Args:
        corpus: The corpus containing a list of strings