def test_for_linker(self):
    """End-to-end exercise of the ContextModel: train, similarity, disambiguate."""
    # Fresh config/CDB for this scenario, with DEBUG logging.
    self.config = Config()
    self.config.general['log_level'] = logging.DEBUG
    cdb = CDB(config=self.config)

    # Register a few concepts, including an ambiguous name ('Movar'
    # maps to both S-229004 and S-2290045).
    for cui, raw_name in (('S-229004', 'Movar'),
                          ('S-229004', 'Movar viruses'),
                          ('S-229005', 'CDB'),
                          ('S-2290045', 'Movar')):
        cdb.add_names(cui=cui, names=prepare_name(raw_name, self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

    # Seed random context vectors for (at most) the first 50 concepts.
    for cui in list(cdb.cui2names.keys())[:50]:
        random_vectors = {
            'short': np.random.rand(300),
            'long': np.random.rand(300),
            'medium': np.random.rand(300),
        }
        cdb.update_context_vector(cui, random_vectors, negative=False)

    doc = self.nlp(self.text)
    vocab = Vocab.load(self.vocab_path)
    context_model = ContextModel(cdb, vocab, self.config)

    context_model.train_using_negative_sampling('S-229004')
    # Allow training/similarity even for concepts with no prior train count.
    self.config.linking['train_count_threshold'] = 0

    context_model.train('S-229004', doc._.ents[1], doc)
    context_model.similarity('S-229004', doc._.ents[1], doc)
    context_model.disambiguate(['S-2290045', 'S-229004'], doc._.ents[1], 'movar', doc)
def setUp(self) -> None:
    """Build the shared fixtures: a small CDB, a cached vocab, and the full pipeline.

    Fix: the original constructed the Pipe (and registered the tagger)
    twice with identical arguments, discarding the first instance after
    the prepare_name() calls; a single instance is now created and reused.
    """
    self.config = Config()
    self.config.general['log_level'] = logging.INFO
    cdb = CDB(config=self.config)

    # Make the pipeline (once) — tokenizer plus skip/punct tagger.
    self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
    self.nlp.add_tagger(tagger=tag_skip_and_punct,
                        name='skip_and_punct',
                        additional_fields=['is_punct'])

    # Add a couple of names
    cdb.add_names(cui='S-229004', names=prepare_name('Movar', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', self.nlp, {}, self.config))
    cdb.add_names(cui='S-229005', names=prepare_name('CDB', self.nlp, {}, self.config))
    # Check
    #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

    # Download the vocab once; later runs reuse the cached local copy.
    self.vocab_path = "./tmp_vocab.dat"
    if not os.path.exists(self.vocab_path):
        import requests
        tmp = requests.get(
            "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
        with open(self.vocab_path, 'wb') as f:
            f.write(tmp.content)
    vocab = Vocab.load(self.vocab_path)

    # Spell checking, NER and linking on top of the shared pipe.
    spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=self.config, data_vocab=vocab)
    self.nlp.add_token_normalizer(spell_checker=spell_checker, config=self.config)
    ner = NER(cdb, self.config)
    self.nlp.add_ner(ner)
    # Add Linker
    link = Linker(cdb, vocab, self.config)
    self.nlp.add_linker(link)

    self.text = "CDB - I was running and then Movar Virus attacked and CDb"
# Stdlib imports, then third-party, then local (PEP 8 grouping).
import logging
import os

import requests

from medcat.cdb import CDB

config = Config()
config.general['log_level'] = logging.INFO
cdb = CDB(config=config)

nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])

# Add a couple of names
cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config))
cdb.add_names(cui='S-229004', names=prepare_name('Movar viruses', nlp, {}, config))
cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config))
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

# Download the vocab once; reuse the cached local copy on later runs.
# Fix: the redundant `import requests` inside this branch was removed —
# requests is already imported at the top of the file.
vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)
vocab = Vocab.load(vocab_path)

# Make the pipeline