Esempio n. 1
0
 def setUpClass(cls):
     """Build a shared CDB from the example CSVs once for the whole class."""
     import os  # local import so this fix is self-contained
     print("Load test database csvs for load tests")
     config = Config()
     config.general['log_level'] = logging.DEBUG
     maker = CDBMaker(config)
     # Resolve the CSVs relative to this file, not the CWD, so the tests
     # pass no matter where the test runner is invoked from (matches the
     # other set-ups in this suite).
     base = os.path.dirname(os.path.realpath(__file__))
     csvs = [os.path.join(base, '..', 'examples', 'cdb.csv'),
             os.path.join(base, '..', 'examples', 'cdb_2.csv')]
     cls.cdb = maker.prepare_csvs(csvs, full_build=True)
Esempio n. 2
0
 def setUpClass(cls):
     """Prepare a CDB from the bundled example CSVs for the tests below."""
     print("Load test database csvs for load tests")
     config = Config()
     config.general['log_level'] = logging.DEBUG
     cls.maker = CDBMaker(config)
     # Locate the example CSVs relative to this test file so the suite
     # is independent of the current working directory.
     here = os.path.dirname(os.path.realpath(__file__))
     csvs = [os.path.join(here, '..', 'examples', name)
             for name in ('cdb.csv', 'cdb_2.csv')]
     cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
Esempio n. 3
0
    def setUp(self):
        """Create a fresh CDB from the example CSVs before each test."""
        import os  # local import so this fix is self-contained
        self.config = Config()
        self.config.general['log_level'] = logging.DEBUG
        self.maker = CDBMaker(self.config)

        # Building a new CDB from two files (full_build).
        # Paths are resolved relative to this file rather than the CWD,
        # matching the class-level set-ups elsewhere in this suite.
        base = os.path.dirname(os.path.realpath(__file__))
        csvs = [os.path.join(base, '..', 'examples', 'cdb.csv'),
                os.path.join(base, '..', 'examples', 'cdb_2.csv')]
        self.cdb = self.maker.prepare_csvs(csvs, full_build=True)
Esempio n. 4
0
    def setUpClass(cls):
        """Build the full fixture: CDB, vocab, NLP pipe, NER, linker and text.

        The vocab file is downloaded once and cached at ``./tmp_vocab.dat``;
        subsequent runs reuse the cached copy.
        """
        print("Set up CDB")
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.cdb = CDB(config=cls.config)

        print("Set up Vocab")
        vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(vocab_path):
            # Bound the download and raise on HTTP errors so a failed
            # request cannot hang forever or get cached as the vocab file.
            tmp = requests.get(
                "https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat",
                timeout=60)
            tmp.raise_for_status()
            with open(vocab_path, 'wb') as f:
                f.write(tmp.content)

        cls.vocab = Vocab.load(vocab_path)

        print("Set up NLP pipeline")
        cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
        cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct,
                                          config=cls.config),
                           name='skip_and_punct',
                           additional_fields=['is_punct'])

        cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                              config=cls.config,
                                              data_vocab=cls.vocab)
        cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker,
                                     config=cls.config)
        cls.ner = NER(cls.cdb, cls.config)
        cls.nlp.add_ner(cls.ner)

        print("Set up Linker")
        cls.link = Linker(cls.cdb, cls.vocab, cls.config)
        cls.nlp.add_linker(cls.link)

        print("Set limits for tokens and uppercase")
        cls.config.ner['max_skip_tokens'] = 1
        cls.config.ner['upper_case_limit_len'] = 4
        cls.config.linking['disamb_length_limit'] = 2

        print("Add concepts")
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar', cls.nlp, {}, cls.config))
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar viruses', cls.nlp, {},
                                             cls.config))
        cls.cdb.add_names(cui='S-229005',
                          names=prepare_name('CDB', cls.nlp, {}, cls.config))

        print("Add test text")
        # Deliberately mis-cased / extra-spaced text so the tests exercise
        # spell checking and token skipping.
        cls.text = "CDB - I was running and then Movar    Virus attacked and CDb"
        cls.text_post_pipe = cls.nlp(cls.text)
Esempio n. 5
0
    def setUpClass(cls) -> None:
        """Set up a CDB, vocab and supporting components for the Pipe tests.

        The vocab file is downloaded once and cached at ``./tmp_vocab.dat``.
        """
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.cdb = CDB(config=cls.config)

        vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(vocab_path):
            # Bound the download and surface HTTP errors instead of
            # silently writing an error page into the cached vocab file.
            tmp = requests.get(
                "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat",
                timeout=60)
            tmp.raise_for_status()
            with open(vocab_path, 'wb') as f:
                f.write(tmp.content)

        cls.vocab = Vocab.load(vocab_path)
        cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
        cls.ner = NER(cls.cdb, cls.config)
        cls.linker = Linker(cls.cdb, cls.vocab, cls.config)
        cls.config.ner['max_skip_tokens'] = 1
        cls.config.ner['upper_case_limit_len'] = 4
        cls.config.linking['disamb_length_limit'] = 2
        cls.meta_cat = MetaCAT()
        cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
        # NOTE(review): a fresh Config is created here, so the ner/linking
        # limits set above do NOT apply to the Pipe under test — confirm
        # this is intentional before relying on those limits.
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config)
Esempio n. 6
0
    def setUp(self) -> None:
        """Build a small CDB and a full NLP pipe used by each test.

        Downloads the vocab once to ``./tmp_vocab.dat`` and caches it.
        """
        self.config = Config()
        self.config.general['log_level'] = logging.INFO
        cdb = CDB(config=self.config)

        # Temporary pipe: only used to normalise the names added below;
        # it is replaced by the full pipeline further down.
        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        # Check
        #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

        self.vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(self.vocab_path):
            import requests
            # Bound the download and fail fast on HTTP errors so a bad
            # response is not cached as the vocab file.
            tmp = requests.get(
                "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat",
                timeout=60)
            tmp.raise_for_status()
            with open(self.vocab_path, 'wb') as f:
                f.write(tmp.content)

        vocab = Vocab.load(self.vocab_path)
        # Make the pipeline (replaces the temporary pipe created above)
        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])
        spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                          config=self.config,
                                          data_vocab=vocab)
        self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                      config=self.config)
        ner = NER(cdb, self.config)
        self.nlp.add_ner(ner)

        # Add Linker
        link = Linker(cdb, vocab, self.config)
        self.nlp.add_linker(link)

        # Extra spaces / mixed case are deliberate test inputs.
        self.text = "CDB - I was running and then Movar    Virus attacked and CDb"
Esempio n. 7
0
    def load(cls, path, config=None):
        r'''Load a CDB from disk, supporting partial loads.

        Args:
            path (`str`):
                Path to a `cdb.dat` from which to load data.
            config (`medcat.config.Config`, optional):
                Config to attach to the new CDB; when None the config
                serialised alongside the CDB data is restored instead.
        '''
        with open(path, 'rb') as f:
            data = dill.load(f)
            if config is None:
                config = Config.from_dict(data['config'])
            # Start from an empty CDB and copy over only the attributes
            # present in the serialised data (this is what allows a
            # partial load from older/smaller dumps).
            cdb = cls(config=config)
            for attr, value in data['cdb'].items():
                if attr in cdb.__dict__:
                    cdb.__dict__[attr] = value

        return cdb
Esempio n. 8
0
    def test_for_linker(self):
        """Smoke-test the vector context model: train, score, disambiguate."""
        self.config = Config()
        self.config.general['log_level'] = logging.DEBUG
        cdb = CDB(config=self.config)

        # Register a handful of names; 'Movar' is deliberately mapped to
        # two CUIs so disambiguation has something to choose between.
        for cui, raw_name in (('S-229004', 'Movar'),
                              ('S-229004', 'Movar viruses'),
                              ('S-229005', 'CDB'),
                              ('S-2290045', 'Movar')):
            cdb.add_names(cui=cui,
                          names=prepare_name(raw_name, self.nlp, {},
                                             self.config))
        # Check
        #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}, 'S-2290045': {'movar'}}

        # Seed every concept with random context vectors so similarity
        # computations have data to work with.
        for cui in list(cdb.cui2names.keys())[:50]:
            rand_vectors = {size: np.random.rand(300)
                            for size in ('short', 'long', 'medium')}
            cdb.update_context_vector(cui, rand_vectors, negative=False)

        doc = self.nlp(self.text)
        vocab = Vocab.load(self.vocab_path)
        context_model = ContextModel(cdb, vocab, self.config)
        context_model.train_using_negative_sampling('S-229004')
        self.config.linking['train_count_threshold'] = 0

        context_model.train('S-229004', doc._.ents[1], doc)

        context_model.similarity('S-229004', doc._.ents[1], doc)

        context_model.disambiguate(['S-2290045', 'S-229004'], doc._.ents[1],
                                   'movar', doc)
Esempio n. 9
0
 def setUpClass(cls) -> None:
     """Create a CDBMaker configured with the scientific spaCy model."""
     cfg = Config()
     cfg.general["spacy_model"] = "en_core_sci_md"
     cls.cdb_maker = CDBMaker(cfg)
Esempio n. 10
0
r'''Smoke tests for building a CDB from the bundled example CSVs.

The tests here are a bit messy but they work, should be converted to python unittests.
'''
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
import numpy as np
import logging

config = Config()
config.general['log_level'] = logging.DEBUG
maker = CDBMaker(config)

# Building a new CDB from two files (full_build)
csvs = ['../examples/cdb.csv', '../examples/cdb_2.csv']
cdb = maker.prepare_csvs(csvs, full_build=True)

# Sanity-check the merged CDB built from both CSVs: 3 concepts in total,
# 5 distinct names, and only 2 concepts carrying a preferred name.
assert len(cdb.cui2names) == 3
assert len(cdb.cui2snames) == 3
assert len(cdb.name2cuis) == 5
assert len(cdb.cui2tags) == 3
assert len(cdb.cui2preferred_name) == 2
assert len(cdb.cui2context_vectors) == 3
assert len(cdb.cui2count_train) == 3
# 'virus' should resolve to C0000039 as its Primary name.
assert cdb.name2cuis2status['virus']['C0000039'] == 'P'
assert cdb.cui2type_ids['C0000039'] == {'T234', 'T109', 'T123'}
# full_build=True keeps the original (unnormalised) names around.
assert cdb.addl_info['cui2original_names']['C0000039'] == {
    'Virus', 'Virus K', 'Virus M', 'Virus Z'
}
assert cdb.addl_info['cui2description']['C0000039'].startswith("Synthetic")

# Test name addition
from medcat.preprocessing.cleaners import prepare_name
Esempio n. 11
0
from medcat.ner.vocab_based_ner import NER
from medcat.preprocessing.taggers import tag_skip_and_punct
# spacy_split_all is used below but was never imported (NameError at runtime).
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.pipe import Pipe
from medcat.utils.normalizers import BasicSpellChecker
from medcat.vocab import Vocab
from medcat.preprocessing.cleaners import prepare_name
from medcat.linking.vector_context_model import ContextModel
from functools import partial
from medcat.linking.context_based_linker import Linker
from medcat.config import Config
import logging
from medcat.cdb import CDB
import os
import requests

config = Config()
config.general['log_level'] = logging.INFO
cdb = CDB(config=config)

# Pipeline: whitespace tokenizer plus a tagger marking skippable/punct tokens.
nlp = Pipe(tokenizer=spacy_split_all, config=config)
nlp.add_tagger(tagger=partial(tag_skip_and_punct, config=config),
               name='skip_and_punct',
               additional_fields=['is_punct'])

# Add a couple of names
cdb.add_names(cui='S-229004', names=prepare_name('Movar', nlp, {}, config))
cdb.add_names(cui='S-229004',
              names=prepare_name('Movar viruses', nlp, {}, config))
cdb.add_names(cui='S-229005', names=prepare_name('CDB', nlp, {}, config))
# Check
#assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}
Esempio n. 12
0
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
import numpy as np
import logging
import os

config = Config()
config.general['log_level'] = logging.INFO
config.general['spacy_model'] = 'en_core_sci_lg'
maker = CDBMaker(config)

# Building a new CDB from two files (full_build)
csvs = ['./tmp_medmentions.csv']
cdb = maker.prepare_csvs(csvs, full_build=True)

cdb.save("./tmp_cdb.dat")


from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.cat import CAT

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    import requests
    # Bound the download and raise on HTTP errors so a failed response
    # is never silently written to the cached vocab file.
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat",
                       timeout=60)
    tmp.raise_for_status()
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

# Reload the CDB we just saved, attaching a fresh default config.
config = Config()
cdb = CDB.load("./tmp_cdb.dat", config=config)
Esempio n. 13
0
from medcat.cdb import CDB
from medcat.cdb_maker import CDBMaker
from medcat.config import Config

# Specify cdb name and path to csvs
cdb_name = "cdb_name.dat"
csv_path_list = [" path to list of csvs here"]

# Build a CDB from the CSVs using a default config, then persist it.
cfg = Config()
cdb_builder = CDBMaker(cfg)
cdb = cdb_builder.prepare_csvs(csv_path_list, full_build=True)
cdb.save(cdb_name)

# Round-trip check: load the newly created cdb from disk.
cdb2 = CDB.load(cdb_name)