Beispiel #1
0
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return the CAT instance for ``project``, building and caching it if needed.

    The three maps are caller-owned caches that this function reads and
    fills in place:

    * ``CDB_MAP``   -- concept DB id -> loaded CDB
    * ``VOCAB_MAP`` -- vocab id -> loaded Vocab
    * ``CAT_MAP``   -- ``"<cdb_id>-<vocab_id>"`` -> CAT

    Args:
        CDB_MAP: Mutable mapping used as the CDB cache.
        VOCAB_MAP: Mutable mapping used as the Vocab cache.
        CAT_MAP: Mutable mapping used as the CAT cache.
        project: Object exposing ``concept_db.id``, ``concept_db.cdb_file.path``,
            ``vocab.id`` and ``vocab.vocab_file.path``.

    Returns:
        The cached or newly created CAT for the project's CDB/vocab pair.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    # Cache hit: nothing needs to be loaded.
    if cat_id in CAT_MAP:
        return CAT_MAP[cat_id]

    if cdb_id in CDB_MAP:
        cdb = CDB_MAP[cdb_id]
    else:
        cdb = CDB.load(project.concept_db.cdb_file.path)
        # os.getenv returns None when the variable is unset; only apply the
        # override when one was actually configured, otherwise keep the
        # config stored on the CDB. A configured but unreadable path still
        # raises, so a misconfiguration is not hidden.
        custom_config = os.getenv("MEDCAT_CONFIG_FILE")
        if custom_config is not None:
            cdb.config.parse_config_file(path=custom_config)
        CDB_MAP[cdb_id] = cdb

    if vocab_id in VOCAB_MAP:
        vocab = VOCAB_MAP[vocab_id]
    else:
        vocab = Vocab.load(project.vocab.vocab_file.path)
        VOCAB_MAP[vocab_id] = vocab

    cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
    CAT_MAP[cat_id] = cat
    return cat
 def test_bg_save_and_load_model_context_vectors(self):
     """Save the CDB, reload it, and check counts and vector size on both copies.

     The reloaded copy is kept on ``self.cdb2``.
     """
     self.cdb.save("./tmp_cdb.dat")
     self.cdb2 = CDB.load('./tmp_cdb.dat')
     # Assert on the reloaded copy as well: checking only self.cdb would
     # pass even if loading returned wrong data.
     for label, cdb in (("in-memory", self.cdb), ("reloaded", self.cdb2)):
         self.assertEqual(cdb.cui2count_train['C0000139'], 2,
                          "Count should equal 2 (%s CDB)" % label)
         self.assertEqual(
             cdb.cui2context_vectors['C0000139']['long'].shape[0], 300,
             "Dimensions should equal 300 (%s CDB)" % label)
    def _create_cat(self):
        """
        Loads MedCAT resources and creates CAT instance
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug("Loading VOCAB ...")
        vocab = Vocab.load(os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug("Loading CDB ...")

        cdb = CDB.load(os.getenv("APP_MODEL_CDB_PATH"))

        spacy_model = os.getenv("SPACY_MODEL", "")

        if spacy_model:
            cdb.config.general["spacy_model"] == spacy_model
        else:
            logging.warning("SPACY_MODEL environment var not set, \
                attempting to load the spacy model found within the CDB : " +
                            cdb.config.general["spacy_model"])

            if cdb.config.general["spacy_model"] == "":
                raise ValueError(
                    "No SPACY_MODEL env var declared, the CDB loaded does not have a spacy_model set in the config variable! \
                 To solve this declare the SPACY_MODEL in the env_medcat file."
                )

        # this is redundant as the config is already in the CDB
        conf = cdb.config

        # Apply CUI filter if provided
        if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
            self.log.debug("Applying CDB CUI filter ...")
            with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
                all_lines = (line.rstrip() for line in cui_file)
                selected_cuis = [line for line in all_lines
                                 if line]  # filter blank lines
                cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
            self.log.debug("Loading META annotations ...")
            for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(":"):
                m = MetaCAT.load(model_path)
                meta_models.append(m)

        cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
        return cat
Beispiel #4
0
    def test_training_import(self):
        """Train ``self.cdb`` with random vectors and import that training into a reloaded CDB.

        Both CDBs have their training reset first. Afterwards the reloaded
        CDB must report the same train count and the same sampled vector
        component for CUI ``C0000139`` as ``self.cdb``.
        """
        reloaded = CDB.load('./tmp_cdb.dat')
        self.cdb.reset_training()
        reloaded.reset_training()

        # Fixed seed so the generated vectors are reproducible
        np.random.seed(11)
        cuis = list(self.cdb.cui2names.keys())
        for _ in range(2):
            for cui in cuis:
                vectors = {
                    cntx_type: np.random.rand(300)
                    for cntx_type in self.config.linking['context_vector_sizes']
                }
                self.cdb.update_context_vector(cui, vectors, negative=False)

        reloaded.import_training(cdb=self.cdb, overwrite=True)

        imported_component = reloaded.cui2context_vectors['C0000139']['long'][7]
        source_component = self.cdb.cui2context_vectors['C0000139']['long'][7]
        assert imported_component == source_component

        imported_count = reloaded.cui2count_train['C0000139']
        source_count = self.cdb.cui2count_train['C0000139']
        assert imported_count == source_count
Beispiel #5
0
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Look up, or lazily build, the CAT instance that belongs to ``project``.

    ``CDB_MAP``, ``VOCAB_MAP`` and ``CAT_MAP`` are caller-provided caches,
    keyed by concept DB id, vocab id and ``"<cdb_id>-<vocab_id>"``
    respectively; missing entries are loaded and stored in place.

    A ``KeyError`` raised while loading the CDB is re-raised as a more
    descriptive ``Exception`` when the installed ``medcat`` major version
    is above 0; otherwise the original ``KeyError`` propagates.

    If ``MEDCAT_CONFIG_FILE`` points to an existing file it is parsed into
    the CDB's config; otherwise the config already on the CDB is used.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = "-".join((str(cdb_id), str(vocab_id)))

    # Already built for this CDB/vocab pair
    if cat_id in CAT_MAP:
        return CAT_MAP[cat_id]

    if cdb_id not in CDB_MAP:
        cdb_path = project.concept_db.cdb_file.path
        try:
            cdb = CDB.load(cdb_path)
        except KeyError as ke:
            mc_v = pkg_resources.get_distribution('medcat').version
            major_version = int(mc_v.split('.')[0])
            if major_version <= 0:
                raise
            log.error(
                'Attempted to load MedCAT v0.x model with MCTrainer v1.x'
            )
            raise Exception(
                'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                'MedCATTrainer Dev team if you believe this should work'
            ) from ke

        custom_config = os.getenv("MEDCAT_CONFIG_FILE")
        if custom_config is None or not os.path.exists(custom_config):
            log.info(
                "No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB"
            )
        else:
            cdb.config.parse_config_file(path=custom_config)
        CDB_MAP[cdb_id] = cdb
    else:
        cdb = CDB_MAP[cdb_id]

    if vocab_id not in VOCAB_MAP:
        vocab = Vocab.load(project.vocab.vocab_file.path)
        VOCAB_MAP[vocab_id] = vocab
    else:
        vocab = VOCAB_MAP[vocab_id]

    cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
    CAT_MAP[cat_id] = cat
    return cat
Beispiel #6
0
 def setUpClass(cls) -> None:
     """Load the example CDB and vocab, adjust the CDB config, and build the CAT under test.

     Both files are read from the ``examples`` directory one level above
     this file. The resulting objects are stored on the class as ``cdb``,
     ``vocab`` and ``undertest``.
     """
     examples_dir = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), "..", "examples")
     cls.cdb = CDB.load(os.path.join(examples_dir, "cdb.dat"))
     cls.vocab = Vocab.load(os.path.join(examples_dir, "vocab.dat"))

     config = cls.cdb.config

     # NER settings
     config.ner['min_name_len'] = 2
     config.ner['upper_case_limit_len'] = 3

     # General settings
     config.general['spell_check'] = True
     config.general['full_unlink'] = True

     # Linking settings
     config.linking['train_count_threshold'] = 10
     config.linking['similarity_threshold'] = 0.3
     config.linking['train'] = True
     config.linking['disamb_length_limit'] = 5

     cls.undertest = CAT(cdb=cls.cdb, config=config, vocab=cls.vocab)
Beispiel #7
0
def import_concepts_from_cdb(cdb_model_id: int):
    """Create or re-assign ``Concept`` rows for every CUI in the given ConceptDB's CDB file.

    For each CUI in the loaded CDB:

    * no ``Concept`` row has that CUI at all -> a new ``Concept`` is created
      and passed to ``update_concept_model``;
    * a row exists but is not linked to this ConceptDB -> the existing row
      is fetched and passed to ``update_concept_model``;
    * a row linked to this ConceptDB already exists -> nothing is done.

    Args:
        cdb_model_id: Primary key of the ``ConceptDB`` row to import from.
    """
    from medcat.cdb import CDB

    cdb_model = ConceptDB.objects.get(id=cdb_model_id)
    cdb = CDB.load(cdb_model.cdb_file.path)

    # Snapshots taken once up front: CUIs linked to this CDB, and CUIs known at all
    cuis_for_this_cdb = set(
        Concept.objects.filter(cdb=cdb_model_id).values_list('cui', flat=True))
    cuis_for_any_cdb = set(Concept.objects.all().values_list('cui', flat=True))

    for cui in cdb.cui2names:
        if cui not in cuis_for_any_cdb:
            concept = Concept()
            concept.cui = cui
        elif cui not in cuis_for_this_cdb:
            # The cui was added from another CDB. Overwrite it here.
            concept = Concept.objects.get(cui=cui)
        else:
            continue
        update_concept_model(concept, cdb_model, cdb)
Beispiel #8
0
def _import_concepts(id):
    """Save a ``Concept`` row for every CUI of the ConceptDB's CDB that is not yet stored for it.

    Each new row is filled from the loaded CDB: preferred name (falling
    back to the CUI itself), comma-joined type ids and their names,
    description, and comma-separated original names.

    Args:
        id: Primary key of the ``ConceptDB`` row whose CDB file is imported.
    """
    from medcat.cdb import CDB

    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB.load(concept_db.cdb_file.path)

    # CUIs that already have a Concept row linked to this CDB
    known_cuis = set(
        Concept.objects.filter(cdb=id).values_list('cui', flat=True))

    for cui in cdb.cui2names.keys():
        if cui in known_cuis:
            continue

        type_ids = list(cdb.cui2type_ids.get(cui, ''))
        type_names = [
            cdb.addl_info['type_id2name'].get(type_id, '')
            for type_id in type_ids
        ]
        original_names = cdb.addl_info['cui2original_names'].get(cui, [])

        concept = Concept()
        concept.pretty_name = cdb.cui2preferred_name.get(cui, cui)
        concept.cui = cui
        concept.tui = ','.join(type_ids)
        concept.semantic_type = ','.join(type_names)
        concept.desc = cdb.addl_info['cui2description'].get(cui, '')
        concept.synonyms = ", ".join(original_names)
        concept.cdb = concept_db
        concept.save()
Beispiel #9
0
def _reset_cdb_filters(id):
    """Replace the linking filters of the ConceptDB's CDB with an empty CUI set and save it in place.

    Args:
        id: Primary key of the ``ConceptDB`` row whose CDB file is rewritten.
    """
    from medcat.cdb import CDB

    cdb_path = ConceptDB.objects.get(id=id).cdb_file.path
    cdb = CDB.load(cdb_path)
    cdb.config.linking['filters'] = {'cuis': set()}
    # Written back to the same file it was loaded from
    cdb.save(cdb_path)
Beispiel #10
0
 def test_save_and_load(self):
     """Save the CDB, load it back, and compare one context-vector component."""
     self.cdb.save("./tmp_cdb.dat")
     reloaded = CDB.load('./tmp_cdb.dat')
     # Spot check a single value rather than the whole structure
     reloaded_component = reloaded.cui2context_vectors['C0000139']['long'][7]
     original_component = self.cdb.cui2context_vectors['C0000139']['long'][7]
     assert reloaded_component == original_component
Beispiel #11
0
assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300

# Negative updates: one random vector per context type for every CUI
for cui in cuis:
    vectors = {
        cntx_type: np.random.rand(300)
        for cntx_type in config.linking['context_vector_sizes']
    }
    cdb.update_context_vector(cui, vectors, negative=True)

# The train count and vector length are expected to be unchanged afterwards
assert cdb.cui2count_train['C0000139'] == 2
assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300

# Save the CDB and load it back
from medcat.cdb import CDB
cdb.save("./tmp_cdb.dat")
cdb2 = CDB.load('./tmp_cdb.dat')
# Spot check one component of the reloaded vectors
reloaded_component = cdb2.cui2context_vectors['C0000139']['long'][7]
original_component = cdb.cui2context_vectors['C0000139']['long'][7]
assert reloaded_component == original_component

# Training import: reset both CDBs, then retrain the original with seeded vectors
cdb.reset_training()
cdb2.reset_training()
np.random.seed(11)
cuis = list(cdb.cui2names.keys())
for _ in range(2):
    for cui in cuis:
        vectors = {
            cntx_type: np.random.rand(300)
            for cntx_type in config.linking['context_vector_sizes']
        }
        cdb.update_context_vector(cui, vectors, negative=False)
Beispiel #12
0
cdb.save("./tmp_cdb.dat")


from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.cat import CAT

# Download the vocabulary once and reuse the local copy on later runs
vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    import requests
    # The timeout keeps a stalled connection from hanging the script.
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat",
                       timeout=60)
    # Fail on an HTTP error instead of writing the error body to disk,
    # where it would be picked up as a "cached" vocab on every later run.
    tmp.raise_for_status()
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

# Reload the CDB saved above with a fresh Config, together with the vocab
config = Config()
cdb = CDB.load("./tmp_cdb.dat", config=config)
vocab = Vocab.load(vocab_path)

cdb.reset_training()

# Build the CAT and override NER / linking settings on its config
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
cat.config.ner['min_name_len'] = 3
cat.config.ner['upper_case_limit_len'] = 3
cat.config.linking['disamb_length_limit'] = 3
cat.config.linking['filters'] = {'cuis': set()}
cat.config.linking['train_count_threshold'] = -1
cat.config.linking['context_vector_sizes'] = {'xlong': 27, 'long': 18, 'medium': 9, 'short': 3}
cat.config.linking['context_vector_weights'] = {'xlong': 0, 'long': 0.4, 'medium': 0.4, 'short': 0.2}
# Decreases with step and never drops below 0.1
cat.config.linking['weighted_average_function'] = lambda step: max(0.1, 1-(step**2*0.0004))

cat.config.linking['similarity_threshold_type'] = 'dynamic'
Beispiel #13
0
from medcat.cdb import CDB
from medcat.cdb_maker import CDBMaker
from medcat.config import Config

# Output file for the new CDB, and the CSV inputs it is built from
cdb_name = "cdb_name.dat"
csv_path_list = [" path to list of csvs here"]

# Build the CDB from the CSVs using a default-constructed Config, then save it
config = Config()
cdb = CDBMaker(config).prepare_csvs(csv_path_list, full_build=True)
cdb.save(cdb_name)

# Read the saved CDB back from disk
cdb2 = CDB.load(cdb_name)