def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT instance for *project*, reusing cached resources.

    CDB_MAP / VOCAB_MAP / CAT_MAP are shared caches keyed by model ids; the
    CDB, Vocab and resulting CAT are loaded from disk only on first use.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)
    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB.load(cdb_path)
            # Only apply a custom config when MEDCAT_CONFIG_FILE is set and
            # points at an existing file; the previous unconditional call
            # passed path=None whenever the env var was undefined.
            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            CDB_MAP[cdb_id] = cdb
        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab
        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
def test_bg_save_and_load_model_context_vectors(self):
    """Saving then loading a CDB must preserve training counts and vectors.

    The assertions run against the RELOADED cdb (self.cdb2): the original
    code asserted on self.cdb, which would pass even if the save/load
    round-trip lost or corrupted the data.
    """
    self.cdb.save("./tmp_cdb.dat")
    self.cdb2 = CDB.load('./tmp_cdb.dat')
    self.assertEqual(self.cdb2.cui2count_train['C0000139'], 2,
                     "Count should equal 2")
    self.assertEqual(
        self.cdb2.cui2context_vectors['C0000139']['long'].shape[0], 300,
        "Dimensions should equal 300")
def _create_cat(self):
    """ Loads MedCAT resources and creates CAT instance """
    if os.getenv("APP_MODEL_VOCAB_PATH") is None:
        raise ValueError(
            "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")
    if os.getenv("APP_MODEL_CDB_PATH") is None:
        raise Exception(
            "Concept database (env: APP_MODEL_CDB_PATH) not specified")
    # Vocabulary and Concept Database are mandatory
    self.log.debug("Loading VOCAB ...")
    vocab = Vocab.load(os.getenv("APP_MODEL_VOCAB_PATH"))
    self.log.debug("Loading CDB ...")
    cdb = CDB.load(os.getenv("APP_MODEL_CDB_PATH"))
    spacy_model = os.getenv("SPACY_MODEL", "")
    if spacy_model:
        # BUG FIX: the original used '==' (a no-op comparison), so the
        # SPACY_MODEL env override was silently ignored; assignment is
        # what was intended.
        cdb.config.general["spacy_model"] = spacy_model
    else:
        logging.warning("SPACY_MODEL environment var not set, \
attempting to load the spacy model found within the CDB : " +
                        cdb.config.general["spacy_model"])
        if cdb.config.general["spacy_model"] == "":
            raise ValueError(
                "No SPACY_MODEL env var declared, the CDB loaded does not have a spacy_model set in the config variable! \
To solve this declare the SPACY_MODEL in the env_medcat file."
            )
    # this is redundant as the config is already in the CDB
    conf = cdb.config
    # Apply CUI filter if provided
    if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
        self.log.debug("Applying CDB CUI filter ...")
        with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
            all_lines = (line.rstrip() for line in cui_file)
            selected_cuis = [line for line in all_lines if line]  # filter blank lines
            cdb.filter_by_cui(selected_cuis)
    # Meta-annotation models are optional
    meta_models = []
    if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
        self.log.debug("Loading META annotations ...")
        for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(":"):
            m = MetaCAT.load(model_path)
            meta_models.append(m)
    cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
    return cat
def test_training_import(self):
    """Training state copied via import_training must match the source CDB."""
    cdb2 = CDB.load('./tmp_cdb.dat')
    self.cdb.reset_training()
    cdb2.reset_training()

    # Deterministic vectors so both CDBs can be compared exactly.
    np.random.seed(11)
    cuis = list(self.cdb.cui2names.keys())
    for _ in range(2):
        for cui in cuis:
            vectors = {
                cntx_type: np.random.rand(300)
                for cntx_type in self.config.linking['context_vector_sizes']
            }
            self.cdb.update_context_vector(cui, vectors, negative=False)

    cdb2.import_training(cdb=self.cdb, overwrite=True)

    # Spot-check one vector element and the training count.
    assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7]
    assert cdb2.cui2count_train['C0000139'] == self.cdb.cui2count_train['C0000139']
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT for *project*, caching CDB / Vocab / CAT instances.

    A cached CAT is returned immediately when one exists for this
    cdb/vocab pair; otherwise the parts are loaded (and cached) first.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        return CAT_MAP[cat_id]

    if cdb_id in CDB_MAP:
        cdb = CDB_MAP[cdb_id]
    else:
        cdb_path = project.concept_db.cdb_file.path
        try:
            cdb = CDB.load(cdb_path)
        except KeyError as ke:
            # A KeyError here usually means a legacy (v0.x) model file.
            mc_v = pkg_resources.get_distribution('medcat').version
            if int(mc_v.split('.')[0]) > 0:
                log.error(
                    'Attempted to load MedCAT v0.x model with MCTrainer v1.x'
                )
                raise Exception(
                    'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                    'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                    'MedCATTrainer Dev team if you believe this should work'
                ) from ke
            raise
        custom_config = os.getenv("MEDCAT_CONFIG_FILE")
        if custom_config is not None and os.path.exists(custom_config):
            cdb.config.parse_config_file(path=custom_config)
        else:
            log.info(
                "No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB"
            )
        CDB_MAP[cdb_id] = cdb

    if vocab_id in VOCAB_MAP:
        vocab = VOCAB_MAP[vocab_id]
    else:
        vocab = Vocab.load(project.vocab.vocab_file.path)
        VOCAB_MAP[vocab_id] = vocab

    cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
    CAT_MAP[cat_id] = cat
    return cat
def setUpClass(cls) -> None:
    """Load the example CDB/Vocab once and build the CAT under test."""
    examples_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "examples")
    cls.cdb = CDB.load(os.path.join(examples_dir, "cdb.dat"))
    cls.vocab = Vocab.load(os.path.join(examples_dir, "vocab.dat"))

    # NER settings
    cls.cdb.config.ner['min_name_len'] = 2
    cls.cdb.config.ner['upper_case_limit_len'] = 3
    # General settings
    cls.cdb.config.general['spell_check'] = True
    cls.cdb.config.general['full_unlink'] = True
    # Linking settings
    cls.cdb.config.linking['train_count_threshold'] = 10
    cls.cdb.config.linking['similarity_threshold'] = 0.3
    cls.cdb.config.linking['train'] = True
    cls.cdb.config.linking['disamb_length_limit'] = 5

    cls.undertest = CAT(cdb=cls.cdb, config=cls.cdb.config, vocab=cls.vocab)
def import_concepts_from_cdb(cdb_model_id: int):
    """Create or refresh Concept rows from the CDB stored on the given model."""
    from medcat.cdb import CDB

    cdb_model = ConceptDB.objects.get(id=cdb_model_id)
    cdb = CDB.load(cdb_model.cdb_file.path)

    # CUIs already attached to this CDB, and CUIs present for any CDB.
    existing_cuis = set(
        Concept.objects.filter(cdb=cdb_model_id).values_list('cui', flat=True))
    all_cuis = set(Concept.objects.all().values_list('cui', flat=True))

    for cui in cdb.cui2names.keys():
        if cui not in all_cuis:
            # Brand-new concept: create the row and fill it from the CDB.
            new_concept = Concept()
            new_concept.cui = cui
            update_concept_model(new_concept, cdb_model, cdb)
        elif cui not in existing_cuis:
            # CUI has been added from another CDB. Overwrite here.
            update_concept_model(Concept.objects.get(cui=cui), cdb_model, cdb)
def _import_concepts(id):
    """Populate Concept rows for every CUI in the stored CDB not yet imported."""
    from medcat.cdb import CDB

    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB.load(concept_db.cdb_file.path)

    # Get all existing cuis for this CDB
    known_cuis = set(
        Concept.objects.filter(cdb=id).values_list('cui', flat=True))

    for cui in cdb.cui2names.keys():
        if cui in known_cuis:
            continue
        concept = Concept()
        concept.pretty_name = cdb.cui2preferred_name.get(cui, cui)
        concept.cui = cui
        type_ids = list(cdb.cui2type_ids.get(cui, ''))
        concept.tui = ','.join(type_ids)
        concept.semantic_type = ','.join(
            cdb.addl_info['type_id2name'].get(tui, '') for tui in type_ids)
        concept.desc = cdb.addl_info['cui2description'].get(cui, '')
        concept.synonyms = ", ".join(
            cdb.addl_info['cui2original_names'].get(cui, []))
        concept.cdb = concept_db
        concept.save()
def _reset_cdb_filters(id):
    """Clear the CUI filter on the stored ConceptDB and persist the change."""
    from medcat.cdb import CDB

    cdb_model = ConceptDB.objects.get(id=id)
    cdb = CDB.load(cdb_model.cdb_file.path)
    # An empty 'cuis' set means no filtering is applied.
    cdb.config.linking['filters'] = {'cuis': set()}
    cdb.save(cdb_model.cdb_file.path)
def test_save_and_load(self):
    """Round-trip the CDB through disk and spot-check one vector element."""
    path = "./tmp_cdb.dat"
    self.cdb.save(path)
    reloaded = CDB.load(path)
    # Check a random thing
    assert reloaded.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7]
assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 # Test negative for cui in cuis: vectors = {} for cntx_type in config.linking['context_vector_sizes']: vectors[cntx_type] = np.random.rand(300) cdb.update_context_vector(cui, vectors, negative=True) assert cdb.cui2count_train['C0000139'] == 2 assert cdb.cui2context_vectors['C0000139']['long'].shape[0] == 300 # Test save/load from medcat.cdb import CDB cdb.save("./tmp_cdb.dat") cdb2 = CDB.load('./tmp_cdb.dat') # Check a random thing assert cdb2.cui2context_vectors['C0000139']['long'][ 7] == cdb.cui2context_vectors['C0000139']['long'][7] # Test training import cdb.reset_training() cdb2.reset_training() np.random.seed(11) cuis = list(cdb.cui2names.keys()) for i in range(2): for cui in cuis: vectors = {} for cntx_type in config.linking['context_vector_sizes']: vectors[cntx_type] = np.random.rand(300) cdb.update_context_vector(cui, vectors, negative=False)
# Continuation of the flat smoke-test script: persist the CDB built above,
# fetch a vocabulary if not already cached locally, and assemble a CAT.
cdb.save("./tmp_cdb.dat")
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.cat import CAT
vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    # Download the example vocab once; subsequent runs reuse the local file.
    import requests
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)
config = Config()
cdb = CDB.load("./tmp_cdb.dat", config=config)
vocab = Vocab.load(vocab_path)
cdb.reset_training()
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
# Tighten NER and linking settings for the test run.
cat.config.ner['min_name_len'] = 3
cat.config.ner['upper_case_limit_len'] = 3
cat.config.linking['disamb_length_limit'] = 3
cat.config.linking['filters'] = {'cuis': set()}
cat.config.linking['train_count_threshold'] = -1
cat.config.linking['context_vector_sizes'] = {'xlong': 27, 'long': 18, 'medium': 9, 'short': 3}
cat.config.linking['context_vector_weights'] = {'xlong': 0, 'long': 0.4, 'medium': 0.4, 'short': 0.2}
# Step-decaying weight, floored at 0.1.
cat.config.linking['weighted_average_function'] = lambda step: max(0.1, 1-(step**2*0.0004))
cat.config.linking['similarity_threshold_type'] = 'dynamic'
# Flat script: build a CDB from CSV sources with CDBMaker, save it, then
# reload it to verify the saved artifact.
from medcat.cdb import CDB
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
# Specify cdb name and path to csvs
cdb_name = "cdb_name.dat"
csv_path_list = [" path to list of csvs here"]  # placeholder — replace with real CSV paths
# Create CDB
config = Config()
maker = CDBMaker(config)
cdb = maker.prepare_csvs(csv_path_list, full_build=True)
cdb.save(cdb_name)
# Load the newly created cdb:
cdb2 = CDB.load(cdb_name)