Esempio n. 1
0
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT instance for *project*, reusing cached models.

    CDB_MAP / VOCAB_MAP / CAT_MAP are dict caches keyed by model ids so
    repeated calls for the same project do not reload anything from disk.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id not in CAT_MAP:
        # Load (or fetch cached) concept database.
        if cdb_id not in CDB_MAP:
            cdb = CDB()
            cdb.load_dict(project.concept_db.cdb_file.path)
            CDB_MAP[cdb_id] = cdb
        cdb = CDB_MAP[cdb_id]

        # Load (or fetch cached) vocabulary.
        if vocab_id not in VOCAB_MAP:
            vocab = Vocab()
            vocab.load_dict(project.vocab.vocab_file.path)
            VOCAB_MAP[vocab_id] = vocab
        vocab = VOCAB_MAP[vocab_id]

        cat = CAT(cdb=cdb, vocab=vocab)
        cat.train = False
        CAT_MAP[cat_id] = cat
    return CAT_MAP[cat_id]
Esempio n. 2
0
def _import_concepts(id):
    """Import concepts from the stored CDB file into the Concept table,
    skipping CUIs already present for this CDB."""
    from medcat.cdb import CDB
    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB()
    cdb.load_dict(concept_db.cdb_file.path)
    tuis = None  # no TUI filtering applied

    # Get all existing cuis for this CDB
    existing_cuis = set(
        Concept.objects.filter(cdb=id).values_list('cui', flat=True))

    for cui in cdb.cui2names:
        if cui in existing_cuis:
            continue

        # Prefer the curated pretty name; otherwise fall back to the
        # first original name, if any exist.
        if cui in cdb.cui2pretty_name:
            pretty_name = cdb.cui2pretty_name[cui]
        else:
            originals = cdb.cui2original_names.get(cui)
            pretty_name = next(iter(originals)) if originals else None

        tui = cdb.cui2tui.get(cui, 'unk')
        if pretty_name is not None and (tuis is None or tui in tuis):
            concept = Concept()
            concept.pretty_name = pretty_name
            concept.cui = cui
            concept.tui = tui
            concept.semantic_type = cdb.tui2name.get(tui, '')
            concept.desc = cdb.cui2desc.get(cui, '')
            concept.synonyms = ", ".join(
                cdb.cui2original_names.get(cui, []))
            concept.cdb = concept_db
            concept.save()
            set_icd_info_objects(cdb, concept, cui)
            set_opcs_info_objects(cdb, concept, cui)
Esempio n. 3
0
def _import_concepts(id):
    """Import every concept from the stored CDB file into the Concept table.

    NOTE: the parameter name ``id`` shadows the builtin, but is kept
    for caller compatibility.
    """
    from medcat.cdb import CDB
    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB()
    cdb.load_dict(concept_db.cdb_file.path)
    tuis = None  # no TUI filtering applied

    for cui in cdb.cui2pretty_name:
        tui = cdb.cui2tui.get(cui, 'unk')
        if tuis is None or tui in tuis:
            concept = Concept()
            concept.pretty_name = cdb.cui2pretty_name.get(cui, '')
            concept.cui = cui
            concept.tui = tui
            concept.semantic_type = cdb.tui2name.get(tui, '')
            concept.desc = cdb.cui2desc.get(cui, '')
            concept.synonyms = ",".join(cdb.cui2original_names.get(cui, []))
            concept.cdb = concept_db

            # Collect ICD-10 info when present; cui2info entries may be
            # missing or malformed, which counts as "no ICD-10 data".
            icd10 = ''
            try:
                for pair in cdb.cui2info[cui]['icd10']:
                    icd10 += pair['chapter'] + " | " + pair['name']
                    icd10 += '\n'
                # BUG FIX: str.strip() returns a new string; the original
                # discarded the result, so the trailing newline survived.
                icd10 = icd10.strip()
            except (KeyError, TypeError):
                pass
            concept.icd10 = icd10
            #concept.vocab = cdb.cui2ontos.get(cui, '')

            try:
                concept.save()
            except Exception:
                # Best-effort import: skip concepts that fail to save
                # (e.g. duplicates), matching the original intent.
                pass
Esempio n. 4
0
def filter_cdb_by_icd10(cdb: CDB) -> CDB:
    """
    Filter an existing CDB so it only keeps concepts that carry an
    associated ICD-10 code. Works for both SNOMED and UMLS CDBs.
    :return: filtered CDB
    """
    cuis_to_keep = []
    for cui in cdb.cui2names.keys():
        if 'icd10' in cdb.cui2info[cui]:
            cuis_to_keep.append(cui)
    cdb.filter_by_cui(cuis_to_keep)
    return cdb
Esempio n. 5
0
    def __init__(self, cdb=None):
        """Initialize, creating a fresh CDB when none is supplied."""
        self.cdb = cdb if cdb is not None else CDB()

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all)
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct, skip_stopwords=False))
Esempio n. 6
0
 def save(self, *args, **kwargs):
     """Save the project; if no concept DB is assigned yet, create and
     attach an empty placeholder CDB first."""
     if self.concept_db is None:
         cdb = CDB()
         cdb.save_dict('empty_cdb.dat')
         # BUG FIX: the file handle was never closed; use a context
         # manager so it is released even if .save() raises.
         with open('empty_cdb.dat', 'rb') as f:
             cdb_obj = ConceptDB()
             cdb_obj.name = f'{self.name}_empty_cdb'
             cdb_obj.cdb_file.save(f'{self.name}_empty_cdb.dat', File(f))
         cdb_obj.use_for_training = True
         cdb_obj.save()
         self.concept_db = cdb_obj
     super(ProjectAnnotateEntities, self).save(*args, **kwargs)
Esempio n. 7
0
    def train_supervised(self, data_path, reset_cdb=False, reset_cui_count=False, epochs=2, lr=None,
                         anneal=None):
        """ Given data learns vector embeddings for concepts
        in a supervised way.

        data_path:  path to data in json format (MedCATtrainer export)
        reset_cdb:  start training from a fresh CDB
        reset_cui_count:  reset the train count of CUIs seen in the data
        epochs:  number of passes over the documents
        lr / anneal:  forwarded to add_name
        """
        self.train = False
        # Close the data file promptly instead of leaking the handle.
        with open(data_path) as f:
            data = json.load(f)

        if reset_cdb:
            self.cdb = CDB()

        if reset_cui_count:
            # Reset the count for every CUI annotated in the training data.
            cuis = []
            for doc in data['documents']:
                for ann in doc['annotations']:
                    cuis.append(ann['cui'])
            for cui in set(cuis):
                if cui in self.cdb.cui_count:
                    self.cdb.cui_count[cui] = 1

        # BUG FIX: `epochs` is an int; the original `for epoch in epochs`
        # raises TypeError. Iterate over range(epochs) instead.
        for epoch in range(epochs):
            log.info("Starting epoch: {}".format(epoch))
            for doc in data['documents']:
                spacy_doc = self(doc['text'])

                for ann in doc['annotations']:
                    cui = ann['cui']
                    start = ann['start']
                    end = ann['end']
                    deleted = ann['deleted']

                    if deleted:
                        # Add negatives only if they exist in the CDB
                        if cui in self.cdb.cui2names:
                            self.add_name(cui=cui,
                                          source_val=ann['value'],
                                          spacy_doc=spacy_doc,
                                          text_inds=[start, end],
                                          negative=deleted,
                                          lr=lr,
                                          anneal=anneal)
                    else:
                        self.add_name(cui=cui,
                                      source_val=ann['value'],
                                      spacy_doc=spacy_doc,
                                      text_inds=[start, end],
                                      lr=lr,
                                      anneal=anneal)
Esempio n. 8
0
    def __init__(self, vocab=None, pretrained_cdb=None, word_tokenizer=None):
        """Initialize with an optional vocab, pretrained CDB and tokenizer."""
        self.vocab = vocab
        self.cdb = pretrained_cdb if pretrained_cdb is not None else CDB()

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct, skip_stopwords=self.SKIP_STOPWORDS))
        # Fall back to the internal tokenizer when none is supplied
        self.tokenizer = word_tokenizer if word_tokenizer is not None else self._tok
Esempio n. 9
0
 def __init__(self, vocab=None, pretrained_cdb=None, tokenizer=None):
     """Initialize with an optional vocab, pretrained CDB and tokenizer."""
     self.vocab = vocab
     self.cdb = pretrained_cdb if pretrained_cdb is not None else CDB()
     # Build the required spacy pipeline
     self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
     self.nlp.add_punct_tagger(
         tagger=partial(spacy_tag_punct, skip_stopwords=False))
     # Fall back to the internal tokenizer when none is supplied
     self.tokenizer = tokenizer if tokenizer is not None else self._tok  #BertTokenizer.from_pretrained('bert-base-uncased')
Esempio n. 10
0
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT instance for *project*, reusing cached models.

    CDB_MAP / VOCAB_MAP / CAT_MAP are dict caches keyed by model ids.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB.load(cdb_path)
            # BUG FIX: only parse a custom config when the env var points
            # at an existing file; the original unconditionally called
            # parse_config_file(path=None) when MEDCAT_CONFIG_FILE was
            # unset.
            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
Esempio n. 11
0
 def test_bg_save_and_load_model_context_vectors(self):
     """Round-trip the CDB to disk and verify trained state is intact."""
     self.cdb.save("./tmp_cdb.dat")
     self.cdb2 = CDB.load('./tmp_cdb.dat')
     count = self.cdb.cui2count_train['C0000139']
     self.assertEqual(count, 2, "Count should equal 2")
     dim = self.cdb.cui2context_vectors['C0000139']['long'].shape[0]
     self.assertEqual(dim, 300, "Dimensions should equal 300")
Esempio n. 12
0
 def setUpClass(cls):
     """Build a CDB from the example csvs plus an empty secondary CDB."""
     print("Load test database csvs for edit tests")
     cls.config = Config()
     cls.config.general['log_level'] = logging.DEBUG
     cls.maker = CDBMaker(cls.config)
     cls.cdb = cls.maker.prepare_csvs(
         ['../examples/cdb.csv', '../examples/cdb_2.csv'], full_build=True)
     cls.cdb2 = CDB(cls.config)
Esempio n. 13
0
    def __init__(self, config, cdb=None, name_max_words=20):
        """Set up the maker with a config and an optional existing CDB."""
        self.config = config
        # Honour the configured log level
        self.log.setLevel(self.config.general['log_level'])

        # Shortcut to the cdb_maker section of the config
        self.cnf_cm = config.cdb_maker

        self.cdb = cdb if cdb is not None else CDB(config=self.config)

        # Build the required spacy pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])
Esempio n. 14
0
    def _create_cat(self):
        """
        Loads MedCAT resources (vocab, CDB, optional CUI filter and
        meta-annotation models) and creates a CAT instance.

        Raises when the mandatory vocab/CDB env vars are missing, or when
        no spacy model can be determined from env or CDB config.
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug("Loading VOCAB ...")
        vocab = Vocab.load(os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug("Loading CDB ...")

        cdb = CDB.load(os.getenv("APP_MODEL_CDB_PATH"))

        spacy_model = os.getenv("SPACY_MODEL", "")

        if spacy_model:
            # BUG FIX: the original used `==` (a no-op comparison), so the
            # SPACY_MODEL env var never actually overrode the CDB config.
            cdb.config.general["spacy_model"] = spacy_model
        else:
            logging.warning("SPACY_MODEL environment var not set, \
                attempting to load the spacy model found within the CDB : " +
                            cdb.config.general["spacy_model"])

            if cdb.config.general["spacy_model"] == "":
                raise ValueError(
                    "No SPACY_MODEL env var declared, the CDB loaded does not have a spacy_model set in the config variable! \
                 To solve this declare the SPACY_MODEL in the env_medcat file."
                )

        # this is redundant as the config is already in the CDB
        conf = cdb.config

        # Apply CUI filter if provided
        if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
            self.log.debug("Applying CDB CUI filter ...")
            with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
                all_lines = (line.rstrip() for line in cui_file)
                selected_cuis = [line for line in all_lines
                                 if line]  # filter blank lines
                cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
            self.log.debug("Loading META annotations ...")
            for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(":"):
                m = MetaCAT.load(model_path)
                meta_models.append(m)

        cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
        return cat
Esempio n. 15
0
    def setUpClass(cls):
        """Build a full MedCAT pipeline (CDB, vocab, NER, linker) shared
        by the tests in this class."""
        print("Set up CDB")
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.cdb = CDB(config=cls.config)

        print("Set up Vocab")
        # Download the vocab once and cache it next to the tests.
        vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(vocab_path):
            tmp = requests.get(
                "https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
            with open(vocab_path, 'wb') as f:
                f.write(tmp.content)

        cls.vocab = Vocab.load(vocab_path)

        print("Set up NLP pipeline")
        cls.nlp = Pipe(tokenizer=spacy_split_all, config=cls.config)
        cls.nlp.add_tagger(tagger=partial(tag_skip_and_punct,
                                          config=cls.config),
                           name='skip_and_punct',
                           additional_fields=['is_punct'])

        # Spell checking + token normalization feed off both vocabularies.
        cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab,
                                              config=cls.config,
                                              data_vocab=cls.vocab)
        cls.nlp.add_token_normalizer(spell_checker=cls.spell_checker,
                                     config=cls.config)
        cls.ner = NER(cls.cdb, cls.config)
        cls.nlp.add_ner(cls.ner)

        print("Set up Linker")
        cls.link = Linker(cls.cdb, cls.vocab, cls.config)
        cls.nlp.add_linker(cls.link)

        print("Set limits for tokens and uppercase")
        # Tight limits make the small test concepts link deterministically.
        cls.config.ner['max_skip_tokens'] = 1
        cls.config.ner['upper_case_limit_len'] = 4
        cls.config.linking['disamb_length_limit'] = 2

        print("Add concepts")
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar', cls.nlp, {}, cls.config))
        cls.cdb.add_names(cui='S-229004',
                          names=prepare_name('Movar viruses', cls.nlp, {},
                                             cls.config))
        cls.cdb.add_names(cui='S-229005',
                          names=prepare_name('CDB', cls.nlp, {}, cls.config))

        print("Add test text")
        cls.text = "CDB - I was running and then Movar    Virus attacked and CDb"
        cls.text_post_pipe = cls.nlp(cls.text)
Esempio n. 16
0
def update_concept_model(concept: Concept, cdb_model: ConceptDB, cdb: CDB):
    """Populate the fields of *concept* from *cdb* and persist it."""
    cui = concept.cui
    type_ids = list(cdb.cui2type_ids.get(cui, ''))
    concept.pretty_name = cdb.get_name(cui)
    concept.type_ids = ','.join(type_ids)
    # Human-readable semantic types, one per type id (blank when unknown).
    concept.semantic_type = ','.join(
        cdb.addl_info['type_id2name'].get(type_id, '')
        for type_id in type_ids)
    concept.desc = cdb.addl_info['cui2description'].get(cui, '')
    concept.synonyms = ", ".join(
        cdb.addl_info['cui2original_names'].get(cui, []))
    concept.cdb = cdb_model
    concept.save()
Esempio n. 17
0
 def setUpClass(cls):
     """Build a CDB from the bundled example csvs for the edit tests."""
     print("Load test database csvs for edit tests")
     cls.config = Config()
     cls.config.general['log_level'] = logging.DEBUG
     cls.maker = CDBMaker(cls.config)
     examples_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 '..', 'examples')
     csvs = [os.path.join(examples_dir, 'cdb.csv'),
             os.path.join(examples_dir, 'cdb_2.csv')]
     cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
     cls.cdb2 = CDB(cls.config)
Esempio n. 18
0
    def setUp(self) -> None:
        """Build a minimal MedCAT pipeline (CDB with two concepts, vocab,
        NER, linker) used by the tests in this class."""
        self.config = Config()
        self.config.general['log_level'] = logging.INFO
        cdb = CDB(config=self.config)

        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Add a couple of names
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar', self.nlp, {}, self.config))
        cdb.add_names(cui='S-229004',
                      names=prepare_name('Movar viruses', self.nlp, {},
                                         self.config))
        cdb.add_names(cui='S-229005',
                      names=prepare_name('CDB', self.nlp, {}, self.config))
        # Check
        #assert cdb.cui2names == {'S-229004': {'movar', 'movarvirus', 'movarviruses'}, 'S-229005': {'cdb'}}

        # Download the vocab once and cache it next to the tests.
        self.vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(self.vocab_path):
            import requests
            tmp = requests.get(
                "https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
            with open(self.vocab_path, 'wb') as f:
                f.write(tmp.content)

        vocab = Vocab.load(self.vocab_path)
        # Make the pipeline
        # NOTE(review): self.nlp is rebuilt here, discarding the pipe
        # created above (which was only needed for prepare_name) —
        # confirm this is intentional.
        self.nlp = Pipe(tokenizer=spacy_split_all, config=self.config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])
        spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab,
                                          config=self.config,
                                          data_vocab=vocab)
        self.nlp.add_token_normalizer(spell_checker=spell_checker,
                                      config=self.config)
        ner = NER(cdb, self.config)
        self.nlp.add_ner(ner)

        # Add Linker
        link = Linker(cdb, vocab, self.config)
        self.nlp.add_linker(link)

        self.text = "CDB - I was running and then Movar    Virus attacked and CDb"
Esempio n. 19
0
    def __init__(self):
        """Set up the MedCAT processor: load vocab and CDB from the
        configured paths and build the CAT annotator."""
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # BUG FIX: os.getenv returns a string, so any non-empty value
        # (including "false") was truthy. Parse the flag explicitly;
        # an unset variable still means "training off".
        self.cat.spacy_cat.train = os.getenv(
            "APP_TRAINING_MODE", "false").lower() in ("true", "1", "yes")
        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')
Esempio n. 20
0
    def test_training_import(self):
        """Training state imported from another CDB must match the source."""
        cdb2 = CDB.load('./tmp_cdb.dat')
        self.cdb.reset_training()
        cdb2.reset_training()
        np.random.seed(11)
        cuis = list(self.cdb.cui2names.keys())
        # Two rounds of synthetic context-vector updates on the source CDB.
        for _ in range(2):
            for cui in cuis:
                vectors = {cntx_type: np.random.rand(300)
                           for cntx_type in self.config.linking['context_vector_sizes']}
                self.cdb.update_context_vector(cui, vectors, negative=False)

        cdb2.import_training(cdb=self.cdb, overwrite=True)
        assert cdb2.cui2context_vectors['C0000139']['long'][7] == self.cdb.cui2context_vectors['C0000139']['long'][7]
        assert cdb2.cui2count_train['C0000139'] == self.cdb.cui2count_train['C0000139']
Esempio n. 21
0
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT instance for *project*, reusing cached models.

    CDB_MAP / VOCAB_MAP / CAT_MAP are dict caches keyed by model ids so
    repeated calls for the same project do not reload from disk.
    Raises a descriptive error when a MedCAT v0.x model is loaded with
    a v1.x trainer.
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            try:
                cdb = CDB.load(cdb_path)
            except KeyError as ke:
                # A KeyError during load is the signature of a v0.x model
                # file being read by a v1.x MedCAT; surface a clear error.
                mc_v = pkg_resources.get_distribution('medcat').version
                if int(mc_v.split('.')[0]) > 0:
                    log.error(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x'
                    )
                    raise Exception(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                        'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                        'MedCATTrainer Dev team if you believe this should work'
                    ) from ke
                raise

            # Optional config override via env var; fall back to the
            # config stored inside the CDB.
            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            else:
                log.info(
                    "No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB"
                )
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
Esempio n. 22
0
    def test_concept_similarity(self):
        """Populate a CDB with 500 synthetic concepts and query most_similar."""
        cdb = CDB(config=self.config)
        np.random.seed(11)
        for i in range(500):
            cui = "C" + str(i)
            type_ids = {'T-' + str(i % 10)}
            names = prepare_name('Name: ' + str(i), self.maker.nlp, {},
                                 self.config)
            cdb.add_concept(cui=cui, names=names, ontologies=set(),
                            name_status='P', type_ids=type_ids,
                            description='', full_build=True)

            vectors = {cntx_type: np.random.rand(300)
                       for cntx_type in self.config.linking['context_vector_sizes']}
            cdb.update_context_vector(cui, vectors, negative=False)
        res = cdb.most_similar('C200', 'long', type_id_filter=['T-0'],
                               min_cnt=1, topn=10, force_build=True)
        assert len(res) == 10
Esempio n. 23
0
 def setUpClass(cls) -> None:
     """Load the example CDB/vocab, tune the config, and build the CAT
     instance under test."""
     examples = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "..", "examples")
     cls.cdb = CDB.load(os.path.join(examples, "cdb.dat"))
     cls.vocab = Vocab.load(os.path.join(examples, "vocab.dat"))
     # NER / linking settings tuned for the small example models.
     cls.cdb.config.ner['min_name_len'] = 2
     cls.cdb.config.ner['upper_case_limit_len'] = 3
     cls.cdb.config.general['spell_check'] = True
     cls.cdb.config.linking['train_count_threshold'] = 10
     cls.cdb.config.linking['similarity_threshold'] = 0.3
     cls.cdb.config.linking['train'] = True
     cls.cdb.config.linking['disamb_length_limit'] = 5
     cls.cdb.config.general['full_unlink'] = True
     cls.undertest = CAT(cdb=cls.cdb,
                         config=cls.cdb.config,
                         vocab=cls.vocab)
Esempio n. 24
0
    def load_model(self,
                   model_full_tag_name,
                   vocab_input_file_name="vocab.dat",
                   cdb_input_file_name="cdb.dat"):
        """ Loads variables of this object
            This is used to search the site-packages models folder for installed models..
        """
        cdb = CDB.load_model(model_full_tag_name=model_full_tag_name,
                             input_file_name=cdb_input_file_name)
        vocab = Vocab.load_model(model_full_tag_name=model_full_tag_name,
                                 input_file_name=vocab_input_file_name)

        # load_model returns False on failure; abort in that case.
        if vocab is False or cdb is False:
            log.error("Exiting...")
            sys.exit()

        return CAT(cdb, vocab=vocab)
Esempio n. 25
0
def import_concepts_from_cdb(cdb_model_id: int):
    """Create or refresh Concept rows from the CDB file of *cdb_model_id*."""
    from medcat.cdb import CDB

    cdb_model = ConceptDB.objects.get(id=cdb_model_id)
    cdb = CDB.load(cdb_model.cdb_file.path)
    # CUIs already linked to this CDB model
    existing_cuis = set(
        Concept.objects.filter(cdb=cdb_model_id).values_list('cui', flat=True))
    # CUIs present in the Concept table regardless of source CDB
    all_cuis = set(Concept.objects.all().values_list('cui', flat=True))

    for cui in cdb.cui2names:
        if cui not in all_cuis:
            # Brand-new concept: create and populate it.
            concept = Concept()
            concept.cui = cui
            update_concept_model(concept, cdb_model, cdb)
        elif cui not in existing_cuis:
            # cui has been added from another CDB. Overwrite here.
            concept = Concept.objects.get(cui=cui)
            update_concept_model(concept, cdb_model, cdb)
Esempio n. 26
0
    def setUpClass(cls) -> None:
        """Build the components (CDB, vocab, spell checker, NER, linker,
        MetaCAT) and the Pipe under test."""
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.cdb = CDB(config=cls.config)

        # Download the vocab once and cache it next to the tests.
        vocab_path = "./tmp_vocab.dat"
        if not os.path.exists(vocab_path):
            tmp = requests.get("https://medcat.rosalind.kcl.ac.uk/media/vocab.dat")
            with open(vocab_path, 'wb') as f:
                f.write(tmp.content)

        cls.vocab = Vocab.load(vocab_path)
        cls.spell_checker = BasicSpellChecker(cdb_vocab=cls.cdb.vocab, config=cls.config, data_vocab=cls.vocab)
        cls.ner = NER(cls.cdb, cls.config)
        cls.linker = Linker(cls.cdb, cls.vocab, cls.config)
        cls.config.ner['max_skip_tokens'] = 1
        cls.config.ner['upper_case_limit_len'] = 4
        cls.config.linking['disamb_length_limit'] = 2
        cls.meta_cat = MetaCAT()
        cls.text = "CDB - I was running and then Movar Virus attacked and CDb"
        # NOTE(review): cls.config is re-created here, so the Pipe below
        # uses a fresh config while the components above keep the old one
        # (including the ner/linking tweaks) — confirm this is intentional.
        cls.config = Config()
        cls.config.general['log_level'] = logging.INFO
        cls.undertest = Pipe(tokenizer=spacy_split_all, config=cls.config)
Esempio n. 27
0
def _import_concepts(id):
    """Create Concept rows for every CUI in the CDB not yet imported."""
    from medcat.cdb import CDB
    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB.load(concept_db.cdb_file.path)

    # Get all existing cuis for this CDB
    existing_cuis = set(
        Concept.objects.filter(cdb=id).values_list('cui', flat=True))

    for cui in cdb.cui2names:
        if cui in existing_cuis:
            continue
        type_ids = list(cdb.cui2type_ids.get(cui, ''))
        concept = Concept()
        concept.pretty_name = cdb.cui2preferred_name.get(cui, cui)
        concept.cui = cui
        concept.tui = ','.join(type_ids)
        # Human-readable semantic types, one per type id.
        concept.semantic_type = ','.join(
            cdb.addl_info['type_id2name'].get(tui, '') for tui in type_ids)
        concept.desc = cdb.addl_info['cui2description'].get(cui, '')
        concept.synonyms = ", ".join(
            cdb.addl_info['cui2original_names'].get(cui, []))
        concept.cdb = concept_db
        concept.save()
Esempio n. 28
0
    def _create_cat(self):
        """
        Loads MedCAT resources and creates CAT instance
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug('Loading VOCAB ...')
        vocab = Vocab()
        vocab.load_dict(path=os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug('Loading CDB ...')
        cdb = CDB()
        cdb.load_dict(path=os.getenv("APP_MODEL_CDB_PATH"))

        # Apply CUI filter if provided
        cui_filter_path = os.getenv("APP_MODEL_CUI_FILTER_PATH")
        if cui_filter_path is not None:
            self.log.debug('Applying CDB CUI filter ...')
            with open(cui_filter_path) as cui_file:
                # filter blank lines
                selected_cuis = [
                    cui for cui in (line.rstrip() for line in cui_file) if cui
                ]
            cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        meta_path_list = os.getenv("APP_MODEL_META_PATH_LIST")
        if meta_path_list is not None:
            self.log.debug('Loading META annotations ...')
            for model_path in meta_path_list.split(':'):
                m = MetaCAT(save_dir=model_path)
                m.load()
                meta_models.append(m)

        return CAT(cdb=cdb, vocab=vocab, meta_cats=meta_models)
Esempio n. 29
0
import os
from argparse import ArgumentParser

import numpy as np
import pandas as pd
from tqdm import tqdm

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.utils.vocab import Vocab

# Load the vocabulary
vocab = Vocab()
vocab.load_dict(os.environ["MEDCAT_VOCAB_FILE"])
print("Loaded Vocab")

# Load the cdb model you downloaded
cdb = CDB()
cdb.load_dict(os.environ["MEDCAT_CDB_FILE"])
print("Loaded CDB")

# create cat, restricting annotations to the configured TUIs
cat = CAT(cdb=cdb, vocab=vocab)
cat.spacy_cat.TUI_FILTER = ['T047', 'T048', 'T184']

tqdm.pandas()

def get_entities(text) :
    """Return entities from *text* that carry ICD-10 info.

    NOTE(review): this snippet appears truncated — the loop body ends
    after computing ent_string and nothing is returned; confirm against
    the original source before relying on it.
    """
    doc = cat.get_entities(text)
    relevant_entities = []
    for ent in doc :
        if "icd10" in ent["info"] :
            ent_string = text[ent["start"]:ent['end']]
Esempio n. 30
0
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, test_size=0.1, lr=1, groups=None, **kwargs):
    """Run `cv` rounds of supervised training and collect per-CUI metrics.

    cdb_path / vocab_path: model files, reloaded fresh for every round
    data_path: MedCATtrainer export json
    cv: number of cross-validation rounds
    nepochs, test_size, lr: forwarded to CAT.train_supervised
    groups: optional mapping of group -> [cuis]; when given, metrics are
        computed per group
    Returns per-key lists across rounds (fps, fns, tps, ps, rs, f1s) plus
    the last round's cui_counts and examples.
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    use_groups = groups is not None

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    examples = {}
    for i in range(cv):
        # Reload models each round so rounds are independent.
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        # Add groups if they exist
        if groups is not None:
            for cui in cdb.cui2info.keys():
                if "group" in cdb.cui2info[cui]:
                    del cdb.cui2info[cui]['group']
            # NOTE(review): this overwrites the `groups` argument with the
            # contents of ./groups.json — confirm that is intentional.
            groups = json.load(open("./groups.json"))
            for k, v in groups.items():
                for val in v:
                    cat.add_cui_to_group(val, k)

        # BUG FIX: the original passed the literal `lr=1`, silently
        # ignoring the `lr` argument; forward the parameter instead.
        fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
            data_path=data_path, lr=lr, test_size=test_size,
            use_groups=use_groups, nepochs=nepochs, **kwargs)

        # Accumulate per-key metric lists across rounds.
        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts, examples