Example #1
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB()
            cdb.load_dict(cdb_path)
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab()
            vocab.load_dict(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, vocab=vocab)
        cat.train = False
        CAT_MAP[cat_id] = cat
    return cat
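A minimal usage sketch for the helper above: the three caches are plain module-level dicts shared across calls, and the project object and annotate wrapper are hypothetical stand-ins.

CDB_MAP = {}
VOCAB_MAP = {}
CAT_MAP = {}

def annotate(project, text):
    # Repeated calls with the same project reuse the cached CDB/Vocab/CAT
    cat = get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project)
    return cat.get_entities(text)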
Example #2
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB.load(cdb_path)
            cdb.config.parse_config_file(path=os.getenv("MEDCAT_CONFIG_FILE"))
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
Example #3
    def _create_cat(self):
        """
        Loads MedCAT resources and creates CAT instance
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug("Loading VOCAB ...")
        vocab = Vocab.load(os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug("Loading CDB ...")

        cdb = CDB.load(os.getenv("APP_MODEL_CDB_PATH"))

        spacy_model = os.getenv("SPACY_MODEL", "")

        if spacy_model:
            cdb.config.general["spacy_model"] = spacy_model
        else:
            logging.warning("SPACY_MODEL environment var not set, \
                attempting to load the spacy model found within the CDB : " +
                            cdb.config.general["spacy_model"])

            if cdb.config.general["spacy_model"] == "":
                raise ValueError(
                    "No SPACY_MODEL env var declared and the loaded CDB does "
                    "not have a spacy_model set in its config. To solve this, "
                    "declare SPACY_MODEL in the env_medcat file.")

        # this is redundant as the config is already in the CDB
        conf = cdb.config

        # Apply CUI filter if provided
        if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
            self.log.debug("Applying CDB CUI filter ...")
            with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
                all_lines = (line.rstrip() for line in cui_file)
                selected_cuis = [line for line in all_lines
                                 if line]  # filter blank lines
                cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
            self.log.debug("Loading META annotations ...")
            for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(":"):
                m = MetaCAT.load(model_path)
                meta_models.append(m)

        cat = CAT(cdb=cdb, config=conf, vocab=vocab, meta_cats=meta_models)
        return cat
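A sketch of the environment this loader expects, set programmatically before calling it; the paths are illustrative and processor stands in for an instance of the surrounding class.

import os

os.environ["APP_MODEL_VOCAB_PATH"] = "/cat/models/vocab.dat"   # mandatory
os.environ["APP_MODEL_CDB_PATH"] = "/cat/models/cdb.dat"       # mandatory
os.environ["SPACY_MODEL"] = "en_core_web_md"                   # optional override
# os.environ["APP_MODEL_CUI_FILTER_PATH"] = "/cat/models/cui_filter.txt"    # optional
# os.environ["APP_MODEL_META_PATH_LIST"] = "/models/meta_a:/models/meta_b"  # optional

cat = processor._create_cat()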
Example #4
    def __init__(self):
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # os.getenv returns a string, so parse it instead of using it as a truthy value
        self.cat.spacy_cat.train = os.getenv("APP_TRAINING_MODE", "false").lower() in ("true", "1")
        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')
Example #5
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            try:
                cdb = CDB.load(cdb_path)
            except KeyError as ke:
                mc_v = pkg_resources.get_distribution('medcat').version
                if int(mc_v.split('.')[0]) > 0:
                    log.error(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x'
                    )
                    raise Exception(
                        'Attempted to load MedCAT v0.x model with MCTrainer v1.x',
                        'Please re-configure this project to use a MedCAT v1.x CDB or consult the '
                        'MedCATTrainer Dev team if you believe this should work'
                    ) from ke
                raise

            custom_config = os.getenv("MEDCAT_CONFIG_FILE")
            if custom_config is not None and os.path.exists(custom_config):
                cdb.config.parse_config_file(path=custom_config)
            else:
                log.info(
                    "No MEDCAT_CONFIG_FILE env var set to valid path, using default config available on CDB"
                )
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab.load(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
        CAT_MAP[cat_id] = cat
    return cat
Example #6
    @classmethod
    def setUpClass(cls) -> None:
        cls.cdb = CDB.load(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "cdb.dat"))
        cls.vocab = Vocab.load(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab.dat"))
        cls.cdb.config.ner['min_name_len'] = 2
        cls.cdb.config.ner['upper_case_limit_len'] = 3
        cls.cdb.config.general['spell_check'] = True
        cls.cdb.config.linking['train_count_threshold'] = 10
        cls.cdb.config.linking['similarity_threshold'] = 0.3
        cls.cdb.config.linking['train'] = True
        cls.cdb.config.linking['disamb_length_limit'] = 5
        cls.cdb.config.general['full_unlink'] = True
        cls.undertest = CAT(cdb=cls.cdb,
                            config=cls.cdb.config,
                            vocab=cls.vocab)
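A hedged follow-up test exercising the fixture above; the sample sentence is illustrative, and entities are read off the spacy doc extension as in Example #13.

    def test_annotates_document(self):
        # Illustrative check: the exact entities depend on the example cdb.dat
        doc = self.undertest("The patient presented with kidney failure")
        self.assertIsNotNone(doc._.ents)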
Example #7
    def _create_cat(self):
        """
        Loads MedCAT resources and creates CAT instance
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug('Loading VOCAB ...')
        vocab = Vocab()
        vocab.load_dict(path=os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug('Loading CDB ...')
        cdb = CDB()
        cdb.load_dict(path=os.getenv("APP_MODEL_CDB_PATH"))

        # Apply CUI filter if provided
        if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
            self.log.debug('Applying CDB CUI filter ...')
            with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
                all_lines = (line.rstrip() for line in cui_file)
                selected_cuis = [line for line in all_lines
                                 if line]  # filter blank lines
                cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
            self.log.debug('Loading META annotations ...')
            for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(':'):
                m = MetaCAT(save_dir=model_path)
                m.load()
                meta_models.append(m)

        return CAT(cdb=cdb, vocab=vocab, meta_cats=meta_models)
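The CUI filter file referenced by APP_MODEL_CUI_FILTER_PATH is read as one CUI per line with blank lines skipped; a sketch of producing such a file (path and CUIs are illustrative):

with open("/cat/models/cui_filter.txt", "w") as f:
    f.write("C0011849\n")   # illustrative CUI (diabetes mellitus)
    f.write("C0020538\n")   # illustrative CUI (hypertensive disease)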
Example #8
import os

from tqdm import tqdm

from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.cdb import CDB

vocab = Vocab()
vocab.load_dict(os.environ["MEDCAT_VOCAB_FILE"])
print("Loaded Vocab")

# Load the cdb model you downloaded
cdb = CDB()
cdb.load_dict(os.environ["MEDCAT_CDB_FILE"]) 
print("Loaded CDB")

# create cat
cat = CAT(cdb=cdb, vocab=vocab)
cat.spacy_cat.TUI_FILTER = ['T047', 'T048', 'T184']

tqdm.pandas()

def get_entities(text):
    doc = cat.get_entities(text)
    relevant_entities = []
    for ent in doc:
        if "icd10" in ent["info"]:
            ent_string = text[ent["start"]:ent["end"]]
            if ent_string.lower() in ["ms", "mr", "mrs"]:
                continue
            cui = ent["cui"]
            icd_codes = tuple(sorted([x["chapter"] for x in ent["info"]["icd10"]]))
            if "R69" in icd_codes:
                # assumed completion (the original snippet is truncated here):
                # skip R69, the ICD-10 code for ill-defined causes of morbidity
                continue
            relevant_entities.append({"cui": cui, "text": ent_string, "icd10": icd_codes})
    return relevant_entities
Example #9
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, test_size=0.1, lr=1, groups=None, **kwargs):
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    use_groups = False
    if groups is not None:
        use_groups = True

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    examples = {}
    for i in range(cv):
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        # Add groups if they exist
        if groups is not None:
            for cui in cdb.cui2info.keys():
                if "group" in cdb.cui2info[cui]:
                    del cdb.cui2info[cui]['group']
            with open("./groups.json") as f:
                groups = json.load(f)
            for k,v in groups.items():
                for val in v:
                    cat.add_cui_to_group(val, k)

        fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
            data_path=data_path, lr=lr, test_size=test_size,
            use_groups=use_groups, nepochs=nepochs, **kwargs)

        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts, examples
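The per-CUI lists returned by run_cv can then be reduced to summary scores across the folds; a minimal sketch using the standard library, with the three paths assumed to be defined.

from statistics import mean

fps, fns, tps, ps, rs, f1s, cui_counts, examples = run_cv(
    cdb_path, data_path, vocab_path, cv=10)
mean_f1 = {cui: mean(scores) for cui, scores in f1s.items()}
for cui, score in sorted(mean_f1.items(), key=lambda kv: kv[1]):
    print(cui, round(score, 3))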
Example #10
def run_cv(cdb_path,
           data_path,
           vocab_path,
           cv=100,
           nepochs=16,
           reset_cui_count=True,
           test_size=0.1):
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    for i in range(cv):
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        fp, fn, tp, p, r, f1, cui_counts = cat.train_supervised(
            data_path=data_path,
            lr=1,
            nepochs=nepochs,
            anneal=True,
            print_stats=True,
            use_filters=True,
            reset_cui_count=reset_cui_count,
            terminate_last=True,
            test_size=test_size)

        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts
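A hedged invocation of this variant with illustrative file names; reset_cui_count=False presumably keeps the concept counts already accumulated in the CDB rather than resetting them before training.

fps, fns, tps, ps, rs, f1s, cui_counts = run_cv(
    "cdb.dat", "MedCAT_Export.json", "vocab.dat",
    cv=5, nepochs=8, reset_cui_count=False, test_size=0.2)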
Example #11
class MedCatProcessor(NlpProcessor):
    """"
    MedCAT Processor class is wrapper over MedCAT that implements annotations extractions functionality
    (both single and bulk processing) that can be easily exposed for an API.
    """
    def __init__(self):
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # os.getenv returns a string, so parse it instead of using it as a truthy value
        self.cat.spacy_cat.train = os.getenv("APP_TRAINING_MODE", "false").lower() in ("true", "1")
        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')

    def get_app_info(self):
        """
        Returns general information about the application
        :return: application information stored as KVPs
        """
        return {
            'name': self.app_name,
            'language': self.app_lang,
            'version': self.app_version,
            'model': self.app_model
        }

    def process_content(self, content):
        """
        Processes a single document extracting the annotations.
        :param content: document to be processed, containing 'text' field.
        :return: processing result containing document with extracted annotations stored as KVPs.
        """
        if 'text' not in content:
            error_msg = "'text' field missing in the payload content."
            nlp_result = {
                'success': False,
                'errors': [error_msg],
                'timestamp': NlpProcessor._get_timestamp()
            }
            return nlp_result, False

        text = content['text']

        # assume that a blank document is a valid document and process it only
        # when it contains any non-blank characters
        if text is not None and len(text.strip()) > 0:
            entities = self.cat.get_entities(text)
        else:
            entities = []

        nlp_result = {
            'text': text,
            'annotations': entities,
            'success': True,
            'timestamp': NlpProcessor._get_timestamp()
        }

        # append the footer
        if 'footer' in content:
            nlp_result['footer'] = content['footer']

        return nlp_result

    def process_content_bulk(self, content):
        """
        Processes an array of documents extracting the annotations.
        :param content: array of documents to be processed, each containing a 'text' field.
        :return: processing result containing documents with extracted annotations, stored as KVPs.
        """

        # process at least 10 docs per thread and don't bother starting
        # additional threads when fewer documents were provided
        min_doc_per_thread = 10
        num_slices = max(1, int(len(content) / min_doc_per_thread))
        batch_size = min(300, num_slices)

        if batch_size >= self.bulk_nproc:
            nproc = self.bulk_nproc
        else:
            batch_size = min_doc_per_thread
            nproc = max(1, num_slices)
            if len(content) > batch_size * nproc:
                nproc += 1

        # use generators both to provide input documents and to provide resulting annotations
        # to avoid too many mem-copies
        invalid_doc_ids = []
        ann_res = self.cat.multi_processing(
            MedCatProcessor._generate_input_doc(content, invalid_doc_ids),
            nproc=nproc,
            batch_size=batch_size)

        return MedCatProcessor._generate_result(content, ann_res,
                                                invalid_doc_ids)

    # helper generator functions to avoid multiple copies of data
    #
    @staticmethod
    def _generate_input_doc(documents, invalid_doc_idx):
        """
        Generator function returning documents to be processed as a list of tuples:
          (idx, text), (idx, text), ...
        Skips empty documents and reports their ids to the invalid_doc_idx array
        :param documents: array of input documents that contain 'text' field
        :param invalid_doc_idx:  array that will contain invalid document idx
        :return: consecutive tuples of (idx, document)
        """
        for i in range(len(documents)):
            # process the document only when it is not blank
            doc = documents[i]
            if 'text' in doc and doc['text'] is not None and len(doc['text'].strip()) > 0:
                yield i, doc['text']
            else:
                invalid_doc_idx.append(i)

    @staticmethod
    def _generate_result(in_documents, annotations, invalid_doc_idx):
        """
        Generator function merging the resulting annotations with the input documents.
        The result for documents that were invalid will not contain any annotations.
        :param in_documents: array of input documents that contain 'text' field
        :param annotations: array of annotations extracted from documents
        :param invalid_doc_idx: array of invalid document idx
        :return: consecutive result documents with annotations merged in
        """
        # generate output for valid annotations
        for res in annotations:
            res_idx = res[0]
            in_ct = in_documents[res_idx]

            # parse the result
            out_res = {
                'text': res[1]["text"],
                'annotations': res[1]["entities"],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

        # generate output for invalid documents
        for i in invalid_doc_idx:
            in_ct = in_documents[i]

            out_res = {
                'text': in_ct["text"],
                'annotations': [],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

    @staticmethod
    def _get_medcat_version():
        """
        Returns the version string of the MedCAT module as reported by pip
        :return:
        """
        try:
            import subprocess
            result = subprocess.check_output(['pip', 'show', 'medcat'],
                                             universal_newlines=True)
            version_line = list(
                filter(lambda v: 'Version' in v, result.split('\n')))
            return version_line[0].split(' ')[1]
        except Exception:
            raise Exception("Cannot read the MedCAT library version")
Example #12
    if not os.path.exists(vocab_path):
        vocab_url = os.getenv('VOCAB_URL')
        urlretrieve(vocab_url, vocab_path)

    if not os.path.exists(cdb_path):
        cdb_url = os.getenv('CDB_URL')
        print("*" * 399)
        print(cdb_url)
        urlretrieve(cdb_url, cdb_path)

    vocab = Vocab()
    vocab.load_dict(vocab_path)
    cdb = CDB()
    cdb.load_dict(cdb_path)
    mc_negated = MetaCAT(save_dir=neg_path)
    mc_negated.load()
    cat = CAT(cdb=cdb, vocab=vocab, meta_cats=[mc_negated])
    cat.spacy_cat.MIN_ACC = 0.30
    cat.spacy_cat.MIN_ACC_TH = 0.30
    cat.spacy_cat.ACC_ALWAYS = True
except Exception as e:
    print(str(e))


def get_html_and_json(text):
    doc = cat(text)

    a = json.loads(cat.get_json(text))
    for i in range(len(a['entities'])):
        ent = a['entities'][i]

        new_ent = {}
Example #13
import os

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.vocab import Vocab
from medcat.config import Config

vocab_path = "./tmp_vocab.dat"
if not os.path.exists(vocab_path):
    import requests
    tmp = requests.get("https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat")
    with open(vocab_path, 'wb') as f:
        f.write(tmp.content)

config = Config()
cdb = CDB.load("./tmp_cdb.dat", config=config)
vocab = Vocab.load(vocab_path)

cdb.reset_training()

cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
cat.config.ner['min_name_len'] = 3
cat.config.ner['upper_case_limit_len'] = 3
cat.config.linking['disamb_length_limit'] = 3
cat.config.linking['filters'] = {'cuis': set()}
cat.config.linking['train_count_threshold'] = -1
cat.config.linking['context_vector_sizes'] = {'xlong': 27, 'long': 18, 'medium': 9, 'short': 3}
cat.config.linking['context_vector_weights'] = {'xlong': 0, 'long': 0.4, 'medium': 0.4, 'short': 0.2}
cat.config.linking['weighted_average_function'] = lambda step: max(0.1, 1-(step**2*0.0004))

cat.config.linking['similarity_threshold_type'] = 'dynamic'
cat.config.linking['similarity_threshold'] = 0.35
cat.config.linking['calculate_dynamic_threshold'] = True

cat.train(df.text.values, fine_tune=True)  # df: assumed pandas DataFrame with a 'text' column
cdb = CDB()
# cdb.load_dict(os.path.join(medcat_path, 'simple_cdb.csv'))


# If you need a special CDB you can build one from a .csv file
# (PrepareCDB comes from the older v0 API: from medcat.prepare_cdb import PrepareCDB)
preparator = PrepareCDB(vocab=vocab)
# csv_paths = [os.path.join(medcat_path, 'simple_cdb.csv')]  # add '<another one>', ... as needed
csv_paths = [os.path.join(medcat_path, 'attention_cdb.csv')]
cdb = preparator.prepare_csvs(csv_paths)

# Save the new CDB for later
cdb.save_dict(os.path.join(medcat_path, 'simple_cdb.cdb'))

# To annotate documents we do
doc = "My simple document with kidney failure"
cat = CAT(cdb=cdb, vocab=vocab)
cat.train = False
doc_spacy = cat(doc)
# Entities are in
doc_spacy._.ents
# Or to get a json
doc_json = cat.get_json(doc)

# To have a look at the results:
from spacy import displacy
# Note that this will not show all entities, but only the longest ones
displacy.serve(doc_spacy, style='ent')

# To run cat on a large number of documents
data = [] # [(<doc_id>, <text>), (<doc_id>, <text>), ...]
docs = cat.multi_processing(data)
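multi_processing yields (doc_id, result) pairs matching the input tuples, with each result shaped like the JSON output above; a sketch of consuming them (ids and texts illustrative).

data = [(1, "Patient with kidney failure"), (2, "No significant history")]
for doc_id, res in cat.multi_processing(data):
    print(doc_id, len(res["entities"]))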