def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return the CAT instance for ``project``, creating and caching it on demand.

    The three mapping arguments are used as caches and are updated in place:

    :param CDB_MAP: cache of loaded CDB objects keyed by concept-db id
    :param VOCAB_MAP: cache of loaded Vocab objects keyed by vocab id
    :param CAT_MAP: cache of CAT objects keyed by ``"<cdb_id>-<vocab_id>"``
    :param project: object exposing ``concept_db.id``, ``concept_db.cdb_file.path``,
        ``vocab.id`` and ``vocab.vocab_file.path``
    :return: the cached or newly created CAT instance
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = "-".join((str(cdb_id), str(vocab_id)))

    # Fast path: a CAT for this (cdb, vocab) pair was already built.
    if cat_id in CAT_MAP:
        return CAT_MAP[cat_id]

    # Resolve the concept database, loading it from disk on a cache miss.
    if cdb_id not in CDB_MAP:
        concept_db = CDB()
        concept_db.load_dict(project.concept_db.cdb_file.path)
        CDB_MAP[cdb_id] = concept_db
    else:
        concept_db = CDB_MAP[cdb_id]

    # Resolve the vocabulary the same way.
    if vocab_id not in VOCAB_MAP:
        vocabulary = Vocab()
        vocabulary.load_dict(project.vocab.vocab_file.path)
        VOCAB_MAP[vocab_id] = vocabulary
    else:
        vocabulary = VOCAB_MAP[vocab_id]

    new_cat = CAT(cdb=concept_db, vocab=vocabulary)
    new_cat.train = False
    CAT_MAP[cat_id] = new_cat
    return new_cat
def _create_cat(self): """ Loads MedCAT resources and creates CAT instance """ if os.getenv("APP_MODEL_VOCAB_PATH") is None: raise ValueError( "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified") if os.getenv("APP_MODEL_CDB_PATH") is None: raise Exception( "Concept database (env: APP_MODEL_CDB_PATH) not specified") # Vocabulary and Concept Database are mandatory self.log.debug('Loading VOCAB ...') vocab = Vocab() vocab.load_dict(path=os.getenv("APP_MODEL_VOCAB_PATH")) self.log.debug('Loading CDB ...') cdb = CDB() cdb.load_dict(path=os.getenv("APP_MODEL_CDB_PATH")) # Apply CUI filter if provided if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None: self.log.debug('Applying CDB CUI filter ...') with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file: all_lines = (line.rstrip() for line in cui_file) selected_cuis = [line for line in all_lines if line] # filter blank lines cdb.filter_by_cui(selected_cuis) # Meta-annotation models are optional meta_models = [] if os.getenv("APP_MODEL_META_PATH_LIST") is not None: self.log.debug('Loading META annotations ...') for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(':'): m = MetaCAT(save_dir=model_path) m.load() meta_models.append(m) return CAT(cdb=cdb, vocab=vocab, meta_cats=meta_models)
import os from argparse import ArgumentParser import pandas as pd from tqdm import tqdm import numpy as np from medcat.cat import CAT from medcat.utils.vocab import Vocab from medcat.cdb import CDB vocab = Vocab() vocab.load_dict(os.environ["MEDCAT_VOCAB_FILE"]) print("Loaded Vocab") # Load the cdb model you downloaded cdb = CDB() cdb.load_dict(os.environ["MEDCAT_CDB_FILE"]) print("Loaded CDB") # create cat cat = CAT(cdb=cdb, vocab=vocab) cat.spacy_cat.TUI_FILTER = ['T047', 'T048', 'T184'] tqdm.pandas() def get_entities(text) : doc = cat.get_entities(text) relevant_entities = [] for ent in doc : if "icd10" in ent["info"] :
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, test_size=0.1, lr=1, groups=None, **kwargs):
    """Run repeated supervised-training evaluations and collect per-key metrics.

    For each of ``cv`` rounds a fresh CDB, Vocab and CAT are loaded from disk,
    ``CAT.train_supervised`` is called, and the per-key metrics it returns are
    appended to running lists.

    :param cdb_path: path of the concept database file to load each round
    :param data_path: path passed to ``train_supervised`` as ``data_path``
    :param vocab_path: path of the vocabulary file to load each round
    :param cv: number of rounds to run
    :param nepochs: forwarded to ``train_supervised``
    :param test_size: forwarded to ``train_supervised``
    :param lr: learning rate forwarded to ``train_supervised``
    :param groups: when not None, grouping is enabled; the group definitions
        themselves are read from ``./groups.json`` (the value passed in is
        only used as an on/off switch)
    :param kwargs: extra keyword arguments forwarded to ``train_supervised``
    :return: tuple ``(fps, fns, tps, ps, rs, f1s, cui_counts, examples)``;
        the first six map each key to a list with one value per round,
        ``cui_counts`` and ``examples`` are those of the last round
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    use_groups = groups is not None

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    examples = {}

    for _ in range(cv):
        # Reload everything so rounds do not share trained state.
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        # Add groups if they exist
        if use_groups:
            # Drop any group stored in the CDB before applying the new ones.
            for info in cdb.cui2info.values():
                info.pop('group', None)

            # NOTE(review): definitions come from a fixed relative path, not
            # from the `groups` argument - confirm this is intended.
            with open("./groups.json") as groups_file:
                group_map = json.load(groups_file)
            for group_name, cuis in group_map.items():
                for cui in cuis:
                    cat.add_cui_to_group(cui, group_name)

        # `lr` was previously hard-coded to 1 here, ignoring the parameter.
        fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
            data_path=data_path,
            lr=lr,
            test_size=test_size,
            use_groups=use_groups,
            nepochs=nepochs,
            **kwargs)

        for key in f1:
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            # Counts may lack a key when nothing was recorded for it.
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts, examples
from flask import Flask from flask import Response import json from medcat.cdb import CDB from medcat.utils.vocab import Vocab from medcat.cat import CAT from flask import request import os vocab = Vocab() cdb = CDB() cdb.load_dict(os.getenv("CDB_MODEL", '/cat/models/med_ann_norm.dat')) vocab.load_dict( path=os.getenv("VOCAB_MODEL", '/cat/models/med_ann_norm_dict.dat')) cat = CAT(cdb, vocab=vocab) cat.spacy_cat.train = False app = Flask(__name__) app_name = 'MEDCAT' app_lang = 'en' app_version = os.getenv("CAT_VERSION", '0.1.0') @app.route('/api/info', methods=['GET']) def info(): app_info = {'name': app_name, 'language': app_lang, 'version': app_version} return Response(response=json.dumps(app_info), status=200,
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, reset_cui_count=True, test_size=0.1):
    """Repeat supervised training ``cv`` times and gather per-key metrics.

    Every round loads a fresh CDB/Vocab/CAT from the given paths, calls
    ``CAT.train_supervised`` and appends the returned per-key values to
    running lists.

    :param cdb_path: concept database file loaded at the start of each round
    :param data_path: forwarded to ``train_supervised`` as ``data_path``
    :param vocab_path: vocabulary file loaded at the start of each round
    :param cv: number of rounds
    :param nepochs: forwarded to ``train_supervised``
    :param reset_cui_count: forwarded to ``train_supervised``
    :param test_size: forwarded to ``train_supervised``
    :return: ``(fps, fns, tps, ps, rs, f1s, cui_counts)``; the first six map
        each key to a list of per-round values, ``cui_counts`` is the value
        from the last round
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    f1s, ps, rs = {}, {}, {}
    tps, fns, fps = {}, {}, {}
    cui_counts = {}

    for _round in range(cv):
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)

        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        fp, fn, tp, p, r, f1, cui_counts = cat.train_supervised(
            data_path=data_path,
            lr=1,
            nepochs=nepochs,
            anneal=True,
            print_stats=True,
            use_filters=True,
            reset_cui_count=reset_cui_count,
            terminate_last=True,
            test_size=test_size)

        for key in f1:
            # Pair every accumulator with this round's value for `key`;
            # count dicts fall back to 0 when the key is absent.
            round_values = (
                (f1s, f1[key]),
                (ps, p[key]),
                (rs, r[key]),
                (tps, tp.get(key, 0)),
                (fps, fp.get(key, 0)),
                (fns, fn.get(key, 0)),
            )
            for bucket, value in round_values:
                bucket.setdefault(key, []).append(value)

    return fps, fns, tps, ps, rs, f1s, cui_counts
class MedCatProcessor(NlpProcessor):
    """
    MedCAT Processor class is a wrapper over MedCAT that implements annotation
    extraction functionality (both single and bulk processing) that can
    be easily exposed for an API.
    """

    def __init__(self):
        """
        Loads the CDB and vocabulary and creates the CAT instance.

        Environment variables read (with defaults):
        APP_MODEL_NAME ('unknown'), APP_MODEL_CDB_PATH ('/cat/models/cdb.dat'),
        APP_MODEL_VOCAB_PATH ('/cat/models/vocab.dat'),
        APP_TRAINING_MODE (disabled), APP_BULK_NPROC (8).
        """
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # os.getenv returns a string, so "False"/"0" would be truthy if used
        # directly; parse the value into a real boolean instead.
        self.cat.spacy_cat.train = MedCatProcessor._env_flag("APP_TRAINING_MODE")
        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')

    def get_app_info(self):
        """
        Returns general information about the application
        :return: application information stored as KVPs
        """
        return {
            'name': self.app_name,
            'language': self.app_lang,
            'version': self.app_version,
            'model': self.app_model
        }

    def process_content(self, content):
        """
        Processes a single document extracting the annotations.
        :param content: document to be processed, containing 'text' field.
        :return: processing result containing document with extracted
                 annotations stored as KVPs. When the 'text' field is missing
                 a tuple (error_result, False) is returned instead.
        """
        if 'text' not in content:
            error_msg = "'text' field missing in the payload content."
            nlp_result = {
                'success': False,
                'errors': [error_msg],
                'timestamp': NlpProcessor._get_timestamp()
            }
            # NOTE(review): the error path returns a tuple while the success
            # path returns a dict; kept as-is because callers may rely on it.
            return nlp_result, False

        text = content['text']

        # assume that a blank document is a valid document and process it
        # only when it contains any non-blank characters
        if text is not None and len(text.strip()) > 0:
            entities = self.cat.get_entities(text)
        else:
            entities = []

        nlp_result = {
            'text': text,
            'annotations': entities,
            'success': True,
            'timestamp': NlpProcessor._get_timestamp()
        }

        # append the footer
        if 'footer' in content:
            nlp_result['footer'] = content['footer']

        return nlp_result

    def process_content_bulk(self, content):
        """
        Processes an array of documents extracting the annotations.
        :param content: array of documents to be processed, each expected to
                        contain 'text' field.
        :return: generator of processing results containing documents with
                 extracted annotations, stored as KVPs.
        """
        # process at least 10 docs per thread and don't bother with starting
        # additional threads when less documents were provided
        min_doc_per_thread = 10
        num_docs = len(content)
        num_slices = max(1, num_docs // min_doc_per_thread)
        batch_size = min(300, num_slices)

        if batch_size >= self.bulk_nproc:
            nproc = self.bulk_nproc
        else:
            batch_size = min_doc_per_thread
            nproc = max(1, num_slices)
            # one more worker for the documents left over after full batches
            if num_docs > batch_size * nproc:
                nproc += 1

        # use generators both to provide input documents and to provide
        # resulting annotations to avoid too many mem-copies
        invalid_doc_ids = []
        ann_res = self.cat.multi_processing(
            MedCatProcessor._generate_input_doc(content, invalid_doc_ids),
            nproc=nproc, batch_size=batch_size)

        return MedCatProcessor._generate_result(content, ann_res,
                                                invalid_doc_ids)

    # helper generator functions to avoid multiple copies of data
    #
    @staticmethod
    def _generate_input_doc(documents, invalid_doc_idx):
        """
        Generator function returning documents to be processed as a list of
        tuples: (idx, text), (idx, text), ...
        Skips empty documents and reports their ids to the invalid_doc_idx
        array
        :param documents: array of input documents that contain 'text' field
        :param invalid_doc_idx: array that will contain invalid document idx
        :return: consecutive tuples of (idx, document)
        """
        for idx, doc in enumerate(documents):
            text = doc['text'] if 'text' in doc else None
            # assume the document to be processed only when it is not blank
            if text is not None and len(text.strip()) > 0:
                yield idx, text
            else:
                invalid_doc_idx.append(idx)

    @staticmethod
    def _generate_result(in_documents, annotations, invalid_doc_idx):
        """
        Generator function merging the resulting annotations with the input
        documents. The result for documents that were invalid will not
        contain any annotations.
        :param in_documents: array of input documents that contain 'text'
                             field
        :param annotations: array of (idx, result) entries extracted from
                            documents, where result has 'text' and 'entities'
        :param invalid_doc_idx: array of invalid document idx
        :return: consecutive result documents stored as KVPs
        """
        # generate output for valid annotations
        for res in annotations:
            res_idx = res[0]
            in_ct = in_documents[res_idx]

            # parse the result
            out_res = {
                'text': res[1]["text"],
                'annotations': res[1]["entities"],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']
            yield out_res

        # generate output for invalid documents
        for idx in invalid_doc_idx:
            in_ct = in_documents[idx]

            out_res = {
                # a document is also reported as invalid when it has no
                # 'text' key at all, so the key cannot be assumed here
                'text': in_ct.get("text"),
                'annotations': [],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']
            yield out_res

    @staticmethod
    def _env_flag(name):
        """
        Interprets an environment variable as a boolean switch.
        :param name: name of the environment variable
        :return: True when the variable is set to '1', 'true', 'yes' or 'on'
                 (case-insensitive, surrounding blanks ignored); False
                 otherwise, including when it is not set
        """
        raw_value = os.getenv(name)
        if raw_value is None:
            return False
        return raw_value.strip().lower() in ('1', 'true', 'yes', 'on')

    @staticmethod
    def _get_medcat_version():
        """
        Returns the version string of the MedCAT module as reported by pip
        :return: version string taken from the 'Version:' line of
                 `pip show medcat`
        :raises Exception: when pip cannot be run or reports no version
        """
        try:
            import subprocess
            result = subprocess.check_output(['pip', 'show', 'medcat'],
                                             universal_newlines=True)
            for line in result.split('\n'):
                if line.startswith('Version:'):
                    return line.partition(':')[2].strip()
            raise LookupError("no 'Version:' line in pip output")
        except Exception as err:
            # keep the original cause attached for easier debugging
            raise Exception("Cannot read the MedCAT library version") from err
# TODO #neg_path = os.getenv('NEG_PATH', '/tmp/mc_negated') try: if not os.path.exists(vocab_path): vocab_url = os.getenv('VOCAB_URL') urlretrieve(vocab_url, vocab_path) if not os.path.exists(cdb_path): cdb_url = os.getenv('CDB_URL') print("*" * 399) print(cdb_url) urlretrieve(cdb_url, cdb_path) vocab = Vocab() vocab.load_dict(vocab_path) cdb = CDB() cdb.load_dict(cdb_path) # mc_negated = MetaCAT(save_dir=neg_path) # mc_negated.load() # cat = CAT(cdb=cdb, vocab=vocab, meta_cats=[mc_negated]) cat = CAT(cdb=cdb, vocab=vocab) cat.spacy_cat.MIN_ACC = 0.30 cat.spacy_cat.MIN_ACC_TH = 0.30 cat.spacy_cat.ACC_ALWAYS = True except Exception as e: print(str(e)) def get_html_and_json(text): doc = cat(text)
intersection_space = len(source.intersection(target)) target_space = len(target) stats_dict[(mrn_number, account_number)] = intersection_space / target_space return stats_dict, error_log if __name__ == '__main__': print('Loading the vocabulary...') try: vocab = Vocab() vocab.load_dict(path_medcat + '/vocab.dat') except: raise ImportError('vocab and script should be in same directory') print('Loading the weights. This will take time...') try: cdb = CDB() cdb.load_dict(path_medcat + '/umls_base_wlink_fixed_x_avg_2m_mimic.dat') except: raise ImportError('weights and script should be in same directory') print('Building the model...') cat = CAT(cdb=cdb, vocab=vocab) print('Building mrn directories...')
semgroups_fn = '../data/umls_semgroups.txt' cols = ['UMLS_CUI', 'SNOMED_FSN', 'SNOMED_CID'] snomed_df = pd.read_csv(snomed_core_fn, delimiter='|')[cols] core_cui_set = set(snomed_df['UMLS_CUI'].tolist()) # sem_group_acronym|sem_group|tui|tui_description sem_groups_df = pd.read_csv(semgroups_fn, delimiter='|').dropna() tui_group_map = dict( zip(sem_groups_df['tui'].tolist(), sem_groups_df['sem_group'].tolist())) vocab = Vocab() print('Loading vocabulary...') # Load the vocab model you downloaded vocab.load_dict('../data/medcat/vocab.dat') # Load the cdb model you downloaded cdb = CDB() print('Loading model...') cdb.load_dict('../data/medcat/cdb.dat') # create cat print('Creating MedCAT pipeline...') cat = CAT(cdb=cdb, vocab=vocab) print('Loading Spacy...') sentencizer = spacy.load( 'en_core_sci_lg', disable=['tagger', 'parser', 'ner', 'textcat']) sentencizer.add_pipe(sentencizer.create_pipe('sentencizer')) print('Loading UMLS entity linker...')
# Example script: loads a MedCAT vocabulary, builds a concept database (CDB)
# from a CSV file with PrepareCDB and saves it next to the other model files.
from code_utils.global_variables import *
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.prepare_cdb import PrepareCDB
from medcat.cdb import CDB
import os
import spacy
# nlp = spacy.load(spacy_en_path, disable=['ner', 'parser'])

# NOTE(review): hard-coded, machine-specific directory; every file below is
# resolved relative to it.
medcat_path = r'C:\Users\K1774755\PycharmProjects\toy-models\MedCat'

vocab = Vocab()
# Load the vocab model you just downloaded
vocab.load_dict(os.path.join(medcat_path, 'med_ann_norm_dict.dat'))

# If you have an existing CDB
# (this empty CDB is replaced further down by the one built from CSV)
cdb = CDB()
# cdb.load_dict(os.path.join(medcat_path, 'simple_cdb.csv'))

# If you need a special CDB you can build one from a .csv file
preparator = PrepareCDB(vocab=vocab)
# The first assignment is immediately overwritten by the second one, so only
# 'attention_cdb.csv' is actually used.
csv_paths = [os.path.join(medcat_path, 'simple_cdb.csv')]#, '<another one>', ...]
csv_paths = [os.path.join(medcat_path, 'attention_cdb.csv')]
cdb = preparator.prepare_csvs(csv_paths)

# Save the new CDB for later
# (the output is named 'simple_cdb.cdb' although it is built from
# 'attention_cdb.csv')
cdb.save_dict(os.path.join(medcat_path, 'simple_cdb.cdb'))

# To annotate documents we do
doc = "My simple document with kidney failure"