Esempio n. 1
0
def _import_concepts(id):
    """
    Create a Concept row for every CUI of a ConceptDB that is not stored yet.

    The MedCAT CDB file attached to the ConceptDB is loaded. Each CUI that has
    no Concept row for this CDB, and for which a display name can be found,
    is saved; set_icd_info_objects and set_opcs_info_objects are then called
    for the new row.

    :param id: primary key of the ConceptDB whose concepts are imported
    """
    from medcat.cdb import CDB

    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB()
    cdb.load_dict(concept_db.cdb_file.path)
    tui_filter = None  # no TUI restriction: every TUI is accepted

    # CUIs that already have a Concept row for this CDB
    known_cuis = set(
        Concept.objects.filter(cdb=id).values_list('cui', flat=True))

    for cui in cdb.cui2names:
        if cui in known_cuis:
            continue

        # Prefer the pretty name, otherwise fall back to any original name
        if cui in cdb.cui2pretty_name:
            display_name = cdb.cui2pretty_name[cui]
        elif (cui in cdb.cui2original_names
              and len(cdb.cui2original_names[cui]) > 0):
            display_name = next(iter(cdb.cui2original_names[cui]))
        else:
            display_name = None

        tui = cdb.cui2tui.get(cui, 'unk')
        if display_name is None:
            continue
        if tui_filter is not None and tui not in tui_filter:
            continue

        new_concept = Concept()
        new_concept.pretty_name = display_name
        new_concept.cui = cui
        new_concept.tui = tui
        new_concept.semantic_type = cdb.tui2name.get(tui, '')
        new_concept.desc = cdb.cui2desc.get(cui, '')
        new_concept.synonyms = ", ".join(cdb.cui2original_names.get(cui, []))
        new_concept.cdb = concept_db
        new_concept.save()
        set_icd_info_objects(cdb, new_concept, cui)
        set_opcs_info_objects(cdb, new_concept, cui)
Esempio n. 2
0
def _import_concepts(id):
    """
    Create a Concept row for every CUI with a pretty name in a ConceptDB.

    The MedCAT CDB file attached to the ConceptDB is loaded and one Concept is
    saved per CUI found in ``cdb.cui2pretty_name``. The import is best-effort:
    a concept that cannot be saved is logged and skipped, and missing ICD-10
    information leaves the ``icd10`` field empty.

    :param id: primary key of the ConceptDB whose concepts are imported
    """
    import logging

    from medcat.cdb import CDB

    log = logging.getLogger(__name__)

    concept_db = ConceptDB.objects.get(id=id)
    cdb = CDB()
    cdb.load_dict(concept_db.cdb_file.path)
    tuis = None  # no TUI restriction: every TUI is accepted

    for cui in cdb.cui2pretty_name:
        tui = cdb.cui2tui.get(cui, 'unk')
        if tuis is not None and tui not in tuis:
            continue

        concept = Concept()
        concept.pretty_name = cdb.cui2pretty_name.get(cui, '')
        concept.cui = cui
        concept.tui = tui
        concept.semantic_type = cdb.tui2name.get(tui, '')
        concept.desc = cdb.cui2desc.get(cui, '')
        concept.synonyms = ",".join(cdb.cui2original_names.get(cui, []))
        concept.cdb = concept_db

        # One "chapter | name" line per ICD-10 entry. Entries collected before
        # a malformed one are kept, as before.
        icd10_lines = []
        try:
            for pair in cdb.cui2info[cui]['icd10']:
                icd10_lines.append(pair['chapter'] + " | " + pair['name'])
        except (KeyError, TypeError, AttributeError):
            # No (or malformed) ICD-10 info for this CUI
            pass
        # The original called icd10.strip() without using the result, so the
        # trailing newline was stored; the stripped text is stored now.
        concept.icd10 = "\n".join(icd10_lines).strip()
        #concept.vocab = cdb.cui2ontos.get(cui, '')

        try:
            concept.save()
        except Exception:
            # Keep importing the remaining concepts, but leave a trace
            log.warning("Could not save concept %s", cui, exc_info=True)
Esempio n. 3
0
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """
    Return the CAT instance for a project, building and caching it on demand.

    CAT instances are cached in CAT_MAP under "<cdb id>-<vocab id>". On a cache
    miss the CDB and Vocab are taken from CDB_MAP / VOCAB_MAP, or loaded from
    the project's files and stored in those maps, before the CAT is created.

    :param CDB_MAP: dict of concept-db id -> loaded CDB (updated in place)
    :param VOCAB_MAP: dict of vocab id -> loaded Vocab (updated in place)
    :param CAT_MAP: dict of "<cdb id>-<vocab id>" -> CAT (updated in place)
    :param project: object exposing `concept_db` and `vocab`
    :return: the CAT instance for the project
    """
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = "-".join((str(cdb_id), str(vocab_id)))

    # Fast path: this CDB/Vocab combination was already assembled
    if cat_id in CAT_MAP:
        return CAT_MAP[cat_id]

    if cdb_id not in CDB_MAP:
        loaded_cdb = CDB()
        loaded_cdb.load_dict(project.concept_db.cdb_file.path)
        CDB_MAP[cdb_id] = loaded_cdb
    cdb = CDB_MAP[cdb_id]

    if vocab_id not in VOCAB_MAP:
        loaded_vocab = Vocab()
        loaded_vocab.load_dict(project.vocab.vocab_file.path)
        VOCAB_MAP[vocab_id] = loaded_vocab
    vocab = VOCAB_MAP[vocab_id]

    cat = CAT(cdb=cdb, vocab=vocab)
    cat.train = False
    CAT_MAP[cat_id] = cat
    return cat
Esempio n. 4
0
    def _create_cat(self):
        """
        Loads MedCAT resources and creates CAT instance
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug('Loading VOCAB ...')
        vocab = Vocab()
        vocab.load_dict(path=os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug('Loading CDB ...')
        cdb = CDB()
        cdb.load_dict(path=os.getenv("APP_MODEL_CDB_PATH"))

        # Apply CUI filter if provided
        if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
            self.log.debug('Applying CDB CUI filter ...')
            with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
                all_lines = (line.rstrip() for line in cui_file)
                selected_cuis = [line for line in all_lines
                                 if line]  # filter blank lines
                cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
            self.log.debug('Loading META annotations ...')
            for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(':'):
                m = MetaCAT(save_dir=model_path)
                m.load()
                meta_models.append(m)

        return CAT(cdb=cdb, vocab=vocab, meta_cats=meta_models)
Esempio n. 5
0
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.utils.vocab import Vocab

# Load the vocabulary from the file named by MEDCAT_VOCAB_FILE.
# `os` was used here without being imported, which raised NameError.
vocab = Vocab()
vocab.load_dict(os.environ["MEDCAT_VOCAB_FILE"])
print("Loaded Vocab")

# Load the cdb model you downloaded (file named by MEDCAT_CDB_FILE)
cdb = CDB()
cdb.load_dict(os.environ["MEDCAT_CDB_FILE"])
print("Loaded CDB")

# create cat
cat = CAT(cdb=cdb, vocab=vocab)
# Set the TUI filter on the underlying spacy_cat component
cat.spacy_cat.TUI_FILTER = ['T047', 'T048', 'T184']

# Register the tqdm progress_apply helpers on pandas objects
tqdm.pandas()

def get_entities(text) :
    doc = cat.get_entities(text)
    relevant_entities = []
    for ent in doc :
        if "icd10" in ent["info"] :
            ent_string = text[ent["start"]:ent['end']]
            if ent_string.lower() in ["ms", "mr", "mrs"] :
Esempio n. 6
0
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, test_size=0.1, lr=1, groups=None, **kwargs):
    """
    Run `cv` independent supervised-training rounds and collect per-key metrics.

    Every round loads a fresh CDB and Vocab, builds a CAT and calls
    `cat.train_supervised`. The metrics of each round are appended to per-key
    lists.

    :param cdb_path: path of the CDB file, reloaded every round
    :param data_path: training data path passed to `train_supervised`
    :param vocab_path: path of the Vocab file, reloaded every round
    :param cv: number of rounds
    :param nepochs: passed to `train_supervised`
    :param test_size: passed to `train_supervised`
    :param lr: learning rate passed to `train_supervised`. It used to be
        ignored in favour of a hard-coded 1, which is still the default.
    :param groups: only acts as a switch. When not None, existing 'group'
        entries are removed from the CDB and the groups are read from
        "./groups.json"; the passed value itself is not used.
    :param kwargs: forwarded to `train_supervised`
    :return: (fps, fns, tps, ps, rs, f1s, cui_counts, examples). The first six
        are dicts of key -> list with one value per round; cui_counts and
        examples come from the last round, or are empty when cv is 0.
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    use_groups = groups is not None

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    examples = {}
    for _ in range(cv):
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        # Add groups if they exist
        if use_groups:
            # Drop any group stored in the CDB before applying the file's groups
            for cui in cdb.cui2info:
                if "group" in cdb.cui2info[cui]:
                    del cdb.cui2info[cui]['group']
            # Context manager closes the file; it used to be left open
            with open("./groups.json") as groups_file:
                group_map = json.load(groups_file)
            for group_name, cuis in group_map.items():
                for cui in cuis:
                    cat.add_cui_to_group(cui, group_name)

        fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
            data_path=data_path,
            lr=lr,
            test_size=test_size,
            use_groups=use_groups,
            nepochs=nepochs,
            **kwargs)

        # f1 drives the key set; counts missing for a key are recorded as 0
        for key in f1:
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts, examples
Esempio n. 7
0
from flask import Flask
from flask import Response
import json
from medcat.cdb import CDB
from medcat.utils.vocab import Vocab
from medcat.cat import CAT
from flask import request
import os

# Module-level MedCAT resources, created once when the module is imported
vocab = Vocab()
cdb = CDB()

# Model locations come from the CDB_MODEL / VOCAB_MODEL environment variables,
# with the paths below as defaults
cdb.load_dict(os.getenv("CDB_MODEL", '/cat/models/med_ann_norm.dat'))
vocab.load_dict(
    path=os.getenv("VOCAB_MODEL", '/cat/models/med_ann_norm_dict.dat'))
cat = CAT(cdb, vocab=vocab)

# Turn the train flag of the spacy_cat component off
cat.spacy_cat.train = False

app = Flask(__name__)

# Static application metadata; the version can be overridden via CAT_VERSION
app_name = 'MEDCAT'
app_lang = 'en'
app_version = os.getenv("CAT_VERSION", '0.1.0')


@app.route('/api/info', methods=['GET'])
def info():
    app_info = {'name': app_name, 'language': app_lang, 'version': app_version}
    return Response(response=json.dumps(app_info),
                    status=200,
Esempio n. 8
0
def run_cv(cdb_path,
           data_path,
           vocab_path,
           cv=100,
           nepochs=16,
           reset_cui_count=True,
           test_size=0.1):
    """
    Run `cv` independent supervised-training rounds and collect per-key metrics.

    Each round reloads the CDB and Vocab, builds a CAT and calls
    `cat.train_supervised`; the per-key results are appended to lists.

    :param cdb_path: path of the CDB file, reloaded every round
    :param data_path: training data path passed to `train_supervised`
    :param vocab_path: path of the Vocab file, reloaded every round
    :param cv: number of rounds
    :param nepochs: passed to `train_supervised`
    :param reset_cui_count: passed to `train_supervised`
    :param test_size: passed to `train_supervised`
    :return: (fps, fns, tps, ps, rs, f1s, cui_counts). The first six are dicts
        of key -> list with one value per round; cui_counts comes from the
        last round, or is empty when cv is 0.
    """
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    f1s, ps, rs = {}, {}, {}
    tps, fns, fps = {}, {}, {}
    cui_counts = {}

    for _ in range(cv):
        round_cdb = CDB()
        round_cdb.load_dict(cdb_path)
        round_vocab = Vocab()
        round_vocab.load_dict(path=vocab_path)

        cat = CAT(round_cdb, vocab=round_vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        train_args = dict(
            data_path=data_path,
            lr=1,
            nepochs=nepochs,
            anneal=True,
            print_stats=True,
            use_filters=True,
            reset_cui_count=reset_cui_count,
            terminate_last=True,
            test_size=test_size,
        )
        fp, fn, tp, p, r, f1, cui_counts = cat.train_supervised(**train_args)

        # f1 drives the key set; counts missing for a key are recorded as 0
        for key in f1:
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts
""" Analysing SNOMED annotations from the MedCAT output """

# Import packages
import json
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from medcat.cdb import CDB
import os


# Load the Concept database (CDB) used for the project.
# The path is hard-coded to a mounted volume; adjust it for your machine.
cdb = CDB()
cdb.load_dict(os.path.join("/Volumes/NO NAME/", "snomed.dat"))


# Load the MedCAT output (a JSON export)
file_path = r"/Volumes/NO NAME/"  # directory of the export; must end with "/" because it is concatenated below
file = r"AShek_project_EXport_MedCAT_Export_With_Text_2020-01-30_16_07_02.json"  # export file name

with open(file_path + file) as f:
    data = json.load(f)

print("The number of projects is: ", len(data['projects']))  # number of projects

# Flatten the documents of all projects into doc_df (one row per document)
doc_df = pd.DataFrame([a for d in data['projects'] for a in d['documents']])
print("The number of documents is", len(doc_df['id']))  # number of documents
# Convert the 'last_modified' column to pandas datetimes
doc_df['last_modified'] = pd.to_datetime(doc_df['last_modified'])

# Read annotations to ann_df
Esempio n. 10
0
class MedCatProcessor(NlpProcessor):
    """
    MedCAT Processor class is a wrapper over MedCAT that implements annotation extraction
    functionality (both single and bulk processing) that can be easily exposed for an API.
    """

    # Values of APP_TRAINING_MODE (case-insensitive) that switch training on
    _TRUE_VALUES = ('1', 'true', 'yes', 'on')

    def __init__(self):
        """
        Loads the CDB and Vocab from the paths given by APP_MODEL_CDB_PATH and
        APP_MODEL_VOCAB_PATH (falling back to defaults) and creates the CAT instance.
        """
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # Environment values are strings, so the raw value "False" is truthy
        # and used to switch training ON. Parse it into a real boolean.
        training_mode = os.getenv("APP_TRAINING_MODE", "False")
        self.cat.spacy_cat.train = (
            training_mode.strip().lower() in self._TRUE_VALUES)
        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')

    def get_app_info(self):
        """
        Returns general information about the application
        :return: application information stored as KVPs
        """
        return {
            'name': self.app_name,
            'language': self.app_lang,
            'version': self.app_version,
            'model': self.app_model
        }

    def process_content(self, content):
        """
        Processes a single document extracting the annotations.
        :param content: document to be processed, containing 'text' field.
        :return: processing result containing document with extracted annotations stored as KVPs.
                 When the 'text' field is missing, a tuple (error result, False) is returned instead.
        """
        if 'text' not in content:
            error_msg = "'text' field missing in the payload content."
            nlp_result = {
                'success': False,
                'errors': [error_msg],
                'timestamp': NlpProcessor._get_timestamp()
            }
            # NOTE(review): this path returns a tuple while the success path
            # returns a plain dict. Kept as is because callers may rely on it.
            return nlp_result, False

        text = content['text']

        # assume that a blank document is a valid document and process it only
        # when it contains any non-blank characters
        if text is not None and len(text.strip()) > 0:
            entities = self.cat.get_entities(text)
        else:
            entities = []

        nlp_result = {
            'text': text,
            'annotations': entities,
            'success': True,
            'timestamp': NlpProcessor._get_timestamp()
        }

        # append the footer
        if 'footer' in content:
            nlp_result['footer'] = content['footer']

        return nlp_result

    def process_content_bulk(self, content):
        """
        Processes an array of documents extracting the annotations.
        :param content: array of documents to be processed, each containing 'text' field.
        :return: generator of processing results, one per document, stored as KVPs.
                 Results of processed documents come first, followed by the blank/invalid ones.
        """

        # process at least 10 docs per thread and don't bother with starting
        # additional threads when less documents were provided
        min_doc_per_thread = 10
        num_slices = max(1, len(content) // min_doc_per_thread)
        batch_size = min(300, num_slices)

        if batch_size >= self.bulk_nproc:
            nproc = self.bulk_nproc
        else:
            batch_size = min_doc_per_thread
            nproc = max(1, num_slices)
            # one extra process for the documents left over after the full batches
            if len(content) > batch_size * nproc:
                nproc += 1

        # use generators both to provide input documents and to provide resulting annotations
        # to avoid too many mem-copies
        invalid_doc_ids = []
        ann_res = self.cat.multi_processing(
            MedCatProcessor._generate_input_doc(content, invalid_doc_ids),
            nproc=nproc,
            batch_size=batch_size)

        return MedCatProcessor._generate_result(content, ann_res,
                                                invalid_doc_ids)

    # helper generator functions to avoid multiple copies of data
    #
    @staticmethod
    def _generate_input_doc(documents, invalid_doc_idx):
        """
        Generator function returning documents to be processed as a list of tuples:
          (idx, text), (idx, text), ...
        Skips empty documents and reports their ids to the invalid_doc_idx array
        :param documents: array of input documents that contain 'text' field
        :param invalid_doc_idx:  array that will contain invalid document idx
        :return: consecutive tuples of (idx, document)
        """
        for idx, doc in enumerate(documents):
            # assume the document to be processed only when it is not blank
            if ('text' in doc and doc['text'] is not None
                    and len(doc['text'].strip()) > 0):
                yield idx, doc['text']
            else:
                invalid_doc_idx.append(idx)

    @staticmethod
    def _generate_result(in_documents, annotations, invalid_doc_idx):
        """
        Generator function merging the resulting annotations with the input documents.
        The result for documents that were invalid will not contain any annotations.
        :param in_documents: array of input documents that contain 'text' field
        :param annotations: iterable of (idx, result) items, where result holds 'text' and 'entities'
        :param invalid_doc_idx: array of invalid document idx
        :return: consecutive result documents stored as KVPs
        """
        # generate output for valid annotations
        for res in annotations:
            res_idx = res[0]
            in_ct = in_documents[res_idx]

            # parse the result
            out_res = {
                'text': res[1]["text"],
                'annotations': res[1]["entities"],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

        # generate output for invalid documents
        for idx in invalid_doc_idx:
            in_ct = in_documents[idx]

            out_res = {
                # a document counts as invalid also when it has no 'text' key
                # at all, so a direct lookup here raised KeyError
                'text': in_ct["text"] if "text" in in_ct else None,
                'annotations': [],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

    @staticmethod
    def _get_medcat_version():
        """
        Returns the version string of the MedCAT module as reported by pip
        :return: the text following 'Version:' in the output of `pip show medcat`
        :raises Exception: when pip cannot be run or reports no version line
        """
        try:
            import subprocess
            result = subprocess.check_output(['pip', 'show', 'medcat'],
                                             universal_newlines=True)
            for line in result.splitlines():
                # match only the 'Version:' field, not other lines that
                # merely contain the word
                if line.startswith('Version:'):
                    return line.split(':', 1)[1].strip()
            raise ValueError("no 'Version:' line in the pip output")
        except Exception as err:
            # chain the root cause so the real failure stays visible
            raise Exception("Cannot read the MedCAT library version") from err
Esempio n. 11
0
""" Analysing SNOMED annotations from the MedCAT output """

# Import packages
import json
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from medcat.cdb import CDB
import os

# Load the Concept database (CDB) used for the project.
# The path is hard-coded to a local drive; adjust it for your machine.
cdb = CDB()
cdb.load_dict(os.path.join("F:/", "snomed.dat"))

# Load the MedCAT output (a JSON export); directory and file name are concatenated below
file_path = r"C:\Users\k1767582\Desktop\MedCat output/"  # alternative location: F:/
file = r"20200110_1000 Epilepsy letters_with_txt.json"  # alternative file: Epilepsy_MedCAT_Export_With_Text_2020-01-28_12_20_08.json

with open(file_path + file) as f:
    data2 = json.load(f)

print("The number of documents is",
      len(data2['documents']))  # number of documents

# Read document information to doc_df (one row per document)
doc_df = pd.DataFrame.from_dict(data2['documents'])
# Convert the 'last_modified' column to pandas datetimes
doc_df['last_modified'] = pd.to_datetime(doc_df['last_modified'])

# Read annotations to ann_df: annotations of all documents flattened, one row per annotation
ann_df = pd.DataFrame(
    [a for d in data2['documents'] for a in d['annotations']])
Esempio n. 12
0
# Download the vocabulary and CDB files when they are not present locally,
# then build the module-level `cat` pipeline.
# NOTE(review): any failure below is only printed, so `cat` (and possibly
# `vocab` / `cdb`) stays undefined and later uses fail with NameError.
# Confirm whether this best-effort start-up is intended.
try:
    if not os.path.exists(vocab_path):
        # VOCAB_URL is read without a default; None is passed on if it is unset
        vocab_url = os.getenv('VOCAB_URL')
        urlretrieve(vocab_url, vocab_path)

    if not os.path.exists(cdb_path):
        cdb_url = os.getenv('CDB_URL')
        # visual separator followed by the URL being downloaded
        print("*" * 399)
        print(cdb_url)
        urlretrieve(cdb_url, cdb_path)

    vocab = Vocab()
    vocab.load_dict(vocab_path)
    cdb = CDB()
    cdb.load_dict(cdb_path)
    #    mc_negated = MetaCAT(save_dir=neg_path)
    #    mc_negated.load()
    #    cat = CAT(cdb=cdb, vocab=vocab, meta_cats=[mc_negated])
    cat = CAT(cdb=cdb, vocab=vocab)
    # Settings applied to the underlying spacy_cat component
    cat.spacy_cat.MIN_ACC = 0.30
    cat.spacy_cat.MIN_ACC_TH = 0.30
    cat.spacy_cat.ACC_ALWAYS = True
except Exception as e:
    print(str(e))


def get_html_and_json(text):
    doc = cat(text)

    a = json.loads(cat.get_json(text))
Esempio n. 13
0
    return stats_dict, error_log


if __name__ == '__main__':

    # Load the vocabulary from the MedCAT directory
    print('Loading the vocabulary...')
    try:
        vocab = Vocab()
        vocab.load_dict(path_medcat + '/vocab.dat')
    except Exception as err:
        # A bare `except:` also turned Ctrl-C into ImportError and hid the
        # real failure; catch Exception only and chain the cause.
        raise ImportError(
            'vocab and script should be in same directory') from err

    # Load the concept database (weights) from the MedCAT directory
    print('Loading the weights. This will take time...')
    try:
        cdb = CDB()
        cdb.load_dict(path_medcat +
                      '/umls_base_wlink_fixed_x_avg_2m_mimic.dat')
    except Exception as err:
        raise ImportError(
            'weights and script should be in same directory') from err

    print('Building the model...')
    cat = CAT(cdb=cdb, vocab=vocab)

    # One path per entry found in the mrn directory
    print('Building mrn directories...')
    mrns = [path_mrns + '/' + x for x in os.listdir(path_mrns)]
    total = len(mrns)

    print('{} available mrns \n'.format(total))
    stats_dicts = {}
    error_logs = []

    for mrn in tqdm(mrns, total=total):
Esempio n. 14
0
        # sem_group_acronym|sem_group|tui|tui_description
        sem_groups_df = pd.read_csv(semgroups_fn, delimiter='|').dropna()
        tui_group_map = dict(
            zip(sem_groups_df['tui'].tolist(),
                sem_groups_df['sem_group'].tolist()))

        vocab = Vocab()
        print('Loading vocabulary...')
        # Load the vocab model you downloaded
        vocab.load_dict('../data/medcat/vocab.dat')

        # Load the cdb model you downloaded
        cdb = CDB()
        print('Loading model...')
        cdb.load_dict('../data/medcat/cdb.dat')

        # create cat
        print('Creating MedCAT pipeline...')
        cat = CAT(cdb=cdb, vocab=vocab)

        print('Loading Spacy...')
        sentencizer = spacy.load(
            'en_core_sci_lg', disable=['tagger', 'parser', 'ner', 'textcat'])
        sentencizer.add_pipe(sentencizer.create_pipe('sentencizer'))
        print('Loading UMLS entity linker...')
        linker = EntityLinker(resolve_abbreviations=True, name='umls')
        cui_to_ent_map = linker.kb.cui_to_entity
        print('Let\'s go get some entities...')

        splits = ['validation', 'train']