Example #1
def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
    """Return a CAT instance for the given project, reusing cached CDB, Vocab and CAT objects."""
    cdb_id = project.concept_db.id
    vocab_id = project.vocab.id
    cat_id = str(cdb_id) + "-" + str(vocab_id)

    if cat_id in CAT_MAP:
        cat = CAT_MAP[cat_id]
    else:
        if cdb_id in CDB_MAP:
            cdb = CDB_MAP[cdb_id]
        else:
            cdb_path = project.concept_db.cdb_file.path
            cdb = CDB()
            cdb.load_dict(cdb_path)
            CDB_MAP[cdb_id] = cdb

        if vocab_id in VOCAB_MAP:
            vocab = VOCAB_MAP[vocab_id]
        else:
            vocab_path = project.vocab.vocab_file.path
            vocab = Vocab()
            vocab.load_dict(vocab_path)
            VOCAB_MAP[vocab_id] = vocab

        cat = CAT(cdb=cdb, vocab=vocab)
        cat.train = False
        CAT_MAP[cat_id] = cat
    return cat
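
A minimal usage sketch (names are illustrative; `project` stands in for a Django-style model with `concept_db` and `vocab` relations, as the attribute access above suggests):

CDB_MAP, VOCAB_MAP, CAT_MAP = {}, {}, {}

# repeated calls for the same project reuse the cached CDB, Vocab and CAT
cat = get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project)
entities = cat.get_entities("Patient presents with kidney failure.")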
Example #2
    def _create_cat(self):
        """
        Loads MedCAT resources and creates CAT instance
        """
        if os.getenv("APP_MODEL_VOCAB_PATH") is None:
            raise ValueError(
                "Vocabulary (env: APP_MODEL_VOCAB_PATH) not specified")

        if os.getenv("APP_MODEL_CDB_PATH") is None:
            raise Exception(
                "Concept database (env: APP_MODEL_CDB_PATH) not specified")

        # Vocabulary and Concept Database are mandatory
        self.log.debug('Loading VOCAB ...')
        vocab = Vocab()
        vocab.load_dict(path=os.getenv("APP_MODEL_VOCAB_PATH"))

        self.log.debug('Loading CDB ...')
        cdb = CDB()
        cdb.load_dict(path=os.getenv("APP_MODEL_CDB_PATH"))

        # Apply CUI filter if provided
        if os.getenv("APP_MODEL_CUI_FILTER_PATH") is not None:
            self.log.debug('Applying CDB CUI filter ...')
            with open(os.getenv("APP_MODEL_CUI_FILTER_PATH")) as cui_file:
                all_lines = (line.rstrip() for line in cui_file)
                selected_cuis = [line for line in all_lines
                                 if line]  # filter blank lines
                cdb.filter_by_cui(selected_cuis)

        # Meta-annotation models are optional
        meta_models = []
        if os.getenv("APP_MODEL_META_PATH_LIST") is not None:
            self.log.debug('Loading META annotations ...')
            for model_path in os.getenv("APP_MODEL_META_PATH_LIST").split(':'):
                m = MetaCAT(save_dir=model_path)
                m.load()
                meta_models.append(m)

        return CAT(cdb=cdb, vocab=vocab, meta_cats=meta_models)
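
The loader above is configured entirely through environment variables; a minimal sketch of wiring them up before instantiation (the paths are placeholders):

import os

os.environ["APP_MODEL_VOCAB_PATH"] = "/cat/models/vocab.dat"
os.environ["APP_MODEL_CDB_PATH"] = "/cat/models/cdb.dat"
# optional: a file with one CUI per line used to filter the CDB
# os.environ["APP_MODEL_CUI_FILTER_PATH"] = "/cat/models/cui_filter.txt"
# optional: colon-separated list of MetaCAT model directories
# os.environ["APP_MODEL_META_PATH_LIST"] = "/cat/models/mc_negated"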
Example #3
import os
from argparse import ArgumentParser

import pandas as pd
from tqdm import tqdm
import numpy as np

from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.cdb import CDB 

vocab = Vocab()
vocab.load_dict(os.environ["MEDCAT_VOCAB_FILE"])
print("Loaded Vocab")

# Load the cdb model you downloaded
cdb = CDB()
cdb.load_dict(os.environ["MEDCAT_CDB_FILE"]) 
print("Loaded CDB")

# create cat
cat = CAT(cdb=cdb, vocab=vocab)
# restrict output to diseases (T047), mental/behavioural dysfunctions (T048)
# and signs/symptoms (T184)
cat.spacy_cat.TUI_FILTER = ['T047', 'T048', 'T184']

tqdm.pandas()

def get_entities(text):
    doc = cat.get_entities(text)
    relevant_entities = []
    for ent in doc:
        # the original snippet is truncated here; collecting the entities
        # that carry an ICD-10 mapping is the natural completion
        if "icd10" in ent["info"]:
            relevant_entities.append(ent)
    return relevant_entities
Example #4
def run_cv(cdb_path, data_path, vocab_path, cv=100, nepochs=16, test_size=0.1, lr=1, groups=None, **kwargs):
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    use_groups = False
    if groups is not None:
        use_groups = True

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    examples = {}
    for i in range(cv):
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        # Add groups if they exist
        if groups is not None:
            for cui in cdb.cui2info.keys():
                if "group" in cdb.cui2info[cui]:
                    del cdb.cui2info[cui]['group']
            # use the mapping passed in as `groups` instead of re-loading a
            # hard-coded ./groups.json, which silently discarded the argument
            for k, v in groups.items():
                for val in v:
                    cat.add_cui_to_group(val, k)

        fp, fn, tp, p, r, f1, cui_counts, examples = cat.train_supervised(
            data_path=data_path, lr=lr, test_size=test_size,
            use_groups=use_groups, nepochs=nepochs, **kwargs)

        # accumulate the per-CUI metrics across folds
        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts, examples
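
A sketch of calling run_cv and averaging the per-CUI F1 scores across folds (the paths are placeholders):

import numpy as np

fps, fns, tps, ps, rs, f1s, cui_counts, examples = run_cv(
    "cdb.dat", "MedCAT_Export.json", "vocab.dat", cv=10, nepochs=16)
mean_f1 = {cui: np.mean(scores) for cui, scores in f1s.items()}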
Example #5
from flask import Flask
from flask import Response
import json
from medcat.cdb import CDB
from medcat.utils.vocab import Vocab
from medcat.cat import CAT
from flask import request
import os

vocab = Vocab()
cdb = CDB()

cdb.load_dict(os.getenv("CDB_MODEL", '/cat/models/med_ann_norm.dat'))
vocab.load_dict(
    path=os.getenv("VOCAB_MODEL", '/cat/models/med_ann_norm_dict.dat'))
cat = CAT(cdb, vocab=vocab)

cat.spacy_cat.train = False

app = Flask(__name__)

app_name = 'MEDCAT'
app_lang = 'en'
app_version = os.getenv("CAT_VERSION", '0.1.0')


@app.route('/api/info', methods=['GET'])
def info():
    app_info = {'name': app_name, 'language': app_lang, 'version': app_version}
    return Response(response=json.dumps(app_info),
                    status=200,
                    mimetype='application/json')
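
A quick client-side check of the endpoint (a sketch, assuming the requests package and the default Flask port):

import requests

info = requests.get("http://localhost:5000/api/info").json()
print(info["name"], info["version"])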
Example #6
def run_cv(cdb_path,
           data_path,
           vocab_path,
           cv=100,
           nepochs=16,
           reset_cui_count=True,
           test_size=0.1):
    from medcat.cat import CAT
    from medcat.utils.vocab import Vocab
    from medcat.cdb import CDB
    import json

    f1s = {}
    ps = {}
    rs = {}
    tps = {}
    fns = {}
    fps = {}
    cui_counts = {}
    for i in range(cv):
        cdb = CDB()
        cdb.load_dict(cdb_path)
        vocab = Vocab()
        vocab.load_dict(path=vocab_path)
        cat = CAT(cdb, vocab=vocab)
        cat.train = False
        cat.spacy_cat.MIN_ACC = 0.30
        cat.spacy_cat.MIN_ACC_TH = 0.30

        fp, fn, tp, p, r, f1, cui_counts = cat.train_supervised(
            data_path=data_path,
            lr=1,
            nepochs=nepochs,
            anneal=True,
            print_stats=True,
            use_filters=True,
            reset_cui_count=reset_cui_count,
            terminate_last=True,
            test_size=test_size)

        # accumulate the per-CUI metrics across folds
        for key in f1.keys():
            f1s.setdefault(key, []).append(f1[key])
            ps.setdefault(key, []).append(p[key])
            rs.setdefault(key, []).append(r[key])
            tps.setdefault(key, []).append(tp.get(key, 0))
            fps.setdefault(key, []).append(fp.get(key, 0))
            fns.setdefault(key, []).append(fn.get(key, 0))

    return fps, fns, tps, ps, rs, f1s, cui_counts
Example #7
class MedCatProcessor(NlpProcessor):
    """"
    MedCAT Processor class is wrapper over MedCAT that implements annotations extractions functionality
    (both single and bulk processing) that can be easily exposed for an API.
    """
    def __init__(self):
        super().__init__()

        self.log.info('Initializing MedCAT processor ...')

        self.app_name = 'MedCAT'
        self.app_lang = 'en'
        self.app_version = MedCatProcessor._get_medcat_version()
        self.app_model = os.getenv("APP_MODEL_NAME", 'unknown')

        self.vocab = Vocab()
        self.cdb = CDB()

        self.cdb.load_dict(
            os.getenv("APP_MODEL_CDB_PATH", '/cat/models/cdb.dat'))
        self.vocab.load_dict(
            path=os.getenv("APP_MODEL_VOCAB_PATH", '/cat/models/vocab.dat'))
        self.cat = CAT(self.cdb, vocab=self.vocab)

        # os.getenv returns a string, so compare explicitly instead of relying
        # on truthiness (the string "False" would otherwise count as True)
        self.cat.spacy_cat.train = os.getenv("APP_TRAINING_MODE", "False").lower() in ("true", "1")
        self.bulk_nproc = int(os.getenv('APP_BULK_NPROC', 8))

        self.log.info('MedCAT processor is ready')

    def get_app_info(self):
        """
        Returns general information about the application
        :return: application information stored as KVPs
        """
        return {
            'name': self.app_name,
            'language': self.app_lang,
            'version': self.app_version,
            'model': self.app_model
        }

    def process_content(self, content):
        """
        Processes a single document, extracting the annotations.
        :param content: document to be processed, containing a 'text' field.
        :return: processing result containing the document with extracted annotations stored as KVPs.
        """
        if 'text' not in content:
            error_msg = "'text' field missing in the payload content."
            nlp_result = {
                'success': False,
                'errors': [error_msg],
                'timestamp': NlpProcessor._get_timestamp()
            }
            return nlp_result, False

        text = content['text']

        # assume that a blank document is a valid document and process it only
        # when it contains non-blank characters
        if text is not None and len(text.strip()) > 0:
            entities = self.cat.get_entities(text)
        else:
            entities = []

        nlp_result = {
            'text': text,
            'annotations': entities,
            'success': True,
            'timestamp': NlpProcessor._get_timestamp()
        }

        # append the footer
        if 'footer' in content:
            nlp_result['footer'] = content['footer']

        return nlp_result

    def process_content_bulk(self, content):
        """
        Processes an array of documents, extracting the annotations.
        :param content: array of documents to be processed, each containing a 'text' field.
        :return: processing result containing documents with extracted annotations, stored as KVPs.
        """

        # process at least 10 docs per thread and don't bother starting
        # additional threads when fewer documents were provided
        min_doc_per_thread = 10
        num_slices = max(1, int(len(content) / min_doc_per_thread))
        batch_size = min(300, num_slices)

        if batch_size >= self.bulk_nproc:
            nproc = self.bulk_nproc
        else:
            batch_size = min_doc_per_thread
            nproc = max(1, num_slices)
            if len(content) > batch_size * nproc:
                nproc += 1

        # use generators both to provide input documents and to provide resulting annotations
        # to avoid too many mem-copies
        invalid_doc_ids = []
        ann_res = self.cat.multi_processing(
            MedCatProcessor._generate_input_doc(content, invalid_doc_ids),
            nproc=nproc,
            batch_size=batch_size)

        return MedCatProcessor._generate_result(content, ann_res,
                                                invalid_doc_ids)

    # helper generator functions to avoid multiple copies of data
    #
    @staticmethod
    def _generate_input_doc(documents, invalid_doc_idx):
        """
        Generator function returning documents to be processed as a list of tuples:
          (idx, text), (idx, text), ...
        Skips empty documents and reports their ids to the invalid_doc_idx array
        :param documents: array of input documents that contain 'text' field
        :param invalid_doc_idx:  array that will contain invalid document idx
        :return: consecutive tuples of (idx, document)
        """
        for i, doc in enumerate(documents):
            # yield the document only when it contains non-blank text
            text = doc.get('text')
            if text is not None and len(text.strip()) > 0:
                yield i, text
            else:
                invalid_doc_idx.append(i)

    @staticmethod
    def _generate_result(in_documents, annotations, invalid_doc_idx):
        """
        Generator function merging the resulting annotations with the input documents.
        The result for documents that were invalid will not contain any annotations.
        :param in_documents: array of input documents that contain 'text' field
        :param annotations: array of annotations extracted from documents
        :param invalid_doc_idx: array of invalid document idx
        :return: consecutive processing results stored as KVPs
        """
        # generate output for valid annotations
        for res_idx, res_doc in annotations:
            in_ct = in_documents[res_idx]

            # parse the result
            out_res = {
                'text': res_doc["text"],
                'annotations': res_doc["entities"],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

        # generate output for invalid documents
        for i in invalid_doc_idx:
            in_ct = in_documents[i]

            out_res = {
                'text': in_ct["text"],
                'annotations': [],
                'success': True,
                'timestamp': NlpProcessor._get_timestamp()
            }
            # append the footer
            if 'footer' in in_ct:
                out_res['footer'] = in_ct['footer']

            yield out_res

    @staticmethod
    def _get_medcat_version():
        """
        Returns the version string of the MedCAT module as reported by pip
        :return: version string as reported by pip
        """
        try:
            import subprocess
            result = subprocess.check_output(['pip', 'show', 'medcat'],
                                             universal_newlines=True)
            version_line = list(
                filter(lambda v: 'Version' in v, result.split('\n')))
            return version_line[0].split(' ')[1]
        except Exception as e:
            raise Exception("Cannot read the MedCAT library version") from e
Example #8
import os
from urllib.request import urlretrieve

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.utils.vocab import Vocab

# vocab_path and cdb_path are defined elsewhere in the original module
# TODO
#neg_path = os.getenv('NEG_PATH', '/tmp/mc_negated')

try:
    if not os.path.exists(vocab_path):
        vocab_url = os.getenv('VOCAB_URL')
        urlretrieve(vocab_url, vocab_path)

    if not os.path.exists(cdb_path):
        cdb_url = os.getenv('CDB_URL')
        print("*" * 399)
        print(cdb_url)
        urlretrieve(cdb_url, cdb_path)

    vocab = Vocab()
    vocab.load_dict(vocab_path)
    cdb = CDB()
    cdb.load_dict(cdb_path)
    #    mc_negated = MetaCAT(save_dir=neg_path)
    #    mc_negated.load()
    #    cat = CAT(cdb=cdb, vocab=vocab, meta_cats=[mc_negated])
    cat = CAT(cdb=cdb, vocab=vocab)
    cat.spacy_cat.MIN_ACC = 0.30
    cat.spacy_cat.MIN_ACC_TH = 0.30
    cat.spacy_cat.ACC_ALWAYS = True
except Exception as e:
    print(str(e))


def get_html_and_json(text):
    doc = cat(text)
Example #9
        intersection_space = len(source.intersection(target))
        target_space = len(target)

        stats_dict[(mrn_number,
                    account_number)] = intersection_space / target_space

    return stats_dict, error_log


if __name__ == '__main__':

    print('Loading the vocabulary...')
    try:
        vocab = Vocab()
        vocab.load_dict(path_medcat + '/vocab.dat')
    except Exception:
        raise ImportError('vocab and script should be in same directory')

    print('Loading the weights. This will take time...')
    try:
        cdb = CDB()
        cdb.load_dict(path_medcat +
                      '/umls_base_wlink_fixed_x_avg_2m_mimic.dat')
    except Exception:
        raise ImportError('weights and script should be in same directory')

    print('Building the model...')
    cat = CAT(cdb=cdb, vocab=vocab)

    print('Building mrn directories...')
Example #10
        semgroups_fn = '../data/umls_semgroups.txt'

        cols = ['UMLS_CUI', 'SNOMED_FSN', 'SNOMED_CID']
        snomed_df = pd.read_csv(snomed_core_fn, delimiter='|')[cols]
        core_cui_set = set(snomed_df['UMLS_CUI'].tolist())

        # sem_group_acronym|sem_group|tui|tui_description
        sem_groups_df = pd.read_csv(semgroups_fn, delimiter='|').dropna()
        tui_group_map = dict(
            zip(sem_groups_df['tui'].tolist(),
                sem_groups_df['sem_group'].tolist()))

        vocab = Vocab()
        print('Loading vocabulary...')
        # Load the vocab model you downloaded
        vocab.load_dict('../data/medcat/vocab.dat')

        # Load the cdb model you downloaded
        cdb = CDB()
        print('Loading model...')
        cdb.load_dict('../data/medcat/cdb.dat')

        # create cat
        print('Creating MedCAT pipeline...')
        cat = CAT(cdb=cdb, vocab=vocab)

        print('Loading Spacy...')
        sentencizer = spacy.load(
            'en_core_sci_lg', disable=['tagger', 'parser', 'ner', 'textcat'])
        sentencizer.add_pipe(sentencizer.create_pipe('sentencizer'))
        print('Loading UMLS entity linker...')
Example #11
from code_utils.global_variables import *
from medcat.cat import CAT
from medcat.utils.vocab import Vocab
from medcat.prepare_cdb import PrepareCDB
from medcat.cdb import CDB
import os
import spacy

# nlp = spacy.load(spacy_en_path, disable=['ner', 'parser'])
medcat_path = r'C:\Users\K1774755\PycharmProjects\toy-models\MedCat'
vocab = Vocab()

# Load the vocab model you just downloaded
vocab.load_dict(os.path.join(medcat_path, 'med_ann_norm_dict.dat'))

# If you have an existing CDB
cdb = CDB()
# cdb.load_dict(os.path.join(medcat_path, 'simple_cdb.csv'))


# If you need a special CDB you can build one from a .csv file
preparator = PrepareCDB(vocab=vocab)
csv_paths = [os.path.join(medcat_path, 'simple_cdb.csv')]  # , '<another one>', ...
csv_paths = [os.path.join(medcat_path, 'attention_cdb.csv')]  # note: overrides the line above
cdb = preparator.prepare_csvs(csv_paths)

# Save the new CDB for later
cdb.save_dict(os.path.join(medcat_path, 'simple_cdb.cdb'))

# To annotate documents we do
doc = "My simple document with kidney failure"