from .permissions import * from .serializers import * from .utils import get_medcat, add_annotations, remove_annotations, train_medcat, create_annotation # For local testing, put envs """ from environs import Env env = Env() env.read_env("/home/ubuntu/projects/MedAnno/MedAnno/env_umls", recurse=False) print(os.environ) """ from medcat.utils.helpers import prepare_name from medcat.utils.loggers import basic_logger log = basic_logger("api.views") # Maps between IDs and objects CDB_MAP = {} VOCAB_MAP = {} CAT_MAP = {} # Get the basic version of MedCAT cat = None def index(request): return render(request, 'index.html') class UserViewSet(viewsets.ModelViewSet):
from spacy.tokens import Span import numpy as np import operator from medcat.utils.loggers import basic_logger from medcat.utils.matutils import unitvec import os log = basic_logger("spacycat") # IF UMLS it includes specific rules that are only good for the Full UMLS version if os.getenv('TYPE', 'other').lower() == 'umls': log.info("Using cat_ann for annotations") from medcat.cat_ann import CatAnn else: log.info("Using basic_cat_ann for annotations") from medcat.basic_cat_ann import CatAnn class SpacyCat(object): """ A Spacy pipe module, can be easily added into a spacey pipline cdb: the cdb object of class cat.cdb representing the concepts vocab: vocab object of class cat.utils.vocab with vector representations train: should the training be performed or not, if training is False the disambiguation using vectors will be performed. While training is True it will not be performed """ DEBUG = os.getenv('DEBUG', "false").lower() == 'true' CNTX_SPAN = int(os.getenv('CNTX_SPAN', 7)) CNTX_SPAN_SHORT = int(os.getenv('CNTX_SPAN_SHORT', 2)) MIN_CUI_COUNT = int(os.getenv('MIN_CUI_COUNT', 10000))
import json import os from django.db.models.signals import post_save from django.dispatch import receiver from .models import Entity, AnnotatedEntity, Concept, ICDCode, OPCSCode, ProjectAnnotateEntities from medcat.cdb import CDB from medcat.utils.vocab import Vocab from medcat.cat import CAT from medcat.utils.loggers import basic_logger log = basic_logger("api.utils") def remove_annotations(document, project, partial=False): try: if partial: # Removes only the ones that are not validated AnnotatedEntity.objects.filter(project=project, document=document, validated=False).delete() log.debug(f"Unvalidated Annotations removed for:{document.id}") else: # Removes everything AnnotatedEntity.objects.filter(project=project, document=document).delete() log.debug(f"All Annotations removed for:{document.id}") except Exception as e: log.debug(f"Something went wrong: {e}") def add_annotations(spacy_doc, user, project, document, cdb, existing_annotations, tuis=[], cuis=[]):
from functools import partial from multiprocessing import Process, Manager, Queue, Pool, Array from medcat.cdb import CDB from medcat.spacy_cat import SpacyCat from medcat.preprocessing.tokenizers import spacy_split_all from medcat.utils.spelling import CustomSpellChecker from medcat.utils.spacy_pipe import SpacyPipe from medcat.preprocessing.cleaners import spacy_tag_punct from medcat.utils.helpers import get_all_from_name, tkn_inds_from_doc from medcat.utils.loggers import basic_logger from medcat.utils.data_utils import make_mc_train_test import time import sys, traceback from tqdm.autonotebook import tqdm log = basic_logger("CAT") class CAT(object): r''' The main MedCAT class used to annotate documents, it is built on top of spaCy and works as a spaCy pipline. Creates an instance of a spaCy pipline that can be used as a spacy nlp model. Args: cdb (medcat.cdb.CDB): The concept database that will be used for NER+L vocab (medcat.utils.vocab.Vocab, optional): Vocabulary used for vector embeddings and spelling. Default: None skip_stopwords (bool): If True the stopwords will be ignored and not detected in the pipeline.
from spacy.tokens import Span import numpy as np import operator from medcat.utils.loggers import basic_logger from medcat.utils.matutils import unitvec import os log = basic_logger("cat_spacycat") # IF UMLS it includes specific rules that are only good for the Full UMLS version if os.getenv('TYPE', 'other').lower() == 'umls': log.info("Using cat_ann for annotations") from medcat.cat_ann import CatAnn else: log.info("Using basic_cat_ann for annotations") from medcat.basic_cat_ann import CatAnn class SpacyCat(object): """ A Spacy pipe module, can be easily added into a spacey pipline cdb: the cdb object of class cat.cdb representing the concepts vocab: vocab object of class cat.utils.vocab with vector representations train: should the training be performed or not, if training is False the disambiguation using vectors will be performed. While training is True it will not be performed """ DEBUG = os.getenv('DEBUG', "false").lower() == 'true' NORM_EMB = os.getenv( 'NORM_EMB', "false").lower() == 'true' # Should we normalize the w2v PREFER_FREQUENT = os.getenv('PREFER_FREQUENT', "false").lower() == 'true' PREFER_CONCEPTS_WITH = os.getenv('PREFER_CONCEPTS_WITH', None)
from sklearn.model_selection import train_test_split import numpy as np from medcat.utils.models import LSTM as MODEL from sklearn.metrics import classification_report, f1_score, confusion_matrix, precision_score, recall_score import torch from torch import nn import torch.nn.functional as F import torch.optim as optim from medcat.utils.loggers import basic_logger log = basic_logger("utils") def get_batch(ind, batch_size, x, y, cpos, device): # Get the start/end index for this batch start = ind * batch_size end = (ind+1) * batch_size # Get the batch x_batch = x[start:end] y_batch = y[start:end] c_batch = cpos[start:end] # Return and move the batches to the right device return x_batch.to(device), y_batch.to(device), c_batch.to(device) def train_network(net, data, lr=0.01, test_size=0.1, max_seq_len=41, pad_id=30000, batch_size=100, nepochs=20, device='cpu', save_dir='./meta_cat/', class_weights=None, ignore_cpos=False, auto_save_model=True, score_average='weighted'): # Split data
""" Representation class for CDB data """ import pickle import numpy as np from scipy.sparse import dok_matrix #from gensim.matutils import unitvec from medcat.utils.matutils import unitvec, sigmoid from medcat.utils.attr_dict import AttrDict from medcat.utils.loggers import basic_logger import os import pandas as pd log = basic_logger("cdb") class CDB(object): """ Holds all the CDB data required for annotation """ MAX_COO_DICT_SIZE = int(os.getenv('MAX_COO_DICT_SIZE', 10000000)) MIN_COO_COUNT = int(os.getenv('MIN_COO_COUNT', 100)) def __init__(self): self.index2cui = [] # A list containing all CUIs self.cui2index = {} # Map from cui to index in the index2cui list self.name2cui = {} # Converts a normalized concept name to a cui self.name2cnt = {} # Converts a normalized concept name to a count self.name_isunique = {} # Should this name be skipped self.name2original_name = {} # Holds the two versions of a name self.name2ntkns = {} # Number of tokens for this name self.name_isupper = {} # Checks was this name all upper case in cdb self.cui2desc = {} # Map between a CUI and its cdb description