def extract_semtype_phrase(semtype, values, output_list, ent3, text):
    original_phrase_list = []
    temp = []
    text = text.replace(',', '.')
    split_text = text.split('.')
    bionlp = en_ner_bc5cdr_md.load()
    bionlp.add_pipe(bionlp.create_pipe('sentencizer'))
    for line in output_list:
        line = line.strip()
        if line:
            temp.append(line)
            if '[' + semtype + ']' in line and int("".join(filter(str.isdigit, line))) >= 800:
                for rline in temp[::-1]:
                    if 'Phrase' in rline:
                        rline = rline.strip(" ;").replace("Phrase:", "").replace("'", "").replace("[", "").replace("]", "").strip()
                        rline = re.sub('^%s' % ",", "", rline)
                        rline = re.sub('%s$' % ",", "", rline)
                        if values and any(detail in rline.lower() for detail in values):
                            match = '.'.join([s for s in split_text if rline.lower() in s.lower()])
                            if 'pupils' in match or 'eyes' in match:
                                match = re.sub(r'(.*)((pupils|eyes)[\sa-z,]+)|(.*)', r'\2', match, flags=re.IGNORECASE)
                            doc = bionlp(match)
                            for token in doc:
                                if any(detail in token.text.lower() for detail in values):
                                    ent3[token.text.upper()] = get_dependency(match, token)
                            if rline not in original_phrase_list:
                                original_phrase_list.append(rline)
                        break
    return original_phrase_list, ent3
def __init__(self):
    self.tagger = en_ner_bc5cdr_md.load()
    self.abbreviation_pipe = AbbreviationDetector(self.tagger)
    self.tagger.add_pipe(self.abbreviation_pipe)
    self.linker = UmlsEntityLinker(resolve_abbreviations=True, max_entities_per_mention=1)
    self.tagger.add_pipe(self.linker)
    print('NER Module Ready')
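# A minimal usage sketch for the pipeline initialized above (not part of the
# original source). It assumes the older scispacy UmlsEntityLinker API used in
# that __init__, where linked candidates are exposed on ent._.umls_ents as
# (CUI, score) pairs. The class name `NERModule` is hypothetical.
ner_module = NERModule()
doc = ner_module.tagger("The patient was given aspirin for chest pain.")
for ent in doc.ents:
    for cui, score in ent._.umls_ents:
        # look up the linked UMLS concept for each candidate CUI
        print(ent.text, ent.label_, cui, score, ner_module.linker.umls.cui_to_entity[cui].canonical_name)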
def process_item(self, item, spider):
    diseases_list = item['reports'][0]['diseases']
    nlp_bc = en_ner_bc5cdr_md.load()
    doc_bc = nlp_bc(item['main_text'])

    # combine text with its label
    label = {}
    for token in doc_bc.ents:
        label[token.text] = token.label_

    # combine text with its pos
    pos = {}
    for token in doc_bc:
        pos[token.text] = token.pos_

    # combine text with its lemma
    lemma = {}
    for token in doc_bc:
        lemma[token.text] = token.lemma_

    syndromes = []
    for k, v in label.items():
        if v == "DISEASE":
            li = k.split(" ")
            noun = 0
            adj = 0
            adp = 0
            if li[-1].lower() == "coronavirus":
                continue
            for c in li:
                if "CoV" in c:
                    break
                if c.isupper():
                    break
                if lemma[c] != c:
                    break
                if "disease" in c:
                    break
                if c in diseases_list:
                    break
                if pos.get(c) == "ADJ":
                    adj += 1
                elif pos.get(c) == "NOUN":
                    noun += 1
                # "of" case
                elif pos.get(c) == "ADP":
                    adp += 1
            if adj == 0 and noun >= 1:
                syndromes.append(k)
            elif adj == 1 and (noun >= 1 and noun <= 2):
                syndromes.append(k)
            elif adj == 0 and noun >= 1 and adp >= 1:
                syndromes.append(k)

    gc.collect()
    item['reports'][0]['syndromes'] = syndromes
    return item
def scispacy_plus_tokenizer(sequence: str, scispacy_tok=None) -> Iterator[str]:
    """
    Runs the scispacy tokenizer and removes all tokens
    with just whitespace characters
    """
    if scispacy_tok is None:
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer

    scispacy_tokens = list(map(lambda x: str(x), scispacy_tok(sequence)))
    tokens = filter(lambda t: not (' ' in t or '\n' in t or '\t' in t), scispacy_tokens)

    return tokens
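# Illustrative call (not from the original source): with no tokenizer passed in,
# the function loads en_ner_bc5cdr_md itself; the sample sentence is arbitrary.
print(list(scispacy_plus_tokenizer("Patient denies chest pain, shortness of breath.")))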
def detect_drugs(text_file):
    # NLP model for drug recognition
    nlp = en_ner_bc5cdr_md.load()
    # read the input text file
    with open(text_file, "r") as f:
        text = f.read()
    # use the NLP model to parse the text
    parsed_text = nlp(text)
    # extract entities / drug names
    entities = parsed_text.ents
    # print entities
    print("List of entities:", entities)
    return entities
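# Hypothetical usage (not in the original source): write a small note to disk
# and run the detector over it; the file name and sample text are illustrative.
with open("sample_note.txt", "w") as f:
    f.write("The patient was started on metformin and later developed nausea.")
drug_entities = detect_drugs("sample_note.txt")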
def information_extractor(text, semantic_dict):
    entities, ent2 = getspacy_pattern_matched_entities(text.lower())
    entities = [t for t in (set(tuple(i) for i in entities))]
    ent2 = [t for t in (set(tuple(i) for i in ent2))]
    entities = refine_values(entities, text)
    ent_list = list(eval(str(entities)))
    ent_list2 = list(eval(str(ent2)))

    keys = ["AGE", "GENDER", "PULSE", "BP", "RESP", "B.G.L", "SPO2", "GCS", "MENTAL ST", "PATIENT COND",
            "MEDICATION", "ALLERGIES", "PAST MEDICAL HISTORY", "PAIN", "TRAUMA", "PUPILS", "LUNG SOUNDS", "VERBAL",
            "AIRWAY", "INJURY", "MECHANISM OF INJURY", "COMPLAINT", "TREATMENT", "NOTES"]

    result = {}
    for (k, v) in ent_list:
        if k and v:
            result.setdefault(k, []).append(v)
    for key in keys:
        if key not in result.keys():
            result[key] = ['']

    result2 = {}
    for (k, v) in ent_list2:
        result2.setdefault(k, []).append(v)
    d2 = dict(result2)

    dd = defaultdict(list)
    for d in (semantic_dict, d2):  # you can list as many input dicts as you want here
        for key, value in d.items():
            dd[key] = list(set(dd[key] + value))

    bionlp = en_ner_bc5cdr_md.load()
    bionlp.add_pipe(bionlp.create_pipe('sentencizer'))
    return dict(result), dd, semantic_dict
import pandas as pd
import csv
import spacy
import en_ner_bc5cdr_md
from collections import Counter
from spacy import displacy
from spacy.matcher import Matcher
import scispacy

# Apply scispacy model to recognize diseases and chemicals
# This usage is based on work shared on Kaggle: https://www.kaggle.com/maria17/cord-19-explore-drugs-being-developed
nlp = en_ner_bc5cdr_md.load()

# define models
def scispacy_model(text, nlp):
    entities = {}
    doc = nlp(str(text.lower()))
    # group recognized entity texts under their labels (e.g. DISEASE, CHEMICAL)
    for ent in doc.ents:
        entities.setdefault(ent.label_, []).append(ent.text)
    return entities
    # displacy.render(docs, style="ent", options=options),

text = 'The 2019–20 coronavirus pandemic is an ongoing pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)'
print(scispacy_model(text, nlp))
    'DUR': 'Duration',
    'ROU': 'Route',
    'FOR': 'Form',
    'ADE': 'ADE',
    'DOS': 'Dosage',
    'REA': 'Reason',
    'FRE': 'Frequency'
}

# =====BiLSTM + CRF model for NER=========
bilstm_config = BiLSTMConfig()
bilstm_model = BiLSTMModel(bilstm_config)
bilstm_learn = BiLSTMLearner(bilstm_config, bilstm_model)
bilstm_learn.load("ner_15e_bilstm_crf_elmo")

scispacy_tok = en_ner_bc5cdr_md.load().tokenizer
scispacy_plus_tokenizer.__defaults__ = (scispacy_tok, )

# =====BioBERT Model for RE======
re_label_list = ["0", "1"]
re_task_name = "ehr-re"

biobert_re_config = AutoConfig.from_pretrained(
    os.path.join(BIOBERT_RE_MODEL_DIR, "config.json"),
    num_labels=len(re_label_list),
    finetuning_task=re_task_name)

biobert_re_model = AutoModelForSequenceClassification.from_pretrained(
    os.path.join(BIOBERT_RE_MODEL_DIR, "pytorch_model.bin"),
    config=biobert_re_config,
)
def main():
    args = parse_arguments()

    if args.target_dir[-1] != '/':
        args.target_dir += '/'

    if args.sep == "tab":
        args.sep = "\t"

    if not os.path.isdir(args.target_dir):
        os.mkdir(args.target_dir)

    if args.tokenizer == "default":
        tokenizer = default_tokenizer

    elif args.tokenizer == "scispacy":
        import en_ner_bc5cdr_md
        tokenizer = en_ner_bc5cdr_md.load().tokenizer

    elif args.tokenizer == 'scispacy_plus':
        import en_ner_bc5cdr_md
        scispacy_tok = en_ner_bc5cdr_md.load().tokenizer
        scispacy_plus_tokenizer.__defaults__ = (scispacy_tok, )
        tokenizer = scispacy_plus_tokenizer

    elif args.tokenizer == 'biobert-large':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1")
        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize

    elif args.tokenizer == 'biobert-base':
        from transformers import AutoTokenizer
        biobert = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
        args.max_seq_len -= biobert.num_special_tokens_to_add()
        tokenizer = biobert.tokenize

    else:
        warnings.warn("Tokenizer named " + args.tokenizer + " not found. "
                      "Using default tokenizer instead. Acceptable values "
                      "include 'scispacy', 'biobert-base', 'biobert-large', "
                      "and 'default'.")
        tokenizer = default_tokenizer

    print("\nReading data\n")
    train_dev, test = read_data(data_dir=args.input_dir,
                                train_ratio=1 - args.test_split,
                                tokenizer=tokenizer,
                                verbose=1)

    if args.ade_dir is not None:
        ade_train_dev, ade_test = read_ade_data(ade_data_dir=args.ade_dir,
                                                train_ratio=1 - args.test_split,
                                                verbose=1)

        ade_dev_split_idx = int((1 - args.dev_split) * len(ade_train_dev))
        ade_train = ade_train_dev[:ade_dev_split_idx]
        ade_devel = ade_train_dev[ade_dev_split_idx:]
    else:
        ade_train_dev = None
        ade_train = None
        ade_test = None
        ade_devel = None

    print('\n')

    # Data is already shuffled, just split for dev set
    dev_split_idx = int((1 - args.dev_split) * len(train_dev))
    train = train_dev[:dev_split_idx]
    devel = train_dev[dev_split_idx:]

    # Data for NER
    if args.task.lower() == 'ner':
        files = {
            'train': (train, ade_train),
            'train_dev': (train_dev, ade_train_dev),
            'devel': (devel, ade_devel),
            'test': (test, ade_test)
        }
        ner_generator(files, args)

    # Data for RE
    elif args.task.lower() == 're':
        # {dataset_name: (ehr_data, ade_data, is_test, is_label)}
        files = {
            'train': (train, ade_train, False, True),
            'dev': (devel, ade_devel, False, True),
            'test': (test, ade_test, True, False),
            'test_labels': (test, ade_test, True, True)
        }
        re_generator(files, args)
import en_core_sci_lg

# NER specific models
import en_ner_craft_md
import en_ner_bc5cdr_md
import en_ner_jnlpba_md
import en_ner_bionlp13cg_md
import en_core_med7_lg

from negspacy.negation import Negex
from spacy import displacy

# %%
TEXT_TAG = "TEXT"
MODELS = {
    "nlp_bc": (en_ner_bc5cdr_md.load(), "CHEMICAL"),
    "nlp_bi": (en_core_med7_lg.load(), "DRUG"),
    "med7": (en_ner_bionlp13cg_md.load(), "SIMPLE_CHEMICAL"),
}

# %%
def show_medical_abbreviation(model, document):
    """
    This function detects and resolves medical abbreviations in word entities

    Parameters:
        model(module): A pretrained biomedical model from ScispaCy (https://allenai.github.io/scispacy/)
        document(str): Document to be processed

    Returns:
        List of unique abbreviations and their resolution
    """
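# The body of show_medical_abbreviation is not shown above. As a hedged sketch
# only (not the original implementation), abbreviation resolution of this kind
# typically uses scispacy's AbbreviationDetector, which exposes detected short
# forms via doc._.abbreviations and their expansions via span._.long_form. The
# snippet below assumes the older scispacy/spaCy 2 API used elsewhere in these
# snippets; the variable names are illustrative.
import en_ner_bc5cdr_md
from scispacy.abbreviation import AbbreviationDetector

_nlp_sketch = en_ner_bc5cdr_md.load()
_nlp_sketch.add_pipe(AbbreviationDetector(_nlp_sketch))

_doc = _nlp_sketch("Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease.")
# unique abbreviation -> long form pairs, e.g. {'SBMA': 'Spinal and bulbar muscular atrophy'}
print({str(abrv): str(abrv._.long_form) for abrv in _doc._.abbreviations})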
def refine_entity(ent1, ent2, ent3, text):
    bionlp = en_ner_bc5cdr_md.load()
    bionlp.add_pipe(bionlp.create_pipe('sentencizer'))
    bionlp.add_pipe(bionlp.create_pipe('merge_entities'))
    bionlp.add_pipe(bionlp.create_pipe('merge_noun_chunks'))
    replaced_txt = text.replace(',', '.')
    split_text = replaced_txt.split('.')

    if ent2['Injury or Poisoning']:
        for s in split_text:
            ent1['INJURY'] = ent1['INJURY'] + [x for x in ent2['Injury or Poisoning'] if x.lower() in s.lower()
                                               and not any(word in s.lower() for word in
                                                           ['vehicle', 'collision', 'fell', 'fall', 'drowning', 'mechanism', 'priority'])]
    if ent2['DRUG']:
        ent1['MEDICATION'] = ent2['DRUG']
    if ent2['Medical Device']:
        ent1['TREATMENT'] = ent2['Medical Device']

    dtemp = {'COMPLAINT': ['complain'],
             'INJURY': ['injury', 'injuries', 'congestion', 'bump', 'abrasion', 'laceration', 'contusion', 'broken',
                        'swelling', 'fracture', 'scratch', 'bruise', 'gash', 'trauma', ' shot', 'wound', 'entrance'],
             'MECHANISM OF INJURY': ['fell', 'fall', 'gunshot', 'struck', 'fire', 'attack',
                                     'collision', 'assault', 'stab', 'hit', 'suicide', 'drowning', 'crash', 'GSW'],
             'LUNG SOUNDS': ['lung', 'lungs'],
             'VERBAL': ['confused', 'groggy'],
             'PATIENT COND': ['stable', 'unstable', 'critical'],
             'MENTAL ST': ['awake', 'alert', 'disoriented', 'oriented',
                           'lethargic', 'conscious', 'unconscious', 'unresponsive', 'loc', 'mental', 'crying'],
             'TRAUMA': ['priority'],
             'TREATMENT': ['immobiliz', 'high-flow O-2', 'IV ', ' boarded', 'bag mask', 'IVs ', 'non-re'],
             'PUPILS': ['PUPILS'],
             'PAIN': ['PAIN'],
             'PAST MEDICAL HISTORY': ['HISTORY'],
             'ALLERGIES': ['allergies', 'allergie'],
             'AIRWAY': ['airway']}

    for key, value in dtemp.items():
        for v in value:
            if ent3[v.upper()]:
                if ent1[key] and len(ent1[key]) > 0:
                    ent1[key] = [x for x in ent1[key] if x and x.strip()]
                    if ent3[v.upper()] not in ent1[key]:
                        ent1[key] = list(set(ent1[key] + [ent3[v.upper()]]))
                else:
                    ent1[key] = list(set([ent3[v.upper()]]))
            else:
                if key in ['ALLERGIES', 'MECHANISM OF INJURY', 'COMPLAINT', 'MENTAL ST', 'PATIENT COND', 'TRAUMA', 'TREATMENT']:
                    if v.lower() == 'stab':
                        # append a word-boundary so 'stab' does not match words like 'stable'
                        v = v + r'\b'
                    ent1[key] = ent1[key] + [f.strip() for f in ent2['Finding']
                                             if re.compile(r'\b' + v.lower()).search(f.lower()) and '?' not in f]
                    ent1[key] = ent1[key] + [s.strip() for s in split_text
                                             if re.compile(r'\b' + v.lower()).search(s.lower()) and '?' not in s]
                    ent1[key] = list(set([re.sub(r'(.*):(.*)', r'\2', x) for x in ent1[key] if x and x.strip()]))
                elif key in ['INJURY', 'PUPILS']:
                    no_word_list = ['vehicle', 'collision', 'fell', 'fall', 'drowning', 'mechanism', 'priority', 'fire', '2']
                    ent1[key] = ent1[key] + [s.strip() for s in split_text
                                             if re.compile(r'\b' + v.lower()).search(s.lower()) and '?' not in s]
                    matching = '. '.join([s for s in split_text if v.lower() in s.lower() and '?' not in s])
                    d2 = bionlp(matching)
                    for token in d2:
                        if v.lower() in token.text.lower():
                            et = get_dependency(matching, token)
                            if et not in ent1[key]:
                                ent1[key] = list(set(ent1[key] + [get_dependency(matching, token)]))
                    ent1[key] = list(set([x for x in ent1[key] if x and x.strip()
                                          and not any(word in x.lower() for word in no_word_list)]))
                else:
                    matching = '. '.join([s for s in split_text if v.lower() in s.lower() and '?' not in s])
                    d2 = bionlp(matching)
                    for token in d2:
                        if v.lower() in token.text.lower():
                            if key == 'PATIENT COND':
                                pc = ['patient', 'everything', 'he', 'she']
                                if token.head.text.lower() in pc or any(item.text.lower() in pc for item in token.children) \
                                        or any(item.text.lower() in pc for item in token.head.children):
                                    ent1[key] = list(set(ent1[key] + [get_dependency(matching, token)]))
                                    ent1[key] = [x for x in ent1[key] if x and x.strip()]
                            else:
                                ent1[key] = list(set(ent1[key] + [get_dependency(matching, token)]))
                                ent1[key] = [x for x in ent1[key] if x and x.strip()]

    complaints = ent1['COMPLAINT'] + ent1['PAIN']
    ent1['COMPLAINT'] = [x for x in complaints if x]

    for key in ent1:
        temp = ent1[key]
        s = []
        for e in ent1[key]:
            s = s + [e.strip() for t in temp
                     if e != t and all(item in t.replace(',', '').split(' ') for item in e.replace(',', '').split(' '))
                     or '?' in e]
        ent1[key] = list(set(ent1[key]) - set(s))

    ent1['NOTES'] = [text]
    return ent1