def get_ner_BMES(outputs, return_str_or_not):
    entities = []
    for idx in range(len(outputs)):
        labelName = outputs[idx]
        if labelName[0] == 'S' or labelName[0] == 'B':
            entity = Entity()
            entity.type = labelName[2:]
            entity.tkSpans.append([idx, idx])
            entity.labelSpans.append([labelName])
            entities.append(entity)
        elif labelName[0] == 'M' or labelName[0] == 'E':
            if checkWrongState_BMES(outputs, idx + 1):
                entity = entities[-1]
                entity.tkSpans[-1][1] = idx
                entity.labelSpans[-1].append(labelName)

    anwserEntities = entities

    if return_str_or_not:
        # transfer Entity class into its str representation
        strEntities = []
        for answer in anwserEntities:
            strEntity = answer.type
            for tkSpan in answer.tkSpans:
                strEntity += '[' + str(tkSpan[0]) + ',' + str(tkSpan[1]) + ']'
            strEntities.append(strEntity)
        return strEntities
    else:
        return anwserEntities
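# Illustrative usage sketch (not part of the repo): assumes the repo's Entity class and the
# checkWrongState_BMES helper, and standard BMES decoding where an M/E label extends the
# most recently opened B entity of the same type.
#
#   labels = ['B-Disease', 'E-Disease', 'O', 'S-Drug']
#   get_ner_BMES(labels, True)
#   # under those assumptions, expected to return ['Disease[0,1]', 'Drug[3,3]']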
def read_one_file(fileName, annotation_dir, entities_overlapped_types):
    annotation_file = get_bioc_file(join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]
    entities = []

    for entity in bioc_passage.annotations:
        entity_ = Entity()
        entity_.create(entity.id, entity.infons['type'], entity.locations[0].offset,
                       entity.locations[0].end, entity.text, None, None, None)

        for old_entity in entities:
            if is_overlapped(entity_, old_entity):
                logging.debug(
                    "entity overlapped: doc:{}, entity1_id:{}, entity1_type:{}, entity1_span:{} {}, "
                    "entity2_id:{}, entity2_type:{}, entity2_span:{} {}"
                    .format(fileName, old_entity.id, old_entity.type, old_entity.start,
                            old_entity.end, entity_.id, entity_.type, entity_.start, entity_.end))

                # use a canonical ordering of the two type names as the dict key
                # (direct string comparison replaces the Python 2-only cmp builtin)
                overlapped_types = entity_.type + "_" + old_entity.type if entity_.type > old_entity.type \
                    else old_entity.type + "_" + entity_.type

                if overlapped_types in entities_overlapped_types:
                    entities_overlapped_types[overlapped_types] += 1
                else:
                    entities_overlapped_types[overlapped_types] = 1

        entities.append(entity_)
def processOneFile_fda(fileName, annotation_dir, nlp_tool, isTraining, types, type_filter,
                       isFDA2018, isNorm):
    documents = []
    annotation_file = get_fda_file(join(annotation_dir, fileName))

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = fileName[:fileName.find('.')] + "_" + section.id
        if section.text is None:
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(normalization.meddra_pt_id)  # can be none
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for entity in annotation_file.mentions:
                if entity.section != section.id:
                    continue
                if types and (entity.type not in type_filter):
                    continue
                entities.append(entity)
        document.entities = entities

        if opt.nlp_tool == "nltk":
            if isTraining:
                sentences = get_sentences_and_tokens_from_nltk(
                    section.text, nlp_tool, document.entities,
                    annotation_file.ignore_regions, section.id)
            else:
                sentences = get_sentences_and_tokens_from_nltk(
                    section.text, nlp_tool, None,
                    annotation_file.ignore_regions, section.id)
        else:
            raise RuntimeError("invalid nlp tool")

        document.sentences = sentences
        documents.append(document)

    return documents, annotation_file
def parse_one_gold_file(annotation_dir, corpus_dir, fileName):
    document = Document()
    document.name = fileName[:fileName.find('.')]
    annotation_file = get_bioc_file(os.path.join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]
    entities = []

    for entity in bioc_passage.annotations:
        if entity.infons['type'] not in type_we_care:
            continue

        entity_ = Entity()
        entity_.id = entity.id
        processed_name = entity.text.replace('\\n', ' ')
        if len(processed_name) == 0:
            logging.debug("{}: entity {} name is empty".format(fileName, entity.id))
            continue
        entity_.name = processed_name
        entity_.type = entity.infons['type']
        entity_.spans.append([entity.locations[0].offset, entity.locations[0].end])

        if ('SNOMED code' in entity.infons and entity.infons['SNOMED code'] != 'N/A') \
                and ('SNOMED term' in entity.infons and entity.infons['SNOMED term'] != 'N/A'):
            entity_.norm_ids.append(entity.infons['SNOMED code'])
            entity_.norm_names.append(entity.infons['SNOMED term'])
        elif ('MedDRA code' in entity.infons and entity.infons['MedDRA code'] != 'N/A') \
                and ('MedDRA term' in entity.infons and entity.infons['MedDRA term'] != 'N/A'):
            entity_.norm_ids.append(entity.infons['MedDRA code'])
            entity_.norm_names.append(entity.infons['MedDRA term'])
        else:
            logging.debug("{}: no norm id in entity {}".format(fileName, entity.id))
            # some entities may have no norm id
            continue

        entities.append(entity_)

    document.entities = entities

    corpus_file = get_text_file(os.path.join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    return document
def processOneFile_fda(fileName, annotation_dir, types, type_filter, isFDA2018, isNorm):
    documents = []
    annotation_file = get_fda_file(os.path.join(annotation_dir, fileName))

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = fileName[:fileName.find('.')] + "_" + section.id
        if section.text is None:
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []
        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(normalization.meddra_pt_id)  # can be none
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for entity in annotation_file.mentions:
                if entity.section != section.id:
                    continue
                if types and (entity.type not in type_filter):
                    continue
                entities.append(entity)

        document.entities = entities
        document.sentences = []
        documents.append(document)

    return documents, annotation_file
def combineTwoEntity(a, b):
    c = Entity()
    c.type = a.type

    if a.tkSpans[0][0] < b.tkSpans[0][0]:
        if a.tkSpans[0][1] + 1 == b.tkSpans[0][0]:
            # adjacent spans are merged into one continuous span
            c.tkSpans.append([a.tkSpans[0][0], b.tkSpans[0][1]])
        else:
            c.tkSpans.append(a.tkSpans[0])
            c.tkSpans.append(b.tkSpans[0])
    else:
        if b.tkSpans[0][1] + 1 == a.tkSpans[0][0]:
            c.tkSpans.append([b.tkSpans[0][0], a.tkSpans[0][1]])
        else:
            c.tkSpans.append(b.tkSpans[0])
            c.tkSpans.append(a.tkSpans[0])

    return c
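# Illustrative sketch (not part of the repo): assumes the repo's Entity class initializes
# tkSpans to an empty list. Adjacent token spans are merged into one continuous span,
# non-adjacent ones are kept as a discontinuous pair.
#
#   e1 = Entity(); e1.type = 'X'; e1.tkSpans.append([2, 3])
#   e2 = Entity(); e2.type = 'X'; e2.tkSpans.append([4, 5])
#   combineTwoEntity(e1, e2).tkSpans   # [[2, 5]]
#   e3 = Entity(); e3.type = 'X'; e3.tkSpans.append([7, 8])
#   combineTwoEntity(e1, e3).tkSpans   # [[2, 3], [7, 8]]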
def translateNCRFPPintoEntities(doc_token, predict_results, doc_name):
    entity_id = 1
    results = []

    sent_num = len(predict_results)
    for idx in range(sent_num):
        sent_length = len(predict_results[idx][0])
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]
        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(doc_name, idx)

        labelSequence = []
        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = predict_results[idx][0][idy]
            labelSequence.append(label)

            if label[0] == 'S' or label[0] == 'B':
                entity = Entity()
                entity.create(str(entity_id), label[2:], token['start'], token['end'],
                              token['text'], idx, idy, idy)
                results.append(entity)
                entity_id += 1
            elif label[0] == 'M' or label[0] == 'E':
                if checkWrongState(labelSequence):
                    entity = results[-1]
                    entity.append(token['start'], token['end'], token['text'], idy)

    return results
def evaluate(documents, dictionary, dictionary_reverse, model):
    model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    for document in documents:
        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        model.process_one_doc(document, pred_entities, dictionary, dictionary_reverse)

        p1, p2, p3 = evaluate_for_ehr(document.entities, pred_entities, dictionary)

        ct_gold += p1
        ct_predicted += p2
        ct_correct += p3

    if ct_gold == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    return precision, recall, f_measure
def metamap_ner_my_norm(d):
    print("load umls ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [f for f in os.listdir(annotation_dir)
                        if os.path.isfile(os.path.join(annotation_dir, f))]

    if opt.test_in_cpu:
        model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
    else:
        model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
    model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    correct_counter = Counter()
    wrong_counter = Counter()

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            os.path.join(predict_dir,
                         gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities
        pred_entities = []
        for gold in predict_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        model.process_one_doc(gold_document, pred_entities, UMLS_dict, UMLS_dict_reverse)

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict,
                                      predict_document.entities, correct_counter, wrong_counter)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    sorted_correct_entities = OrderedDict(correct_counter.most_common())
    sorted_correct_entities = json.dumps(sorted_correct_entities, indent=4)
    with codecs.open("sorted_correct_entities.txt", 'w', 'UTF-8') as fp:
        fp.write(sorted_correct_entities)

    sorted_wrong_entities = OrderedDict(wrong_counter.most_common())
    sorted_wrong_entities = json.dumps(sorted_wrong_entities, indent=4)
    with codecs.open("sorted_wrong_entities.txt", 'w', 'UTF-8') as fp:
        fp.write(sorted_wrong_entities)

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
attention_model = DotAttentionLayer(768)

file_count = 0
for gold_file_name in annotation_files:
    print("# begin {}".format(gold_file_name))
    if file_count < 1:
        file_count += 1
        continue
    file_count += 1

    gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

    pred_entities = []
    for gold in gold_document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    Xs, Ys = generate_instances_ehr(pred_entities, model.dict_alphabet, UMLS_dict_reverse)

    data_loader = DataLoader(MyDataset(Xs, Ys), opt.batch_size, shuffle=False,
                             collate_fn=my_collate)
    data_iter = iter(data_loader)
    num_iter = len(data_loader)

    entity_start = 0
def load_data_pubtator(file_path):
    # stat
    ct_doc = 0
    ct_entity = 0

    documents = []
    with codecs.open(file_path, 'r', 'UTF-8') as fp:
        document = None

        for line in fp:
            line = line.strip()

            if line == '':
                if document is None:
                    continue
                else:
                    # save the document
                    documents.append(document)
                    document = None
                    ct_doc += 1
            elif line.find('|t|') != -1:
                # a new document
                document = Document()
                columns = line.split('|t|')
                document.name = columns[0]
                document.text = columns[1] + " "  # offset needs + 1
            elif line.find('|a|') != -1:
                columns = line.split('|a|')
                document.text += columns[1]

                generator = nlp_tool.span_tokenize(document.text)
                for t in generator:
                    document.all_sents_inds.append(t)

                for ind in range(len(document.all_sents_inds)):
                    t_start = document.all_sents_inds[ind][0]
                    t_end = document.all_sents_inds[ind][1]

                    tmp_tokens = FoxTokenizer.tokenize(t_start, document.text[t_start:t_end], False)
                    sentence_tokens = []
                    for token_idx, token in enumerate(tmp_tokens):
                        token_dict = {}
                        token_dict['start'], token_dict['end'] = token[1], token[2]
                        token_dict['text'] = token[0]
                        sentence_tokens.append(token_dict)

                    document.sentences.append(sentence_tokens)
            else:
                columns = line.split('\t')

                if columns[1] == 'CID':  # for cdr corpus, we ignore relation
                    continue

                if columns[4].find("Chemical") != -1:  # for cdr corpus, we ignore chemical
                    continue

                entity = Entity()
                entity.spans.append([int(columns[1]), int(columns[2])])
                entity.name = columns[3]
                entity.type = columns[4]

                if columns[5].find('|') != -1:
                    ids = columns[5].split('|')
                    for id in ids:
                        if id == '-1':
                            raise RuntimeError("id == -1")
                        if id.find("OMIM:") != -1:
                            id = id[id.find("OMIM:") + len("OMIM:"):]
                            entity.norm_ids.append(id)
                        else:
                            entity.norm_ids.append(id)
                elif columns[5].find('+') != -1:
                    ids = columns[5].split('+')
                    for id in ids:
                        if id == '-1':
                            raise RuntimeError("id == -1")
                        if id.find("OMIM:") != -1:
                            id = id[id.find("OMIM:") + len("OMIM:"):]
                            entity.norm_ids.append(id)
                        else:
                            entity.norm_ids.append(id)
                else:
                    id = columns[5]
                    if id.find("OMIM:") != -1:
                        id = id[id.find("OMIM:") + len("OMIM:"):]
                        entity.norm_ids.append(id)
                    else:
                        entity.norm_ids.append(id)

                # columns[6]: cdr may have Individual mentions, we don't use it yet

                for sent_idx, (sent_start, sent_end) in enumerate(document.all_sents_inds):
                    if entity.spans[0][0] >= sent_start and entity.spans[0][1] <= sent_end:
                        # we assume entity has only one span
                        entity.sent_idx = sent_idx
                        break
                if entity.sent_idx == -1:
                    logging.debug("can't find entity.sent_idx: {} ".format(entity.name))
                    continue
                    # raise RuntimeError("can't find entity.sent_idx")

                tkStart = -1
                tkEnd = -1
                for tkidx, token_dict in enumerate(document.sentences[entity.sent_idx]):
                    if token_dict['start'] == entity.spans[0][0]:
                        tkStart = tkidx
                    if token_dict['end'] == entity.spans[0][1]:
                        tkEnd = tkidx
                    if tkStart != -1 and tkEnd != -1:
                        break

                if tkStart == -1 or tkEnd == -1:
                    raise RuntimeError('tkStart == -1 or tkEnd == -1')

                entity.tkSpans.append([tkStart, tkEnd])

                document.entities.append(entity)
                ct_entity += 1

    logging.info("document number {}, entity number {}".format(ct_doc, ct_entity))

    return documents
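# Illustrative input sketch (not part of the repo; the id, offsets, and text below are made
# up, only the layout matches what the parser above expects). PubTator-style blocks consist
# of a title line, an abstract line, then tab-separated annotation lines
# (doc id, start, end, mention text, type, normalization id), with a blank line between documents:
#
#   10021369|t|Example title text
#   10021369|a|Example abstract text ...
#   10021369<TAB>52<TAB>64<TAB>some disease<TAB>Disease<TAB>D012345
#
# Relation lines (second column 'CID') and Chemical mentions are skipped for the CDR corpus,
# and "OMIM:" prefixes are stripped from normalization ids.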
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool, isTraining, types, type_filter):
    document = Document()
    document.name = fileName[:fileName.find('.')]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        bioc_passage = annotation_file[0].passages[0]
        entities = []

        for entity in bioc_passage.annotations:
            if types and (entity.infons['type'] not in type_filter):
                continue

            entity_ = Entity()
            entity_.id = entity.id
            processed_name = entity.text.replace('\\n', ' ')
            if len(processed_name) == 0:
                logging.debug("{}: entity {} name is empty".format(fileName, entity.id))
                continue
            entity_.name = processed_name
            entity_.type = entity.infons['type']
            entity_.spans.append([entity.locations[0].offset, entity.locations[0].end])

            if ('SNOMED code' in entity.infons and entity.infons['SNOMED code'] != 'N/A') \
                    and ('SNOMED term' in entity.infons and entity.infons['SNOMED term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['SNOMED code'])
                entity_.norm_names.append(entity.infons['SNOMED term'])
                ct_snomed += 1
            elif ('MedDRA code' in entity.infons and entity.infons['MedDRA code'] != 'N/A') \
                    and ('MedDRA term' in entity.infons and entity.infons['MedDRA term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['MedDRA code'])
                entity_.norm_names.append(entity.infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(fileName, entity.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    if opt.nlp_tool == "spacy":
        if isTraining:
            sentences = get_sentences_and_tokens_from_spacy(corpus_file, nlp_tool, document.entities)
        else:
            sentences = get_sentences_and_tokens_from_spacy(corpus_file, nlp_tool, None)
    elif opt.nlp_tool == "nltk":
        if isTraining:
            sentences = get_sentences_and_tokens_from_nltk(corpus_file, nlp_tool,
                                                           document.entities, None, None)
        else:
            sentences = get_sentences_and_tokens_from_nltk(corpus_file, nlp_tool, None, None, None)
    elif opt.nlp_tool == "stanford":
        if isTraining:
            sentences = get_sentences_and_tokens_from_stanford(corpus_file, nlp_tool,
                                                               document.entities)
        else:
            sentences = get_sentences_and_tokens_from_stanford(corpus_file, nlp_tool, None)
    else:
        raise RuntimeError("invalid nlp tool")

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed
def load_metamap_result_from_file(file_path):
    re_brackets = re.compile(r'\[[0-9|/]+\]')
    document = Document()
    entities = []

    with codecs.open(file_path, 'r', 'UTF-8') as fp:
        for line in fp.readlines():
            fields = line.strip().split(u"|")

            if fields[1] != u'MMI':
                continue

            ID = fields[0]  # Unique identifier of the text being processed. If no identifier is found in the text, 00000000 will be displayed
            MMI = fields[1]  # Always MMI
            Score = fields[2]  # MetaMap Indexing (MMI) score with a maximum score of 1000.00
            UMLS_Prefer_Name = fields[3]  # The UMLS preferred name for the UMLS concept
            UMLS_ID = fields[4]  # The CUI for the identified UMLS concept
            Semantic_Type_List = fields[5]  # Comma-separated list of Semantic Type abbreviations
            Trigger_Information = fields[6]  # Comma-separated sextuple showing what triggered MMI to identify this UMLS concept
            Location = fields[7]  # Summarizes where the UMLS concept was found: TI (Title), AB (Abstract), TX (Free Text), TI;AB (Title and Abstract)
            Positional_Information = fields[8]  # Semicolon-separated list of positional-information terms, showing StartPos, slash (/), and Length of each trigger identified in the Trigger Information field
            Treecode = fields[9]  # Semicolon-separated list of any MeSH treecodes

            triggers = Trigger_Information[1:-1].split(u",\"")
            spans = Positional_Information.split(u";")

            if len(triggers) != len(spans):
                raise RuntimeError("the number of triggers is not equal to that of spans: {} in {}"
                                   .format(UMLS_ID, file_path[file_path.rfind('/') + 1:]))

            for idx, span in enumerate(spans):
                bracket_spans = re_brackets.findall(span)
                if len(bracket_spans) == 0:  # simple form
                    if span.find(u',') != -1:
                        logging.debug("ignore non-continuous form of Positional_Information: {} in {}"
                                      .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                        continue

                    tmps = span.split(u"/")
                    entity = Entity()
                    entity.spans.append([int(tmps[0]), int(tmps[0]) + int(tmps[1])])
                    entity.norm_ids.append(str(UMLS_ID))

                    # "B cell lymphoma"-tx-5-"B cell lymphoma"-noun-0
                    tmps = triggers[idx].split(u"-")
                    if tmps[3].find('"') == -1:
                        logging.debug("ignore non-string entity: {} in {}"
                                      .format(tmps[3], file_path[file_path.rfind('/') + 1:]))
                        continue
                    if len(tmps) != 6:
                        logging.debug("parsing trigger error, ignore entity: {} in {}"
                                      .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                        continue

                    entity.name = tmps[3][1:-1]  # remove ""

                    entities.append(entity)
                else:
                    for bracket_span in bracket_spans:
                        if bracket_span.find(u',') != -1:
                            logging.debug("ignore non-continuous form of Positional_Information: {} in {}"
                                          .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                            continue

                        tmps = bracket_span[1:-1].split(u"/")
                        entity = Entity()
                        entity.spans.append([int(tmps[0]), int(tmps[0]) + int(tmps[1])])
                        entity.norm_ids.append(str(UMLS_ID))

                        # "B cell lymphoma"-tx-5-"B cell lymphoma"-noun-0
                        tmps = triggers[idx].split(u"-")
                        if tmps[3].find('"') == -1:
                            logging.debug("ignore non-string entity: {} in {}"
                                          .format(tmps[3], file_path[file_path.rfind('/') + 1:]))
                            continue
                        if len(tmps) != 6:
                            logging.debug("parsing trigger error, ignore entity: {} in {}"
                                          .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                            continue

                        entity.name = tmps[3][1:-1]

                        entities.append(entity)

    document.entities = entities
    return document
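# Illustrative MMI line sketch (not part of the repo; the field values are made up, only the
# pipe-separated layout matches what the parser above expects):
#
#   00000000|MMI|14.64|Lymphoma|C0024299|[neop]|["Lymphoma"-tx-1-"lymphoma"-noun-0]|TX|228/8|C04.557.386
#
# Here fields[8] ("228/8") is StartPos/Length and becomes the character span [228, 236],
# fields[4] ("C0024299") is stored as the normalization id, and the quoted fourth element of
# the trigger sextuple ("lymphoma") becomes the entity name.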
def evaluate(documents, dictionary, dictionary_reverse, vsm_model, neural_model, ensemble_model,
             d, isMeddra_dict):
    if vsm_model is not None:
        vsm_model.eval()
    if neural_model is not None:
        neural_model.eval()
    if ensemble_model is not None:
        ensemble_model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    # if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
    #     ct_correct_rule = 0
    #     ct_correct_vsm = 0
    #     ct_correct_neural = 0
    #     ct_correct_all = 0
    #     ct_correct_rule_vsm = 0
    #     ct_correct_rule_neural = 0
    #     ct_correct_vsm_neural = 0

    for document in documents:
        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
            if opt.ensemble == 'learn':
                ensemble_model.process_one_doc(document, pred_entities, dictionary,
                                               dictionary_reverse, isMeddra_dict)
            else:
                pred_entities2 = copy.deepcopy(pred_entities)
                pred_entities3 = copy.deepcopy(pred_entities)
                merge_entities = copy.deepcopy(pred_entities)
                multi_sieve.runMultiPassSieve(document, pred_entities, dictionary, isMeddra_dict)
                vsm_model.process_one_doc(document, pred_entities2, dictionary,
                                          dictionary_reverse, isMeddra_dict)
                neural_model.process_one_doc(document, pred_entities3, dictionary,
                                             dictionary_reverse, isMeddra_dict)
        elif opt.norm_rule:
            multi_sieve.runMultiPassSieve(document, pred_entities, dictionary, isMeddra_dict)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(document, pred_entities, dictionary, dictionary_reverse,
                                      isMeddra_dict)
        elif opt.norm_neural:
            neural_model.process_one_doc(document, pred_entities, dictionary, dictionary_reverse,
                                         isMeddra_dict)
        else:
            raise RuntimeError("wrong configuration")

        if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
            # ct_gold += len(document.entities)
            # ct_predicted += len(pred_entities)
            # upper bound of ensemble: if at least one system makes a correct prediction, we count it as correct.
            # for idx, gold in enumerate(document.entities):
            #     if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids) \
            #             and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #             and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #         ct_correct_all += 1
            #         ct_correct += 1
            #
            #     if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids) \
            #             and (pred_entities2[idx].vsm_id is None or pred_entities2[idx].vsm_id not in gold.norm_ids) \
            #             and (pred_entities3[idx].neural_id is None or pred_entities3[idx].neural_id not in gold.norm_ids):
            #         ct_correct_rule += 1
            #         ct_correct += 1
            #
            #     if (pred_entities[idx].rule_id is None or pred_entities[idx].rule_id not in gold.norm_ids) \
            #             and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #             and (pred_entities3[idx].neural_id is None or pred_entities3[idx].neural_id not in gold.norm_ids):
            #         ct_correct_vsm += 1
            #         ct_correct += 1
            #
            #     if (pred_entities[idx].rule_id is None or pred_entities[idx].rule_id not in gold.norm_ids) \
            #             and (pred_entities2[idx].vsm_id is None or pred_entities2[idx].vsm_id not in gold.norm_ids) \
            #             and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #         ct_correct_neural += 1
            #         ct_correct += 1
            #
            #     if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids) \
            #             and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #             and (pred_entities3[idx].neural_id is None or pred_entities3[idx].neural_id not in gold.norm_ids):
            #         ct_correct_rule_vsm += 1
            #         ct_correct += 1
            #
            #     if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids) \
            #             and (pred_entities2[idx].vsm_id is None or pred_entities2[idx].vsm_id not in gold.norm_ids) \
            #             and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #         ct_correct_rule_neural += 1
            #         ct_correct += 1
            #
            #     if (pred_entities[idx].rule_id is None or pred_entities[idx].rule_id not in gold.norm_ids) \
            #             and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #             and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #         ct_correct_vsm_neural += 1
            #         ct_correct += 1

            if opt.ensemble == 'learn':
                if isMeddra_dict:
                    p1, p2, p3 = evaluate_for_fda(document.entities, pred_entities)
                else:
                    p1, p2, p3 = evaluate_for_ehr(document.entities, pred_entities, dictionary)

                ct_gold += p1
                ct_predicted += p2
                ct_correct += p3
            else:
                ensemble.merge_result(pred_entities, pred_entities2, pred_entities3,
                                      merge_entities, dictionary, isMeddra_dict,
                                      vsm_model.dict_alphabet, d)

                if isMeddra_dict:
                    p1, p2, p3 = evaluate_for_fda(document.entities, merge_entities)
                else:
                    p1, p2, p3 = evaluate_for_ehr(document.entities, merge_entities, dictionary)

                ct_gold += p1
                ct_predicted += p2
                ct_correct += p3
        else:
            if isMeddra_dict:
                p1, p2, p3 = evaluate_for_fda(document.entities, pred_entities)
            else:
                p1, p2, p3 = evaluate_for_ehr(document.entities, pred_entities, dictionary)

            ct_gold += p1
            ct_predicted += p2
            ct_correct += p3

    # if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
    #     logging.info("ensemble correct. all:{} rule:{} vsm:{} neural:{} rule_vsm:{} rule_neural:{} vsm_neural:{}"
    #                  .format(ct_correct_all, ct_correct_rule, ct_correct_vsm, ct_correct_neural,
    #                          ct_correct_rule_vsm, ct_correct_rule_neural, ct_correct_vsm_neural))
    #     logging.info("gold:{} pred:{} correct:{}".format(ct_gold, ct_predicted, ct_correct))

    if ct_gold == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    return precision, recall, f_measure
def metamap_ner_my_norm(d):
    print("load umls ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))]

    if opt.norm_rule:
        multi_sieve.init(opt, None, d, UMLS_dict, UMLS_dict_reverse, False)
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities
        pred_entities = []
        for gold in predict_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule:
            multi_sieve.runMultiPassSieve(gold_document, pred_entities, UMLS_dict, False)
        elif opt.norm_neural:
            neural_model.process_one_doc(gold_document, pred_entities, UMLS_dict,
                                         UMLS_dict_reverse, False)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(gold_document, pred_entities, UMLS_dict,
                                      UMLS_dict_reverse, False)
        else:
            raise RuntimeError("wrong configuration")

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
def get_ner_BIOHD_1234(outputs, return_str_or_not):
    entities = []
    for idx in range(len(outputs)):
        labelName = outputs[idx]
        if labelName == 'B-X' or labelName == 'HB-X' or labelName == 'D1B-X' or labelName == 'D2B-X' \
                or labelName == 'D3B-X' or labelName == 'D4B-X':
            entity = Entity()
            entity.type = 'X'
            entity.tkSpans.append([idx, idx])
            entity.labelSpans.append([labelName])
            entities.append(entity)
        elif labelName == 'I-X' or labelName == 'HI-X' or labelName == 'D1I-X' or labelName == 'D2I-X' \
                or labelName == 'D3I-X' or labelName == 'D4I-X':
            if checkWrongState(outputs, idx + 1):
                entity = entities[-1]
                entity.tkSpans[-1][1] = idx
                entity.labelSpans[-1].append(labelName)

    # post-processing to rebuild entities
    postEntities = []
    HB_HI = []
    D1B_D1I = []
    D2B_D2I = []
    D3B_D3I = []
    D4B_D4I = []

    for temp in entities:
        labelSpan = temp.labelSpans[0]
        if labelSpan[0] == 'HB-X':
            HB_HI.append(temp)
        elif labelSpan[0] == 'D1B-X':
            D1B_D1I.append(temp)
        elif labelSpan[0] == 'D2B-X':
            D2B_D2I.append(temp)
        elif labelSpan[0] == 'D3B-X':
            D3B_D3I.append(temp)
        elif labelSpan[0] == 'D4B-X':
            D4B_D4I.append(temp)
        else:
            postEntities.append(temp)

    if len(HB_HI) != 0:
        for d1b in D1B_D1I:
            # combine with the nearest head entity at left
            target = None
            for hb in HB_HI:
                if hb.tkSpans[0][0] < d1b.tkSpans[0][0]:
                    target = hb
                else:
                    break

            if target is None:
                pass
            else:
                combined = combineTwoEntity(d1b, target)
                postEntities.append(combined)
                if len(D1B_D1I) == 1:
                    postEntities.append(target)

        for d3b in D3B_D3I:
            # combine with the nearest head entity at right
            target = None
            for hb in reversed(HB_HI):
                if hb.tkSpans[0][0] > d3b.tkSpans[0][0]:
                    target = hb
                else:
                    break

            if target is None:
                pass
            else:
                combined = combineTwoEntity(d3b, target)
                postEntities.append(combined)
                if len(D3B_D3I) == 1:
                    postEntities.append(target)
    else:
        for d2b in D2B_D2I:
            # combine with the nearest non-head entity at left
            target = None
            for db in D1B_D1I:
                if db.tkSpans[0][0] < d2b.tkSpans[0][0]:
                    target = db
                else:
                    break
            for db in D2B_D2I:
                if db.tkSpans[0][0] < d2b.tkSpans[0][0]:
                    if target is not None and target.tkSpans[0][0] < db.tkSpans[0][0]:
                        target = db
                    else:
                        target = db
                else:
                    break
            for db in D3B_D3I:
                if db.tkSpans[0][0] < d2b.tkSpans[0][0]:
                    if target is not None and target.tkSpans[0][0] < db.tkSpans[0][0]:
                        target = db
                    else:
                        target = db
                else:
                    break
            for db in D4B_D4I:
                if db.tkSpans[0][0] < d2b.tkSpans[0][0]:
                    if target is not None and target.tkSpans[0][0] < db.tkSpans[0][0]:
                        target = db
                    else:
                        target = db
                else:
                    break

            if target is None:
                pass
            else:
                combined = combineTwoEntity(d2b, target)
                postEntities.append(combined)

        for d4b in D4B_D4I:
            # combine with the nearest non-head entity at right
            target = None
            for db in reversed(D1B_D1I):
                if db.tkSpans[0][0] > d4b.tkSpans[0][0]:
                    if target is not None and target.tkSpans[0][0] > db.tkSpans[0][0]:
                        target = db
                    else:
                        target = db
                else:
                    break
            for db in reversed(D2B_D2I):
                if db.tkSpans[0][0] > d4b.tkSpans[0][0]:
                    if target is not None and target.tkSpans[0][0] > db.tkSpans[0][0]:
                        target = db
                    else:
                        target = db
                else:
                    break
            for db in reversed(D3B_D3I):
                if db.tkSpans[0][0] > d4b.tkSpans[0][0]:
                    if target is not None and target.tkSpans[0][0] > db.tkSpans[0][0]:
                        target = db
                    else:
                        target = db
                else:
                    break
            for db in reversed(D4B_D4I):
                if db.tkSpans[0][0] > d4b.tkSpans[0][0]:
                    if target is not None and target.tkSpans[0][0] > db.tkSpans[0][0]:
                        target = db
                    else:
                        target = db
                else:
                    break

            if target is None:
                pass
            else:
                combined = combineTwoEntity(d4b, target)
                postEntities.append(combined)

    # resort by start position and remove the same entity
    anwserEntities = []
    for temp in postEntities:
        isIn = False
        for anwser in anwserEntities:
            if anwser.equalsTkSpan(temp):
                isIn = True
                break
        if isIn == False:
            iter = 0
            for old in anwserEntities:
                if old.tkSpans[0][0] > temp.tkSpans[0][0]:
                    break
                iter += 1
            anwserEntities.insert(iter, temp)

    if return_str_or_not:
        # transfer Entity class into its str representation
        strEntities = []
        for answer in anwserEntities:
            strEntity = 'X'
            for tkSpan in answer.tkSpans:
                strEntity += '[' + str(tkSpan[0]) + ',' + str(tkSpan[1]) + ']'
            strEntities.append(strEntity)
        return strEntities
    else:
        return anwserEntities
def error_analysis(d, dictionary, dictionary_reverse, opt, isMeddra_dict):
    logging.info("error_analysis ...")

    test_data = loadData(opt.test_file, False, opt.types, opt.type_filter)

    logging.info("use my tokenizer")
    nlp_tool = None

    logging.info("use neural-based normer")
    if opt.test_in_cpu:
        neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
    else:
        neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
    neural_model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    for document in test_data:
        logging.info("###### begin {}".format(document.name))

        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.name = gold.name
            pred_entities.append(pred)

        neural_model.process_one_doc(document, pred_entities, dictionary, dictionary_reverse,
                                     isMeddra_dict)

        ct_norm_gold = len(document.entities)
        ct_norm_predict = len(pred_entities)
        ct_norm_correct = 0

        for predict_entity in pred_entities:
            for gold_entity in document.entities:
                if predict_entity.equals_span(gold_entity):
                    b_right = False

                    if len(gold_entity.norm_ids) == 0:
                        # if gold_entity not annotated, we count it as TP
                        b_right = True
                        ct_norm_correct += 1
                    else:
                        if len(predict_entity.norm_ids) != 0 and predict_entity.norm_ids[0] in dictionary:
                            concept = dictionary[predict_entity.norm_ids[0]]
                            if gold_entity.norm_ids[0] in concept.codes:
                                ct_norm_correct += 1
                                b_right = True

                    if b_right == False:
                        if len(predict_entity.norm_ids) != 0 and predict_entity.norm_ids[0] in dictionary:
                            concept = dictionary[predict_entity.norm_ids[0]]
                            logging.info("entity name: {} | gold id, name: {}, {} | pred cui, codes, names: {}, {}, {}"
                                         .format(predict_entity.name, gold_entity.norm_ids[0],
                                                 gold_entity.norm_names[0], concept.cui,
                                                 concept.codes, concept.names))

                    break

        ct_predicted += ct_norm_predict
        ct_gold += ct_norm_gold
        ct_correct += ct_norm_correct

    if ct_gold == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (precision, recall, f_measure))
def generate_instances(document, word_alphabet, dict_alphabet, dictionary, dictionary_reverse,
                       isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary, isMeddra_dict)

    for idx, entity in enumerate(document.entities):
        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
                if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                    Ys.append(Y)
                else:
                    continue
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    # use the first id to generate instance
                    Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])
                    if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                        Ys.append(Y)
                    else:
                        raise RuntimeError("entity {}, {}, cui not in dict_alphabet"
                                           .format(entity.id, entity.name))
                else:
                    logging.info("entity {}, {}, can't map to umls, ignored"
                                 .format(entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        if pred_entities[idx].rule_id is None:
            X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        else:
            X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
            X['rule'][norm_utils.get_dict_index(dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys