def processOneFile_fda(fileName, annotation_dir, nlp_tool, isTraining, types, type_filter, isFDA2018, isNorm):
    documents = []
    annotation_file = get_fda_file(join(annotation_dir, fileName))

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = fileName[:fileName.find('.')] + "_" + section.id

        if section.text is None:
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []

        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(normalization.meddra_pt_id)  # can be none
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for entity in annotation_file.mentions:
                if entity.section != section.id:
                    continue
                if types and (entity.type not in type_filter):
                    continue
                entities.append(entity)

        document.entities = entities

        if opt.nlp_tool == "nltk":
            if isTraining:
                sentences = get_sentences_and_tokens_from_nltk(section.text, nlp_tool, document.entities,
                                                               annotation_file.ignore_regions, section.id)
            else:
                sentences = get_sentences_and_tokens_from_nltk(section.text, nlp_tool, None,
                                                               annotation_file.ignore_regions, section.id)
        else:
            raise RuntimeError("invalid nlp tool")

        document.sentences = sentences

        documents.append(document)

    return documents, annotation_file
def parse_one_gold_file(annotation_dir, corpus_dir, fileName):
    document = Document()
    document.name = fileName[:fileName.find('.')]

    annotation_file = get_bioc_file(os.path.join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]
    entities = []

    for entity in bioc_passage.annotations:
        if entity.infons['type'] not in type_we_care:
            continue

        entity_ = Entity()
        entity_.id = entity.id
        processed_name = entity.text.replace('\\n', ' ')
        if len(processed_name) == 0:
            logging.debug("{}: entity {} name is empty".format(fileName, entity.id))
            continue
        entity_.name = processed_name

        entity_.type = entity.infons['type']
        entity_.spans.append([entity.locations[0].offset, entity.locations[0].end])

        if ('SNOMED code' in entity.infons and entity.infons['SNOMED code'] != 'N/A') \
                and ('SNOMED term' in entity.infons and entity.infons['SNOMED term'] != 'N/A'):
            entity_.norm_ids.append(entity.infons['SNOMED code'])
            entity_.norm_names.append(entity.infons['SNOMED term'])
        elif ('MedDRA code' in entity.infons and entity.infons['MedDRA code'] != 'N/A') \
                and ('MedDRA term' in entity.infons and entity.infons['MedDRA term'] != 'N/A'):
            entity_.norm_ids.append(entity.infons['MedDRA code'])
            entity_.norm_names.append(entity.infons['MedDRA term'])
        else:
            logging.debug("{}: no norm id in entity {}".format(fileName, entity.id))
            # some entities may have no norm id
            continue

        entities.append(entity_)

    document.entities = entities

    corpus_file = get_text_file(os.path.join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    return document
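# Hedged usage sketch (not part of the original code): the directory names and the
# file name below are hypothetical placeholders. It assumes each gold annotation is
# a BioC file and that the matching raw-text file sits in a separate corpus
# directory, which is what parse_one_gold_file expects.
def _demo_parse_one_gold_file():
    gold_doc = parse_one_gold_file('data/gold_bioc', 'data/corpus', '100035.bioc.xml')
    for e in gold_doc.entities:
        # each kept entity carries its character span plus a SNOMED or MedDRA normalization
        print(e.id, e.type, e.name, e.spans[0], e.norm_ids, e.norm_names)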
def convert_to_document(self, text_data, title=None):
    '''
    text_data can be a string or a list of strings
    title is a string
    '''
    if isinstance(text_data, str):
        text_data = self.sent_detector.tokenize(text_data.strip())
    sents = []
    for content in text_data:
        sent = self.sent_converter.convert_to_sentence(content)
        if sent:
            sents.append(sent)
    doc = Document(sents)
    if title:
        doc.title = self.sent_converter.convert_to_sentence(title)
    return doc
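# Hedged usage sketch (not part of the original code): the converter object and the
# input text are hypothetical; it only illustrates that convert_to_document accepts
# either an untokenized string (which is then sentence-split) or a pre-split list of
# sentence strings, and that the optional title becomes doc.title.
def _demo_convert_to_document(converter):
    # converter is assumed to expose sent_detector and sent_converter, as the method requires
    doc = converter.convert_to_document(
        "Rash was observed. The patient also reported headache.",
        title="Adverse reactions")
    return doc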
def processOneFile_fda(fileName, annotation_dir, types, type_filter, isFDA2018, isNorm):
    documents = []
    annotation_file = get_fda_file(os.path.join(annotation_dir, fileName))

    # each section is a document
    for section in annotation_file.sections:
        document = Document()
        document.name = fileName[:fileName.find('.')] + "_" + section.id

        if section.text is None:
            document.text = ""
            document.entities = []
            document.sentences = []
            documents.append(document)
            continue

        document.text = section.text

        entities = []

        if isFDA2018 == False and isNorm == True:
            for reaction in annotation_file.reactions:
                entity = Entity()
                entity.name = reaction.name
                for normalization in reaction.normalizations:
                    entity.norm_ids.append(normalization.meddra_pt_id)  # can be none
                    entity.norm_names.append(normalization.meddra_pt)
                entities.append(entity)
        else:
            for entity in annotation_file.mentions:
                if entity.section != section.id:
                    continue
                if types and (entity.type not in type_filter):
                    continue
                entities.append(entity)

        document.entities = entities
        document.sentences = []

        documents.append(document)

    return documents, annotation_file
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool, isTraining, types, type_filter):
    document = Document()
    document.name = fileName[:fileName.find('.')]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        bioc_passage = annotation_file[0].passages[0]
        entities = []

        for entity in bioc_passage.annotations:
            if types and (entity.infons['type'] not in type_filter):
                continue

            entity_ = Entity()
            entity_.id = entity.id
            processed_name = entity.text.replace('\\n', ' ')
            if len(processed_name) == 0:
                logging.debug("{}: entity {} name is empty".format(fileName, entity.id))
                continue
            entity_.name = processed_name

            entity_.type = entity.infons['type']
            entity_.spans.append([entity.locations[0].offset, entity.locations[0].end])

            if ('SNOMED code' in entity.infons and entity.infons['SNOMED code'] != 'N/A') \
                    and ('SNOMED term' in entity.infons and entity.infons['SNOMED term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['SNOMED code'])
                entity_.norm_names.append(entity.infons['SNOMED term'])
                ct_snomed += 1
            elif ('MedDRA code' in entity.infons and entity.infons['MedDRA code'] != 'N/A') \
                    and ('MedDRA term' in entity.infons and entity.infons['MedDRA term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['MedDRA code'])
                entity_.norm_names.append(entity.infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(fileName, entity.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    if opt.nlp_tool == "spacy":
        if isTraining:
            sentences = get_sentences_and_tokens_from_spacy(corpus_file, nlp_tool, document.entities)
        else:
            sentences = get_sentences_and_tokens_from_spacy(corpus_file, nlp_tool, None)
    elif opt.nlp_tool == "nltk":
        if isTraining:
            sentences = get_sentences_and_tokens_from_nltk(corpus_file, nlp_tool, document.entities, None, None)
        else:
            sentences = get_sentences_and_tokens_from_nltk(corpus_file, nlp_tool, None, None, None)
    elif opt.nlp_tool == "stanford":
        if isTraining:
            sentences = get_sentences_and_tokens_from_stanford(corpus_file, nlp_tool, document.entities)
        else:
            sentences = get_sentences_and_tokens_from_stanford(corpus_file, nlp_tool, None)
    else:
        raise RuntimeError("invalid nlp tool")

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed
def load_metamap_result_from_file(file_path):
    re_brackets = re.compile(r'\[[0-9|/]+\]')
    document = Document()
    entities = []

    with codecs.open(file_path, 'r', 'UTF-8') as fp:
        for line in fp.readlines():
            fields = line.strip().split(u"|")

            if fields[1] != u'MMI':
                continue

            ID = fields[0]  # Unique identifier of the text being processed; 00000000 if no identifier is found in the text
            MMI = fields[1]  # Always MMI
            Score = fields[2]  # MetaMap Indexing (MMI) score, with a maximum score of 1000.00
            UMLS_Prefer_Name = fields[3]  # The UMLS preferred name for the UMLS concept
            UMLS_ID = fields[4]  # The CUI of the identified UMLS concept
            Semantic_Type_List = fields[5]  # Comma-separated list of Semantic Type abbreviations
            Trigger_Information = fields[6]  # Comma-separated sextuple showing what triggered MMI to identify this UMLS concept
            Location = fields[7]  # Summarizes where the UMLS concept was found: TI - Title, AB - Abstract, TX - Free Text, TI;AB - Title and Abstract
            Positional_Information = fields[8]  # Semicolon-separated list of positional-information terms, showing StartPos, slash (/), and Length of each trigger identified in the Trigger Information field
            Treecode = fields[9]  # Semicolon-separated list of any MeSH treecodes

            triggers = Trigger_Information[1:-1].split(u",\"")
            spans = Positional_Information.split(u";")
            if len(triggers) != len(spans):
                raise RuntimeError("the number of triggers is not equal to that of spans: {} in {}"
                                   .format(UMLS_ID, file_path[file_path.rfind('/') + 1:]))

            for idx, span in enumerate(spans):
                bracket_spans = re_brackets.findall(span)
                if len(bracket_spans) == 0:  # simple form
                    if span.find(u',') != -1:
                        logging.debug("ignore non-continuous form of Positional_Information: {} in {}"
                                      .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                        continue

                    tmps = span.split(u"/")
                    entity = Entity()
                    entity.spans.append([int(tmps[0]), int(tmps[0]) + int(tmps[1])])
                    entity.norm_ids.append(str(UMLS_ID))

                    # e.g. "B cell lymphoma"-tx-5-"B cell lymphoma"-noun-0
                    tmps = triggers[idx].split(u"-")
                    if tmps[3].find('"') == -1:
                        logging.debug("ignore non-string entity: {} in {}"
                                      .format(tmps[3], file_path[file_path.rfind('/') + 1:]))
                        continue
                    if len(tmps) != 6:
                        logging.debug("parsing trigger error, ignore entity: {} in {}"
                                      .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                        continue

                    entity.name = tmps[3][1:-1]  # remove the surrounding quotes
                    entities.append(entity)
                else:
                    for bracket_span in bracket_spans:
                        if bracket_span.find(u',') != -1:
                            logging.debug("ignore non-continuous form of Positional_Information: {} in {}"
                                          .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                            continue

                        tmps = bracket_span[1:-1].split(u"/")
                        entity = Entity()
                        entity.spans.append([int(tmps[0]), int(tmps[0]) + int(tmps[1])])
                        entity.norm_ids.append(str(UMLS_ID))

                        # e.g. "B cell lymphoma"-tx-5-"B cell lymphoma"-noun-0
                        tmps = triggers[idx].split(u"-")
                        if tmps[3].find('"') == -1:
                            logging.debug("ignore non-string entity: {} in {}"
                                          .format(tmps[3], file_path[file_path.rfind('/') + 1:]))
                            continue
                        if len(tmps) != 6:
                            logging.debug("parsing trigger error, ignore entity: {} in {}"
                                          .format(triggers[idx], file_path[file_path.rfind('/') + 1:]))
                            continue

                        entity.name = tmps[3][1:-1]  # remove the surrounding quotes
                        entities.append(entity)

    document.entities = entities
    return document
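# Hedged usage sketch (not part of the original code): the path is a hypothetical
# fielded MetaMap (MMI) output file, and the record below is illustrative only,
# assembled from the field comments in load_metamap_result_from_file rather than
# copied from real MetaMap output (the CUI and treecode values are placeholders):
#   00000000|MMI|3.75|B-Cell Lymphoma|C0079731|[neop]|["B cell lymphoma"-tx-5-"B cell lymphoma"-noun-0]|TX|228/15|C04.557.386.390
def _demo_load_metamap_result(mmi_path='metamap_out/100035.mmi.txt'):
    doc = load_metamap_result_from_file(mmi_path)
    for e in doc.entities:
        # mention text, [start, end] character span, and the UMLS CUI it maps to
        print(e.name, e.spans[0], e.norm_ids[0])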
def load_data_pubtator(file_path):
    # stat
    ct_doc = 0
    ct_entity = 0

    documents = []
    with codecs.open(file_path, 'r', 'UTF-8') as fp:
        document = None
        for line in fp:
            line = line.strip()
            if line == '':
                if document is None:
                    continue
                else:
                    # save the document
                    documents.append(document)
                    document = None
                    ct_doc += 1
            elif line.find('|t|') != -1:
                # a new document
                document = Document()
                columns = line.split('|t|')
                document.name = columns[0]
                document.text = columns[1] + " "  # pad with a space so abstract offsets (title length + 1) line up
            elif line.find('|a|') != -1:
                columns = line.split('|a|')
                document.text += columns[1]

                generator = nlp_tool.span_tokenize(document.text)
                for t in generator:
                    document.all_sents_inds.append(t)

                for ind in range(len(document.all_sents_inds)):
                    t_start = document.all_sents_inds[ind][0]
                    t_end = document.all_sents_inds[ind][1]

                    tmp_tokens = FoxTokenizer.tokenize(t_start, document.text[t_start:t_end], False)
                    sentence_tokens = []
                    for token_idx, token in enumerate(tmp_tokens):
                        token_dict = {}
                        token_dict['start'], token_dict['end'] = token[1], token[2]
                        token_dict['text'] = token[0]

                        sentence_tokens.append(token_dict)
                    document.sentences.append(sentence_tokens)
            else:
                columns = line.split('\t')

                if columns[1] == 'CID':  # for the cdr corpus, we ignore relations
                    continue

                if columns[4].find("Chemical") != -1:  # for the cdr corpus, we ignore chemicals
                    continue

                entity = Entity()
                entity.spans.append([int(columns[1]), int(columns[2])])
                entity.name = columns[3]
                entity.type = columns[4]

                if columns[5].find('|') != -1:
                    ids = columns[5].split('|')
                    for id in ids:
                        if id == '-1':
                            raise RuntimeError("id == -1")
                        if id.find("OMIM:") != -1:
                            id = id[id.find("OMIM:") + len("OMIM:"):]
                            entity.norm_ids.append(id)
                        else:
                            entity.norm_ids.append(id)
                elif columns[5].find('+') != -1:
                    ids = columns[5].split('+')
                    for id in ids:
                        if id == '-1':
                            raise RuntimeError("id == -1")
                        if id.find("OMIM:") != -1:
                            id = id[id.find("OMIM:") + len("OMIM:"):]
                            entity.norm_ids.append(id)
                        else:
                            entity.norm_ids.append(id)
                else:
                    id = columns[5]
                    if id.find("OMIM:") != -1:
                        id = id[id.find("OMIM:") + len("OMIM:"):]
                        entity.norm_ids.append(id)
                    else:
                        entity.norm_ids.append(id)

                # columns[6]: cdr may have individual mentions, we don't use them yet

                for sent_idx, (sent_start, sent_end) in enumerate(document.all_sents_inds):
                    if entity.spans[0][0] >= sent_start and entity.spans[0][1] <= sent_end:
                        # we assume the entity has only one span
                        entity.sent_idx = sent_idx
                        break
                if entity.sent_idx == -1:
                    logging.debug("can't find entity.sent_idx: {} ".format(entity.name))
                    continue
                    # raise RuntimeError("can't find entity.sent_idx")

                tkStart = -1
                tkEnd = -1
                for tkidx, token_dict in enumerate(document.sentences[entity.sent_idx]):
                    if token_dict['start'] == entity.spans[0][0]:
                        tkStart = tkidx
                    if token_dict['end'] == entity.spans[0][1]:
                        tkEnd = tkidx
                    if tkStart != -1 and tkEnd != -1:
                        break

                if tkStart == -1 or tkEnd == -1:
                    raise RuntimeError('tkStart == -1 or tkEnd == -1')

                entity.tkSpans.append([tkStart, tkEnd])

                document.entities.append(entity)
                ct_entity += 1

    logging.info("document number {}, entity number {}".format(ct_doc, ct_entity))

    return documents
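# Hedged usage sketch (not part of the original code): the file name and the sample
# lines are illustrative placeholders in the PubTator layout the loader above expects,
# i.e. a title line, an abstract line, then one tab-separated mention per line, with a
# blank line between documents (CID relation lines and Chemical mentions are skipped):
#   1234567|t|Famotidine-associated delirium.
#   1234567|a|A series of patients ...
#   1234567<TAB>22<TAB>30<TAB>delirium<TAB>Disease<TAB>D003693
def _demo_load_data_pubtator(path='data/CDR_TestSet.PubTator.txt'):
    docs = load_data_pubtator(path)
    for doc in docs:
        for e in doc.entities:
            # document id, mention text, type, character span, and normalization ids
            print(doc.name, e.name, e.type, e.spans[0], e.norm_ids)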