def parse_one_gold_file(annotation_dir, corpus_dir, fileName):
    """Build a ``Document`` from one gold BioC annotation file and its text.

    Only ``passages[0]`` of the first item returned by ``get_bioc_file`` is
    read.  An annotation is kept when all of the following hold:

    * its ``infons['type']`` is in the module-level ``type_we_care``;
    * its text is non-empty after the two-character sequence
      backslash + ``n`` has been replaced by a space;
    * it carries a SNOMED code/term pair or, failing that, a MedDRA
      code/term pair in which neither value is the placeholder ``'N/A'``.

    Only the first location of each annotation is recorded as a span.

    Args:
        annotation_dir: Directory that contains the BioC annotation file.
        corpus_dir: Directory that contains the raw text file, whose name is
            ``fileName`` cut at the first occurrence of ``'.bioc'``.
        fileName: Name of the annotation file, without any directory part.

    Returns:
        The ``Document`` with ``name``, ``entities`` and ``text`` filled in.
    """
    document = Document()
    # The document name is everything before the first '.'.  split() is used
    # instead of slicing with str.find(), because find() returns -1 when
    # there is no dot and the slice would then drop the last character.
    document.name = fileName.split('.', 1)[0]

    annotation_file = get_bioc_file(os.path.join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]

    entities = []
    for annotation in bioc_passage.annotations:
        infons = annotation.infons
        if infons['type'] not in type_we_care:
            continue

        entity_ = Entity()
        entity_.id = annotation.id

        # Note: this replaces a literal backslash followed by 'n', not a
        # newline character.
        processed_name = annotation.text.replace('\\n', ' ')
        if not processed_name:
            logging.debug("{}: entity {} name is empty".format(
                fileName, annotation.id))
            continue

        entity_.name = processed_name
        entity_.type = infons['type']
        first_location = annotation.locations[0]
        entity_.spans.append([first_location.offset, first_location.end])

        # SNOMED takes precedence over MedDRA; both the code and the term
        # must be present and different from 'N/A'.
        if ('SNOMED code' in infons and infons['SNOMED code'] != 'N/A'
                and 'SNOMED term' in infons
                and infons['SNOMED term'] != 'N/A'):
            norm_id = infons['SNOMED code']
            norm_name = infons['SNOMED term']
        elif ('MedDRA code' in infons and infons['MedDRA code'] != 'N/A'
                and 'MedDRA term' in infons
                and infons['MedDRA term'] != 'N/A'):
            norm_id = infons['MedDRA code']
            norm_name = infons['MedDRA term']
        else:
            logging.debug("{}: no norm id in entity {}".format(
                fileName, annotation.id))
            # some entities may have no norm id
            continue

        entity_.norm_ids.append(norm_id)
        entity_.norm_names.append(norm_name)
        entities.append(entity_)

    document.entities = entities

    document.text = get_text_file(
        os.path.join(corpus_dir, fileName.split('.bioc')[0]))

    return document
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool,
                   isTraining, types, type_filter):
    """Load one document, its normalized entities and its sentences.

    When ``annotation_dir`` is truthy, ``passages[0]`` of the first item
    returned by ``get_bioc_file`` is read.  An annotation is kept when its
    text is non-empty (after the two-character sequence backslash + ``n``
    has been replaced by a space) and it carries a SNOMED code/term pair or,
    failing that, a MedDRA code/term pair in which neither value is
    ``'N/A'``.  Only the first location of each annotation is recorded.

    When ``annotation_dir`` is falsy, no annotations are read,
    ``document.entities`` is not assigned here, and all counters stay 0.

    The tokenizer is chosen by the module-level ``opt.nlp_tool`` string
    (``"spacy"``, ``"nltk"`` or ``"stanford"``), not by the ``nlp_tool``
    argument, which is only forwarded to the chosen helper.

    Args:
        fileName: Name of the annotation file, without any directory part.
        annotation_dir: Directory with the BioC annotation file, or a falsy
            value to skip reading annotations.
        corpus_dir: Directory with the raw text file, whose name is
            ``fileName`` cut at the first occurrence of ``'.bioc'``.
        nlp_tool: Object passed through to the sentence/token helper.
        isTraining: When truthy, ``document.entities`` is handed to the
            sentence/token helper; otherwise ``None`` is passed instead.
        types: When truthy, annotations whose ``infons['type']`` is not in
            ``type_filter`` are skipped.
        type_filter: Container of accepted entity types (used only when
            ``types`` is truthy).

    Returns:
        A tuple ``(document, ct_snomed, ct_meddra, ct_unnormed)``: the
        populated ``Document`` and the number of annotations normalized to
        SNOMED, normalized to MedDRA, and skipped for lacking both.

    Raises:
        RuntimeError: If ``opt.nlp_tool`` is not one of the supported names.
    """
    document = Document()
    # The document name is everything before the first '.'.  split() is used
    # instead of slicing with str.find(), because find() returns -1 when
    # there is no dot and the slice would then drop the last character.
    document.name = fileName.split('.', 1)[0]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        bioc_passage = annotation_file[0].passages[0]

        entities = []
        for annotation in bioc_passage.annotations:
            infons = annotation.infons
            if types and (infons['type'] not in type_filter):
                continue

            entity_ = Entity()
            entity_.id = annotation.id

            # Note: this replaces a literal backslash followed by 'n', not a
            # newline character.
            processed_name = annotation.text.replace('\\n', ' ')
            if not processed_name:
                logging.debug("{}: entity {} name is empty".format(
                    fileName, annotation.id))
                continue

            entity_.name = processed_name
            entity_.type = infons['type']
            first_location = annotation.locations[0]
            entity_.spans.append([first_location.offset, first_location.end])

            # SNOMED takes precedence over MedDRA; both the code and the
            # term must be present and different from 'N/A'.
            if ('SNOMED code' in infons and infons['SNOMED code'] != 'N/A'
                    and 'SNOMED term' in infons
                    and infons['SNOMED term'] != 'N/A'):
                entity_.norm_ids.append(infons['SNOMED code'])
                entity_.norm_names.append(infons['SNOMED term'])
                ct_snomed += 1
            elif ('MedDRA code' in infons and infons['MedDRA code'] != 'N/A'
                    and 'MedDRA term' in infons
                    and infons['MedDRA term'] != 'N/A'):
                entity_.norm_ids.append(infons['MedDRA code'])
                entity_.norm_names.append(infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(
                    fileName, annotation.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    # Validate the configured tool before touching document.entities so an
    # unsupported name always surfaces as RuntimeError.
    tool_name = opt.nlp_tool
    if tool_name not in ("spacy", "nltk", "stanford"):
        raise RuntimeError("invalid nlp tool")

    # Gold entities are only given to the tokenizer in training mode.
    gold_entities = document.entities if isTraining else None

    if tool_name == "spacy":
        sentences = get_sentences_and_tokens_from_spacy(
            corpus_file, nlp_tool, gold_entities)
    elif tool_name == "nltk":
        sentences = get_sentences_and_tokens_from_nltk(
            corpus_file, nlp_tool, gold_entities, None, None)
    else:  # "stanford" -- the only remaining validated value
        sentences = get_sentences_and_tokens_from_stanford(
            corpus_file, nlp_tool, gold_entities)

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed