def load_i2b2_2012(partition='train'):
    """Load the i2b2 2012 corpus and yield (Doc, annotation) pairs.

    Parameters
    ----------
    partition : str
        Either 'train' or 'test'; selects the annotation directory.

    Yields
    ------
    (spacy.tokens.Doc, dict)
        The tokenized document (with ``doc._.id`` set to the file id) and an
        annotation dict with keys 'entities' (tag id -> list of
        (start_char, end_char, label) tuples re-aligned to token boundaries),
        'relations', 'entity_labels' and 'relation_labels'.

    Raises
    ------
    RuntimeError
        If an entity span cannot be aligned with the tokenization even after
        trying a set of small character-offset nudges.
    """
    assert partition in ['train', 'test']
    language = get_language()
    if partition == 'train':
        annotation_dir = os.path.join(base_path, 'ner', 'i2b2_2012',
                                      '2012-07-15.original-annotation.release')
    else:
        annotation_dir = os.path.join(base_path, 'ner', 'i2b2_2012',
                                      'ground_truth', 'merged_xml')
    file_list = [file for file in os.listdir(annotation_dir) if file.endswith('xml')]
    # Strip the '.xml' extension to obtain unique document ids.
    file_ids = sorted(set(file[:-4] for file in file_list))

    annotations = []
    raw_texts = []
    for file_id in file_ids:
        # Use a context manager so the handle is closed (the original leaked it).
        with open(os.path.join(annotation_dir, f'{file_id}.xml'), 'r') as f:
            # Bare '&' is illegal XML; replace with a space (same width, so the
            # character offsets recorded in the annotations stay valid).
            root = ET.fromstring(f.read().strip().replace('&', ' '))
        annotation = {'entities': {}, 'relations': []}
        raw_texts.append(root.findall("./TEXT")[0].text)
        for tag in root.findall("./TAGS")[0]:
            assert isinstance(tag.attrib, dict)
            # Only tags whose id starts with 'E' are kept.
            if not tag.attrib['id'].startswith('E'):
                continue
            annotation['entities'][tag.attrib['id']] = []
            start, end, label = (int(tag.attrib['start']),
                                 int(tag.attrib['end']),
                                 tag.attrib['type'])
            if label == '':
                # Keep the (empty) entity entry but record no span.
                continue
            annotation['entities'][tag.attrib['id']].append((start, end, label))
        annotations.append(annotation)

    raw_texts = list(language.pipe(raw_texts, batch_size=50))

    # Character-offset nudges tried in order when a span does not fall on token
    # boundaries; (0, 0) is the exact span. The original elif cascade contained
    # duplicated, unreachable branches — they were removed; since the first hit
    # wins, behavior is unchanged.
    offsets = ((0, 0), (-1, 0), (0, 1), (0, 2), (-1, -1), (-2, -2),
               (-2, -1), (0, -1), (1, 0), (0, -3), (5, 5))

    def aligned_span(doc, start, end):
        # Return the first token-aligned span among the candidate nudges.
        for d_start, d_end in offsets:
            span = doc.char_span(start + d_start, end + d_end)
            if span is not None:
                return span
        return None

    # Re-align the annotated character spans with the tokenization.
    for file_id, doc, annotation in zip(file_ids, raw_texts, annotations):
        fixed_annotation = {'entities': {}, 'relations': annotation['relations']}
        for idx, key in enumerate(annotation['entities']):
            fixed_annotation['entities'][key] = []
            for span in annotation['entities'][key]:
                char_span = aligned_span(doc, span[0], span[1])
                if char_span is None:
                    # Dump context so the tokenization exceptions can be extended.
                    for token in doc:
                        print(token)
                    print(str(doc)[span[0] - 20:span[1] + 20])
                    print(file_id, span, str(doc)[span[0]:span[1]])
                    raise RuntimeError(
                        'Could not load mention span from %s as it does not align with tokenization. Add \'%s\' to tokenization exceptions.'
                        % (file_id, str(doc)[int(span[0]):int(span[1])]))
                # Drop the span if it overlaps any span of a *different* entity.
                overlapping = False
                for idx2, key2 in enumerate(annotation['entities']):
                    if idx == idx2:
                        continue
                    for s2 in annotation['entities'][key2]:
                        if char_span.start_char <= s2[1] and s2[0] <= char_span.end_char:
                            overlapping = True
                if not overlapping:
                    fixed_annotation['entities'][key].append(
                        (char_span.start_char, char_span.end_char, span[2]))
        fixed_annotation['entity_labels'] = I2B2_2012_NER_LABELS
        fixed_annotation['relation_labels'] = I2B2_2012_RELATION_LABELS
        # Custom Doc extension attribute — presumably registered elsewhere in
        # the project via Doc.set_extension('id'); confirm before reuse.
        doc._.id = file_id
        yield doc, fixed_annotation
def load_quaero_frenchmed(partition='train'):
    """Load the QUAERO FrenchMed 2014 corpus (brat standoff format) and yield
    (Doc, annotation) pairs.

    Parameters
    ----------
    partition : str
        Either 'train' or 'test'; selects the annotation directory.

    Yields
    ------
    (spacy.tokens.Doc, dict)
        The tokenized document (with ``doc._.id`` set to the file id) and an
        annotation dict with keys 'entities' (brat T-id -> list of
        (start_char, end_char, label) tuples re-aligned to token boundaries),
        'relations' ((source_id, target_id, relation) tuples),
        'entity_labels' and 'relation_labels'.

    Spans that cannot be aligned with the tokenization are silently skipped
    (unlike the i2b2 2012 loader, which raises).
    """
    assert partition in ['train', 'test']
    language = get_language()
    subdir = 'train' if partition == 'train' else 'test'
    annotation_dir = os.path.join(base_path, 'ner', 'quaero_frenchmed_2014', subdir)
    file_list = os.listdir(annotation_dir)
    # Strip the extension ('.txt' / '.ann') to obtain unique document ids.
    file_ids = sorted(set(file[:-4] for file in file_list))

    annotations = []
    raw_texts = []
    for file_id in file_ids:
        # Context managers close the handles (the original leaked them).
        with open(os.path.join(annotation_dir, f'{file_id}.txt'), 'r') as f:
            raw_texts.append(f.read().strip())
        with open(os.path.join(annotation_dir, f'{file_id}.ann'), 'r') as f:
            annotation_file = f.read().strip()
        annotation = {'entities': {}, 'relations': []}
        # Convert brat standoff lines into character spans w.r.t. the document.
        for line in annotation_file.strip().split('\n'):
            fields = line.split('\t')
            if not fields[0]:
                continue
            if fields[0][0] == "T":  # entity (text-bound) annotation
                annotation['entities'][fields[0]] = []
                label = fields[1].split(' ')[0]
                # Flatten "start end[;start end ...]" into a flat int list;
                # ';' separates fragments of discontinuous mentions.
                spans = [int(index)
                         for x in fields[1].split(' ')[1:]
                         for index in x.split(';')]
                if len(spans) == 4 and (spans[1] == spans[2] or spans[1] + 1 == spans[2]):
                    # Two adjacent / near-adjacent fragments: merge into one span.
                    annotation['entities'][fields[0]].append((spans[0], spans[3], label))
                else:
                    for i in range(0, len(spans), 2):
                        annotation['entities'][fields[0]].append(
                            (spans[i], spans[i + 1], label))
            if fields[0][0] == "R":  # relation annotation
                parts = fields[1].split(' ')
                relation = parts[0]
                source = parts[1].split(':')[1]
                target = parts[2].split(':')[1]
                annotation['relations'].append((source, target, relation))
        annotations.append(annotation)

    raw_texts = list(language.pipe(raw_texts, batch_size=50))

    # Character-offset nudges tried in order when a span does not fall on token
    # boundaries; (0, 0) is the exact span. The original elif cascade contained
    # duplicated, unreachable branches — they were removed; since the first hit
    # wins, behavior is unchanged.
    offsets = ((0, 0), (-1, 0), (0, 1), (-2, 0), (0, 2), (-1, -1), (-2, -2),
               (-2, -1), (0, -1), (1, 0), (0, -3), (5, 5))

    def aligned_span(doc, start, end):
        # Return the first token-aligned span among the candidate nudges.
        for d_start, d_end in offsets:
            span = doc.char_span(start + d_start, end + d_end)
            if span is not None:
                return span
        return None

    # Re-align the annotated character spans with the tokenization.
    for file_id, doc, annotation in zip(file_ids, raw_texts, annotations):
        fixed_annotation = {'entities': {}, 'relations': annotation['relations']}
        for idx, key in enumerate(annotation['entities']):
            fixed_annotation['entities'][key] = []
            for span in annotation['entities'][key]:
                char_span = aligned_span(doc, span[0], span[1])
                if char_span is None:
                    # Unalignable mention: skip it (best-effort loader).
                    continue
                # Drop the span if it overlaps any span of a *different* entity.
                overlapping = False
                for idx2, key2 in enumerate(annotation['entities']):
                    if idx == idx2:
                        continue
                    for s2 in annotation['entities'][key2]:
                        if char_span.start_char <= s2[1] and s2[0] <= char_span.end_char:
                            overlapping = True
                if not overlapping:
                    fixed_annotation['entities'][key].append(
                        (char_span.start_char, char_span.end_char, span[2]))
        fixed_annotation['entity_labels'] = QUAERO_FRENCHMED_2014_NER_LABELS
        fixed_annotation['relation_labels'] = QUAERO_FRENCHMED_2014_RELATION_LABELS
        # Custom Doc extension attribute — presumably registered elsewhere in
        # the project via Doc.set_extension('id'); confirm before reuse.
        doc._.id = file_id
        yield doc, fixed_annotation
def load_i2b2_2014(partition='train'):
    """Load the i2b2 2014 de-identification (PHI) corpus and yield
    (Doc, annotation) pairs.

    Parameters
    ----------
    partition : str
        Either 'train' (the two Gold-Set directories) or 'test'.

    Yields
    ------
    (spacy.tokens.Doc, dict)
        The tokenized document (with ``doc._.id`` set to the file id) and an
        annotation dict with keys 'entities' (tag id -> list of
        (start_char, end_char, label) tuples re-aligned to token boundaries),
        'relations', 'entity_labels' and 'relation_labels'.

    Spans that cannot be aligned with the tokenization are silently skipped
    (unlike the i2b2 2012 loader, which raises).
    """
    assert partition in ['train', 'test']
    language = get_language()
    if partition == 'train':
        # Training data is split across two gold-set directories.
        annotation_dir = (os.path.join(base_path, 'ner', 'i2b2_2014', 'training-PHI-Gold-Set1'),
                          os.path.join(base_path, 'ner', 'i2b2_2014', 'training-PHI-Gold-Set2'))
        file_list = os.listdir(annotation_dir[0]) + os.listdir(annotation_dir[1])
    else:
        annotation_dir = os.path.join(base_path, 'ner', 'i2b2_2014', 'testing-PHI-Gold-fixed')
        file_list = os.listdir(annotation_dir)
    # Strip the '.xml' extension to obtain unique document ids.
    file_ids = sorted(set(file[:-4] for file in file_list))

    annotations = []
    raw_texts = []
    for file_id in file_ids:
        if partition == 'train':
            # A given file lives in exactly one of the two gold-set dirs.
            path = os.path.join(annotation_dir[0], f'{file_id}.xml')
            if not os.path.exists(path):
                path = os.path.join(annotation_dir[1], f'{file_id}.xml')
        else:
            path = os.path.join(annotation_dir, f'{file_id}.xml')
        # Use a context manager so the handle is closed (the original leaked it).
        with open(path, 'r') as f:
            root = ET.fromstring(f.read().strip())
        annotation = {'entities': {}, 'relations': []}
        raw_texts.append(root.findall("./TEXT")[0].text)
        for tag in root.findall("./TAGS")[0]:
            assert isinstance(tag.attrib, dict)
            annotation['entities'][tag.attrib['id']] = []
            start, end, label = (int(tag.attrib['start']),
                                 int(tag.attrib['end']),
                                 tag.attrib['TYPE'])
            annotation['entities'][tag.attrib['id']].append((start, end, label))
        annotations.append(annotation)

    raw_texts = list(language.pipe(raw_texts, batch_size=50))

    # Character-offset nudges tried in order when a span does not fall on token
    # boundaries; (0, 0) is the exact span. The original elif cascade contained
    # duplicated, unreachable branches — they were removed; since the first hit
    # wins, behavior is unchanged.
    offsets = ((0, 0), (-1, 0), (0, 1), (0, 2), (-1, -1), (-2, -2),
               (-2, -1), (0, -1), (1, 0), (0, -3), (5, 5))

    def aligned_span(doc, start, end):
        # Return the first token-aligned span among the candidate nudges.
        for d_start, d_end in offsets:
            span = doc.char_span(start + d_start, end + d_end)
            if span is not None:
                return span
        return None

    # Re-align the annotated character spans with the tokenization.
    for file_id, doc, annotation in zip(file_ids, raw_texts, annotations):
        fixed_annotation = {'entities': {}, 'relations': annotation['relations']}
        for idx, key in enumerate(annotation['entities']):
            fixed_annotation['entities'][key] = []
            for span in annotation['entities'][key]:
                char_span = aligned_span(doc, span[0], span[1])
                if char_span is None:
                    # Unalignable mention: skip it (best-effort loader).
                    continue
                # Drop the span if it overlaps any span of a *different* entity.
                overlapping = False
                for idx2, key2 in enumerate(annotation['entities']):
                    if idx == idx2:
                        continue
                    for s2 in annotation['entities'][key2]:
                        if char_span.start_char <= s2[1] and s2[0] <= char_span.end_char:
                            overlapping = True
                if not overlapping:
                    fixed_annotation['entities'][key].append(
                        (char_span.start_char, char_span.end_char, span[2]))
        fixed_annotation['entity_labels'] = I2B2_2014_NER_LABELS
        fixed_annotation['relation_labels'] = I2B2_2014_RELATION_LABELS
        # Custom Doc extension attribute — presumably registered elsewhere in
        # the project via Doc.set_extension('id'); confirm before reuse.
        doc._.id = file_id
        yield doc, fixed_annotation