def read_text(pathname):
    with open(pathname) as fp:
        text = fp.read()
    sentences = []
    offset = 0
    for sent in text.split('\n'):
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = pathname.stem
        sentence.offset = offset
        sentence.text = sent
        sentences.append(sentence)
        i = 0
        for m in re.finditer(r'\S+', sent):
            if i == 0 and m.start() != 0:
                # add a fake empty annotation so the first annotation starts at the sentence offset
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = ''
                ann.add_location(bioc.BioCLocation(offset, 0))
                sentence.add_annotation(ann)
                i += 1
            ann = bioc.BioCAnnotation()
            ann.id = f'a{i}'
            ann.text = m.group()
            ann.add_location(bioc.BioCLocation(m.start() + offset, len(m.group())))
            sentence.add_annotation(ann)
            i += 1
        offset += len(sent) + 1
    return sentences
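A minimal usage sketch for read_text, assuming the surrounding module already imports bioc and re; since the function reads pathname.stem, the argument is assumed to be a pathlib.Path. The file name below is only illustrative.

    from pathlib import Path

    sentences = read_text(Path('reports/example.txt'))  # hypothetical input file
    for s in sentences:
        # one BioCSentence per line, one BioCAnnotation per whitespace-delimited token
        print(s.offset, len(s.annotations), s.text)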
def test_get_text():
    s = bioc.BioCSentence()
    s.offset = 100
    s.text = 'no evidence of pulmonary edema'
    loc = (25 + 100, 30 + 100)
    expected = 'no evidence of pulmonary $X$'
    actual = get_text(s, loc)
    assert expected == actual, '{} vs {}'.format(expected, actual)
def sentence(this, json_sent):
    sent = bioc.BioCSentence()
    sent.infons = json_sent['infons']
    sent.offset = str(json_sent['offset'])
    sent.text = json_sent['text']
    sent.annotations = [this.annotation(a) for a in json_sent['annotations']]
    sent.relations = [this.relation(r) for r in json_sent['relations']]
    return sent
def test_load():
    s = biocjson.fromJSON(json.loads(json_str), bioc.SENTENCE)
    g = semgraph.load(s)
    assert len(g) == 16
    assert g.size() == 15

    s = bioc.BioCSentence()
    g = semgraph.load(s)
    assert len(g) == 0
    assert g.size() == 0
def create_ddi_bert(gold_directory, output):
    with open(output, 'w') as fp:
        writer = csv.writer(fp, delimiter='\t', lineterminator='\n')
        writer.writerow(['index', 'sentence', 'label'])
        cnt = 0
        for root, dirs, files in os.walk(gold_directory):
            for name in files:
                pathname = os.path.join(root, name)
                tree = etree.parse(pathname)
                for stag in tree.xpath('/document/sentence'):
                    sentence = bioc.BioCSentence()
                    sentence.offset = 0
                    sentence.text = stag.get('text')
                    entities = {}
                    for etag in stag.xpath('entity'):
                        id = etag.get('id')
                        m = re.match(r'(\d+)-(\d+)', etag.get('charOffset'))
                        if m is None:
                            logging.warning('{}:{}: charOffset does not match. {}'.format(
                                output, id, etag.get('charOffset')))
                            continue
                        start = int(m.group(1))
                        end = int(m.group(2)) + 1
                        expected_text = etag.get('text')
                        actual_text = sentence.text[start:end]
                        if expected_text != actual_text:
                            logging.warning('{}:{}: Text does not match. Expected {}. Actual {}'.format(
                                output, id, repr(expected_text), repr(actual_text)))
                        entities[id] = {
                            'start': start,
                            'end': end,
                            'type': etag.get('type'),
                            'id': id,
                            'text': actual_text
                        }
                    for rtag in stag.xpath('pair'):
                        if rtag.get('ddi') == 'false':
                            label = 'DDI-false'
                        else:
                            label = 'DDI-{}'.format(rtag.get('type'))
                        cnt += 1
                        e1 = entities.get(rtag.get('e1'))
                        e2 = entities.get(rtag.get('e2'))
                        text = replace_text(sentence.text, sentence.offset, e1, e2)
                        writer.writerow([f'{rtag.get("id")}', text, label])
    print(f'Have {cnt} relations')
def __call__(self, doc, *args, **kwargs):
    """
    Split text into sentences with offsets.

    Args:
        doc (BioCDocument): one document

    Returns:
        BioCDocument
    """
    for passage in doc.passages:
        for text, offset in self.split(passage.text):
            sentence = bioc.BioCSentence()
            sentence.offset = offset + passage.offset
            sentence.text = text
            passage.add_sentence(sentence)
        # passage.text = None
    return doc
def tokenize_text(text, id):
    sentences = []
    doc = nlp(text)
    for sent in doc.sents:
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = id
        sentence.offset = sent.start_char
        sentence.text = text[sent.start_char:sent.end_char]
        sentences.append(sentence)
        i = 0
        for token in sent:
            for t, start, end in split_punct(token.text, token.idx):
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = t
                ann.add_location(bioc.BioCLocation(start, end - start))
                sentence.add_annotation(ann)
                i += 1
    return sentences
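tokenize_text relies on a module-level nlp pipeline and a split_punct helper that are not shown here. A hedged sketch of those assumptions, using spaCy for sentence segmentation and a hypothetical pass-through split_punct:

    import spacy

    nlp = spacy.load('en_core_web_sm')  # assumed spaCy pipeline providing doc.sents and token.idx

    def split_punct(text, offset):
        # hypothetical stand-in: yield the token unchanged as (text, start, end)
        yield text, offset, offset + len(text)

    sentences = tokenize_text('No evidence of pneumonia. Heart size is normal.', 'report-1')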
def ssplit(document, splitter):
    """
    Split text into sentences with offsets.

    Args:
        splitter (Splitter): sentence splitter
        document (BioCDocument): one document

    Returns:
        BioCDocument
    """
    for passage in document.passages:
        for text, offset in splitter.split(passage.text):
            sentence = bioc.BioCSentence()
            sentence.offset = offset + passage.offset
            sentence.text = text
            passage.add_sentence(sentence)
        # passage.text = None
    return document
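The Splitter type is not defined in this snippet; all ssplit needs is an object whose split(text) yields (sentence_text, offset) pairs. A hypothetical regex-based splitter, purely for illustration:

    import re
    import bioc

    class RegexSplitter:
        # hypothetical splitter: every '.'-terminated chunk becomes a sentence,
        # paired with its character offset within the passage text
        def split(self, text):
            return [(m.group(), m.start()) for m in re.finditer(r'\S[^.]*\.', text)]

    passage = bioc.BioCPassage()
    passage.offset = 0
    passage.text = 'No evidence of pneumonia. Heart size is normal.'
    document = bioc.BioCDocument()
    document.add_passage(passage)
    document = ssplit(document, RegexSplitter())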
def text_to_bioc(list_of_text, type, **kwargs):
    if type == 'p/s':
        offset = 0
        passage = bioc.BioCPassage()
        passage.offset = offset
        for s in list_of_text:
            sentence = bioc.BioCSentence()
            sentence.offset = offset
            sentence.text = s
            offset += len(s) + 1
            passage.add_sentence(sentence)
        return passage
    elif type == 'd/p/s':
        document = bioc.BioCDocument()
        passage = text_to_bioc(list_of_text, 'p/s')
        document.add_passage(passage)
        return document
    elif type == 'c/d/p/s':
        c = bioc.BioCCollection()
        d = text_to_bioc(list_of_text, 'd/p/s')
        c.add_document(d)
        return c
    elif type == 'd/p':
        document = bioc.BioCDocument()
        offset = 0
        for s in list_of_text:
            passage = bioc.BioCPassage()
            passage.offset = offset
            offset += len(s) + 1
            passage.text = s
            document.add_passage(passage)
        return document
    elif type == 'c/d/p':
        c = bioc.BioCCollection()
        d = text_to_bioc(list_of_text, 'd/p')
        c.add_document(d)
        return c
    else:
        raise KeyError(type)
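A brief usage sketch for text_to_bioc; the sentences are placeholders. With the 'c/d/p/s' layout it returns a BioCCollection whose single document holds one passage, with one BioCSentence per input string and offsets advancing by sentence length plus one:

    collection = text_to_bioc(['No evidence of pneumonia.', 'Heart size is normal.'], 'c/d/p/s')
    sent = collection.documents[0].passages[0].sentences[1]
    print(sent.offset, sent.text)  # 26 Heart size is normal.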
def add_dependency(self, obj: Dict) -> None:
    # create bioc sentence
    sentence = bioc.BioCSentence()
    sentence.offset = 0
    sentence.text = obj['text']
    sentence.infons['parse tree'] = obj['parse tree']
    self.convert_biocsentence(sentence)

    m = {}
    for i, tok in enumerate(obj['toks']):
        tok['id'] = i
        # find bioc annotation
        found = False
        for ann in sentence.annotations:
            loc = ann.total_span
            if utils.intersect((tok['start'], tok['end']),
                               (loc.offset, loc.offset + loc.length)):
                if ann.id in m:
                    logging.debug('Duplicated id mapping: %s', ann.id)
                m[ann.id] = i
                if 'ROOT' in ann.infons:
                    tok['ROOT'] = True
                found = True
                break
        if not found:
            logging.debug('Cannot find %s in \n%s', tok, obj['id'])

    for rel in sentence.relations:
        node0 = rel.nodes[0]
        node1 = rel.nodes[1]
        if node0.refid in m and node1.refid in m:
            if node0.role == 'governor':
                gov = m[node0.refid]
                dep = m[node1.refid]
            else:
                gov = m[node1.refid]
                dep = m[node0.refid]
            if gov == dep:
                logging.debug('Discard self loop')
                continue
            tok = obj['toks'][dep]
            if 'governor' in tok:
                if tok['governor'] == gov:
                    pass
                if 'extra' in rel.infons:
                    pass
                else:
                    logging.debug('%s: Two heads: %s', obj['id'], str(rel))
            else:
                tok['governor'] = gov
                tok['dependency'] = rel.infons['dependency']
        else:
            ann0 = None
            ann1 = None
            for annotation in sentence.annotations:
                if annotation.id == node0.refid:
                    ann0 = annotation
                if annotation.id == node1.refid:
                    ann1 = annotation
            logging.debug('Cannot find %s or %s in sentence: %s', node0, node1, obj['id'])
            logging.debug('%s', ann0)
            logging.debug('%s', ann1)