Example #1
import re

import bioc


def read_text(pathname):
    # `pathname` is expected to be a pathlib.Path (its .stem is used below)
    with open(pathname) as fp:
        text = fp.read()
    sentences = []
    offset = 0
    for sent in text.split('\n'):
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = pathname.stem
        sentence.offset = offset
        sentence.text = sent
        sentences.append(sentence)
        i = 0
        for m in re.finditer(r'\S+', sent):
            if i == 0 and m.start() != 0:
                # insert an empty leading annotation when the sentence starts with whitespace
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = ''
                ann.add_location(bioc.BioCLocation(offset, 0))
                sentence.add_annotation(ann)
                i += 1
            ann = bioc.BioCAnnotation()
            ann.id = f'a{i}'
            ann.text = m.group()
            ann.add_location(bioc.BioCLocation(m.start() + offset, len(m.group())))
            sentence.add_annotation(ann)
            i += 1
        offset += len(sent) + 1
    return sentences
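A minimal usage sketch (the file name is hypothetical; `read_text` expects a pathlib.Path so that `.stem` works):

from pathlib import Path

sentences = read_text(Path('notes.txt'))
for s in sentences:
    print(s.offset, len(s.annotations), repr(s.text))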
Example #2
import bioc

# `get_text` is assumed to be imported from the module under test


def test_get_text():
    s = bioc.BioCSentence()
    s.offset = 100
    s.text = 'no evidence of pulmonary edema'
    loc = (25 + 100, 30 + 100)  # the span of 'edema', shifted by the sentence offset

    expected = 'no evidence of pulmonary $X$'
    actual = get_text(s, loc)
    assert expected == actual, '{} vs {}'.format(expected, actual)
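The assertion implies the contract of `get_text`: `loc` is given in document coordinates (sentence offset already added), and the located span is masked with '$X$'. A hypothetical equivalent, not the project's actual implementation:

def get_text(sentence, loc):
    # shift document offsets back to sentence-local offsets, then mask the span
    start = loc[0] - sentence.offset
    end = loc[1] - sentence.offset
    return sentence.text[:start] + '$X$' + sentence.text[end:]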
Example #3
    def sentence(self, json_sent):
        sent = bioc.BioCSentence()
        sent.infons = json_sent['infons']
        sent.offset = json_sent['offset']  # BioC offsets are ints; do not stringify
        sent.text = json_sent['text']
        sent.annotations = [self.annotation(a)
                            for a in json_sent['annotations']]
        sent.relations = [self.relation(r)
                          for r in json_sent['relations']]
        return sent
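A hedged usage sketch with a minimal JSON-like dict; `decoder` stands in for whatever object owns this method, and the empty lists keep the `annotation`/`relation` callbacks from being invoked:

json_sent = {
    'infons': {},
    'offset': 27,
    'text': 'No acute disease.',
    'annotations': [],
    'relations': [],
}
sent = decoder.sentence(json_sent)  # `decoder` is hypothetical
assert sent.offset == 27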
Example #4
import json

import bioc
from bioc import biocjson

# `json_str` and `semgraph` are assumed to be provided elsewhere in the test module


def test_load():
    s = biocjson.fromJSON(json.loads(json_str), bioc.SENTENCE)
    g = semgraph.load(s)
    assert len(g) == 16
    assert g.size() == 15

    s = bioc.BioCSentence()
    g = semgraph.load(s)
    assert len(g) == 0
    assert g.size() == 0
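If `semgraph.load` returns a networkx-style graph (an assumption based on the API used), `len(g)` is the node count and `g.size()` the edge count, so the first half of the test describes a 16-node, 15-edge graph, i.e. a tree. The shape can be sanity-checked in isolation:

import networkx as nx

g = nx.path_graph(16)  # 16 nodes, 15 edges, matching the asserted sizes
assert len(g) == 16 and g.size() == 15
assert nx.is_tree(g)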
Example #5
import csv
import logging
import os
import re

import bioc
from lxml import etree

# `replace_text` is assumed to be defined elsewhere in the module


def create_ddi_bert(gold_directory, output):
    fp = open(output, 'w')
    writer = csv.writer(fp, delimiter='\t', lineterminator='\n')
    writer.writerow(['index', 'sentence', 'label'])
    cnt = 0
    for root, dirs, files in os.walk(gold_directory):
        for name in files:
            pathname = os.path.join(root, name)
            tree = etree.parse(pathname)
            for stag in tree.xpath('/document/sentence'):
                sentence = bioc.BioCSentence()
                sentence.offset = 0
                sentence.text = stag.get('text')

                entities = {}
                for etag in stag.xpath('entity'):
                    id = etag.get('id')
                    m = re.match(r'(\d+)-(\d+)', etag.get('charOffset'))
                    if m is None:
                        logging.warning('{}:{}: charOffset does not match. {}'.format(
                            output, id, etag.get('charOffset')))
                        continue
                    start = int(m.group(1))
                    end = int(m.group(2)) + 1
                    expected_text = etag.get('text')
                    actual_text = sentence.text[start:end]
                    if expected_text != actual_text:
                        logging.warning('{}:{}: Text does not match. Expected {}. Actual {}'.format(
                            output, id, repr(expected_text), repr(actual_text)))
                    entities[id] = {
                        'start': start,
                        'end': end,
                        'type': etag.get('type'),
                        'id': id,
                        'text': actual_text
                    }
                for rtag in stag.xpath('pair'):
                    if rtag.get('ddi') == 'false':
                        label = 'DDI-false'
                    else:
                        label = 'DDI-{}'.format(rtag.get('type'))
                        cnt += 1
                    e1 = entities.get(rtag.get('e1'))
                    e2 = entities.get(rtag.get('e2'))
                    text = replace_text(sentence.text, sentence.offset, e1, e2)
                    writer.writerow([f'{rtag.get("id")}', text, label])

    fp.close()
    print(f'Have {cnt} relations')
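A hypothetical invocation (the corpus path is made up; the DDI gold data is one XML file per document, which `os.walk` picks up recursively):

create_ddi_bert('DDICorpus/Test/DrugBank', 'ddi_test.tsv')

with open('ddi_test.tsv') as fp:
    print(fp.readline().rstrip())  # index<TAB>sentence<TAB>label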
Example #6
    def __call__(self, doc, *args, **kwargs):
        """
        Split text into sentences with offsets.

        Args:
            doc (BioCDocument): one document

        Returns:
            BioCDocument
        """
        for passage in doc.passages:
            for text, offset in self.split(passage.text):
                sentence = bioc.BioCSentence()
                sentence.offset = offset + passage.offset
                sentence.text = text
                passage.add_sentence(sentence)
            # passage.text = None
        return doc
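Because splitting is implemented via `__call__`, an instance composes like any other document-level stage. A sketch, where `SentenceSplitter` is a hypothetical concrete class exposing this method plus a `split(text)` generator:

import bioc

doc = bioc.BioCDocument()
passage = bioc.BioCPassage()
passage.offset = 0
passage.text = 'No pneumothorax. Heart size is normal.'
doc.add_passage(passage)

doc = SentenceSplitter()(doc)  # hypothetical class
for s in doc.passages[0].sentences:
    print(s.offset, s.text)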
Example #7
import bioc

# `nlp` (a spaCy pipeline) and `split_punct` are assumed to be defined at module level


def tokenize_text(text, id):
    sentences = []
    doc = nlp(text)
    for sent in doc.sents:
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = id
        sentence.offset = sent.start_char
        sentence.text = text[sent.start_char:sent.end_char]
        sentences.append(sentence)
        i = 0
        for token in sent:
            for t, start, end in split_punct(token.text, token.idx):
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = t
                ann.add_location(bioc.BioCLocation(start, end - start))
                sentence.add_annotation(ann)
                i += 1
    return sentences
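A usage sketch, assuming `nlp` is a module-level spaCy pipeline and `split_punct` yields (token, start, end) triples as the inner loop expects:

import spacy

nlp = spacy.load('en_core_web_sm')  # model must be installed separately
sentences = tokenize_text('No effusion. Lungs are clear.', 'report-1')
print([s.text for s in sentences])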
Example #8
import bioc


def ssplit(document, splitter):
    """
    Split text into sentences with offsets.

    Args:
        splitter(Splitter): Sentence splitter
        document(BioCDocument): one document

    Returns:
        BioCDocument
    """
    for passage in document.passages:
        for text, offset in splitter.split(passage.text):
            sentence = bioc.BioCSentence()
            sentence.offset = offset + passage.offset
            sentence.text = text
            passage.add_sentence(sentence)
        # passage.text = None
    return document
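`ssplit` only requires an object with a `split(text)` method yielding (sentence, offset) pairs. A minimal hand-rolled splitter that satisfies this contract (a sketch, not the library's own Splitter class):

import re

class NewlineSplitter:
    def split(self, text):
        # yield every non-empty line along with its character offset in the passage
        for m in re.finditer(r'[^\n]+', text):
            yield m.group(), m.start()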
Example #9
import bioc


def text_to_bioc(list_of_text, type, **kwargs):
    if type == 'p/s':
        offset = 0
        passage = bioc.BioCPassage()
        passage.offset = offset
        for s in list_of_text:
            sentence = bioc.BioCSentence()
            sentence.offset = offset
            sentence.text = s
            offset += len(s) + 1
            passage.add_sentence(sentence)
        return passage
    elif type == 'd/p/s':
        document = bioc.BioCDocument()
        passage = text_to_bioc(list_of_text, 'p/s')
        document.add_passage(passage)
        return document
    elif type == 'c/d/p/s':
        c = bioc.BioCCollection()
        d = text_to_bioc(list_of_text, 'd/p/s')
        c.add_document(d)
        return c
    elif type == 'd/p':
        document = bioc.BioCDocument()
        offset = 0
        for s in list_of_text:
            passage = bioc.BioCPassage()
            passage.offset = offset
            offset += len(s) + 1
            passage.text = s
            document.add_passage(passage)
        return document
    elif type == 'c/d/p':
        c = bioc.BioCCollection()
        d = text_to_bioc(list_of_text, 'd/p')
        c.add_document(d)
        return c
    else:
        raise KeyError(type)
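A quick sketch exercising two of the layout codes; offsets advance by len(text) + 1 to account for an implied separator:

passage = text_to_bioc(['First sentence.', 'Second sentence.'], 'p/s')
assert passage.sentences[1].offset == len('First sentence.') + 1

collection = text_to_bioc(['First sentence.', 'Second sentence.'], 'c/d/p/s')
assert len(collection.documents) == 1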
Example #10
    def add_dependency(self, obj: Dict) -> None:
        # create bioc sentence
        sentence = bioc.BioCSentence()
        sentence.offset = 0
        sentence.text = obj['text']
        sentence.infons['parse tree'] = obj['parse tree']
        self.convert_biocsentence(sentence)

        m = {}
        for i, tok in enumerate(obj['toks']):
            tok['id'] = i
            # find bioc annotation
            found = False
            for ann in sentence.annotations:
                loc = ann.total_span
                if utils.intersect((tok['start'], tok['end']),
                                   (loc.offset, loc.offset + loc.length)):
                    if ann.id in m:
                        logging.debug('Duplicated id mapping: %s', ann.id)
                    m[ann.id] = i
                    if 'ROOT' in ann.infons:
                        tok['ROOT'] = True
                    found = True
                    break
            if not found:
                logging.debug('Cannot find %s in \n%s', tok, obj['id'])

        for rel in sentence.relations:
            node0 = rel.nodes[0]
            node1 = rel.nodes[1]
            if node0.refid in m and node1.refid in m:
                if node0.role == 'governor':
                    gov = m[node0.refid]
                    dep = m[node1.refid]
                else:
                    gov = m[node1.refid]
                    dep = m[node0.refid]
                if gov == dep:
                    logging.debug('Discard self loop')
                    continue
                tok = obj['toks'][dep]
                if 'governor' in tok:
                    if tok['governor'] == gov:
                        # same head recorded twice; nothing to do
                        pass
                    elif 'extra' in rel.infons:
                        # extra (enhanced) dependencies may legitimately add a second head
                        pass
                    else:
                        logging.debug('%s: Two heads: %s', obj['id'], str(rel))
                else:
                    tok['governor'] = gov
                    tok['dependency'] = rel.infons['dependency']
            else:
                ann0 = None
                ann1 = None
                for annotation in sentence.annotations:
                    if annotation.id == node0.refid:
                        ann0 = annotation
                    if annotation.id == node1.refid:
                        ann1 = annotation
                logging.debug('Cannot find %s or %s in sentence: %s', node0,
                              node1, obj['id'])
                logging.debug('%s', ann0)
                logging.debug('%s', ann1)
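The token/annotation alignment hinges on `utils.intersect`, which presumably tests whether two half-open ranges overlap. A hypothetical one-liner with the semantics the call site needs:

def intersect(a, b):
    # True when the half-open ranges (a[0], a[1]) and (b[0], b[1]) overlap
    return a[0] < b[1] and b[0] < a[1]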