Example #1
def processPassageAnnotation(pred, true, countDict, checkID):
    """Accumulate TP / FP / FN counts for Gene annotations in a single passage."""
    anns_pred = pred['annotations']
    anns_true = Annotation.sortAnns(true['annotations'], TBDFilter=False)
    # Predicted Gene annotations: each one is either a true positive or a false positive.
    for ann_pred in anns_pred:
        if Annotation.gettype(ann_pred) == 'Gene':
            countDict['pred_gene_num'] += 1
        else:
            continue
        flag = False
        for ann_true in anns_true:
            if Annotation.isSame(ann_pred, ann_true, checkID):
                countDict['tp'] += 1
                flag = True
                break
        if not flag:
            countDict['fp'] += 1
            # print(ann_pred)

    # Gold annotations with no matching prediction are false negatives.
    countDict['true_gene_num'] += len(anns_true)
    for ann_true in anns_true:
        flag = False
        for ann_pred in anns_pred:
            if Annotation.isSame(ann_pred, ann_true, checkID):
                flag = True
                break
        if not flag:
            countDict['fn'] += 1
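
A minimal usage sketch, assuming paired predicted and gold BioC-style passages and the count keys this function increments; `pred_doc`, `gold_doc`, and the precision/recall arithmetic are illustrative additions, not part of the original module.

# Hypothetical driver: accumulate counts over aligned passages, then derive metrics.
countDict = {'tp': 0, 'fp': 0, 'fn': 0, 'pred_gene_num': 0, 'true_gene_num': 0}
for passage_pred, passage_true in zip(pred_doc['passages'], gold_doc['passages']):
    processPassageAnnotation(passage_pred, passage_true, countDict, checkID=False)

# Every predicted Gene annotation was counted as either TP or FP above.
precision = countDict['tp'] / max(countDict['tp'] + countDict['fp'], 1)
recall = countDict['tp'] / max(countDict['tp'] + countDict['fn'], 1)
f1 = 2 * precision * recall / max(precision + recall, 1e-8)
print('P={:.3f} R={:.3f} F1={:.3f}'.format(precision, recall, f1))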
Example #2
def _ner_document_process(document, tokenizer):
    """Convert a BioC-style document into parallel token / BIO-tag lists."""
    text_li = []
    tag_li = []

    def f(text):
        # str -> List[str]: tokenize and merge WordPiece sub-tokens back into whole words.
        # [1:-1] drops the special tokens (e.g. [CLS] / [SEP]) added by the tokenizer.
        x = tokenizer.convert_ids_to_tokens(tokenizer(text)['input_ids'])[1:-1]
        tokens = []
        for token in x:
            if token[:2] == '##':
                tokens[-1] += token[2:]
            else:
                tokens.append(token)
        return tokens
    for passage in document['passages']:
        anns = passage['annotations']
        anns = Annotation.sortAnns(anns, TBDFilter=False)
        text = passage['text']
        offset_p = passage['offset']
        index = 0
        if len(anns) == 0:
            tokens = f(text)
            text_li.extend(tokens)
            tag_li.extend(['O'] * len(tokens))
        else:
            for ann in anns:
                for i, location in enumerate(ann['locations']): # unnecessary currently because of filter in `Annotation.sortAnns`
                    if i > 0:
                        print("WARNING: PMID:{}, Ann id:{} Text:{}".format(
                            document['id'], ann['id'], ann['text']))
                    offset = location['offset']
                    length = location['length']
                    # Text between the previous position and this annotation is outside ('O').
                    tokens = f(text[index:offset-offset_p])
                    text_li.extend(tokens)
                    tag_li.extend(['O'] * len(tokens))
                    if i == len(ann['locations']) - 1:
                        # The mention gets 'B' on its first token and 'I' on the rest.
                        mention = text[offset-offset_p: offset-offset_p+length]
                        tokens = f(mention)
                        assert mention == ann['text'], mention + '\t' + ann['text'] +'\t'+ document['id']
                        assert len(tokens) > 0
                        tag_li.extend(['B'] + ['I']*(len(tokens) - 1))
                        text_li.extend(tokens)
                    index = max(offset - offset_p + length, index)
            # Any text remaining after the last annotation is outside ('O').
            tokens = f(text[index:])
            text_li.extend(tokens)
            tag_li.extend(['O']*len(tokens))
    assert len(text_li) == len(tag_li)
    return text_li, tag_li
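
A minimal usage sketch, assuming a HuggingFace WordPiece tokenizer (the `bert-base-cased` checkpoint is an illustrative choice) and a BioC-style `document` dict with a 'passages' list; the '##' merging in `f` above presumes exactly this kind of tokenizer.

# Hypothetical call: produce parallel (token, BIO-tag) sequences for one document.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokens, tags = _ner_document_process(document, tokenizer)
for token, tag in zip(tokens, tags):
    print(token, tag, sep='\t')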
Example #3
def BIO2Documents(documents_, BIO_file_path):
    '''
    Convert a BIO-tag file back into documents with annotations.
    :param documents_: a `list` of documents from which the BIO-tag file was generated
    :param BIO_file_path: the path of the BIO-tag file
    '''
    sanity_check = __name__ == '__main__'  # FOR DEBUG
    BIOGenerator = _readBIOResult(BIO_file_path)
    documents = copy.deepcopy(documents_)
    for document in documents:
        passage_offset = 0
        idx_start, idx_end = 0, 0
        flag_start = False
        for passage in document['passages']:
            anns = []
            text = passage['text']
            while True:
                if sanity_check:
                    word, tag = next(BIOGenerator)
                    # print(idx_start, word, tag, _, sep='\t')
                else:
                    word, _, tag = next(BIOGenerator)
                    # print(idx_end, word, _, tag, sep='\t')
                # The tokenizer may emit '[UNK]'; fall back to the first character of the remaining text.
                if word == '[UNK]':
                    word = text[idx_end:].strip()[0]
                try:
                    idx_start = idx_end + text[idx_end:].index(word)
                    idx_end = idx_start + len(word)
                except ValueError as e:
                    print(e)
                    print(word)
                    print(text)
                    raise Exception("token '{}' not found in passage text".format(word))

                if tag == 'B':
                    # 'B' opens a new Gene annotation at the current word.
                    flag_start = True
                    ann = {'text': word,
                           "infons": {
                               'type': 'Gene',
                               'NCBI GENE': 'TBD'
                           },
                           'locations': [
                               {
                                   'length': len(word),
                                   'offset': passage_offset + idx_start
                               }
                           ]}
                    anns.append(ann)
                    continue
                if flag_start:
                    if tag == 'O':
                        flag_start = False
                    elif tag == 'I':
                        # 'I' extends the open annotation through the end of this word.
                        ann['locations'][0]['length'] = passage_offset + \
                            idx_end - ann['locations'][0]['offset']
                        start_idx = ann['locations'][0]['offset'] - \
                            passage_offset
                        ann['text'] = text[start_idx:start_idx +
                                           ann['locations'][0]['length']]
                if idx_end == len(text) or idx_end == len(text.rstrip()):
                    # passage over
                    passage_offset += (len(text) + 1)
                    idx_start, idx_end = 0, 0
                    break
            if sanity_check:
                anns_ori = Annotation.sortAnns(passage['annotations'], TBDFilter=False)
                assert len(anns) == len(anns_ori), str(document) +'\n ' + str(len(anns))+' '+ str(len(anns_ori)) +'\n' + str(anns) + '\n' + str(anns_ori)
                for i in range(len(anns)):
                    ann_my = anns[i]
                    ann_ori = anns_ori[i]
                    assert ann_my['text'] == ann_ori[
                        'text'], f"PMID:{document['id']} My ann: {ann_my['text']} \tOri ann: {ann_ori['text']}"
                    assert ann_my['locations'] == ann_ori['locations']
            passage['annotations'] = anns
            # break

    return documents
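
A minimal usage sketch under the assumption that the prediction file follows the three-column word/score/tag layout the non-debug branch of `_readBIOResult` expects; `gold_documents` and 'predictions.bio' are placeholder names, and `copy` / `Annotation` are assumed to be imported at module level next to the function above.

# Hypothetical round trip: rebuild annotated documents from a BIO prediction file
# and list the recovered Gene mentions with their absolute offsets.
pred_documents = BIO2Documents(gold_documents, 'predictions.bio')
for doc in pred_documents:
    for passage in doc['passages']:
        for ann in passage['annotations']:
            print(doc['id'], ann['text'], ann['locations'][0]['offset'], sep='\t')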