Example #1
import re
from ast import literal_eval

from flashtext import KeywordProcessor
from export_abstracts import read_as_list


def export_entities(dictionary, list_of_files, label, T, f, f2):

    keyword_processor = KeywordProcessor()

    # literal_eval is safer than eval() for reading a dict literal from disk.
    with open(f2 + '/' + dictionary, 'r') as fh:
        dic = literal_eval(fh.read())

    for x in dic.keys():
        keyword_processor.add_keyword(x)

    for file_name in list_of_files:

        try:

            sentence = read_as_list(f + '/' + file_name, encoding='latin-1')

            if sentence and len(sentence) == 1:
                sentence = sentence[0]
                sentence2 = sentence.lower()
                nomes = keyword_processor.extract_keywords(sentence2)

                if nomes:
                    with open(f + '/' + file_name.replace('txt2', 'out'),
                              'a') as file_out:
                        for nome in nomes:
                            try:
                                p = re.compile(r'\b%s\b' % re.escape(nome))
                                for m in p.finditer(sentence2):
                                    file_out.write(
                                        str(T) + str(m.start()) + '\t' + label +
                                        '\t' + str(m.start()) + '\t' +
                                        str(m.end()) + '\t' +
                                        str(sentence[m.start():m.end()]) + '\t' +
                                        '#' + T[1] + str(m.start()) + '\t' +
                                        'AnnotatorNotes_' + T + str(m.start()) +
                                        '\t' + str(dic[nome]) + '\n')
                            except Exception:
                                continue
        except Exception:
            continue
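The regex pass above re-scans the sentence only to recover character offsets.
A minimal alternative sketch, assuming flashtext >= 2.0: extract_keywords()
accepts span_info=True and returns the offsets directly, so the second scan
can be dropped.

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()  # case-insensitive by default
keyword_processor.add_keyword('p53')

# Each hit comes back as a (keyword, start, end) tuple.
for kw, start, end in keyword_processor.extract_keywords(
        'p53 activates p21; p53 also represses mdm2.', span_info=True):
    print(kw, start, end)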
Example #2
                        help="""Predictions file.""")
    parser.add_argument('--i1',
                        type=str,
                        default=None,
                        help="""Data folder.""")
    parser.add_argument('--i2',
                        type=str,
                        default=None,
                        help="""Root folder.""")

    args = parser.parse_args()

    task = args.task
    predictions = args.predictions

    silver_standard = read_as_list(args.i1 + '/ExTRI_confidence',
                                   encoding='latin-1')[2:]

    positive_abstracts, positive = [], []
    for l in silver_standard:
        l_ = l.split('\t')
        try:
            if l_[-1] == 'High':
                positive_abstracts.append(str(l_[0].split(':')[0]))
        except Exception:
            continue
    positive_abstracts = list(set(positive_abstracts))

    if task == 'triage':

        labels = read_as_list(predictions + '.txt', encoding='latin-1')
        pmids = read_as_list(args.i2 + '/triage_test_pmids.txt',
                             encoding='latin-1')  # assumed closing; the snippet is truncated here
Example #3
import nltk
import spacy
import pandas as pd

from export_abstracts import read_as_list, write_list

# find_experimental_methods and preprocess_text are project helpers defined
# elsewhere in this repository; the module-wide encoding is assumed.
encoding = 'latin-1'


def build_data(l_texts, l_ann, type_data, f2, option, out_file):
    ''' Exports data. '''

    # spacy.load('en') is a deprecated shortcut; load the small English model
    # explicitly.
    nlp = spacy.load('en_core_web_sm')

    original, all_sentences, tags = [], [], []
    l_gene1, l_gene2, l_pmids = [], [], []

    def find_s_e(e, ent_id):
        ''' Returns (start, end, surface form) of a brat T-line; the second
        argument only documents which entity is being looked up. '''

        word = e.split('\t')[2]
        b = e.split('\t')[1].split(' ')[1]
        e = e.split('\t')[1].split(' ')[2]

        return int(b), int(e), str(word)

    for i, a in enumerate(l_ann):

        if len(a.split('.')[0].split(':')) == 2:
            sentence_index = a.split('.')[0].split(':')[1]
        else:
            sentence_index = 'None'

        already = []

        try:
            ann = read_as_list(type_data + '/' + a, encoding=encoding)
            txt = read_as_list(type_data + '/' + a.split('.')[0] + '.txt',
                               encoding=encoding)
            txt = ''.join(txt)

            relations = [x for x in ann if x[0] == 'R']

            if type_data == 'train':
                entities = [
                    x for x in ann if x[0] == 'T'
                    and x.split('\t')[1][0:14] != 'AnnotatorNotes'
                ]
            else:
                entities = ann

            n_dbtfs = [
                x for x in entities if x.split('\t')[1].split(' ')[0] == 'DBTF'
            ]

            # If there is at least one DBTF and at least two entities...
            if len(n_dbtfs) > 0 and len(entities) > 1:

                # Build positive sentences!
                if relations:
                    for r in relations:

                        tag = r.split('\t')[1].split(' ')[0]
                        args_ = r.split('\t')[1].split(' ')
                        ent1, ent2 = args_[1].split(':')[1], args_[2].split(':')[1]

                        for e in entities:
                            if e.split('\t')[0] == ent1 and e.split(
                                    '\t')[1][0:14] != 'AnnotatorNotes':
                                b1, e1, word1 = find_s_e(e, ent1)
                            if e.split('\t')[0] == ent2 and e.split(
                                    '\t')[1][0:14] != 'AnnotatorNotes':
                                b2, e2, word2 = find_s_e(e, ent2)

                        invert = False

                        if b1 > b2:
                            invert = True

                        if not invert:
                            out = txt[:b1] + 'gene1' + txt[
                                e1:b2] + 'gene2' + txt[e2:]

                        if invert:
                            out = txt[:b2] + 'gene2' + txt[
                                e2:b1] + 'gene1' + txt[e1:]

                        s_ = nltk.sent_tokenize(out)
                        sentence = []
                        for j, s in enumerate(s_):
                            if "gene1" in s and "gene2" in s:
                                sentence = s
                                idx = j

                        if sentence:
                            sentence = sentence.replace('_', ' ')
                            sentence = sentence.replace('-', ' ')
                            for e in entities:
                                if e.split('\t')[0][0] == 'T' and e.split(
                                        '\t')[1][0:14] != 'AnnotatorNotes':
                                    w = e.split('\t')[2].replace('_', ' ')
                                    w = ' '.join(w.replace('-', ' ').split())
                                    if w:
                                        sentence = sentence.replace(w, 'genex')

                            all_entities = ['DBTF']
                            sentence, all_entities = find_experimental_methods(
                                f2, sentence, all_entities)

                            out1 = preprocess_text(sentence, nlp, all_entities)

                            if 'gene1' in out1 and 'gene2' in out1 and out1 not in all_sentences:
                                all_sentences.append(out1)
                                tags.append(tag.lower())
                                if sentence_index == 'None':
                                    export = ' '.join(
                                        nltk.sent_tokenize(txt)[int(
                                            idx)].replace('_', ' ').replace(
                                                '-', ' ').split())
                                    original_ = out1[0:4] + export
                                    original.append(original_)
                                    ID = a.split('.')[0] + ':' + str(idx)
                                else:
                                    txt = ' '.join(
                                        txt.replace('_',
                                                    ' ').replace('-',
                                                                 ' ').split())
                                    original_ = out1[0:4] + txt
                                    original.append(original_)
                                    ID = a.split('.')[0]
                                word1 = ' '.join(word1.replace('_', ' ').replace('-', ' ').split())
                                word2 = ' '.join(word2.replace('_', ' ').replace('-', ' ').split())
                                l_gene1.append(word1)
                                l_gene2.append(word2)
                                l_pmids.append(ID)
                                already.append(word1)
                                already.append(word2)

                # Build negative sentences!
                non_relations = []
                for e in entities:
                    if e[0] == 'T' and e.split(
                            '\t')[1][0:14] != 'AnnotatorNotes':
                        if ' '.join(
                                e.split('\t')[2].replace('_', ' ').replace(
                                    '-', ' ').split()) not in already:
                            non_relations.append(e)

                DBTF = [
                    x for x in non_relations
                    if x.split('\t')[1].split(' ')[0] == 'DBTF'
                ]
                combinations = [(x, y) for x in DBTF for y in non_relations
                                if y.split('\t')[1].split(' ')[0] == 'DBTF'
                                or y.split('\t')[1].split(' ')[0] == 'NONDBTF']

                if combinations:
                    for d in combinations:
                        # Combine all DBTF with every possible other entity different from it. No self-regulation!
                        b1, e1 = int(d[0].split('\t')[1].split(' ')[1]), int(d[0].split('\t')[1].split(' ')[2])
                        word1 = ' '.join(d[0].split('\t')[2].replace('_', ' ').replace('-', ' ').split())
                        b2, e2 = int(d[1].split('\t')[1].split(' ')[1]), int(d[1].split('\t')[1].split(' ')[2])
                        word2 = ' '.join(d[1].split('\t')[2].replace('_', ' ').replace('-', ' ').split())

                        if b1 != b2 and word1 != word2:

                            invert = False

                            if b1 > b2:
                                invert = True

                            if not invert:
                                out = txt[:b1] + 'gene1' + txt[
                                    e1:b2] + 'gene2' + txt[e2:]
                            if invert:
                                out = txt[:b2] + 'gene2' + txt[
                                    e2:b1] + 'gene1' + txt[e1:]

                            s_ = nltk.sent_tokenize(out)
                            sentence = []
                            for j, s in enumerate(s_):
                                if "gene1" in s and "gene2" in s:
                                    sentence = s
                                    idx = j

                            if sentence:
                                sentence = sentence.replace('_', ' ')
                                sentence = sentence.replace('-', ' ')
                                for e in entities:
                                    if e.split('\t')[0][0] == 'T' and e.split(
                                            '\t')[1][0:14] != 'AnnotatorNotes':
                                        w = e.split('\t')[2].replace('_', ' ')
                                        w = ' '.join(
                                            w.replace('-', ' ').split())
                                        # Guard against empty strings, as in
                                        # the positive branch above.
                                        if w:
                                            sentence = sentence.replace(w, 'genex')

                                all_entities = ['DBTF']
                                sentence, all_entities = find_experimental_methods(
                                    f2, sentence, all_entities)
                                out1 = preprocess_text(sentence, nlp,
                                                       all_entities)

                                if 'gene1' in out1 and 'gene2' in out1 and out1 not in all_sentences:
                                    all_sentences.append(out1)
                                    tags.append('none')
                                    if sentence_index == 'None':
                                        export = ' '.join(
                                            nltk.sent_tokenize(txt)[int(
                                                idx)].replace('_',
                                                              ' ').replace(
                                                                  '-',
                                                                  ' ').split())
                                        original_ = out1[0:4] + export
                                        original.append(original_)
                                        ID = a.split('.')[0] + ':' + str(idx)
                                    else:
                                        txt = ' '.join(
                                            txt.replace('_', ' ').replace(
                                                '-', ' ').split())
                                        original_ = out1[0:4] + txt
                                        original.append(original_)
                                        ID = a.split('.')[0]
                                    l_gene1.append(word1)
                                    l_gene2.append(word2)
                                    l_pmids.append(ID)

        except Exception:
            # Skip articles whose annotation files are missing or malformed.
            continue

    df = pd.DataFrame()
    df['all_sentences'] = all_sentences
    df['tags'] = tags
    df['original'] = original
    df['l_gene1'] = l_gene1
    df['l_gene2'] = l_gene2
    df['l_pmids'] = l_pmids

    if option == 'test':
        df.to_csv(out_file + '/re_test.csv', index=False)

    write_list(tags,
               out_file + '/re_' + option + '_labels.txt',
               iterate=True,
               encoding=encoding)
    write_list(original,
               out_file + '/re_' + option + '_original.txt',
               iterate=True,
               encoding=encoding)
    write_list(all_sentences,
               out_file + '/re_' + option + '_preprocessed.txt',
               iterate=True,
               encoding=encoding)
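These scripts all index into brat standoff lines by hand with nested split()
calls. For reference, a small sketch of the assumed line layout and a helper
that names the fields (the DBTF tag and the offsets are illustrative):

def parse_t_line(line):
    ''' 'T1<tab>DBTF 12 17<tab>GATA1' -> ('T1', 'DBTF', 12, 17, 'GATA1') '''
    ent_id, type_span, word = line.split('\t')
    tag, start, end = type_span.split(' ')
    return ent_id, tag, int(start), int(end), word

print(parse_t_line('T1\tDBTF 12 17\tGATA1'))
# ('T1', 'DBTF', 12, 17, 'GATA1')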
Example #4
# Assumed preamble; the snippet starts inside the main guard, and run_SVM is
# a project helper defined elsewhere in this repository.
import argparse

from export_abstracts import read_as_list

if '__main__' == __name__:

    parser = argparse.ArgumentParser(
        description='Hyper-parameters of the model.')
    parser.add_argument('--train', type=str, help="""Train file.""")
    parser.add_argument('--test', type=str, help="""Test file.""")
    parser.add_argument('--labels', type=str, help="""Labels file.""")
    parser.add_argument('--features',
                        type=str,
                        help="""Features to be used.""")
    parser.add_argument('--report',
                        type=str,
                        help="""If yes, predict unseen data.""")
    parser.add_argument('--dictionary',
                        type=str,
                        help="""Dictionary directory.""")
    parser.add_argument('--o', type=str, help="""Output folder.""")
    args = parser.parse_args()

    train = read_as_list(args.train, encoding='latin-1')
    if args.report == 'yes':
        test = read_as_list(args.test, encoding='latin-1')
    else:
        # Ensures test is always bound, including for --report no.
        test = []
    labels = read_as_list(args.labels, encoding='latin-1')

    f = args.features
    REPORT = args.report
    out_dic = args.o
    dic_dir = args.dictionary

    run_SVM(train, test, labels, f, REPORT, out_dic, dic_dir)
Example #5
from os import listdir

from export_abstracts import read_as_list, write_list

encoding = 'latin-1'  # assumed module default, as in the sibling scripts


def merge(f, f2, f3, f_out):

    # The comprehension variables are renamed so they do not shadow the
    # folder arguments f, f2 and f3.
    l_ntnu = [x for x in listdir(f) if x.endswith('.minfner')]
    l_gnormplus = [x for x in listdir(f2) if x.endswith('.minfner')]
    l_text = [x for x in listdir(f3) if x.endswith('.txt')]

    for text in l_text:

        write_out, already, final_merge = [], [], []

        tx = read_as_list(f3 + '/' + text, encoding=encoding)
        ann = text + '.out.minfner'
        ntnu_boolean, gn_boolean = False, False

        if ann in l_ntnu:
            ntnu = read_as_list(f + '/' + ann, encoding=encoding)
            ntnu = ['N_' + s for s in ntnu]
            ntnu_boolean = True

        ann = text.split('.')[0] + ':0.txt.out.minfner'
        if ann in l_gnormplus:
            gn = read_as_list(f2 + '/' + ann, encoding=encoding)
            gn = ['G_' + s for s in gn]
            gn_boolean = True

        # Merge both tools and keep only entities
        if ntnu_boolean and gn_boolean:
            entities = ntnu + gn
        elif ntnu_boolean and not gn_boolean:
            entities = ntnu
        elif not ntnu_boolean and gn_boolean:
            entities = gn
        else:
            # Neither tool produced annotations for this text.
            continue

        # Keep all N_DBTF
        final_merge += [
            x for x in entities
            if x[0] == 'N' and x.split('\t')[1].split(' ')[0] == 'DBTF'
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']
        # Keep all G_NONDBTF
        final_merge += [
            x for x in entities
            if x[0] == 'G' and x.split('\t')[1].split(' ')[0] == 'NONDBTF' and
            (x.split('\t')[1].split(' ')[1],
             x.split('\t')[1].split(' ')[2]) not in already
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']
        # Keep all N_NONDBTF
        final_merge += [
            x for x in entities
            if x[0] == 'N' and x.split('\t')[1].split(' ')[0] == 'NONDBTF' and
            (x.split('\t')[1].split(' ')[1],
             x.split('\t')[1].split(' ')[2]) not in already
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']
        # Keep all G_DBTF
        final_merge += [
            x for x in entities
            if x[0] == 'G' and x.split('\t')[1].split(' ')[0] == 'DBTF' and (
                x.split('\t')[1].split(' ')[1],
                x.split('\t')[1].split(' ')[2]) not in already
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']

        elements = [x.split('\t')[0][3:] for x in final_merge]
        final_merge += [
            x for x in entities
            if x[0:3] == 'N_#' and x.split('\t')[0][3:] in elements
        ]

        entities = [x[2:] for x in final_merge]

        ann_out = text.split('.')[0] + '.ann'

        if entities:
            for e in entities:
                if e[0] == 'T':
                    e_ = e.split('\t')
                    parts = e_[1].split(' ')
                    entity, word = e_[0], e_[2]
                    tag, start, end = parts[0], parts[1], parts[2]
                    write_out.append(
                        str(entity) + '\t' + tag + ' ' + str(start) + ' ' +
                        str(end) + '\t' + word)
                if e[0] == '#':
                    e_ = e.split('\t')
                    entity, ID = e_[0], ' '.join(
                        e_[1].split(' ')[1:]) + ' ' + e_[2]
                    write_out.append(
                        str(entity) + '\t' + 'AnnotatorNotes T' +
                        str(entity[1:]) + '\t' + str(ID))

        if write_out:
            write_list(write_out,
                       f_out + '/' + ann_out,
                       iterate=True,
                       encoding=encoding)
            write_list(tx, f_out + '/' + text, iterate=True, encoding=encoding)
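The four copy-paste blocks above implement a precedence rule: keep N_DBTF
first, then G_NONDBTF, N_NONDBTF and G_DBTF, each time skipping spans that
were already taken. A compact sketch of the same idea, assuming a made-up
(source, tag, start, end, line) tuple layout instead of the raw prefixed
lines:

def merge_by_precedence(entities):
    ''' First match per (start, end) span wins, in precedence order. '''
    precedence = [('N', 'DBTF'), ('G', 'NONDBTF'),
                  ('N', 'NONDBTF'), ('G', 'DBTF')]
    kept = {}
    for wanted in precedence:
        for source, tag, start, end, line in entities:
            if (source, tag) == wanted and (start, end) not in kept:
                kept[(start, end)] = line
    return list(kept.values())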
Example #6
from collections import Counter

from export_abstracts import read_as_list

# find_experimental_methods and preprocess are project helpers defined
# elsewhere in this repository; the module-wide encoding is assumed.
encoding = 'latin-1'


def build_data(folder, folder2, folder3, pubtator_articles, option):
    ''' Exports data. '''

    out, out_original, labels, list_pmids = [], [], [], []

    for txt in pubtator_articles:

        text_original = read_as_list(folder + '/' + txt, encoding=encoding)[0]
        text = text_original
        PMID = txt.split('.')[0]
        ann = read_as_list(folder + '/' + PMID + '.ann', encoding=encoding)  # Read annotations
        ann = [x for x in ann if x[0] == 'T']

        if option == 'train':

            label = read_as_list(folder2 + '/triage_train_labels.txt', encoding=encoding)  # Read triage labels
            PMIDs = read_as_list(folder2 + '/triage_train_pmids.txt', encoding=encoding)  # Read PMIDs
            label_txt = label[PMIDs.index(PMID)]

        lookback, delete, all_entities = [], [], []

        # Build the list of entity (start, end) spans and mark entities nested
        # inside another entity for deletion ...
        for x in ann:
            lookback.append((x.split('\t')[1].split(' ')[1] + ' ' + x.split('\t')[1].split(' ')[2]))

        for x in lookback:
            copy = lookback.copy()
            copy.remove(x)
            start = x.split(' ')[0]

            for el in copy:
                if int(start) in range(int(el.split(' ')[0]), int(el.split(' ')[1])):
                    delete.append(start)

        ann = [x.split('\t') for x in ann if x.split('\t')[1].split(' ')[1] not in delete]
        ann = [x[1].split(' ') for x in ann]
        ann.sort(key=lambda x: int(x[1]))

        # Replace each entity span with its tag, tracking the cumulative
        # length change c so later offsets stay aligned.
        c = 0
        for a in ann:
            tag, start, end = a[0], int(a[1]) + c, int(a[2]) + c
            all_entities.append(tag)
            c += len(tag) - (end - start)
            text = text[:start] + tag + text[end:]

        # Find experimental methods!
        text, all_entities = find_experimental_methods(folder3, text, all_entities)

        text = preprocess(text)

        count_entities = Counter(all_entities)

        # Features: DBTF and EXPMETHOD (boolean)
        order_entities = ['EXPMETHOD', 'DBTF']
        for entity in order_entities:

            if entity in count_entities:
                count_entities[entity] = 1
            else:
                count_entities[entity] = 0

            text = str(count_entities[entity]) + ' ' + text
            text_original = str(count_entities[entity]) + ' ' + text_original

        out.append(text)
        out_original.append(text_original)

        if option == 'train':
            labels.append(label_txt)
        if option == 'test':
            list_pmids.append(':'.join(txt.split(':')[0:2]))

    return out, out_original, labels, list_pmids
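The offset bookkeeping above is the core trick: every replacement changes the
text length, so the running delta c keeps later spans aligned. A
self-contained sketch with toy data (the spans must be sorted by start, as
ann.sort(...) guarantees above):

text = 'GATA1 activates TAL1.'
spans = [('DBTF', 0, 5), ('NONDBTF', 16, 20)]  # (tag, start, end)

c = 0
for tag, start, end in spans:
    start, end = start + c, end + c
    c += len(tag) - (end - start)
    text = text[:start] + tag + text[end:]

print(text)  # 'DBTF activates NONDBTF.'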
Example #7
'''
Date: 07.10.2019
'''

import argparse
from os import listdir
from export_abstracts import read_as_list, write_list

if '__main__' == __name__:
    ''' Removes wrong genes. '''

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--i1', type=str, help="""Data folder.""")
    parser.add_argument('--i2', type=str, help="""Data folder.""")
    args = parser.parse_args()

    to_avoid = read_as_list(args.i1 + '/to_avoid.txt', encoding='latin-1')
    l_ann = [f for f in listdir(args.i2) if f.endswith('.ann')]

    for a in l_ann:

        ann = read_as_list(args.i2 + '/' + a, encoding='latin-1')
        final = []
        for p in ann:
            if p[0] == 'T':
                if p.split('\t')[2].replace(' ', '') not in to_avoid:
                    final.append(p)

        entities = [x.split('\t')[0][1:] for x in final]
        final += [
            x for x in ann if x[0] == '#' and x.split('\t')[0][1:] in entities
        ]

        # write_list is imported above, so the filtered annotations are
        # assumed to be written back over the original file.
        write_list(final, args.i2 + '/' + a, iterate=True, encoding='latin-1')
Example #8
    return df

# Assumed imports; the snippet starts mid-file, after a truncated function.
import argparse
import pandas as pd

from export_abstracts import read_as_list

if '__main__' == __name__:

    ''' Export triage PMIDs and labels of annotated abstracts. '''
    
    encoding = "latin-1"

    parser = argparse.ArgumentParser(description='Options')
    parser.add_argument('--i', type=str, help="""Input directory.""")
    parser.add_argument('--o', type=str, help="""Output directory.""")
    args = parser.parse_args()

    ann1 = pd.read_csv(args.i + '/abstracts.all.labeled.csv', sep='\n|\t', encoding=encoding, engine='python')
    ann2 = read_as_list(args.i + '/hackaton_1.tsv', encoding=encoding)
    ann2 = [x.split('\t') for x in ann2]
    ann3 = read_as_list(args.i + '/hackaton_2.tsv', encoding=encoding)
    ann3 = [x.split('\t') for x in ann3]

    di = {True: 1, False: 0}

    # Map the boolean labels to integers: True -> 1, False -> 0
    ann1['label'].replace(di, inplace=True)

    pmid_1, l_1 = list(ann1['pmid']), list(ann1['label'])
    pmid_2, l_2 = [x[0] for x in ann2], [x[2] for x in ann2]
    pmid_3, l_3 = [x[0] for x in ann3], [x[2] for x in ann3]

    df = pd.DataFrame()
    df['pmid'] = pmid_1 + pmid_2 + pmid_3
    df['label'] = l_1 + l_2 + l_3
    df = df.astype({'pmid': int, 'label': int})