Example #1
def run_RF(X, test_x, labels_x, f, REPORT, out_dir, dic_dir):

    np.random.seed(500)

    if REPORT == 'yes':
        n_folds = 1
    if REPORT == 'no':
        n_folds = 10

    VALIDATION_SPLIT = 0.2 # Validation %
    l_precision, l_recall, l_f1 = [], [], []

    for n_ in range(n_folds):

        print('Running fold ' + str(n_))

        Encoder = LabelEncoder()
        labels = Encoder.fit_transform(labels_x)

        if f == 'TF-IDF':
            vect = TfidfVectorizer(max_features=1000)
        if f == 'BoW':
            vect = CountVectorizer(max_features=1000)

        vect.fit(X)
        Train_X_Tfidf = vect.transform(X)
        features_train = build_features(dic_dir, X)
        train = hstack((Train_X_Tfidf, features_train))

        if REPORT == 'yes':
            Test_X_Tfidf = vect.transform(test_x)
            features_test = build_features(dic_dir, test_x)
            test = hstack((Test_X_Tfidf, features_test))

        RF = RandomForestClassifier(random_state=0, n_estimators=100)

        if not os.path.exists(out_dir + '/models'):
            os.makedirs(out_dir + '/models')

        if REPORT == 'yes':
            RF.fit(train, labels)
            y_pred = RF.predict(test)
            write_list(y_pred, out_dir + '/models/re_predictions.txt', iterate=True, encoding=encoding)

        if REPORT == 'no':
            train, test, labels, test_labels = train_test_split(train, labels, test_size=VALIDATION_SPLIT, random_state=42)
            RF.fit(train, labels)
            y_pred = RF.predict(test)

            l_precision.append(precision_score(y_true=test_labels, y_pred=y_pred, average='macro'))
            l_recall.append(recall_score(y_true=test_labels, y_pred=y_pred, average='macro'))
            l_f1.append(f1_score(y_true=test_labels, y_pred=y_pred, average='macro'))

    if REPORT == 'no':

        l_results = 're_RF' + '\t' + str(np.mean(l_precision)) + '\t' + str(np.mean(l_recall)) + '\t' + str(np.mean(l_f1))
        print(l_results)
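
A minimal usage sketch for run_RF (the call and file paths are hypothetical, not part of the original module); it assumes read_as_list, build_features and the module-level encoding are available from the same project, and that the inputs follow the re_* file naming used in the other examples:

train_sentences = read_as_list('output/re_train_preprocessed.txt', encoding='latin-1')
train_labels = read_as_list('output/re_train_labels.txt', encoding='latin-1')
test_sentences = read_as_list('output/re_test_preprocessed.txt', encoding='latin-1')

# f is 'TF-IDF' or 'BoW'; REPORT='no' runs 10 evaluation folds, REPORT='yes' trains on everything and writes predictions.
run_RF(train_sentences, test_sentences, train_labels, 'TF-IDF', 'no', 'output', 'dictionaries')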
Example #2
                positive.append(j.split('.')[0])
                if str(j.split('.')[0]) in positive_abstracts:
                    c.append(j)
                else:
                    d.append(j)

        c, d = list(set(c)), list(set(d))
        c_tp, c_fp = len(c), len(d)

        precision = c_tp / (c_tp + c_fp)
        recall = c_tp / (c_tp + (len(positive_abstracts) - c_tp))
        f1 = 2 * recall * precision / (recall + precision)

        print(predictions + ': PRECISION = ' + str(precision) + ', RECALL = ' +
              str(recall) + ', F1-SCORE = ' + str(f1))
        write_list(list(set(positive)), args.i2 + '/triage_pmids.output', True,
                   'latin-1')

    if task == 're':

        labels = read_as_list(predictions + '.txt', encoding='latin-1')

        map_idx = {'0': 'activation', '1': 'none', '2': 'repression', '3': 'undefined'}

        labels = [map_idx[x] for x in labels]
        export, final = [], []
        df = pd.read_csv(args.i2 + '/re_test.csv')
        values = df.values.tolist()

        for l, label in zip(values, labels):
Example #3
def build_data(l_texts, l_ann, type_data, f2, option, out_file):
    ''' Builds positive and negative relation-extraction examples from BRAT .ann/.txt pairs and exports them. '''

    nlp = spacy.load('en')

    original, all_sentences, tags, l_gene1, l_gene2, l_pmids = [], [], [], [], [], []

    def find_s_e(e, tag):
        # Parses a BRAT entity line 'Tn<TAB>TYPE start end<TAB>word' and returns
        # the start offset, end offset and surface word (the second argument is unused).
        word = e.split('\t')[2]
        b = e.split('\t')[1].split(' ')[1]
        e = e.split('\t')[1].split(' ')[2]

        return int(b), int(e), str(word)

    for i, a in enumerate(l_ann):

        if len(a.split('.')[0].split(':')) == 2:
            sentence_index = a.split('.')[0].split(':')[1]
        else:
            sentence_index = 'None'

        already = []

        try:
            ann = read_as_list(type_data + '/' + a, encoding=encoding)
            txt = read_as_list(type_data + '/' + a.split('.')[0] + '.txt',
                               encoding=encoding)
            txt = ''.join(txt)

            relations = [x for x in ann if x[0] == 'R']

            if type_data == 'train':
                entities = [
                    x for x in ann if x[0] == 'T'
                    and x.split('\t')[1][0:14] != 'AnnotatorNotes'
                ]
            else:
                entities = ann

            n_dbtfs = [
                x for x in entities if x.split('\t')[1].split(' ')[0] == 'DBTF'
            ]

            # If there is at least one DBTF and at least two entities...
            if len(n_dbtfs) > 0 and len(entities) > 1:

                # Build positive sentences!
                if relations:
                    for r in relations:

                        tag = r.split('\t')[1].split(' ')[0]
                        ent1, ent2 = r.split('\t')[1].split(' ')[1].split(':')[
                            1], r.split('\t')[1].split(' ')[2].split(':')[1]

                        for e in entities:
                            if e.split('\t')[0] == ent1 and e.split(
                                    '\t')[1][0:14] != 'AnnotatorNotes':
                                b1, e1, word1 = find_s_e(e, ent1)
                            if e.split('\t')[0] == ent2 and e.split(
                                    '\t')[1][0:14] != 'AnnotatorNotes':
                                b2, e2, word2 = find_s_e(e, ent2)

                        invert = False

                        if b1 > b2:
                            invert = True

                        if not invert:
                            out = txt[:b1] + 'gene1' + txt[
                                e1:b2] + 'gene2' + txt[e2:]

                        if invert:
                            out = txt[:b2] + 'gene2' + txt[
                                e2:b1] + 'gene1' + txt[e1:]

                        s_ = nltk.sent_tokenize(out)
                        sentence = []
                        for k, s in enumerate(s_):
                            if "gene1" in s and "gene2" in s:
                                sentence = s
                                idx = k

                        if sentence:
                            sentence = sentence.replace('_', ' ')
                            sentence = sentence.replace('-', ' ')
                            for e in entities:
                                if e.split('\t')[0][0] == 'T' and e.split(
                                        '\t')[1][0:14] != 'AnnotatorNotes':
                                    w = e.split('\t')[2].replace('_', ' ')
                                    w = ' '.join(w.replace('-', ' ').split())
                                    if w:
                                        sentence = sentence.replace(w, 'genex')

                            all_entities = []
                            all_entities.append('DBTF')
                            sentence, all_entities = find_experimental_methods(
                                f2, sentence, all_entities)

                            out1 = preprocess_text(sentence, nlp, all_entities)

                            if 'gene1' in out1 and 'gene2' in out1 and out1 not in all_sentences:
                                all_sentences.append(out1)
                                tags.append(tag.lower())
                                if sentence_index == 'None':
                                    export = ' '.join(
                                        nltk.sent_tokenize(txt)[int(
                                            idx)].replace('_', ' ').replace(
                                                '-', ' ').split())
                                    original_ = out1[0:4] + export
                                    original.append(original_)
                                    ID = a.split('.')[0] + ':' + str(idx)
                                else:
                                    txt = ' '.join(
                                        txt.replace('_',
                                                    ' ').replace('-',
                                                                 ' ').split())
                                    original_ = out1[0:4] + txt
                                    original.append(original_)
                                    ID = a.split('.')[0]
                                word1, word2 = word1.replace(
                                    '_', ' '), word2.replace('_', ' ')
                                word1, word2 = ' '.join(
                                    word1.replace('-', ' ').split()), ' '.join(
                                        word2.replace('-', ' ').split())
                                l_gene1.append(word1)
                                l_gene2.append(word2)
                                l_pmids.append(ID)
                                already.append(word1)
                                already.append(word2)

                # Build negative sentences!
                non_relations = []
                for e in entities:
                    if e[0] == 'T' and e.split(
                            '\t')[1][0:14] != 'AnnotatorNotes':
                        if ' '.join(
                                e.split('\t')[2].replace('_', ' ').replace(
                                    '-', ' ').split()) not in already:
                            non_relations.append(e)

                DBTF = [
                    x for x in non_relations
                    if x.split('\t')[1].split(' ')[0] == 'DBTF'
                ]
                combinations = [(x, y) for x in DBTF for y in non_relations
                                if y.split('\t')[1].split(' ')[0] == 'DBTF'
                                or y.split('\t')[1].split(' ')[0] == 'NONDBTF']

                if combinations:
                    for d in combinations:
                        # Combine all DBTF with every possible other entity different from it. No self-regulation!
                        b1, e1 = int(d[0].split('\t')[1].split(' ')[1]), int(
                            d[0].split('\t')[1].split(' ')[2])
                        word1 = ' '.join(d[0].split('\t')[2].replace(
                            '_', ' ').replace('-', ' ').split())
                        b2, e2 = int(d[1].split('\t')[1].split(' ')[1]), int(
                            d[1].split('\t')[1].split(' ')[2])
                        word2 = ' '.join(d[1].split('\t')[2].replace(
                            '_', ' ').replace('-', ' ').split())

                        now = time.time()

                        if b1 != b2 and word1 != word2:

                            invert = False

                            if b1 > b2:
                                invert = True

                            if not invert:
                                out = txt[:b1] + 'gene1' + txt[
                                    e1:b2] + 'gene2' + txt[e2:]
                            if invert:
                                out = txt[:b2] + 'gene2' + txt[
                                    e2:b1] + 'gene1' + txt[e1:]

                            s_ = nltk.sent_tokenize(out)
                            sentence = []
                            for k, s in enumerate(s_):
                                if "gene1" in s and "gene2" in s:
                                    sentence = s
                                    idx = k

                            if sentence:
                                sentence = sentence.replace('_', ' ')
                                sentence = sentence.replace('-', ' ')
                                for e in entities:
                                    if e.split('\t')[0][0] == 'T' and e.split(
                                            '\t')[1][0:14] != 'AnnotatorNotes':
                                        w = e.split('\t')[2].replace('_', ' ')
                                        w = ' '.join(
                                            w.replace('-', ' ').split())
                                        sentence = sentence.replace(w, 'genex')

                                all_entities = []
                                all_entities.append('DBTF')
                                sentence, all_entities = find_experimental_methods(
                                    f2, sentence, all_entities)
                                out1 = preprocess_text(sentence, nlp,
                                                       all_entities)

                                if 'gene1' in out1 and 'gene2' in out1 and out1 not in all_sentences:
                                    all_sentences.append(out1)
                                    tags.append('none')
                                    if sentence_index == 'None':
                                        export = ' '.join(
                                            nltk.sent_tokenize(txt)[int(
                                                idx)].replace('_',
                                                              ' ').replace(
                                                                  '-',
                                                                  ' ').split())
                                        original_ = out1[0:4] + export
                                        original.append(original_)
                                        ID = a.split('.')[0] + ':' + str(idx)
                                    else:
                                        txt = ' '.join(
                                            txt.replace('_', ' ').replace(
                                                '-', ' ').split())
                                        original_ = out1[0:4] + txt
                                        original.append(original_)
                                        ID = a.split('.')[0]
                                    l_gene1.append(word1)
                                    l_gene2.append(word2)
                                    l_pmids.append(ID)

        except Exception:
            # Skip abstracts whose annotation files are missing or malformed.
            continue

    df = pd.DataFrame()
    df['all_sentences'] = all_sentences
    df['tags'] = tags
    df['original'] = original
    df['l_gene1'] = l_gene1
    df['l_gene2'] = l_gene2
    df['l_pmids'] = l_pmids

    if option == 'test':
        df.to_csv(out_file + '/re_test.csv', index=False)

    write_list(tags,
               out_file + '/re_' + option + '_labels.txt',
               iterate=True,
               encoding=encoding)
    write_list(original,
               out_file + '/re_' + option + '_original.txt',
               iterate=True,
               encoding=encoding)
    write_list(all_sentences,
               out_file + '/re_' + option + '_preprocessed.txt',
               iterate=True,
               encoding=encoding)
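
A sketch of how build_data might be invoked (directory and dictionary paths are hypothetical); as the code above shows, it expects BRAT-style .ann/.txt pairs inside the type_data folder, with listdir imported from os:

ann_files = [x for x in listdir('train') if x.endswith('.ann')]
txt_files = [x for x in listdir('train') if x.endswith('.txt')]

# option controls the re_<option>_*.txt output names; 'test' additionally writes re_test.csv.
build_data(txt_files, ann_files, 'train', 'dictionaries', 'train', 'output')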
Example #4
def merge(f, f2, f3, f_out):

    l_ntnu = [x for x in listdir(f) if x.endswith('.minfner')]
    l_gnormplus = [x for x in listdir(f2) if x.endswith('.minfner')]
    l_text = [x for x in listdir(f3) if x.endswith('.txt')]

    for text in l_text:

        rl, write_out, already, final_merge = [], [], [], []

        tx = read_as_list(f3 + '/' + text, encoding=encoding)
        ann = text + '.out.minfner'
        ntnu_boolean, gn_boolean = False, False

        if ann in l_ntnu:
            ntnu = read_as_list(f + '/' + ann, encoding=encoding)
            ntnu = ['N_' + s for s in ntnu]
            ntnu_boolean = True

        ann = text.split('.')[0] + ':0.txt.out.minfner'
        if ann in l_gnormplus:
            gn = read_as_list(f2 + '/' + ann, encoding=encoding)
            gn = ['G_' + s for s in gn]
            gn_boolean = True

        # Merge both tools and keep only entities
        if ntnu_boolean and gn_boolean:
            entities = ntnu + gn
        elif ntnu_boolean and not gn_boolean:
            entities = ntnu
        elif not ntnu_boolean and gn_boolean:
            entities = gn
        else:
            entities = []

        # Keep all N_DBTF
        final_merge += [
            x for x in entities
            if x[0] == 'N' and x.split('\t')[1].split(' ')[0] == 'DBTF'
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']
        # Keep all G_NONDBTF
        final_merge += [
            x for x in entities
            if x[0] == 'G' and x.split('\t')[1].split(' ')[0] == 'NONDBTF' and
            (x.split('\t')[1].split(' ')[1],
             x.split('\t')[1].split(' ')[2]) not in already
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']
        # Keep all N_NONDBTF
        final_merge += [
            x for x in entities
            if x[0] == 'N' and x.split('\t')[1].split(' ')[0] == 'NONDBTF' and
            (x.split('\t')[1].split(' ')[1],
             x.split('\t')[1].split(' ')[2]) not in already
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']
        # Keep all G_DBTF
        final_merge += [
            x for x in entities
            if x[0] == 'G' and x.split('\t')[1].split(' ')[0] == 'DBTF' and (
                x.split('\t')[1].split(' ')[1],
                x.split('\t')[1].split(' ')[2]) not in already
        ]
        already = [(x.split('\t')[1].split(' ')[1],
                    x.split('\t')[1].split(' ')[2]) for x in final_merge
                   if x[2] == 'T']

        elements = [x.split('\t')[0][3:] for x in final_merge]
        final_merge += [
            x for x in entities
            if x[0:3] == 'N_#' and x.split('\t')[0][3:] in elements
        ]

        entities = [x[2:] for x in final_merge]

        ann_out = text.split('.')[0] + '.ann'

        if entities:
            for e in entities:
                if e[0] == 'T':
                    e_ = e.split('\t')
                    entity, tag, start, end, word = e_[0], e_[1].split(' ')[
                        0], e_[1].split(' ')[1], e_[1].split(' ')[2], e_[2]
                    write_out.append(
                        str(entity) + '\t' + tag + ' ' + str(start) + ' ' +
                        str(end) + '\t' + word)
                if e[0] == '#':
                    e_ = e.split('\t')
                    entity, ID = e_[0], ' '.join(
                        e_[1].split(' ')[1:]) + ' ' + e_[2]
                    write_out.append(
                        str(entity) + '\t' + 'AnnotatorNotes T' +
                        str(entity[1:]) + '\t' + str(ID))

        if write_out:
            write_list(write_out,
                       f_out + '/' + ann_out,
                       iterate=True,
                       encoding=encoding)
            write_list(tx, f_out + '/' + text, iterate=True, encoding=encoding)
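
A hypothetical call to merge; judging from the variable names in the code, f holds the NTNU tagger .minfner output, f2 the GNormPlus .minfner output, f3 the raw .txt abstracts, and f_out receives the merged .ann/.txt pairs:

merge('ntnu_output', 'gnormplus_output', 'abstracts', 'merged_annotations')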
Example #5
def run_RNN(X, test, labels, path_to_glove, REPORT, out_folder):

    ''' Recurrent Neural Networks with Attention enabled '''

    # Hyper-parameters of the model

    MAX_SEQUENCE_LENGTH = 100
    MAX_NB_WORDS = 500
    LSTM_DIM = 100

    # Categorize target labels !

    n_labels = len(set(labels))

    values = array(labels)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    labels = onehot_encoder.fit_transform(integer_encoded)
    print(list(label_encoder.inverse_transform([0, 1, 2, 3])))

    texts, texts_test = [], []

    for idx in X:
        text = BeautifulSoup(idx, 'html.parser')
        texts.append(str(text.get_text().encode()))

    for idx in test:
        text = BeautifulSoup(idx, 'html.parser')
        texts_test.append(str(text.get_text().encode()))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    # The test tokenizer is fit on the training texts so both tokenizers share the same vocabulary;
    # word_index_test is kept but not used below.
    tokenizer_test = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer_test.fit_on_texts(texts)
    sequences_test = tokenizer_test.texts_to_sequences(texts_test)

    word_index = tokenizer.word_index
    word_index_test = tokenizer_test.word_index

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

    # Run Model with Cross-validation

    if REPORT == 'yes':
        n_folds = 1
    if REPORT == 'no':
        n_folds = 10

    VALIDATION_SPLIT = 0.2
    l_precision, l_recall, l_f1, l_val_accuracy = [], [], [], []

    for n_ in range(n_folds):

        print('Running fold ' + str(n_))

        # Shuffle data

        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

        # Split data

        if REPORT == 'yes':
            x_train = data
            y_train = labels
            x_val = data
            y_val = labels

        if REPORT == 'no':
            x_train = data[:-nb_validation_samples]
            y_train = labels[:-nb_validation_samples]
            x_val = data[-nb_validation_samples:]
            y_val = labels[-nb_validation_samples:]

        # Oversampling of minority label in training set

        ada = RandomOverSampler(random_state=42, sampling_strategy='minority')
        X_train_resampled, y_train_resampled = ada.fit_sample(x_train, y_train)

        # Build GloVe dictionaries

        embeddings_index = {}
        f = open(path_to_glove, encoding='utf8')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        # Note: the embedding dimension is set to MAX_SEQUENCE_LENGTH (100), so 100-dimensional GloVe vectors are expected.
        embedding_matrix = np.random.random((len(word_index) + 1, MAX_SEQUENCE_LENGTH))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # Words not found in the GloVe index keep their random initialization.
                embedding_matrix[i] = embedding_vector

        embedding_layer = Embedding(len(word_index) + 1,
                                    MAX_SEQUENCE_LENGTH,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=True)

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)

        model_type = out_folder + "/Bidirectional LSTM with Attention"

        out = Bidirectional(LSTM(LSTM_DIM, return_sequences=True, dropout=0.30, recurrent_dropout=0.30))(embedded_sequences)
        out = Attention(MAX_SEQUENCE_LENGTH)(out)
        out = Dense(LSTM_DIM, activation="relu")(out)

        out = Dropout(0.30)(out)
        out = Dense(n_labels, activation="softmax")(out)
        model = Model(sequence_input, out)

        # Save model architecture

        if not os.path.isfile(model_type + '.png'):

            plot_model(model, to_file=model_type+'.png', show_shapes=True, show_layer_names=True)

        # Model optimizer and metrics

        opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

        model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

        # Model parameters

        if not os.path.exists(out_folder + '/models'):
            os.makedirs(out_folder + '/models')

        early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min')
        model_filepath_weights = out_folder + '/models/triage_' + path_to_glove.split('/')[2].split('.')[0] + '_' + str(n_) + '.h5'
        model_filepath_json = out_folder + '/models/triage_' + path_to_glove.split('/')[2].split('.')[0] + '_' + str(n_) + '.json'

        checkpoint = ModelCheckpoint(model_filepath_weights, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
        callbacks_list = [early_stopping, checkpoint]

        history = model.fit(X_train_resampled, y_train_resampled, validation_data=(x_val, y_val), epochs=100, callbacks=callbacks_list, verbose=0)

        # Predictions

        if REPORT == 'yes':

            y_pred = model.predict(data_test)
            y_pred = y_pred.argmax(axis=-1)
            y_pred = [str(x) for x in y_pred]

            write_list(y_pred, out_folder + '/models/re_predictions.txt', iterate=True, encoding=encoding)

            # serialize model to JSON
            model_json = model.to_json()
            with open(model_filepath_json, "w") as json_file:
                json_file.write(model_json)
            # serialize weights to HDF5
            keras.models.save_model(model, model_filepath_weights)
            print("Saved model to disk")

        if REPORT == 'no':

            y_pred = model.predict(x_val)
            y_pred = y_pred.argmax(axis=-1)
            y_val = y_val.argmax(axis=-1)
            y_pred = [str(x) for x in y_pred]
            y_val = [str(x) for x in y_val]

            # Results

            l_precision.append(precision_score(y_true=y_val, y_pred=y_pred, average='macro'))
            l_recall.append(recall_score(y_true=y_val, y_pred=y_pred, average='macro'))
            l_f1.append(f1_score(y_true=y_val, y_pred=y_pred, average='macro'))
            l_val_accuracy.append(history.history['val_acc'][-1])

    if REPORT == 'no':

        l_results = 're_RNN' + '\t' + str(np.mean(l_precision)) + '\t' + str(np.mean(l_recall)) + '\t' + str(np.mean(l_f1))
        print(l_results)
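
A usage sketch for run_RNN (paths hypothetical). Note that path_to_glove needs at least three '/'-separated components, because the checkpoint file names are built from path_to_glove.split('/')[2], and the GloVe vectors should be 100-dimensional to match the embedding size hard-coded above:

run_RNN(train_sentences, test_sentences, train_labels, 'data/glove/glove.6B.100d.txt', 'no', 'output')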
Example #6
    return out, out_original, labels, list_pmids

if '__main__' == __name__:

    ''' Exports triage data. '''

    encoding = "latin-1"
    nltk.download('punkt')
    nltk.download('stopwords')

    parser = argparse.ArgumentParser(description='Options')
    parser.add_argument('--i1', type=str, help="""Folder with abstracts and annotations.""")
    parser.add_argument('--i2', type=str, help="""Data folder.""")
    parser.add_argument('--i3', type=str, help="""Data folder.""")
    parser.add_argument('--option', type=str, help="""Train or test.""")
    parser.add_argument('--o', type=str, help="""Output folder.""")
    args = parser.parse_args()

    pubtator_articles = [f for f in listdir(args.i1) if f.endswith('.txt')]

    # Replace words by entities!
    data, data_original, labels, list_pmids = build_data(args.i1, args.i2, args.i3, pubtator_articles, args.option)

    if args.option == 'test':
        write_list(list_pmids, args.o + '/triage_test_pmids.txt', iterate=True, encoding=encoding)  

    write_list(data, args.o + '/triage_' + args.option + '_preprocessed.txt', iterate=True, encoding=encoding)  
    write_list(data_original, args.o + '/triage_' + args.option + '_original.txt', iterate=True, encoding=encoding)  
    write_list(labels, args.o + '/triage_' + args.option + '_labels.txt', iterate=True, encoding=encoding)  
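
A hypothetical command line for this script (the file name export_triage.py is assumed, as the snippet does not show it):

python export_triage.py --i1 pubtator_abstracts --i2 data --i3 data --option train --o output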
Example #7
import argparse
from os import listdir
from export_abstracts import read_as_list, write_list

if '__main__' == __name__:
    ''' Removes wrong genes. '''

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--i1', type=str, help="""Data folder.""")
    parser.add_argument('--i2', type=str, help="""Data folder.""")
    args = parser.parse_args()

    to_avoid = read_as_list(args.i1 + '/to_avoid.txt', encoding='latin-1')
    l_ann = [f for f in listdir(args.i2) if f.endswith('.ann')]

    for a in l_ann:

        ann = read_as_list(args.i2 + '/' + a, encoding='latin-1')
        final = []
        for p in ann:
            if p[0] == 'T' and p.split('\t')[2].replace(' ', '') not in to_avoid:
                final.append(p)

        entities = [x.split('\t')[0][1:] for x in final]
        final += [
            x for x in ann if x[0] == '#' and x.split('\t')[0][1:] in entities
        ]

        write_list(list(set(final)), args.i2 + '/' + a, True, 'latin-1')
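
A hypothetical invocation (the script name is assumed); per the code, --i1 must contain to_avoid.txt, and each .ann file under --i2 is rewritten in place, keeping only entities whose text is not in the avoid list plus their AnnotatorNotes lines:

python filter_genes.py --i1 dictionaries --i2 annotations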
Example #8
    encoding = "latin-1"

    parser = argparse.ArgumentParser(description='Options')
    parser.add_argument('--i', type=str, help="""Input directory.""")
    parser.add_argument('--o', type=str, help="""Output directory.""")
    args = parser.parse_args()

    ann1 = pd.read_csv(args.i + '/abstracts.all.labeled.csv', sep='\n|\t', encoding=encoding, engine='python')
    ann2 = read_as_list(args.i + '/hackaton_1.tsv', encoding=encoding)
    ann2 = [x.split('\t') for x in ann2]
    ann3 = read_as_list(args.i + '/hackaton_2.tsv', encoding=encoding)
    ann3 = [x.split('\t') for x in ann3]

    di = {True: 1, False: 0}

    # Boolean labels are mapped to integers: True -> 1, False -> 0
    ann1['label'].replace(di, inplace=True)

    pmid_1, l_1 = list(ann1['pmid']), list(ann1['label'])
    pmid_2, l_2 = [x[0] for x in ann2], [x[2] for x in ann2]
    pmid_3, l_3 = [x[0] for x in ann3], [x[2] for x in ann3]

    df = pd.DataFrame()
    df['pmid'] = pmid_1 + pmid_2 + pmid_3
    df['label'] = l_1 + l_2 + l_3
    df = df.astype({'pmid': int, 'label': int})
    df = remove_duplicates(df, 'pmid')

    list_PMID = list(df['pmid'])
    list_labels = list(df['label'])
    write_list(list_PMID, args.o + '/triage_train_pmids.txt', iterate=True, encoding=encoding)
    write_list(list_labels, args.o + '/triage_train_labels.txt', iterate=True, encoding=encoding)
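
A hypothetical invocation (the script name is assumed); per the code, --i must contain abstracts.all.labeled.csv, hackaton_1.tsv and hackaton_2.tsv, and --o receives triage_train_pmids.txt and triage_train_labels.txt:

python build_triage_labels.py --i data --o output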