# Example no. 1
# 0
def main(paths, params):
    """Train a BiLSTM-CRF NER model on the training split, validating on the develop split.

    Arguments:
        paths {obj} -- Paths; ``training``, ``develop`` and (for the word2vec
            variant) ``pre_trained_embeddings`` are read here, and the object
            is forwarded to the training routine.
        params {obj} -- Parameters; ``use_sent_split``, ``randomize`` and
            ``use_elmo`` are read here, and the object is forwarded to the
            training routine.

    Returns:
        The value returned by ``train_BiLSTM_elmo`` / ``train_BiLSTM``
        (presumably the trained model -- confirm against those functions).
        Earlier versions discarded it and returned ``None``.
    """

    def load_documents(folder):
        # Read the brat folder and optionally break the documents up with
        # split_annotated_documents.
        docs = BratInput(folder).transform()
        if params.use_sent_split:
            docs = split_annotated_documents(docs)
        return docs

    X = load_documents(paths.training)
    X_valid = load_documents(paths.develop)

    if not params.randomize:
        # Fixed seeds so that non-randomized runs are repeatable.
        torch.manual_seed(5)
        random.seed(5)
        np.random.seed(5)

    if params.use_elmo:
        return train_BiLSTM_elmo(X,
                                 params,
                                 paths,
                                 X_valid=X_valid,
                                 model=NER_ELMO())

    # word2vec variant: the same embedding dictionary feeds both the model
    # and the training routine.
    embedding_dict = load_embedding_pkl(paths.pre_trained_embeddings)
    return train_BiLSTM(X,
                        params,
                        paths,
                        X_valid=X_valid,
                        model=NER_word2vec(word_embeddings=embedding_dict),
                        embeddings=embedding_dict)
# Script-level setup: configure paths and parameters, load the test split and
# prepare an ELMo model when no cached test embeddings exist.
# NOTE(review): this snippet is cut off right after `with torch.no_grad():`
# below; the body of that block is not part of this file.
base_path = '/media/druv022/Data2/Final'
paths = Paths(base_folder=base_path)
paths.reset()
# Point the experiment folder at the 'EX_35' sub-folder and name the model files.
paths.experiment_folder = os.path.join(paths.experiment_folder, 'EX_35')
paths.ner_model_name = 'ner_model.pt'
paths.el_model_name = 'el_model.pt'
paths.mt_model_name = 'shared_model.pt'
paths.linear_model_name = 'linear_model.pt'

params = MultiTask_Params()
params.batch_first = False

# Load the test documents and pass them through split_annotated_documents.
X_test = BratInput(paths.test)
X_test = X_test.transform()
X_test = split_annotated_documents(X_test)

x_test_text, ner_test_tags, x_test_tokens = annotated_docs_to_tokens(X_test)

# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Only build the ELMo model when the cached test embeddings are missing.
if not os.path.exists(
        os.path.join(paths.multitask_folder, 'text_test_elmo_split.pkl')):
    # elmo embeddings
    options_file = paths.elmo_options
    weight_file = paths.elmo_weights
    ELMO_folder = paths.elmo_folder
    elmo_dim = params.elmo_dim
    elmo = Elmo(options_file, weight_file, 2, dropout=0)
    elmo.to(device)
    # Gradients are disabled for whatever follows (body truncated in this snippet).
    with torch.no_grad():
 def split(self, method="nltk"):
     """Replace ``self.documents`` with the result of ``split_annotated_documents``.

     ``method`` is forwarded unchanged to ``split_annotated_documents``.
     Returns ``self`` so that calls can be chained.
     """
     split_docs = split_annotated_documents(self.documents, method=method)
     self.documents = split_docs
     return self
def _build_exploration_parser():
    """Create the command-line parser for the data exploration script."""
    parser = argparse.ArgumentParser(description='data_exploration.py')
    parser.add_argument('--path',
                        dest='path',
                        required=True,
                        type=str,
                        help='folder containing the brat documents')
    parser.add_argument('--label_title',
                        dest='label_title',
                        default=None,
                        type=str,
                        help='graph title')
    parser.add_argument('--file_name', dest='file_name', type=str)
    parser.add_argument('--top10_title',
                        dest='top10_title',
                        default=None,
                        type=str,
                        help='barplot title for the top10 instances of a label')
    parser.add_argument('--top10_filename',
                        dest='top10_filename',
                        default=None,
                        type=str)
    parser.add_argument('--label', dest='label', default=None, type=str)
    return parser


def main(argv=None):
    """Explore a brat-annotated corpus and save three plots.

    Produces:
        * ``<file_name>.png`` -- horizontal barplot of label frequencies,
        * ``<top10_filename>.eps`` -- the ten most frequent (lower-cased)
          annotation texts carrying ``--label``,
        * ``text_length_distribution.png`` -- number of split documents per
          word-count bucket,
    and prints mean / median / variance / standard deviation / min / max of
    the annotation text lengths.

    Arguments:
        argv {list} -- Command-line arguments; ``None`` (the default) makes
            argparse read ``sys.argv``.
    """
    args = _build_exploration_parser().parse_args(argv)

    # Import data
    ann_docs = BratInput(args.path).transform()
    # Split the documents into sentences
    sent_docs = split_annotated_documents(ann_docs)

    # One row per annotation, gathered in a single pass over the corpus.
    rows = [(ann.label, ann.text, ann.offset[0], ann.offset[1])
            for doc in ann_docs
            for ann in doc.annotations]
    df = pd.DataFrame(rows, columns=['label', 'text', 'offset0', 'offset1'])

    # Label frequencies, most common first
    freq_list = collections.Counter(row[0] for row in rows).most_common()
    label_names = [name for name, _ in freq_list]
    label_counts = [count for _, count in freq_list]
    positions = list(range(len(freq_list)))

    # Styling has to be in place before the figures are created; setting it
    # after drawing (as was done previously) left the first figure unaffected.
    sns.set_context("talk")
    mpl.rcParams['grid.color'] = 'k'
    mpl.rcParams['grid.linestyle'] = ':'
    mpl.rcParams.update({'figure.autolayout': True})

    # Plot the label frequency
    fig, ax = plt.subplots()
    plt.barh(positions, label_counts, color="darkmagenta", zorder=3)
    plt.yticks(positions, label_names)
    plt.grid(axis='x')
    plt.xlabel('Number of occurrences')
    plt.title(f'{args.label_title}')
    plt.tight_layout()
    plt.savefig(f'{args.file_name}.png', format='png', dpi=300)
    plt.close(fig)

    # Calculate some statistics
    # Calculate the text length and assign it to the dataframe as a new column
    df['text_length'] = df['text'].apply(len)
    # Lowercase the "text" column
    df['text'] = df['text'].str.lower()

    # Calculate the mean and median of text_length
    mean = df['text_length'].mean()
    median = df['text_length'].median()
    var = df['text_length'].var()
    std = df['text_length'].std()

    print(f"\033[1;32;35mThe mean  of the text_length is:  {mean}"
          '\n'
          f"The median of the text_length is:  {median}"
          '\n'
          f"The variance of the text_length is: {var}"
          '\n'
          f"The standard deviation of the text_length is:  {std}")

    # Find the min and max text length
    mini = df['text_length'].min()
    maxi = df['text_length'].max()
    print(
        f"\033[1;32;35mThe minimum text_length is: {mini} \n The maximum text_length is: {maxi}"
    )

    ####################
    # Create a Top 10 (label) barplot

    # Keep only the annotations carrying the requested label
    df_label = df.loc[df['label'] == args.label]
    df_freq_text = df_label[['label', 'text']].groupby(
        ['text'])['label'].size().nlargest(10).reset_index(name='top10')
    objects = df_freq_text['text'].values
    y_pos = np.arange(len(objects))
    performance = df_freq_text['top10'].values

    fig, ax = plt.subplots(figsize=(10, 6))
    plt.barh(y_pos, performance, align='center', alpha=0.8, color='orange')
    plt.yticks(y_pos, objects, size=10, ha='right')
    plt.xlabel('Frequency', size=18)
    plt.title(f'{args.top10_title}', size=18)
    plt.tight_layout()
    plt.savefig(f'{args.top10_filename}.eps', format='eps', dpi=300)
    plt.close(fig)

    #####################
    # Create text length distribution
    # the range could be changed accordingly
    word_count_dict = {range(i, i + 6): 0 for i in range(1, 67, 6)}
    for ann_doc in sent_docs:
        word_count = len(ann_doc.plain_text_.split(' '))
        for bucket in word_count_dict:
            if word_count in bucket:
                word_count_dict[bucket] += 1
                break  # buckets do not overlap

    # make a barplot for text length distribution
    # Bucket labels look like "(1, 7)": start inclusive, stop exclusive.
    x = [f'({bucket.start}, {bucket.stop})' for bucket in word_count_dict]
    height = [float(v) for v in word_count_dict.values()]
    # A fresh figure is required here; without it the bars were drawn on top
    # of the top-10 plot above.
    fig, ax = plt.subplots()
    plt.bar(x=x, height=height, color='darkmagenta', zorder=3)
    plt.grid(axis='y')
    plt.xlabel('Word Count', fontsize=28)
    plt.ylabel('Sentence Count', fontsize=28)
    plt.title("Text Length Distribution", fontsize=32)
    plt.savefig('text_length_distribution.png', format='png')
    plt.close(fig)
# Extra keyword arguments passed to ``fit`` for each supported model name.
_TRADITIONAL_MODEL_FIT_KWARGS = {
    'ExactMatchDictionaryNER': {},
    'BidirectionalLSTM': {},
    'CRF': {'max_iterations': 100},
}


def _build_traditional_models_parser():
    """Create the command-line parser for the traditional models script."""
    parser = argparse.ArgumentParser(description='traditional_models.py')
    parser.add_argument('--path',
                        dest='path',
                        required=True,
                        type=str,
                        help='folder containing the brat documents')
    parser.add_argument('--label', dest='label', default=None, type=str)
    parser.add_argument('--output_dir',
                        dest='output_dir',
                        default=None,
                        type=str,
                        help='folder in which the cross-validation splits '
                        'are saved (skipped when omitted)')
    parser.add_argument('--model',
                        dest='model',
                        required=True,
                        choices=sorted(_TRADITIONAL_MODEL_FIT_KWARGS))
    parser.add_argument('--tokenlevel_file_name',
                        dest='tokenlevel_file_name',
                        default=None,
                        type=str)
    parser.add_argument('--entitylevel_file_name',
                        dest='entitylevel_file_name',
                        default=None,
                        type=str)
    return parser


def main(argv=None):
    """Run 5-fold cross-validation of one NER model and save the evaluation reports.

    The documents under ``--path`` are reduced to ``--label`` annotations,
    cleaned, de-overlapped and split; split documents with fewer than 130
    tokens are kept. For every fold a fresh model is fitted on the other
    folds and evaluated on the held-out fold at entity level (precision,
    recall, F1) and at token level (B/I tags).

    Output files:
        * ``<tokenlevel_file_name>.csv`` -- per-fold token-level reports,
        * ``<entitylevel_file_name>.csv`` -- per-fold entity-level scores,
        * ``<output_dir>/fold_<k>/{train,test}`` -- the brat splits, when
          ``--output_dir`` is given.

    Arguments:
        argv {list} -- Command-line arguments; ``None`` (the default) makes
            argparse read ``sys.argv``.

    Raises:
        ValueError -- the class named by ``--model`` is not available in
            this module.
    """
    import os

    args = _build_traditional_models_parser().parse_args(argv)
    label = args.label

    # NOTE(review): the model classes are expected to be imported at module
    # level under the names accepted by --model; they are resolved by name so
    # that a missing import fails here with a clear message.
    model_class = globals().get(args.model)
    if model_class is None:
        raise ValueError(
            f'model class {args.model!r} is not available in this module')
    fit_kwargs = _TRADITIONAL_MODEL_FIT_KWARGS[args.model]

    # Import data
    ann_docs = BratInput(args.path).transform()
    data = retain_annotations(ann_docs, label)
    clean_data = clean_annotated_documents(data)
    non_overlap_data = resolve_overlaps(clean_data)

    # Split all documents collection into sentences
    sent_docs = split_annotated_documents(non_overlap_data)

    # Select sentences with less than 130 words
    short_sentences = [
        doc for doc in sent_docs
        if len(text_to_tokens(doc.plain_text_)) < 130
    ]

    ### Models ###

    # Cross-validation folds: each fold is the test set once, the remaining
    # folds concatenated form the matching training set.
    splits = CVSplit(strategy="random",
                     n_folds=5).make_cv_folds(short_sentences)
    folds = []
    for idx, test in enumerate(splits):
        train = [
            doc for fold in splits[:idx] + splits[idx + 1:] for doc in fold
        ]
        folds.append((train, test))

    ### Save the different splits
    # Every subset gets its own folder; writing all of them to one folder
    # (as before) mixes the subsets together.
    if args.output_dir is not None:
        for fold_no, (train, test) in enumerate(folds, start=1):
            for subset_name, docs in (('train', train), ('test', test)):
                target = os.path.join(args.output_dir, f'fold_{fold_no}',
                                      subset_name)
                os.makedirs(target, exist_ok=True)
                BratOutput(target).transform(docs)

    np.random.seed(0)

    def pandas_classification_report(y_true, y_pred):
        """Return precision/recall/F1/support per class plus a weighted 'avg / total' row."""
        metrics_summary = precision_recall_fscore_support(y_true=y_true,
                                                          y_pred=y_pred)

        avg = list(
            precision_recall_fscore_support(y_true=y_true,
                                            y_pred=y_pred,
                                            average='weighted'))

        metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
        class_report_df = pd.DataFrame(list(metrics_summary),
                                       index=metrics_sum_index)

        # The weighted call reports no support; use the total instead.
        support = class_report_df.loc['support']
        avg[-1] = support.sum()

        class_report_df['avg / total'] = avg

        return class_report_df.T

    ### RUN the models ###
    entity_level_results = []
    token_level_reports = []
    classes = [f'B_{label}', f'I_{label}']

    for train, test in folds:
        # Train a fresh model per fold so nothing leaks between folds.
        model = model_class(entity_labels=label)
        model.fit(train, **fit_kwargs)
        pred_docs = model.transform(test)

        # Evaluate and store (entity-level evaluation)
        p, r, f = annotation_precision_recall_f1score(pred_docs,
                                                      test,
                                                      ann_label=label)
        print(p, r, f)
        entity_level_results.append([p, r, f])

        # Convert to BIO tags; only the tag sequences are needed here.
        _, y_test = transform_annotated_documents_to_bio_format(
            test, entity_labels=label)
        _, y_pred = transform_annotated_documents_to_bio_format(
            pred_docs, entity_labels=label)

        # Trim every predicted sequence to the length of its gold sequence
        label_pred = [
            pred[:len(gold)] for pred, gold in zip(y_pred, y_test)
        ]

        # Flat the nested lists
        flat_y_test = [item for sublist in y_test for item in sublist]
        flat_y_pred = [item for sublist in label_pred for item in sublist]

        # Print separate for B and I (token-level evaluation). `labels` is
        # passed explicitly so the report is limited to these two tags even
        # when other tags occur in the data.
        print(
            classification_report(flat_y_test,
                                  flat_y_pred,
                                  labels=classes,
                                  target_names=classes,
                                  digits=4))
        token_level_reports.append(
            pandas_classification_report(y_true=flat_y_test,
                                         y_pred=flat_y_pred))

    # Save token-level evaluation report in a csv file
    # (pd.concat replaces DataFrame.append, which pandas 2.0 removed)
    if token_level_reports:
        token_level_df = pd.concat(token_level_reports)
    else:
        token_level_df = pd.DataFrame()
    token_level_df.to_csv(f'{args.tokenlevel_file_name}.csv', sep=',')

    # Save entity-level evaluation report in a csv file
    df = pd.DataFrame(entity_level_results,
                      columns=["Precision", "Recall", "F1 measure"])
    df.to_csv(f'{args.entitylevel_file_name}.csv')
# Example no. 6
# 0
def main(paths, params):
    """Prepare the corpus and the MeSH resources, then start GCN training.

    Arguments:
        paths {obj} -- Paths; ``training``, ``develop``, ``MeSH_file``,
            ``disease_file`` and ``MeSH_graph_disease`` are read here, and
            the object is forwarded to ``train_gcn``.
        params {obj} -- Parameters; ``randomize`` is read here, and the
            object is forwarded to ``train_gcn``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    X = split_annotated_documents(BratInput(paths.training).transform())
    X_valid = split_annotated_documents(BratInput(paths.develop).transform())

    if not params.randomize:
        # Fixed seeds so that non-randomized runs are repeatable.
        torch.manual_seed(5)
        random.seed(5)
        np.random.seed(5)

    # Obtain MeSH information
    # read disease file
    with open(paths.disease_file, 'r') as f:
        disease_data = f.readlines()

    mesh_dict = read_mesh_file(paths.MeSH_file)

    # NOTE(review): nx.read_gpickle is not available in networkx >= 3.0 --
    # confirm the pinned networkx version.
    mesh_graph = nx.read_gpickle(paths.MeSH_graph_disease).to_undirected()

    # Construct usable data format
    scope_text, id2idx_dict, idx2id_dict = mesh_dict_to_tokens(
        mesh_dict, disease_data)

    # The gold BIO tags take the place of NER predictions, so the annotated
    # documents handed to train_gcn are the input documents themselves.
    _, predictions_tr = transform_annotated_documents_to_bio_format(X)
    _, predictions_v = transform_annotated_documents_to_bio_format(X_valid)

    # training
    train_gcn(paths, params, X, X, predictions_tr, X_valid, X_valid,
              predictions_v, scope_text, id2idx_dict, idx2id_dict, mesh_graph,
              device=device)


# if __name__ == "__main__":
#     # Obtain the training, validation and test dataset
#     path_to_train_input = r'/media/druv022/Data1/Masters/Thesis/Data/Converted_train_2'
#     path_to_valid_input = r'/media/druv022/Data1/Masters/Thesis/Data/Converted_develop'
#     path_to_test= r'/media/druv022/Data1/Masters/Thesis/Data/Converted_test'
#     path_to_embeddings = r'/media/druv022/Data1/Masters/Thesis/Data/Embeddings'
#     ctd_file = r'/media/druv022/Data1/Masters/Thesis/Data/CTD/CTD_diseases.csv'
#     c2m_file = r'/media/druv022/Data1/Masters/Thesis/Data/C2M/C2M_mesh.txt'

#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     writer = SummaryWriter()

#     X = BratInput(path_to_train_input)
#     X = X.transform()
#     # X = split_annotated_documents(X)

#     X_valid = BratInput(path_to_valid_input)
#     X_valid = X_valid.transform()
#     # X_valid = split_annotated_documents(X_valid)

#     X_test = BratInput(path_to_test)
#     X_test = X_test.transform()

#     torch.manual_seed(5)
#     random.seed(5)
#     np.random.seed(5)

#     entity_names = ['B_Disease','I_Disease']
#     embeddings =  load_embedding_pkl(path_to_embeddings)

#     # Obtain MeSH information
#     mesh_file = r'/media/druv022/Data1/Masters/Thesis/Data/MeSH/ASCIImeshd2019.bin'
#     disease_file= r'/media/druv022/Data1/Masters/Thesis/Data/MeSH/disease_list'
#     mesh_graph_file = r'/media/druv022/Data1/Masters/Thesis/Data/MeSH/mesh_graph_disease'
#     mesh_folder = r'/media/druv022/Data1/Masters/Thesis/Data/MeSH'

#     # read disease file
#     with open(disease_file,'r') as f:
#         data = f.readlines()

#     mesh_dict = read_mesh_file(mesh_file)

#     mesh_graph = nx.read_gpickle(mesh_graph_file)
#     mesh_graph = mesh_graph.to_undirected()

#     # Construct usable data format
#     x_text = annotated_docs_to_tokens(X)
#     scope_text, id2idx_dict, idx2id_dict = mesh_dict_to_tokens(mesh_dict, data)

#     node_list = list(idx2id_dict.values())

#     _, predictions_tr = transform_annotated_documents_to_bio_format(X)
#     annotated_docs_tr = X
#     _, predictions_v = transform_annotated_documents_to_bio_format(X_valid)
#     annotated_docs_v = X_valid

#     # annotated_docs_tr, predictions_tr = get_NER_prediction(X)
#     # annotated_docs_v, predictions_v = get_NER_prediction(X_valid)

#     # training
#     epochs=500
#     batch_size=32
#     n_samples = 4

#     train_gcn()
def _load_or_build_pickle(cache_path, build):
    """Return the object pickled at ``cache_path``; on a miss call ``build()``, pickle the result and return it."""
    if os.path.exists(cache_path):
        # NOTE: pickle.load can execute arbitrary code -- only point this at
        # cache files produced by this project.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    value = build()
    with open(cache_path, 'wb') as f:
        pickle.dump(value, f)
    return value


def main(paths, params):
    """Prepare data, ELMo embeddings and MeSH resources, then start multi-task training.

    Expensive intermediate results (adjacency matrix, ELMo representations of
    the train/validation/test text, MeSH scope embedding) are cached as
    pickle files and reused on later runs.

    Arguments:
        paths {obj} -- Paths; the dataset, CTD/C2M, MeSH, ELMo, multitask and
            dump locations are read here, and the object is forwarded to
            ``train``.
        params {obj} -- Parameters; ``randomize`` and ``elmo_dim`` are read
            here, and the object is forwarded to ``train``.
    """
    toD_mesh = Convert2D(paths.ctd_file, paths.c2m_file)

    sentence_pad = False  # Don't pad sentence with begin and end sentence '<s>' and '<\s>

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()

    X = split_annotated_documents(BratInput(paths.training).transform())
    X_valid = split_annotated_documents(BratInput(paths.develop).transform())
    X_test = split_annotated_documents(BratInput(paths.test).transform())

    # Seed only when randomization is switched off. The condition used to be
    # inverted here, unlike the other entry points in this file.
    if not params.randomize:
        torch.manual_seed(5)
        random.seed(5)
        np.random.seed(5)

    # Obtain MeSH information
    mesh_folder = paths.MeSH_folder
    mt_folder = paths.multitask_folder

    # read disease file
    with open(paths.disease_file, 'r') as f:
        disease_data = f.readlines()

    mesh_dict = read_mesh_file(paths.MeSH_file)

    # NOTE(review): nx.read_gpickle is not available in networkx >= 3.0 --
    # confirm the pinned networkx version.
    mesh_graph = nx.read_gpickle(paths.MeSH_graph_disease).to_undirected()
    scope_text, id2idx_dict, idx2id_dict = mesh_dict_to_tokens(
        mesh_dict, disease_data)
    node_list = list(idx2id_dict.values())

    # A_HAT matrix for GCN, stored as a scipy COO matrix. The cache write
    # used to dump an undefined name, so a first run crashed here.
    a_matrix = _load_or_build_pickle(
        os.path.join(mesh_folder, 'a_hat_matrix'),
        lambda: sparse.coo_matrix(get_adjacancy_matrix(mesh_graph, node_list)))

    # Stack row/col into one (2, nnz) array before handing it to torch;
    # building a tensor from a list of ndarrays is slow.
    indices = torch.tensor(np.vstack((a_matrix.row, a_matrix.col)),
                           dtype=torch.long,
                           device=device)
    values = torch.tensor(a_matrix.data, dtype=torch.float32, device=device)
    n_nodes = len(node_list)
    a_hat = torch.sparse_coo_tensor(indices,
                                    values,
                                    (n_nodes, n_nodes),
                                    device=device)

    # Construct usable data format
    x_tr_text, ner_tr_tags, _ = annotated_docs_to_tokens(
        X, sentence_pad=sentence_pad)
    x_val_text, ner_val_tags, x_val_tokens = annotated_docs_to_tokens(
        X_valid, sentence_pad=sentence_pad)
    x_test_text, _, _ = annotated_docs_to_tokens(X_test,
                                                 sentence_pad=sentence_pad)

    # elmo embeddings
    elmo_dim = params.elmo_dim
    elmo = Elmo(paths.elmo_options, paths.elmo_weights, 2, dropout=0)
    elmo.to(device)

    def elmo_builder(text):
        # Deferred computation, only executed on a cache miss.
        return lambda: get_elmo_representation(
            text, elmo, elmo_dim=elmo_dim, device=device)

    with torch.no_grad():
        text_tr = _load_or_build_pickle(
            os.path.join(mt_folder, 'text_tr_elmo_split.pkl'),
            elmo_builder(x_tr_text))
        text_val = _load_or_build_pickle(
            os.path.join(mt_folder, 'text_val_elmo_split.pkl'),
            elmo_builder(x_val_text))
        # The test embeddings are not used for training; this call only
        # makes sure the cache file exists.
        _load_or_build_pickle(
            os.path.join(mt_folder, 'text_test_elmo_split.pkl'),
            elmo_builder(x_test_text))

    # NER label vocab
    ner_labels_vocab = Vocabulary(lower=False)
    ner_labels_vocab.add_documents(ner_tr_tags)
    ner_labels_vocab.build()

    # mesh scope embedding
    def build_scope_embedding():
        scope_embedding, _ = get_scope_elmo(elmo,
                                            paths.elmo_folder,
                                            scope_text,
                                            elmo_dim,
                                            idx2id_dict,
                                            id2idx_dict,
                                            device=device)
        return scope_embedding

    scope_embedding = _load_or_build_pickle(
        os.path.join(paths.dump_folder, 'scope_emb.pkl'),
        build_scope_embedding)

    train_el_set = EL_set(X, toD_mesh, id2idx_dict)
    val_el_set = EL_set(X_valid, toD_mesh, id2idx_dict)

    train(paths, params, X, text_tr, ner_tr_tags, train_el_set, X_valid,
          x_val_tokens, text_val, ner_val_tags, val_el_set, ner_labels_vocab,
          scope_text, scope_embedding, a_hat, mesh_graph, id2idx_dict,
          idx2id_dict, writer, device=device)