def create_df_all_sentences():
    """create the df with pos tags given by each libraries, for each sentence"""
    # Load the manually corrected ground-truth sentences.
    input_path = os.path.join(THIS_FOLDER, 'source/utils/sentences_to_GT_POS_corrected.csv')
    output_path = os.path.join(THIS_FOLDER, 'source/utils/sentences_to_GT_POS_libraries.csv')
    df_pos = pd.read_csv(input_path)
    # For each sentence, store every library's universal-tag output as a
    # stringified list in its own '<lib>_pos' column.
    for row_idx, sentence in enumerate(df_pos['sentence'].tolist()):
        for package in LIST_PACKAGES:
            tagged = map_results_to_universal_tags(_pos_tag_sentence(package, sentence), package)
            df_pos.loc[row_idx, package + '_pos'] = str(tagged)
    df_pos.to_csv(output_path)
def test_nb_votes(documents: list):
    """we check that we have the right number of votes for a given token
    """
    expected_votes = len(LIST_PACKAGES)
    for doc in documents:
        # One (token, tag) sequence per library, all over the same sentence.
        per_lib_results = [
            map_results_to_universal_tags(_pos_tag_sentence(lib, doc), lib)
            for lib in LIST_PACKAGES
        ]
        # Walk the libraries' outputs in lockstep, one token position at a time.
        for token_votes in zip(*per_lib_results):
            tags = [tag for _, tag in token_votes]
            distinct_tokens = {token for token, _ in token_votes}
            # Every library must cast exactly one vote per token...
            assert len(tags) == expected_votes
            # ...and all libraries must agree on which token is being tagged.
            assert len(distinct_tokens) == 1
def test_wether_majority_token_equals_gt(documents: list):
    """we check whether the comparison between majority and GT is correct

    For each token position, the helper's True/False verdict must match a
    direct comparison of the majority-voted tag against the ground truth
    (the last library in LIST_PACKAGES is the GT/'article' source).
    """
    for doc in documents:
        iterables = [
            map_results_to_universal_tags(_pos_tag_sentence(lib, doc), lib)
            for lib in LIST_PACKAGES
        ]
        for list_token_tags in zip(*iterables):
            majority_token = return_majority_token(list_token_tags)
            # Renamed from `bool`: never shadow the builtin `bool`.
            matches_gt = return_wether_majority_token_equals_gt(list_token_tags)
            assert matches_gt in [True, False]
            assert matches_gt == (majority_token == list_token_tags[-1])
# Example #4 (extraction artifact — commented out so the file parses)
def test_each_token_has_a_mapped_correct_tag(documents: list):
    """
    Check that each mapped tag is a valid value

    Every tag produced by every library, after mapping to the universal
    tagset, must appear in one of the tag-map value sets.
    """
    mappings = _read_tag_map()
    # Use a set union: O(1) membership tests instead of scanning a
    # concatenated list for every tag.
    valid_tags = (set(mappings['UNIV'].values())
                  | set(mappings['PTB-UNIV'].values())
                  | set(mappings['ARTICLE-UNIV'].values()))
    for doc in documents:
        for lib in LIST_PACKAGES:
            tagged = map_results_to_universal_tags(
                _pos_tag_sentence(lib, doc), lib)
            # Assert the truthy value directly; `== True` was redundant.
            assert all(tag in valid_tags for _, tag in tagged)
def test_token_majority(documents: list):
    """we check that we have the token with the majority votes is indeed the one having most votes, and we check how many votes he gets
    """
    for doc in documents:
        per_lib_results = [
            map_results_to_universal_tags(_pos_tag_sentence(lib, doc), lib)
            for lib in LIST_PACKAGES
        ]
        # Inspect each token position across all libraries at once.
        for token_votes in zip(*per_lib_results):
            top_count = return_number_votes_majority_token(token_votes)
            # The winner's vote count is bounded by the number of voters
            # and is at least one.
            assert 1 <= top_count <= len(token_votes)
            # It must equal the true maximum multiplicity among the votes.
            vote_counts = [token_votes.count(entry) for entry in token_votes]
            assert top_count == max(vote_counts)
def test_unique_tokens_voted(documents: list):
    """we check that the number of unique tokens voted is the right number
    """
    for doc in documents:
        per_lib_results = [
            map_results_to_universal_tags(_pos_tag_sentence(lib, doc), lib)
            for lib in LIST_PACKAGES
        ]
        for token_votes in zip(*per_lib_results):
            distinct_count = return_unique_tokens(token_votes)
            top_count = return_number_votes_majority_token(token_votes)
            # There can never be more distinct candidates than voters,
            # and there is always at least one candidate.
            assert 1 <= distinct_count <= len(token_votes)
            # Everything outside the majority bloc contributes at most one
            # extra distinct candidate each, plus the majority itself.
            assert distinct_count <= len(token_votes) - top_count + 1
def _tags_for(lib, sentence):
    """Universal-tag sequence produced by library *lib* for *sentence*."""
    return [
        tag for _, tag in map_results_to_universal_tags(
            _pos_tag_sentence(lib, sentence), lib)
    ]


def _keep_length_matches(df, col):
    """Drop rows where *col*'s token count differs from the GT token count.

    Sentences whose token counts diverged (things removed during manual
    review) cannot be compared token-by-token, so they are excluded.
    """
    df['same'] = df[['GT', col]].apply(
        lambda x: 1 if len(x[0]) == len(x[1]) else 0, axis=1)
    return df[df.same == 1]


def _flatten_disagreements(df, col):
    """Flat list of *col*'s tags at token positions where libraries disagreed."""
    return [
        tag for row in df[[col, 'agree']].apply(
            lambda x: [t for t, agree in zip(x[0], x[1]) if agree == 0],
            axis=1).tolist() for tag in row
    ]


def _plot_and_report(name, gt_tags, pred_tags, labels):
    """Show *name*'s confusion-matrix heatmap and print its classification report."""
    import pandas as pd

    cm = confusion_matrix(gt_tags, pred_tags, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True, fmt='g')
    # np.sum(cm) == number of disagreement tokens scored for this library.
    plt.title(name + ' confusion matrix_' + str(np.sum(cm)))
    plt.xlabel("predicted")
    plt.ylabel("actual")
    plt.show()
    print(name.upper() + ":")
    print(classification_report(gt_tags, pred_tags, labels=labels))


def evaluation(file):
    """Evaluate nltk/spacy/stanza POS tags against the ground truth in *file*.

    Builds 'test_set_pos_tagging.csv' (GT + per-library universal tags,
    restricted to sentences whose token counts match the GT), then — on the
    manually corrected slice [500:] — plots a confusion matrix and prints a
    classification report per library, using only the token positions where
    the three libraries disagree.
    """
    import pandas as pd

    df = pd.read_csv(file)
    # Ground truth comes from the 'article' tagger.
    df['GT'] = df['sentence'].apply(lambda x: _tags_for('article', x))
    print(df['GT'])

    # nltk
    df['nltk'] = df['sentence'].apply(lambda x: _tags_for('nltk', x))
    print('nltk')
    # we remove sentences when things could be removed when manually reviewing
    df = _keep_length_matches(df, 'nltk')

    # stanza
    df['stanza'] = df['sentence'].apply(lambda x: _tags_for('stanza', x))
    print('stanza')
    df = _keep_length_matches(df, 'stanza')

    # spacy
    df['spacy'] = df['sentence'].apply(lambda x: _tags_for('spacy', x))
    print('spacy')
    df = _keep_length_matches(df, 'spacy')

    df.to_csv('test_set_pos_tagging.csv')

    # Take only Basel's corrected pos
    df = pd.read_csv('test_set_pos_tagging.csv')[500:]

    # Columns were serialized as stringified lists on the CSV round-trip.
    for col in ('nltk', 'spacy', 'stanza', 'GT'):
        df[col] = df[col].apply(ast.literal_eval)

    # we check whenever the 3 libraries agree to get the confusion matrices +
    # classification reports only for the tokens where there are disagreements
    df['agree'] = df[['nltk', 'spacy', 'stanza']].apply(
        lambda x:
        [1 if nl == sp == st else 0 for nl, sp, st in zip(x[0], x[1], x[2])],
        axis=1)

    flat_list_nltk = _flatten_disagreements(df, 'nltk')
    flat_list_spacy = _flatten_disagreements(df, 'spacy')
    flat_list_stanza = _flatten_disagreements(df, 'stanza')
    flat_list_gt = _flatten_disagreements(df, 'GT')

    # Compute the label list once and sort it: the original recomputed
    # list(set(...)) per call, whose ordering depends on string-hash
    # randomization and made plots/reports non-reproducible across runs.
    labels = sorted(set(flat_list_gt))

    _plot_and_report('nltk', flat_list_gt, flat_list_nltk, labels)
    _plot_and_report('spacy', flat_list_gt, flat_list_spacy, labels)
    _plot_and_report('stanza', flat_list_gt, flat_list_stanza, labels)