Ejemplo n.º 1
0
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from data import Statistics, Preprocessing
''' This file plots all the statistics graphs needed for the thesis report. '''

# Build the preprocessing object from the labelled corpus CSV and hand it to
# the Statistics helper (both come from the project's `data` module).
preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
statistics = Statistics(preprocessed)

#######################################################################################################################
#                                 PLOTTING DIALOGUE ACT DISTRIBUTIONAL DATA                                           #
#######################################################################################################################
# Every return value below is discarded, so these calls are made purely for
# their side effects.
# NOTE(review): presumably they write result files under 'analyses/' (one such
# file is read further down) — confirm in data.Statistics.
# The meaning of the groups 1-4 is defined by Statistics and is not visible
# here.
statistics.get_da_distribution()
statistics.get_da_distributions(['participant', 'interviewer'], [1, 2, 3, 4])
statistics.get_da_counts(preprocessed.data, 'dialogue_act', [1, 2, 3, 4])
statistics.get_average_utterance_length(['participant', 'interviewer'],
                                        [1, 2, 3, 4])
statistics.get_speaker_ratios([1, 2, 3, 4])

# Read the stored dialogue act distribution. The file has no header row and
# its first column is used as the index.
distribution_order = pd.read_csv(
    'analyses/dialogue_act_distribution.csv',
    header=None,
    index_col=[0],
)

# Draw the distribution as a bar chart with a reversed 13-shade blue palette,
# drop the legend and fetch the x tick labels for the formatting that follows.
sns.set_palette(sns.color_palette('Blues_r', 13))
graph = distribution_order.plot(kind='bar')
plt.legend().remove()
_, labels = plt.xticks()
graph.set_xticklabels(labels,
Ejemplo n.º 2
0
# Use font size 6 for the x-axis tick labels (matplotlib's
# 'xtick.labelsize' setting); applies to figures created from here on.
plt.rcParams['xtick.labelsize'] = 6

# Pieces of the result-file names loaded further down: the weighting prefix,
# the sequence lengths to iterate over and the input-setting suffixes.
weighted = 'unweighted'
sequence_lengths = [3]
input_settings = [
    '_d',
    '_d_s',
    '_d_s_l',
    '_d_s_l_u',
]

# Baseline names and single-letter colour codes used by later code.
baselines = [
    'majority_class',
    'random',
    'weighted_random',
]
colors = ['b', 'r', 'y', 'g']

# Two separate empty lists, created up front so that later code can fill them.
names, x = [], []

# Build the preprocessing object from the labelled corpus CSV and wrap it in
# the Statistics helper.
preprocessed = Preprocessing('data/DA_labeled_belc_2019.csv')
statistics = Statistics(preprocessed)

# Index labels of the dialogue act distribution, in the order returned.
# NOTE(review): presumably ordered by how often each act occurs, as the
# variable name suggests — confirm in Statistics.get_da_distribution.
da_sorted_by_occurance = [*statistics.get_da_distribution().index]

for sequence_length in sequence_lengths:

    # Initialises the plot format.
    fig, ax = plt.subplots()

    # Gets the precision, recall and f1-score for every dialogue act for different model input settings.
    for input_setting in input_settings:

        # Loads in the data for the plots.
        filename = 'analyses/' + weighted + '_model_sequence_length_' + str(sequence_length) + input_setting + \
                   '_accuracy.csv'
        accuracies = pd.read_csv(filename, index_col=[0], header=[0, 1])

        # Sort the accuracies by the overall occurrence ratio of the classes.