Example #1
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # non-interactive backend for headless operation; set before importing pyplot
matplotlib.rc('xtick', labelsize=15)
matplotlib.rc('ytick', labelsize=15)
import matplotlib.pyplot as plt
from framework import util
import numpy as np
from main.data import VAD, BE5

df = util.load_tsv('overview.tsv')

ind = np.arange(1, 9)
print(ind)

fig, ax = plt.subplots()

ax.bar(ind[:3],
       df.loc['Average', VAD],
       color='blue',
       edgecolor='black',
       linewidth=1,
       hatch='/',
       zorder=3,
       label='VAD')  # zorder controls the foreground/background ordering

ax.bar(ind[3:],
       df.loc['Average', BE5],
       color='red',
       edgecolor='black',
       linewidth=1,
       hatch='\\',
       zorder=3,
       label='BE5')  # call truncated in the source; styling assumed to mirror the VAD bars
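# A minimal sketch of how the figure might be finished; the tick labels,
# legend placement, and output filename are assumptions, not the original code:
ax.set_xticks(ind)
ax.set_xticklabels(list(VAD) + list(BE5), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
fig.savefig('overview.png')  # the Agg backend renders straight to file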
Example #2
import pandas as pd
from framework.util import load_tsv
from framework import prepare_data

# compute additional entries for each data set in the monolingual approach
with open('new_entries_monolingual.tsv', 'w') as f:
    print('Lexicon\tN', file=f)
    # English
    set_new = set(load_tsv('lexicons/Warriner_BE.tsv').index)
    set_old = set(prepare_data.load_anew99().index)
    n_new = len(set_new.difference(set_old))
    print('{}\t{}'.format('Warriner_BE', n_new), file=f)

    # Spanish
    set_new = set(load_tsv('lexicons/Stadthagen_Dominance.tsv').index)
    set_old = set(prepare_data.load_redondo07().index)
    set_old = set(prepare_data.load_hinojosa16().index).union(set_old)
    n_new = len(set_new.difference(set_old))
    print('{}\t{}'.format('Stadthagen_Dominance', n_new), file=f)

    # German (Vo lexicon)
    set_new = set(load_tsv('lexicons/Vo_BE.tsv').index)
    set_old = set(prepare_data.load_briesemeister11().index)
    n_new = len(set_new.difference(set_old))
    print('{}\t{}'.format('Vo_BE', n_new), file=f)

    # Polish
    set_new = set(load_tsv('lexicons/Imbir_BE.tsv').index)
    set_old = set(prepare_data.load_wierzba15().index)
    n_new = len(set_new.difference(set_old))
    print('{}\t{}'.format('Imbir_BE', n_new), file=f)
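    # The four blocks above repeat one pattern; a hedged sketch of a helper
    # that would factor it out (function and argument names are hypothetical):
    def count_new_entries(new_path, old_index_sets):
        """Count entries of the new lexicon that none of the old lexicons cover."""
        set_new = set(load_tsv(new_path).index)
        set_old = set().union(*old_index_sets)
        return len(set_new - set_old)

    # e.g. for English:
    # n_new = count_new_entries('lexicons/Warriner_BE.tsv',
    #                           [set(prepare_data.load_anew99().index)])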
Example #3
import framework.util as util
from main.data import IN_PAPER_NAMES, VA, BE5, SHORT_COLUMNS
import datetime
import pandas as pd


df = util.load_tsv('results.tsv')


df = df[VA + BE5]
df.rename(index=IN_PAPER_NAMES, inplace=True)
df.rename(index=str, columns=SHORT_COLUMNS, inplace=True)

# reorder the index to match the order used in the paper
df = df.reindex(list(IN_PAPER_NAMES.values()))

# read normalized split-half reliabilities; 'Dom' is dropped because df has no dominance column
df_shr = util.load_tsv('../../analysis/shr/shr_normalized.tsv').drop('Dom', axis=1)

df = df.round(3)

df_lesser = df < df_shr
df_greater = df > df_shr

lines = []
lines.append('%%%%%% Automatic Python output from {} %%%%%%%%%%'.format(datetime.datetime.now()))
lines.append('\\begin{tabular}{|l|rr|rrrrr|}')
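# The listing breaks off here; a sketch of how the remaining rows might be
# emitted, bolding values that beat the split-half reliability (the exact
# formatting is an assumption, not the original code):
for row in df.index:
    cells = []
    for col in df.columns:
        cell = '{:.3f}'.format(df.loc[row, col])
        if df_greater.loc[row, col]:
            cell = '\\textbf{{{}}}'.format(cell)
        cells.append(cell)
    lines.append('{} & {} \\\\'.format(row, ' & '.join(cells)))
lines.append('\\end{tabular}')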
Example #4
                    base_model=base_model, source_lexicon=source_lexicon)
                for var in list(source_lexicon):
                    models[var] = framework.models.SKlearn_Mapping_Model(
                        base_model=base_model,
                        source_lexicon=source_lexicon.drop(var, axis=1))

                # Run actual evaluation
                ev = framework.models.Evaluator(models=models)
                ev.crossvalidate(words=target_lexicon.index,
                                 labels=target_lexicon,
                                 k_splits=k_fold,
                                 outpath='results/{}/{}/{}/'.format(
                                     curr_dir, base_model_name, setting.name))

                ### compute difference to full model:
                df_full = util.load_tsv('results/{}/{}/{}/full.tsv'.format(
                    curr_dir, base_model_name, setting.name))
                print(df_full)
                for var in list(source_lexicon):
                    df_var = util.load_tsv('results/{}/{}/{}/{}.tsv'.format(
                        curr_dir, base_model_name, setting.name, var))
                    print(df_var)
                    df_diff = df_var - df_full
                    print(df_diff)
                    util.save_tsv(df=df_diff,
                                  path='results/{}/{}/{}/diff_{}.tsv'.format(
                                      curr_dir, base_model_name, setting.name,
                                      var))

### compute average values
average_subdirs('results/be2vad/lm')
average_subdirs('results/vad2be/lm')
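# average_subdirs is defined elsewhere in the repository; purely as an
# assumption, such a helper could look roughly like this:
import os

def average_subdirs_sketch(root):
    # average all per-fold TSVs within each subdirectory of root
    for sub in os.listdir(root):
        path = os.path.join(root, sub)
        if not os.path.isdir(path):
            continue
        dfs = [util.load_tsv(os.path.join(path, name))
               for name in os.listdir(path) if name.endswith('.tsv')]
        if dfs:
            util.save_tsv(df=sum(dfs) / len(dfs),
                          path=os.path.join(path, 'average.tsv'))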
Example #5
import pandas as pd
from main.data import SETTINGS, IN_PAPER_NAMES, VAD, BE5, SHORT_COLUMNS
from framework.util import get_average_result_from_df, save_tsv, no_zeros_formatter, load_tsv
import datetime
import framework.util as util

directions = ['be2vad', 'vad2be']

models = ['baseline', 'reference_LM', 'Reference_KNN', 'my_model']
VARS = VAD + BE5

df = pd.DataFrame(index=[setting.name for setting in SETTINGS], columns=VARS)

for d in directions:
    for s in SETTINGS:
        results = load_tsv('results/{}/{}/my_model.tsv'.format(d, s.name))
        for var in VARS:
            if var in list(results):
                df.loc[s.name, var] = results.loc['Average', var]

df.rename(index=IN_PAPER_NAMES, inplace=True)
df.rename(index=str, columns=SHORT_COLUMNS, inplace=True)
save_tsv(df, 'overview_individual.tsv')

# read normalized split-half reliabilities to make larger values bold
df_shr = load_tsv('../../analysis/shr/shr_normalized.tsv')
df_greater = df > df_shr
df_lesser = df < df_shr
print(df_greater)
print(df_lesser)
Example #6
import scipy.stats as st  # assumed import; this fragment calls st.ttest_rel below

with open('overview.tex', 'w') as f:
    print(string, file=f)

####################################################

### Significance tests
settings = [s.name for s in SETTINGS]

star_df = pd.DataFrame(columns=directions)

for d in directions:
    for s in settings:
        ### load all individual data frames
        dfs = {}
        for m in models:
            dfs[m] = load_tsv('results/{}/{}/{}.tsv'.format(d, s, m))
        # write average results into a single data frame to determine the two best systems
        average_results = pd.DataFrame(columns=['r'])
        for key, value in dfs.items():
            average_results.loc[key, 'r'] = value.loc['Average', 'Average']
        # sort by performance and get the names of the two best systems
        average_results = average_results.sort_values(by='r',
                                                      axis=0,
                                                      ascending=False)
        best_2 = list(average_results.index)[:2]
        # compute paired t-test on individual results of cross-validation
        pvalue = st.ttest_rel(a=dfs[best_2[0]].drop(['SD', 'Average'],
                                                    axis=0)['Average'],
                              b=dfs[best_2[1]].drop(['SD', 'Average'],
                                                    axis=0)['Average'])[1]
        # compute the number of stars
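        # A sketch of the truncated step: the usual .05/.01/.001 convention
        # for significance stars (thresholds assumed, not taken from the original)
        if pvalue < 0.001:
            stars = '***'
        elif pvalue < 0.01:
            stars = '**'
        elif pvalue < 0.05:
            stars = '*'
        else:
            stars = ''
        star_df.loc[s, d] = stars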