Ejemplo n.º 1
0
def crosslingual():
    """Induce BE5 lexicons for several target languages.

    Trains a single VA-->BE5 mapping model on the pooled data of all
    languages, then applies it to each target-language VA lexicon and
    saves the induced BE5 lexicon as a TSV file.
    """
    pooled = pd.concat(list(lang_data.values()))

    # VA-->BE5 model fitted on the combined data of every language.
    model = get_model(n_inputs=len(VA),
                      n_outputs=len(BE5),
                      source_lexicon=pooled[VA])
    model.fit(words=pooled.index, labels=pooled[BE5])

    def get_zh():
        # The Chinese data comes from two separate studies; merge them
        # and remove duplicated entries.
        merged = pd.concat([prepare_data.load_yu16(),
                            prepare_data.load_yao16()])
        return drop_duplicates(merged)

    settings = [
        {'name': 'it_Montefinese_BE', 'load': prepare_data.load_montefinese14},
        {'name': 'pt_Soares_BE', 'load': prepare_data.load_soares12},
        {'name': 'nl_Moors_BE', 'load': prepare_data.load_moors13},
        {'name': 'id_Sianipar_BE', 'load': prepare_data.load_sianipar16},
        {'name': 'zh_Yu_Yao_BE', 'load': get_zh},
        {'name': 'fr_Monnier_BE', 'load': prepare_data.load_monnier14},
        {'name': 'gr_Palogiannidi_BE', 'load': prepare_data.load_palogiannidi16},
        {'name': 'fn_Eilola_BE', 'load': prepare_data.load_eilola10},
        {'name': 'sv_Davidson_BE', 'load': prepare_data.load_davidson14},
    ]

    num_of_new_entries = pd.DataFrame(columns=['N'])
    for entry in settings:
        print(entry['name'])
        source_lex = entry['load']()
        lex = model.lexicon_creation(words=source_lex.index,
                                     features=source_lex[VA])
        save_tsv(lex, 'lexicons/{}.tsv'.format(entry['name']))
        num_of_new_entries.loc[entry['name']] = len(lex)
    save_tsv(num_of_new_entries, 'new_entries_crosslingual.tsv')
Ejemplo n.º 2
0
def monolingual():
    """Induce emotion lexicons within single languages.

    For each configured language, fits a mapping between the source and
    target emotion representation on training data and applies it to the
    words of the target data set; results are written as TSV files.
    """

    def create_lexicon(source_rep, target_rep, train_data, target_data):
        # Fit a source_rep-->target_rep mapper on train_data and apply it
        # to the words of target_data.
        mapper = get_model(n_inputs=len(source_rep),
                           n_outputs=len(target_rep),
                           source_lexicon=train_data[source_rep])
        mapper.fit(words=train_data.index, labels=train_data[target_rep])
        return mapper.lexicon_creation(words=target_data.index,
                                       features=target_data[source_rep])

    #### for the monolingual set-up, we selected the data sets with the
    #### highest accuracy in the monolingual evaluation (estimated to be the cleanest data)
    settings = [
        {'name': 'Warriner_BE',
         'source_rep': VAD,
         'target_rep': BE5,
         'train_data': prepare_data.get_english_anew(),
         'target_data': prepare_data.load_warriner13},
        {'name': 'Stadthagen_Dominance',
         'source_rep': BE5,
         'target_rep': VAD,
         'train_data': prepare_data.get_spanish_hinojosa(),
         'target_data': prepare_data.load_stadthagen17},
        {'name': 'Vo_BE',
         'source_rep': VA,
         'target_rep': BE5,
         'train_data': prepare_data.get_german_bawl(),
         'target_data': prepare_data.load_vo09},
        {'name': 'Imbir_BE',
         'source_rep': VA,
         'target_rep': BE5,
         'train_data': prepare_data.get_polish_nawl(),
         'target_data': prepare_data.load_imbir16},
    ]

    for conf in settings:
        lex = create_lexicon(source_rep=conf['source_rep'],
                             target_rep=conf['target_rep'],
                             train_data=conf['train_data'],
                             target_data=conf['target_data']())
        save_tsv(lex, 'lexicons/{}.tsv'.format(conf['name']))
Ejemplo n.º 3
0
    def crossvalidate(self, words, labels, k_splits, outpath):
        '''
        Performs crossvalidation with each of the models given to this
        instance of the Evaluator class. The different models are tested on
        identical train/test splits which allows for using paired t-tests.

        Args:
            words: Sequence of word strings; converted to a pd.Series.
            labels: pd.DataFrame of gold ratings, row-aligned with words.
            k_splits: Number of cross-validation folds; must be > 1.
            outpath: Directory for one result TSV per model (created if
                missing).
        '''
        if not os.path.isdir(outpath):
            os.makedirs(outpath)
        words = pd.Series(words)

        # Fix: the two message fragments were previously concatenated
        # without a separating space ("...unequallength.").
        assert len(words) == len(labels), ('Words and labels have unequal '
                                           'length.')
        assert k_splits > 1, 'Crossvalidation makes no sense for k<2!'

        # One result frame per model: rows are folds, columns mirror labels.
        results = {key: pd.DataFrame(columns=labels.columns)
                   for key in self.models}
        k = 1
        for train_index, test_index in KFold(n_splits=k_splits,
                                             shuffle=True).split(labels):
            print('k=' + str(k))
            train_labels = labels.iloc[train_index]
            test_labels = labels.iloc[test_index]
            train_words = words.iloc[train_index]
            test_words = words.iloc[test_index]

            # Every model sees the identical split (enables paired t-tests).
            for model_name in self.models:
                print(model_name)
                model = self.models[model_name]
                model.fit(train_words, train_labels)
                preds = model.predict(test_words)
                preds = pd.DataFrame(preds, columns=labels.columns)
                results[model_name].loc[k] = util.eval(test_labels, preds)
                model.initialize()  # resets the model for the next fold
            k += 1
        # Averaging results and writing one TSV per model.
        print('\n')
        for m_name in results:
            results[m_name] = util.average_results_df(results[m_name])
            util.save_tsv(df=results[m_name],
                          path=outpath + '/' + m_name + '.tsv')
Ejemplo n.º 4
0
    'bawl': load_vo09(),
    'nawl': load_riegel15(),
    'imbir': load_imbir16()
}

# NOTE: this used to be a dict literal with 'hinojosa' appearing twice as a
# key, so the pair ('hinojosa', 'stadthagen_va') was silently dropped (a dict
# keeps only the last value for a duplicated key). A list of tuples preserves
# every intended comparison.
pairs = [
    ('anew', 'warriner'),
    ('redondo', 'stadthagen_va'),
    ('hinojosa', 'stadthagen_va'),
    ('ferre', 'stadthagen_be'),
    ('hinojosa', 'stadthagen_be'),
    ('schmidtke', 'bawl'),
    ('nawl', 'imbir'),
]

results = {}

# Compute the inter-study reliability for every configured lexicon pair.
for first, second in pairs:
    result = compute_isr(data[first], data[second])
    print(results)
    results['{}-{}'.format(first, second)] = result

df = pd.DataFrame(index=results.keys(), columns=VAD + BE5 + ['N'])
for key, value in results.items():
    df.loc[key] = value

print(df)

save_tsv(df, 'results.tsv')
Ejemplo n.º 5
0
import pandas as pd
from main import data
from framework.util import save_tsv

# Tabulate the number of entries of every data set.
df = pd.DataFrame(columns=['N'])
for setting in data.SETTINGS:
    df.loc[setting.name] = len(setting.load_data())

# Switch to the data-set names used in the paper.
df.rename(index=data.IN_PAPER_NAMES, inplace=True)

print(df)
save_tsv(df, 'corpus_statistics.tsv')
Ejemplo n.º 6
0
settings = [
    'English_ANEW_Stevenson', 'German_Schmidtke', 'Polish_Imbir',
    'Spanish_Hinojosa', 'Spanish_Redondo'
]

# Average ablation difference per basic-emotion variable (BE2VAD direction).
results_be = pd.DataFrame(index=settings, columns=BE5)
for name in settings:
    for var in BE5:
        path = 'results/be2vad/lm/{}/diff_{}.tsv'.format(name, var)
        results_be.loc[name, var] = util.get_average_result_from_df(path)

# Same for the VAD variables (VAD2BE direction).
results_vad = pd.DataFrame(index=settings, columns=VAD)
for name in settings:
    for var in VAD:
        path = 'results/vad2be/lm/{}/diff_{}.tsv'.format(name, var)
        results_vad.loc[name, var] = util.get_average_result_from_df(path)

df = results_vad.join(results_be, how='inner')
df.loc['Average'] = df.mean(axis=0)
df = df * (-1)  # change sign

util.save_tsv(df, 'overview_with_hinojosa.tsv')
df.to_latex('overview_with_hinojosa.tex', float_format=util.no_zeros_formatter)

# Recompute the average with the Hinojosa rows excluded.
df.drop(['Spanish_Hinojosa', 'Average'], inplace=True)
df.loc['Average'] = df.mean(axis=0)
util.save_tsv(df, 'overview.tsv')
df.to_latex('overview.tex', float_format=util.no_zeros_formatter)
Ejemplo n.º 7
0

def undo_spearman_brown(x, k=2):
    '''
    Reverts the Spearman-Brown adjustment.

    Generalized from the original hard-coded k=2 (the split-half
    reliability case) to any lengthening factor k; the default keeps the
    original behavior. Inverse of spearman_brown_adjustment, i.e. solves
    r_adjusted = (k*r) / (1 + (k-1)*r) for r.

    Args:
        x: The adjusted reliability to revert.
        k: The lengthening factor that was used in the adjustment
            (default 2, the split-half case).

    Returns:
        The unadjusted reliability.
    '''
    return 1 / ((k / x) - (k - 1))


def spearman_brown_adjustment(r, k):
    '''
    Applies the Spearman-Brown prophecy formula: predicts the reliability
    of a test lengthened by factor k from the reliability r of the
    original test.
    '''
    numerator = k * r
    denominator = 1 + (k - 1) * r
    return numerator / denominator


# Persist the split-half reliabilities exactly as reported; the auxiliary
# CORRECTED flag column is excluded from the output file.
save_tsv(df.drop('CORRECTED', axis=1), 'shr_as_reported.tsv')

# print(df)
'''
Normalize split-half reliabilities to N=10. That is, apply spearman_brown_
adjustment with k=10/N if the score has already been normalized (e.g., N=30
then each half contains ratings from 15 participants. Then the predicted reliability
for N=30 was reported). Otherwise, if the reported split-half-reliabilities has
not already been adjusted, k is set to 10/(N/2). E.g., N=20, then 10 ratings are 
in each split, so k=10/(20/2)=1, so the reported reliabilities remain unchanged.
'''
N_star = 10  # target number of ratings to normalize reliabilities to
for i in range(df.shape[0]):
    if df['CORRECTED'][i] == 1:
        # for j in range(df.shape[1]):
        # 	df.iloc[i,j]=undo_spearman_brown(df.iloc[i,j])
Ejemplo n.º 8
0
                for var in list(source_lexicon):
                    models[var] = framework.models.SKlearn_Mapping_Model(
                        base_model=base_model,
                        source_lexicon=source_lexicon.drop(var, axis=1))

                # Run actual evaluation
                ev = framework.models.Evaluator(models=models)
                ev.crossvalidate(words=target_lexicon.index,
                                 labels=target_lexicon,
                                 k_splits=k_fold,
                                 outpath='results/{}/{}/{}/'.format(
                                     curr_dir, base_model_name, setting.name))

                ### compute difference to full model:
                df_full = util.load_tsv('results/{}/{}/{}/full.tsv'.format(
                    curr_dir, base_model_name, setting.name))
                print(df_full)
                for var in list(source_lexicon):
                    df_var = util.load_tsv('results/{}/{}/{}/{}.tsv'.format(
                        curr_dir, base_model_name, setting.name, var))
                    print(df_var)
                    df_diff = df_var - df_full
                    print(df_diff)
                    util.save_tsv(df=df_diff,
                                  path='results/{}/{}/{}/diff_{}.tsv'.format(
                                      curr_dir, base_model_name, setting.name,
                                      var))

### compute average values
# NOTE(review): presumably averages the per-setting result TSVs within each
# directory, for both mapping directions — confirm against the helper's
# definition elsewhere in the project.
average_subdirs('results/be2vad/lm')
average_subdirs('results/vad2be/lm')
Ejemplo n.º 9
0
            source_lexicon = pd.concat(
                [train_lex[source_rep], test_lex[source_rep]])

            target_lexicon = test_lex[target_rep]
            n_in = source_lexicon.shape[1]
            n_out = target_lexicon.shape[1]

            my_model = framework.models.Mapping_Model(
                layers=[n_in] + MY_MODEL['hidden_layers'] + [n_out],
                activation=MY_MODEL['activation'],
                dropout_hidden=MY_MODEL['dropout_hidden'],
                train_steps=MY_MODEL['train_steps'],
                batch_size=MY_MODEL['batch_size'],
                optimizer=MY_MODEL['optimizer'],
                source_lexicon=source_lexicon,
                verbose=1)

            my_model.fit(train_lex.index, train_lex[target_rep])
            preds = my_model.predict(target_lexicon.index)
            preds = pd.DataFrame(preds, columns=list(target_lexicon))
            result = util.eval(preds, target_lexicon)
            print(result)
            results[d].loc[setting.name] = result
            print(results[d])

# Append a per-direction mean column, then place both directions' result
# tables side by side in a single frame before saving.
results['vad2be']['Avg_BE'] = results['vad2be'].mean(axis=1)
results['be2vad']['Avg_VA'] = results['be2vad'].mean(axis=1)
results = pd.concat([results[key] for key in list(DIRECTIONS)], axis=1)

util.save_tsv(results, 'results.tsv')
Ejemplo n.º 10
0
models = ['baseline', 'reference_LM', 'Reference_KNN', 'my_model']
VARS = VAD + BE5

# One row per setting, one column per emotion variable.
df = pd.DataFrame(index=[setting.name for setting in SETTINGS], columns=VARS)

# Each direction's result file contributes only the variables it contains,
# so both directions together fill the full row.
for direction in directions:
    for setting in SETTINGS:
        results = load_tsv('results/{}/{}/my_model.tsv'.format(
            direction, setting.name))
        for var in VARS:
            if var in list(results):
                df.loc[setting.name, var] = results.loc['Average', var]

df.rename(index=IN_PAPER_NAMES, inplace=True)
df.rename(index=str, columns=SHORT_COLUMNS, inplace=True)
save_tsv(df, 'overview_individual.tsv')

# read normalized split half reliabilites to make larger values bold
df_shr = load_tsv('../../analysis/shr/shr_normalized.tsv')
df_greater = df > df_shr
df_lesser = df < df_shr
print(df_greater)
print(df_lesser)

outperformed = 0
not_outperformed = 0

# add cell colour
df = df.round(3)
print(df)
Ejemplo n.º 11
0
# One row per setting; one column per (direction, model) combination.
df = pd.DataFrame(
    index=[setting.name for setting in SETTINGS],
    columns=(['be2vad_' + model for model in models] +
             ['vad2be_' + model for model in models]))

# Fill in the average performance of every model in both directions.
for direction in directions:
    for setting in SETTINGS:
        for model in models:
            result_path = 'results/{}/{}/{}.tsv'.format(
                direction, setting.name, model)
            df.loc[setting.name, direction + '_' + model] = \
                get_average_result_from_df(result_path)

df.loc['Average'] = df.mean(axis=0)

df.rename(index=IN_PAPER_NAMES, inplace=True)

save_tsv(df, 'overview.tsv', dec=3)

string = df.to_latex(float_format=no_zeros_formatter)

lines = string.split('\n')
lines[0] = '\\begin{tabular}{|l|rrrr|rrrr|}'
lines = [
    '%%%%%% Automatic Python output from {} &%%%%%%%%%%'.format(
        datetime.datetime.now())
] + lines
lines[-1] = '%%%%%%%%%%%%%%%%%%%%%%%%'

lines.insert(
    3, '{} & \multicolumn{4}{c|}{BE2VAD} & \multicolumn{4}{c|}{VAD2BE} \\\\')
lines[4]=lines[4].replace('be2vad\_','').replace('vad2be\_', '').\
 replace('Reference\_','').replace('reference\_','').replace('my\_model','FFNN').\