Example #1
import os

import tensorflow as tf

from naacl.framework import util

# create the output directory if it does not exist yet
if not os.path.exists('results'):
    os.makedirs('results')

for setting_name in settings:
    print('Now processing {}'.format(setting_name))
    setting = settings[setting_name]
    model = setting['model']
    labels = setting['lexicon']
    features = setting['features']
    result = model.test_at_steps(features=features,
                                 labels=labels,
                                 test_size=.1,
                                 steps=list(range(1000, 15000 + 1, 1000)),
                                 runs=20,
                                 layers=[len(list(features)), 256, 128,
                                         len(list(labels))],
                                 nonlinearity=util.leaky_relu,
                                 loss_function=tf.losses.mean_squared_error,
                                 l2_beta=0,
                                 learning_rate=1e-3,
                                 batch_size=128,
                                 dropout_hidden=.5,
                                 dropout_embedding=.2,
                                 report_at=0,
                                 weights_sd=.001,
                                 biases=.01,
                                 batch_gen=util.Serial_Batch_Gen)
    util.save_tsv(result, 'results/' + setting_name + '.tsv')
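
Every example on this page writes its result frames out through util.save_tsv, and Example #5 reads them back with util.load_tsv. Neither helper is shown here; a minimal sketch, assuming they are thin wrappers around pandas' to_csv/read_csv with a tab separator and a preserved index, might look like this:

import pandas as pd

def save_tsv(df, path):
    # assumption: write a data frame as a tab-separated file, keeping the index
    df.to_csv(path, sep='\t', index=True)

def load_tsv(path):
    # assumption: read a tab-separated file back, using the first column as the index
    return pd.read_csv(path, sep='\t', index_col=0)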

Example #2
def main(results_path='results', metric='r'):
    RESULTS = results_path + '/'

    if not os.path.exists(RESULTS):
        os.makedirs(RESULTS)

    ### settings
    for setting in SETTINGS:
        print('Now processing {}'.format(setting.name))

        ### check if this setting has already been processed
        if os.path.isdir(RESULTS + setting.name):
            print('\t{} has already been processed!'.format(setting.name))
        else:

            labels = setting.load_data()
            embs = setting.load_embeddings()

            models = {
                'turney': turney.Bootstrapper(embs),
                'densifier': densifier.Densifier(embs),
                'my_model_relu': my_model_relu,
                'my_model_sigmoid': my_model_sigmoid,
                'aicyber': aicyber.mlp_ensemble(),
                'li_regressor': li_regressor(),
                'linear_model': li_regressor(
                    init_fun=sklearn.linear_model.LinearRegression)
            }

            results_setting = {
                key: pd.DataFrame(columns=labels.columns) for key in models
            }

            ### Crossvalidation
            k = 0
            for train_index, test_index in KFold(
                    n_splits=10, shuffle=True).split(labels):
                k += 1
                train = labels.iloc[train_index]
                test = labels.iloc[test_index]
                print(k)

                train_features = util.feature_extraction(train.index, embs)
                test_features = util.feature_extraction(test.index, embs)

                ### methods
                for model_name in list(models):
                    model = models[model_name]
                    print(model_name)

                    ### case distinction because the models do not share the
                    ### same interface
                    tf.reset_default_graph()
                    preds = None
                    if model_name in [
                            'aicyber', 'li_regressor', 'linear_model'
                    ]:
                        model.fit(train_features.copy(), train.copy())
                        preds = model.predict(test_features.copy())
                    elif model_name in ['my_model_relu', 'my_model_sigmoid']:
                        # print(train)
                        # sess=tf.Session()
                        session = model.fit(train_features.copy(),
                                            train.copy())
                        preds = model.predict(test_features.copy(),
                                              session,
                                              var_names=train.columns)
                        del session
                    else:
                        model.fit(train.copy())
                        preds = model.predict(test.index.copy())
                        ### debug output
                        print(test)
                        print(preds)
                    perf = util.eval(test, preds, metric)
                    print(perf)
                    results_setting[model_name].loc[k] = perf
                    print(results_setting[model_name])

            os.makedirs(RESULTS + setting.name)
            ### after CV: average each model's results data frame and save it
            for model_name in list(models):
                curr_results = results_setting[model_name]
                curr_results = util.average_results_df(curr_results)
                fname = '{}{}/{}.tsv'.format(RESULTS, setting.name, model_name)
                util.save_tsv(curr_results, fname)
            print('\tFinished processing {}'.format(setting.name))

        ### delete the respective setting to free up memory
        del setting
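
Example #2 (and #4) also depends on util.eval and util.average_results_df, which are not shown on this page. The sketch below is an assumption about their behaviour: eval is taken to compute a per-column Pearson r (consistent with metric='r'), and average_results_df is taken to append an 'Average' column and row, which would explain the tmp.loc['Average', 'Average'] lookup in Example #5:

import pandas as pd
from scipy.stats import pearsonr

def eval(gold, preds, metric='r'):
    # assumption: per-column Pearson correlation between gold ratings and predictions
    preds = pd.DataFrame(preds, index=gold.index, columns=gold.columns)
    return [pearsonr(gold[col], preds[col])[0] for col in gold.columns]

def average_results_df(df):
    # assumption: add an 'Average' column (mean over label dimensions) and an
    # 'Average' row (mean over CV folds), so that df.loc['Average', 'Average'] exists
    df = df.copy()
    df['Average'] = df.mean(axis=1)
    df.loc['Average'] = df.mean(axis=0)
    return df

(Shadowing the built-in eval is questionable style, but it mirrors the name used in the examples.)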
Example #3
    # fragment: this block runs inside a loop over 'setup' values; 'first' and
    # 'second' hold the per-setup result tables of the two systems being compared
    siglevel = ''
    pvalue = st.ttest_rel(a=first['Average'], b=second['Average'])[1]
    print(pvalue)
    if pvalue >= .05:
        siglevel = '–'
    elif pvalue < .05 and pvalue >= .01:
        siglevel = '*'
    elif pvalue < .01 and pvalue >= .001:
        siglevel = '**'
    else:
        siglevel = '***'
    # fill data frame
    sig_table.loc[setup] = best2 + [pvalue, siglevel]

print(sig_table)
util.save_tsv(sig_table, 'significance_test_results.tsv')

table.columns = [
    'LinReg', 'RidgReg', 'TL', 'Densifier', 'ensembleNN', 'jointNN'
]

print('\nt-test for averages: ')
print(st.ttest_rel(a=table.ensembleNN, b=table.jointNN))
print()

print()
table.loc['Average'] = table.mean(axis=0)
# print(table)
util.save_tsv(table, 'overall_table.tsv')
print(table.to_latex(float_format="%.3f"))
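
For reference, the p-value-to-marker mapping used above can be factored into a stand-alone helper; the numbers in the usage lines below are placeholders, not results from the experiments:

from scipy import stats as st

def significance_marker(pvalue):
    # same thresholds as in the example above
    if pvalue >= .05:
        return '–'
    if pvalue >= .01:
        return '*'
    if pvalue >= .001:
        return '**'
    return '***'

# st.ttest_rel performs a paired t-test; index [1] is the two-sided p-value
p = st.ttest_rel(a=[.71, .68, .74], b=[.65, .66, .70])[1]
print(p, significance_marker(p))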
Example #4
# the enclosing loop is cut off in this fragment; presumably 10-fold CV as in Example #2
k = 0
for train_index, test_index in KFold(n_splits=10, shuffle=True).split(labels):
    k += 1
    train = labels.iloc[train_index]
    test = labels.iloc[test_index]
    print(k)

    for config in configs:
        print(config)

        threshold, alpha = config

        ds.fit(seed_lexicon=train,
               binarization_threshold=threshold,
               alpha=alpha)
        prediction = ds.predict(words=test.index)
        performance = util.eval(test, prediction)
        print(performance)
        results_config[str(config)].loc[k] = performance

meta_df = pd.DataFrame(columns=['threshold', 'alpha'])

for config in configs:
    results_df = results_config[str(config)]
    results_df = util.average_results_df(results_df)
    fname = 'results/{}.tsv'.format(str(config))
    util.save_tsv(results_df, fname)
    meta_df.loc[fname] = config

util.save_tsv(meta_df, 'results/meta.tsv')
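
The grid itself (configs, results_config and the ds model) is defined outside this fragment. A hypothetical setup consistent with the surrounding examples, where the grid values, variable names and the Densifier construction are all assumptions, could look like this:

import itertools
import pandas as pd

# hypothetical grid of (binarization threshold in SDs, alpha) pairs
thresholds = [.25, .5, 1.]
alphas = [.3, .5, .7, .9]
configs = list(itertools.product(thresholds, alphas))

# one results frame per configuration, filled fold by fold (cf. Example #2);
# 'labels' is the seed lexicon loaded elsewhere in the script
results_config = {str(config): pd.DataFrame(columns=labels.columns)
                  for config in configs}

# 'ds' would be a Densifier built from pre-loaded embeddings, as in Example #2:
# ds = densifier.Densifier(embs)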
Example #5
import pandas as pd
from naacl.framework import util
'''
Queries the results of the parameter grid search and presents them as a
sorted table.

===>    A threshold of .5 SDs seems to be most appropriate. Alpha may be less
        important, with .3 to .9 all being roughly identical in performance;
        .7, however, was best and will be employed in the future.
'''

meta = util.load_tsv('results/meta.tsv')
# print(meta)
for i in meta.index:
    tmp = util.load_tsv(i)
    meta.loc[i, 'Average_Performance'] = tmp.loc['Average', 'Average']

meta = meta.sort_values(by='Average_Performance', ascending=False)

print(meta)
util.save_tsv(meta, 'grid_search_results.tsv')
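
Since meta is sorted in descending order of Average_Performance, the best configuration sits in the first row; a short, hypothetical follow-up to report it explicitly could be appended:

best_fname = meta.index[0]
best_threshold, best_alpha = meta.loc[best_fname, ['threshold', 'alpha']]
print('Best configuration: threshold={}, alpha={} ({})'.format(
    best_threshold, best_alpha, best_fname))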