Example #1
import os
import sys
import threading

def predict_all():
    # spawn one worker thread per missing result file and join them all at the end;
    # conf, trait_list, low_repetitions, num_repetitions, annotation_value,
    # shuffle_labels and save_predictions come from the enclosing script
    threads = []

    for trait in trait_list:
        for si in xrange(low_repetitions, num_repetitions):
            fname = conf.get_result_filename(annotation_value, trait, shuffle_labels, si, add_suffix=True)
            if not os.path.exists(fname):
                thread = threading.Thread(target=save_predictions, args=(trait, conf.get_result_filename(annotation_value, trait, shuffle_labels, si), si))
                sys.stdout.flush()
                thread.start()
                threads.append(thread)
            else:
                print "existing solution:", fname

    for thread in threads:
        print 'waiting to join'
        thread.join()
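The pattern above is a simple fan-out/fan-in: start every worker thread, then join them all. A minimal self-contained sketch of the same pattern, with a hypothetical work() standing in for save_predictions:

import threading

def work(i):
    # stand-in for save_predictions; any I/O- or native-code-bound task fits here
    print('worker %d done' % i)

threads = []
for i in range(4):
    t = threading.Thread(target=work, args=(i,))
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # block until every worker has finished

Since CPython's global interpreter lock serializes pure-Python bytecode, threading like this pays off mainly when save_predictions spends its time in I/O or in native code such as NumPy.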
Example #2
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from config import conf
import os

# histogram of how often each window size was chosen, summed over traits and iterations
hist_sum = np.zeros(len(conf.all_window_sizes), dtype=int)
for trait in xrange(0, conf.n_traits):
    for si in xrange(0, 100):
        filename = conf.get_result_filename(conf.annotation_all,
                                            trait,
                                            False,
                                            si,
                                            add_suffix=True)
        if os.path.exists(filename):
            data = np.load(filename)
            chosen_window_indices = data['chosen_window_indices']
            hist, _ = np.histogram(chosen_window_indices,
                                   bins=np.arange(-0.5,
                                                  len(conf.all_window_sizes),
                                                  1))
            hist_sum += hist
        else:
            print 'did not find', filename

hist_sum_sum = np.sum(hist_sum)

plt.figure()
ax = plt.subplot(111)
bars = ax.bar(conf.all_window_sizes,
              hist_sum / float(hist_sum_sum) * 100)  # further bar-styling arguments are cut off in the source
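The np.arange(-0.5, n, 1) bin edges center one bin on each integer index, so the histogram simply counts how often each window-size index was chosen. A quick sketch of the same trick on toy data:

import numpy as np

indices = np.array([0, 2, 2, 1, 2, 0])  # toy stand-in for chosen_window_indices
n_bins = 3
# edges -0.5, 0.5, 1.5, 2.5 place exactly one integer at the center of each bin
hist, edges = np.histogram(indices, bins=np.arange(-0.5, n_bins, 1))
print(hist)  # [2 1 3]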
Example #3
import math
import os

import numpy as np

from config import conf

def get_feature_correlations():
    # gs (providing full_long_label_list) is imported elsewhere in the enclosing script
    # find the window size that was most frequently chosen
    hist_sum = np.zeros(len(conf.all_window_sizes), dtype=int)
    for trait in xrange(0, conf.n_traits):
        for si in xrange(0, 100):
            filename = conf.get_result_filename(conf.annotation_all,
                                                trait,
                                                False,
                                                si,
                                                add_suffix=True)
            if os.path.exists(filename):
                data = np.load(filename)
                chosen_window_indices = data['chosen_window_indices']
                hist, _ = np.histogram(chosen_window_indices,
                                       bins=np.arange(
                                           -0.5, len(conf.all_window_sizes),
                                           1))
                hist_sum += hist
            else:
                print 'did not find', filename

    ws = conf.all_window_sizes[np.argmax(hist_sum)]

    # load features for the most frequently chosen time window
    x_file, y_file, id_file = conf.get_merged_feature_files(ws)
    x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
    ids_ws = np.genfromtxt(id_file, delimiter=',',
                           skip_header=1).astype(int)[:, 0]
    y = np.genfromtxt(conf.binned_personality_file,
                      skip_header=1,
                      usecols=xrange(1, conf.n_traits + 1),
                      delimiter=',')
    y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)

    # compute average feature per person
    avg_x_ws = np.zeros((conf.n_participants, conf.max_n_feat))
    for p in xrange(0, conf.n_participants):
        avg_x_ws[p, :] = np.mean(x_ws[ids_ws == p, :], axis=0)

    feature_correlations_avg = []
    for fi in xrange(0, conf.max_n_feat):
        C_avg = np.corrcoef(y.transpose(), avg_x_ws[:, fi])[-1][:-1]
        feature_correlations_avg.append(C_avg)

    feature_correlations_avg = np.array(feature_correlations_avg)

    # find the n highest correlations for each trait and write them into a .tex table - see Table 4 in the SI
    n = 15
    highest_correlated_features = []
    highest_correlated_features_lists = []
    highest_correlated_features_names = []
    for t in xrange(0, conf.n_traits):
        hcf = feature_correlations_avg[:, t].argsort()[-n:]
        locallist = []
        for f in hcf:
            if f not in highest_correlated_features:
                highest_correlated_features.append(f)
                highest_correlated_features_names.append(
                    gs.full_long_label_list[f].lower())
            locallist.append(f)

        highest_correlated_features_lists.append(locallist)

    features = zip(highest_correlated_features_names,
                   highest_correlated_features)
    highest_correlated_features = [y for (x, y) in sorted(features)]

    filename = conf.figure_folder + '/table4.tex'
    print len(highest_correlated_features)
    with open(filename, 'w') as f:
        f.write('feature&Neur.&Extr.&Open.&Agree.&Consc.&PCS&CEI')
        f.write('\\\\\n\\hline\n')
        for fi in highest_correlated_features:
            f.write(gs.full_long_label_list[fi])
            for t in xrange(0, conf.n_traits):
                fc = feature_correlations_avg[fi, t]
                if math.isnan(fc):
                    f.write('&-')
                elif fi in highest_correlated_features_lists[t]:
                    f.write('&\\textbf{' + '%.2f}' % fc)
                else:
                    f.write('&' + '%.2f' % fc)
            f.write('\\\\\n')
    print
    print filename, 'written'
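The indexing in np.corrcoef(y.transpose(), avg_x_ws[:, fi])[-1][:-1] stacks the feature column onto the trait matrix and reads off the last row of the correlation matrix, i.e. the correlation of that feature with every trait. A small sketch with made-up shapes:

import numpy as np

rng = np.random.RandomState(0)
traits = rng.rand(5, 30)   # 5 traits x 30 participants, the y.transpose() layout
feature = rng.rand(30)     # one averaged feature value per participant

C = np.corrcoef(traits, feature)  # (6, 6); the feature is the last row/column
feat_vs_traits = C[-1][:-1]       # feature vs. each of the 5 traits
print(feat_vs_traits.shape)       # (5,)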
Example #4
            if not os.path.exists(result_filename):
                print 'computing data for', comp_title
                print 'Note that this might take a while - if the script is run again, intermediate results will be available and will speed up the computation.'

                predictions_I = np.zeros(
                    (conf.n_participants, conf.n_traits, conf.max_n_iter),
                    dtype=int)
                predictions_II = np.zeros(
                    (conf.n_participants, conf.n_traits, conf.max_n_iter),
                    dtype=int)

                for trait in xrange(0, conf.n_traits):
                    for si in xrange(0, conf.max_n_iter):
                        filenameI = conf.get_result_filename(
                            annotation_value_I,
                            trait,
                            False,
                            si,
                            add_suffix=True)
                        filenameII = conf.get_result_filename(
                            annotation_value_II,
                            trait,
                            False,
                            si,
                            add_suffix=True)

                        if os.path.exists(filenameI) and os.path.exists(
                                filenameII):
                            dataI = np.load(filenameI)
                            detailed_predictions_I = dataI[
                                'detailed_predictions']
                            chosen_window_indices_I = dataI[
                                'chosen_window_indices']

        truth = np.genfromtxt(conf.binned_personality_file,
                              skip_header=1,
                              usecols=(trait + 1, ),
                              delimiter=',')
        for i in xrange(0, 100):
            rand_guess = np.random.randint(1, 4, conf.n_participants)
            f1 = f1_score(truth, rand_guess, average='macro')
            collection.append(
                [f1, conf.medium_traitlabels[trait], i, 'random guess'])

        # baseline 3: label permutation test
        #             computed by label_permutation_test.sh and written into the results, i.e. it is just loaded here
        for si in xrange(0, m_iter):
            filename_rand = conf.get_result_filename(conf.annotation_all,
                                                     trait,
                                                     True,
                                                     si,
                                                     add_suffix=True)
            if os.path.exists(filename_rand):
                data = np.load(filename_rand)
                pr = data['predictions']
                dt = truth[pr > 0]
                pr = pr[pr > 0]
                f1 = f1_score(dt, pr, average='macro')
                collection.append([
                    f1, conf.medium_traitlabels[trait], si, 'label permutation'
                ])
            else:
                print 'did not find', filename_rand
                print 'consider (re-)running label_permutation_test.sh'
                sys.exit(1)
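The pr > 0 mask drops participants that never received a prediction (0 serves as the sentinel) before scoring. A minimal sketch of the masked macro-F1 computation, assuming scikit-learn's f1_score:

import numpy as np
from sklearn.metrics import f1_score

truth = np.array([1, 2, 3, 1, 2])
pr = np.array([1, 0, 3, 2, 2])  # 0 marks a missing prediction

mask = pr > 0
print(f1_score(truth[mask], pr[mask], average='macro'))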