def predict_all(): # add threads to a list, and wait for all of them in the end threads = [] for trait in trait_list: for si in xrange(low_repetitions, num_repetitions): fname = conf.get_result_filename(annotation_value, trait, shuffle_labels, si, add_suffix=True) if not os.path.exists(fname): thread = threading.Thread(target=save_predictions, args=(trait, conf.get_result_filename(annotation_value, trait, shuffle_labels, si), si)) sys.stdout.flush() thread.start() threads.append(thread) else: print "existing solution:", fname for thread in threads: thread.join() print 'waiting to join'
# NOTE(review): this chunk is cut off mid-statement at the end -- the final
# ax.bar(...) call is not closed here; its remaining arguments live in a
# part of the file outside this view.
import numpy as np
import matplotlib.pyplot as plt
import seaborn  # presumably imported only for its plot-styling side effect -- verify
from config import conf
import os

# Sum, over all traits and up to 100 iterations, histograms of which window
# size index was chosen in each saved result file; missing files are reported.
hist_sum = np.zeros((len(conf.all_window_sizes)), dtype=int)
for trait in xrange(0, conf.n_traits):
    for si in xrange(0, 100):
        filename = conf.get_result_filename(conf.annotation_all, trait, False,
                                            si, add_suffix=True)
        if os.path.exists(filename):
            data = np.load(filename)
            chosen_window_indices = data['chosen_window_indices']
            # bin edges at -0.5, 0.5, 1.5, ... give one bin per integer index
            hist, _ = np.histogram(chosen_window_indices,
                                   bins=np.arange(-0.5, len(conf.all_window_sizes), 1))
            hist_sum += hist
        else:
            print 'did not find', filename

# plot each window size's selection frequency as a percentage of the total
hist_sum_sum = np.sum(hist_sum)
plt.figure()
ax = plt.subplot(111)
bars = ax.bar(conf.all_window_sizes, hist_sum / float(hist_sum_sum) * 100,
def get_feature_correlations():
    """Correlate per-person average features with personality trait scores.

    Determines the window size most frequently chosen across all saved
    result files, loads the merged features for that window size, averages
    them per participant, computes Pearson correlations against the binned
    trait scores, and writes the top-correlated features per trait to a
    LaTeX table (Table 4 in SI).
    """
    # find the window size that was most frequently chosen
    hist_sum = np.zeros((len(conf.all_window_sizes)), dtype=int)
    for trait in xrange(0, conf.n_traits):
        for si in xrange(0, 100):
            filename = conf.get_result_filename(conf.annotation_all, trait,
                                                False, si, add_suffix=True)
            if os.path.exists(filename):
                data = np.load(filename)
                chosen_window_indices = data['chosen_window_indices']
                # bin edges at -0.5, 0.5, ... give one bin per window index
                hist, _ = np.histogram(chosen_window_indices, bins=np.arange(
                    -0.5, len(conf.all_window_sizes), 1))
                hist_sum += hist
            else:
                print 'did not find', filename
    ws = conf.all_window_sizes[np.argmax(hist_sum)]

    # load features for the most frequently chosen time window
    x_file, y_file, id_file = conf.get_merged_feature_files(ws)
    x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
    ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:, 0]
    y = np.genfromtxt(conf.binned_personality_file, skip_header=1,
                      usecols=xrange(1, conf.n_traits + 1), delimiter=',')
    # NOTE(review): y_ws is loaded but not used anywhere below in this
    # function -- possibly vestigial; verify before removing
    y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)

    # compute average feature per person
    avg_x_ws = np.zeros((conf.n_participants, conf.max_n_feat))
    for p in xrange(0, conf.n_participants):
        avg_x_ws[p, :] = np.mean(x_ws[ids_ws == p, :], axis=0)

    # Pearson correlation of each averaged feature with all traits; the last
    # row of the corrcoef matrix holds feature-vs-each-trait correlations,
    # and [:-1] drops the feature-vs-itself entry.
    feature_correlations_avg = []
    for fi in xrange(0, conf.max_n_feat):
        C_avg = np.corrcoef(y.transpose(), avg_x_ws[:, fi])[-1][:-1]
        feature_correlations_avg.append(C_avg)
    feature_correlations_avg = np.array(feature_correlations_avg)

    # collect the n highest-correlated features for each trait and write them
    # into a .tex table - see Table 4 in SI
    n = 15
    highest_correlated_features = []
    highest_correlated_features_lists = []
    highest_correlated_features_names = []
    for t in xrange(0, conf.n_traits):
        # indices of the n features with the highest correlation for trait t
        hcf = feature_correlations_avg[:, t].argsort()[-n:]
        locallist = []
        for f in hcf:
            if f not in highest_correlated_features:
                highest_correlated_features.append(f)
                highest_correlated_features_names.append(
                    gs.full_long_label_list[f].lower())
            # NOTE(review): indentation reconstructed -- assumed the per-trait
            # list records every top-n feature, not only first-seen ones; verify
            locallist.append(f)
        highest_correlated_features_lists.append(locallist)

    # order the union of features alphabetically by their lower-cased label
    features = zip(highest_correlated_features_names, highest_correlated_features)
    highest_correlated_features = [y for (x, y) in sorted(features)]
    #highest_correlated_features.sort()
    filename = conf.figure_folder + '/table4.tex'
    print len(highest_correlated_features)
    with open(filename, 'w') as f:
        f.write('feature&Neur.&Extr.&Open.&Agree.&Consc.&PCS&CEI')
        f.write('\\\\\n\hline\n')
        for fi in highest_correlated_features:
            f.write(gs.full_long_label_list[fi])
            for t in xrange(0, conf.n_traits):
                fc = feature_correlations_avg[fi, t]
                if math.isnan(fc):
                    f.write('&-')
                elif fi in highest_correlated_features_lists[t]:
                    # bold the cells that are among trait t's top-n features
                    f.write('&\\textbf{' + '%.2f}' % fc)
                else:
                    f.write('&' + '%.2f' % fc)
            f.write('\\\\\n')
    print
    print filename, 'written'
if not os.path.exists(result_filename): print 'computing data for', comp_title print 'Note taht this might take a while - if the script is run again, intermediate results will be available and speed up all computations.' predictions_I = np.zeros( (conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int) predictions_II = np.zeros( (conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int) for trait in xrange(0, conf.n_traits): for si in xrange(0, conf.max_n_iter): filenameI = conf.get_result_filename( annotation_value_I, trait, False, si, add_suffix=True) filenameII = conf.get_result_filename( annotation_value_II, trait, False, si, add_suffix=True) if os.path.exists(filenameI) and os.path.exists( filenameII): dataI = np.load(filenameI) detailed_predictions_I = dataI[ 'detailed_predictions'] chosen_window_indices_I = dataI[
truth = np.genfromtxt(conf.binned_personality_file, skip_header=1, usecols=(trait + 1, ), delimiter=',') for i in xrange(0, 100): rand_guess = np.random.randint(1, 4, conf.n_participants) f1 = f1_score(truth, rand_guess, average='macro') collection.append( [f1, conf.medium_traitlabels[trait], i, 'random guess']) # baseline 3: label permutation test # was computed using label_permutation_test.sh and written into results. ie. is just loaded here for si in xrange(0, m_iter): filename_rand = conf.get_result_filename(conf.annotation_all, trait, True, si, add_suffix=True) if os.path.exists(filename_rand): data = np.load(filename_rand) pr = data['predictions'] dt = truth[pr > 0] pr = pr[pr > 0] f1 = f1_score(dt, pr, average='macro') collection.append([ f1, conf.medium_traitlabels[trait], si, 'label permutation' ]) else: print 'did not find', filename_rand print 'consider (re-)running label_permutation_test.sh' sys.exit(1)