def one_classification_prediction(directory, log_file, user_id, class_counts, verbose):
    data_directory = bias_util.data_directory
    data_file_name = bias_util.data_file_name
    all_logs, attr_logs, item_logs, help_logs, cat_logs = bias_util.recreate_logs(directory, log_file)
    dataset, attr_map = bias_util.read_data(data_directory, data_file_name)
    classification, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(all_logs, dataset)

    X = []
    Y = []

    # decisions_labels entries are of the form
    # (index, user classification, actual classification, player id)
    prev = -1
    cur = -1
    for i in range(0, len(decisions_labels)):
        prev = cur
        cur = decisions_labels[i][1]
        if cur not in class_counts:
            class_counts[cur] = 1
        else:
            class_counts[cur] += 1
        # create a training instance from each consecutive pair of labels;
        # add 'and prev != cur' to the condition to disallow repetitions
        if prev != -1 and cur != -1:
            X.append([bias_util.pos_to_num_map[prev]])
            Y.append(bias_util.pos_to_num_map[cur])

    return X, Y, class_counts
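# Usage sketch (hypothetical file names): the (X, Y) pairs returned
# above are (previous label -> next label) transitions encoded as
# integers, so any discrete classifier can be fit on them. scikit-learn
# is shown as one option here, not something this module requires.
def _example_label_transition_model():
    from sklearn.naive_bayes import MultinomialNB
    class_counts = dict()
    X, Y, class_counts = one_classification_prediction(
        'logs/', 'interaction_log.json', 'user1', class_counts, False)
    model = MultinomialNB()
    model.fit(X, Y)  # learn P(next label | previous label)
    return model.predict(X)  # predicted next label for each transition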
def __init__(self, directory, in_file_name, out_file_name, data_directory, data_file_name):
    self.directory = directory
    self.in_file_name = in_file_name
    self.out_file_name = out_file_name
    self.data_directory = data_directory
    self.data_file_name = data_file_name
    self.dpc_logs = []
    self.dpd_logs = []
    self.ac_logs = []
    self.ad_logs = []
    self.awc_logs = []
    self.awd_logs = []
    self.dataset, self.attr_value_map = bias_util.read_data(data_directory, data_file_name)
    self.all_logs, self.attr_logs, self.item_logs, self.help_logs, self.cat_logs = bias_util.recreate_logs(directory, in_file_name)
def write_svm_results(directory, file_name, log_file, to_plot, fig_num, verbose):
    if verbose:
        print 'Writing and Plotting SVM Data: ', file_name
    data_directory = bias_util.data_directory
    data_file_name = bias_util.data_file_name
    all_logs, attr_logs, item_logs, help_logs, cat_logs = bias_util.recreate_logs(directory, log_file)
    logs = item_logs
    dataset, attr_map = bias_util.read_data(data_directory, data_file_name)
    classification, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(logs, dataset)
    all_data = dict()

    # build the feature matrix: one row per classified player, with
    # features in sorted order and 'Name' excluded
    x_data = []
    y_data = []
    data = []
    features = bias_util.get_bball_player(dataset, list(classification.keys())[0]).get_map().keys()
    features.remove('Name')
    features = sorted(features)
    for key in classification.keys():
        cur_player = bias_util.get_bball_player(dataset, key)
        cur_map = cur_player.get_map()
        cur_map['*Classification'] = classification[key]
        data.append(cur_map)
        cur_x = []
        for i in range(0, len(features)):
            cur_x.append(cur_map[features[i]])
        cur_x = [float(x) for x in cur_x]
        x_data.append(cur_x)
        y_data.append(bias_util.pos_to_num_map[classification[key]])

    # map each pairwise (one-vs-one) weight vector to its class pair
    svm_weights, svm_classes = get_svm_weights(x_data, y_data)
    weights_map = dict()
    i = 0
    for j in range(0, len(svm_classes)):
        for k in range(j + 1, len(svm_classes)):
            key = bias_util.num_to_pos_map[j] + ' - ' + bias_util.num_to_pos_map[k]
            weights_map[key] = svm_weights[i]
            i += 1

    all_data['features'] = features
    all_data['weights'] = weights_map
    all_data['classifications'] = data

    if not os.path.exists(directory):
        os.makedirs(directory)
    f_out = open(directory + file_name, 'w+')
    f_out.write('{')
    f_out.write('"features":' + json.dumps(all_data['features']) + ',')
    f_out.write('"weights":' + json.dumps(all_data['weights']) + ',')
    f_out.write('"classifications":' + json.dumps(all_data['classifications']))
    f_out.write('}')
    f_out.close()

    if to_plot:
        for key in weights_map.keys():
            plot_svm(features, weights_map[key], 'SVM Feature Weights: ' + key,
                     'Feature', 'Weight', directory.replace('/logs/', '/plots/'),
                     file_name.replace('.json', '.png').replace('svm', 'svm_' + key),
                     fig_num, verbose)
            fig_num += 1

    return svm_weights
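# get_svm_weights is defined elsewhere in this module. The pairwise
# (j, k) indexing in write_svm_results matches scikit-learn's
# one-vs-one linear SVC, whose coef_ holds one weight vector per class
# pair in (0-1, 0-2, ..., 1-2, ...) order, so a plausible sketch is:
def _example_get_svm_weights(x_data, y_data):
    from sklearn.svm import SVC
    clf = SVC(kernel='linear')
    clf.fit(x_data, y_data)
    # coef_ rows follow the same order as the nested j/k loop above
    return clf.coef_.tolist(), clf.classes_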
def write_classification_accuracy(directory, file_name, log_file, fig_num, verbose):
    if verbose:
        print 'Writing and Plotting Accuracy Over Time: ', file_name
    data_directory = bias_util.data_directory
    data_file_name = bias_util.data_file_name
    all_logs, attr_logs, item_logs, help_logs, cat_logs = bias_util.recreate_logs(directory, log_file)
    logs = item_logs
    dataset, attr_map = bias_util.read_data(data_directory, data_file_name)
    classification, decisions_labels, decisions_cat, decisions_help = bias_util.get_classifications_and_decisions(logs, dataset)

    total_labeled = 0
    total_correct = 0
    decision_points = np.arange(1, len(all_logs) + 1)
    accuracy = [-1] * len(all_logs)
    current_labels = dict()
    correct_labels = dict()

    # replay the decisions in order, maintaining running counts of how
    # many points are labeled and how many of those labels are correct
    for i in range(0, len(decisions_labels)):
        cur = decisions_labels[i]
        cur_id = cur[3]
        correct_labels[cur_id] = cur[2]

        # update the number of currently labeled points
        if ((cur_id not in current_labels and cur[1] != 'Un-Assign')
                or (cur_id in current_labels and current_labels[cur_id] == 'Un-Assign'
                    and cur[1] != 'Un-Assign')):
            total_labeled += 1
        elif (cur_id in current_labels and cur[1] == 'Un-Assign'
                and current_labels[cur_id] != 'Un-Assign'):
            total_labeled -= 1

        # update the number of correctly labeled points
        if cur_id not in current_labels and cur[1] == correct_labels[cur_id]:
            total_correct += 1
        elif (cur_id in current_labels and current_labels[cur_id] != correct_labels[cur_id]
                and cur[1] == correct_labels[cur_id]):
            total_correct += 1
        if (cur_id in current_labels and current_labels[cur_id] == correct_labels[cur_id]
                and cur[1] != correct_labels[cur_id]):
            total_correct -= 1

        if total_labeled != 0:
            accuracy[cur[0]] = total_correct / float(total_labeled)
        else:
            accuracy[cur[0]] = 0
        current_labels[cur_id] = cur[1]

    if len(decisions_labels) < 1:
        first_decision = -1
    else:
        first_decision = decisions_labels[0][0]
    accuracy = bias_util.remove_defaults(accuracy, first_decision)
    accuracy = bias_util.forward_fill(accuracy)

    if not os.path.exists(directory):
        os.makedirs(directory)
    f_out = open(directory + file_name, 'w+')
    f_out.write('[')
    for i in range(0, len(decisions_labels)):
        f_out.write('{')
        f_out.write('"interaction_number":"' + str(decisions_labels[i][0]) + '",')
        f_out.write('"data_point":"' + str(decisions_labels[i][3]) + '",')
        f_out.write('"actual_class":"' + str(decisions_labels[i][2]) + '",')
        f_out.write('"user_class":"' + str(decisions_labels[i][1]) + '",')
        f_out.write('"current_accuracy":"' + str(accuracy[decisions_labels[i][0]]) + '"')
        f_out.write('}')
        if i != len(decisions_labels) - 1:
            f_out.write(',')
    f_out.write(']')
    f_out.close()

    plot_classification_accuracy(decision_points, accuracy, 'Accuracy Over Time',
                                 'Interactions', 'Accuracy',
                                 directory.replace('/logs/', '/plots/'),
                                 file_name.replace('.json', '.png'),
                                 decisions_labels, fig_num, verbose)
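# bias_util.remove_defaults and bias_util.forward_fill are assumed
# from context: the accuracy list starts as -1 placeholders, so values
# before the first decision get blanked out and every later gap is
# filled with the last computed accuracy. A minimal sketch of that
# behavior (not necessarily bias_util's exact implementation):
def _example_forward_fill(accuracy):
    filled = list(accuracy)
    for i in range(1, len(filled)):
        if filled[i] == -1:
            filled[i] = filled[i - 1]  # carry the last known accuracy forward
    return filled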
def write_id_confusion_matrix(directory, file_name, log_file, fig_num, verbose):
    if verbose:
        print 'Writing and Plotting ID-Confusion Matrix Data: ', file_name
    data_directory = bias_util.data_directory
    data_file_name = bias_util.data_file_name
    all_logs, attr_logs, item_logs, help_logs, cat_logs = bias_util.recreate_logs(directory, log_file)
    logs = item_logs
    dataset, attr_map = bias_util.read_data(data_directory, data_file_name)
    id_confusion, pos_to_num_map, all_data = get_id_confusion_matrix(logs, dataset)

    if not os.path.exists(directory):
        os.makedirs(directory)
    f_out = open(directory + file_name, 'w+')
    summary = dict()
    summary['rows (y)'] = 'user'
    summary['cols (x)'] = 'actual'
    summary['position_indices'] = pos_to_num_map
    summary['centroids'] = dict()
    summary['centroids']['user_centroids'] = dict()
    summary['centroids']['actual_centroids'] = dict()

    # separate out user labels and actual labels; the keys of all_data
    # encode both labels (e.g. 'user <U>, actual <A>')
    user_labels = dict()
    actual_labels = dict()
    for key in all_data.keys():
        cur_user_label = key[5:key.index(',')]
        cur_actual_label = key[key.index('actual') + 7:]
        cur_data_point = copy.deepcopy(all_data[key])
        if cur_user_label in user_labels.keys():
            user_labels[cur_user_label] += cur_data_point
        else:
            user_labels[cur_user_label] = cur_data_point
        if cur_actual_label in actual_labels.keys():
            actual_labels[cur_actual_label] += cur_data_point
        else:
            actual_labels[cur_actual_label] = cur_data_point

    # compute centroids
    for key in user_labels.keys():
        summary['centroids']['user_centroids'][key] = compute_centroid(user_labels[key])
        #if (verbose):
        #    print 'User Centroid', key, summary['centroids']['user_centroids'][key]
    for key in actual_labels.keys():
        summary['centroids']['actual_centroids'][key] = compute_centroid(actual_labels[key])
        #if (verbose):
        #    print 'Actual Centroid', key, summary['centroids']['actual_centroids'][key]

    # get total accuracy: correct classifications lie on the diagonal
    num_correct = 0
    total_classifications = 0
    for i in range(0, len(id_confusion)):
        for j in range(0, len(id_confusion[i])):
            total_classifications += id_confusion[i][j]
            if i == j:
                num_correct += id_confusion[i][j]
    summary['total_accuracy'] = str(num_correct) + '/' + str(total_classifications)
    summary['matrix'] = id_confusion.tolist()

    f_out.write('{')
    f_out.write('"summary":' + json.dumps(summary) + ',')
    f_out.write('"all_data":' + json.dumps(all_data))
    f_out.write('}')
    f_out.close()

    # plot the matrix
    labels = [bias_util.num_to_pos_map[i] for i in range(0, 5)]
    plot_id_conf_matrix(id_confusion.tolist(), labels,
                        directory.replace('/logs/', '/plots/'),
                        file_name.replace('.json', '.png'), fig_num)
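# compute_centroid is defined elsewhere; from its use above it reduces
# a list of data points to a single mean point. A minimal sketch,
# assuming each point is a dict of numeric attributes (non-numeric
# fields such as 'Name' would need to be skipped in practice):
def _example_compute_centroid(points):
    centroid = dict()
    for point in points:
        for attr in point:
            centroid[attr] = centroid.get(attr, 0.0) + float(point[attr])
    for attr in centroid:
        centroid[attr] /= float(len(points))
    return centroid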