def display_confmat_compare():
    """Compare automatically estimated tasks against manual annotations.

    "auto" refers to the automatic annotation (association rules plus the
    related post-processing); "hand" refers to the manual annotation.
    Builds a confusion matrix (rows = hand label, columns = auto label),
    prints the overall accuracy, the row-normalised matrix and the raw
    count matrix, then displays the duration densities.
    """
    data = ftools.readData(params.ANNOTATION_FILE)
    # Just putting the data into a 'good' shape
    data = post_pro.addDurationFeature(data)
    titles = ftools.load(params.PATH_TITLE[6:] + params.NEW_DAT + params.TITLE_MAT)
    data[params.WINDOW_STR] = [ftools.joinTitles(t) for t in titles]

    auto = np.array(data[params.AUTO_TASK_STR])
    hand = np.array(data[params.HAND_TASK_STR])

    # NOTE(review): labels are taken from the manual annotation only, so any
    # auto-only label is silently dropped from the matrix — confirm intended.
    unique_tasks = np.unique(hand)
    n_tasks = len(unique_tasks)
    conf_mat = np.zeros((n_tasks, n_tasks))
    for i, task1 in enumerate(unique_tasks):
        for j, task2 in enumerate(unique_tasks):
            # Row j = hand label, column i = auto label.
            conf_mat[j, i] = np.nansum(np.logical_and(auto == task1,
                                                      hand == task2))

    # Explicit copy: the original relied on `conf_mat` being *rebound* (not
    # mutated) below, which only preserved the counts by accident.
    conf_mat_s = conf_mat.copy()  # save unscaled (count) matrix
    # Suppress 0/0 warnings for labels that never occur in `hand`;
    # such rows become NaN, exactly as before.
    with np.errstate(invalid='ignore', divide='ignore'):
        conf_mat = conf_mat / np.nansum(conf_mat, axis=1)[:, None]

    corrects = np.trace(conf_mat_s)  # correctly classified = diagonal sum
    print('Correct ', corrects, corrects / np.sum(conf_mat_s))
    printTable(unique_tasks, conf_mat)
    printTable(unique_tasks, conf_mat_s)
    # Dead commented-out classifier calls (kNN/randomForest/svm inside a bare
    # string literal) removed.
    displayDurationDensities(data)
def example_q_quartiles():
    """Write the empirical CDF of the duration feature to a text file.

    Serves to display an example of computing Q-quartiles; reproduces the
    graph given in Appendix-I (pre-processing of quantitative variables).
    The (value, cumulative-count) pairs are written tab-separated to
    'cdf_duration_dev.txt', which is later used by a gnuplot script.
    """
    data = ftools.readData(params.ANNOTATION_FILE)
    # Just putting the data into a 'good' shape
    data = post_pro.addDurationFeature(data)
    titles = ftools.load(params.PATH_TITLE[6:] + params.NEW_DAT + params.TITLE_MAT)
    data[params.WINDOW_STR] = [ftools.joinTitles(t) for t in titles]

    durations = list(map(int, data[params.DURATION_STR]))
    # Unit-width bins shifted by one, matching the original binning.
    y, x = np.histogram(durations,
                        bins=np.arange(np.max(durations), step=1) + 1)
    x = x[:-1]           # drop the rightmost bin edge
    y = np.cumsum(y)     # histogram counts -> unnormalised CDF

    # Context manager closes the file even if a write fails
    # (the original leaked the handle on error).
    with open('cdf_duration_dev.txt', 'w') as f:
        for xi, yi in zip(x, y):
            f.write(str(xi) + '\t' + str(yi) + '\n')
params.TASK_MAT) keystrokes_quan = ftools.load(params.PATH_KSTROKES + params.DAT_FILE_PREFIX + params.KSTROKE_MAT) lunch = ftools.load(params.PATH_LUNCH + params.DAT_FILE_PREFIX + params.LUNCH_MAT) l_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT + params.LCLICK_MAT) r_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT + params.RCLICK_MAT) duration = ftools.load(params.PATH_DURATION + params.NEW_DAT + params.DURATION_MAT) (exe_names, window_names, time_names, level_of_assoc) = dtools.define_names() title_combinations = np.unique([ftools.joinTitles(t) for t in windows]) title_codes = np.array([ftools.joinTitles(t) for t in windows]) n_actions = len(exes) # or any other matrix """ In est_by_rules_direct, I use use 1 column for line number, 4 columns for estimations. But actually number of maximum estimations is no more than 3. """ est_by_rules_direct = [] for i in range(n_actions): """ Estimation by applying the rules directly """ (exe_only_rules, title_only_rules, exe_and_title_rules, exe_and_keyst_rules, exe_and_lunch_rules) = rtools.define_rules()
# NOTE(review): fragment of a larger function — `data` is defined before this
# chunk and the enclosing `def` is not visible here.
if params.STAGE == 1:
    """ At stage-1, anything that is not Document or Test,
        needs to be renamed as Others """
    # Collapse every label other than TEST/DOCUMENT into OTHER, for the
    # manual label column and all three estimation columns.
    for p in [params.HAND_TASK_STR, params.EST_1_STR,
              params.EST_2_STR, params.EST_3_STR]:
        data[p] = np.array(data[p])
        data[p][np.invert(np.logical_or(data[p] == params.TEST,
                                        data[p] == params.DOCUMENT))] = params.OTHER
else:
    """ At stage-2, we filter out the rows that are labeled
        Document or Test and consider only the others """
    # Broadcast comparison: one row per task in params.TASKS,
    # one column per sample.
    boolean_matrix = (data[params.HAND_TASK_STR] == np.array(
        params.TASKS)[:, None])
    query_array = boolean_matrix.any(axis=0)  # Logical or between rows
    # Keep only samples whose manual label is in params.TASKS.
    for k in data.keys():
        data[k] = np.array(data[k])[query_array]
titles = ftools.load(params.PATH_TITLE + params.DAT_FILE_PREFIX +
                     params.TITLE_MAT)
data[params.WINDOW_STR] = [ftools.joinTitles(t) for t in titles]
# Run the three classifiers in binary (multi=False) mode.
ctools.kNN(data, multi=False)
ctools.randomForest(data, multi=False)
ctools.svm(data, multi=False)
# NOTE(review): fragment of a larger function — `exes`, `titles`,
# `keystrokes_quan` (and presumably `lunch`) are loaded before this chunk,
# and the i/j loop body continues after it.
l_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT + params.LCLICK_MAT)
r_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT + params.RCLICK_MAT)
duration = ftools.load(params.PATH_DURATION + params.NEW_DAT + params.DURATION_MAT)
exe_names, window_titles = dtools.define_names()[:2]
window_titles.insert(0, '')  # for alien titles, i.e. not a known title
""" One hot encoding for exes and window titles """
# + 1 because I don't want exes with ID code 0
exe_codes = [exe_names.index(e) + 1 for e in exes]
# Map each window-title combination to the index of its unique value.
title_combination = np.unique([ftools.joinTitles(t) for t in titles]).tolist()
title_codes = [
    title_combination.index(ftools.joinTitles(t)) for t in titles
]
descriptors = [exe_codes, title_codes, keystrokes_quan,
               l_clicks, r_clicks, duration]  # eventually we omit lunch time info
descriptors_names = ['exes', 'titles', 'keystrokes', 'Left clicks',
                     'Right clicks', 'Duration']  # Only for printing purposes
jointProbs = []
# Iterate over all unordered descriptor pairs (i < j).
for i in range(len(descriptors) - 1):
    for j in range(i + 1, len(descriptors)):
        desc1 = descriptors[i]
        desc2 = descriptors[j]
        q1 = etools.compute_histogram(desc1)
        # (loop body continues beyond this chunk)