コード例 #1
0
def display_confmat_compare():
    """
    Compare the automatically estimated tasks with the manual annotations and
    display the result as a confusion matrix.

    In what follows,
    auto refers to automatic annotation (with the assoc rules and relating post-proc)
    hand refers to manual annotation

    Rows of the matrix are the manual (hand) labels and columns are the
    automatic (auto) labels. Two tables are printed through ``printTable``:
    first the row-normalised matrix, then the raw counts. The overall number
    and ratio of correctly labelled samples is printed before the tables.
    Finally the duration densities are displayed for the same data.

    The label set is the union of the labels seen in ``hand`` and ``auto``.
    Using only the hand labels would silently drop every sample whose
    automatic label never occurs in the manual annotation, which removes it
    from the accuracy denominator and over-states the reported accuracy.
    """
    data = ftools.readData(params.ANNOTATION_FILE)
    data = post_pro.addDurationFeature(data)  # Just putting the data into a 'good' shape
    titles = ftools.load(params.PATH_TITLE[6:] + params.NEW_DAT + params.TITLE_MAT)
    data[params.WINDOW_STR] = [ftools.joinTitles(t) for t in titles]

    auto = np.asarray(data[params.AUTO_TASK_STR]).ravel()
    hand = np.asarray(data[params.HAND_TASK_STR]).ravel()

    # Every label that occurs on either side gets its own row/column, so each
    # sample is counted in exactly one cell.
    unique_tasks = np.union1d(hand, auto)

    # One-hot membership matrices of shape (n_tasks, n_samples); their product
    # counts, for each (hand label, auto label) pair, the samples carrying both.
    hand_onehot = (hand[None, :] == unique_tasks[:, None]).astype(float)
    auto_onehot = (auto[None, :] == unique_tasks[:, None]).astype(float)
    conf_mat_s = np.dot(hand_onehot, auto_onehot.T)  # unscaled counts

    # Row-normalise; a label that only appears in `auto` has an all-zero row,
    # which is left at zero instead of turning into NaN through 0/0.
    row_totals = conf_mat_s.sum(axis=1, keepdims=True)
    conf_mat = np.divide(conf_mat_s, row_totals,
                         out=np.zeros_like(conf_mat_s),
                         where=row_totals > 0)

    corrects = np.trace(conf_mat_s)
    print('Correct ', corrects, corrects / np.sum(conf_mat_s))

    printTable(unique_tasks, conf_mat)
    printTable(unique_tasks, conf_mat_s)

    # Disabled in the original source:
    #   classifier_tools.kNN(data)
    #   classifier_tools.randomForest(data)
    #   classifier_tools.svm(data)

    displayDurationDensities(data)
コード例 #2
0
def example_q_quartiles():
    """
    Write the cumulative histogram of the durations to 'cdf_duration_dev.txt'.

    This function serves to display an example of computing Q-quantiles.
    Specifically, it is for reproducing the graph given in Appendix-I
    (Pre-processing of quantitative variables). The data points are saved
    into a file, which is later used in a gnuplot script.

    The histogram uses unit-wide bins whose edges run from 1 to the maximum
    duration, so values outside that range are not counted by
    ``np.histogram``. Each output line holds the left edge of a bin and the
    cumulative count up to and including that bin, separated by a tab.
    """
    data = ftools.readData(params.ANNOTATION_FILE)
    data = post_pro.addDurationFeature(data)  # Just putting the data into a 'good' shape
    # NOTE(review): the window titles are attached as in the sibling routines,
    # but nothing below reads them - confirm whether this load is still needed.
    titles = ftools.load(params.PATH_TITLE[6:] + params.NEW_DAT + params.TITLE_MAT)
    data[params.WINDOW_STR] = [ftools.joinTitles(t) for t in titles]
    durations = list(map(int, data[params.DURATION_STR]))

    # Bin edges 1, 2, ..., max(durations).
    bin_edges = np.arange(np.max(durations), step=1) + 1
    counts, edges = np.histogram(durations, bins=bin_edges)
    left_edges = edges[:-1]  # one left edge per bin
    cumulative = np.cumsum(counts)

    # The context manager closes the file even if a write fails.
    with open('cdf_duration_dev.txt', 'w') as out_file:
        out_file.writelines(
            str(edge) + '\t' + str(total) + '\n'
            for edge, total in zip(left_edges, cumulative)
        )
コード例 #3
0
                        params.TASK_MAT)
    keystrokes_quan = ftools.load(params.PATH_KSTROKES +
                                  params.DAT_FILE_PREFIX + params.KSTROKE_MAT)
    lunch = ftools.load(params.PATH_LUNCH + params.DAT_FILE_PREFIX +
                        params.LUNCH_MAT)
    l_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT +
                           params.LCLICK_MAT)
    r_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT +
                           params.RCLICK_MAT)
    duration = ftools.load(params.PATH_DURATION + params.NEW_DAT +
                           params.DURATION_MAT)

    (exe_names, window_names, time_names,
     level_of_assoc) = dtools.define_names()

    title_combinations = np.unique([ftools.joinTitles(t) for t in windows])
    title_codes = np.array([ftools.joinTitles(t) for t in windows])

    n_actions = len(exes)  #  or any other matrix
    """
    In est_by_rules_direct, I use use 1 column for line number,  4 columns for 
    estimations. But actually number of maximum estimations is no more than 3.
    """
    est_by_rules_direct = []

    for i in range(n_actions):
        """
        Estimation by applying the rules directly
        """
        (exe_only_rules, title_only_rules, exe_and_title_rules,
         exe_and_keyst_rules, exe_and_lunch_rules) = rtools.define_rules()
コード例 #4
0
        if params.STAGE == 1:
            """
            At stage-1, anything that is not Document or Test, needs to be renamed 
            as Others
            """
            for p in [params.HAND_TASK_STR, params.EST_1_STR, \
                      params.EST_2_STR, params.EST_3_STR]:

                data[p] = np.array(data[p])

                data[p][np.invert(np.logical_or(data[p] == params.TEST, \
                     data[p] == params.DOCUMENT))] = params.OTHER
        else:
            """
            At stage-2, we filter out the rows that are labeled Document or Test
            and consider only the others
            """
            boolean_matrix = (data[params.HAND_TASK_STR] == np.array(
                params.TASKS)[:, None])
            query_array = boolean_matrix.any(axis=0)  # Logical or between rows
            for k in data.keys():
                data[k] = np.array(data[k])[query_array]

    titles = ftools.load(params.PATH_TITLE + params.DAT_FILE_PREFIX +
                         params.TITLE_MAT)
    data[params.WINDOW_STR] = [ftools.joinTitles(t) for t in titles]

    ctools.kNN(data, multi=False)
    ctools.randomForest(data, multi=False)
    ctools.svm(data, multi=False)
コード例 #5
0
    l_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT +
                           params.LCLICK_MAT)
    r_clicks = ftools.load(params.PATH_CLICKS + params.NEW_DAT +
                           params.RCLICK_MAT)
    duration = ftools.load(params.PATH_DURATION + params.NEW_DAT +
                           params.DURATION_MAT)

    exe_names, window_titles = dtools.define_names()[:2]
    window_titles.insert(0, '')  # for alien titles, i.e. not a known title
    """
    One hot encoding for exes and window titles
    """
    exe_codes = [exe_names.index(e) + 1
                 for e in exes]  # + 1 because I don't want exes with ID code 0

    title_combination = np.unique([ftools.joinTitles(t)
                                   for t in titles]).tolist()
    title_codes = [
        title_combination.index(ftools.joinTitles(t)) for t in titles
    ]
    descriptors = [exe_codes, title_codes, keystrokes_quan, \
                   l_clicks, r_clicks, duration] # eventually we omit lunch time info

    descriptors_names = ['exes', 'titles', 'keystrokes', 'Left clicks', \
                         'Right clicks', 'Duration'] # Only for printing purposes
    jointProbs = []
    for i in range(len(descriptors) - 1):
        for j in range(i + 1, len(descriptors)):
            desc1 = descriptors[i]
            desc2 = descriptors[j]
            q1 = etools.compute_histogram(desc1)