Ejemplo n.º 1
0
def _export_example_session(cmd_data, min_commands, out_path):
    """Pick one session with more than ``min_commands`` commands and dump it.

    The candidate sessions are sorted by command count and the element at
    index 1 is taken (so an ``IndexError`` is raised when fewer than two
    sessions qualify).  The session's ``file_path`` is printed and its commands
    are written to ``out_path``:

    * when ``has_timestamp`` is truthy, one ``<t0>-<t1>: <command>`` line per
      command, where ``t0``/``t1`` are the first two elements of the matching
      ``timestamp_list`` entry;
    * otherwise the raw commands joined by newlines (no trailing newline).
    """
    candidates = cmd_data.filter_by(lambda x: len(x.cmd_list) > min_commands)
    # NOTE(review): index 1, not 0 -- presumably the second-shortest session
    # if sort_by sorts ascending; confirm this is intended.
    example = candidates.sort_by(lambda x: len(x.cmd_list))[1]
    print(example.file_path)
    with open(out_path, 'w') as f_out:
        if example.has_timestamp:
            for stamp, cmd in zip(example.timestamp_list, example.cmd_list):
                f_out.write("{}-{}: {}\n".format(stamp[0], stamp[1], cmd))
        else:
            f_out.write("\n".join(example.cmd_list))


def cmd_basics(cmd_data):
    """Produce basic statistics and example dumps for the command sessions.

    Side effects:

    * prints the number of sessions holding more than 5 commands;
    * saves a 50-bin histogram of the command count of those sessions to
      ``histogram_cmd_per_student.png``;
    * saves a 50-bin histogram of the number of such sessions per
      ``user_name`` to ``histogram_sessions_per_student.png``;
    * writes one example session with more than 100 commands to
      ``example_2.txt`` and one with more than 200 commands to
      ``example_3.txt``.

    ``cmd_data`` must offer ``map``/``filter_by``/``group_by``/``sort_by`` and
    contain sessions exposing ``cmd_list``, ``user_name``, ``file_path``,
    ``has_timestamp`` and ``timestamp_list``.
    """
    cmds_per_session = cmd_data.map(lambda x: len(x.cmd_list))
    cmds_per_session = cmds_per_session.filter_by(lambda x: x > 5)
    print("Total number of sessions (with threshold 5 commands): {}".format(cmds_per_session.count()))

    fig, ax = report_tools.prepare_plot()
    ax.hist(cmds_per_session, 50)
    plt.savefig('histogram_cmd_per_student.png')

    long_sessions = cmd_data.filter_by(lambda x: len(x.cmd_list) > 5)
    sessions_per_user = long_sessions.group_by(lambda x: x.user_name).map(lambda x: len(x[1]))
    fig, ax = report_tools.prepare_plot()
    ax.hist(sessions_per_user, 50)
    plt.savefig('histogram_sessions_per_student.png')

    # Same export twice, only the size threshold and the target file differ.
    _export_example_session(cmd_data, 100, 'example_2.txt')
    _export_example_session(cmd_data, 200, 'example_3.txt')
Ejemplo n.º 2
0
 def _pca(self, training_data):
     """Scatter-plot the training features reduced to two PCA components.

     Every item of ``training_data`` is converted with
     ``self._convert_feature``; the resulting feature set is projected with
     ``PCA(n_components=2)`` and the two coordinates of each projected sample
     are drawn as a scatter plot saved to
     ``scatter_feature_distribution.png``.
     """
     feature_set = [self._convert_feature(item) for item in training_data]
     projected = PCA(n_components=2).fit_transform(feature_set)
     fig, ax = report_tools.prepare_plot()
     ax.scatter([point[0] for point in projected], [point[1] for point in projected])
     plt.title('Distribution of the feature set')
     plt.savefig('scatter_feature_distribution.png')
Ejemplo n.º 3
0
def editor_insertion_behavior(filtered_editor_log, code_template, user_info):
    """Report on the text inserted through the editor.

    Prints the number of ``insert`` operations, saves a 50-bin histogram of the
    length of the first inserted line (lengths below 100 only) to
    ``hist_length_inserted.png`` and finally prints how many ``insert``
    operations carry more than one line.

    ``code_template`` and ``user_info`` are accepted but not used here.
    """
    def _all_operations(sessions):
        # Flatten every session's command list into one sequence.
        return sessions.flatmap(lambda session: session.cmd_list)

    sessions = filtered_editor_log.map(
        lambda log: log.filter_editor_log(['insert', 'remove', 'paste', 'copy', 'save', 'open']))
    sessions = sessions.map(lambda log: log.combine_editor_input())

    inserts = _all_operations(sessions).filter_by(lambda op: op['action'] == u'insert')
    first_lines = inserts.map(lambda op: op['lines'][0])
    print(len(first_lines))

    line_lengths = first_lines.map(lambda line: len(line))
    fig, ax = report_tools.prepare_plot()
    short_lengths = line_lengths.filter_by(lambda length: length < 100)
    ax.hist(short_lengths, 50)
    plt.title('Histogram on length of inserted contents')
    plt.savefig('hist_length_inserted.png')

    multi_line_inserts = _all_operations(sessions).filter_by(
        lambda op: op['action'] == u'insert' and len(op['lines']) > 1)
    print(multi_line_inserts.count())
Ejemplo n.º 4
0
    def student_anchors(self):
        """Plot per-student anchor statistics from ``self.data``.

        For every ``user_name`` the contents returned by ``self._get_content``
        are collected over all of that user's sessions (sessions are sorted by
        ``timestamp`` first), reduced to the entries accepted by
        ``self._is_anchor`` and mapped through ``self.predict``.  Three figures
        are saved:

        * ``hist_number_anchors.png`` -- histogram of the number of distinct
          predictions per student;
        * ``first_anchor.png`` -- bar chart counting, per prediction value, the
          students whose first prediction it is;
        * ``last_anchor.png`` -- same for the last prediction.

        Students without any anchor are left out of the two bar charts.
        """
        def _unwrap_contents(item_list):
            # Concatenate the content lists (element 1) of all grouped items.
            result = []
            for item in item_list:
                result.extend(item[1])
            return result

        def _plot_anchor_distribution(anchors, title, out_path):
            # Bar chart of how often each value occurs in ``anchors``.
            counts = data_reader.SList(anchors)
            counts = counts.group_by(lambda x: x).map(lambda x: (x[0], len(x[1])))
            plot_x = [item[0] for item in counts]
            plot_y = [item[1] for item in counts]
            fig, ax = report_tools.prepare_plot()
            ind = np.arange(len(plot_x))
            width = 0.5
            ax.bar(ind, plot_y)
            ax.set_xticks(ind+width)
            ax.set_xticklabels(plot_x)
            plt.title(title)
            plt.savefig(out_path)

        tmp_data = self.data.sort_by(lambda x: x.timestamp).map(
            lambda x: (x.user_name, self._get_content(x.cmd_list)))
        tmp_data = tmp_data.group_by(lambda x: x[0]).map(
            lambda x: (x[0], _unwrap_contents(x[1])))
        tmp_data = tmp_data.map(
            lambda x: (x[0], [y for y in x[1] if self._is_anchor(y)]))
        tmp_data = tmp_data.map(
            lambda x: (x[0], [self.predict(item) for item in x[1]]))

        distinct_per_student = tmp_data.map(lambda x: len(set(x[1])))
        fig, ax = report_tools.prepare_plot()
        ax.hist(distinct_per_student)
        plt.title('Histogram on number of anchors detected per student')
        plt.savefig('hist_number_anchors.png')

        # Indexing [0] / [-1] below needs at least one prediction per student.
        with_anchors = tmp_data.filter_by(lambda x: len(x[1]) > 0)
        _plot_anchor_distribution(
            [item[1][0] for item in with_anchors],
            'Distribution of first appeared anchor',
            'first_anchor.png')
        _plot_anchor_distribution(
            [item[1][-1] for item in with_anchors],
            'Distribution of last appeared anchor',
            'last_anchor.png')
Ejemplo n.º 5
0
def editor_basics(filtered_editor_log):
    """Plot operations per editor session and print one sample per action.

    The logs are first restricted to the ``insert``, ``paste``, ``remove`` and
    ``copy`` actions.  A 50-bin histogram of the number of operations per
    session is saved to ``histogram_editor_operations_per_session.png``; then,
    for each of the four actions in that order, the first operation of that
    kind is printed (indexing raises if no such operation exists).
    """
    tracked_actions = ['insert', 'paste', 'remove', 'copy']
    sessions = filtered_editor_log.map(lambda log: log.filter_editor_log(tracked_actions))

    ops_per_session = sessions.map(lambda log: len(log._operation_list))
    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    ax.hist(ops_per_session, 50)
    plt.savefig('histogram_editor_operations_per_session.png')

    def _first_operation(action):
        # First operation, over all sessions, whose action matches ``action``.
        operations = sessions.flatmap(lambda log: log.get_operation_list())
        return operations.filter_by(lambda op: op['action'] == action)[0]

    for action in tracked_actions:
        print(_first_operation(action))
Ejemplo n.º 6
0
def cmd_counting(cmd_list):
    """Count how often each command head occurs and plot the distribution.

    ``cmd_list`` is, despite its name, a collection of sessions: each element
    exposes its own ``cmd_list`` attribute.  The first element of every command
    is used as the counting key.  Keys occurring more than 5 and fewer than
    1000 times are kept; the function prints the kept/total number of distinct
    keys, prints the 30 most frequent kept ``(key, count)`` pairs and saves a
    50-bin histogram of the kept counts to ``histogram_cmd_counter.png``.
    """
    heads = cmd_list.flatmap(lambda x: x.cmd_list).map(lambda x: x[0])
    counts = heads.group_by(lambda x: x).map(lambda x: (x[0], len(x[1])))
    pre_counter = counts.count()
    counts = counts.filter_by(lambda x: 5 < x[1] < 1000)
    post_counter = counts.count()
    print("Number of filtered commands/ Total number of commands: {}/{}".format(post_counter, pre_counter))
    # Negated key: highest count first.
    counts = counts.sort_by(lambda x: -x[1])
    print(counts[:30])
    plot_y = [item[1] for item in counts]
    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    ax.hist(plot_y, 50)
    plt.savefig('histogram_cmd_counter.png')
Ejemplo n.º 7
0
def user_insertion_length(filtered_editor_log, filtered_shell_log, code_template, user_info):
    """Plot how much text each user entered through the editor.

    For every user the inserted first lines and the pasted texts (after
    ``code_template.strip_template``) of all sessions are merged into a single
    string; users with an empty result are dropped.  Two figures are saved:

    * ``hist_user_input.png`` -- 50-bin histogram of the merged text length;
    * ``hist_user_input_comparison.png`` -- the same lengths split into
      ``Course_A`` and ``Course_B`` according to ``user_info`` (users missing
      from ``user_info`` or mapped to another value are left out).

    ``filtered_shell_log`` is accepted but not used here.
    """
    def _get_content(item_list):
        # Text contributed by each paste / insert operation, in order.
        texts = []
        for op in item_list:
            action = op['action']
            if action == 'insert':
                texts.append(op['lines'][0])
            elif action == 'paste':
                texts.append(code_template.strip_template(op['text']))
        return texts

    def _merge_content(item_list):
        # Lines of one session are joined by newlines; sessions are
        # concatenated without any separator in between.
        return ''.join(u"\n".join(entry[1]) for entry in item_list)

    sessions = filtered_editor_log.map(
        lambda log: log.filter_editor_log(['insert', 'remove', 'paste', 'copy', 'save', 'open']))
    sessions = sessions.map(lambda log: log.combine_editor_input())

    per_session = sessions.map(lambda s: (s.user_name, _get_content(s.cmd_list)))
    per_user = per_session.group_by(lambda entry: entry[0])
    per_user = per_user.map(lambda group: (group[0], _merge_content(group[1])))
    per_user = per_user.filter_by(lambda entry: len(entry[1]) > 0)

    fig, ax = report_tools.prepare_plot()
    ax.hist(per_user.map(lambda entry: len(entry[1])), 50)
    plt.title('Histogram on user input length')
    plt.savefig('hist_user_input.png')

    course_a_lengths = []
    course_b_lengths = []
    for entry in per_user.filter_by(lambda e: e[0] in user_info):
        course = user_info[entry[0]]
        if course == 'Course_A':
            course_a_lengths.append(len(entry[1]))
        elif course == 'Course_B':
            course_b_lengths.append(len(entry[1]))
    fig, ax = report_tools.prepare_plot()
    ax.hist([course_a_lengths, course_b_lengths], 50, label=['Course_A', 'Course_B'])
    plt.title('Histogram on user input length')
    plt.legend()
    plt.savefig('hist_user_input_comparison.png')
Ejemplo n.º 8
0
def overall_frequency(filtered_shell_log):
    """Plot how often each input code occurs over the whole shell log.

    The ``(key, count)`` pairs returned by ``_generate_counter_list`` are
    sorted by the integer value of the key, printed sorted by count, and then
    drawn as a bar chart saved to ``overall_frequency.png``.
    """
    counters = _generate_counter_list(filtered_shell_log).sort_by(lambda entry: int(entry[0]))
    print(counters.sort_by(lambda entry: entry[1]))

    fig, ax = report_tools.prepare_plot(figsize=(20, 5), gridWidth=0.5)
    keys = []
    totals = []
    for entry in counters:
        keys.append(entry[0])
        totals.append(entry[1])
    bar_width = 0.5
    positions = np.arange(len(keys))
    ax.bar(positions, totals, bar_width)
    ax.set_xticks(positions + bar_width)
    ax.set_xticklabels(keys, rotation=70)
    plt.xlabel('Input ASCII')
    plt.ylabel('Total number of actions')
    plt.title('Frequency distribution of user inputs.')
    plt.savefig('overall_frequency.png')
Ejemplo n.º 9
0
def student_frequency(filtered_shell_log):
    """Plot the total number of shell actions per student.

    The log is grouped by ``user_name``; for each student the counts (element
    1 of every pair returned by ``_generate_counter_list``) are summed and a
    50-bin histogram of those totals is saved to
    ``histogram_student_total_actions.png``.
    """
    per_student = filtered_shell_log.group_by(lambda entry: entry.user_name)
    per_student = per_student.map(lambda group: (group[0], _generate_counter_list(group[1])))
    totals = per_student.map(
        lambda group: (group[0], sum(counter[1] for counter in group[1])))
    actions_per_student = [total[1] for total in totals]

    fig, ax = report_tools.prepare_plot(gridWidth = 0.5)
    ax.hist(actions_per_student, 50)
    plt.xlabel('Total number of actions')
    plt.ylabel('Number of students')
    plt.title('Histogram on number of actions per student')
    plt.savefig('histogram_student_total_actions.png')
Ejemplo n.º 10
0
 def _output_training_result(self, init_training_data):
     """Dump every predicted cluster to a file and plot the large ones.

     Each item of ``init_training_data`` is labelled with ``self.predict``.
     For every distinct label a UTF-8 file ``clustering_<label>.txt`` is
     written, holding the cluster size followed by one member per line.
     Clusters with more than 100 members are additionally shown in a bar
     chart of cluster sizes saved to ``cluster_size.png``.
     """
     from collections import defaultdict

     labels = [self.predict(item) for item in init_training_data]
     # Group once instead of re-scanning all results for every label.
     members_by_label = defaultdict(list)
     for label, item in zip(labels, init_training_data):
         members_by_label[label].append(item)

     size_list = []
     cluster_list = []
     for label in set(labels):
         members = members_by_label[label]
         if len(members) > 100:
             size_list.append(len(members))
             cluster_list.append(label)
         # Every cluster is dumped, not only the ones that get plotted.
         with codecs.open("clustering_{}.txt".format(label), 'w', 'utf-8') as f_out:
             f_out.write(u"Size of cluster: {}\n".format(len(members)))
             for member in members:
                 f_out.write(u"{}\n".format(member))
     fig, ax = report_tools.prepare_plot(figsize=(20, 5))
     ind = np.arange(len(size_list))
     width = 0.5
     ax.bar(ind, size_list, width)
     ax.set_xticks(ind+width)
     # Numeric rotation: the string '90' is rejected by current matplotlib.
     ax.set_xticklabels(['C{}'.format(i) for i in cluster_list], rotation=90)
     plt.title('Cluster size')
     plt.savefig('cluster_size.png')
Ejemplo n.º 11
0
def student_freq_clustering(filtered_shell_log, user_info):
    """Scatter-plot students by the first two PCA components of their input.

    Only students present in ``user_info`` are considered.  For each of them
    the counter list is converted with ``_convert_freq`` and
    ``_generate_feature_vector``; the feature vectors are reduced with
    ``PCA(n_components=2)`` and drawn to ``pca_result.png``.  Points are
    coloured blue for ``Course_A``, yellow for ``Course_B`` and red otherwise.
    """
    def _course_color(course):
        if course == 'Course_A':
            return 'b'
        if course == 'Course_B':
            return 'y'
        return 'r'

    per_student = filtered_shell_log.group_by(lambda entry: entry.user_name)
    per_student = per_student.filter_by(lambda group: group[0] in user_info)
    per_student = per_student.map(lambda group: (group[0], _generate_counter_list(group[1])))

    frequencies = per_student.map(lambda entry: (entry[0], _convert_freq(entry[1])))
    features = frequencies.map(lambda entry: _generate_feature_vector(entry[1]))

    reducer = PCA(n_components=2)
    reducer.fit(features)
    projected = reducer.transform(features)

    # One colour per student, in the same order as the feature vectors.
    colors = [_course_color(user_info[entry[0]]) for entry in per_student]
    xs = [point[0] for point in projected]
    ys = [point[1] for point in projected]

    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    plt.scatter(xs, ys, c=colors)
    plt.savefig('pca_result.png')
Ejemplo n.º 12
0
def editor_input_clustering(filtered_editor_log, code_template, user_info, ankors):
    """Cluster the lines typed or pasted into the editor with KMeans.

    The input lines are the first line of every ``insert`` operation plus the
    newline-split text of every ``paste`` operation after
    ``code_template.strip_template``.  Only lines with more than 5 ASCII
    characters other than newline, tab, carriage return and space are kept.
    They are turned into features by ``_generate_feature_set`` (together with
    ``ankors.splitter``) and clustered with ``KMeans(n_clusters=300)``.

    Side effects:

    * prints the number of kept lines, the number of feature rows and the
      number of distinct labels;
    * for each cluster with more than 100 members writes
      ``clustering_<label>.txt`` (UTF-8) and includes it in the bar chart
      saved to ``cluster_size.png``;
    * writes the predicted label of every anchor to ``ankor_label.txt``;
    * writes ``model.get_params()`` as JSON to ``model.json``.

    ``user_info`` is accepted but not used here.
    """
    from collections import defaultdict

    ignored_chars = (u"\n", u"\t", u"\r", u" ")

    def _is_ascii_char(c):
        # ``c`` is a single character taken from a line of input.
        return ord(c) < 128

    def _significant_length(text):
        # Number of ASCII characters that are not plain whitespace.
        return sum(1 for c in text if c not in ignored_chars and _is_ascii_char(c))

    editor_cmd_data = filtered_editor_log.map(lambda x: x.filter_editor_log(['insert', 'remove', 'paste', 'copy', 'save', 'open'])).map(lambda x: x.combine_editor_input())
    insert_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'insert').map(lambda x: x['lines'][0])
    template_filtered_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'paste').map(lambda x: x['text'])
    template_filtered_data = template_filtered_data.map(lambda x: code_template.strip_template(x))
    total_input = data_reader.SList(insert_data + template_filtered_data.flatmap(lambda x: x.split(u"\n")))
    total_input = total_input.filter_by(lambda x: _significant_length(x) > 5)
    print(len(total_input))
    feature_set, ankor_set = _generate_feature_set(total_input, ankors.splitter)
    print(len(feature_set))

    model = KMeans(n_clusters=300)
    labels = model.fit_predict(feature_set)
    # Group once instead of re-scanning all results for every label.
    members_by_label = defaultdict(list)
    for label, text in zip(labels, total_input):
        members_by_label[label].append(text)

    distinct_labels = set(labels)
    print(len(distinct_labels))
    size_list = []
    cluster_list = []
    for label in distinct_labels:
        members = members_by_label[label]
        if len(members) > 100:
            size_list.append(len(members))
            cluster_list.append(label)
            with codecs.open("clustering_{}.txt".format(label), 'w', 'utf-8') as f_out:
                f_out.write(u"Size of cluster: {}\n".format(len(members)))
                for member in members:
                    f_out.write(u"{}\n".format(member))
    fig, ax = report_tools.prepare_plot(figsize=(20, 5))
    ind = np.arange(len(size_list))
    width = 0.5
    ax.bar(ind, size_list, width)
    ax.set_xticks(ind+width)
    # Numeric rotation: the string '90' is rejected by current matplotlib.
    ax.set_xticklabels(['C{}'.format(i) for i in cluster_list], rotation=90)
    plt.title('Cluster size')
    plt.savefig('cluster_size.png')

    ankor_label = model.predict(ankor_set)
    with open('ankor_label.txt', 'w') as f_out:
        for item in zip(ankors.splitter, ankor_label):
            f_out.write("{}\n{}\n\n".format(item[0], item[1]))

    with open('model.json', 'w') as f_out:
        json.dump(model.get_params(), f_out)
Ejemplo n.º 13
0
def editor_behavior_analysis(filtered_editor_log, code_template, user_info):
    """Analyse pasted content and editor sessions of every student.

    Outputs, in this order:

    * ``hist_filtered_pasted_content.png`` -- 50-bin histogram of the length
      of each pasted text after ``code_template.strip_template`` (lengths
      below 1000 only);
    * ``middle_filtered_paste.txt`` / ``long_filtered_paste.txt`` /
      ``short_filtered_paste.txt`` -- stripped pastes whose length is in
      (1000, 3000), above 4000 and below 1000 respectively;
    * ``hist_editor_session.png`` -- histogram of the number of editor
      sessions per student, a session being a run of operations closed by a
      ``save`` or ``open``;
    * ``pasted_content.txt`` -- each distinct per-student concatenation of
      pasted texts with the number of students sharing it, most frequent
      first;
    * ``hist_pasted_content.png`` -- histogram of the length of that
      per-student concatenation.

    ``user_info`` is accepted but not used here.
    """
    def get_operation_history(session_history):
        # All operations of the given sessions, concatenated in order.
        operations = []
        for session in session_history:
            operations.extend(session.cmd_list)
        return operations

    def split_history(cmd_history, op_list):
        # Cut the history after every operation whose action is in op_list.
        # NOTE(review): operations following the last such action are never
        # appended and are therefore dropped -- confirm this is intended.
        chunks = []
        current = []
        for op in cmd_history:
            current.append(op)
            if op['action'] in op_list:
                chunks.append(current)
                current = []
        return [chunk for chunk in chunks if len(chunk) > 0]

    sessions = filtered_editor_log.map(
        lambda log: log.filter_editor_log([u'insert', u'remove', u'paste', u'copy', u'save', u'open']))
    sessions = sessions.map(lambda log: log.combine_editor_input())

    pasted_texts = sessions.flatmap(lambda s: s.cmd_list)
    pasted_texts = pasted_texts.filter_by(lambda op: op['action'] == u'paste')
    pasted_texts = pasted_texts.map(lambda op: op['text'])
    stripped_pastes = pasted_texts.map(lambda text: code_template.strip_template(text))

    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    paste_lengths = stripped_pastes.map(lambda text: len(text))
    ax.hist(paste_lengths.filter_by(lambda length: length < 1000), 50)
    plt.title('Histogram on filtered pasted content length')
    plt.savefig('hist_filtered_pasted_content.png')

    paste_separator = "\n***************************\n"
    paste_dumps = (
        ('middle_filtered_paste.txt', lambda text: len(text) > 1000 and len(text) < 3000),
        ('long_filtered_paste.txt', lambda text: len(text) > 4000),
        ('short_filtered_paste.txt', lambda text: len(text) < 1000),
    )
    for file_name, wanted in paste_dumps:
        with codecs.open(file_name, 'w', 'utf-8') as f_out:
            f_out.write(paste_separator.join(stripped_pastes.filter_by(wanted)))

    student_history = sessions.group_by(lambda s: s.user_name)
    student_history = student_history.map(
        lambda group: (group[0], get_operation_history(group[1])))
    student_sessions = student_history.map(
        lambda entry: (entry[0], split_history(entry[1], ['save', 'open'])))

    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    ax.hist(student_sessions.map(lambda entry: len(entry[1])), 50)
    plt.title('Histogram on number of editor sessions')
    plt.xlabel('Number of editor sessions')
    plt.savefig('hist_editor_session.png')

    # One string per student: all pasted texts joined by a tilde rule.
    student_pastes = student_history.map(
        lambda entry: [op for op in entry[1] if op['action'] == 'paste'])
    student_pastes = student_pastes.map(
        lambda ops: "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n".join([op['text'] for op in ops]))
    paste_counts = student_pastes.group_by(lambda text: text)
    paste_counts = paste_counts.map(lambda group: (group[0], len(group[1])))
    paste_counts = paste_counts.sort_by(lambda pair: -pair[1])
    with codecs.open('pasted_content.txt', 'w', 'utf-8') as f_out:
        for pair in paste_counts:
            f_out.write(str(pair[1]))
            f_out.write("\n\n")
            f_out.write(pair[0])
            f_out.write("\n****************************************************************************\n")

    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    ax.hist(student_pastes.map(lambda text: len(text)), 50)
    plt.title('Histogram on length of pasted content')
    plt.xlabel('Length of pasted content')
    plt.savefig('hist_pasted_content.png')