def cmd_basics(cmd_data): tmp_data = cmd_data.map(lambda x: len(x.cmd_list)) tmp_data = tmp_data.filter_by(lambda x: x > 5) print "Total number of sessions (with threshold 5 commands): {}".format(tmp_data.count()) fig, ax = report_tools.prepare_plot() ax.hist(tmp_data, 50) plt.savefig('histogram_cmd_per_student.png') tmp_data = cmd_data.filter_by(lambda x: len(x.cmd_list) > 5) tmp_data = tmp_data.group_by(lambda x: x.user_name).map(lambda x: len(x[1])) fig, ax = report_tools.prepare_plot() ax.hist(tmp_data, 50) plt.savefig('histogram_sessions_per_student.png') example = cmd_data.filter_by(lambda x: len(x.cmd_list) > 100).sort_by(lambda x: len(x.cmd_list))[1] print example.file_path with open('example_2.txt', 'w') as f_out: if example.has_timestamp: cmd_with_timestamp = zip(example.timestamp_list, example.cmd_list) for item in cmd_with_timestamp: f_out.write("{}-{}: {}\n".format(item[0][0], item[0][1], item[1])) else: f_out.write("\n".join(example.cmd_list)) example = cmd_data.filter_by(lambda x: len(x.cmd_list) > 200).sort_by(lambda x: len(x.cmd_list))[1] print example.file_path with open('example_3.txt', 'w') as f_out: if example.has_timestamp: cmd_with_timestamp = zip(example.timestamp_list, example.cmd_list) for item in cmd_with_timestamp: f_out.write("{}-{}: {}\n".format(item[0][0], item[0][1], item[1])) else: f_out.write("\n".join(example.cmd_list))
def _pca(self, training_data):
    """Project the training data's feature vectors onto 2 principal
    components and save a scatter plot of the projection."""
    features = [self._convert_feature(sample) for sample in training_data]
    projected = PCA(n_components=2).fit_transform(features)
    xs = [point[0] for point in projected]
    ys = [point[1] for point in projected]
    # Averages of each axis; retained from the original, where they fed a
    # since-disabled outlier filter.
    avg_x = np.average(xs)
    avg_y = np.average(ys)
    fig, ax = report_tools.prepare_plot()
    ax.scatter(xs, ys)
    plt.title('Distribution of the feature set')
    plt.savefig('scatter_feature_distribution.png')
def editor_insertion_behavior(filtered_editor_log, code_template, user_info): editor_cmd_data = filtered_editor_log.map(lambda x: x.filter_editor_log(['insert', 'remove', 'paste', 'copy', 'save', 'open'])).map(lambda x: x.combine_editor_input()) insert_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'insert').map(lambda x: x['lines'][0]) print len(insert_data) tmp_data = insert_data.map(lambda x: len(x)) fig, ax = report_tools.prepare_plot() ax.hist(tmp_data.filter_by(lambda x: x<100), 50) plt.title('Histogram on length of inserted contents') plt.savefig('hist_length_inserted.png') print editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'insert' and len(x['lines'])>1).count()
def student_anchors(self):
    """Analyse per-student anchors: histogram the number of distinct
    anchors each student hits, and bar-chart the distribution of the
    first and last anchor seen per student.

    Side effects: saves hist_number_anchors.png, first_anchor.png and
    last_anchor.png.
    """
    def _unwrap_contents(item_list):
        # Flatten [(user, contents), ...] groups into one content list,
        # preserving chronological order within the group.
        result = []
        for item in item_list:
            result += item[1]
        return result

    def _plot_anchor_distribution(anchor_list, title, out_path):
        # Bar chart of label frequencies in anchor_list.  Extracted to
        # remove the verbatim duplication between the first-anchor and
        # last-anchor plots.
        counted = anchor_list.group_by(lambda x: x).map(lambda x: (x[0], len(x[1])))
        plot_x = [item[0] for item in counted]
        plot_y = [item[1] for item in counted]
        fig, ax = report_tools.prepare_plot()
        ind = np.arange(len(plot_x))
        width = 0.5
        ax.bar(ind, plot_y)
        ax.set_xticks(ind + width)
        ax.set_xticklabels(plot_x)
        plt.title(title)
        plt.savefig(out_path)

    # Build (user, [predicted anchor label, ...]) in chronological order.
    tmp_data = self.data.sort_by(lambda x: x.timestamp).map(lambda x: (x.user_name, self._get_content(x.cmd_list)))
    tmp_data = tmp_data.group_by(lambda x: x[0]).map(lambda x: (x[0], _unwrap_contents(x[1])))
    tmp_data = tmp_data.map(lambda x: (x[0], filter(lambda y: self._is_anchor(y), x[1]))) \
        .map(lambda x: (x[0], [self.predict(item) for item in x[1]]))

    # Histogram: number of distinct anchors per student.
    plot_data = tmp_data.map(lambda x: (x[0], set(x[1]))).map(lambda x: len(x[1]))
    fig, ax = report_tools.prepare_plot()
    ax.hist(plot_data)
    plt.title('Histogram on number of anchors detected per student')
    plt.savefig('hist_number_anchors.png')

    # Distribution of the first/last anchor for students with anchors.
    with_anchors = tmp_data.filter_by(lambda x: len(x[1]) > 0)
    _plot_anchor_distribution(
        data_reader.SList([item[1][0] for item in with_anchors]),
        'Distribution of first appeared anchor', 'first_anchor.png')
    _plot_anchor_distribution(
        data_reader.SList([item[1][-1] for item in with_anchors]),
        'Distribution of last appeared anchor', 'last_anchor.png')
def editor_basics(filtered_editor_log): ACTION_LIST = ['insert', 'paste', 'remove', 'copy'] filtered_editor_log = filtered_editor_log.map(lambda x: x.filter_editor_log(ACTION_LIST)) tmp_data = filtered_editor_log.map(lambda x: len(x._operation_list)) fig, ax = report_tools.prepare_plot(gridWidth=0.5) ax.hist(tmp_data, 50) plt.savefig('histogram_editor_operations_per_session.png') print filtered_editor_log.flatmap(lambda x: x.get_operation_list()).filter_by(lambda x: x['action']=='insert')[0] print filtered_editor_log.flatmap(lambda x: x.get_operation_list()).filter_by(lambda x: x['action']=='paste')[0] print filtered_editor_log.flatmap(lambda x: x.get_operation_list()).filter_by(lambda x: x['action']=='remove')[0] print filtered_editor_log.flatmap(lambda x: x.get_operation_list()).filter_by(lambda x: x['action']=='copy')[0]
def cmd_counting(cmd_list): tmp_data = cmd_list.flatmap(lambda x: x.cmd_list).map(lambda x: x[0]) tmp_data = tmp_data.group_by(lambda x: x).map(lambda x: (x[0], len(x[1]))) pre_counter = tmp_data.count() tmp_data = tmp_data.filter_by(lambda x: x[1] < 1000 and x[1] > 5) post_counter = tmp_data.count() print "Number of filtered commands/ Total number of commands: {}/{}".format(post_counter, pre_counter) tmp_data = tmp_data.sort_by(lambda x: -x[1]) print tmp_data[:30] plot_x = [item[0] for item in tmp_data] plot_y = [item[1] for item in tmp_data] fig, ax = report_tools.prepare_plot(gridWidth=0.5) ax.hist(plot_y, 50) plt.savefig('histogram_cmd_counter.png')
def user_insertion_length(filtered_editor_log, filtered_shell_log, code_template, user_info):
    """Histogram the total length of user-authored editor input per user,
    overall and split by course.

    filtered_shell_log is accepted for signature parity but unused here.
    Side effects: saves hist_user_input.png and
    hist_user_input_comparison.png.
    """
    def _get_content(item_list):
        # Collect user-authored text from one session: pasted text with
        # the course template stripped, plus first lines of inserts.
        result = []
        for item in item_list:
            if item['action'] == 'paste':
                result.append(code_template.strip_template(item['text']))
            elif item['action'] == 'insert':
                result.append(item['lines'][0])
        return result

    def _merge_content(item_list):
        # Concatenate every session's content into one string.  Uses a
        # single join instead of repeated += (which is quadratic).
        return u''.join(u"\n".join(item[1]) for item in item_list)

    editor_cmd_data = filtered_editor_log \
        .map(lambda x: x.filter_editor_log(['insert', 'remove', 'paste', 'copy', 'save', 'open'])) \
        .map(lambda x: x.combine_editor_input())
    tmp_data = editor_cmd_data.map(lambda x: (x.user_name, _get_content(x.cmd_list)))
    tmp_data = tmp_data.group_by(lambda x: x[0]) \
        .map(lambda x: (x[0], _merge_content(x[1]))) \
        .filter_by(lambda x: len(x[1]) > 0)

    # Histogram over all users.
    plot_data = tmp_data.map(lambda x: len(x[1]))
    fig, ax = report_tools.prepare_plot()
    ax.hist(plot_data, 50)
    plt.title('Histogram on user input length')
    plt.savefig('hist_user_input.png')

    # Side-by-side comparison for users with known course membership.
    plot_x = []
    plot_y = []
    for item in tmp_data.filter_by(lambda x: x[0] in user_info):
        if user_info[item[0]] == 'Course_A':
            plot_x.append(len(item[1]))
        elif user_info[item[0]] == 'Course_B':
            plot_y.append(len(item[1]))
    fig, ax = report_tools.prepare_plot()
    ax.hist([plot_x, plot_y], 50, label=['Course_A', 'Course_B'])
    plt.title('Histogram on user input length')
    plt.legend()
    plt.savefig('hist_user_input_comparison.png')
def overall_frequency(filtered_shell_log): shell_input_list = _generate_counter_list(filtered_shell_log).sort_by(lambda x: int(x[0])) print shell_input_list.sort_by(lambda x: x[1]) fig, ax = report_tools.prepare_plot(figsize=(20, 5), gridWidth=0.5) data_x = [item[0] for item in shell_input_list] data_y = [item[1] for item in shell_input_list] ind = np.arange(len(data_x)) ax.bar(ind, data_y, 0.5) ax.set_xticks(ind+0.5) ax.set_xticklabels(data_x, rotation=70) plt.xlabel('Input ASCII') plt.ylabel('Total number of actions') plt.title('Frequency distribution of user inputs.') plt.savefig('overall_frequency.png')
def student_frequency(filtered_shell_log):
    """Histogram the total number of shell actions per student.

    Side effects: saves histogram_student_total_actions.png.
    """
    def get_sum(counter_list):
        # Total action count over (key, count) pairs; builtin sum
        # replaces the hand-rolled accumulator loop.
        return sum(item[1] for item in counter_list)

    student_input_list = filtered_shell_log.group_by(lambda x: x.user_name)
    student_input_list = student_input_list.map(lambda x: (x[0], _generate_counter_list(x[1])))
    sum_list = student_input_list.map(lambda x: (x[0], get_sum(x[1])))
    plot_data = [item[1] for item in sum_list]
    fig, ax = report_tools.prepare_plot(gridWidth = 0.5)
    ax.hist(plot_data, 50)
    plt.xlabel('Total number of actions')
    plt.ylabel('Number of students')
    plt.title('Histogram on number of actions per student')
    plt.savefig('histogram_student_total_actions.png')
def _output_training_result(self, init_training_data):
    """Predict a cluster label for every training sample, dump the
    members of each large (> 100 samples) cluster to a text file, and
    bar-chart the large clusters' sizes.

    Side effects: writes clustering_<label>.txt files and
    cluster_size.png.
    """
    labels = [self.predict(item) for item in init_training_data]
    # Group samples by label in one pass instead of re-filtering the
    # whole result list once per label (was O(labels * samples)).
    by_label = {}
    for label, sample in zip(labels, init_training_data):
        by_label.setdefault(label, []).append(sample)
    size_list = []
    cluster_list = []
    for label in set(labels):
        members = by_label[label]
        if len(members) > 100:
            size_list.append(len(members))
            cluster_list.append(label)
            with codecs.open("clustering_{}.txt".format(label), 'w', 'utf-8') as f_out:
                f_out.write(u"Size of cluster: {}\n".format(len(members)))
                for sample in members:
                    f_out.write(u"{}\n".format(sample))
    fig, ax = report_tools.prepare_plot(figsize=(20, 5))
    ind = np.arange(len(size_list))
    width = 0.5
    ax.bar(ind, size_list, width)
    ax.set_xticks(ind + width)
    ax.set_xticklabels(['C{}'.format(i) for i in cluster_list], rotation='90')
    plt.title('Cluster size')
    plt.savefig('cluster_size.png')
def student_freq_clustering(filtered_shell_log, user_info):
    """Project per-student input-frequency feature vectors to 2-D with
    PCA and save a scatter plot coloured by course membership."""
    student_input_list = filtered_shell_log.group_by(lambda x: x.user_name).filter_by(lambda x: x[0] in user_info)
    student_input_list = student_input_list.map(lambda x: (x[0], _generate_counter_list(x[1])))
    freq_list = student_input_list.map(lambda x: (x[0], _convert_freq(x[1])))
    feature_list = freq_list.map(lambda x: _generate_feature_vector(x[1]))
    pca = PCA(n_components=2)
    pca.fit(feature_list)
    plot_data = pca.transform(feature_list)
    # Colour code: blue = Course_A, yellow = Course_B, red = anything else.
    course_colors = {'Course_A': 'b', 'Course_B': 'y'}
    colors = [course_colors.get(user_info[entry[0]], 'r') for entry in student_input_list]
    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    plt.scatter([point[0] for point in plot_data],
                [point[1] for point in plot_data], c=colors)
    plt.savefig('pca_result.png')
def editor_input_clustering(filtered_editor_log, code_template, user_info, ankors): def _unicode(c): if u'\u4e00' <= c <= u'\u9fff': return False try: c.decode('ascii') except UnicodeDecodeError: return False except UnicodeEncodeError: return False return True editor_cmd_data = filtered_editor_log.map(lambda x: x.filter_editor_log(['insert', 'remove', 'paste', 'copy', 'save', 'open'])).map(lambda x: x.combine_editor_input()) insert_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'insert').map(lambda x: x['lines'][0]) template_filtered_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'paste').map(lambda x: x['text']) template_filtered_data = template_filtered_data.map(lambda x: code_template.strip_template(x)) total_input = data_reader.SList(insert_data + template_filtered_data.flatmap(lambda x: x.split(u"\n"))) total_input = total_input.filter_by(lambda x: len(filter(lambda y: not y in [u"\n", u"\t", u"\r", u" "] and _unicode(y), x))>5) print len(total_input) feature_set, ankor_set = _generate_feature_set(total_input,ankors.splitter) print len(feature_set) # pca = PCA(n_components=2) # pca.fit(feature_set) # plot_data = pca.transform(feature_set) # fig, ax = report_tools.prepare_plot() # ax.scatter([item[0] for item in plot_data], [item[1] for item in plot_data]) # plt.title('Scatter plot on editor input') # plt.savefig('scatter_editor_input.png') # fig = plt.figure() # ax = fig.add_subplot(111, projection='3d') # ax.scatter([item[0] for item in plot_data], [item[1] for item in plot_data], [item[2] for item in plot_data]) # plt.title('Scatter plot on editor input') # plt.savefig('3d_scatter_editor_input.png') # db = Birch().fit(feature_set) # labels = db.labels_ model = KMeans(n_clusters=300) labels = model.fit_predict(feature_set) result = zip(labels, total_input) size_list = [] cluster_list = [] print len(set(labels)) for label in set(labels): tmp_result = filter(lambda x: x[0]==label, 
result) if len(tmp_result) > 100: size_list.append(len(tmp_result)) cluster_list.append(label) with codecs.open("clustering_{}.txt".format(label), 'w', 'utf-8') as f_out: f_out.write(u"Size of cluster: {}\n".format(len(tmp_result))) for item in tmp_result: f_out.write(u"{}\n".format(item[1])) fig, ax = report_tools.prepare_plot(figsize=(20, 5)) ind = np.arange(len(size_list)) width = 0.5 ax.bar(ind, size_list, width) ax.set_xticks(ind+width) ax.set_xticklabels(['C{}'.format(i) for i in cluster_list], rotation='90') plt.title('Cluster size') plt.savefig('cluster_size.png') ankor_label = model.predict(ankor_set) with open('ankor_label.txt', 'w') as f_out: for item in zip(ankors.splitter, ankor_label): f_out.write("{}\n{}\n\n".format(item[0], item[1])) with open('model.json', 'w') as f_out: json.dump(model.get_params(), f_out)
def editor_behavior_analysis(filtered_editor_log, code_template, user_info):
    """Analyse paste behaviour and editor-session structure.

    Dumps samples of template-stripped pasted content bucketed by length,
    histograms the filtered paste lengths, the number of editor sessions
    per student, and the per-student pasted-content length; also writes a
    frequency-sorted dump of each student's concatenated pastes.

    user_info is accepted but unused here.
    """
    def get_operation_history(session_history):
        # Concatenate every session's cmd_list into one flat history,
        # preserving session order.
        result = []
        for session in session_history:
            result += session.cmd_list
        return result

    def split_history(cmd_history, op_list):
        # Split a flat operation history into sub-sessions, cutting after
        # each operation whose action is in op_list.
        # NOTE(review): a trailing run of operations not ending in an
        # op_list action is silently dropped — confirm this is intended.
        tmp_session = []
        result = []
        for item in cmd_history:
            tmp_session.append(item)
            if item['action'] in op_list:
                result.append(tmp_session)
                tmp_session = []
        return filter(lambda x: len(x) > 0, result)

    editor_cmd_data = filtered_editor_log.map(lambda x: x.filter_editor_log([u'insert', u'remove', u'paste', u'copy', u'save', u'open'])).map(lambda x: x.combine_editor_input())
    # Earlier raw-paste dumps (before template stripping), kept disabled:
    # plot_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']=='paste').map(lambda x: x['text'])
    # with codecs.open('middle_paste.txt', 'w', 'utf-8') as f_out:
    #     f_out.write("\n***************************\n".join(plot_data.filter_by(lambda x: len(x)>1000 and len(x)<3000)))
    # with codecs.open('long_paste.txt', 'w', 'utf-8') as f_out:
    #     f_out.write("\n***************************\n".join(plot_data.filter_by(lambda x: len(x)>4000)))
    # with codecs.open('short_paste.txt', 'w', 'utf-8') as f_out:
    #     f_out.write("\n***************************\n".join(plot_data.filter_by(lambda x: len(x)<1000)))

    # Pasted text with the course template stripped out.
    template_filtered_data = editor_cmd_data.flatmap(lambda x: x.cmd_list).filter_by(lambda x: x['action']==u'paste').map(lambda x: x['text'])
    template_filtered_data = template_filtered_data.map(lambda x: code_template.strip_template(x))
    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    ax.hist(template_filtered_data.map(lambda x: len(x)).filter_by(lambda x: x<1000), 50)
    plt.title('Histogram on filtered pasted content length')
    plt.savefig('hist_filtered_pasted_content.png')
    # Dump filtered pastes into three length buckets for manual review.
    with codecs.open('middle_filtered_paste.txt', 'w', 'utf-8') as f_out:
        f_out.write("\n***************************\n".join(template_filtered_data.filter_by(lambda x: len(x)>1000 and len(x)<3000)))
    with codecs.open('long_filtered_paste.txt', 'w', 'utf-8') as f_out:
        f_out.write("\n***************************\n".join(template_filtered_data.filter_by(lambda x: len(x)>4000)))
    with codecs.open('short_filtered_paste.txt', 'w', 'utf-8') as f_out:
        f_out.write("\n***************************\n".join(template_filtered_data.filter_by(lambda x: len(x)<1000)))
    # fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    # ax.hist(plot_data.map(lambda x: len(x)).filter_by(lambda x: x<1000), 50)
    # plt.title('Histogram on length of pasted contents')
    # plt.xlabel('Length of pasted content')
    # plt.savefig('hist_length_pasted_content.png')

    # (user, flat operation history) per student.
    student_history_data = editor_cmd_data.group_by(lambda x: x.user_name).map(lambda x: (x[0], get_operation_history(x[1])))
    # Histogram: number of save/open-delimited editor sessions per student.
    tmp_data = student_history_data.map(lambda x: (x[0], split_history(x[1], ['save', 'open'])))
    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    ax.hist(tmp_data.map(lambda x: len(x[1])), 50)
    plt.title('Histogram on number of editor sessions')
    plt.xlabel('Number of editor sessions')
    plt.savefig('hist_editor_session.png')

    # Per-student concatenation of all pasted text (raw, not
    # template-stripped), joined by a visual separator.
    tmp_data = student_history_data.map(lambda x: filter(lambda y: y['action']=='paste', x[1])).map(lambda x: "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n".join([item['text'] for item in x]))
    # Frequency of identical concatenated paste blobs, most common first.
    counting_data = tmp_data.group_by(lambda x: x).map(lambda x: (x[0], len(x[1]))).sort_by(lambda x: -x[1])
    with codecs.open('pasted_content.txt', 'w', 'utf-8') as f_out:
        for item in counting_data:
            f_out.write(str(item[1]))
            f_out.write("\n\n")
            f_out.write(item[0])
            f_out.write("\n****************************************************************************\n")
    fig, ax = report_tools.prepare_plot(gridWidth=0.5)
    ax.hist(tmp_data.map(lambda x: len(x)), 50)
    plt.title('Histogram on length of pasted content')
    plt.xlabel('Length of pasted content')
    plt.savefig('hist_pasted_content.png')