def training(data, learnrate, regular, numofstep): students = {} problems = {} testing_sample = [] N = len(data) print "Num Of Lines to train: ", N for i in range(1,N): student, hierarchy, problem_name, step_name = data_key(data[i]) is_first_correct = float(data[i][13]) item_key = process_step_name(step_name) students.setdefault(student, []).append(is_first_correct) problems.setdefault(item_key, []).append(is_first_correct) testing_sample.append((student, item_key, is_first_correct)) matrix = create_matrix(testing_sample, students, problems) matrix = latent_factor(testing_sample, matrix, students, problems, learnrate, regular, numofstep) print "Training Done..." return matrix, students, problems, testing_sample
def main(arg): dataset = arg[1] #'algebra_2005_2006' training_data, testing_data, testing_result_data = load_data(dataset) #shuffle the training data #training_data = random.shuffle(training_data) learnrate = 0.01 regular = 0.02 numofstep = 30 matrix, students, problems, testing_sample = training(training_data, learnrate, regular, numofstep) predict_result = predict_from_matrix(matrix, students, problems,[ (data[0].upper(), data[1].upper()) for data in testing_sample]) training_error = rmse(predict_result, [float(i[2]) for i in testing_sample]) predict_test_result = predict_from_matrix(matrix, students, problems,[ (data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]]) predict_error = rmse(predict_test_result, [float(i[13]) for i in testing_result_data[1:]]) print "first 50 items of prediction before rounding: ",[float(i) for i in predict_test_result[:50]] print "first 50 items of prediction: ",[int(round(float(i))) for i in predict_test_result[:50]] print "first 50 items of test GT: ", [int(i[13]) for i in testing_result_data[1:50]] print '|', dataset, '|', training_error, '|', predict_error ,'|' plotroc([float(i[2]) for i in testing_sample], predict_result,\ [float(i[13]) for i in testing_result_data[1:]], predict_test_result) return
def _step_name_histogram(p_step):
    # Histogram of operator / placeholder tokens in a processed step name.
    return (p_step.count('+'), p_step.count('-'), p_step.count('*'),
            p_step.count('/'), p_step.count('{var}'), p_step.count('{d}'),
            p_step.count('('))


def _first_index(values):
    # Map each value to the index of its FIRST occurrence, mirroring
    # list.index() semantics but with O(1) lookup.
    index = {}
    for pos, value in enumerate(values):
        if value not in index:
            index[value] = pos
    return index


def get_feature_vectors_nb(training_data, maxtrainID, dataset, N,
                           studentId_list, unit_list, section_list,
                           problem_name_list, step_name_list, kc_list,
                           kc_list_raw, student_dict, step_dict,
                           problem_name_dict, kc_dict, problem_step_dict,
                           student_problem_dict, student_unit_dict,
                           student_kc_dict, student_kc_temporal_dict,
                           day_list):
    """Build compact categorical feature rows (for Naive-Bayes-style models).

    Each returned row is [student_id, unit, section, problem_name,
    step_name_histogram, kc] encoded as vocabulary indices; a value not found
    in its vocabulary maps to len(vocabulary) + 1.

    dataset -- data table to vectorize; row 0 is a header, hence range(1, N).
    N       -- number of rows (including the header) to process.
    Most remaining parameters are unused here; the signature mirrors
    get_feature_vectors() so the two are interchangeable at the call site.

    Returns the list of feature rows.
    """
    # Pass 1: collect the distinct step-name histograms in first-seen order,
    # so the assigned indices match the original list.index() behaviour.
    hist_index = {}
    for i in range(1, N):
        hist = _step_name_histogram(process_step_name(dataset[i][5]))
        if hist not in hist_index:
            hist_index[hist] = len(hist_index)

    # O(1) lookup tables for each categorical vocabulary (list.index is O(n)
    # per row, which made this quadratic on large datasets).
    student_index = _first_index(studentId_list)
    unit_index = _first_index(unit_list)
    section_index = _first_index(section_list)
    problem_index = _first_index(problem_name_list)
    kc_index = _first_index(kc_list_raw)

    rows = []
    for i in range(1, N):
        row = dataset[i]
        student_id_feature = student_index.get(row[1], len(studentId_list) + 1)
        unit, section = row[2].split(", ")
        unit_feature = unit_index.get(unit, len(unit_list) + 1)
        section_feature = section_index.get(section, len(section_list) + 1)
        problem_name_feature = problem_index.get(row[3],
                                                 len(problem_name_list) + 1)
        # BUG FIX: the original computed the '(' count here from the stale
        # p_step left over from pass 1's last iteration instead of this row's
        # own processed step name, so histogram lookups could silently miss.
        hist = _step_name_histogram(process_step_name(row[5]))
        step_name_feature = hist_index.get(hist, len(hist_index) + 1)
        # The raw (un-split) knowledge-component string is the second-to-last
        # column of each row.
        kc_feature = kc_index.get(row[len(row) - 2], len(kc_list_raw) + 1)
        rows.append([student_id_feature, unit_feature, section_feature,
                     problem_name_feature, step_name_feature, kc_feature])
    return rows
def process_data(training_data, testing_data, testing_result_data, N,
                 IsSkipRowNotYetTrain, IsCondenseVecMode):
    """Scan the first N training rows, build the categorical vocabularies and
    (when IsCondenseVecMode > 0) the per-key "CFAR" statistics (mean
    correct-first-attempt rates), then vectorize the training and testing rows.

    training_data / testing_data / testing_result_data -- parsed data tables;
        row 0 of each is a header row, hence every scan starts at index 1.
    N -- number of training rows (including the header) to process.
    IsSkipRowNotYetTrain -- when True, test labels whose row id exceeds the
        last trained row id (+1) are dropped, mirroring get_feature_vectors.
    IsCondenseVecMode -- 0: plain features only; >0: also accumulate CFAR
        statistics that get_feature_vectors folds into condensed features.

    Returns (training_data_rows, CFA_list, testing_data_rows, test_CFA).
    """
    #show_data(training_data)
    studentId_list = []     # vocabularies, each grown in first-seen order
    section_list = []
    unit_list = []
    problem_name_list = []
    step_name_list = []
    CFA_list = []           # training labels (correct-first-attempt flags)
    kc_list = []            # individual knowledge components (split on "~~")
    kc_list_raw = []        # raw, un-split knowledge-component strings
    testing_rows = []       # NOTE(review): never used below
    #CFAR: paired dicts hold the running CFA sum and the row count per key;
    # after the scan each sum is divided by its count to yield a mean rate.
    student_dict = {}
    student_dict_sum = {}
    step_dict = {}
    step_dict_sum = {}
    problem_name_dict = {}
    problem_name_dict_sum = {}
    kc_dict = {}
    kc_dict_sum = {}
    problem_step_dict = {}
    problem_step_dict_sum = {}
    student_problem_dict = {}
    student_problem_dict_sum = {}
    student_unit_dict = {}
    student_unit_dict_sum = {}
    student_kc_dict = {}
    student_kc_dict_sum = {}
    student_kc_temporal = {}    # (student, kc-string) -> list of row indices
    day_list = [0]              # running day counter, one entry per row
    #N = 10000#len(training_data)
    print "Num Of Lines to train: ", N
    for i in range(1, N):
        # Column layout (inferred from usage here and in training()):
        # 1=student id, 2="unit, section", 3=problem name, 5=step name,
        # 10=time value, 13=correct first attempt, -2=KC string,
        # -1=opportunity string. TODO(review): confirm against data format.
        studentId = training_data[i][1]
        unit, section = training_data[i][2].split(", ")
        problem_name = training_data[i][3]
        step_name = process_step_name(training_data[i][5])
        step_name_raw = training_data[i][5]
        kcraw = training_data[i][len(training_data[i]) - 2]
        kcs = training_data[i][len(training_data[i]) - 2].split("~~")
        #opp = training_data[i][len(training_data[i])-1].split("~~")
        cfa = training_data[i][13]
        CFA_list.append(cfa)
        # Grow each vocabulary with any value not seen before.
        if studentId not in studentId_list:
            studentId_list.append(studentId)
        if unit not in unit_list:
            unit_list.append(unit)
        if section not in section_list:
            section_list.append(section)
        ppname = process_problem_name(problem_name)
        if ppname not in problem_name_list:
            problem_name_list.append(ppname)
        if step_name not in step_name_list:
            step_name_list.append(step_name)
        if kcraw not in kc_list_raw:
            kc_list_raw.append(kcraw)
        for kc in kcs:
            if kc not in kc_list:
                kc_list.append(kc)
        if IsCondenseVecMode > 0:
            #CFAR: accumulate per-key CFA sums and counts.
            problem_step = (problem_name, step_name)
            student_problem = (studentId, problem_name)
            student_unit = (studentId, unit)
            student_kcs = (studentId,
                           training_data[i][len(training_data[i]) - 2])
            if student_dict.has_key(studentId):
                student_dict[studentId] = student_dict[studentId] + int(cfa)
                student_dict_sum[studentId] = student_dict_sum[studentId] + 1
            else:
                student_dict[studentId] = int(cfa)
                student_dict_sum[studentId] = 1
            if step_dict.has_key(step_name):
                step_dict[step_name] = step_dict[step_name] + int(cfa)
                step_dict_sum[step_name] = step_dict_sum[step_name] + 1
            else:
                step_dict[step_name] = int(cfa)
                step_dict_sum[step_name] = 1
            if problem_name_dict.has_key(problem_name):
                problem_name_dict[
                    problem_name] = problem_name_dict[problem_name] + int(cfa)
                problem_name_dict_sum[
                    problem_name] = problem_name_dict_sum[problem_name] + 1
            else:
                problem_name_dict[problem_name] = int(cfa)
                problem_name_dict_sum[problem_name] = 1
            if kc_dict.has_key(kcraw):
                kc_dict[kcraw] = kc_dict[kcraw] + int(cfa)
                kc_dict_sum[kcraw] = kc_dict_sum[kcraw] + 1
            else:
                kc_dict[kcraw] = int(cfa)
                kc_dict_sum[kcraw] = 1
            if problem_step_dict.has_key(problem_step):
                problem_step_dict[
                    problem_step] = problem_step_dict[problem_step] + int(cfa)
                problem_step_dict_sum[
                    problem_step] = problem_step_dict_sum[problem_step] + 1
            else:
                problem_step_dict[problem_step] = int(cfa)
                problem_step_dict_sum[problem_step] = 1
            if student_problem_dict.has_key(student_problem):
                student_problem_dict[student_problem] = student_problem_dict[
                    student_problem] + int(cfa)
                student_problem_dict_sum[
                    student_problem] = student_problem_dict_sum[
                        student_problem] + 1
            else:
                student_problem_dict[student_problem] = int(cfa)
                student_problem_dict_sum[student_problem] = 1
            if student_unit_dict.has_key(student_unit):
                student_unit_dict[
                    student_unit] = student_unit_dict[student_unit] + int(cfa)
                student_unit_dict_sum[
                    student_unit] = student_unit_dict_sum[student_unit] + 1
            else:
                student_unit_dict[student_unit] = int(cfa)
                student_unit_dict_sum[student_unit] = 1
            if student_kc_dict.has_key(student_kcs):
                student_kc_dict[
                    student_kcs] = student_kc_dict[student_kcs] + int(cfa)
                student_kc_dict_sum[
                    student_kcs] = student_kc_dict_sum[student_kcs] + 1
                # Remember every row index where this (student, kc) occurred,
                # for the temporal/memory features in get_feature_vectors().
                student_kc_temporal[student_kcs].append(i)
            else:
                student_kc_dict[student_kcs] = int(cfa)
                student_kc_dict_sum[student_kcs] = 1
                student_kc_temporal[student_kcs] = [i]
        # A negative value in column 10 is treated as the start of a new day;
        # otherwise the row stays on the current day. TODO(review): confirm.
        if float(training_data[i][10]) >= 0:
            day_list.append(day_list[-1])
        else:
            day_list.append(day_list[-1] + 1)
    if IsCondenseVecMode > 0:
        #CFAR: convert the accumulated sums into mean rates (sum / count).
        for key in student_dict:
            student_dict[key] = float(
                student_dict[key]) / student_dict_sum[key]
        for key in step_dict:
            step_dict[key] = float(step_dict[key]) / step_dict_sum[key]
        for key in problem_name_dict:
            problem_name_dict[key] = float(
                problem_name_dict[key]) / problem_name_dict_sum[key]
        for key in kc_dict:
            kc_dict[key] = float(kc_dict[key]) / kc_dict_sum[key]
        for key in problem_step_dict:
            problem_step_dict[key] = float(
                problem_step_dict[key]) / problem_step_dict_sum[key]
        for key in student_problem_dict:
            student_problem_dict[key] = float(
                student_problem_dict[key]) / student_problem_dict_sum[key]
        for key in student_unit_dict:
            student_unit_dict[key] = float(
                student_unit_dict[key]) / student_unit_dict_sum[key]
        for key in student_kc_dict:
            student_kc_dict[key] = float(
                student_kc_dict[key]) / student_kc_dict_sum[key]
    # Id (column 0) of the last row that was actually trained on.
    maxtrainID = int(training_data[N - 1][0])
    #print problem_name_list
    # NOTE(review): a commented-out multi-threaded (ThreadPool, 4-core)
    # variant of the vectorization, plus debug prints of the vocabulary
    # sizes, was removed here as dead code.
    # Create matrix...
    training_data_rows = get_feature_vectors(
        training_data, IsCondenseVecMode, IsSkipRowNotYetTrain, maxtrainID,
        training_data, N, studentId_list, unit_list, section_list,
        problem_name_list, step_name_list, kc_list, kc_list_raw, student_dict,
        step_dict, problem_name_dict, kc_dict, problem_step_dict,
        student_problem_dict, student_unit_dict, student_kc_dict,
        student_kc_temporal, day_list)
    testing_data_rows = get_feature_vectors(
        training_data, IsCondenseVecMode, IsSkipRowNotYetTrain, maxtrainID,
        testing_data, len(testing_data), studentId_list, unit_list,
        section_list, problem_name_list, step_name_list, kc_list, kc_list_raw,
        student_dict, step_dict, problem_name_dict, kc_dict,
        problem_step_dict, student_problem_dict, student_unit_dict,
        student_kc_dict, student_kc_temporal, day_list)
    test_CFA = []
    for i in range(1, len(testing_result_data)):
        #skip those rows if not yet trained
        if int(testing_result_data[i]
               [0]) <= maxtrainID + 1 or IsSkipRowNotYetTrain == False:
            test_CFA.append(testing_result_data[i][13])
    return training_data_rows, CFA_list, testing_data_rows, test_CFA
def get_feature_vectors(training_data, IsCondenseVecMode, IsSkipRowNotYetTrain,
                        maxtrainID, dataset, N, studentId_list, unit_list,
                        section_list, problem_name_list, step_name_list,
                        kc_list, kc_list_raw, student_dict, step_dict,
                        problem_name_dict, kc_dict, problem_step_dict,
                        student_problem_dict, student_unit_dict,
                        student_kc_dict, student_kc_temporal_dict, day_list):
    """Turn each row of `dataset` into one numeric feature vector.

    training_data -- the full training table (used to look up history rows
        for the temporal/memory features).
    IsCondenseVecMode -- 0: one-hot style features only; 1: one-hot plus the
        condensed CFAR features; anything else: condensed features only.
    IsSkipRowNotYetTrain -- when True, rows whose id (column 0) exceeds
        maxtrainID + 1 are skipped entirely.
    dataset / N -- the table to vectorize and how many rows (incl. header).
    *_list -- vocabularies built by process_data(), in first-seen order.
    *_dict -- CFAR mean rates per key (see process_data()); used only when
        IsCondenseVecMode > 0, with the global mean as fallback for unseen
        keys.
    student_kc_temporal_dict -- (student, kc-string) -> list of training row
        indices, used for recency ("memory") features.
    day_list -- per-row day counter, index-aligned with training_data rows.

    Returns the list of feature rows.
    """
    rows = []
    for i in range(1, N):
        #skip those rows if not yet trained
        if IsSkipRowNotYetTrain == True and int(
                dataset[i][0]) > maxtrainID + 1:
            continue
        # Categorical one-hot (or k-hot) encodings of the basic columns.
        student_id_feature = get_feature_vector(studentId_list,
                                                [dataset[i][1]], 5)
        studentId_size = len(student_id_feature)
        unit, section = dataset[i][2].split(", ")
        unit_feature = get_feature_vector(unit_list, [unit], 1)
        unit_size = len(unit_feature)
        section_feature = get_feature_vector(section_list, [section], 1)
        section_size = len(section_feature)
        ppname = process_problem_name(dataset[i][3])
        problem_name_feature = get_feature_vector(problem_name_list, [ppname],
                                                  1)
        problem_name_size = len(problem_name_feature)
        # Squash the problem-view count (column 4) into [0, 1): x / (x + 1).
        problem_view_feature = [
            float(dataset[i][4]) / (float(dataset[i][4]) + 1)
        ]
        problem_view_size = len(problem_view_feature)
        # Step name becomes a histogram of operator/placeholder tokens.
        p_step = process_step_name(dataset[i][5])
        step_name_feature = [
            p_step.count('+'),
            p_step.count('-'),
            p_step.count('*'),
            p_step.count('/'),
            p_step.count('{var}'),
            p_step.count('{d}'),
            p_step.count('(')
        ]
        #step_name_feature = [float(x)*2 for x in step_name_feature]
        step_name_size = len(step_name_feature)
        #print step_name_feature
        # Knowledge components ("~~"-separated, second-to-last column) and
        # their opportunity counts (last column).
        kc_feature = get_feature_vector(
            kc_list, dataset[i][len(dataset[i]) - 2].split("~~"), 1)
        kc_feature_size = len(kc_feature)
        opp_feature = get_feature_vector_opp(
            kc_list, dataset[i][len(dataset[i]) - 2].split("~~"),
            dataset[i][len(dataset[i]) - 1].split("~~"), 1)
        opp_size = len(opp_feature)
        if IsCondenseVecMode > 0:
            #CFAR: look up each mean rate, falling back to the global mean of
            # that statistic when the key was never seen during training.
            if student_dict.has_key(dataset[i][1]):
                student_cfar = student_dict[dataset[i][1]]
            else:
                student_cfar = numpy.mean(student_dict.values())
            if step_dict.has_key(p_step):
                step_cfar = step_dict[p_step]
            else:
                step_cfar = numpy.mean(step_dict.values())
            if problem_name_dict.has_key(dataset[i][3]):
                problem_name_cfar = problem_name_dict[dataset[i][3]]
            else:
                problem_name_cfar = numpy.mean(problem_name_dict.values())
            if kc_dict.has_key(dataset[i][len(dataset[i]) - 2]):
                kc_cfar = kc_dict[dataset[i][len(dataset[i]) - 2]]
            else:
                kc_cfar = numpy.mean(kc_dict.values())
            if problem_step_dict.has_key((dataset[i][3], p_step)):
                problem_step_cfar = problem_step_dict[(dataset[i][3], p_step)]
            else:
                problem_step_cfar = numpy.mean(problem_step_dict.values())
            if student_problem_dict.has_key((dataset[i][1], dataset[i][3])):
                student_problem_cfar = student_problem_dict[(dataset[i][1],
                                                             dataset[i][3])]
            else:
                student_problem_cfar = numpy.mean(
                    student_problem_dict.values())
            if student_unit_dict.has_key((dataset[i][1], unit)):
                student_unit_cfar = student_unit_dict[(dataset[i][1], unit)]
            else:
                student_unit_cfar = numpy.mean(student_unit_dict.values())
            student_kc = (dataset[i][1], dataset[i][len(dataset[i]) - 2])
            student_kc_temporal = [0, 0]
            memory = [0, 0, 0, 0]  #[1day, 1week, 1 month, >1 month]
            if student_kc_dict.has_key(student_kc):
                student_kc_cfar = student_kc_dict[student_kc]
                itemlist = student_kc_temporal_dict[student_kc]
                # extract the historyitemlist: training rows for this
                # (student, kc) whose id does not exceed the current row's id.
                historyitemlist = []
                currid = dataset[i][0]
                for rowindex in itemlist:
                    rowid = training_data[rowindex][0]
                    if int(rowid) <= int(currid):
                        historyitemlist.append(rowindex)
                # NOTE(review): rowindex here is the leftover value from the
                # loop above (the LAST entry of itemlist, even if it is a
                # future row) — confirm this is the intended "today".
                currday = day_list[
                    rowindex]  #Find the best possible day of today
                #Perform the memory check: flag whether any prior occurrence
                # fell on the same day / within a week / a month / earlier.
                for rowindex in historyitemlist:
                    testday = day_list[rowindex]
                    if testday > currday:
                        continue
                    elif testday == currday:
                        memory[0] = 1
                    elif testday + 7 >= currday:
                        memory[1] = 1
                    elif testday + 30 >= currday:
                        memory[2] = 1
                    else:
                        memory[3] = 1
                # Take the last 6 or if any CFA and hint of this (student, kc) pairs
                historyitemlist = historyitemlist[-6:]
                if len(historyitemlist) > 0:
                    cfa_mean = 0
                    hint_mean = 0
                    for rowindex in historyitemlist:
                        cfa_mean = cfa_mean + int(training_data[rowindex][13])
                        # hint count presumably lives in column 15 — the sum
                        # is computed but not used below.
                        hint_mean = hint_mean + int(
                            training_data[rowindex][15])
                    cfa_mean = float(cfa_mean) / len(historyitemlist)
                    #hint_mean = float(hint_mean)/len(historyitemlist)
                    student_kc_temporal = [cfa_mean, 1]
            else:
                student_kc_cfar = numpy.mean(student_kc_dict.values())
            # Total opportunity count, squashed into [0, 1): x / (x + 1);
            # non-numeric entries are ignored.
            o = dataset[i][len(dataset[i]) - 1].split("~~")
            oppsum = 0
            for opp in o:
                try:
                    oppsum = oppsum + int(opp)
                except ValueError:
                    oppsum = oppsum
            oppsum = float(oppsum) / (float(oppsum) + 1)
        #print problem_hierarchy_feature
        # Assemble the row according to the requested mode.
        if IsCondenseVecMode == 0:
            rows.append(student_id_feature + unit_feature + section_feature +
                        problem_name_feature + problem_view_feature +
                        step_name_feature + kc_feature + opp_feature)
        elif IsCondenseVecMode == 1:
            rows.append(student_id_feature + unit_feature + section_feature +
                        problem_name_feature + problem_view_feature +
                        step_name_feature + kc_feature + opp_feature +
                        [student_cfar] + [step_cfar] + [problem_name_cfar] +
                        [kc_cfar] + [problem_step_cfar] +
                        [student_problem_cfar] + [student_unit_cfar] +
                        [student_kc_cfar] + student_kc_temporal + memory +
                        [oppsum])
        else:
            rows.append(problem_view_feature + [student_cfar] + [step_cfar] +
                        [problem_name_cfar] + [kc_cfar] +
                        [problem_step_cfar] + [student_problem_cfar] +
                        [student_unit_cfar] + [student_kc_cfar] +
                        student_kc_temporal + memory + [oppsum])
    # NOTE(review): the *_size variables are bound inside the loop; if every
    # row was skipped this print raises NameError.
    print "feature vector composition: ", 'student', studentId_size, 'unit', unit_size,\
        'section', section_size, 'problem', problem_name_size, 'view', problem_view_size,\
        'step', step_name_size, 'kc', kc_feature_size, 'opportunity', opp_size
    return rows
def main(arg):
    """Compare several classifiers (plus the latent-factor model) on one
    dataset: train each, predict the test rows, and report RMSE + ROC curves.

    arg -- command-line argument list; arg[1] is the dataset name.

    NOTE(review): this redefines the main() declared earlier in the file;
    being the later definition, it is the one that actually runs.
    """
    dataset = arg[1]  #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 300000  #len(training_data)
    # mode0: normal, 1: normal+condensed, 2: only condensed
    Feature_vector_mode = 0
    rows, CFA_list, testing_rows, test_CFA = process_data(
        training_data, testing_data, testing_result_data, NumOfLineToTrain,
        False, Feature_vector_mode)
    print 'Training rows:', len(rows), 'Testing rows:', len(testing_rows), \
        '# of features:', len(rows[0])
    clf = []; y_pred_list = []; M = 3  #number of classifier
    name_list = ['KNN', 'RandomForest', 'LinearSVM',
                 'Collabrative filtering']
    #y_pred_list.append([random.randint(0,1) for i in testing_rows])
    ##############################################################
    # The three sklearn classifiers under comparison; commented lines are
    # alternatives that were tried and kept for reference.
    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf.append(
        KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10,
                             p=2))
    clf.append(
        RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=False))
    clf.append(svm.LinearSVC(verbose=False, C=4.0))
    #clf = tree.DecisionTreeClassifier()
    #clf.append(GaussianNB())
    #clf = MultinomialNB(alpha=1.0)
    #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))
    #clf.append(linear_model.SGDClassifier(n_jobs=-1,n_iter=1000))
    #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))
    #############################################################
    #Train and do prediction for each method
    for i in range(M):
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapse: ", end - start, " sec"
    # The fourth method (collaborative filtering / latent factor) is trained
    # on the raw rows rather than the engineered feature vectors.
    start = time.time()
    learnrate = 0.01; regular = 0.02; numofstep = 100
    matrix, students, problems, testing_sample = training(
        training_data[:NumOfLineToTrain], learnrate, regular, numofstep)
    y_pred_list.append(
        predict_from_matrix(matrix, students, problems,
                            [(data[1].upper(),
                              process_step_name(data[5].upper()))
                             for data in testing_data[1:]]))
    end = time.time()
    print "Time elapse: ", end - start, " sec"
    # Report RMSE and a preview of predictions per method.
    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        # NOTE(review): the comprehension variable shadows the loop index i;
        # harmless in Python 2 only because the for statement rebinds i at
        # the top of each iteration.
        print "first 30 items of prediction: ", [int(round(float(i))) for i in y_pred_list[i][:30]]
        print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return