Example #1
def training(data, learnrate, regular, numofstep):
    students = {}
    problems = {}
    testing_sample = []
    N = len(data)

    print "Num Of Lines to train: ", N
    for i in range(1,N):
        student, hierarchy, problem_name, step_name = data_key(data[i])
        is_first_correct = float(data[i][13])
        item_key = process_step_name(step_name)
        students.setdefault(student, []).append(is_first_correct)
        problems.setdefault(item_key, []).append(is_first_correct)
        testing_sample.append((student, item_key, is_first_correct))

    matrix = create_matrix(testing_sample, students, problems)
    matrix = latent_factor(testing_sample, matrix, students, problems, learnrate, regular, numofstep)

    print "Training Done..."
    return matrix, students, problems, testing_sample
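create_matrix and latent_factor are not shown in this listing. Below is a minimal sketch of the kind of SGD matrix-factorization update that the (learnrate, regular, numofstep) arguments suggest, written as an illustration only; the names, initialization, and update rule are assumptions, not the original code.

import numpy as np

def latent_factor_sketch(samples, n_factors, learnrate, regular, numofstep, seed=0):
    # samples: list of (student, item, correct) triples with correct in {0, 1},
    # in the same shape as testing_sample built above (assumed).
    rng = np.random.RandomState(seed)
    students = {s: k for k, s in enumerate(sorted({s for s, _, _ in samples}))}
    items = {p: k for k, p in enumerate(sorted({p for _, p, _ in samples}))}
    P = 0.1 * rng.randn(len(students), n_factors)  # student factor matrix
    Q = 0.1 * rng.randn(len(items), n_factors)     # item (step) factor matrix
    for _ in range(numofstep):
        for student, item, correct in samples:
            u, v = students[student], items[item]
            err = correct - P[u].dot(Q[v])          # prediction error
            P[u] += learnrate * (err * Q[v] - regular * P[u])
            Q[v] += learnrate * (err * P[u] - regular * Q[v])
    return P, Q, students, items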
Example #2
def main(arg):
    dataset = arg[1] #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)

    #shuffle the training data
    #training_data = random.shuffle(training_data)

    learnrate = 0.01
    regular = 0.02
    numofstep = 30
    matrix, students, problems, testing_sample = training(training_data, learnrate, regular, numofstep)
    predict_result = predict_from_matrix(matrix, students, problems,
        [(data[0].upper(), data[1].upper()) for data in testing_sample])
    training_error = rmse(predict_result, [float(i[2]) for i in testing_sample])

    predict_test_result = predict_from_matrix(matrix, students, problems,
        [(data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]])
    predict_error = rmse(predict_test_result, [float(i[13]) for i in testing_result_data[1:]])

    print "first 50 items of prediction before rounding: ",[float(i) for i in predict_test_result[:50]]
    print "first 50 items of prediction: ",[int(round(float(i))) for i in predict_test_result[:50]]
    print "first 50 items of test GT: ", [int(i[13]) for i in testing_result_data[1:50]]
    print '|', dataset, '|', training_error, '|', predict_error ,'|'
    plotroc([float(i[2]) for i in testing_sample], predict_result,\
     [float(i[13]) for i in testing_result_data[1:]], predict_test_result)
    return
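rmse and plotroc are used above but defined elsewhere in the project. The RMSE part is standard; a plain-Python stand-in consistent with how it is called might look like this (a sketch, not the original helper):

import math

def rmse_sketch(predictions, targets):
    # Root-mean-square error over two equal-length sequences of numbers/strings.
    diffs = [(float(p) - float(t)) ** 2 for p, t in zip(predictions, targets)]
    return math.sqrt(sum(diffs) / len(diffs))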
Example #3
def get_feature_vectors_nb(
        training_data, maxtrainID, dataset, N, studentId_list, unit_list,
        section_list, problem_name_list, step_name_list, kc_list, kc_list_raw,
        student_dict, step_dict, problem_name_dict, kc_dict, problem_step_dict,
        student_problem_dict, student_unit_dict, student_kc_dict,
        student_kc_temporal_dict, day_list):

    step_name_dict = []  # unique step-name histograms (a list, despite the name)
    for i in range(1, N):
        p_step = process_step_name(dataset[i][5])
        step_name_hist = [
            p_step.count('+'),
            p_step.count('-'),
            p_step.count('*'),
            p_step.count('/'),
            p_step.count('{var}'),
            p_step.count('{d}'),
            p_step.count('(')
        ]

        if step_name_hist not in step_name_dict:
            step_name_dict.append(step_name_hist)

    rows = []
    for i in range(1, N):
        if dataset[i][1] in studentId_list:
            student_id_feature = studentId_list.index(dataset[i][1])
        else:
            student_id_feature = len(studentId_list) + 1

        unit, section = dataset[i][2].split(", ")
        if unit in unit_list:
            unit_feature = unit_list.index(unit)
        else:
            unit_feature = len(unit_list) + 1
        if section in section_list:
            section_feature = section_list.index(section)
        else:
            section_feature = len(section_list) + 1

        if dataset[i][3] in problem_name_list:
            problem_name_feature = problem_name_list.index(dataset[i][3])
        else:
            problem_name_feature = len(problem_name_list) + 1

        step = process_step_name(dataset[i][5])
        step_name_processed = [
            step.count('+'),
            step.count('-'),
            step.count('*'),
            step.count('/'),
            step.count('{var}'),
            step.count('{d}'),
            step.count('(')
        ]

        if step_name_processed in step_name_dict:
            step_name_feature = step_name_dict.index(step_name_processed)
        else:
            step_name_feature = len(step_name_dict) + 1

        if dataset[i][len(dataset[i]) - 2] in kc_list_raw:
            kc_feature = kc_list_raw.index(dataset[i][len(dataset[i]) - 2])
        else:
            kc_feature = len(kc_list_raw) + 1
        #o = dataset[i][len(dataset[i])-1].split("~~")

        #print problem_hierarchy_feature
        rows.append([student_id_feature, unit_feature, section_feature,
                     problem_name_feature, step_name_feature, kc_feature])

    return rows
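The classifier that consumes these index-valued rows is not shown here. One plausible way to use them with a Naive Bayes model is to one-hot encode the indices first; a scikit-learn sketch under that assumption (fit_nb_on_index_rows is a made-up name, not part of the original pipeline):

from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

def fit_nb_on_index_rows(train_rows, train_labels):
    # One-hot encode each categorical index column, then fit Bernoulli NB;
    # indices unseen during training become all-zero columns at predict time.
    model = make_pipeline(OneHotEncoder(handle_unknown='ignore'),
                          BernoulliNB(alpha=1.0))
    model.fit(train_rows, train_labels)
    return model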
Example #4
def process_data(training_data, testing_data, testing_result_data, N,\
 IsSkipRowNotYetTrain, IsCondenseVecMode):

    #show_data(training_data)
    studentId_list = []
    section_list = []
    unit_list = []
    problem_name_list = []
    step_name_list = []
    CFA_list = []
    kc_list = []
    kc_list_raw = []
    testing_rows = []

    # CFAR: correct-first-attempt-rate accumulators
    student_dict = {}
    student_dict_sum = {}
    step_dict = {}
    step_dict_sum = {}
    problem_name_dict = {}
    problem_name_dict_sum = {}
    kc_dict = {}
    kc_dict_sum = {}
    problem_step_dict = {}
    problem_step_dict_sum = {}
    student_problem_dict = {}
    student_problem_dict_sum = {}
    student_unit_dict = {}
    student_unit_dict_sum = {}
    student_kc_dict = {}
    student_kc_dict_sum = {}
    student_kc_temporal = {}
    day_list = [0]

    #N = 10000#len(training_data)
    print "Num Of Lines to train: ", N
    for i in range(1, N):
        studentId = training_data[i][1]
        unit, section = training_data[i][2].split(", ")
        problem_name = training_data[i][3]
        step_name = process_step_name(training_data[i][5])
        step_name_raw = training_data[i][5]

        kcraw = training_data[i][len(training_data[i]) - 2]
        kcs = training_data[i][len(training_data[i]) - 2].split("~~")
        #opp = training_data[i][len(training_data[i])-1].split("~~")

        cfa = training_data[i][13]
        CFA_list.append(cfa)

        if studentId not in studentId_list:
            studentId_list.append(studentId)
        if unit not in unit_list:
            unit_list.append(unit)
        if section not in section_list:
            section_list.append(section)

        ppname = process_problem_name(problem_name)
        if ppname not in problem_name_list:
            problem_name_list.append(ppname)

        if step_name not in step_name_list:
            step_name_list.append(step_name)
        if kcraw not in kc_list_raw:
            kc_list_raw.append(kcraw)
        for kc in kcs:
            if kc not in kc_list:
                kc_list.append(kc)

        if IsCondenseVecMode > 0:
            #CFAR
            problem_step = (problem_name, step_name)
            student_problem = (studentId, problem_name)
            student_unit = (studentId, unit)
            student_kcs = (studentId,
                           training_data[i][len(training_data[i]) - 2])

            if student_dict.has_key(studentId):
                student_dict[studentId] += int(cfa)
                student_dict_sum[studentId] += 1
            else:
                student_dict[studentId] = int(cfa)
                student_dict_sum[studentId] = 1

            if step_dict.has_key(step_name):
                step_dict[step_name] += int(cfa)
                step_dict_sum[step_name] += 1
            else:
                step_dict[step_name] = int(cfa)
                step_dict_sum[step_name] = 1

            if problem_name_dict.has_key(problem_name):
                problem_name_dict[problem_name] += int(cfa)
                problem_name_dict_sum[problem_name] += 1
            else:
                problem_name_dict[problem_name] = int(cfa)
                problem_name_dict_sum[problem_name] = 1

            if kc_dict.has_key(kcraw):
                kc_dict[kcraw] += int(cfa)
                kc_dict_sum[kcraw] += 1
            else:
                kc_dict[kcraw] = int(cfa)
                kc_dict_sum[kcraw] = 1

            if problem_step_dict.has_key(problem_step):
                problem_step_dict[problem_step] += int(cfa)
                problem_step_dict_sum[problem_step] += 1
            else:
                problem_step_dict[problem_step] = int(cfa)
                problem_step_dict_sum[problem_step] = 1

            if student_problem_dict.has_key(student_problem):
                student_problem_dict[student_problem] += int(cfa)
                student_problem_dict_sum[student_problem] += 1
            else:
                student_problem_dict[student_problem] = int(cfa)
                student_problem_dict_sum[student_problem] = 1

            if student_unit_dict.has_key(student_unit):
                student_unit_dict[student_unit] += int(cfa)
                student_unit_dict_sum[student_unit] += 1
            else:
                student_unit_dict[student_unit] = int(cfa)
                student_unit_dict_sum[student_unit] = 1

            if student_kc_dict.has_key(student_kcs):
                student_kc_dict[student_kcs] += int(cfa)
                student_kc_dict_sum[student_kcs] += 1
                student_kc_temporal[student_kcs].append(i)
            else:
                student_kc_dict[student_kcs] = int(cfa)
                student_kc_dict_sum[student_kcs] = 1
                student_kc_temporal[student_kcs] = [i]

            if float(training_data[i][10]) >= 0:
                day_list.append(day_list[-1])
            else:
                day_list.append(day_list[-1] + 1)

    if IsCondenseVecMode > 0:
        #CFAR
        for key in student_dict:
            student_dict[key] = float(student_dict[key]) / student_dict_sum[key]
        for key in step_dict:
            step_dict[key] = float(step_dict[key]) / step_dict_sum[key]
        for key in problem_name_dict:
            problem_name_dict[key] = float(problem_name_dict[key]) / problem_name_dict_sum[key]
        for key in kc_dict:
            kc_dict[key] = float(kc_dict[key]) / kc_dict_sum[key]

        for key in problem_step_dict:
            problem_step_dict[key] = float(problem_step_dict[key]) / problem_step_dict_sum[key]
        for key in student_problem_dict:
            student_problem_dict[key] = float(student_problem_dict[key]) / student_problem_dict_sum[key]
        for key in student_unit_dict:
            student_unit_dict[key] = float(student_unit_dict[key]) / student_unit_dict_sum[key]
        for key in student_kc_dict:
            student_kc_dict[key] = float(student_kc_dict[key]) / student_kc_dict_sum[key]

    maxtrainID = int(training_data[N - 1][0])

    #print problem_name_list
    #print "#of unique item in each categories: ",len(studentId_list), len(unit_list),\
    #len(section_list), len(problem_name_list), len(step_name_list), len(kc_list), len(kc_list_raw)

    # do it in multi-thread
    # NumOfCore=4
    # partsize = N/NumOfCore
    # thread_list = []
    # training_data_rows= []
    # trainpartresult = [0, 0, 0, 0]
    # pool = ThreadPool(processes=NumOfCore)
    # for i in range(0, NumOfCore):
    #     trainpartresult[i] = pool.apply_async(get_feature_vectors, (training_data, maxtrainID, training_data[i*partsize:(i+1)*partsize], partsize, studentId_list, unit_list,\
    #     section_list, problem_name_list, step_name_list, kc_list, kc_list_raw, student_dict,\
    #      step_dict, problem_name_dict, kc_dict, problem_step_dict, student_problem_dict, \
    #      student_unit_dict, student_kc_dict, student_kc_temporal, day_list))

    # for i in range(0, NumOfCore):
    #     training_data_rows.append(trainpartresult[i].get())

    # Create matrix...
    training_data_rows = get_feature_vectors(
        training_data, IsCondenseVecMode, IsSkipRowNotYetTrain, maxtrainID,
        training_data, N, studentId_list, unit_list, section_list,
        problem_name_list, step_name_list, kc_list, kc_list_raw, student_dict,
        step_dict, problem_name_dict, kc_dict, problem_step_dict,
        student_problem_dict, student_unit_dict, student_kc_dict,
        student_kc_temporal, day_list)

    testing_data_rows = get_feature_vectors(
        training_data,
        IsCondenseVecMode, IsSkipRowNotYetTrain, maxtrainID, testing_data,
        len(testing_data), studentId_list, unit_list, section_list,
        problem_name_list, step_name_list, kc_list, kc_list_raw, student_dict,
        step_dict, problem_name_dict, kc_dict, problem_step_dict,
        student_problem_dict, student_unit_dict, student_kc_dict,
        student_kc_temporal, day_list)

    test_CFA = []
    for i in range(1, len(testing_result_data)):
        #skip those rows if not yet trained
        if int(testing_result_data[i][0]) <= maxtrainID + 1 or IsSkipRowNotYetTrain == False:
            test_CFA.append(testing_result_data[i][13])

    return training_data_rows, CFA_list, testing_data_rows, test_CFA
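The repeated count/sum bookkeeping above amounts to a per-key correct-first-attempt rate. A compact equivalent of that pattern, shown only as an illustration (the helper name is made up):

from collections import defaultdict

def cfa_rate_by_key(records):
    # records: iterable of (key, cfa) pairs with cfa in {'0', '1'}.
    hits = defaultdict(int)
    totals = defaultdict(int)
    for key, cfa in records:
        hits[key] += int(cfa)
        totals[key] += 1
    return {key: float(hits[key]) / totals[key] for key in totals}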
Example #5
def get_feature_vectors(training_data, IsCondenseVecMode, IsSkipRowNotYetTrain,
                        maxtrainID, dataset, N, studentId_list, unit_list,
                        section_list, problem_name_list, step_name_list,
                        kc_list, kc_list_raw, student_dict, step_dict,
                        problem_name_dict, kc_dict, problem_step_dict,
                        student_problem_dict, student_unit_dict,
                        student_kc_dict, student_kc_temporal_dict, day_list):

    rows = []
    for i in range(1, N):
        #skip those rows if not yet trained
        if IsSkipRowNotYetTrain == True and int(dataset[i][0]) > maxtrainID + 1:
            continue

        student_id_feature = get_feature_vector(studentId_list,
                                                [dataset[i][1]], 5)
        studentId_size = len(student_id_feature)

        unit, section = dataset[i][2].split(", ")
        unit_feature = get_feature_vector(unit_list, [unit], 1)
        unit_size = len(unit_feature)
        section_feature = get_feature_vector(section_list, [section], 1)
        section_size = len(section_feature)

        ppname = process_problem_name(dataset[i][3])
        problem_name_feature = get_feature_vector(problem_name_list, [ppname],
                                                  1)
        problem_name_size = len(problem_name_feature)

        problem_view_feature = [
            float(dataset[i][4]) / (float(dataset[i][4]) + 1)
        ]
        problem_view_size = len(problem_view_feature)

        p_step = process_step_name(dataset[i][5])
        step_name_feature = [
            p_step.count('+'),
            p_step.count('-'),
            p_step.count('*'),
            p_step.count('/'),
            p_step.count('{var}'),
            p_step.count('{d}'),
            p_step.count('(')
        ]
        #step_name_feature = [float(x)*2 for x in step_name_feature]
        step_name_size = len(step_name_feature)

        #print step_name_feature
        kc_feature = get_feature_vector(
            kc_list, dataset[i][len(dataset[i]) - 2].split("~~"), 1)
        kc_feature_size = len(kc_feature)

        opp_feature = get_feature_vector_opp(
            kc_list, dataset[i][len(dataset[i]) - 2].split("~~"),
            dataset[i][len(dataset[i]) - 1].split("~~"), 1)
        opp_size = len(opp_feature)

        if IsCondenseVecMode > 0:
            #CFAR
            if student_dict.has_key(dataset[i][1]):
                student_cfar = student_dict[dataset[i][1]]
            else:
                student_cfar = numpy.mean(student_dict.values())

            if step_dict.has_key(p_step):
                step_cfar = step_dict[p_step]
            else:
                step_cfar = numpy.mean(step_dict.values())

            if problem_name_dict.has_key(dataset[i][3]):
                problem_name_cfar = problem_name_dict[dataset[i][3]]
            else:
                problem_name_cfar = numpy.mean(problem_name_dict.values())

            if kc_dict.has_key(dataset[i][len(dataset[i]) - 2]):
                kc_cfar = kc_dict[dataset[i][len(dataset[i]) - 2]]
            else:
                kc_cfar = numpy.mean(kc_dict.values())

            if problem_step_dict.has_key((dataset[i][3], p_step)):
                problem_step_cfar = problem_step_dict[(dataset[i][3], p_step)]
            else:
                problem_step_cfar = numpy.mean(problem_step_dict.values())

            if student_problem_dict.has_key((dataset[i][1], dataset[i][3])):
                student_problem_cfar = student_problem_dict[(dataset[i][1],
                                                             dataset[i][3])]
            else:
                student_problem_cfar = numpy.mean(
                    student_problem_dict.values())

            if student_unit_dict.has_key((dataset[i][1], unit)):
                student_unit_cfar = student_unit_dict[(dataset[i][1], unit)]
            else:
                student_unit_cfar = numpy.mean(student_unit_dict.values())

            student_kc = (dataset[i][1], dataset[i][len(dataset[i]) - 2])
            student_kc_temporal = [0, 0]
            memory = [0, 0, 0, 0]  # [same day, within 1 week, within 1 month, older]

            if student_kc_dict.has_key(student_kc):
                student_kc_cfar = student_kc_dict[student_kc]

                itemlist = student_kc_temporal_dict[student_kc]
                # extract the historyitemlist
                historyitemlist = []
                currid = dataset[i][0]
                for rowindex in itemlist:
                    rowid = training_data[rowindex][0]
                    if int(rowid) <= int(currid):
                        historyitemlist.append(rowindex)
                        currday = day_list[rowindex]  # day of the latest prior occurrence

                #Perform the memory check
                for rowindex in historyitemlist:
                    testday = day_list[rowindex]
                    if testday > currday:
                        continue
                    elif testday == currday:
                        memory[0] = 1
                    elif testday + 7 >= currday:
                        memory[1] = 1
                    elif testday + 30 >= currday:
                        memory[2] = 1
                    else:
                        memory[3] = 1

                # Take up to the last 6 CFA/hint records for this (student, kc) pair
                historyitemlist = historyitemlist[-6:]
                if len(historyitemlist) > 0:
                    cfa_mean = 0
                    hint_mean = 0
                    for rowindex in historyitemlist:
                        cfa_mean = cfa_mean + int(training_data[rowindex][13])
                        hint_mean = hint_mean + int(training_data[rowindex][15])
                    cfa_mean = float(cfa_mean) / len(historyitemlist)
                    #hint_mean = float(hint_mean)/len(historyitemlist)
                    student_kc_temporal = [cfa_mean, 1]
            else:
                student_kc_cfar = numpy.mean(student_kc_dict.values())

            o = dataset[i][len(dataset[i]) - 1].split("~~")
            oppsum = 0
            for opp in o:
                try:
                    oppsum = oppsum + int(opp)
                except ValueError:
                    pass  # ignore non-numeric opportunity entries
            oppsum = float(oppsum) / (float(oppsum) + 1)

        #print problem_hierarchy_feature

        if IsCondenseVecMode == 0:
            rows.append(student_id_feature + unit_feature + section_feature +
                        problem_name_feature + problem_view_feature +
                        step_name_feature + kc_feature + opp_feature)

        elif IsCondenseVecMode == 1:
            rows.append(student_id_feature + unit_feature + section_feature +
                        problem_name_feature + problem_view_feature +
                        step_name_feature + kc_feature + opp_feature +
                        [student_cfar] + [step_cfar] + [problem_name_cfar] +
                        [kc_cfar] + [problem_step_cfar] +
                        [student_problem_cfar] + [student_unit_cfar] +
                        [student_kc_cfar] + student_kc_temporal + memory +
                        [oppsum])

        else:
            rows.append(problem_view_feature + [student_cfar] + [step_cfar] +
                        [problem_name_cfar] + [kc_cfar] + [problem_step_cfar] +
                        [student_problem_cfar] + [student_unit_cfar] +
                        [student_kc_cfar] + student_kc_temporal + memory +
                        [oppsum])

    print "feature vector composition: ", 'student', studentId_size, 'unit', unit_size,\
     'section', section_size, 'problem', problem_name_size, 'view', problem_view_size,\
      'step', step_name_size, 'kc', kc_feature_size, 'opportunity', opp_size

    return rows
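get_feature_vector is not defined in these snippets. Judging from the call sites above (a vocabulary list, a list of items, and a weight), it most likely builds a multi-hot vector with an extra slot for out-of-vocabulary values; the sketch below is a guess at that behaviour, not the original helper.

def get_feature_vector_sketch(vocab, items, weight):
    # Multi-hot encoding over vocab, with the last slot reserved for
    # items never seen during training (assumed behaviour).
    vec = [0] * (len(vocab) + 1)
    for item in items:
        if item in vocab:
            vec[vocab.index(item)] = weight
        else:
            vec[-1] = weight
    return vec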
Example #6
def main(arg):
    dataset = arg[1] #'algebra_2005_2006'
    training_data, testing_data, testing_result_data = load_data(dataset)
    NumOfLineToTrain = 300000 #len(training_data)

    # mode 0: normal features, 1: normal + condensed, 2: condensed only
    Feature_vector_mode = 0
    rows, CFA_list, testing_rows, test_CFA = process_data(training_data,\
     testing_data, testing_result_data, NumOfLineToTrain, False, Feature_vector_mode)
    print 'Training rows:', len(rows),'Testing rows:', len(testing_rows), \
    '# of features:', len(rows[0])


    clf = []; y_pred_list = []; M = 3  # number of sklearn classifiers
    name_list = ['KNN', 'RandomForest', 'LinearSVM',
                 'Collaborative filtering']
    #y_pred_list.append([random.randint(0,1) for i in testing_rows])

    ##############################################################

    #clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5, metric='pyfunc', func=myknndist)
    clf.append(KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10, p=2))
    clf.append(RandomForestClassifier(n_estimators=100,n_jobs=-1, verbose=False))
    clf.append(svm.LinearSVC(verbose=False, C=4.0))
    #clf = tree.DecisionTreeClassifier()

    #clf.append(GaussianNB())
    #clf = MultinomialNB(alpha=1.0)
    #clf.append(BernoulliNB(alpha=2.0, binarize=1.0))

    #clf.append(linear_model.SGDClassifier(n_jobs=-1,n_iter=1000))
    #clf.append(linear_model.LogisticRegressionCV(n_jobs=-1, verbose=False))

    #############################################################

    #Train and do prediction for each method
    for i in range(M):
        start = time.time()
        print 'Training', name_list[i], '...'
        clf[i].fit(rows, CFA_list)
        print 'Predicting', name_list[i], '...'
        y_pred_list.append(clf[i].predict(testing_rows))
        end = time.time()
        print "Time elapse: ", end-start, " sec"

    start = time.time()
    learnrate = 0.01; regular = 0.02; numofstep = 100
    matrix, students, problems, testing_sample = training(training_data[:NumOfLineToTrain], learnrate, regular, numofstep)
    y_pred_list.append(predict_from_matrix(matrix, students, problems,\
        [ (data[1].upper(), process_step_name(data[5].upper())) for data in testing_data[1:]]))
    end = time.time()
    print "Time elapse: ", end-start, " sec"

    for i in range(len(name_list)):
        print name_list[i], ' rmse= ', rmse(y_pred_list[i], test_CFA)
        print "first 30 items of prediction: ",[int(round(float(i))) for i in y_pred_list[i][:30]]

    print "first 30 items of test GT: ", [int(i) for i in test_CFA[:30]]
    print 'Please close the ROC curve plot'
    plotrocmany(test_CFA, y_pred_list, name_list)
    return
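plotrocmany is also defined outside these snippets. A minimal stand-in using sklearn.metrics.roc_curve and matplotlib, assuming binary ground truth and real-valued scores (a sketch, not the original plotting code):

import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

def plotrocmany_sketch(y_true, y_pred_list, name_list):
    # One ROC curve per prediction list, labelled with its AUC.
    y_true = [int(v) for v in y_true]
    for y_pred, name in zip(y_pred_list, name_list):
        fpr, tpr, _ = roc_curve(y_true, [float(v) for v in y_pred])
        plt.plot(fpr, tpr, label='%s (AUC=%.3f)' % (name, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance line
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()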