Example #1
def connectUser(data, connected_file_name):
    print("==> load data successful")
    u, c = counter(data['user_id'])
    # UserNumberDict = dict(zip(u, c))

    userQuesNumIndexList = getUserQuesNumIndexList(data['user_id'])
    newdata_frames = []

    print('==> begin concatenate dataset')
    for i in pp.prog_percent(range(len(u)), stream=sys.stdout):
        for k in range(len(userQuesNumIndexList)):
            if userQuesNumIndexList[k, 0] == u[i]:
                start = int(userQuesNumIndexList[k, 2])
                end = int(userQuesNumIndexList[k, 2] +
                          userQuesNumIndexList[k, 1])
                # collect the slices and concatenate once at the end,
                # instead of the quadratic DataFrame.append in a loop
                newdata_frames.append(data.iloc[start:end])
    newdata = pd.concat(newdata_frames)

    newdata = newdata.reset_index(drop=True)
    newdata.to_csv(connected_file_name, index=False)

    print(
        '==> before connect\t',
        aux.stastic_SecNumber_UserNumber_SkillNumber(data,
                                                     code0.DatasetParameter()))
    print(
        '==> after connect\t',
        aux.stastic_SecNumber_UserNumber_SkillNumber(newdata,
                                                     code0.DatasetParameter()))

    return newdata
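
For comparison, a minimal sketch of the same per-user reordering using pandas' own grouping (the function name is illustrative; it assumes only that data carries a 'user_id' column and that row order within each user should be preserved):

import pandas as pd

def connect_user_groupby(data: pd.DataFrame) -> pd.DataFrame:
    # groupby keeps the original row order inside each group, so
    # concatenating the groups makes each user's records contiguous,
    # as connectUser does with explicit index bookkeeping
    grouped = [g for _, g in data.groupby('user_id', sort=False)]
    return pd.concat(grouped).reset_index(drop=True)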
Example #2
def trainAEWeights():
    if not code0.BASELINE:
        dp = code0.DatasetParameter()
        dataset, labels = code1.load_data(dp)

        dp.skill_num = len(dataset['skill_id'].unique()) + 1
        dp.skill_set = list(dataset['skill_id'].unique())
        dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset)
        dp.seq_width = len(dp.columnsName_to_index)


        SAEconfig = code0.SAEParamsConfig()
        SAEconfig.num_steps = 30
        SAEconfig.seq_width = dp.seq_width

        g = tf.Graph()
        with g.as_default():
            model_autoencoder = SIMPLEAUTOENCODER(SAEconfig, dp)
            initializer = tf.random_uniform_initializer(-SAEconfig.init_scale, SAEconfig.init_scale)

        with tf.Session(graph=g) as sess:
            tf.initialize_all_variables().run()

            for i in range(SAEconfig.max_max_epoch):
                p = run_ae_epoch(sess, model_autoencoder, dataset, SAEconfig)
                print(str(i) + "/" + str(SAEconfig.max_max_epoch) + " epoch, avg cost ", str(p))
            model_autoencoder.saveWeights(sess)
    else:
        print("BASELINE model, don't need train weights")
Example #3
def read_data_from_csv2():
    processedFileName = './data/cmu_stat_f2011/test_data.csv'
    raw_data_txt = "./data/cmu_stat_f2011/cmu.txt"

    if os.path.exists(processedFileName):
        data = pd.read_csv(processedFileName)
        print("==> read ", processedFileName, " directly")

    else:
        if os.path.exists(raw_data_txt):
            data = pd.read_csv(raw_data_txt, sep='\t')
            print(data.columns)
            data.rename(columns={
                'Duration (sec)': 'time',
                'Outcome': 'correct',
                'KC (F2011)': 'skill_id',
                'Problem Name': 'problem_id',
                'Step Name': 'step_id',
                'Anon Student Id': 'user_id',
                "Student Response Type": "first_action",
                'Attempt At Step': "attempt_level"
            },
                        inplace=True)

            data = data.fillna(-1)

            filtered_data = data[code0.DatasetParameter(
                'cmu_stat_f2011').filtedColumnNameList]
            filtered_data = filtered_data[(filtered_data['correct'] != -1)
                                          & (filtered_data['correct'] != 'HINT')
                                          & (filtered_data['skill_id'] != '-1')
                                          & (filtered_data['time'] != '.')]

            filtered_data['correct'].replace({
                'CORRECT': 1,
                'INCORRECT': 0
            },
                                             inplace=True)

            # convert string values to consecutive integer ids
            # (1-based; 0 stays reserved for unlisted values)
            for feature in [
                    'skill_id', 'step_id', 'problem_id', 'user_id',
                    'Level (Unit)', 'Level (Module)', 'first_action',
                    'attempt_level'
            ]:
                print("==> BEGIN ", feature)
                temp_set = set(filtered_data[feature])
                temp_dict = {
                    key: value + 1
                    for value, key in enumerate(temp_set)
                }
                filtered_data[feature].replace(temp_dict, inplace=True)
                print("==> END   ", feature)

            print("==> first_action", set(filtered_data['first_action']))
            print("==> attempt_level", set(filtered_data['attempt_level']))
            # save (and return) the filtered frame, not the raw one
            data = filtered_data
            data.to_csv(processedFileName, index=False)
        else:
            raise FileNotFoundError('No data file exists!')
    return data
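
The manual set/enumerate/replace loop above can be written with pd.factorize, which assigns consecutive integer codes in order of first appearance. A minimal sketch under the same 1-based convention (the exact codes differ from iterating over a set, but any consistent mapping serves here):

import pandas as pd

def encode_categorical(df: pd.DataFrame, features: list) -> pd.DataFrame:
    # pd.factorize returns 0-based codes; shifting by 1 keeps 0
    # reserved for unlisted values, matching the loop above
    for feature in features:
        codes, _ = pd.factorize(df[feature])
        df[feature] = codes + 1
    return df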
Example #4
def time_basic_process(data):
    # -1-transfer to second unit
    print("==> transfer time unit: millsecond to second")
    tempTimeList = list(data['time'])
    newTimeList = [int(x / 1000) for x in tempTimeList]
    data['time'] = newTimeList
    del newTimeList, tempTimeList

    # -2-remove outlier records
    print('==> delete outlier of time feature')
    print('==> length before delete\t', len(data))
    data = data[(data['time'] <= code0.DatasetParameter().time_threshold)
                & (data['time'] > 0)]
    print('==> length after delete\t', len(data))

    # -3-transfer to z-score
    time_z_level = code0.DatasetParameter().time_z_level
    print('==> preprocess time to z-score based on ', time_z_level)
    time_z_id_set = np.unique(data[time_z_level])
    std_dict = {}
    mean_dict = {}
    for item_id in pp.prog_percent(time_z_id_set,
                                   stream=sys.stdout,
                                   title='==> extract mean and std of time'):
        temp_data = data[data[time_z_level] == item_id]
        temp_list = list(temp_data['time'])
        # print('--', time_z_level, item_id, '--', len(temp_list), '--')
        std_dict[item_id] = np.std(temp_list, axis=0)
        mean_dict[item_id] = np.mean(temp_list, axis=0)

    assert len(std_dict) == len(mean_dict)

    data = data.reset_index(drop=True)
    for idx in pp.prog_percent(range(len(data)),
                               stream=sys.stdout,
                               title='==> cast time to z-score'):
        data.loc[idx, 'time'] = (data.loc[idx, 'time'] -
                                 mean_dict[data.loc[idx, time_z_level]]) / (
                                     std_dict[data.loc[idx, time_z_level]] * 1.0)

    data = data.fillna(0)
    """
    plt.hist(list(data['time']), bins=np.arange(min(data['time']), max(data['time']), code0.DatasetParameter().time_interval*2))
    plt.title("time z score distribution")
    plt.savefig('./result/assistment2009/time_distribution' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png')
    """
    return data
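
The per-row .loc loop above is correct but slow on large frames; the same per-group z-score can be computed in one pass with groupby().transform. A minimal sketch, assuming time_z_level names the grouping column as above (note np.std uses the population standard deviation, hence ddof=0):

import pandas as pd

def time_to_zscore(data: pd.DataFrame, time_z_level: str) -> pd.DataFrame:
    # z = (x - group mean) / group std, computed group-wise in one pass
    grouped = data.groupby(time_z_level)['time']
    data['time'] = ((data['time'] - grouped.transform('mean'))
                    / grouped.transform(lambda s: s.std(ddof=0)))
    return data.fillna(0)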
Example #5
def attempt_correct_analysis(data):
    data = data[data['attempt_count'] <= code0.DatasetParameter().attemp_max]
    u, c = aux.counter(list(data['attempt_count']))

    attempt_list = np.arange(code0.DatasetParameter().attemp_max + 1)
    correct_rate_list = []  # fraction of correct answers per attempt count
    for item in attempt_list:
        temp_data = data[data['attempt_count'] == item]
        if len(temp_data) != 0:
            correct_rate_list.append(
                sum(temp_data['correct']) * 1.0 / len(temp_data))
        else:
            correct_rate_list.append(0)
    print(u, "\n", c)
    print(attempt_list, "\n", correct_rate_list)

    for a in correct_rate_list:
        print("%.3f" % a)
Example #6
def attemp_hint_and_correctness_analysis(data):
    data = data.reset_index(drop=True)
    bins = np.concatenate([[-1], np.arange(0.0, 1.1, 0.1)])

    for attri in ['hint_count_level', 'attempt_count_level']:
        correct_mean_list = []
        correct_std_list = []
        correct_num_list = []

        for item_index in pp.prog_percent(
                range(len(bins)),
                stream=sys.stdout,
                title='==> get correctness according to ' + attri):
            up_bin = bins[item_index] + 0.05
            down_bin = bins[item_index] - 0.05

            temp_data = data[(data[attri] >= down_bin)
                             & (data[attri] < up_bin)]
            temp_correct_list = list(temp_data['correct'])
            correct_num_list.append(len(temp_correct_list))

            if (len(temp_correct_list) != 0):
                correct_mean_list.append(np.mean(temp_correct_list, axis=0))
                correct_std_list.append(np.std(temp_correct_list, axis=0))
            else:
                correct_mean_list.append(0)
                correct_std_list.append(0)

        fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True)
        ax = axs[0]
        ax.plot(bins, correct_mean_list)
        ax.set_title('correctness ' + attri)

        boundary_list = code0.DatasetParameter().correct_boundary_list
        for number in boundary_list:
            ax.axhline(y=number,
                       xmin=0,
                       xmax=1,
                       c="red",
                       linewidth=0.5,
                       zorder=0)

        ax = axs[1]
        ax.plot(bins, correct_num_list)
        ax.set_title(attri + " number distribution")
        ax.set_xlim([-1.1, 1.1])
        plt.savefig('./result/assistment2009/' + attri + '_correctness_' +
                    str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) +
                    '.png')
Example #7
        return TensorCrossFeatures

    def get_init_value_for_train_weights(self):
        featureslist = [
            self.getSkillCorrectCrossFeature(),
            self.getCrossFeatureAll(),
            self.getCategoryFeatureInputs(),
            self.getContinuesFeatureInputs()
        ]
        x_tmp = tf.concat(2, featureslist)
        x = tf.reshape(x_tmp, [self.batch_size * self.num_steps, -1])
        return x


if __name__ == "__main__":
    dp = code0.DatasetParameter()
    ap = code0.autoencoderParameter()

    dataset, labels = code1.load_data(dp)
    # tuple_data = code1.convert_data_labels_to_tuples(dataset, labels)

    skill_num = len(
        dataset['skill_id'].unique()) + 1  # 0 for unlisted skill_id
    dp.skill_num = skill_num
    dp.skill_set = list(dataset['skill_id'].unique())
    dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(
        dataset)
    dp.seq_width = len(dp.columnsName_to_index)

    print("columns_max\n", dp.columns_max)
    print("columns_numb\n", dp.columns_numb)
Example #8
def time_add_level_process(data):
    data = data.reset_index(drop=True)
    bins = np.arange(min(data['time']), max(data['time']),
                     code0.DatasetParameter().time_interval * 2)
    correct_mean_list = []
    correct_std_list = []
    correct_num_list = []
    for item_index in pp.prog_percent(range(len(bins)),
                                      stream=sys.stdout,
                                      title='==> get correctness'):
        up_bin = bins[item_index] + code0.DatasetParameter().time_interval
        down_bin = bins[item_index] - code0.DatasetParameter().time_interval

        temp_data = data[data['time'] >= down_bin]
        temp_data = temp_data[temp_data['time'] < up_bin]

        temp_correct_list = list(temp_data['correct'])
        correct_num_list.append(len(temp_correct_list))
        if (len(temp_correct_list) != 0):
            correct_mean_list.append(np.mean(temp_correct_list, axis=0))
            correct_std_list.append(np.std(temp_correct_list, axis=0))
        else:
            correct_mean_list.append(0)
            correct_std_list.append(0)

    # plot the relationship
    fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True)
    ax = axs[0]
    ax.plot(bins, correct_mean_list)
    ax.set_title('correctness')
    boundary_list = code0.DatasetParameter().correct_boundary_list
    for number in boundary_list:
        ax.axhline(y=number, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0)

    ax = axs[1]
    ax.plot(bins, correct_num_list)
    ax.set_title("time z score distribution")

    ax.set_xlim([-2, 4])
    plt.savefig('./result/assistment2009/time_distribution_correctness_' +
                str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) +
                '.png')
    # plt.show()

    # add a time_level column according to the time boundaries
    time_level_list = []
    temp_list = list(data['time'])
    bd = code0.DatasetParameter().time_boundary_list
    # 0 ~        time <= -0.8
    # 1 ~ -0.8 < time <= -0.6
    # 2 ~ -0.6 < time <= 0
    # 3 ~    0 < time
    for idx in range(len(temp_list)):
        if temp_list[idx] <= bd[0]:
            time_level_list.append(0)
        elif bd[0] < temp_list[idx] <= bd[1]:
            time_level_list.append(1)
        elif bd[1] < temp_list[idx] <= bd[2]:
            time_level_list.append(2)
        elif temp_list[idx] > bd[2]:
            time_level_list.append(3)
        else:
            raise Exception("Error in time division")

    data['time_level'] = time_level_list
    return data
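
The boundary chain at the end maps each z-score to one of four levels; np.digitize performs the same lookup in a single vectorized call. A minimal sketch, assuming bd is the sorted three-element time_boundary_list and the same right-inclusive boundaries:

import numpy as np
import pandas as pd

def add_time_level(data: pd.DataFrame, bd: list) -> pd.DataFrame:
    # np.digitize(x, bd, right=True) returns 0 for x <= bd[0],
    # 1 for bd[0] < x <= bd[1], 2 for bd[1] < x <= bd[2], and 3 above,
    # matching the if/elif chain in time_add_level_process
    data['time_level'] = np.digitize(data['time'], bd, right=True)
    return data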
Example #9
def main(unused_args):
    if not code0.BASELINE and code0.AUTOENCODER_LABEL:
        trainAEWeights()

    dp = code0.DatasetParameter()
    dataset, labels = code1.load_data(dp)
    tuple_data = code1.convert_data_labels_to_tuples(dataset, labels)

    skill_num = len(
        dataset['skill_id'].unique()) + 1  # 0 for unlisted skill_id
    dp.skill_num = skill_num
    dp.skill_set = list(dataset['skill_id'].unique())
    dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(
        dataset)
    dp.seq_width = len(dp.columnsName_to_index)

    print("-" * 50, "\ndp.columns_max\n", dp.columns_max, "\n")
    print("-" * 50, "\ndp.columns_numb\n", dp.columns_numb, "\n")
    print("-" * 50, "\ndp.columnsName_to_index\n", dp.columnsName_to_index,
          "\n")

    config = code0.ModelParamsConfig(dp)
    eval_config = code0.ModelParamsConfig(dp)

    if dp.dataSetType == 'kdd':
        config.num_steps = 2000
    else:
        config.num_steps = aux.get_num_step(dataset)

    eval_config.num_steps = config.num_steps
    eval_config.batch_size = 2

    config.skill_num = skill_num
    eval_config.skill_num = config.skill_num

    auc_train, r2_train, rmse_train, auc_test, r2_test, rmse_test = aux.defineResult()
    CVname = auc_test.columns
    size = len(tuple_data)

    # write all the records to log file
    aux.printConfigration(config=config,
                          dp=dp,
                          train_numb=int(size * 0.8),
                          test_numb=int(size * 0.2))
    aux.logwrite([
        "==> model_continues_columns\n" + ','.join(dp.model_continues_columns)
    ], dp, True)
    aux.logwrite(
        ["==> model_category_columns\n" + ','.join(dp.model_category_columns)],
        dp, True)
    str_cross_columns_list = ['-'.join(i) for i in dp.model_cross_columns]
    str_cross_columns = ','.join(str_cross_columns_list)
    aux.logwrite(["==> model_cross_columns\n" + str_cross_columns], dp, True)

    for index, cv_num_name in enumerate(CVname):
        aux.logwrite(["\nCross-validation: \t" + str(index + 1) + "/5"],
                     dp,
                     prt=True)
        timeStamp = datetime.datetime.now().strftime("%m-%d-%H:%M")
        aux.logwrite(["\ntime:\t" + timeStamp], dp)

        train_tuple_rows = tuple_data[:int(index * 0.2 *
                                           size)] + tuple_data[int(
                                               (index + 1) * 0.2 * size):]
        test_tuple_rows = tuple_data[int(index * 0.2 * size):int((index + 1) *
                                                                 0.2 * size)]

        with tf.Graph().as_default(), tf.Session() as session:
            initializer = tf.random_uniform_initializer(
                -config.init_scale, config.init_scale)
            # training model
            print("\n==> Load Training model")
            with tf.variable_scope("model",
                                   reuse=None,
                                   initializer=initializer):
                m = code2.Model(is_training=True, config=config, dp=dp)
            # testing model
            print("\n==> Load Testing model")
            with tf.variable_scope("model",
                                   reuse=True,
                                   initializer=initializer):
                mtest = code2.Model(is_training=False,
                                    config=eval_config,
                                    dp=dp)

            tf.initialize_all_variables().run()

            print("==> begin to run epoch...")
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay**max(i - config.max_epoch, 0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                rt = session.run(m.lr)
                rmse, auc, r2 = code3.run_epoch(session,
                                                m,
                                                train_tuple_rows,
                                                m.train_op,
                                                verbose=True)
                train_result = "\n==> %s cross-validation: Train Epoch: %d\tLearning rate: %.3f\t rmse: %.3f \t auc: %.3f \t r2: %.3f" % (
                    cv_num_name, i + 1, rt, rmse, auc, r2)
                print(train_result)
                auc_train.loc[i, cv_num_name] = auc
                rmse_train.loc[i, cv_num_name] = rmse
                r2_train.loc[i, cv_num_name] = r2
                aux.logwrite(train_result, dp, False)

                display = 5
                if ((i + 1) % display == 0):
                    print("-" * 80)
                    rmse, auc, r2 = code3.run_epoch(session,
                                                    mtest, test_tuple_rows,
                                                    tf.no_op())
                    test_result = "\n==> %s cross-validation: Test Epoch: %d \t rmse: %.3f \t auc: %.3f \t r2: %.3f" % (
                        cv_num_name, (i + 1) // display, rmse, auc, r2)
                    print(test_result)
                    print("=" * 80)
                    auc_test.loc[(i + 1) // display - 1, cv_num_name] = auc
                    rmse_test.loc[(i + 1) // display - 1, cv_num_name] = rmse
                    r2_test.loc[(i + 1) // display - 1, cv_num_name] = r2
                    aux.logwrite(test_result, dp, False)
    print("==> Finsih! whole process, save result and print\t" +
          dp.currentTime)

    try:
        mean_result = pd.DataFrame({
            "AUC": list(auc_test.mean(1)),
            "RMSE": list(rmse_test.mean(1)),
            "R2": list(r2_test.mean(1))
        })
        print(mean_result)
        aux.saveResult(dp, auc_train, rmse_train, r2_train, auc_test,
                       rmse_test, r2_test, mean_result)
    except Exception:
        print("exception while saving result")
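
The rotating 80/20 split above holds out one fifth of the user sequences per fold; a minimal sketch of the same index arithmetic on a toy list (names are illustrative):

def five_fold_split(rows, fold):
    # fold in 0..4: the fold-th fifth is held out for testing,
    # the remaining four fifths form the training set
    size = len(rows)
    lo, hi = int(fold * 0.2 * size), int((fold + 1) * 0.2 * size)
    return rows[:lo] + rows[hi:], rows[lo:hi]

train, test = five_fold_split(list(range(10)), fold=0)
# train == [2, 3, 4, 5, 6, 7, 8, 9], test == [0, 1]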
Example #10
def main(unused_args):
    aux.check_directories()

    if not code0.BASELINE and code0.AUTOENCODER_LABEL:
        trainAEWeights()

    dp = code0.DatasetParameter()
    dataset, labels = code1.load_data(dp)
    tuple_data = code1.convert_data_labels_to_tuples(dataset, labels)

    skill_num = len(dataset['skill_id'].unique()) + 1
    dp.skill_num = skill_num
    dp.skill_set = list(dataset['skill_id'].unique())
    dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset)
    dp.seq_width = len(dp.columnsName_to_index)

    print("-" * 50, "\ndp.columns_max\n", dp.columns_max, "\n")
    print("-" * 50, "\ndp.columns_numb\n", dp.columns_numb, "\n")
    print("-" * 50, "\ndp.columnsName_to_index\n", dp.columnsName_to_index, "\n")

    config = code0.ModelParamsConfig(dp)
    eval_config = code0.ModelParamsConfig(dp)

    if dp.dataSetType in ('kdd', 'cmu_stat_f2011'):
        config.num_steps = 1500
    else:
        config.num_steps = aux.get_num_step(dataset)

    eval_config.num_steps = config.num_steps
    eval_config.batch_size = 2

    config.skill_num = skill_num
    eval_config.skill_num = config.skill_num

    name_list = ['cv', 'epoch', 'type', 'rmse', 'auc', 'r2', 'inter_rmse', 'inter_auc', 'inter_r2', 'intra_rmse',
                 'intra_auc', 'intra_r2']
    result_data = pd.DataFrame(columns=name_list)
    CVname = ['c1', 'c2', 'c3', 'c4', 'c5']
    size = len(tuple_data)

    # write all the records to log file
    aux.printConfigration(config=config, dp=dp, train_numb=int(size * 0.8), test_numb=int(size * 0.2))
    aux.logwrite(["==> model_continues_columns\n" + ','.join(dp.model_continues_columns)], dp, True)
    aux.logwrite(["==> model_category_columns\n" + ','.join(dp.model_category_columns)], dp, True)
    str_cross_columns_list = ['-'.join(i) for i in dp.model_cross_columns]
    str_cross_columns = ','.join(str_cross_columns_list)
    aux.logwrite(["==> model_cross_columns\n" + str_cross_columns], dp, True)

    for index, cv_num_name in enumerate(CVname):
        aux.logwrite(["\nCross-validation: \t" + str(index + 1) + "/5"], dp, prt=True)
        timeStamp = datetime.datetime.now().strftime("%m-%d-%H:%M")
        aux.logwrite(["\ntime:\t" + timeStamp], dp)

        train_tuple_rows = tuple_data[:int(index * 0.2 * size)] + tuple_data[int((index + 1) * 0.2 * size):]
        test_tuple_rows = tuple_data[int(index * 0.2 * size): int((index + 1) * 0.2 * size)]

        with tf.Graph().as_default(), tf.Session() as session:
            initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
            # training model
            print("\n==> Load Training model")
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                m = code2.Model(is_training=True, config=config, dp=dp)
            # testing model
            print("\n==> Load Testing model")
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                mtest = code2.Model(is_training=False, config=eval_config, dp=dp)

            tf.initialize_all_variables().run()

            print("==> begin to run epoch...")
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay ** max(i - config.max_epoch, 0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                rt = session.run(m.lr)
                rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch(
                    session, m, train_tuple_rows, m.train_op, verbose=True)

                aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse,
                                 intra_auc, intra_r2, 'train')

                result_data = result_data.append(pd.Series(
                    [cv_num_name, i, 'train', rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc,
                     intra_r2], index=name_list), ignore_index=True)

                display = 5
                if ((i + 1) % display == 0):
                    print('BEGIN', "-" * 80)
                    rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch(
                        session, mtest, test_tuple_rows, tf.no_op())
                    aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse,
                                     intra_auc, intra_r2, 'test', display)
                    print('END--', "-" * 80)

                    result_data = result_data.append(pd.Series(
                        [cv_num_name, (i + 1) // display, 'test', rmse, auc, r2, inter_rmse, inter_auc, inter_r2,
                         intra_rmse, intra_auc, intra_r2], index=name_list), ignore_index=True)

                #print ("-*"*50,"\n",result_data)

    print("==> Finsih! whole process, save result and print\t" + dp.currentTime)

    temp_data = result_data[result_data['type'] == 'test']
    for idx in set(temp_data['epoch']):
        tp = temp_data[temp_data['epoch'] == idx]
        result_data = result_data.append(pd.Series(
            ['average', idx, 'test_mean', tp['rmse'].mean(), tp['auc'].mean(), tp['r2'].mean(), tp['inter_rmse'].mean(),
             tp['inter_auc'].mean(), tp['inter_r2'].mean(), tp['intra_rmse'].mean(), tp['intra_auc'].mean(),
             tp['intra_r2'].mean()], index=name_list), ignore_index=True)

    print(result_data[result_data['cv'] == 'average'])
    result_path = './result/' + code0.DATASETTYPE + '/result_' + timeStamp + '.csv'
    result_data.to_csv(result_path)
    print('==> save to ' + result_path)
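
The fold-averaging loop at the end can equally be written as one groupby over the test rows; a minimal sketch, assuming the result_data layout defined above:

import pandas as pd

def test_fold_means(result_data: pd.DataFrame) -> pd.DataFrame:
    # average every metric over the five folds, per test epoch
    metric_cols = ['rmse', 'auc', 'r2', 'inter_rmse', 'inter_auc', 'inter_r2',
                   'intra_rmse', 'intra_auc', 'intra_r2']
    test_rows = result_data[result_data['type'] == 'test']
    return test_rows.groupby('epoch')[metric_cols].mean()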