def connectUser(data, connected_file_name):
    """Regroup `data` so that all rows of each user are contiguous, save to CSV.

    Uses the (user_id, run_length, start_index) table from
    getUserQuesNumIndexList to pull every segment belonging to each user, in
    user order, and concatenates them into one frame.

    Args:
        data: source DataFrame with a 'user_id' column.
        connected_file_name: path the regrouped frame is written to (no index).

    Returns:
        The regrouped DataFrame with a fresh 0..n-1 index.
    """
    print("==> load data successful")
    u, c = counter(data['user_id'])
    # rows: [user_id, segment_length, segment_start_index] — presumably one
    # row per contiguous run of a user in `data`; verify against the helper.
    userQuesNumIndexList = getUserQuesNumIndexList(data['user_id'])
    print('==> begin concatenate dataset')
    # PERF FIX: collect slices and concat once instead of the quadratic
    # `newdata = newdata.append(temp)` pattern; row order is unchanged.
    pieces = []
    for i in pp.prog_percent(range(len(u)), stream=sys.stdout):
        for k in range(len(userQuesNumIndexList)):
            if userQuesNumIndexList[k, 0] == u[i]:
                start = int(userQuesNumIndexList[k, 2])
                stop = int(userQuesNumIndexList[k, 2] + userQuesNumIndexList[k, 1])
                pieces.append(data.iloc[start:stop])
    newdata = pd.concat(pieces) if pieces else pd.DataFrame()
    # BUG FIX: reset_index returns a new frame; the original discarded the
    # result, so the returned frame kept the old scattered index.
    newdata = newdata.reset_index(drop=True)
    newdata.to_csv(connected_file_name, index=False)
    print(
        '==> before connect\t',
        aux.stastic_SecNumber_UserNumber_SkillNumber(data, code0.DatasetParameter()))
    print(
        '==> after connect\t',
        aux.stastic_SecNumber_UserNumber_SkillNumber(newdata, code0.DatasetParameter()))
    return newdata
def trainAEWeights():
    """Train the simple autoencoder and persist its weights.

    No-op (besides a message) when code0.BASELINE is set. Otherwise loads the
    dataset, fills the dataset-dependent fields of DatasetParameter, trains
    SIMPLEAUTOENCODER for max_max_epoch epochs in its own graph/session, and
    saves the learned weights via model_autoencoder.saveWeights.
    """
    if code0.BASELINE:
        print("BASELINE model, don't need train weights")
        return
    dp = code0.DatasetParameter()
    dataset, labels = code1.load_data(dp)
    dp.skill_num = len(dataset['skill_id'].unique()) + 1
    dp.skill_set = list(dataset['skill_id'].unique())
    dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset)
    dp.seq_width = len(dp.columnsName_to_index)
    SAEconfig = code0.SAEParamsConfig()
    SAEconfig.num_steps = 30
    SAEconfig.seq_width = dp.seq_width
    g = tf.Graph()
    with g.as_default():
        model_autoencoder = SIMPLEAUTOENCODER(SAEconfig, dp)
        # BUG FIX: dropped the unused `initializer` local — it was built from
        # tf.random_uniform_initializer but never passed to any scope.
        with tf.Session(graph=g) as sess:
            tf.initialize_all_variables().run()
            for i in range(SAEconfig.max_max_epoch):
                p = run_ae_epoch(sess, model_autoencoder, dataset, SAEconfig)
                print(str(i) + "/" + str(SAEconfig.max_max_epoch) + " epoch,avgcost ", str(p))
            # save once after training — TODO confirm against the original,
            # which is whitespace-mangled at this point.
            model_autoencoder.saveWeights(sess)
def read_data_from_csv2():
    """Load the cmu_stat_f2011 dataset, preprocessing the raw export on first use.

    If the processed CSV exists it is read directly. Otherwise the raw
    tab-separated txt is parsed, columns are renamed to the project schema,
    invalid rows are filtered, categorical features are mapped to 1-based
    integers, and the result is cached to the processed CSV path.

    Returns:
        The loaded DataFrame (the renamed full frame, not the filtered one —
        NOTE(review): `data`, not `filer_data`, is saved and returned; looks
        intentional but verify against callers).

    Raises:
        FileNotFoundError: if neither the processed CSV nor the raw txt exists.
    """
    processedFileName = './data/cmu_stat_f2011/test_data.csv'
    raw_data_txt = "./data/cmu_stat_f2011/cmu.txt"
    if os.path.exists(processedFileName):
        data = pd.read_csv(processedFileName)
        print("==> read ", processedFileName, " directly")
    else:
        if os.path.exists(raw_data_txt):
            # NOTE: delimiter='\t' overrides sep=" " — the file is tab-separated.
            data = pd.read_csv(raw_data_txt, sep=" ", delimiter='\t')
            print(data.columns)
            data.rename(columns={
                'Duration (sec)': 'time',
                'Outcome': 'correct',
                'KC (F2011)': 'skill_id',
                'Problem Name': 'problem_id',
                'Step Name': 'step_id',
                'Anon Student Id': 'user_id',
                "Student Response Type": "first_action",
                'Attempt At Step': "attempt_level"
            }, inplace=True)
            data = data.fillna(-1)
            filer_data = data[code0.DatasetParameter(
                'cmu_stat_f2011').filtedColumnNameList]
            # drop unanswered rows, hint responses, unknown skills, bad times
            filer_data = filer_data[(filer_data['correct'] != -1)
                                    & (filer_data['correct'] != 'HINT')
                                    & (filer_data['skill_id'] != '-1')
                                    & (filer_data['time'] != '.')]
            filer_data['correct'].replace({
                'CORRECT': 1,
                'INCORRECT': 0
            }, inplace=True)
            # change str categories to 1-based integer codes
            for feature in [
                    'skill_id', 'step_id', 'problem_id', 'user_id',
                    'Level (Unit)', 'Level (Module)', 'first_action',
                    'attempt_level'
            ]:
                print("==> BEGIN ", feature)
                temp_set = set(list(filer_data[feature]))
                temp_dict = {
                    key: value + 1
                    for value, key in enumerate(temp_set)
                }
                filer_data[feature].replace(temp_dict, inplace=True)
                print("==> END ", feature)
            print("==> first_action", set(filer_data['first_action']))
            print("==> attempt_level", set(filer_data['attempt_level']))
            data.to_csv(processedFileName, index=False)
        else:
            # BUG FIX: the original `raise ('No data file exists!')` raised a
            # TypeError (a str is not an exception); raise a real exception.
            raise FileNotFoundError('No data file exists!')
    return data
def time_basic_process(data):
    """Convert 'time' to seconds, drop outliers, and z-score it per group.

    Steps: (1) millisecond -> truncated integer seconds, (2) remove rows with
    time <= 0 or above DatasetParameter().time_threshold, (3) standardize each
    record's time against the mean/std of its group, where the grouping column
    is DatasetParameter().time_z_level. NaNs from the standardization are
    replaced with 0.

    Args:
        data: DataFrame with a numeric 'time' column (assumed milliseconds —
            TODO confirm with the data source) and the time_z_level column.

    Returns:
        A new DataFrame (filtered, re-indexed) with z-scored 'time'.
    """
    # -1- transfer to second unit; int() truncates toward zero like the
    # original per-element conversion
    print("==> transfer time unit: millsecond to second")
    data['time'] = [int(x / 1000) for x in list(data['time'])]
    # -2- remove outlier records
    print('==> delete outlier of time feature')
    print('==> length before delete\t', len(data))
    data = data[(data['time'] <= code0.DatasetParameter().time_threshold)
                & (data['time'] > 0)]
    print('==> length after delete\t', len(data))
    # -3- transfer to z-score, grouped by the configured level
    time_z_level = code0.DatasetParameter().time_z_level
    print('==> preprocerss time to z-score based on ', time_z_level)
    time_z_id_set = np.unique(data[time_z_level])
    std_dict = {}
    mean_dict = {}
    for item_id in pp.prog_percent(time_z_id_set,
                                   stream=sys.stdout,
                                   title='==> extract mean and std of time'):
        group_times = list(data[data[time_z_level] == item_id]['time'])
        std_dict[item_id] = np.std(group_times, axis=0)   # population std (ddof=0)
        mean_dict[item_id] = np.mean(group_times, axis=0)
    assert len(std_dict) == len(mean_dict)
    data = data.reset_index(drop=True)
    # PERF FIX: vectorized z-score via Series.map instead of the original
    # per-row `data.loc[id, 'time'] = ...` loop — identical arithmetic
    # (including inf when a group's std is 0), dramatically faster.
    group_mean = data[time_z_level].map(mean_dict)
    group_std = data[time_z_level].map(std_dict)
    data['time'] = (data['time'] - group_mean) / (group_std * 1.0)
    data = data.fillna(0)
    return data
def attempt_correct_analysis(data):
    """Print the correctness ratio for each attempt count up to attemp_max.

    Rows above the configured attempt cap are dropped; for every attempt
    count 0..attemp_max the share of correct answers is printed (0 when no
    rows have that count).
    """
    # keep only records within the configured attempt-count cap
    data = data[data['attempt_count'] <= code0.DatasetParameter().attemp_max]
    u, c = aux.counter(list(data['attempt_count']))
    attempt_values = np.arange(code0.DatasetParameter().attemp_max + 1)
    correct_ratios = []
    for attempt in attempt_values:
        matching = data[data['attempt_count'] == attempt]
        if len(matching) != 0:
            correct_ratios.append(sum(matching['correct']) * 1.0 / len(matching))
        else:
            correct_ratios.append(0)
    print(u, "\n", c)
    print(attempt_values, "\n", correct_ratios)
    for ratio in correct_ratios:
        print("%.3f" % ratio)
def attemp_hint_and_correctness_analysis(data):
    """Plot mean correctness and record counts against hint/attempt levels.

    For each of 'hint_count_level' and 'attempt_count_level', records are
    bucketed around the centers [-1, 0.0, 0.1, ..., 1.0] (center +/- 0.05)
    and a two-panel figure (mean correctness on top, bucket sizes below) is
    saved under ./result/assistment2009/.
    """
    data = data.reset_index(drop=True)
    bin_centers = np.concatenate([[-1], np.arange(0.0, 1.1, 0.1)])
    for attri in ['hint_count_level', 'attempt_count_level']:
        means = []
        stds = []
        counts = []
        for center in pp.prog_percent(
                bin_centers,
                stream=sys.stdout,
                title='==> get correctness according to ' + attri):
            in_bucket = data[(data[attri] >= center - 0.05)
                             & (data[attri] < center + 0.05)]
            correct_values = list(in_bucket['correct'])
            counts.append(len(correct_values))
            if len(correct_values) != 0:
                means.append(np.mean(correct_values, axis=0))
                stds.append(np.std(correct_values, axis=0))
            else:
                means.append(0)
                stds.append(0)
        # top panel: mean correctness with configured boundary guide lines
        fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True)
        top = axs[0]
        top.plot(bin_centers, means)
        top.set_title('correctness ' + attri)
        for boundary in code0.DatasetParameter().correct_boundary_list:
            top.axhline(y=boundary, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0)
        # bottom panel: how many records fell into each bucket
        bottom = axs[1]
        bottom.plot(bin_centers, counts)
        bottom.set_title(attri + " number distribution")
        bottom.set_xlim([-1.1, 1.1])
        plt.savefig('./result/assistment2009/' + attri + '_correctness_' +
                    str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) +
                    '.png')
        # Tail of a method whose `def` lies outside this chunk — returns the
        # cross-feature tensor built above (not visible here).
        return TensorCrossFeatures

    def get_init_value_for_train_weights(self):
        """Concatenate all feature tensors and flatten to (batch*steps, -1).

        Stacks skill/correct cross features, all cross features, category
        features and continuous features along axis 2 (tf.concat with the
        old TF 0.x argument order), then reshapes so each time step of each
        batch element becomes one row.
        """
        featureslist = [
            self.getSkillCorrectCrossFeature(),
            self.getCrossFeatureAll(),
            self.getCategoryFeatureInputs(),
            self.getContinuesFeatureInputs()
        ]
        x_tmp = tf.concat(2, featureslist)
        # one row per (batch element, time step)
        x = tf.reshape(x_tmp, [self.batch_size * self.num_steps, -1])
        return x


if __name__ == "__main__":
    # Stand-alone smoke run: load the dataset and print the derived column
    # metadata without training anything.
    dp = code0.DatasetParameter()
    ap = code0.autoencoderParameter()
    dataset, labels = code1.load_data(dp)
    # tuple_data = code1.convert_data_labels_to_tuples(dataset, labels)
    skill_num = len(
        dataset['skill_id'].unique()) + 1  # 0 for unlisted skill_id
    dp.skill_num = skill_num
    dp.skill_set = list(dataset['skill_id'].unique())
    dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(
        dataset)
    dp.seq_width = len(dp.columnsName_to_index)
    print("columns_max\n", dp.columns_max)
    print("columns_numb\n", dp.columns_numb)
def time_add_level_process(data):
    """Plot correctness vs. time z-score, then append a 'time_level' column.

    First saves a two-panel figure (mean correctness per time bin on top,
    bin sizes below) to ./result/assistment2009/. Then discretizes each
    time z-score into levels 0-3 using DatasetParameter().time_boundary_list.
    """
    data = data.reset_index(drop=True)
    half_width = code0.DatasetParameter().time_interval
    bin_centers = np.arange(min(data['time']), max(data['time']), half_width * 2)
    mean_per_bin = []
    std_per_bin = []
    count_per_bin = []
    for bin_idx in pp.prog_percent(range(len(bin_centers)),
                                   stream=sys.stdout,
                                   title='==> get correctness'):
        upper = bin_centers[bin_idx] + half_width
        lower = bin_centers[bin_idx] - half_width
        in_bin = data[data['time'] >= lower]
        in_bin = in_bin[in_bin['time'] < upper]
        correct_values = list(in_bin['correct'])
        count_per_bin.append(len(correct_values))
        if len(correct_values) != 0:
            mean_per_bin.append(np.mean(correct_values, axis=0))
            std_per_bin.append(np.std(correct_values, axis=0))
        else:
            mean_per_bin.append(0)
            std_per_bin.append(0)
    # plot the relationship: mean correctness (top) and bin sizes (bottom)
    fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True)
    top = axs[0]
    top.plot(bin_centers, mean_per_bin)
    top.set_title('correctness')
    for boundary in code0.DatasetParameter().correct_boundary_list:
        top.axhline(y=boundary, xmin=0, xmax=1, c="red", linewidth=0.5, zorder=0)
    bottom = axs[1]
    bottom.plot(bin_centers, count_per_bin)
    bottom.set_title("time z score distribution")
    bottom.set_xlim([-2, 4])
    plt.savefig('./result/assistment2009/time_distribution_correctness_' +
                str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.png')
    # plt.show()

    # add a column according to the time boundaries:
    #   0: time <= bd[0]          1: bd[0] < time <= bd[1]
    #   2: bd[1] < time <= bd[2]  3: time > bd[2]
    bd = code0.DatasetParameter().time_boundary_list
    levels = []
    for value in list(data['time']):
        if value <= bd[0]:
            levels.append(0)
        elif value <= bd[1]:
            levels.append(1)
        elif value <= bd[2]:
            levels.append(2)
        elif value > bd[2]:
            levels.append(3)
        else:
            # only reachable for non-comparable values (e.g. NaN)
            raise Exception("Error in time division")
    data['time_level'] = levels
    return data
def main(unused_args):
    """Run 5-fold cross-validated training and evaluation, then save results.

    Loads the dataset, fills dataset-dependent fields of DatasetParameter,
    builds train/eval model configs, trains one model per CV fold with
    exponential learning-rate decay, evaluates every `display` epochs, and
    finally aggregates and saves AUC/RMSE/R2 tables via aux.saveResult.
    """
    # Optionally pre-train autoencoder weights before the main model.
    if not code0.BASELINE and code0.AUTOENCODER_LABEL:
        trainAEWeights()
    dp = code0.DatasetParameter()
    dataset, labels = code1.load_data(dp)
    tuple_data = code1.convert_data_labels_to_tuples(dataset, labels)
    skill_num = len(
        dataset['skill_id'].unique()) + 1  # 0 for unlisted skill_id
    dp.skill_num = skill_num
    dp.skill_set = list(dataset['skill_id'].unique())
    dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(
        dataset)
    dp.seq_width = len(dp.columnsName_to_index)
    print("-" * 50, "\ndp.columns_max\n", dp.columns_max, "\n")
    print("-" * 50, "\ndp.columns_numb\n", dp.columns_numb, "\n")
    print("-" * 50, "\ndp.columnsName_to_index\n", dp.columnsName_to_index, "\n")
    config = code0.ModelParamsConfig(dp)
    eval_config = code0.ModelParamsConfig(dp)
    # kdd uses a fixed step count; other datasets derive it from the data
    if dp.dataSetType == 'kdd':
        config.num_steps = 2000
    else:
        config.num_steps = aux.get_num_step(dataset)
    eval_config.num_steps = config.num_steps
    eval_config.batch_size = 2
    config.skill_num = skill_num
    eval_config.skill_num = config.skill_num
    auc_train, r2_train, rmse_train, auc_test, r2_test, rmse_test = aux.defineResult()
    CVname = auc_test.columns
    size = len(tuple_data)
    # write all the records to log file
    aux.printConfigration(config=config, dp=dp,
                          train_numb=int(size * 0.8), test_numb=int(size * 0.2))
    aux.logwrite([
        "==> model_continues_columns\n" + ','.join(dp.model_continues_columns)
    ], dp, True)
    aux.logwrite(
        ["==> model_category_columns\n" + ','.join(dp.model_category_columns)],
        dp, True)
    str_cross_columns_list = ['-'.join(i) for i in dp.model_cross_columns]
    str_cross_columns = ','.join(str_cross_columns_list)
    aux.logwrite(["==> model_cross_columns\n" + str_cross_columns], dp, True)
    for index, cv_num_name in enumerate(CVname):
        aux.logwrite(["\nCross-validation: \t" + str(index + 1) + "/5"], dp, prt=True)
        timeStampe = datetime.datetime.now().strftime("%m-%d-%H:%M")
        aux.logwrite(["\ntime:\t" + timeStampe], dp)
        # hold out the index-th 20% slice for testing; train on the rest
        train_tuple_rows = tuple_data[:int(index * 0.2 * size)] + tuple_data[int(
            (index + 1) * 0.2 * size):]
        test_tuple_rows = tuple_data[int(index * 0.2 * size):int((index + 1) * 0.2 * size)]
        with tf.Graph().as_default(), tf.Session() as session:
            initializer = tf.random_uniform_initializer(
                -config.init_scale, config.init_scale)
            # training model
            print("\n==> Load Training model")
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                m = code2.Model(is_training=True, config=config, dp=dp)
            # testing model shares variables with the training model (reuse=True)
            print("\n==> Load Testing model")
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                mtest = code2.Model(is_training=False, config=eval_config, dp=dp)
            tf.initialize_all_variables().run()
            print("==> begin to run epoch...")
            for i in range(config.max_max_epoch):
                # exponential learning-rate decay once past max_epoch
                lr_decay = config.lr_decay**max(i - config.max_epoch, 0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                rt = session.run(m.lr)
                rmse, auc, r2 = code3.run_epoch(session, m, train_tuple_rows,
                                                m.train_op, verbose=True)
                train_result = "\n==> %s cross-valuation: Train Epoch: %d\tLearning rate: %.3f\t rmse: %.3f \t auc: %.3f \t r2: %.3f" % (
                    cv_num_name, i + 1, rt, rmse, auc, r2)
                print(train_result)
                auc_train.loc[i, cv_num_name] = auc
                rmse_train.loc[i, cv_num_name] = rmse
                r2_train.loc[i, cv_num_name] = r2
                aux.logwrite(train_result, dp, False)
                display = 5
                if ((i + 1) % display == 0):
                    print("-" * 80)
                    rmse, auc, r2 = code3.run_epoch(session, mtest,
                                                    test_tuple_rows, tf.no_op())
                    # BUG FIX: use floor division — under Python 3 `/` made the
                    # epoch number and the .loc row labels floats. The guard
                    # above ensures (i+1) is divisible by display, so the value
                    # itself is unchanged.
                    test_result = "\n==> %s cross-valuation: Test Epoch: %d \t rmse: %.3f \t auc: %.3f \t r2: %.3f" % (
                        cv_num_name, (i + 1) // display, rmse, auc, r2)
                    print(test_result)
                    print("=" * 80)
                    auc_test.loc[(i + 1) // display - 1, cv_num_name] = auc
                    rmse_test.loc[(i + 1) // display - 1, cv_num_name] = rmse
                    r2_test.loc[(i + 1) // display - 1, cv_num_name] = r2
                    aux.logwrite(test_result, dp, False)
    print("==> Finsih! whole process, save result and print\t" + dp.currentTime)
    try:
        mean_result = pd.DataFrame({
            "AUC": list(auc_test.mean(1)),
            "RMSE": list(rmse_test.mean(1)),
            "R2": list(r2_test.mean(1))
        })
        print(mean_result)
        aux.saveResult(dp, auc_train, rmse_train, r2_train, auc_test,
                       rmse_test, r2_test, mean_result)
    except Exception:
        # BUG FIX: narrowed the bare `except:` — saving stays best-effort but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("except during save result")
def main(unused_args):
    """Run 5-fold cross-validated training/evaluation with inter/intra metrics.

    NOTE(review): this is a second `def main` in the same module and shadows
    the earlier one at import time — confirm the earlier definition is
    intentionally dead. Unlike the earlier variant, each epoch reports
    overall plus inter-/intra- rmse/auc/r2, everything is accumulated in one
    long-format DataFrame, and per-epoch test means are appended as
    'average' rows before saving to ./result/<dataset>/result_<time>.csv.
    """
    aux.check_directories()
    # Optionally pre-train autoencoder weights before the main model.
    if not code0.BASELINE and code0.AUTOENCODER_LABEL:
        trainAEWeights()
    dp = code0.DatasetParameter()
    dataset, labels = code1.load_data(dp)
    tuple_data = code1.convert_data_labels_to_tuples(dataset, labels)
    skill_num = len(dataset['skill_id'].unique()) + 1  # 0 reserved for unlisted skill_id
    dp.skill_num = skill_num
    dp.skill_set = list(dataset['skill_id'].unique())
    dp.columns_max, dp.columns_numb, dp.columnsName_to_index = code1.get_columns_info(dataset)
    dp.seq_width = len(dp.columnsName_to_index)
    print("-" * 50, "\ndp.columns_max\n", dp.columns_max, "\n")
    print("-" * 50, "\ndp.columns_numb\n", dp.columns_numb, "\n")
    print("-" * 50, "\ndp.columnsName_to_index\n", dp.columnsName_to_index, "\n")
    config = code0.ModelParamsConfig(dp)
    eval_config = code0.ModelParamsConfig(dp)
    # kdd and cmu_stat_f2011 use a fixed step count; others derive it
    if dp.dataSetType == 'kdd':
        config.num_steps = 1500
    elif dp.dataSetType == 'cmu_stat_f2011':
        config.num_steps = 1500
    else:
        config.num_steps = aux.get_num_step(dataset)
    eval_config.num_steps = config.num_steps
    eval_config.batch_size = 2
    config.skill_num = skill_num
    eval_config.skill_num = config.skill_num
    # long-format result table: one row per (cv fold, epoch, train/test)
    name_list = ['cv', 'epoch', 'type', 'rmse', 'auc', 'r2',
                 'inter_rmse', 'inter_auc', 'inter_r2',
                 'intra_rmse', 'intra_auc', 'intra_r2']
    result_data = pd.DataFrame(columns=name_list)
    CVname = ['c1', 'c2', 'c3', 'c4', 'c5']
    size = len(tuple_data)
    # write all the records to log file
    aux.printConfigration(config=config, dp=dp,
                          train_numb=int(size * 0.8), test_numb=int(size * 0.2))
    aux.logwrite(["==> model_continues_columns\n" + ','.join(dp.model_continues_columns)], dp, True)
    aux.logwrite(["==> model_category_columns\n" + ','.join(dp.model_category_columns)], dp, True)
    str_cross_columns_list = ['-'.join(i) for i in dp.model_cross_columns]
    str_cross_columns = ','.join(str_cross_columns_list)
    aux.logwrite(["==> model_cross_columns\n" + str_cross_columns], dp, True)
    for index, cv_num_name in enumerate(CVname):
        aux.logwrite(["\nCross-validation: \t" + str(index + 1) + "/5"], dp, prt=True)
        timeStampe = datetime.datetime.now().strftime("%m-%d-%H:%M")
        aux.logwrite(["\ntime:\t" + timeStampe], dp)
        # hold out the index-th 20% slice for testing; train on the rest
        train_tuple_rows = tuple_data[:int(index * 0.2 * size)] + tuple_data[int((index + 1) * 0.2 * size):]
        test_tuple_rows = tuple_data[int(index * 0.2 * size): int((index + 1) * 0.2 * size)]
        with tf.Graph().as_default(), tf.Session() as session:
            initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
            # training model
            print("\n==> Load Training model")
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                m = code2.Model(is_training=True, config=config, dp=dp)
            # testing model shares variables with the training model (reuse=True)
            print("\n==> Load Testing model")
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                mtest = code2.Model(is_training=False, config=eval_config, dp=dp)
            tf.initialize_all_variables().run()
            print("==> begin to run epoch...")
            for i in range(config.max_max_epoch):
                # exponential learning-rate decay once past max_epoch
                lr_decay = config.lr_decay ** max(i - config.max_epoch, 0)
                m.assign_lr(session, config.learning_rate * lr_decay)
                rt = session.run(m.lr)
                rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch(
                    session, m, train_tuple_rows, m.train_op, verbose=True)
                aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2,
                                 inter_rmse, inter_auc, inter_r2,
                                 intra_rmse, intra_auc, intra_r2, 'train')
                result_data = result_data.append(pd.Series(
                    [cv_num_name, i, 'train', rmse, auc, r2, inter_rmse,
                     inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2],
                    index=name_list), ignore_index=True)
                display = 5
                if ((i + 1) % display == 0):
                    print('BEGIN', "-" * 80)
                    rmse, auc, r2, inter_rmse, inter_auc, inter_r2, intra_rmse, intra_auc, intra_r2 = code3.run_epoch(
                        session, mtest, test_tuple_rows, tf.no_op())
                    aux.print_result(dp, cv_num_name, i, rt, rmse, auc, r2,
                                     inter_rmse, inter_auc, inter_r2,
                                     intra_rmse, intra_auc, intra_r2, 'test', display)
                    print('END--', "-" * 80)
                    # NOTE(review): (i + 1) / display is a float under
                    # Python 3, so test 'epoch' values are floats — confirm
                    # downstream consumers expect that.
                    result_data = result_data.append(pd.Series(
                        [cv_num_name, (i + 1) / display, 'test', rmse, auc,
                         r2, inter_rmse, inter_auc, inter_r2, intra_rmse,
                         intra_auc, intra_r2],
                        index=name_list), ignore_index=True)
    #print ("-*"*50,"\n",result_data)
    print("==> Finsih! whole process, save result and print\t" + dp.currentTime)
    # append per-epoch means over the test rows as 'average'/'test_mean' rows
    temp_data = result_data[result_data['type'] == 'test']
    for idx in set(temp_data['epoch']):
        tp = temp_data[temp_data['epoch'] == idx]
        result_data = result_data.append(pd.Series(
            ['average', idx, 'test_mean', tp['rmse'].mean(), tp['auc'].mean(),
             tp['r2'].mean(), tp['inter_rmse'].mean(), tp['inter_auc'].mean(),
             tp['inter_r2'].mean(), tp['intra_rmse'].mean(),
             tp['intra_auc'].mean(), tp['intra_r2'].mean()],
            index=name_list), ignore_index=True)
    print(result_data[result_data['cv']=='average'])
    # timeStampe is the timestamp of the LAST CV fold (set inside the loop)
    result_data.to_csv('./result/'+code0.DATASETTYPE+'/result_'+timeStampe+'.csv')
    print('==> save to ./result/'+code0.DATASETTYPE+'/result_'+timeStampe+'.csv')