def _fit(self, data, skill, forgets, preload=False):
    """Fit one skill's BKT model, keeping the best of ``self.num_fits`` random EM starts.

    Args:
        data: model-ready data dict containing "resource_names" and "gs_names".
        skill: skill identifier, used to look up manual parameter overrides.
        forgets: if truthy, also randomize the forget-rate initialization.
        preload: if True, skip EM entirely and return an initialized model.

    Returns:
        The fitted model dict, augmented with "learns", "forgets", "prior",
        "resource_names" and "gs_names" entries derived from the raw params.
    """
    num_learns = len(data["resource_names"])
    num_gs = len(data["gs_names"])
    self._check_manual_param_init(num_learns, num_gs, skill)

    top_likelihood = float("-inf")
    top_model = None
    for _ in range(self.num_fits):
        candidate = random_model_uni.random_model_uni(num_learns, num_gs)
        if forgets:
            # forgetting is modeled: draw its rates at random as well
            candidate["forgets"] = np.random.uniform(size=candidate["forgets"].shape)
        if self.manual_param_init and skill in self.fit_model:
            # seed the candidate with any manually supplied parameter values
            for name, value in self.fit_model[skill].items():
                if name in candidate:
                    candidate[name] = value
        if preload:
            # no EM: just keep the (last) initialized candidate
            top_model = candidate
        else:
            candidate, log_likelihoods = EM_fit.EM_fit(candidate, data, parallel=self.parallel)
            final_ll = log_likelihoods[-1]
            if final_ll > top_likelihood:
                top_likelihood = final_ll
                top_model = candidate

    # Expose the fitted parameters under their public names.
    # NOTE(review): indexing assumes As[:, 1, 0] is the learn rate and
    # pi_0[1][0] the mastered-state prior — confirm against the model layout.
    result = top_model
    result["learns"] = result["As"][:, 1, 0]
    result["forgets"] = result["As"][:, 0, 1]
    result["prior"] = result["pi_0"][1][0]
    result["resource_names"] = data["resource_names"]
    result["gs_names"] = data["gs_names"]
    return result
# --- Finish specifying the ground-truth model ---
# (truemodel, num_subparts, num_resources and observation_sequence_lengths
#  are defined earlier in this script, outside the visible chunk)
truemodel["prior"] = truemodel["pi_0"][1][0]  # presumably the mastered-state prior — confirm pi_0 layout
truemodel["guesses"] = np.full(num_subparts, 0.1, dtype=np.float_)
truemodel["slips"] = np.full(num_subparts, 0.03, dtype=np.float_)

# data! generate synthetic response sequences from the true model
print("generating data...")
data = synthetic_data.synthetic_data(truemodel, observation_sequence_lengths)

# fit models, starting with random initializations
print('fitting! each dot is a new EM initialization')
best_likelihood = float("-inf")
fitmodel = deepcopy(truemodel) # NOTE: include this line to initialize at the truth
(fitmodel, log_likelihoods) = EM_fit.EM_fit(fitmodel, data)
# keep the fit only if it beats the best so far (trivially true on first pass)
if(log_likelihoods[-1] > best_likelihood):
    best_likelihood = log_likelihoods[-1]
    best_model = fitmodel

# compare the fit model to the true model, parameter by parameter
print('')
print('\ttruth\tlearned')
for r in range(num_resources):
    print('learn%d\t%.4f\t%.4f' % (r+1, truemodel['As'][r, 1, 0].squeeze(), best_model['As'][r, 1, 0].squeeze()))
for r in range(num_resources):
    print('forget%d\t%.4f\t%.4f' % (r+1, truemodel['As'][r, 0, 1].squeeze(), best_model['As'][r, 0, 1].squeeze()))
for s in range(num_subparts):
    print('guess%d\t%.4f\t%.4f' % (s+1, truemodel['guesses'][s], best_model['guesses'][s]))
print("All data okay")

# Accumulators pooled across all skills (consumed later in this script,
# past the visible chunk).
total_auc = 0
total_trials = 0
all_true = []
all_pred = []
# Fit one single-resource / single-guess-slip BKT model per skill,
# keeping the EM run with the highest final log-likelihood.
for skill in range(skill_count):
    num_fit_initializations = 5
    best_likelihood = float("-inf")
    if len(Data[skill]["resources"]) < 1:
        # no observations for this skill: nothing to fit
        print("No data for skill %s" % skill)
        continue
    else:
        for i in range(num_fit_initializations):
            fitmodel = random_model_uni.random_model_uni(1, 1)
            (fitmodel, log_likelihoods) = EM_fit.EM_fit(fitmodel, Data[skill])
            if (log_likelihoods[-1] > best_likelihood):
                best_likelihood = log_likelihoods[-1]
                best_model = fitmodel
        #print(" ")
        #print('\tlearned')
        #print('prior\t%.4f' % (best_model["pi_0"][1][0]))
        #for r in range(1):
        #    print('learn%d\t%.4f' % (r+1, best_model['As'][r, 1, 0].squeeze()))
        #for r in range(1):
        #    print('forget%d\t%.4f' % (r+1, best_model['As'][r, 0, 1].squeeze()))
        #for s in range(1):
        #    print('guess%d\t%.4f' % (s+1, best_model['guesses'][s]))
        #for s in range(1):
def crossvalidate(data, folds=5, verbose=False, seed=0, return_arrays=False):
    """Run student-level k-fold cross-validation of a BKT model.

    Students (identified by entries of ``data["starts"]``) are shuffled with a
    fixed seed and split into ``folds`` equal test slices.  For each fold the
    model is refit from several random EM initializations on the training
    students, and one-step-ahead predictions are collected on the held-out
    students.  Metrics are computed once over the predictions pooled from all
    folds.

    Args:
        data: model-ready data dict (keys "starts", "data", and optionally
            "resource_names" / "gs_names").
        folds: number of cross-validation folds.
        verbose: if True, print per-fold learned parameters and final metrics.
        seed: seed for the student permutation (reproducible splits).
        return_arrays: if True, return the pooled (all_true, all_pred) arrays
            instead of computing metrics.

    Returns:
        ``(accuracy, rmse, auc)`` over the pooled predictions, or
        ``(all_true, all_pred)`` when ``return_arrays`` is True.
    """
    num_learns = len(data["resource_names"]) if "resource_names" in data else 1
    num_gs = len(data["gs_names"]) if "gs_names" in data else 1

    # Fixed: a dead `num_fit_initializations = 20` used to sit here, silently
    # shadowed by the per-fold value below.
    num_fit_initializations = 5
    split_size = len(data["starts"]) // folds
    # random permutation of students acts as the fold assignment
    shuffle = np.random.RandomState(seed=seed).permutation(len(data["starts"]))

    all_true, all_pred = [], []
    # crossvalidation on students, which are identified by the starts array
    for iteration in range(folds):
        # training = everything outside this fold's slice; test = the slice
        train = np.concatenate(
            (shuffle[0:iteration * split_size],
             shuffle[(iteration + 1) * split_size:len(data["starts"])]))
        test = shuffle[iteration * split_size:(iteration + 1) * split_size]
        training_data = fix_data(data, train)

        # keep the EM fit with the highest final log-likelihood
        best_likelihood = float("-inf")
        best_model = None
        for _ in range(num_fit_initializations):
            # randomly set initial parameter values
            fitmodel = random_model_uni.random_model_uni(num_learns, num_gs)
            fitmodel, log_likelihoods = EM_fit.EM_fit(fitmodel, training_data)
            if log_likelihoods[-1] > best_likelihood:
                best_likelihood = log_likelihoods[-1]
                best_model = fitmodel

        if verbose:
            print(" ")
            print('Iteration %d' % (iteration))
            print('\tlearned')
            print('prior\t%.4f' % (best_model["pi_0"][1][0]))
            for r in range(num_learns):
                print('learn%d\t%.4f' % (r + 1, best_model['As'][r, 1, 0].squeeze()))
            for r in range(num_learns):
                print('forget%d\t%.4f' % (r + 1, best_model['As'][r, 0, 1].squeeze()))
            for s in range(num_gs):
                print('guess%d\t%.4f' % (s + 1, best_model['guesses'][s]))
            for s in range(num_gs):
                print('slip%d\t%.4f' % (s + 1, best_model['slips'][s]))

        test_data = fix_data(data, test)
        # run model predictions from training data on test data
        (correct_predictions, state_predictions) = predict_onestep.run(best_model, test_data)

        # Collapse the (subpart x observation) matrix to one true label per
        # observation: the nonzero entry in each column is the response.
        # NOTE(review): assumes at most one nonzero per column — confirm
        # against the data layout produced by fix_data.
        flat_true_values = np.zeros((len(test_data["data"][0]), ), dtype=np.intc)
        for i in range(len(test_data["data"])):
            for j in range(len(test_data["data"][0])):
                if test_data["data"][i][j] != 0:
                    flat_true_values[j] = test_data["data"][i][j]
        all_true.extend(flat_true_values.tolist())
        all_pred.extend(correct_predictions)

    if return_arrays:
        return (all_true, all_pred)

    # Metrics over the predictions pooled from every fold.
    # Fixed: removed a stray unconditional debug `print(len(all_pred))`.
    total = rmse.compute_rmse(all_true, all_pred)
    acc = accuracy.compute_acc(all_true, all_pred)
    area_under_curve = auc.compute_auc(all_true, all_pred)
    if verbose:
        print("Average RMSE: ", total)
        print("Average Accuracy: ", acc)
        print("Average AUC: ", area_under_curve)
    return (acc, total, area_under_curve)