def sample_expert_action(concept_tree, knowledge):
    '''
    Samples an optimal action given the current knowledge and the concept tree.
    Samples uniformly from all optimal actions.
    Returns a StudentAction.
    '''
    next_concepts = []
    # find all concepts that have not been learned yet but whose prereqs are fulfilled
    for i in six.moves.range(concept_tree.n):
        if not knowledge[i]:
            cur_concept = np.zeros((concept_tree.n,), dtype=int)
            cur_concept[i] = 1
            if fulfilled_prereqs(concept_tree, knowledge, cur_concept):
                next_concepts.append(i)
    if not next_concepts:
        # nothing new can be learned, so just act randomly
        next_action = np.random.randint(0, concept_tree.n)
    else:
        # uniformly pick an optimal action
        next_action = np.random.choice(next_concepts)
    next_c = np.zeros((concept_tree.n,), dtype=int)
    next_c[next_action] = 1
    return st.StudentAction(next_action, next_c)
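A minimal usage sketch, assuming the ConceptDependencyGraph class used elsewhere in this codebase; the five-concept tree is a hypothetical example:

import numpy as np
import concept_dependency_graph as cdg

dgraph = cdg.ConceptDependencyGraph()
dgraph.init_default_tree(5)  # hypothetical five-concept dependency tree
knowledge = np.zeros((5,), dtype=int)
knowledge[0] = 1  # only the root concept is known
act = sample_expert_action(dgraph, knowledge)
# act.concept is an unlearned concept whose prerequisites are all satisfied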
def best_greedy_action(self):
    '''
    For each action, does a 1-step lookahead to determine the best action.
    '''
    next_rewards = []
    # probability of a correct observation for each concept
    probs = self.model.sample_observations()
    if probs is None:
        # assume [1, 0, 0, ...]
        probs = [0] * self.sim.dgraph.n
        probs[0] = 1
    for a in six.moves.range(self.n_concepts):
        avg_reward = 0.0
        # build the action for concept a
        conceptvec = np.zeros((self.n_concepts,))
        conceptvec[a] = 1.0
        action = st.StudentAction(a, conceptvec)
        # for each observation, weight the reward by the probability of seeing that observation
        new_model = self.model.copy()
        new_model.advance_simulator(action, 1)
        avg_reward += probs[a] * np.sum(new_model.sample_observations())
        new_model = self.model.copy()
        new_model.advance_simulator(action, 0)
        avg_reward += (1.0 - probs[a]) * np.sum(new_model.sample_observations())
        next_rewards.append(avg_reward)
    return argmaxlist(next_rewards)[0]
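argmaxlist is not defined in this section; given that its result is indexed with [0] here and in the rollout-based variant below, it presumably returns the list of all indices attaining the maximum. A minimal sketch under that assumption:

def argmaxlist(values):
    # return every index attaining the maximum value (ties included)
    best = max(values)
    return [i for i, v in enumerate(values) if v == best]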
def advance(self, concept):
    '''
    Advances the true student simulator.
    Creates a new state where the DKT model is advanced according to the result of the true simulator.
    '''
    conceptvec = np.zeros((self.n_concepts,))
    conceptvec[concept] = 1.0
    action = st.StudentAction(concept, conceptvec)
    # advance the true student simulator
    (ob, r) = self.sim.advance_simulator(action)
    # advance the model with the true observation
    self.model.advance_simulator(action, ob)
def advance(self, concept):
    '''
    Advances both the simulator and the model.
    '''
    conceptvec = np.zeros((self.n_concepts,))
    conceptvec[concept] = 1.0
    action = st.StudentAction(concept, conceptvec)
    # first advance the real-world simulator
    self.sim.advance_simulator(action)
    # then resync the model to a copy of the real-world simulator
    self.model = self.sim.copy()
def egreedy_expert(concept_tree, knowledge, epsilon):
    '''
    Epsilon-greedy over the expert policy: with probability epsilon take a uniformly
    random action, otherwise follow the expert policy.
    '''
    if np.random.random() < epsilon:
        # random action
        next_action = np.random.randint(0, concept_tree.n)
        next_c = np.zeros((concept_tree.n,), dtype=int)
        next_c[next_action] = 1
        next_act = st.StudentAction(next_action, next_c)
    else:
        next_act = sample_expert_action(concept_tree, knowledge)
    return next_act
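For example (reusing the hypothetical dgraph and knowledge from the sketch above), epsilon interpolates between the expert policy and a uniform random policy:

expert_act = egreedy_expert(dgraph, knowledge, epsilon=0.0)  # always the expert action
random_act = egreedy_expert(dgraph, knowledge, epsilon=1.0)  # always a uniform random action
mixed_act = egreedy_expert(dgraph, knowledge, epsilon=0.3)   # expert with 30% exploration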
def __init__(self, model, sim, step, horizon, r_type, dktcache, use_real,
             new_act=None, new_ob=None, histhash=''):
    '''
    :param model: RnnStudentSim object
    :param sim: StudentExactSim object
    :param step: int, current step
    :param horizon: int, horizon
    :param r_type: an r_type
    :param dktcache: a dictionary used for caching the RNN predictions, or None to disable caching
    :param use_real: if True, use the sim as the real world; otherwise use the model
    :param new_act: immediate action that led to this state
    :param new_ob: immediate observation that led to this state
    :param histhash: str representation of the current history, used as the dktcache key
    '''
    # the model will be passed down when doing a real-world perform
    self.belief = model
    self._probs = None  # caches the current prob predictions
    self.step = step
    self.horizon = horizon
    self.r_type = r_type
    self.use_real = use_real
    # keep track of history for debugging and various uses
    self.act = new_act
    self.ob = new_ob
    # this sim is shared between all DKTStates and is advanced only when
    # real_world_perform is called, so all references to it advance together
    self.sim = sim
    self.n_concepts = sim.dgraph.n
    # set up caching of RNN queries
    self.dktcache = dktcache
    self.histhash = histhash
    self.actions = []
    for i in range(self.n_concepts):
        concepts = np.zeros((self.n_concepts,))
        concepts[i] = 1
        self.actions.append(st.StudentAction(i, concepts))
def test_drqn_single(dgraph, student, horizon, model, DEBUG=False):
    '''
    Performs a single trajectory with the DRQN model and returns the final true student knowledge.
    '''
    n_concepts = dgraph.n
    # create the simulator
    student.reset()
    student.knowledge[0] = 1  # initialize the first concept to be known
    sim = st.StudentExactSim(student, dgraph)
    # initialize the history (or alternatively choose a random first action)
    act_hist = [0]
    ob_hist = [0]
    for i in range(horizon - 1):
        inputs = construct_drqn_inputs(act_hist, ob_hist, n_concepts)
        best_action, _ = model.predict(inputs, last_timestep_only=True)
        concept = best_action[0]
        conceptvec = np.zeros(n_concepts)
        conceptvec[concept] = 1
        action = st.StudentAction(concept, conceptvec)
        # debug check for whether the action is optimal
        if DEBUG:
            # TODO: move compute_optimal_actions into a shared file
            opt_acts = compute_optimal_actions(sim.dgraph, sim.student.knowledge)
            if action.concept not in opt_acts:
                print('ERROR {} executed non-optimal action {}'.format(
                    sim.student.knowledge, action.concept))
        # act in the real environment
        (ob, reward) = sim.advance_simulator(action)
        act_hist.append(action.concept)
        ob_hist.append(ob)
    return sim.student.knowledge
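construct_drqn_inputs is defined elsewhere; a plausible sketch, assuming the DRQN consumes a batch containing one sequence in which each timestep concatenates a one-hot action vector with its binary observation (this encoding is an assumption, not confirmed by this section):

def construct_drqn_inputs(act_hist, ob_hist, n_concepts):
    # hypothetical encoding: one-hot action concatenated with the observation bit
    steps = []
    for act, ob in zip(act_hist, ob_hist):
        x = np.zeros(n_concepts + 1)
        x[act] = 1.0
        x[n_concepts] = ob
        steps.append(x)
    return np.array([steps])  # batch of a single sequence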
def best_greedy_action(self, n_rollouts):
    '''
    For each action, samples n_rollouts next states and averages the immediate rewards.
    Returns the action with the largest average immediate reward.
    '''
    next_rewards = []
    for a in six.moves.range(self.n_concepts):
        avg_reward = 0.0
        conceptvec = np.zeros((self.n_concepts,))
        conceptvec[a] = 1.0
        action = st.StudentAction(a, conceptvec)
        # sample next states and accumulate the immediate rewards
        for i in six.moves.range(n_rollouts):
            new_model = self.model.copy()
            new_model.advance_simulator(action)
            avg_reward += np.sum(new_model.student.knowledge)
        avg_reward /= 1.0 * n_rollouts
        next_rewards.append(avg_reward)
    return argmaxlist(next_rewards)[0]
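A usage sketch (exact_state is a hypothetical instance of the class this method belongs to). The Monte Carlo average here replaces the exact expectation used in the DKT lookahead above, so a larger n_rollouts gives a lower-variance estimate at linearly higher cost:

a = exact_state.best_greedy_action(n_rollouts=50)
conceptvec = np.zeros((exact_state.n_concepts,))
conceptvec[a] = 1.0
exact_state.model.advance_simulator(st.StudentAction(a, conceptvec))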
def __init__(self, model, sim, step, horizon, r_type):
    '''
    :param model: StudentExactSim for the model
    :param sim: StudentExactSim for the real world
    :param step: the current timestep (starts from 1)
    :param horizon: the horizon length
    :param r_type: reward type
    '''
    # no belief is needed because the exact state is known
    self.belief = None
    self.model = model
    self.sim = sim
    self.step = step
    self.horizon = horizon
    self.n_concepts = model.student.knowledge.shape[0]
    self.r_type = r_type
    self.actions = []
    for i in range(self.n_concepts):
        concepts = np.zeros((self.n_concepts,))
        concepts[i] = 1
        self.actions.append(st.StudentAction(i, concepts))
def test_dkt_multistep(model_id, dataset, chkpt=None):
    '''
    Tests the DKT multistep prediction error on a dataset produced by generate_data.
    '''
    import concept_dependency_graph as cdg
    n_concepts = dataset[0][0][0].shape[0]
    horizon = len(dataset[0])
    dgraph = cdg.ConceptDependencyGraph()
    dgraph.init_default_tree(n_concepts)
    # create the student and simulator
    test_student = st.Student2(n_concepts, True)
    stu = test_student.copy()
    stu.reset()
    stu.knowledge[0] = 1  # initialize the first concept to be known
    sim = st.StudentExactSim(stu, dgraph)
    # load the model
    if chkpt is not None:
        model = dmc.DynamicsModel(model_id=model_id, timesteps=horizon, load_checkpoint=False)
        model.load(chkpt)
    else:
        model = dmc.DynamicsModel(model_id=model_id, timesteps=horizon, load_checkpoint=True)
    # initialize the dktcache to speed up DKT queries
    dktcache = dict()
    print('Testing model multistep: {}'.format(model_id))
    dktmodel = dmc.RnnStudentSim(model)
    # accumulate the per-trajectory mean squared error
    mse_acc = 0.0
    for i in six.moves.range(len(dataset)):
        curr_mse = 0.0
        curr_traj = dataset[i]
        curr_state = DKTState(dktmodel, sim, 1, horizon, SPARSE, dktcache, False)
        for t in six.moves.range(horizon - 1):
            # advance the DKT, then compare its prediction with the data, up to the last prediction
            curr_conceptvec = curr_traj[t][0]
            curr_concept = np.nonzero(curr_conceptvec)[0]
            next_conceptvec = curr_traj[t + 1][0]
            next_concept = np.nonzero(next_conceptvec)[0]
            next_ob = int(curr_traj[t + 1][1])
            # advance the DKT with the action from the data
            curr_state = curr_state.perform(st.StudentAction(curr_concept, curr_conceptvec))
            next_probs = curr_state.get_probs()
            # compute and accumulate the squared error of the next-step prediction
            diff = next_probs[next_concept] - next_ob
            curr_mse += diff * diff
        # average mse per step
        mse_acc += curr_mse / (horizon - 1)
    # return the average per-step MSE over all trajectories
    return mse_acc / len(dataset)
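A usage sketch; the model_id and checkpoint path below are hypothetical, and the dataset is assumed to come from generate_data:

dataset = generate_data(...)  # elided; see generate_data for the trajectory format
avg_mse = test_dkt_multistep('dkt_gru', dataset, chkpt='checkpoints/dkt_gru.ckpt')
print('average per-step multistep MSE: {}'.format(avg_mse))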
def generate_student_sample(concept_tree, seqlen=100, student=None, initial_knowledge=None,
                            policy=None, epsilon=None, verbose=False):
    '''
    :param concept_tree: concept dependency graph
    :param seqlen: number of exercises the student will do
    :param student: the student model to use; if None, a default st.Student is created
    :param initial_knowledge: initial knowledge of the student. If None, only the first concept is known.
    :param policy: the policy used to generate the exercise sequence ('expert', 'egreedy', 'modulo', or 'random')
    :param epsilon: epsilon for the egreedy policy
    :param verbose: if True, print out debugging / progress statements
    :return: array of tuples, where each tuple consists of (exercise, 0 or 1 indicating success
        of the student on that exercise, knowledge of the student after doing the exercise,
        state of the student before the exercise). This array will have length seqlen.
    '''
    n_concepts = concept_tree.n
    if initial_knowledge is None:
        initial_knowledge = np.zeros((n_concepts,))
        initial_knowledge[0] = 1
    if student is None:
        s = st.Student()
    else:
        s = student
    s.reset()  # make sure to reset to initial conditions for this sample
    s.knowledge = initial_knowledge

    if policy == 'modulo' or policy == 'random':
        # these policies can generate the whole exercise sequence up front;
        # the expert policies must choose the next exercise online
        exercise_seq = []
        for i in six.moves.range(seqlen):
            concepts = np.zeros((n_concepts,), dtype=int)
            if policy == 'modulo':
                # choose the exercise with a modulo op; this imposes an ordering on exercises
                conceptix = i % n_concepts
            else:
                # choose one random concept for this exercise
                conceptix = np.random.randint(n_concepts)
            concepts[conceptix] = 1
            exercise_seq.append(st.StudentAction(conceptix, concepts))

    # go through the sequence of exercises and record whether the student solved each one
    student_performance = []
    student_knowledge = []
    student_state = []
    n_exercises_to_mastery = -1
    # store the sequence of exercises as numpy arrays instead of StudentAction objects
    exercises = []
    for i in six.moves.range(seqlen):
        # store the current state
        student_state.append(s.get_state())
        if policy == 'expert':
            ex = sample_expert_action(concept_tree, s.knowledge)
        elif policy == 'egreedy':
            ex = egreedy_expert(concept_tree, s.knowledge, epsilon)
        else:
            ex = exercise_seq[i]
        result = s.do_exercise(concept_tree, ex)
        # this assumes an exercise is equivalent to the concepts it practices
        exercises.append(ex.conceptvec)
        student_performance.append(result)
        student_knowledge.append(copy.deepcopy(s.knowledge))
        if np.sum(s.knowledge) == n_concepts and n_exercises_to_mastery == -1:
            n_exercises_to_mastery = i + 1
    if verbose:
        if n_exercises_to_mastery != -1:
            print("Learned all concepts after {} exercises.".format(n_exercises_to_mastery))
        else:
            print("Did not learn all concepts after doing {} exercises.".format(seqlen))
    return tuple(six.moves.zip(exercises, student_performance, student_knowledge, student_state))
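A usage sketch (reusing the hypothetical five-concept dgraph from earlier) that generates one egreedy trajectory and unpacks its fields:

sample = generate_student_sample(dgraph, seqlen=50, policy='egreedy', epsilon=0.1, verbose=True)
for exercise_vec, solved, knowledge, state in sample:
    # (conceptvec of the exercise, 0/1 result, post-exercise knowledge, pre-exercise state)
    pass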