def _learnAndEvaluateEpisodes(learner, numEpisodes, maxLengthEp):
    """Alternate one learning and one evaluation episode, numEpisodes times.

    Args:
        learner: object exposing learnOneEpisode(timestepLimit=...) and
            evaluateOneEpisode(eps=..., timestepLimit=...).
        numEpisodes: number of learn/evaluate pairs to run.
        maxLengthEp: value forwarded as timestepLimit to both calls.

    Returns:
        List with the value returned by evaluateOneEpisode after each
        learning episode, in order.
    """
    evalReturns = []
    for _ in xrange(numEpisodes):
        # The value returned by the learning episode is never reported by the
        # caller, so only the call itself (its effect on the learner) is kept.
        learner.learnOneEpisode(timestepLimit=maxLengthEp)
        evalReturns.append(
            learner.evaluateOneEpisode(eps=0.01, timestepLimit=maxLengthEp))
    return evalReturns


def qLearningWithOptions(env, alpha, gamma, options_eps, epsilon, nSeeds,
                         maxLengthEp, nEpisodes, verbose, useNegation,
                         genericNumOptionsToEvaluate, loadedOptions=None):
    """Run Q-learning with increasing numbers of options plus a baseline.

    For every admissible option count and every seed, a QLearning agent is
    built with the primitive actions extended by the first N options, and is
    trained/evaluated for nEpisodes episodes. Afterwards the same procedure
    is repeated with a QLearning agent built without any options.

    Args:
        env: environment; must expose getActionSet() and is forwarded to
            discoverOptions and QLearning.
        alpha: forwarded to QLearning as alpha.
        gamma: forwarded to QLearning as gamma.
        options_eps: forwarded to discoverOptions (second positional
            argument) when options are discovered here.
        epsilon: forwarded to QLearning as epsilon.
        nSeeds: number of runs; run s uses seed=s (0 .. nSeeds - 1).
        maxLengthEp: timestepLimit for each learning/evaluation episode.
        nEpisodes: number of learn/evaluate episode pairs per run.
        verbose: if truthy, progress is printed and discoverOptions is
            called with plotGraphs=True.
        useNegation: forwarded to discoverOptions. When the options are
            discovered here (not loaded), only half of them are counted as
            available.
        genericNumOptionsToEvaluate: sequence of option counts to try. Only
            the leading entries that do not exceed the number of available
            options are used; scanning stops at the first entry that does.
        loadedOptions: optional pre-computed options. When given, discovery
            is skipped and every option's action set is the primitive
            action set plus 'terminate'.

    Returns:
        Tuple (returns_eval_primitive, returns_eval, totalOptionsToUse):
        returns_eval_primitive[s][e] is the evaluation return of the
        option-free agent for seed s and episode e;
        returns_eval[k][s][e] is the same for the agent using
        totalOptionsToUse[k] options; totalOptionsToUse is the list of
        option counts that were actually evaluated.
    """
    numSeeds = nSeeds
    numEpisodes = nEpisodes
    optionsWereDiscovered = loadedOptions is None

    # We first discover all options (or reuse the ones handed in).
    if optionsWereDiscovered:
        options, actionSetPerOption = discoverOptions(
            env, options_eps, verbose, useNegation,
            plotGraphs=bool(verbose))
    else:
        options = loadedOptions
        actionSetPerOption = []
        for _ in loadedOptions:
            # Copy before appending so a list shared with the environment
            # (if getActionSet returns one) is never modified.
            optionActions = list(env.getActionSet())
            optionActions.append('terminate')
            actionSetPerOption.append(optionActions)

    # Now decide how many options each experiment is allowed to use.
    if useNegation and optionsWereDiscovered:
        maxNumOptions = len(options) // 2
    else:
        maxNumOptions = len(options)

    totalOptionsToUse = []
    for candidate in genericNumOptionsToEvaluate:
        if candidate > maxNumOptions:
            break
        totalOptionsToUse.append(candidate)

    returns_eval = []
    for numOptionsToUse in totalOptionsToUse:
        if verbose:
            # Single-argument print(...) emits the same text whether print
            # is a statement or a function.
            print(' '.join(('Using', str(numOptionsToUse), 'options')))

        evalPerSeed = []
        for s in xrange(numSeeds):
            if verbose:
                print(' '.join(('Seed: ', str(s + 1))))

            # NOTE(review): only the first numOptionsToUse options are added,
            # even when useNegation is set; the original computed
            # 2 * numOptionsToUse in that case but never used it. Confirm
            # whether negated options were meant to be included.
            actionSet = list(env.getActionSet())
            actionSet.extend(options[k] for k in xrange(numOptionsToUse))

            learner = QLearning(alpha=alpha, gamma=gamma, epsilon=epsilon,
                                environment=env, seed=s,
                                useOnlyPrimActions=True,
                                actionSet=actionSet,
                                actionSetPerOption=actionSetPerOption)
            evalPerSeed.append(
                _learnAndEvaluateEpisodes(learner, numEpisodes, maxLengthEp))
        returns_eval.append(evalPerSeed)

    # Baseline: the same protocol with an agent built without options.
    returns_eval_primitive = []
    for s in xrange(numSeeds):
        learner = QLearning(alpha=alpha, gamma=gamma, epsilon=epsilon,
                            environment=env, seed=s)
        returns_eval_primitive.append(
            _learnAndEvaluateEpisodes(learner, numEpisodes, maxLengthEp))

    return returns_eval_primitive, returns_eval, totalOptionsToUse
getExpectedNumberOfStepsFromOption(env=env, eps=epsilon, verbose=verbose, discoverNegation=bothDirections, loadedOptions=loadedOptions) elif taskToPerform == 5: #Solve for a given goal (q-learning) returns_learn = [] returns_eval = [] learner = QLearning(alpha=0.1, gamma=0.9, epsilon=1.00, environment=env) for i in xrange(num_episodes): returns_learn.append( learner.learnOneEpisode(timestepLimit=max_length_episode)) returns_eval.append( learner.evaluateOneEpisode(eps=0.01, timestepLimit=max_length_episode)) plt.plot(returns_eval) plt.show() elif taskToPerform == 6: #Solve for a given goal w/ primitive actions (q-learning) following options returns_eval_primitive, returns_eval, totalOptionsToUse = qLearningWithOptions( env=env, alpha=0.1, gamma=0.9, options_eps=0.0, epsilon=1.0, nSeeds=num_seeds,