def createAgentCont(dimAct, dimState, numBatchToKeep, numIterPerTrain):
    from pybrain.rl.learners import NFQ
    sizeBatch = numIterPerTrain
    # Use neural-fitted Q learning (Q-learning with a neural network instead of a lookup table)
    learner = NFQ(sizeBatch, numBatchToKeep)

    # Create a neural network model with dimState inputs and dimAct outputs.
    # The network itself has dimState + dimAct inputs and 1 output.
    numHidden = 20
    print('Using this many hidden layer neurons: ', numHidden)
    moduleNet = ActionValueNetwork(dimState, dimAct, numHidden)
    moduleNet.name = 'moduleNet'

    # Create a learning agent, using both the module and the learner
    agent = LearningAgent(moduleNet, learner)
    return agent
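
A minimal usage sketch for the factory above; the dimensions, batch settings, and someEpisodicTask below are made-up placeholders rather than part of the original code:

agent = createAgentCont(dimAct=2, dimState=4, numBatchToKeep=10, numIterPerTrain=100)

from pybrain.rl.experiments import EpisodicExperiment
experiment = EpisodicExperiment(someEpisodicTask, agent)  # someEpisodicTask: any PyBrain episodic task supplied by the caller
experiment.doEpisodes(1)   # gather experience
agent.learn()              # fit the Q-network on the collected batch
agent.reset()              # clear the agent's history buffer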
Example No. 2
def initExperiment(learnalg='Q',
                   history=None,
                   binEdges='10s',
                   scriptfile='./rlRunExperiment_v2.pl',
                   resetscript='./rlResetExperiment.pl'):

    if binEdges == '10s':
        centerBinEdges = centerBinEdges_10s
    elif binEdges == '30s':
        centerBinEdges = centerBinEdges_30s
    elif binEdges == 'lessperturbed':
        centerBinEdges = centerBinEdges_10s_lessperturbed
    elif binEdges is None:
        centerBinEdges = None
    else:
        raise Exception("No bins for given binEdges setting")

    env = OmnetEnvironment(centerBinEdges, scriptfile, resetscript)
    if history is not None:
        env.data = history['data']

    task = OmnetTask(env, centerBinEdges)
    if history is not None:
        task.allrewards = history['rewards']

    if learnalg == 'Q':
        nstates = env.numSensorBins**env.numSensors
        if history is None:
            av_table = ActionValueTable(nstates, env.numActions)
            av_table.initialize(1.)
        else:
            av_table = history['av_table']
        learner = Q(0.1, 0.9)  # alpha, gamma
        learner._setExplorer(EpsilonGreedyExplorer(0.05))  # epsilon
    elif learnalg == 'NFQ':
        av_table = ActionValueNetwork(env.numSensors, env.numActions)
        learner = NFQ()
    else:
        raise Exception("learnalg unknown")

    agent = LearningAgent(av_table, learner)

    experiment = Experiment(task, agent)
    if history is None:
        experiment.nruns = 0
    else:
        experiment.nruns = history['nruns']
    return experiment
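
A minimal driver sketch for initExperiment, assuming the surrounding module provides the names it references (OmnetEnvironment, OmnetTask, the centerBinEdges arrays, and the PyBrain imports); the interaction and loop counts are arbitrary illustrative values:

experiment = initExperiment(learnalg='Q', binEdges='10s')
for _ in range(100):
    experiment.doInteractions(50)   # standard PyBrain Experiment stepping
    experiment.agent.learn()        # update the action-value model from the logged history
    experiment.agent.reset()        # clear the history before the next batch of interactions
    experiment.nruns += 1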
Example No. 3
def main():

    # Storing every 2048 board state in a lookup table is infeasible:
    #   there are about 14^16 possible states.
    #controller = ActionValueTable(16, 4)
    #learner = Q()
    #controller.initialize(1.)

    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    #learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)

        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # At this point we got:
        #   TypeError: only length-1 arrays can be converted to Python scalars
        #   in pybrain/rl/learners/valuebased/q.py
        #   => switching the learner from Q to NFQ made it work.
        #   => http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "                           ", i, int(
            numpy.mean(score_list)), max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
Example No. 4
from scipy import *
import sys, time

from pybrain.rl.learners.valuebased import ActionValueNetwork
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA, NFQ
from pybrain.rl.experiments.episodic import EpisodicExperiment
from pybrain.rl.environments import Task
from tasktest import TestTask
from envtest import TestEnv

env = TestEnv()
task = TestTask(env)

controller = ActionValueNetwork(200, 3)
learner = NFQ()
agent = LearningAgent(controller, learner)

experiment = EpisodicExperiment(task, agent)

i = 0
while True:
    experiment.doEpisodes(10)
    print "Learning"
    agent.learn()
    agent.reset()
    i += 1
    print "Cycle: %d" % i
    if i > 60:
        agent.learning = False
Example No. 5
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.experiments import EpisodicExperiment

from training import NFQTraining

task = DiscreteBalanceTask(CartPoleEnvironment(), 100)
action_value_function = ActionValueNetwork(4, 3,
        name='CartPoleNFQActionValueNetwork')
learner = NFQ()
#learner.gamma = 0.99
learner.explorer.epsilon = 0.4
task.discount = learner.gamma
agent = LearningAgent(action_value_function, learner)
performance_agent = LearningAgent(action_value_function, None)
experiment = EpisodicExperiment(task, agent)

tr = NFQTraining('cartpole_nfq', experiment, performance_agent)

tr.train(7000, performance_interval=1, n_performance_episodes=5)

Example No. 6
from numpy import array, arange, meshgrid, pi, zeros, mean
from matplotlib import pyplot as plt

from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask, CartPoleRenderer
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork

# switch this to True if you want to see the cart balancing the pole (slower)
render = False
#render = True

plt.ion()

env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)


def plotPerformance(values, fig):
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
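
The listing above is cut off before its training loop; the sketch below shows how these pieces are typically wired together with PyBrain's episodic API. The episode counts are arbitrary, and swapping testagent into experiment.agent for evaluation is an assumption about how the non-learning agent is meant to be used:

performance = []
pf_fig = plt.figure()

while True:
    # interact with the environment, then run one NFQ training pass
    experiment.doEpisodes(1)
    agent.learn()
    agent.reset()

    # evaluate greedily with the non-learning test agent (its experience is not used for training)
    experiment.agent = testagent
    performance.append(mean([sum(r) for r in experiment.doEpisodes(5)]))
    testagent.reset()
    experiment.agent = agent

    plotPerformance(performance, pf_fig)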
Example No. 7
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.experiments import EpisodicExperiment

from environment import Environment
from tasks import BalanceTask
from training import NFQTraining

task = BalanceTask()
action_value_function = ActionValueNetwork(task.outdim, task.nactions,
        name='BalanceNFQActionValueNetwork')
learner = NFQ()
learner.gamma = 0.9999
learner.explorer.epsilon = 0.9
task.discount = learner.gamma
agent = LearningAgent(action_value_function, learner)
performance_agent = LearningAgent(action_value_function, None)
experiment = EpisodicExperiment(task, agent)

tr = NFQTraining('balance_nfq', experiment, performance_agent)

tr.train(7000, performance_interval=1, n_performance_episodes=1, plotsave_interval=10, plot_action_history=True)

Example No. 8
if len( sys.argv ) < 2 or ( int( sys.argv[1] ) < 0 or int( sys.argv[1] ) > 4 ):
    print 'Must supply a model type:\n\t1 = Uncert&Salience, 2 = Salience, 3 = Uncert, 4 = Activation!'
    sys.exit()

if len( sys.argv ) < 3:
    print 'Must supply an output file!'
    sys.exit()



type = int( sys.argv[1] ) # 1 = Uncert&Salience, 2 = Salience, 3 = Uncert, 4 = Activation
env = DistractorRatio() # Create an instance of the D-R task
# Create an action/value neural net; its state and action space sizes depend on the model type
if type == 1:
    module = ActionValueNetwork( 99, 7 )
else:
    module = ActionValueNetwork( 51, 4 )
learner = NFQ()
learner.offPolicy = False # Disable off-policy learning
#learner.explorer = HumanExplorer()
learner.explorer.epsilon = 0.4
#learner.explorer.decay = 0.99
agent = HumanAgent( module, learner, type ) # Create an agent that learns with NFQ
testagent = HumanAgent( module, None, type ) # Create a testing agent
experiment = CustomEpisodicExperiment( env, agent ) # Put the agent in the environment

if len( sys.argv ) == 4:
    print 'Loading saved net...'
    module.network = NetworkReader.readFrom( sys.argv[3] )
Example No. 9
from scipy import *
import sys, time

from pybrain.rl.learners.valuebased import ActionValueNetwork
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA, NFQ
from pybrain.rl.experiments import Experiment
from pybrain.rl.environments import Task
from tasktest import TestTask
from envtest import TestEnv

env = TestEnv()
task = TestTask(env)

controller = ActionValueNetwork(1, 2)
learner = NFQ()
agent = LearningAgent(controller, learner)
experiment = Experiment(task, agent)

while True:
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()
    print("Cycle")
Example No. 10
def q_learning_nfq(**args):

    # track the best score, turn count, and agent seen so far
    best_score = 0
    best_turn = 1000
    best_agent = None

    score_list = []
    turn_list = []
    #for i in range(2):
    for i in range(50):

        # initialize the agent
        # rand = 1.0
        # learner = NFQ(maxEpochs=100)
        # learner._setExplorer(EpsilonGreedyExplorer(rand))
        controller = ActionValueNetwork(12, 4)
        learner = NFQ()
        agent = LearningAgent(controller, learner)

        # training
        print
        print "==========================="
        print 'before training'
        print_state(agent.module.getValue)
        training(agent, args)
        print 'after training'
        print_state(agent.module.getValue)
        agent.learner._setExplorer(EpsilonGreedyExplorer(0.3))

        score, turn = play(agent, 'neural', args, [2, 2])

        score_list.append(score)
        turn_list.append(turn)

        print
        print i, int(numpy.mean(score_list)), max(score_list), score, turn

        # if i % args['episodes'] == 0:
        #     try:
        #         agent.learn()
        #     except:
        #         pass
        #     finally:
        #         agent.reset()
        #         # rand = fit_greedy(i)
        #         # agent.learner._setExplorer(EpsilonGreedyExplorer(rand))
        #         # if not i == 0 :
        #         #     import sys
        #         #     sys.exit()
        # print i, int(numpy.mean(score_list)) , max(score_list) , score, turn

        if best_score < score or best_turn > turn:
            best_score = score
            best_turn = turn
            best_agent = agent
        with open(args['path'] + '/result.dump', 'w') as f:
            pickle.dump([score_list, turn_list, best_agent], f)
    print
    print "==========================="
    print 'best score : ', best_score
    print 'best turn : ', best_turn
    print_state(agent.module.getValue)