Example #1
    def optimize(self):

        fitness, index = ascendent_sort(self.samplefitness)  # sort the fitness
        self.avgfit = np.average(fitness)  # compute the average fitness

        self.bfit = fitness[(self.batchSize * 2) - 1]
        bidx = index[(self.batchSize * 2) - 1]
        if ((bidx % 2) == 0):  # regenerate the genotype of the best samples
            bestid = int(bidx / 2)
            self.bestsol = self.center + self.samples[bestid] * self.noiseStdDev
        else:
            bestid = int(bidx / 2)
            self.bestsol = self.center - self.samples[bestid] * self.noiseStdDev

        if self.rank == 0:
            self.updateBest(
                self.bfit,
                self.bestsol)  # store it if it is the best obtained so far

        popsize = self.batchSize * 2  # compute a vector of utilities [-0.5,0.5]
        utilities = zeros(popsize)
        for i in range(popsize):
            utilities[index[i]] = i
        utilities /= (popsize - 1)
        utilities -= 0.5

        weights = zeros(
            self.batchSize
        )  # Assign the weights (utility) to samples on the basis of their fitness rank
        for i in range(self.batchSize):
            idx = 2 * i
            weights[i] = (utilities[idx] - utilities[idx + 1]
                          )  # merge the utility of symmetric samples

        g = 0.0
        i = 0
        while i < self.batchSize:  # Compute the gradient (the dot product of the samples for their utilities)
            gsize = -1
            if self.batchSize - i < 500:  # if the popsize is larger than 500, compute the gradient for multiple sub-populations
                gsize = self.batchSize - i
            else:
                gsize = 500
            g += dot(weights[i:i + gsize], self.samples[i:i + gsize, :])
            i += gsize
        g /= popsize  # normalize the gradient for the popsize

        if self.wdecay == 1:
            globalg = -g + 0.005 * self.center  # apply weight decay
        else:
            globalg = -g

        # adam stochastic optimizer
        a = self.stepsize * sqrt(1.0 - self.beta2**self.cgen) / (
            1.0 - self.beta1**self.cgen)
        self.m = self.beta1 * self.m + (1.0 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1.0 - self.beta2) * (globalg * globalg)
        dCenter = -a * self.m / (sqrt(self.v) + self.epsilon)

        self.center += dCenter  # move the center in the direction of the momentum vectors
        self.avecenter = np.average(np.absolute(self.center))
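
Note: every example on this page calls an ascendent_sort helper that is not shown here. Judging from how its two return values are used (the fitness sorted in ascending order plus the original index of each sorted value), a minimal reconstruction could look like the sketch below; the actual helper in the source repository may differ in detail.

import numpy as np

def ascendent_sort(fitness):
    # Hypothetical reconstruction: return the fitness values sorted in
    # ascending order together with the original index of each sorted
    # value, so that sorted_fitness[i] == fitness[index[i]].
    fitness = np.asarray(fitness, dtype=np.float64)
    index = np.argsort(fitness)
    return fitness[index], index
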
Example #2
    def evaluate(self):
        cseed = self.seed + self.cgen * self.batchSize  # Set the seed for current generation (master and workers have the same seed)
        self.rs = np.random.RandomState(cseed)
        self.samples = self.rs.randn(self.batchSize, self.nparams)
        self.cgen += 1

        # evaluate samples
        candidate = np.arange(self.nparams, dtype=np.float64)
        for b in range(self.batchSize):               
            for bb in range(2):
                if (bb == 0):
                    candidate = self.center + self.samples[b,:] * self.noiseStdDev
                else:
                    candidate = self.center - self.samples[b,:] * self.noiseStdDev
                self.policy.set_trainable_flat(candidate)
                self.policy.nn.normphase(0) # normalization data is collected during the post-evaluation of the best sample of the previous generation
                eval_rews, eval_length = self.policy.rollout(self.policy.ntrials, seed=(self.seed + (self.cgen * self.batchSize) + b))
                self.samplefitness[b*2+bb] = eval_rews
                self.steps += eval_length

        fitness, self.index = ascendent_sort(self.samplefitness)       # sort the fitness
        self.avgfit = np.average(fitness)                         # compute the average fitness                   

        self.bfit = fitness[(self.batchSize * 2) - 1]
        bidx = self.index[(self.batchSize * 2) - 1]  
        if ((bidx % 2) == 0):                                     # regenerate the genotype of the best samples
            bestid = int(bidx / 2)
            self.bestsol = self.center + self.samples[bestid] * self.noiseStdDev  
        else:
            bestid = int(bidx / 2)
            self.bestsol = self.center - self.samples[bestid] * self.noiseStdDev

        self.updateBest(self.bfit, self.bestsol)                  # store it if it is the best obtained so far
                
        # postevaluate best sample of the last generation
        # in openaiesp.py this is done in the next generation; move this section before the "evaluate samples" section to produce identical results
        gfit = 0
        if self.bestsol is not None:
            self.policy.set_trainable_flat(self.bestsol)
            self.tnormepisodes += self.inormepisodes
            for t in range(self.policy.nttrials):
                if self.policy.normalize == 1 and self.normepisodes < self.tnormepisodes:
                    self.policy.nn.normphase(1)
                    self.normepisodes += 1  # we collect normalization data
                    self.normalizationdatacollected = True
                else:
                    self.policy.nn.normphase(0)
                eval_rews, eval_length = self.policy.rollout(1, seed=(self.seed + 100000 + t))
                gfit += eval_rews               
                self.steps += eval_length
            gfit /= self.policy.nttrials    
            self.updateBestg(gfit, self.bestsol)
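
The even/odd test on bidx above recovers which mirrored perturbation produced the best fitness: slot 2*i of the fitness vector holds the center + samples[i] * noiseStdDev variant and slot 2*i + 1 the mirrored center - samples[i] * noiseStdDev variant. A hedged standalone sketch of that decoding step (the function name is hypothetical):

import numpy as np

def regenerate_best(center, samples, bidx, noise_std):
    # The parity of the flat index selects the sign of the perturbation;
    # bidx // 2 recovers the row of the underlying Gaussian sample.
    sign = 1.0 if bidx % 2 == 0 else -1.0
    return center + sign * samples[bidx // 2] * noise_std
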
Example #3
    def run(self, maxsteps):

        start_time = time.time()

        # initialize the solution center
        center = self.policy.get_trainable_flat()

        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters
        batchSize = self.batchSize
        if batchSize == 0:
            # 4 + floor(3 * log(N))
            batchSize = int(4 + math.floor(3 * math.log(nparams)))
        # Symmetric weights in the range [-0.5,0.5]
        weights = zeros(batchSize)

        ceval = 0  # current evaluation
        cgen = 0  # current generation
        # Parameters for Adam policy
        m = zeros(nparams)
        v = zeros(nparams)
        epsilon = 1e-08  # To avoid numerical issues with division by zero...
        beta1 = 0.9
        beta2 = 0.999

        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        rs = np.random.RandomState(self.seed)
        fitbestsample = [0, 0]

        print(
            "Salimans2: seed %d maxmsteps %d batchSize %d stepsize %lf noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d"
            % (self.seed, maxsteps / 1000000, batchSize, self.stepsize,
               self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

        # main loop
        elapsed = 0
        while (ceval < maxsteps):
            cgen += 1

            # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = rs.randn(batchSize, nparams)
            # buffer vector for candidate
            candidate = np.arange(nparams, dtype=np.float64)
            # allocate the fitness vector (fitness2 is the sum over the two behaviors)
            fitness = zeros(batchSize * 2)
            fitness2 = zeros(batchSize * 2)
            # If normalize=1 we update the normalization vectors
            if (self.policy.normalize == 1):
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.env.seed(self.policy.get_seed + cgen)
            self.policy.nn.seed(self.policy.get_seed + cgen)
            # Evaluate offspring 2 times (on behavior 1 and 2)
            g1 = 0.0
            g2 = 0.0
            for beh in range(2):
                for b in range(batchSize):
                    for bb in range(2):
                        if (bb == 0):
                            candidate = center + samples[
                                b, :] * self.noiseStdDev
                        else:
                            candidate = center - samples[
                                b, :] * self.noiseStdDev
                        # Set policy parameters
                        self.policy.set_trainable_flat(candidate)
                        # Evaluate the offspring
                        eval_rews, eval_length, rews1, rews2 = self.policy.rollout(
                            self.policy.ntrials,
                            seed=(self.seed + (cgen * self.batchSize) + b),
                            timestep_limit=beh)
                        # store the fitness
                        fitness[b * 2 + bb] = eval_rews
                        fitness2[b * 2 + bb] += (eval_rews / 2.0)
                        # Update the number of evaluations
                        ceval += eval_length

                # Sort by fitness and compute weighted mean into center
                fitness, index = ascendent_sort(fitness)
                fitbestsample[beh] = fitness[batchSize * 2 - 1]
                # Now we must compute the symmetric weights in the range [-0.5,0.5]
                utilities = zeros(batchSize * 2)
                for i in range(batchSize * 2):
                    utilities[index[i]] = i
                utilities /= (batchSize * 2 - 1)
                utilities -= 0.5
                # Now we assign the weights to the samples
                for i in range(batchSize):
                    idx = 2 * i
                    weights[i] = (utilities[idx] - utilities[idx + 1]
                                  )  # pos - neg
                i = 0
                if (beh == 0):
                    while i < batchSize:
                        gsize = -1
                        if batchSize - i < 500:
                            gsize = batchSize - i
                        else:
                            gsize = 500
                        g1 += dot(weights[i:i + gsize],
                                  samples[i:i + gsize, :])  # weights * samples
                        i += gsize
                    g1 /= (batchSize * 2)
                else:
                    while i < batchSize:
                        gsize = -1
                        if batchSize - i < 500:
                            gsize = batchSize - i
                        else:
                            gsize = 500
                        g2 += dot(weights[i:i + gsize],
                                  samples[i:i + gsize, :])  # weights * samples
                        i += gsize
                    g2 /= (batchSize * 2)

            # sum the gradient computed on behavior 1 and 2
            glob = g1 + g2

            # Weight decay
            if (self.wdecay == 1):
                globalg = -glob + 0.005 * center
            else:
                globalg = -glob

            # Sort by using the sum of the fitness obtained on the two behaviors
            fitness2, index = ascendent_sort(fitness2)
            centroidfit = 0
            if (self.policy.nttrials > 0):
                bestsamid = index[batchSize * 2 - 1]
                if ((bestsamid % 2) == 0):
                    bestid = int(bestsamid / 2)
                    candidate = center + samples[bestid] * self.noiseStdDev
                else:
                    bestid = int(bestsamid / 2)
                    candidate = center - samples[bestid] * self.noiseStdDev

                # Update data if the current offspring is better than current best
                self.updateBest(fitness2[bestsamid], candidate)
                # post-evaluate the best sample to compute the generalization
                self.env.seed(self.policy.get_seed + 100000)
                self.policy.nn.seed(self.policy.get_seed + 100000)
                self.policy.set_trainable_flat(candidate)
                eval_rews, eval_length, rews1, rews2 = self.policy.rollout(
                    self.policy.nttrials, timestep_limit=2, post_eval=True)
                gfit = eval_rews
                ceval += eval_length
                # possibly store the new best generalizing individual
                self.updateBestg(gfit, candidate)

            # ADAM policy
            # Compute how much the center moves
            a = self.stepsize * sqrt(1.0 - beta2**cgen) / (1.0 - beta1**cgen)
            m = beta1 * m + (1.0 - beta1) * globalg
            v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
            dCenter = -a * m / (sqrt(v) + epsilon)
            # update center
            center += dCenter

            # Compute the elapsed time (i.e., how much time the generation lasted)
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness, center, centroidfit,
                            fitness[batchSize * 2 - 1], elapsed, maxsteps)
            corr = stats.pearsonr(g1, g2)
            print(
                'Seed %d (%.1f%%) gen %d msteps %d bestfit %.2f bestgfit %.2f bestsam %.2f (%.1f %.1f) avg %.2f weightsize %.2f gradientcorr %.2f'
                % (self.seed, ceval / float(maxsteps) * 100, cgen, ceval / 1000000,
                   self.bestfit, self.bestgfit, fitness2[batchSize * 2 - 1],
                   fitbestsample[0], fitbestsample[1], np.average(fitness2),
                   np.average(np.absolute(center)), corr[0]))

            # Save centroid and associated vectors
            if (self.saveeachg > 0 and cgen > 0):
                if ((cgen % self.saveeachg) == 0):
                    # save best, bestg, and stat
                    self.save(cgen, ceval, centroidfit, center,
                              fitness[batchSize * 2 - 1],
                              (time.time() - start_time))
                    # save summary statistics
                    fname = self.filedir + "/S" + str(self.seed) + ".fit"
                    fp = open(fname, "w")
                    fp.write(
                        'Seed %d gen %d msteps %d bestfit %.2f bestgfit %.2f bestsam %.2f (%.2f %.2f) avg %.2f weightsize %.2f gradientcorr %.2f \n'
                        % (self.seed, cgen, ceval / 1000000, self.bestfit,
                           self.bestgfit, fitness2[batchSize * 2 - 1],
                           fitbestsample[0], fitbestsample[1],
                           np.average(fitness2), np.average(
                               np.absolute(center)), corr[0]))
                    fp.close()

        # save best, bestg, and stat
        self.save(cgen, ceval, centroidfit, center, fitness[batchSize * 2 - 1],
                  (time.time() - start_time))
        # save summary statistics
        fname = self.filedir + "/S" + str(self.seed) + ".fit"
        fp = open(fname, "w")
        fp.write(
            'Seed %d gen %d msteps %d bestfit %.2f bestgfit %.2f bestsam %.2f (%.2f %.2f) avg %.2f weightsize %.2f gradientcorr %.2f \n'
            % (self.seed, cgen, ceval / 1000000, self.bestfit, self.bestgfit,
               fitness2[batchSize * 2 - 1], fitbestsample[0], fitbestsample[1],
               np.average(fitness2), np.average(np.absolute(center)), corr[0]))
        fp.close()

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
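
The gradient loops above accumulate the dot product weights · samples in chunks of at most 500 rows, presumably to bound the size of the intermediate arrays for very large populations. A minimal sketch of the same computation as a standalone function (names are illustrative):

import numpy as np

def chunked_gradient(weights, samples, chunk=500):
    # Equivalent to np.dot(weights, samples) over the full matrix, but
    # accumulated in slices of at most `chunk` rows.
    g = np.zeros(samples.shape[1])
    i = 0
    n = len(weights)
    while i < n:
        gsize = min(chunk, n - i)
        g += np.dot(weights[i:i + gsize], samples[i:i + gsize, :])
        i += gsize
    return g / (n * 2)  # normalize by the population size (2 mirrored samples per row)
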
Example #4
    def runphase(self, sind, nparams):
        
        epsilon = 1e-08 
        beta1 = 0.9
        beta2 = 0.999
        weights = zeros(self.batchSize)

        for it in range(20):
            ave_rews = 0
            # evaluate the centroid
            for i in range(self.selsize):
                if (self.evopop == 0):
                    self.policy.set_trainable_flat(np.concatenate((self.selp[sind], self.selcomp[i])))
                    eval_rews, eval_length = self.policy.rollout(1, timestep_limit=1000)
                    # sanity check
                    if (it == 0 and eval_rews != self.fmatrix[self.seli[sind],self.selc[i]]):
                        print("warning: sanity check failed")
                    ave_rews += eval_rews
                else:
                    self.policy.set_trainable_flat(np.concatenate((self.selcomp[i], self.selp[sind])))
                    eval_rews, eval_length = self.policy.rollout(1, timestep_limit=1000)
                    # sanity check
                    if (it == 0 and eval_rews != self.fmatrix[self.selc[i],self.seli[sind]]):
                        print("warning: sanity check failed")
                    ave_rews += (1.0  - eval_rews)
            ave_rews /= float(self.selsize)
            #print("centroid ", end ='')
            #for g in range(10):
                #print("%.4f " % (self.selp[sind][g+20]), end='')
            #print("");
            if (it == 0):
                print("evopop %d ind %2d : " % (self.evopop, self.seli[sind]), end = '')
            print("%.2f " % (ave_rews), end='')

            # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = self.rs.randn(self.batchSize, nparams)
            fitness = zeros(self.batchSize * 2)
            # Evaluate offspring
            for b in range(self.batchSize):
                for bb in range(2):
                    if (bb == 0):
                        for g in range(nparams):
                            self.candidate[g] = self.selp[sind][g] + samples[b,g] * self.noiseStdDev
                    else:
                        for g in range(nparams):
                            self.candidate[g] = self.selp[sind][g] - samples[b,g] * self.noiseStdDev
                    #print("candidad ", end ='')
                    #for g in range(10):
                        #print("%.4f " % (self.candidate[g+20]), end='')
                    #print("");
                    # evaluate offspring
                    ave_rews = 0
                    for c in range(self.selsize):
                        if (self.evopop == 0):
                            self.policy.set_trainable_flat(np.concatenate((self.candidate, self.selcomp[c])))
                            eval_rews, eval_length = self.policy.rollout(1, timestep_limit=1000)
                            ave_rews += eval_rews
                        else:
                            self.policy.set_trainable_flat(np.concatenate((self.selcomp[c], self.candidate)))
                            eval_rews, eval_length = self.policy.rollout(1, timestep_limit=1000)
                            ave_rews += (1.0 - eval_rews)
                        #print("f %.2f" % eval_rews)
                    fitness[b*2+bb] = ave_rews / float(self.selsize)
                    #print("%.2f " % (ave_rews / float(self.selsize)), end = '')
            # Sort by fitness and compute weighted mean into center
            fitness, index = ascendent_sort(fitness)
            # Now we must compute the symmetric weights in the range [-0.5,0.5]
            utilities = zeros(self.batchSize * 2)
            for i in range(self.batchSize * 2):
                utilities[index[i]] = i
            utilities /= (self.batchSize * 2 - 1)
            utilities -= 0.5
            # Now we assign the weights to the samples
            for i in range(self.batchSize):
                idx = 2 * i
                weights[i] = (utilities[idx] - utilities[idx + 1]) # pos - neg

            # Compute the gradient
            g = 0.0
            i = 0
            while i < self.batchSize:
                gsize = -1
                if self.batchSize - i < 500:
                    gsize = self.batchSize - i
                else:
                    gsize = 500
                g += dot(weights[i:i + gsize], samples[i:i + gsize,:]) # weights * samples
                i += gsize
            # Normalization over the number of samples
            g /= (self.batchSize * 2)
            # Weight decay
            if (self.wdecay == 1):
                globalg = -g + 0.005 * self.selp[sind]
            else:
                globalg = -g
            # ADAM stochastic optimizer
            # a = self.stepsize * sqrt(1.0 - beta2 ** cgen) / (1.0 - beta1 ** cgen)
            a = self.stepsize # bias correction is not implemented
            self.selm[sind] = beta1 * self.selm[sind] + (1.0 - beta1) * globalg
            self.selv[sind] = beta2 * self.selv[sind] + (1.0 - beta2) * (globalg * globalg)
            dCenter = -a * self.selm[sind] / (sqrt(self.selv[sind]) + epsilon)
            # update center
            self.selp[sind] += dCenter
            #for g in range(10):
                 #print("%.4f " % (self.selp[sind][g+20]), end='')
            #print("");

        # evaluate the evolving individual at the end of the evolution phase
        ave_rews = 0
        for i in range(self.selsize):
            if (self.evopop == 0):
                self.policy.set_trainable_flat(np.concatenate((self.selp[sind], self.selcomp[i])))
                eval_rews, eval_length = self.policy.rollout(1, timestep_limit=1000)
                ave_rews += eval_rews
            else:
                self.policy.set_trainable_flat(np.concatenate((self.selcomp[i], self.selp[sind])))
                eval_rews, eval_length = self.policy.rollout(1, timestep_limit=1000)
                ave_rews += (1.0  - eval_rews)
        ave_rews /= float(self.selsize)
        print("%.2f" % (ave_rews))
Example #5
    def run(self, maxsteps):

        start_time = time.time()

        ##Osipov########################################
    
        ## Hyperparameters for our network
        number_of_inputs = 3
        number_of_hiddens = 50
        number_of_outputs = 5
        batch_size_train = 32

        # Learning rate
        lr = 0.001
        epochs = 100

        # initialize two network with xavier initialization
        net_1 = NET_1(number_of_inputs, number_of_outputs, number_of_hiddens)
        net_2 = NET_2(number_of_inputs, number_of_outputs, number_of_hiddens)
        
        #FIX parameters of net_1
        for param in net_1.parameters():
            param.requires_grad = False

        # Loss for backpropagation
        criterion = nn.MSELoss()
        # Adam optimizer
        optimizer = optim.Adam(net_2.parameters(), lr=lr)

        ##Osipov########################################

        # initialize the solution center
        center = self.policy.get_trainable_flat()
        
        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters
        batchSize = self.batchSize
        if batchSize == 0:
            # 4 + floor(3 * log(N))
            batchSize = int(4 + math.floor(3 * math.log(nparams)))
        # Symmetric weights in the range [-0.5,0.5]
        weights = zeros(batchSize)

        ceval = 0                    # current evaluation
        cgen = 0                # current generation
        # Parameters for Adam policy
        m = zeros(nparams)
        v = zeros(nparams)
        epsilon = 1e-08 # To avoid numerical issues with division by zero...
        beta1 = 0.9
        beta2 = 0.999
    
        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        rs = np.random.RandomState(self.seed)

        print("Salimans: seed %d maxmsteps %d batchSize %d stepsize %lf noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d" % (self.seed, maxsteps / 1000000, batchSize, self.stepsize, self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

        if (self.fromgeneration > 0):
            cgen = self.fromgeneration
            filename = "S%dG%d.npy" % (self.seed, cgen)
            center = np.load(filename)  # restore the centroid saved at this generation
            filename = "S%dG%dm.npy" % (self.seed, cgen)
            m = np.load(filename)
            filename = "S%dG%dv.npy" % (self.seed, cgen)
            v = np.load(filename)
            fname = "statS%d.npy" % (self.seed)
            self.stat = np.load(fname)
            if (self.policy.normalize == 1):
                filename = "S%dG%dn.npy" % (self.seed, cgen)
                self.policy.normvector = np.load(filename)
                self.policy.nn.setNormalizationVectors()


        # main loop
        elapsed = 0
        ##Osipov#########################
        Max_observations = 10000
        train_set = []
        labels_list = []
        ##Osipov#########################   
        while (ceval < maxsteps):
            cgen += 1


            # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = rs.randn(batchSize, nparams)
            # buffer vector for candidate
            candidate = np.arange(nparams, dtype=np.float64)
            # Evaluate offspring
            fitness = zeros(batchSize * 2)
            # If normalize=1 we update the normalization vectors
            if (self.policy.normalize == 1):
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.env.seed(self.policy.get_seed + cgen)
            self.policy.nn.seed(self.policy.get_seed + cgen)
            # Evaluate offspring
            for b in range(batchSize):
                for bb in range(2):
                    if (bb == 0):
                        candidate = center + samples[b,:] * self.noiseStdDev
                    else:
                        candidate = center - samples[b,:] * self.noiseStdDev                            
                    # Set policy parameters 
                    self.policy.set_trainable_flat(candidate) 
                    # Sample of the same generation experience the same environmental conditions
                    if (self.sameenvcond == 1):
                        self.env.seed(self.policy.get_seed + cgen)
                        self.policy.nn.seed(self.policy.get_seed + cgen)
                    # Evaluate the offspring
                    eval_rews, eval_length, observations, outputs_net_1 = self.policy.rollout(net_1, net_2, self.policy.ntrials, timestep_limit=1000)
                    # Get the fitness
                    fitness[b*2+bb] = eval_rews
                    # Update the number of evaluations
                    ceval += eval_length
                    # Update data if the current offspring is better than current best
                    self.updateBest(fitness[b*2+bb], candidate) 

            # Sort by fitness and compute weighted mean into center
            fitness, index = ascendent_sort(fitness)
            # Now we must compute the symmetric weights in the range [-0.5,0.5]
            utilities = zeros(batchSize * 2)
            for i in range(batchSize * 2):
                utilities[index[i]] = i
            utilities /= (batchSize * 2 - 1)
            utilities -= 0.5
            # Now we assign the weights to the samples
            for i in range(batchSize):
                idx = 2 * i
                weights[i] = (utilities[idx] - utilities[idx + 1]) # pos - neg

            # Evaluate the centroid
            if (self.sameenvcond == 1):
                self.env.seed(self.policy.get_seed + cgen)
                self.policy.nn.seed(self.policy.get_seed + cgen)
            self.policy.set_trainable_flat(center)

            ##Osipov###################################
            eval_rews, eval_length, observations, outputs_net_1 = self.policy.rollout(net_1, net_2, self.policy.ntrials, timestep_limit=1000)
            
            
            if len(train_set) < Max_observations:
                train_set += observations
                labels_list += outputs_net_1
            else:
                train_set = train_set[len(observations):]
                train_set += observations
                labels_list = labels_list[len(outputs_net_1):]
                labels_list += outputs_net_1

            
            dataset = BehaviourDataset(train_set, labels_list)
            train_loader = DataLoader(dataset, batch_size=batch_size_train,
                                      shuffle=False, num_workers=4)
            for epoch in range(1, epochs + 1):
                print('Train Epoch: ', epoch)
                train(net_2, optimizer, train_loader, criterion)
            ##Osipov#####################################

            centroidfit = eval_rews
            ceval += eval_length
            # Update data if the centroid is better than current best
            self.updateBest(centroidfit, center)

            # Evaluate generalization
            if (self.policy.nttrials > 0):
                if centroidfit > fitness[batchSize * 2 - 1]:
                    # the centroid is tested for generalization
                    candidate = np.copy(center)
                else:
                    # the best sample is tested for generalization
                    bestsamid = index[batchSize * 2 - 1]
                    if ((bestsamid % 2) == 0):
                        bestid = int(bestsamid / 2)
                        candidate = center + samples[bestid] * self.noiseStdDev
                    else:
                        bestid = int(bestsamid / 2)
                        candidate = center - samples[bestid] * self.noiseStdDev
                self.env.seed(self.policy.get_seed + 100000)
                self.policy.nn.seed(self.policy.get_seed + 100000)
                self.policy.set_trainable_flat(candidate) 
                eval_rews, eval_length, observations, outputs_net_1 = self.policy.rollout(net_1, net_2, self.policy.nttrials, timestep_limit=1000)
                gfit = eval_rews
                ceval += eval_length
                # possibly store the new best generalizing individual
                self.updateBestg(gfit, candidate)

            # Compute the gradient
            g = 0.0
            i = 0
            while i < batchSize:
                gsize = -1
                if batchSize - i < 500:
                    gsize = batchSize - i
                else:
                    gsize = 500
                g += dot(weights[i:i + gsize], samples[i:i + gsize,:]) # weights * samples
                i += gsize
            # Normalization over the number of samples
            g /= (batchSize * 2)
            # Weight decay
            if (self.wdecay == 1):
                globalg = -g + 0.005 * center
            else:
                globalg = -g
            # ADAM policy
            # Compute how much the center moves
            a = self.stepsize * sqrt(1.0 - beta2 ** cgen) / (1.0 - beta1 ** cgen)
            m = beta1 * m + (1.0 - beta1) * globalg
            v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
            dCenter = -a * m / (sqrt(v) + epsilon)
            # update center
            center += dCenter

            # Compute the elapsed time (i.e., how much time the generation lasted)
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness, center, centroidfit, fitness[batchSize * 2 - 1], elapsed, maxsteps)

            # Save centroid and associated vectors
            if (self.saveeachg > 0 and cgen > 0):
                if ((cgen % self.saveeachg) == 0):
                    filename = "S%dG%d.npy" % (self.seed, cgen)
                    np.save(filename, center)
                    filename = "S%dG%dm.npy" % (self.seed, cgen)
                    np.save(filename, m)
                    filename = "S%dG%dv.npy" % (self.seed, cgen)
                    np.save(filename, v)
                    if (self.policy.normalize == 1):
                        filename = "S%dG%dn.npy" % (self.seed, cgen)
                        np.save(filename, self.policy.normvector)  

        # save data
        self.save(cgen, ceval, centroidfit, center, fitness[batchSize * 2 - 1], (time.time() - start_time))

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
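
The train_set/labels_list bookkeeping in Example #5 maintains a sliding window capped at Max_observations: once the buffer is full, the oldest entries are dropped before the latest rollout is appended. A hedged sketch of the same idea using collections.deque, which performs the eviction automatically:

from collections import deque

MAX_OBSERVATIONS = 10000  # mirrors Max_observations above

train_set = deque(maxlen=MAX_OBSERVATIONS)
labels_list = deque(maxlen=MAX_OBSERVATIONS)

def append_rollout(observations, outputs_net_1):
    # deque(maxlen=...) silently evicts the oldest items on extend, which
    # matches the manual slicing in the example up to boundary effects
    # when a single rollout exceeds the remaining capacity.
    train_set.extend(observations)
    labels_list.extend(outputs_net_1)
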
Example #6
    def run(self, maxsteps):

        start_time = time.time()

        # initialize the solution center
        center = self.policy.get_trainable_flat()
        
        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters
        batchSize = self.batchSize
        if batchSize == 0:
            # 4 + floor(3 * log(N))
            batchSize = int(4 + math.floor(3 * math.log(nparams)))
        # Symmetric weights in the range [-0.5,0.5]
        weights = zeros(batchSize)

        ceval = 0                    # current evaluation
        cgen = 0                # current generation
        # Parameters for Adam policy
        m = zeros(nparams)
        v = zeros(nparams)
        epsilon = 1e-08 # To avoid numerical issues with division by zero...
        beta1 = 0.9
        beta2 = 0.999
    
        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        rs = np.random.RandomState(self.seed)

        print("Salimans: seed %d maxmsteps %d batchSize %d stepsize %lf noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d" % (self.seed, maxsteps / 1000000, batchSize, self.stepsize, self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

        if (self.fromgeneration > 0):
            cgen = self.fromgeneration
            filename = "S%dG%d.npy" % (self.seed, cgen)
            center = np.load(filename)  # restore the centroid saved at this generation
            filename = "S%dG%dm.npy" % (self.seed, cgen)
            m = np.load(filename)
            filename = "S%dG%dv.npy" % (self.seed, cgen)
            v = np.load(filename)
            fname = "statS%d.npy" % (self.seed)
            self.stat = np.load(fname)
            if (self.policy.normalize == 1):
                filename = "S%dG%dn.npy" % (self.seed, cgen)
                self.policy.normvector = np.load(filename)
                self.policy.nn.setNormalizationVectors()


        # main loop
        elapsed = 0
        while (ceval < maxsteps):
            cgen += 1


            # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = rs.randn(batchSize, nparams)
            # buffer vector for candidate
            candidate = np.arange(nparams, dtype=np.float64)
            # Evaluate offspring
            fitness = zeros(batchSize * 2)
            # If normalize=1 we update the normalization vectors
            if (self.policy.normalize == 1):
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.env.seed(self.policy.get_seed + cgen)
            self.policy.nn.seed(self.policy.get_seed + cgen)
            # Evaluate offspring
            for b in range(batchSize):
                if self.policy.strategy == 'symmetric':
                    rand = np.random.uniform(0, 1)
                    if rand < 0.5:
                        self.env.robot.behavior1 = 5.0
                        self.env.robot.behavior2 = 0.0
                    else:
                        self.env.robot.behavior1 = 0.0
                        self.env.robot.behavior2 = 5.0
                for bb in range(2):
                    if (bb == 0):
                        candidate = center + samples[b,:] * self.noiseStdDev
                    else:
                        candidate = center - samples[b,:] * self.noiseStdDev                            
                    # Set policy parameters 
                    self.policy.set_trainable_flat(candidate) 
                    # Sample of the same generation experience the same environmental conditions
                    if (self.sameenvcond == 1):
                        self.env.seed(self.policy.get_seed + cgen)
                        self.policy.nn.seed(self.policy.get_seed + cgen)
                    # Evaluate the offspring
                    eval_rews, eval_length = self.policy.rollout(self.policy.ntrials, timestep_limit=1000)
                    # Get the fitness
                    fitness[b*2+bb] = eval_rews
                    # Update the number of evaluations
                    ceval += eval_length
                    # Update data if the current offspring is better than current best
                    self.updateBest(fitness[b*2+bb], candidate) 

            # Sort by fitness and compute weighted mean into center
            fitness, index = ascendent_sort(fitness)
            # Now we must compute the symmetric weights in the range [-0.5,0.5]
            utilities = zeros(batchSize * 2)
            for i in range(batchSize * 2):
                utilities[index[i]] = i
            utilities /= (batchSize * 2 - 1)
            utilities -= 0.5
            # Now we assign the weights to the samples
            for i in range(batchSize):
                idx = 2 * i
                weights[i] = (utilities[idx] - utilities[idx + 1]) # pos - neg

            # Evaluate the centroid
            if (self.sameenvcond == 1):
                self.env.seed(self.policy.get_seed + cgen)
                self.policy.nn.seed(self.policy.get_seed + cgen)
            self.policy.set_trainable_flat(center)
            eval_rews, eval_length = self.policy.rollout(self.policy.ntrials, timestep_limit=1000)
            centroidfit = eval_rews
            ceval += eval_length
            # Update data if the centroid is better than current best
            self.updateBest(centroidfit, center)

            # Evaluate generalization
            if (self.policy.nttrials > 0):
                if centroidfit > fitness[batchSize * 2 - 1]:
                    # the centroid is tested for generalization
                    candidate = np.copy(center)
                else:
                    # the best sample is tested for generalization
                    bestsamid = index[batchSize * 2 - 1]
                    if ((bestsamid % 2) == 0):
                        bestid = int(bestsamid / 2)
                        candidate = center + samples[bestid] * self.noiseStdDev
                    else:
                        bestid = int(bestsamid / 2)
                        candidate = center - samples[bestid] * self.noiseStdDev
                self.env.seed(self.policy.get_seed + 100000)
                self.policy.nn.seed(self.policy.get_seed + 100000)
                self.policy.set_trainable_flat(candidate) 
                eval_rews, eval_length = self.policy.rollout(self.policy.nttrials, timestep_limit=1000, post_eval=True)
                gfit = eval_rews
                ceval += eval_length
                # possibly store the new best generalizing individual
                self.updateBestg(gfit, candidate)

            # Compute the gradient
            g = 0.0
            i = 0
            while i < batchSize:
                gsize = -1
                if batchSize - i < 500:
                    gsize = batchSize - i
                else:
                    gsize = 500
                g += dot(weights[i:i + gsize], samples[i:i + gsize,:]) # weights * samples
                i += gsize
            # Normalization over the number of samples
            g /= (batchSize * 2)
            # Weight decay
            if (self.wdecay == 1):
                globalg = -g + 0.005 * center
            else:
                globalg = -g
            # ADAM policy
            # Compute how much the center moves
            a = self.stepsize * sqrt(1.0 - beta2 ** cgen) / (1.0 - beta1 ** cgen)
            m = beta1 * m + (1.0 - beta1) * globalg
            v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
            dCenter = -a * m / (sqrt(v) + epsilon)
            # update center
            center += dCenter

            # Compute the elapsed time (i.e., how much time the generation lasted)
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness, center, centroidfit, fitness[batchSize * 2 - 1], elapsed, maxsteps)

            # Save centroid and associated vectors
            if (self.saveeachg > 0 and cgen > 0):
                if ((cgen % self.saveeachg) == 0):
                    filename = "S%dG%d.npy" % (self.seed, cgen)
                    np.save(filename, center)
                    filename = "S%dG%dm.npy" % (self.seed, cgen)
                    np.save(filename, m)
                    filename = "S%dG%dv.npy" % (self.seed, cgen)
                    np.save(filename, v)
                    if (self.policy.normalize == 1):
                        filename = "S%dG%dn.npy" % (self.seed, cgen)
                        np.save(filename, self.policy.normvector)  

        # save data
        self.save(cgen, ceval, centroidfit, center, fitness[batchSize * 2 - 1], (time.time() - start_time))

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
Example #7
    def run(self, maxsteps):

        start_time = time.time()

        # initialize the solution center
        self.center = self.policy.get_trainable_flat()

        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters
        centerLearningRate = 1.0
        covLearningRate = 0.6 * (3 + log(nparams)) / 3.0 / sqrt(nparams)
        if self.batchSize == 0:
            # Use default value: 4 + floor(3 * log(N)), where N is the number of parameters
            self.batchSize = int(
                4 +
                floor(3 * log(nparams)))  # population size, offspring number
            if "Tf" in type(self.policy).__name__:
                # Update the number of rollout calls in policy
                self.policy.updaten(self.batchSize)
        initVar = 1.0
        mu = int(floor(self.batchSize /
                       2))  # number of parents/points for recombination
        self.stepsize = 1.0 / mu
        weights = zeros(self.batchSize)
        w = self.stepsize
        for i in range(mu):
            weights[self.batchSize - mu + i] = w
            w += self.stepsize
        weights /= sum(weights)  # normalize recombination weights array
        # initialize variance array
        _sigmas = ones(nparams) * initVar

        ceval = 0  # current evaluation
        cgen = 0  # current generation

        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        np.random.seed(self.seed)

        print(
            "sNES: seed %d maxmsteps %d batchSize %d stepsize %.2f sameEnvCond %d nparams %d"
            % (self.seed, maxsteps / 1000000, self.batchSize, self.stepsize,
               self.sameenvcond, nparams))

        # Set evolution mode
        self.policy.runEvo()

        # main loop
        elapsed = 0
        while ceval < maxsteps:
            cgen += 1

            # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = np.random.randn(self.batchSize, nparams)
            S = samples.transpose()
            # Generate offspring
            offspring = tile(
                self.center.reshape(1, nparams),
                (self.batchSize, 1)) + tile(_sigmas.reshape(1, nparams),
                                            (self.batchSize, 1)) * samples
            # Evaluate offspring
            fitness = zeros(self.batchSize)
            # If normalize=1 we update the normalization vectors
            if self.policy.normalize == 1:
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.policy.setSeed(self.policy.get_seed + cgen)
            # Set generalization flag to False
            self.policy.doGeneralization(False)
            # Evaluate offspring
            for k in range(self.batchSize):
                # Set policy parameters (corresponding to the current offspring)
                self.policy.set_trainable_flat(offspring[k])
                # Sample of the same generation experience the same environmental conditions
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                # Evaluate the offspring
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                # Get the fitness
                fitness[k] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current offspring is better than current best
                self.updateBest(fitness[k], offspring[k])

            # Sort by fitness and compute weighted mean into center
            fitness, index = ascendent_sort(fitness)
            S = S[:, index]

            # Update center
            dCenter = dot(weights, S.transpose())
            self.center += dCenter

            # Update variances
            Ssq = S * S
            SsqMinusOne = Ssq - ones((nparams, self.batchSize))
            covGrad = dot(weights, SsqMinusOne.transpose())
            dSigma = 0.5 * covLearningRate * covGrad
            _sigmas = _sigmas * exp(dSigma).transpose()

            centroidfit = -999999999.0
            if self.evalCenter != 0:
                # Evaluate the centroid
                self.policy.set_trainable_flat(self.center)
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                centroidfit = eval_rews
                ceval += eval_length
                # Update data if the centroid is better than current best
                self.updateBest(centroidfit, self.center)

            # Now perform generalization
            if self.policy.generalize:
                candidate = None
                if centroidfit > fitness[self.batchSize - 1]:
                    # Centroid undergoes generalization test
                    candidate = np.copy(self.center)
                else:
                    # Best sample undergoes generalization test
                    bestsamid = index[self.batchSize - 1]
                    candidate = np.copy(offspring[bestsamid])
                # Set the seed
                self.policy.set_trainable_flat(
                    candidate)  # Parameters must be updated by the algorithm!!
                self.policy.setSeed(self.policy.get_seed + 1000000)
                self.policy.doGeneralization(True)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                gfit = eval_rews
                ceval += eval_length
                # Update data if the candidate is better than current best generalizing individual
                self.updateBestg(gfit, candidate)

            # Compute the elapsed time (i.e., how much time the generation lasted)
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness, self.center, centroidfit,
                            fitness[self.batchSize - 1], elapsed, maxsteps)

        # save data
        self.save(cgen, ceval, centroidfit, self.center,
                  fitness[self.batchSize - 1], (time.time() - start_time))

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
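
The per-parameter variance update in Example #7 follows the separable NES (sNES) rule: each sigma is updated multiplicatively through the exponential of the natural gradient estimate E[w * (s^2 - 1)]. A compact sketch of that step in isolation (names are illustrative):

import numpy as np

def snes_sigma_update(sigmas, weights, samples, cov_lr):
    # samples: (batchSize, nparams) standard-normal draws, already
    # reordered so that `weights` aligns with the fitness ranking.
    cov_grad = np.dot(weights, samples * samples - 1.0)  # E[w * (s^2 - 1)]
    return sigmas * np.exp(0.5 * cov_lr * cov_grad)
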
Example #8
    def run(self, maxsteps):

        start_time = time.time()

        # initialize the solution center
        self.center = self.policy.get_trainable_flat()

        # Extract the number of parameters
        nparams = self.policy.nparams
        # setting parameters
        if self.batchSize == 0:
            # 4 + floor(3 * log(N))
            self.batchSize = int(4 + math.floor(3 * math.log(nparams)))
        # Symmetric weights in the range [-0.5,0.5]
        weights = zeros(self.batchSize)

        ceval = 0  # current evaluation
        cgen = 0  # current generation
        # Parameters for Adam policy
        m = zeros(nparams)
        v = zeros(nparams)
        epsilon = 1e-08  # To avoid numerical issues with division by zero...
        beta1 = 0.9
        beta2 = 0.999

        # RandomState for perturbing the performed actions (used only for samples, not for centroid)
        np.random.seed(self.seed)

        print(
            "Salimans: seed %d maxmsteps %d batchSize %d stepsize %lf noiseStdDev %lf wdecay %d sameEnvCond %d nparams %d"
            % (self.seed, maxsteps / 1000000, self.batchSize, self.stepsize,
               self.noiseStdDev, self.wdecay, self.sameenvcond, nparams))

        # Set evolution mode
        self.policy.runEvo()

        # main loop
        elapsed = 0
        while ceval < maxsteps:
            cgen += 1

            # Extract half samples from Gaussian distribution with mean 0.0 and standard deviation 1.0
            samples = np.random.randn(self.batchSize, nparams)
            # We generate symmetric variations for the offspring
            symmSamples = zeros((self.batchSize * 2, nparams))
            for i in range(self.batchSize):
                sampleIdx = 2 * i
                for g in range(nparams):
                    symmSamples[sampleIdx, g] = samples[i, g]
                    symmSamples[sampleIdx + 1, g] = -samples[i, g]
            # Generate offspring
            offspring = tile(
                self.center.reshape(1, nparams),
                (self.batchSize * 2, 1)) + self.noiseStdDev * symmSamples
            # Evaluate offspring
            fitness = zeros(self.batchSize * 2)
            # If normalize=1 we update the normalization vectors
            if self.policy.normalize == 1:
                self.policy.nn.updateNormalizationVectors()
            # Reset environmental seed every generation
            self.policy.setSeed(self.policy.get_seed + cgen)
            # Set generalization flag to False
            self.policy.doGeneralization(False)
            # Evaluate offspring
            for k in range(self.batchSize * 2):
                # Set policy parameters (corresponding to the current offspring)
                self.policy.set_trainable_flat(offspring[k])
                # Sample of the same generation experience the same environmental conditions
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                # Evaluate the offspring
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                # Get the fitness
                fitness[k] = eval_rews
                # Update the number of evaluations
                ceval += eval_length
                # Update data if the current offspring is better than current best
                self.updateBest(fitness[k], offspring[k])

            # Sort by fitness and compute weighted mean into center
            fitness, index = ascendent_sort(fitness)
            # Now we must compute the symmetric weights in the range [-0.5,0.5]
            utilities = zeros(self.batchSize * 2)
            for i in range(self.batchSize * 2):
                utilities[index[i]] = i
            utilities /= (self.batchSize * 2 - 1)
            utilities -= 0.5
            # Now we assign the weights to the samples
            for i in range(self.batchSize):
                idx = 2 * i
                weights[i] = (utilities[idx] - utilities[idx + 1])  # pos - neg

            # Compute the gradient
            g = 0.0
            i = 0
            while i < self.batchSize:
                gsize = -1
                if self.batchSize - i < 500:
                    gsize = self.batchSize - i
                else:
                    gsize = 500
                g += dot(weights[i:i + gsize],
                         samples[i:i + gsize, :])  # weights * samples
                i += gsize
            # Normalization over the number of samples
            g /= (self.batchSize * 2)
            # Weight decay
            if (self.wdecay == 1):
                globalg = -g + 0.005 * self.center
            else:
                globalg = -g
            # ADAM policy
            # Compute how much the center moves
            a = self.stepsize * sqrt(1.0 - beta2**cgen) / (1.0 - beta1**cgen)
            m = beta1 * m + (1.0 - beta1) * globalg
            v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
            dCenter = -a * m / (sqrt(v) + epsilon)
            # update center
            self.center += dCenter

            centroidfit = -999999999.0
            if self.evalCenter != 0:
                # Evaluate the centroid
                self.policy.set_trainable_flat(self.center)
                if self.sameenvcond == 1:
                    self.policy.setSeed(self.policy.get_seed + cgen)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                centroidfit = eval_rews
                ceval += eval_length
                # Update data if the centroid is better than current best
                self.updateBest(centroidfit, self.center)

            # Now perform generalization
            if self.policy.generalize:
                candidate = None
                if centroidfit > fitness[self.batchSize * 2 - 1]:
                    # Centroid undergoes generalization test
                    candidate = np.copy(self.center)
                else:
                    # Best sample undergoes generalization test
                    bestsamid = index[self.batchSize * 2 - 1]
                    candidate = np.copy(offspring[bestsamid])
                # Set the seed
                self.policy.set_trainable_flat(
                    candidate)  # Parameters must be updated by the algorithm!!
                self.policy.setSeed(self.policy.get_seed + 1000000)
                self.policy.doGeneralization(True)
                eval_rews, eval_length = self.policy.rollout(
                    timestep_limit=1000)
                gfit = eval_rews
                ceval += eval_length
                # Update data if the candidate is better than current best generalizing individual
                self.updateBestg(gfit, candidate)

            # Compute the elapsed time (i.e., how much time the generation lasted)
            elapsed = (time.time() - start_time)

            # Update information
            self.updateInfo(cgen, ceval, fitness, self.center, centroidfit,
                            fitness[self.batchSize * 2 - 1], elapsed, maxsteps)

        # save data
        self.save(cgen, ceval, centroidfit, self.center,
                  fitness[self.batchSize * 2 - 1], (time.time() - start_time))

        # print simulation time
        end_time = time.time()
        print('Simulation time: %dm%ds ' % (divmod(end_time - start_time, 60)))
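
All of the Salimans-style examples on this page repeat the same core update: rank-based utilities in [-0.5, 0.5], merged weights for the mirrored pairs, a weighted sum of the Gaussian samples, weight decay, and an Adam step. A condensed, hedged sketch of one generation, assuming the fitness vector is laid out as [+s_0, -s_0, +s_1, -s_1, ...] as in the examples above:

import numpy as np

def es_generation_update(center, samples, fitness, m, v, t, stepsize=0.01,
                         beta1=0.9, beta2=0.999, epsilon=1e-08, wdecay=0.005):
    batch = samples.shape[0]
    # rank-based utilities in [-0.5, 0.5]
    index = np.argsort(fitness)
    utilities = np.zeros(batch * 2)
    utilities[index] = np.arange(batch * 2)
    utilities = utilities / (batch * 2 - 1) - 0.5
    # merge the utilities of the mirrored pairs (pos - neg)
    weights = utilities[0::2] - utilities[1::2]
    # gradient estimate, normalized by the population size
    g = np.dot(weights, samples) / (batch * 2)
    globalg = -g + wdecay * center
    # Adam with bias correction, as in the examples above
    a = stepsize * np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    m = beta1 * m + (1.0 - beta1) * globalg
    v = beta2 * v + (1.0 - beta2) * (globalg * globalg)
    return center - a * m / (np.sqrt(v) + epsilon), m, v
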