        print(target.cpu().numpy()[:10])         # Target pattern to be reconstructed
        print(inputs.cpu().numpy()[-1][0][:10])  # Last input contains the degraded pattern fed to the network at test time
        print(y.data.cpu().numpy()[:10])         # Final output of the network
        previoustime = nowtime
        nowtime = time.time()
        print("Time spent on last", print_every, "iters: ", nowtime - previoustime)
        total_loss /= print_every
        all_losses.append(total_loss)
        print("Mean loss over last", print_every, "iters:", total_loss)
        print("")
        if (numiter + 1) % save_every == 0:
            fname = ('loss_binary_lstm_nbiter_' + str(params['nbiter'])
                     + '_nbhneur_' + str(NBHIDDENNEUR) + '_clamp_' + str(CLAMPING)
                     + '_lr_' + str(ADAMLEARNINGRATE) + '_prestime_' + str(PRESTIME)
                     + '_ipd_' + str(INTERPRESDELAY) + '_rngseed_' + str(RNGSEED) + '.txt')
            with open(fname, 'w') as fo:
                for item in all_losses:
                    fo.write("%s\n" % item)
            # Uber-only (comment out if not at Uber)
            if checkHdfs():
                print("Transferring to HDFS...")
                transferFileToHdfsDir(fname, '/ailabs/tmiconi/simple/')
        total_loss = 0
def train(paramdict):
    #params = dict(click.get_current_context().params)
    print("Starting training...")
    params = {}
    #params.update(defaultParams)
    params.update(paramdict)
    print("Passed params: ", params)
    print(platform.uname())
    #params['nbsteps'] = params['nbshots'] * ((params['prestime'] + params['interpresdelay']) * params['nbclasses']) + params['prestimetest']  # Total number of steps per episode

    # Turn the parameters into a suffix for output filenames
    suffix = "maze_" + "".join([str(x) + "_" if pair[0] != 'nbsteps' and pair[0] != 'rngseed'
                                and pair[0] != 'save_every' and pair[0] != 'test_every' else ''
                                for pair in sorted(zip(params.keys(), params.values()), key=lambda x: x[0])
                                for x in pair])[:-1] + "_rngseed_" + str(params['rngseed'])

    # Initialize random seeds (first two redundant?)
    print("Setting random seeds")
    np.random.seed(params['rngseed'])
    random.seed(params['rngseed'])
    torch.manual_seed(params['rngseed'])

    print("Initializing network")
    net = Network(params)
    print("Shape of all optimized parameters:", [x.size() for x in net.parameters()])
    allsizes = [torch.numel(x.data.cpu()) for x in net.parameters()]
    print("Size (numel) of all optimized elements:", allsizes)
    print("Total size (numel) of all optimized elements:", sum(allsizes))

    print("Initializing optimizer")
    optimizer = torch.optim.Adam(net.parameters(), lr=1.0 * params['lr'], eps=1e-4)
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=params['gamma'], step_size=params['steplr'])

    LABSIZE = params['labsize']
    lab = np.ones((LABSIZE, LABSIZE))
    CTR = LABSIZE // 2

    # Simple cross maze
    #lab[CTR, 1:LABSIZE-1] = 0
    #lab[1:LABSIZE-1, CTR] = 0

    # Double-T maze
    #lab[CTR, 1:LABSIZE-1] = 0
    #lab[1:LABSIZE-1, 1] = 0
    #lab[1:LABSIZE-1, LABSIZE - 2] = 0

    # Grid maze
    lab[1:LABSIZE - 1, 1:LABSIZE - 1].fill(0)
    for row in range(1, LABSIZE - 1):
        for col in range(1, LABSIZE - 1):
            if row % 2 == 0 and col % 2 == 0:
                lab[row, col] = 1
    # Not strictly necessary, but it is nicer not to start on a wall, and the
    # detectable irregularity in the center may help localization
    lab[CTR, CTR] = 0

    all_losses = []
    all_losses_objective = []
    all_losses_eval = []
    all_losses_v = []
    lossbetweensaves = 0
    nowtime = time.time()

    print("Starting episodes...")
    sys.stdout.flush()
    pos = 0
    hidden = net.initialZeroState()
    hebb = net.initialZeroHebb()

    # Starting episodes!
    for numiter in range(params['nbiter']):

        PRINTTRACE = 0
        if (numiter + 1) % (1 + params['print_every']) == 0:
            PRINTTRACE = 1

        # Place the reward. Note: it doesn't matter if the reward is on the
        # center (reward is only computed after an action is taken); all we
        # need is not to put it on a wall or pillar (lab == 1).
        rposr = 0
        rposc = 0
        if params['rp'] == 0:
            # Reward can fall anywhere in the maze, except on a wall
            while lab[rposr, rposc] == 1:
                rposr = np.random.randint(1, LABSIZE - 1)
                rposc = np.random.randint(1, LABSIZE - 1)
        elif params['rp'] == 1:
            # Constrain the reward to fall on the periphery of the maze
            while lab[rposr, rposc] == 1 or (rposr != 1 and rposr != LABSIZE - 2
                                             and rposc != 1 and rposc != LABSIZE - 2):
                rposr = np.random.randint(1, LABSIZE - 1)
                rposc = np.random.randint(1, LABSIZE - 1)
        #print("Reward pos:", rposr, rposc)

        # Agent always starts an episode from the center
        posc = CTR
        posr = CTR

        optimizer.zero_grad()
        loss = 0
        lossv = 0
        hidden = net.initialZeroState()
        hebb = net.initialZeroHebb()
        reward = 0.0
        rewards = []
        vs = []
        logprobs = []
        sumreward = 0.0
        dist = 0

        for numstep in range(params['eplen']):

            inputsN = np.zeros((1, TOTALNBINPUTS), dtype='float32')
            inputsN[0, 0:RFSIZE * RFSIZE] = lab[posr - RFSIZE // 2:posr + RFSIZE // 2 + 1,
                                                posc - RFSIZE // 2:posc + RFSIZE // 2 + 1].flatten()
            inputs = torch.from_numpy(inputsN).cuda()
            # Previous chosen action
            #inputs[0][numactionchosen] = 1
            inputs[0][-1] = 1        # Bias neuron
            inputs[0][-2] = numstep
            inputs[0][-3] = reward

            # Running the network
            y, v, hidden, hebb = net(Variable(inputs, requires_grad=False), hidden, hebb)
            # y should output probabilities; v is the value prediction

            distrib = torch.distributions.Categorical(y)
            actionchosen = distrib.sample()  # sample() returns a PyTorch tensor of size 1; this is needed for the backprop below
            numactionchosen = actionchosen.data[0]  # Turn to scalar

            # Target position, based on the selected action
            tgtposc = posc
            tgtposr = posr
            if numactionchosen == 0:    # Up
                tgtposr -= 1
            elif numactionchosen == 1:  # Down
                tgtposr += 1
            elif numactionchosen == 2:  # Left
                tgtposc -= 1
            elif numactionchosen == 3:  # Right
                tgtposc += 1
            else:
                raise ValueError("Wrong Action")

            reward = 0.0
            if lab[tgtposr][tgtposc] == 1:
                reward = -.1
            else:
                dist += 1
                posc = tgtposc
                posr = tgtposr

            # Did we hit the reward location? Increase reward and teleport!
            # Note that it doesn't matter if we teleport onto the reward, since
            # reward hitting is only evaluated after the (obligatory) move
            if rposr == posr and rposc == posc:
                reward += 10
                if params['randstart'] == 1:
                    posr = np.random.randint(1, LABSIZE - 1)
                    posc = np.random.randint(1, LABSIZE - 1)
                    while lab[posr, posc] == 1:
                        posr = np.random.randint(1, LABSIZE - 1)
                        posc = np.random.randint(1, LABSIZE - 1)
                else:
                    posr = CTR
                    posc = CTR

            # Store the obtained reward, value prediction, and log-probabilities for this time step
            rewards.append(reward)
            sumreward += reward
            vs.append(v)
            logprobs.append(distrib.log_prob(actionchosen))

            # A3C/A2C has an entropy reward on the output probabilities, to
            # encourage exploration. Our version of PyTorch does not have an
            # entropy() function for Distribution, so we use a penalty on the
            # sum of squares instead, which has the same basic property
            # (discourages concentration). It really does help!
            loss += params['bentropy'] * y.pow(2).sum()

            #if PRINTTRACE:
            #    print("Probabilities:", y.data.cpu().numpy(), "Picked action:", numactionchosen, ", got reward", reward)

        # Do the A2C! (essentially copied from V. Mnih,
        # https://arxiv.org/abs/1602.01783, Algorithm S3)
        R = 0
        gammaR = params['gr']
        for numstepb in reversed(range(params['eplen'])):
            R = gammaR * R + rewards[numstepb]
            lossv += (vs[numstepb][0] - R).pow(2)
            loss -= logprobs[numstepb] * (R - vs[numstepb].data[0][0])

        if PRINTTRACE:
            print("lossv: ", lossv.data.cpu().numpy()[0])
            print("Total reward for this episode:", sumreward, "Dist:", dist)

        # Do we want to squash rewards for stabilization?
        if params['squash'] == 1:
            if sumreward < 0:
                sumreward = -np.sqrt(-sumreward)
            else:
                sumreward = np.sqrt(sumreward)
        elif params['squash'] == 0:
            pass
        else:
            raise ValueError("Incorrect value for squash parameter")

        # Mixing the policy loss and the value-prediction loss
        loss += params['blossv'] * lossv
        loss /= params['eplen']

        loss.backward()
        #scheduler.step()
        optimizer.step()
        #torch.cuda.empty_cache()

        lossnum = loss.data[0]
        lossbetweensaves += lossnum
        if (numiter + 1) % 10 == 0:
            all_losses_objective.append(lossnum)
            all_losses_eval.append(sumreward)
            all_losses_v.append(lossv.data[0])

        # Algorithm done. Now print statistics and save files.
        if (numiter + 1) % params['print_every'] == 0:
            print(numiter, "====")
            print("Mean loss: ", lossbetweensaves / params['print_every'])
            lossbetweensaves = 0
            previoustime = nowtime
            nowtime = time.time()
            print("Time spent on last", params['print_every'], "iters: ", nowtime - previoustime)
            if params['type'] == 'plastic' or params['type'] == 'lstmplastic':
                print("ETA: ", net.eta.data.cpu().numpy(),
                      "alpha[0,1]: ", net.alpha.data.cpu().numpy()[0, 1],
                      "w[0,1]: ", net.w.data.cpu().numpy()[0, 1])
            elif params['type'] == 'rnn':
                print("w[0,1]: ", net.w.data.cpu().numpy()[0, 1])

        if (numiter + 1) % params['save_every'] == 0:
            print("Saving files...")
            losslast100 = np.mean(all_losses_objective[-100:])
            print("Average loss over the last 100 episodes:", losslast100)
            print("Saving local files...")
            with open('params_' + suffix + '.dat', 'wb') as fo:
                pickle.dump(params, fo)
            with open('lossv_' + suffix + '.txt', 'w') as thefile:
                for item in all_losses_v:
                    thefile.write("%s\n" % item)
            with open('loss_' + suffix + '.txt', 'w') as thefile:
                for item in all_losses_eval:
                    thefile.write("%s\n" % item)
            torch.save(net.state_dict(), 'torchmodel_' + suffix + '.dat')
            # Uber-only
            print("Saving HDFS files...")
            if checkHdfs():
                print("Transferring to HDFS...")
                transferFileToHdfsDir('loss_' + suffix + '.txt', '/ailabs/tmiconi/gridlab/')
                transferFileToHdfsDir('torchmodel_' + suffix + '.dat', '/ailabs/tmiconi/gridlab/')
                transferFileToHdfsDir('params_' + suffix + '.dat', '/ailabs/tmiconi/gridlab/')
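
# Illustrative usage sketch: one possible way to call train() directly with a
# parameter dictionary. The click-based CLI hinted at by the commented-out line
# at the top of train(), and the actual default values, are not shown in this
# excerpt, so every value below is an assumption chosen only to cover the keys
# that train() reads. Network, TOTALNBINPUTS, RFSIZE, checkHdfs and
# transferFileToHdfsDir are assumed to be defined elsewhere in the file.
if __name__ == '__main__':
    example_params = {
        'rngseed': 0,         # random seed for numpy, random and torch
        'nbiter': 100000,     # number of training episodes
        'eplen': 200,         # steps per episode
        'labsize': 9,         # maze side length (odd, so CTR is a proper center)
        'rp': 0,              # 0: reward anywhere off-wall; 1: periphery only
        'randstart': 1,       # 1: teleport to a random cell after hitting the reward
        'lr': 1e-4,           # Adam learning rate
        'bentropy': 0.03,     # weight of the sum-of-squares exploration penalty
        'blossv': 0.1,        # weight of the value-prediction loss
        'gr': 0.9,            # discount factor for the A2C return
        'squash': 0,          # 1: square-root-squash the logged episode reward
        'type': 'plastic',    # 'plastic', 'lstmplastic' or 'rnn'
        'print_every': 100,   # print statistics every N episodes
        'save_every': 1000,   # save losses and the model every N episodes
    }
    train(example_params)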