def rmsprop(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    # Per-parameter running average of squared gradients.
    cache = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
                print('grad:', grad)
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))
                print('grad:', grad)

        for k in grad:
            cache[k] = util.exp_running_avg(cache[k], grad[k]**2, gamma)
            nn.model[k] -= alpha * grad[k] / (np.sqrt(cache[k]) + c.eps)

    return nn
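# The RMSprop update above (and the Adam update further down) relies on
# util.exp_running_avg, which is defined elsewhere in the repo. The sketch
# below is only an assumption of what it computes, inferred from how it is
# called with (running value, new value, decay): a standard exponential
# moving average. The name _exp_running_avg_sketch is hypothetical.
def _exp_running_avg_sketch(running, new, gamma=.9):
    # Blend the previous running estimate with the new observation.
    return gamma * running + (1. - gamma) * new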
def sgd(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    return nn
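# Every solver in this module indexes into the list returned by get_minibatch.
# That helper lives elsewhere in the repo; the function below is only a sketch
# of its assumed behaviour, inferred from the call sites: cut (X, y) into
# (X_mini, y_mini) pairs of at most mb_size rows each. The name
# _get_minibatch_sketch is hypothetical.
def _get_minibatch_sketch(X, y, mb_size):
    return [(X[i:i + mb_size], y[i:i + mb_size])
            for i in range(0, X.shape[0], mb_size)]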
def nesterov(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    velocity = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        # Evaluate the gradient at the look-ahead position model + gamma * velocity.
        nn_ahead = copy.deepcopy(nn)
        nn_ahead.model.update({k: v + gamma * velocity[k] for k, v in nn.model.items()})
        grad, loss = nn_ahead.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
                print('grad:', grad)
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))
                print('grad:', grad)

        for layer in grad:
            velocity[layer] = gamma * velocity[layer] + alpha * grad[layer]
            nn.model[layer] -= velocity[layer]

    return nn
def nesterov(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    velocity = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        # Evaluate the gradient at the look-ahead position model + gamma * velocity.
        nn_ahead = copy.deepcopy(nn)
        nn_ahead.model.update({k: v + gamma * velocity[k] for k, v in nn.model.items()})
        grad, loss = nn_ahead.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            velocity[layer] = gamma * velocity[layer] + alpha * grad[layer]
            nn.model[layer] -= velocity[layer]

    return nn
def adam(nn, X_train, y_train, val_set=None, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    # First (M) and second (R) moment estimates for every parameter.
    M = {k: np.zeros_like(v) for k, v in nn.model.items()}
    R = {k: np.zeros_like(v) for k, v in nn.model.items()}
    beta1 = .9
    beta2 = .999

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        t = iter
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            M[k] = util.exp_running_avg(M[k], grad[k], beta1)
            R[k] = util.exp_running_avg(R[k], grad[k]**2, beta2)

            # Bias-corrected moment estimates.
            m_k_hat = M[k] / (1. - beta1**t)
            r_k_hat = R[k] / (1. - beta2**t)

            nn.model[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)

    return nn
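# Hypothetical usage sketch (nothing below is defined in this module): the
# solvers here share one calling convention and only assume a network object
# exposing .model (dict of parameter arrays), .train_step(X, y) returning a
# (gradient dict, loss) pair, and .predict(X); a few variants also pass the
# iteration number to train_step. SomeNetwork is a placeholder name.
#
#   net = SomeNetwork(...)               # any object with the API above
#   net = adam(net, X_train, y_train,
#              val_set=(X_val, y_val),
#              alpha=1e-3, mb_size=256,
#              n_iter=2000, print_after=100)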
def rmsprop(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    cache = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            cache[k] = util.exp_running_avg(cache[k], grad[k]**2, gamma)
            nn.model[k] -= alpha * grad[k] / (np.sqrt(cache[k]) + c.eps)

    return nn
def sgd(nn, X_train, y_train, f, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size)
    accu = []

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                accu.append(val_acc)
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
                f.write('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
                np.set_printoptions(threshold=np.inf)  # log full gradient arrays
                f.write('grad[W1] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['W1'], '\n'))
                f.write('grad[b1] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['b1'], '\n'))
                '''
                f.write('grad[W2] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['W2'], '\n'))
                f.write('grad[b2] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['b2'], '\n'))
                f.write('grad[W3] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['W3'], '\n'))
                f.write('grad[b3] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['b3'], '\n'))
                '''
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))
                print('grad:', grad)

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    # Dump the collected validation accuracies to the log file.
    for content in accu:
        f.write(str(content))

    return nn
def sgd(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    start = time.time()

    for iter in range(1, n_iter + 1):
        # Halve the learning rate every 20000 iterations.
        if iter != 0 and iter % 20000 == 0:
            print('Learning rate halved')
            alpha /= 2

        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini, iter)

        if iter % print_after == 0:
            if val_set:
                end = time.time()
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                # Accuracy on the current minibatch.
                test_acc = util.accuracy(y_mini, nn.predict(X_mini))
                print('Iter-{} loss: {:.4f} test: {:.4f} time: {:.4f} validation: {:.4f}'
                      .format(iter, loss, test_acc, end - start, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    return nn
def momentum1(nn, X_train, y_train, worker_num, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    gamma = .9
    velocity = [[] for i in range(worker_num)]
    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]
    index = ['W1', 'W2', 'W4', 'W5', 'b1', 'b2', 'b4', 'b5', 'gamma4', 'gamma5', 'beta4', 'beta5']
    except_index = []
    average_grad = dict()
    accu = [[] for i in range(worker_num)]

    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)
        velocity[k] = {key: np.zeros_like(v) for key, v in nn[k].model.items()}

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])

            if iter % print_after == 0:
                if val_set:
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss[k], val_acc[k]))
                    #print('grad:', grad)
                else:
                    print('Iter-{} loss: {:.4f}'.format(iter, loss[k]))
                    #print('grad:', grad)

        for k in range(worker_num):
            for layer in grad[0]:
                if iter % 15 == 0:
                    # Every 15 iterations, drive the velocity with the gradient averaged over all workers.
                    avg_grad = sum(grad[i][layer] for i in range(worker_num)) / worker_num
                    velocity[k][layer] = gamma * velocity[k][layer] + alpha * avg_grad
                else:
                    velocity[k][layer] = gamma * velocity[k][layer] + alpha * grad[k][layer]
                nn[k].model[layer] -= velocity[k][layer]

    return nn
def momentum(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100, max_norm=None):
    velocity = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            velocity[layer] = gamma * velocity[layer] + alpha * grad[layer]
            nn.model[layer] -= velocity[layer]
            if max_norm is not None:
                nn.model[layer] = reg.limit_norm(nn.model[layer], max_val=max_norm)

    return nn
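# momentum() and adagrad() optionally constrain each updated parameter with
# reg.limit_norm, which is defined elsewhere in the repo. The sketch below is
# only an assumption about its behaviour, inferred from the call site: the
# usual max-norm constraint that rescales an array whose norm exceeds max_val.
# The name _limit_norm_sketch is hypothetical.
def _limit_norm_sketch(W, max_val):
    norm = np.linalg.norm(W)
    if max_val is not None and norm > max_val:
        W = W * (max_val / norm)
    return W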
def sgd(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    return nn
def adagrad(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100, max_norm=None):
    # Accumulated sum of squared gradients per parameter.
    cache = {k: np.zeros_like(v) for k, v in nn.model.items()}

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            cache[k] += grad[k]**2
            nn.model[k] -= alpha * grad[k] / (np.sqrt(cache[k]) + c.eps)
            if max_norm is not None:
                nn.model[k] = reg.limit_norm(nn.model[k], max_val=max_norm)

    return nn
def sgd3(nn, X_train, y_train, worker_num, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    from functools import reduce  # used below to total the per-slot timings

    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]
    share_time = 15
    start_time = [[[] for j in range(share_time)] for i in range(worker_num)]
    total_time = [[] for i in range(worker_num)]

    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)

    if val_set:
        #X_val[k], y_val[k] = val_set[k]
        X_val, y_val = val_set

    def f(x, y):
        # Accumulates the timing slots of the worker list currently bound to `a`.
        return x + a[y]

    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            start_time[k][iter % share_time] = time.time()
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])

            if iter % print_after == 0:
                if val_set:
                    #val_acc[k] = util.accuracy(y_val[k], nn[k].predict(X_val[k]))
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k + 1, loss[k], val_acc[k], '\n'))
                    #f[k].write('Iter-{} worker {}, loss: {:.4f} validation: {:4f} {}'.format(iter, k+1, loss[k], val_acc[k], '\n'))
                    #np.set_printoptions(threshold=np.NaN)
                    #np.set_printoptions(precision=8)
                    #f[k].write('grad[{}][W1]{}:{}'.format(k+1, iter, '\n'))
                    #f[k].write('{}{}'.format(grad[k]['W1'], '\n'))
                    #f[k].write('grad[{}][b1]{}:{}'.format(k+1, iter, '\n'))
                    #f[k].write('{}{}'.format(grad[k]['b1'], '\n'))
                    #f[k].write('grad[{}][W2]{}:{}'.format(k+1, iter, '\n'))
                    #f[k].write('{}{}'.format(grad[k]['W2'], '\n'))
                    #f[k].write('grad[{}][b2]{}:{}'.format(k+1, iter, '\n'))
                    #f[k].write('{}{}'.format(grad[k]['b2'], '\n'))
                    #f[k].write('grad[{}][W3]{}:{}'.format(k+1, iter, '\n'))
                    #f[k].write('{}{}'.format(grad[k]['W3'], '\n'))
                    #f[k].write('grad[{}][b3]{}:{}'.format(k+1, iter, '\n'))
                    #f[k].write('{}{}'.format(grad[k]['b3'], '\n'))
                    #f[k].write('\n')
                    '''
                    #print('gamma 3 ', grad[k]['gamma3'])
                    #print('beta3 ', grad[k]['beta3'])
                    else:
                        print('Iter-{} loss: {:.4f}'.format(iter, loss))
                        print('grad:', grad)
                    if(k+1 == worker_num):
                        print('Iter-{} average loss:{} loss: {:.4f} '.format(iter, k+1, sum(loss)/len(loss)))
                        #f[k].write('Iter-{} average loss:{} loss: {:.4f} '.format(iter, k+1, sum(loss)/len(loss)))
                    '''

            if iter % 15 != 0:
                # Local update on this worker's own gradient; record how long this slot took.
                for layer in grad[0]:
                    nn[k].model[layer] -= alpha * grad[k][layer]
                start_time[k][iter % share_time] = time.time() - start_time[k][iter % share_time]

            if iter % 15 == 0 and k == worker_num - 1:
                # Every 15 iterations, average the gradients across workers and apply the same update to all of them.
                average_grad = dict()
                for layer in grad[0]:
                    average_grad[layer] = 0
                    for i in range(worker_num):
                        average_grad[layer] += grad[i][layer]
                for j in range(worker_num):
                    for layer in grad[0]:
                        nn[j].model[layer] -= alpha * average_grad[layer] / worker_num
                    start_time[j][iter % 15] = time.time() - start_time[j][iter % 15]
                    a = start_time[j]
                    total_time[j] = reduce(f, range(15), 0)
                    print('worker{} {}-{} total cost time {}ms'.format(j + 1, iter - 14, iter, total_time[j] * 1000))
        '''
        for k in range(worker_num):
            for layer in grad[k]:
                if iter % 15 == 0:
                    nn[k].model[layer] -= alpha * average_grad[layer] / worker_num
                    a = start_time[k]
                    print(a)
                    total_time[k] = reduce(f, range(15), 0)
                    print('worker{} {}-{} total cost time {}ms'.format(k+1, iter-14, iter, total_time[k]))
                else:
                    nn[k].model[layer] -= alpha * grad[k][layer]
                    print("per time {} {} ,{}".format(k+1, iter, start_time[k][iter % share_time]))
        '''

    return nn
def sgd3(nn, X_train, y_train, worker_num, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]

    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)

    if val_set:
        #X_val[k], y_val[k] = val_set[k]
        X_val, y_val = val_set

    # `f` is expected to be a per-worker list of open log-file handles defined outside this function.
    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])

            if iter % print_after == 0:
                if val_set:
                    #val_acc[k] = util.accuracy(y_val[k], nn[k].predict(X_val[k]))
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k + 1, loss[k], val_acc[k], '\n'))
                    f[k].write('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k + 1, loss[k], val_acc[k], '\n'))
                    np.set_printoptions(threshold=np.inf)  # log full gradient arrays
                    np.set_printoptions(precision=8)
                    f[k].write('grad[{}][W1]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['W1'], '\n'))
                    f[k].write('grad[{}][b1]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['b1'], '\n'))
                    f[k].write('grad[{}][W2]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['W2'], '\n'))
                    f[k].write('grad[{}][b2]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['b2'], '\n'))
                    f[k].write('grad[{}][W3]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['W3'], '\n'))
                    f[k].write('grad[{}][b3]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['b3'], '\n'))
                    f[k].write('\n')
                    #print('gamma 3 ', grad[k]['gamma3'])
                    #print('beta3 ', grad[k]['beta3'])
                else:
                    print('Iter-{} loss: {:.4f}'.format(iter, loss[k]))
                    print('grad:', grad)

                if (k + 1 == worker_num):
                    print('Iter-{} average loss:{} loss: {:.4f} '.format(iter, k + 1, sum(loss) / len(loss)))
                    f[k].write('Iter-{} average loss:{} loss: {:.4f} '.format(iter, k + 1, sum(loss) / len(loss)))

        # Average the workers' gradients and apply the same averaged update to every worker.
        average_grad = dict()
        for layer in grad[0]:
            average_grad[layer] = 0
            for i in range(worker_num):
                average_grad[layer] += grad[i][layer]

        for k in range(worker_num):
            for layer in grad[k]:
                #average = 0
                #for j in range(10):
                #    average += grad[k][layer]
                #nn[k].model[layer] -= alpha * (grad[0][layer]+grad[1][layer]+grad[2][layer]+grad[3][layer]+grad[4][layer]+grad[5][layer]+grad[6][layer]+grad[7][layer]+grad[8][layer]+grad[9][layer])/10
                nn[k].model[layer] -= alpha * average_grad[layer] / worker_num

    return nn
def interleaving(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    ITER_FOR_DOUBLE = 2500

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    start = time.time()

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini, iter)

        if iter % print_after == 0:
            # for layer in grad:
            #     print(np.linalg.norm(grad[layer]) / np.linalg.norm(nn.model[layer]))
            if val_set:
                end = time.time()
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                test_acc = util.accuracy(y_mini, nn.predict(X_mini))
                print('Iter-{} loss: {:.4f} test: {:.4f} time: {:.4f} validation: {:.4f}'
                      .format(iter, loss, test_acc, end - start, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

        if iter == ITER_FOR_DOUBLE:
            # Implement nn.freezeLastLayer()
            # Create dataset with data passed through first neural network, no train
            # Create neural network which takes that input
            nn2 = neuralnet.ResNet(nn.H, nn.C, nn.H, num_layers=4)
            nn2.model['Wf'] = nn.model['Wf']
            nn2.model['bf'] = nn.model['bf']
            nn2.freezeClassificationLayer()

        if iter > ITER_FOR_DOUBLE:
            # Train the second network on features produced by the first one.
            new_X_mini = nn.passDataNoClass(X_mini)
            grad, loss = nn2.train_step(new_X_mini, y_mini, iter)

            # No print, because validation is vague
            # if iter % print_after == 0:
            #     # for layer in grad:
            #     #     print(np.linalg.norm(grad[layer]) / np.linalg.norm(nn.model[layer]))
            #     if val_set:
            #         end = time.time()
            #         val_acc = util.accuracy(y_val, nn2.predict(new_X_val))
            #         test_acc = util.accuracy(y_mini, nn2.predict(new_X_mini))
            #         print('nn2: Iter-{} loss: {:.4f} test: {:4f} time: {:4f} validation: {:4f}'.format(iter, loss, test_acc, end - start, val_acc))
            #     else:
            #         print('Iter-{} loss: {:.4f}'.format(iter, loss))

            for layer in grad:
                nn2.model[layer] -= alpha * grad[layer]

    if val_set:
        val_acc = util.accuracy(y_val, nn2.predict(nn.passDataNoClass(X_val)))
        print('Final validation: {:.4f}'.format(val_acc))

    # shouldHalve = True
    # for layer in grad:
    #     epsi = 0.01 * alpha
    #     if np.linalg.norm(grad[layer]) / np.linalg.norm(nn.model[layer]) > epsi:
    #         shouldHalve = False
    # if shouldHalve:
    #     alpha /= 2
    #     print('Halved learning rate')
    #     if alpha <= 0.0001:
    #         print('Finished learning as step size too small')
    #         return nn

    return nn
def sgd3(nn, X_train, y_train, worker_num, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]
    # Parameters eligible for purely local updates; everything else is always averaged across workers.
    index = ['W1', 'W2', 'W4', 'W5', 'b1', 'b2', 'b4', 'b5', 'gamma4', 'gamma5', 'beta4', 'beta5']
    except_index = []
    average_grad = dict()

    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)

    if val_set:
        #X_val[k], y_val[k] = val_set[k]
        X_val, y_val = val_set

    # `f` is expected to be a per-worker list of open log-file handles defined outside this function.
    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])

            if iter % print_after == 0:
                if val_set:
                    #val_acc[k] = util.accuracy(y_val[k], nn[k].predict(X_val[k]))
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k + 1, loss[k], val_acc[k], '\n'))
                    f[k].write('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k + 1, loss[k], val_acc[k], '\n'))
                    #np.set_printoptions(threshold=np.NaN)
                    #np.set_printoptions(precision=8)
                    f[k].write('grad[{}][W1]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['W1'], '\n'))
                    f[k].write('grad[{}][b1]{}:{}'.format(k + 1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['b1'], '\n'))
                    '''
                    f[k].write('grad[{}][W2]{}:{}'.format(k+1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['W2'], '\n'))
                    f[k].write('grad[{}][b2]{}:{}'.format(k+1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['b2'], '\n'))
                    f[k].write('grad[{}][W3]{}:{}'.format(k+1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['W3'], '\n'))
                    f[k].write('grad[{}][b3]{}:{}'.format(k+1, iter, '\n'))
                    f[k].write('{}{}'.format(grad[k]['b3'], '\n'))
                    f[k].write('\n')
                    '''
                    #print('gamma 3 ', grad[k]['gamma3'])
                    #print('beta3 ', grad[k]['beta3'])
                else:
                    print('Iter-{} loss: {:.4f}'.format(iter, loss[k]))
                    print('grad:', grad)

                if (k + 1 == worker_num):
                    print('Iter-{} average loss {} '.format(iter, sum(loss) / len(loss)))
                    f[k].write('Iter-{} average loss: {} '.format(iter, sum(loss) / len(loss)))

            # Each worker applies a purely local update to a random subset of its parameters.
            except_index = random.sample(index, 6)
            for layer in except_index:
                nn[k].model[layer] -= alpha * grad[k][layer]

        # The remaining parameters are updated with the gradient averaged over all workers.
        available_index = [x for x in grad[0] if x not in except_index]
        for layer in available_index:
            average_grad[layer] = 0
            for i in range(worker_num):
                average_grad[layer] += grad[i][layer]

        for layer in available_index:
            for k in range(worker_num):
                nn[k].model[layer] -= alpha * average_grad[layer] / worker_num

    return nn