import os
import time

import torch
import torch.optim as optim

# Project-local helpers (Model, Logger, load_data, int_to_char, train, eval,
# echo, save_checkpoint, write_google_sheet) are assumed to be imported from
# the surrounding package.


def run(CONFIG):
    torch.manual_seed(9669)

    # Checkpoint paths
    save_chkpt = CONFIG['save_path'] + '/chkpt_' + str(CONFIG['id']) + '.pth.tar'
    best_save_chkpt = CONFIG['save_path'] + '/best_chkpt_' + str(CONFIG['id']) + '.pth.tar'

    # Initialize the logger directory
    if CONFIG['end_log']:
        if not os.path.exists(CONFIG['save_path'] + '/log'):
            os.mkdir(CONFIG['save_path'] + '/log')
        log_dir = CONFIG['save_path'] + '/log/' + 'logger_' + str(CONFIG['id']) + '.pkl'
        logger = Logger(log_dir)

    # Prepare the dataloaders and the label mapping
    train_ldr, dev_ldr = load_data(CONFIG['batch_size'], CONFIG['subset'])
    int2char = int_to_char('swbd/int2char.swbd.pkl')
    print(int2char)
    print(len(int2char))

    model = Model(input_dim=123, num_class=len(int2char), CONFIG=CONFIG)

    # Load a pretrained model
    if CONFIG['pre_train']:
        pretrained_model_dir = (CONFIG['pretrained_save_path'] + '/asr_best_chkpt_'
                                + str(CONFIG['pretrained_id']) + '.pth.tar')
        checkpoints = torch.load(pretrained_model_dir)
        pretrained_dict = checkpoints['state_dict']
        model.load_state_dict(pretrained_dict)

    # Move the model to the GPU
    if CONFIG['cuda']:
        model = model.cuda()

    # Initialize the optimizer (SGD or Adam)
    optimizer = None
    if CONFIG['optimizer']['opt'] == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              lr=CONFIG['optimizer']['lr'],
                              momentum=CONFIG['optimizer']['mom'],
                              weight_decay=CONFIG['optimizer']['l2'],
                              nesterov=CONFIG['optimizer']['nes'])
    elif CONFIG['optimizer']['opt'] == 'Adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=CONFIG['optimizer']['lr'],
                               betas=(CONFIG['optimizer']['beta1'], CONFIG['optimizer']['beta2']),
                               eps=CONFIG['optimizer']['eps'],
                               weight_decay=CONFIG['optimizer']['l2'])

    # Best CER so far (lower is better) and starting epoch (overwritten when resuming)
    best_cer = float('inf')
    START_EPOCH = 0

    # Resume training from a previous checkpoint
    if CONFIG['resume']:
        checkpoint = torch.load(save_chkpt)
        START_EPOCH = checkpoint['epoch']
        best_cer = checkpoint['best_cer']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Initialize a scheduler for learning-rate decay
    if CONFIG['optimizer']['lr_decay']:
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=CONFIG['optimizer']['step'],
                                                    gamma=CONFIG['optimizer']['factor'])

    try:
        print('Starting training')
        for epoch in range(START_EPOCH, CONFIG['epoch']):
            start_time = time.time()

            # Learning-rate decay only kicks in after epoch 10
            if CONFIG['optimizer']['lr_decay'] and epoch >= 10:
                scheduler.step()

            # Train for one epoch
            train_loss, eval_loss, eval_cer = train(train_ldr, dev_ldr, model, optimizer, int2char)

            # Evaluate on the dev set
            end_epoch_eval_loss, end_epoch_eval_cer = eval(dev_ldr, model, int2char)

            # Print stats
            echo(epoch, train_loss, end_epoch_eval_loss, end_epoch_eval_cer, start_time)

            # Save the model after every epoch
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'best_cer': best_cer,
                             'optimizer': optimizer.state_dict()}, save_chkpt)

            # Periodic snapshots: every 2 epochs for the first 20 epochs, then every 10
            count_epoch = epoch + 1
            snapshot_every = 2 if count_epoch <= 20 else 10
            if count_epoch % snapshot_every == 0:
                save_checkpoint({'epoch': epoch + 1,
                                 'state_dict': model.state_dict(),
                                 'best_cer': best_cer,
                                 'optimizer': optimizer.state_dict()},
                                CONFIG['save_path'] + '/chkpt_' + str(CONFIG['id'])
                                + '_' + str(count_epoch) + '.pth.tar')

            # Save the best model
            if end_epoch_eval_cer < best_cer:
                save_checkpoint({'state_dict': model.state_dict(),
                                 'epoch': epoch + 1,
                                 'eval_cer': end_epoch_eval_cer,
                                 'eval_loss': end_epoch_eval_loss}, best_save_chkpt)
                best_cer = end_epoch_eval_cer

            # Save losses into the logger
            if CONFIG['end_log']:
                eval_loss.append(end_epoch_eval_loss)
                eval_cer.append(end_epoch_eval_cer)
                logger.add_loss(epoch=epoch, loss_list=eval_loss)
                logger.add_cer(epoch=epoch, cer_list=eval_cer)
                logger.save()

    except KeyboardInterrupt:
        print('=' * 89)
        print('Exiting from training early')

    # Load the best model and compute the dev CER with beam search
    checkpoint = torch.load(best_save_chkpt)
    model_state_dict = checkpoint['state_dict']
    model.load_state_dict(model_state_dict)
    eval_loss, eval_cer = eval(dev_ldr, model, int2char, beam_size=10)
    stop_epoch = checkpoint['epoch']

    # Evaluate on the test set
    # test_loss, test_cer = eval(test_ldr, model, int2char, beam_size=10)
    test_loss, test_cer = 0, 0

    print('=' * 89)
    print('======= Test Model =======')
    print('Early stopping epoch: %s' % stop_epoch)
    print('Eval Loss: %s' % eval_loss)
    print('CER: %s' % eval_cer)
    print('Test Loss: %s' % test_loss)
    print('CER: %s' % test_cer)

    # Write the results to the Google sheet
    write_google_sheet(CONFIG, eval_loss, eval_cer, test_loss, test_cer, stop_epoch)
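# --- Illustrative usage (a minimal sketch, not part of the original script) ---
# The CONFIG keys below are the ones run() itself reads; the values are
# hypothetical placeholders, and Model() may expect additional keys not shown.
if __name__ == '__main__':
    EXAMPLE_CONFIG = {
        'id': 1,
        'save_path': 'checkpoints',            # hypothetical output directory
        'end_log': True,
        'batch_size': 32,
        'subset': None,                        # placeholder; depends on load_data()
        'pre_train': False,
        'pretrained_save_path': 'checkpoints',
        'pretrained_id': 0,
        'cuda': torch.cuda.is_available(),
        'resume': False,
        'epoch': 50,
        'optimizer': {
            'opt': 'Adam', 'lr': 1e-3, 'beta1': 0.9, 'beta2': 0.999,
            'eps': 1e-8, 'l2': 0.0,
            # SGD-only keys, kept so either branch can be selected:
            'mom': 0.9, 'nes': True,
            'lr_decay': False, 'step': 10, 'factor': 0.5,
        },
    }
    run(EXAMPLE_CONFIG)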
import numpy as np

# ReLU, Softmax, optimize and Logger are project-local modules, assumed to be
# imported alongside this class.


class Network:
    def __init__(self, hidden_shapes, input_shape=784, output_shape=10,
                 weight_scale=1e-3, xavier=False):
        """Fully-connected network designed for the MNIST dataset.

        Arguments:
            hidden_shapes -- list of sizes of the hidden layers

        Keyword Arguments:
            input_shape -- (default: {784})
            output_shape -- (default: {10})
            weight_scale -- weights are multiplied by this number during
                            initialization (default: {1e-3})
            xavier -- use Xavier initialization (default: {False})
        """
        shapes = [input_shape] + hidden_shapes + [output_shape]  # Shapes of all layers
        self.shapes = shapes
        self.L = len(hidden_shapes) + 1  # Number of layers in the network
        # Weights and biases (and gammas, betas when batchnorm is enabled)
        self.params = self.initialize_params(shapes, weight_scale, xavier)
        self.logger = Logger()
        self.use_batchnorm = False
        self.use_dropout = False
        self.bn_params = None

    def initialize_method(self, optim_parameters):
        """Pick the gradient descent method."""
        step = None
        method = optim_parameters.get('method', 'sgd')  # optimization method
        if method.lower() == 'sgd':
            step = optimize.SGD_step
        elif method.lower() == 'adam':
            step = optimize.Adam_step
            beta1 = optim_parameters.get('beta1', 0.9)
            beta2 = optim_parameters.get('beta2', 0.99)
            s = {key: 0 for key in self.params.keys()}
            v = {key: 0 for key in self.params.keys()}
            optim_parameters['s'] = s
            optim_parameters['v'] = v
            optim_parameters['beta1'] = beta1
            optim_parameters['beta2'] = beta2
        else:
            print("Unknown optimization method")
        return step

    def initialize_params(self, shapes, weight_scale, xavier):
        """Initialize weights and biases."""
        params = {}
        for l in range(1, len(shapes)):
            W = np.random.randn(shapes[l - 1], shapes[l])
            b = np.zeros((1, shapes[l]))
            if xavier:
                W *= np.sqrt(2 / shapes[l - 1])
            else:
                W *= weight_scale
            params['W' + str(l)] = W
            params['b' + str(l)] = b
        return params

    def initialize_batchnorm(self, shapes, params):
        """Add gamma/beta parameters and a bn_param dict for every hidden layer."""
        bn_params = []
        for l in range(1, self.L):
            params['gamma' + str(l)] = np.ones(shapes[l])
            params['beta' + str(l)] = np.zeros(shapes[l])
            bn_params.append({})
        return params, bn_params

    def forward_linear(self, A, W, b):
        Z = A.dot(W) + b
        cache = (A, W, b)
        return Z, cache

    def forward_activation(self, Z):
        A = ReLU.apply(Z)
        cache = Z
        return A, cache

    def backward_linear(self, dZ, linear_cache):
        A_prev, W, b = linear_cache
        dW = A_prev.T.dot(dZ)
        db = np.sum(dZ, axis=0, keepdims=True)
        dA_prev = dZ.dot(W.T)
        return dW, db, dA_prev

    def backward_activation(self, dA, relu_cache):
        Z = relu_cache
        dZ = ReLU.derivative(Z) * dA
        return dZ

    def forward_pass(self, X, params, bn_params, dropout=1,
                     use_batchnorm=False, mode='train'):
        """Compute the forward pass.

        Layers: (Linear -> ReLU) * (L - 1) -> Linear -> Softmax

        Arguments:
            X -- input matrix
            params -- parameter dict (weights and biases)

        Returns:
            scores -- matrix of per-class scores (can be interpreted as probabilities)
            caches -- list of caches for each layer
        """
        L = self.L  # number of layers
        caches = []
        A_prev = X
        for l in range(1, L + 1):
            W = params['W' + str(l)]
            b = params['b' + str(l)]
            Z, linear_cache = self.forward_linear(A_prev, W, b)
            relu_cache, batchnorm_cache, dropout_cache = None, None, None
            # The output layer has no activation
            if l == L:
                A = Z
            else:
                A, relu_cache = self.forward_activation(Z)
                if use_batchnorm:
                    bn_param = bn_params[l - 1]
                    A, batchnorm_cache = self.batchnorm_forward(
                        A, params['beta' + str(l)], params['gamma' + str(l)],
                        bn_param, mode)
                if self.use_dropout:
                    A, dropout_cache = self.dropout_forward(A, dropout)
            caches.append((linear_cache, relu_cache, batchnorm_cache, dropout_cache))
            A_prev = A
        return A, caches

    def backward_pass(self, scores, Y, params, caches, use_batchnorm=False, reg=0):
        """Perform backpropagation. Return gradients of weights and biases.

        Arguments:
            scores -- result of the forward propagation
            Y -- ground-truth labels
            params -- dict of weights and biases
            caches -- list of caches (from the forward propagation)

        Returns:
            grads -- dict of gradients for each layer
            loss -- loss value
        """
        L = self.L
        # Compute the loss and the derivative of the cost function
        loss = Softmax.apply(scores, Y)
        dscores = Softmax.derivative(scores, Y)
        grads = {}
        dA = None
        # Propagate backwards through layers L, L-1, ..., 1
        for l in reversed(range(1, L + 1)):
            linear_cache, relu_cache, batchnorm_cache, dropout_cache = caches[l - 1]
            # The last layer has no activation, so only the linear backprop is needed
            if l == L:
                dW, db, dA = self.backward_linear(dscores, linear_cache)
            # Otherwise undo dropout -> batchnorm -> activation -> linear,
            # mirroring the forward order
            else:
                if self.use_dropout:
                    dA = self.dropout_backward(dA, dropout_cache)
                if use_batchnorm:
                    dA, dbeta, dgamma = self.batchnorm_backward(dA, batchnorm_cache)
                    grads['beta' + str(l)] = dbeta
                    grads['gamma' + str(l)] = dgamma
                dZ = self.backward_activation(dA, relu_cache)
                dW, db, dA = self.backward_linear(dZ, linear_cache)
            # L2 regularization
            loss += 0.5 * reg * np.sum(params['W' + str(l)] ** 2)
            dW += reg * params['W' + str(l)]
            grads['W' + str(l)] = dW
            grads['b' + str(l)] = db
        return grads, loss

    def dropout_forward(self, A, p):
        """Dropout layer, placed after the activation function.

        Neurons are kept with probability p (inverted dropout: the mask is
        scaled by 1/p so no rescaling is needed at test time).

        Arguments:
            A -- activations
            p -- probability of keeping a neuron
        """
        mask = np.ones_like(A)
        probs = np.random.rand(A.shape[0], A.shape[1])
        mask[probs > p] = 0
        mask /= p  # fold the 1/p scaling into the mask so the backward pass reuses it
        A = A * mask
        return A, mask

    def dropout_backward(self, dA, dropout_cache):
        dA = dA * dropout_cache
        return dA

    def forward_cost(self, X, Y, params):
        """Forward pass returning only the loss (used for gradient checking)."""
        scores, _ = self.forward_pass(X, params, self.bn_params, mode='test')
        loss = Softmax.apply(scores, Y)
        return loss

    def batchnorm_forward(self, x, beta, gamma, bn_param, mode='train', eps=1e-7):
        """Batch-normalization layer.

        In 'train' mode the mean and variance are computed on the mini-batch
        and used to normalize the activations; in 'test' mode the running
        averages are used to approximate them."""
        N, D = x.shape
        running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
        running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))
        momentum = bn_param.get('momentum', 0.9)
        cache = None

        if mode == 'train':
            mean = np.mean(x, axis=0)
            var = np.var(x, axis=0)
            inv_var = 1 / np.sqrt(var + eps)
            x_hat = (x - mean) * inv_var
            out = x_hat * gamma + beta
            # Update the running averages for the test pass
            bn_param['running_mean'] = momentum * running_mean + (1.0 - momentum) * mean
            bn_param['running_var'] = momentum * running_var + (1.0 - momentum) * var
            cache = (inv_var, x_hat, gamma)
        else:
            # Use the running averages to normalize the activations
            mean = running_mean
            var = running_var
            inv_var = 1 / np.sqrt(var + eps)
            x_hat = (x - mean) * inv_var
            out = x_hat * gamma + beta
        return out, cache

    def batchnorm_backward(self, dout, cache):
        inv_var, x_hat, gamma = cache
        N, D = dout.shape
        dx_hat = dout * gamma
        dgamma = np.sum(x_hat * dout, axis=0)
        dbeta = np.sum(dout, axis=0)
        dx = (1 / N) * inv_var * (N * dx_hat - np.sum(dx_hat, axis=0)
                                  - x_hat * np.sum(dx_hat * x_hat, axis=0))
        return dx, dbeta, dgamma

    def train(self, X_train, Y_train, batch_size, epochs, optim_parameters={},
              verbose=False, X_test=None, Y_test=None):
        """Optimize the loss function.

        Arguments:
            X_train -- matrix of inputs
            Y_train -- vector of labels
            batch_size -- size of a mini-batch
            epochs -- number of epochs (passes through the dataset)

        Keyword Arguments:
            optim_parameters -- {'method': 'sgd' or 'adam',
                                 'learning_rate': real number,
                                 'use_batchnorm': boolean,
                                 'dropout': number in [0, 1], probability of keeping a neuron,
                                 # Adam parameters
                                 'beta1', 'beta2': numbers in [0, 1]} (default: {{}})
            X_test -- test data
            Y_test -- test labels
            verbose -- print progress (default: {False})
        """
        reg = optim_parameters.get('reg', 0)
        dropout = optim_parameters.get('dropout', 1)
        lr_decay = optim_parameters.get('lr_decay', 1)
        use_batchnorm = optim_parameters.get('use_batchnorm', False)

        # Set flags
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout != 1

        # Init batchnorm parameters if necessary
        if use_batchnorm:
            self.params, self.bn_params = self.initialize_batchnorm(self.shapes, self.params)
        else:
            self.bn_params = None

        # Set the gradient descent method
        step = self.initialize_method(optim_parameters)

        # Set the batchnorm mode to train
        mode = 'train'

        it = 0
        for epoch in range(epochs):
            # Shuffle X, Y and make mini-batches
            X_mini_batches, Y_mini_batches = self.batchify(X_train, Y_train, batch_size)
            for x_mb, y_mb in zip(X_mini_batches, Y_mini_batches):
                scores, caches = self.forward_pass(x_mb, self.params, self.bn_params,
                                                   dropout=dropout, mode=mode,
                                                   use_batchnorm=use_batchnorm)
                grads, loss = self.backward_pass(scores, y_mb, self.params, caches,
                                                 reg=reg, use_batchnorm=use_batchnorm)
                # Log the loss
                self.logger.add_loss(loss)
                # Perform a gradient descent step (method chosen in optim_parameters)
                self.params, optim_parameters = step(self.params, grads, optim_parameters)
                if verbose and it % 100 == 0:
                    print("Epoch {}/{}, loss: {}, lr: {}".format(
                        epoch, epochs, loss, optim_parameters['learning_rate']))
                it += 1
            # Log the errors
            if X_test is not None and Y_test is not None:
                val_error = 1 - self.accuracy(X_test, Y_test)
                train_error = 1 - self.accuracy(X_train, Y_train)
                self.logger.add_errors(val_error, train_error)
            # Decay the learning rate
            optim_parameters['learning_rate'] *= lr_decay

    def batchify(self, X, Y, batch_size):
        """Shuffle the data and split it into mini-batches."""
        indexes = np.arange(X.shape[0])
        np.random.shuffle(indexes)
        X_shuffled = X[indexes]
        Y_shuffled = Y[indexes]
        X_mini_batches = []
        Y_mini_batches = []
        for k in np.arange(0, X.shape[0], batch_size):
            X_mini_batches.append(X_shuffled[k:k + batch_size, :])
            Y_mini_batches.append(Y_shuffled[k:k + batch_size])
        return X_mini_batches, Y_mini_batches

    def predict(self, X):
        """Predict labels.

        Arguments:
            X -- matrix of inputs

        Returns:
            vector -- predicted labels
        """
        scores, _ = self.forward_pass(X, self.params, self.bn_params,
                                      use_batchnorm=self.use_batchnorm, mode='test')
        y_predict = np.argmax(scores, axis=1)
        return y_predict

    def accuracy(self, X, Y_true):
        y_predict = self.predict(X)
        return np.mean(y_predict == Y_true)

    def get_logger(self):
        return self.logger
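# --- Illustrative usage (a minimal sketch, not part of the original class) ---
# Builds a small two-hidden-layer network and trains it on random data shaped
# like flattened MNIST digits; a real run would pass actual MNIST arrays, and
# the optim_parameters keys shown are only those read in this file (optimize.*
# may expect more).
if __name__ == '__main__':
    X_train = np.random.randn(1000, 784)           # placeholder for MNIST images
    Y_train = np.random.randint(0, 10, size=1000)  # placeholder for digit labels

    net = Network([100, 50], input_shape=784, output_shape=10, xavier=True)
    net.train(X_train, Y_train,
              batch_size=128,
              epochs=5,
              optim_parameters={'method': 'adam',
                                'learning_rate': 1e-3,
                                'reg': 1e-4,
                                'dropout': 0.8,        # keep 80% of the neurons
                                'use_batchnorm': True,
                                'lr_decay': 0.95},
              verbose=True)
    print('Train accuracy:', net.accuracy(X_train, Y_train))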