def train_lstm(
    dim_proj=None,
    xdim=None,
    ydim=None,
    patience=10,  # Number of epoch to wait before early stop if no progress
    n_epochs=500,  # The maximum number of epoch to run
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    # n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very
                         # hard to use, not recommended (probably needs
                         # momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    trainpath='../data/cv/',
    trainlist='../cvlist/JK-ch-1234-songwise.txt',
    validset='../data/cv/C-ch-songwise.mat',
    dumppath='../model/blstmrnn_model.npz',  # The best model will be saved there
    batch_size=100,  # The batch size during training.
    # Parameter for extra option
    noise_std=0.,
    earlystop=True,
    dropout=True,  # if False slightly faster, but worst test error
                   # This frequently need a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
):
    """Train the model on random windows cut from randomly chosen songs.

    The data is loaded with ``load_data_song`` and the model is built with
    ``init_params`` / ``init_tparams`` / ``build_model``.  Every loop
    iteration performs one optimizer update on a window of at most
    ``batch_size`` consecutive frames taken from one random training song.
    Every 100 updates the validation error is computed with ``pred_error``;
    whenever it improves, the current parameters are written to ``dumppath``
    together with the training history and the best validation loss.

    Training ends after ``n_epochs`` loop iterations or, when ``earlystop``
    is true, as soon as the patience budget is used up.  A final checkpoint
    is always written to ``dumppath``.

    Notes:
        * The ``patience`` argument is recorded in ``model_options`` but the
          early-stopping budget is recomputed from the training-set size, so
          its value is otherwise ignored.
        * ``encoder``, ``noise_std`` and ``dropout`` are not read here; they
          only reach ``init_params`` / ``build_model`` via ``model_options``.
        * ``reload_model`` is used as the path handed to ``load_params``;
          passing ``True`` falls back to the legacy ``'lstm_model.npz'``.

    Returns:
        None.
    """
    # Snapshot of the call arguments.  This must stay the first statement so
    # that no other local variable leaks into the options dictionary.
    model_options = locals().copy()
    print('model options %s' % (model_options,))

    print('Loading data')
    # the dataset is organized as:
    # X - n_songs * n_timesteps * dim_proj (dim_proj = 24 for chromagram based dataset)
    # y - n_songs * n_timesteps * 1
    train, valid = load_data_song(trainpath=trainpath,
                                  trainlist=trainlist,
                                  validset=validset)
    print('data loaded')

    model_options['xdim'] = xdim
    model_options['dim_proj'] = dim_proj
    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        # Honour the caller-supplied path (it used to be ignored in favour of
        # a hard-coded file name); a bare ``True`` keeps the legacy file.
        reload_path = 'lstm_model.npz' if reload_model is True else reload_model
        load_params(reload_path, params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    # the model takes input of:
    # x -- n_timesteps * dim_proj * n_samples (in a simpler case, n_samples = 1 in ctc)
    # y -- n_timesteps * 1 * n_samples (in a simpler case, n_samples = 1 in ctc)
    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams,
                                                               model_options)

    if decay_c > 0.:
        # L2 penalty on the 'U' weights, scaled by decay_c.
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, y], cost, name='f_cost')

    # list(...) keeps this working when dict.values() returns a view.
    grads = T.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, y], grads, name='f_grad')

    lr = T.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, y, cost)

    print('Optimization')
    print('%d train examples' % len(train[0]))
    print('%d valid examples' % len(valid[0]))

    best_validation_loss = numpy.inf
    # 5000 is the assumed average number of frames per track and 10 is an
    # additional scaling factor.
    n_train_batches = len(train[0]) * 5000 // batch_size // 10
    # look at this many iterations regardless
    patience = min(10 * n_train_batches, 15000)
    patience_increase = 1.3  # wait this much longer when a new best is found
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.996
    validation_frequency = 100  # note here we manually set the validation freq
    training_history = []
    iteration = 0
    best_iter = 0
    start_time = time.time()

    for epoch in range(n_epochs):
        # Pick a random song, then a random window inside it.
        song_idx = numpy.random.randint(0, len(train[0]))
        song_len = len(train[0][song_idx])
        window = batch_size
        # Halve the window until it is strictly shorter than the song.  The
        # ``window > 0`` guard ends the loop for an empty song, which used to
        # spin forever.
        while window > 0 and song_len <= window:
            window //= 2
        if window < 1:
            # A song with fewer than two frames cannot supply a window.
            continue
        start = numpy.random.randint(0, song_len - window)

        iteration += 1
        use_noise.set_value(1.)

        # Select the frames for this minibatch.
        x_batch = train[0][song_idx][start:start + window]
        y_batch = train[1][song_idx][start:start + window]

        f_grad_shared(x_batch, y_batch)
        f_update(lrate)

        if numpy.mod(iteration, validation_frequency) == 0:
            use_noise.set_value(0.)
            this_validation_loss = pred_error(f_pred, valid)
            training_history.append([iteration, this_validation_loss])
            print('epoch %i, validation error %f %%' %
                  (epoch, this_validation_loss * 100.))
            print('iter = %d' % iteration)
            print('patience = %d' % patience)

            if this_validation_loss < best_validation_loss:
                # improve patience if loss improvement is good enough
                if (this_validation_loss <
                        best_validation_loss * improvement_threshold):
                    patience = max(patience, iteration * patience_increase)

                # Record the new best *before* saving so that the checkpoint
                # stores the loss that belongs to the saved parameters.
                best_validation_loss = this_validation_loss
                best_iter = iteration
                params = unzip(tparams)
                numpy.savez(dumppath,
                            training_history=training_history,
                            best_validation_loss=best_validation_loss,
                            **params)
                print('best_validation_loss %f' % best_validation_loss)

        if earlystop and patience <= iteration:
            print('early-stopping')
            break

    end_time = time.time()

    # final save (params holds the best parameters seen, or the initial ones
    # when validation never improved)
    numpy.savez(dumppath,
                training_history=training_history,
                best_validation_loss=best_validation_loss,
                **params)

    # ``iteration`` is 1-based, so best_iter is reported as recorded.
    print(('Optimization complete with best validation score of %f %%, '
           'obtained at iteration %i, ') %
          (best_validation_loss * 100., best_iter))
    elapsed_minutes = (end_time - start_time) / 60.
    sys.stderr.write('The fine tuning code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % elapsed_minutes + '\n')
def train_lstm(
    dim_proj=None,
    xdim=None,
    ydim=None,
    patience=10,  # Number of epoch to wait before early stop if no progress
    max_epochs=500,  # The maximum number of epoch to run
    dispFreq=10,  # Display to stdout the training progress every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    # n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very
                         # hard to use, not recommended (probably needs
                         # momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    dumppath='ctc_model.npz',  # The best model will be saved there
    validFreq=400,  # Compute the validation error after this number of update.
    saveFreq=1000,  # Save the parameters after every saveFreq updates
    maxlen=None,  # Sequence longer then this get ignored
    batch_size=100,  # The batch size during training.
    valid_batch_size=100,  # The batch size used for validation/test set.
    dataset=None,
    # Parameter for extra option
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worst test error
                       # This frequently need a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test example.
    scaling=1,
):
    """Train on random song windows with validation-based early stopping.

    The data is loaded with ``load_data_song`` (10% validation, 10% test) and
    the model is built with ``init_params`` / ``init_tparams`` /
    ``build_model``.  Every loop iteration performs one optimizer update on a
    window of at most ``batch_size`` consecutive frames from one random
    training song.  Progress is printed every ``dispFreq`` updates, a
    checkpoint is written to ``dumppath`` every ``saveFreq`` updates and the
    validation error is computed with ``pred_error`` every ``validFreq``
    updates.  The parameters with the lowest validation error are kept and
    restored before the final evaluation.  ``KeyboardInterrupt`` stops the
    loop but still runs the final evaluation and save.

    Notes:
        * ``encoder``, ``maxlen``, ``valid_batch_size``, ``noise_std``,
          ``use_dropout``, ``test_size`` and ``scaling`` are not read here;
          they only reach ``init_params`` / ``build_model`` via
          ``model_options``.
        * ``reload_model`` is used as the path handed to ``load_params``;
          passing ``True`` falls back to the legacy ``'lstm_model.npz'``.

    Returns:
        tuple: ``(train_err, valid_err, test_err)`` as computed by
        ``pred_error``, or ``(1., 1., 1.)`` when the training cost becomes
        NaN or infinite.
    """
    # Snapshot of the call arguments.  This must stay the first statement so
    # that no other local variable leaks into the options dictionary.
    model_options = locals().copy()
    print('model options %s' % (model_options,))

    print('Loading data')
    # the dataset is organized as:
    # X - n_songs * n_timesteps * dim_proj (dim_proj = 24 for chromagram based dataset)
    # y - n_songs * n_timesteps * 1
    train, valid, test = load_data_song(dataset=dataset,
                                        valid_portion=0.1,
                                        test_portion=0.1)
    print('data loaded')

    model_options['xdim'] = xdim
    model_options['dim_proj'] = dim_proj
    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        # Honour the caller-supplied path (it used to be ignored in favour of
        # a hard-coded file name); a bare ``True`` keeps the legacy file.
        reload_path = 'lstm_model.npz' if reload_model is True else reload_model
        load_params(reload_path, params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    # the model takes input of:
    # x -- n_timesteps * dim_proj * n_samples (in a simpler case, n_samples = 1 in ctc)
    # y -- n_timesteps * 1 * n_samples (in a simpler case, n_samples = 1 in ctc)
    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams,
                                                               model_options)

    if decay_c > 0.:
        # L2 penalty on the 'U' weights, scaled by decay_c.
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, y], cost, name='f_cost')

    # list(...) keeps this working when dict.values() returns a view.
    grads = T.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, y], grads, name='f_grad')

    lr = T.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, y, cost)

    print('Optimization')
    print('%d train examples' % len(train[0]))
    print('%d valid examples' % len(valid[0]))
    print('%d test examples' % len(test[0]))

    history_errs = []
    best_p = None
    # Was initialised under the misspelt name ``bad_count`` and therefore
    # only came into existence as a side effect of the first validation.
    bad_counter = 0
    uidx = 0  # the number of update done
    epochs_run = 0  # stays 0 when max_epochs <= 0 (``eidx`` would be unbound)
    start_time = time.time()

    try:
        for eidx in range(max_epochs):
            epochs_run = eidx + 1

            # Pick a random song, then a random window inside it.
            song_idx = numpy.random.randint(0, len(train[0]))
            song_len = len(train[0][song_idx])
            window = batch_size
            # Halve the window until it is strictly shorter than the song.
            # The ``window > 0`` guard ends the loop for an empty song, which
            # used to spin forever.
            while window > 0 and song_len <= window:
                window //= 2
            if window < 1:
                # A song with fewer than two frames cannot supply a window.
                continue
            start = numpy.random.randint(0, song_len - window)

            uidx += 1
            use_noise.set_value(1.)

            # Select the frames for this minibatch.
            x_batch = train[0][song_idx][start:start + window]
            y_batch = train[1][song_idx][start:start + window]

            batch_cost = f_grad_shared(x_batch, y_batch)
            f_update(lrate)

            if numpy.isnan(batch_cost) or numpy.isinf(batch_cost):
                print('NaN detected')
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch  %s Update  %s Cost  %s' %
                      (eidx, uidx, batch_cost))

            if dumppath and numpy.mod(uidx, saveFreq) == 0:
                sys.stdout.write('Saving... ')
                # save the best param set to date (best_p)
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(dumppath, history_errs=history_errs, **params)
                print('Done')

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_err = pred_error(f_pred, valid)
                # The second column is a placeholder; the test error is not
                # computed during training.
                history_errs.append([valid_err, 1])
                valid_history = numpy.array(history_errs)[:, 0]

                # keep the parameters only if the validation error is the
                # lowest seen so far
                if valid_err <= valid_history.min():
                    best_p = unzip(tparams)
                    bad_counter = 0

                # Formatted explicitly: print('Valid ', err) emits a tuple
                # repr under the Python 2 print statement.
                print('Valid  %s' % (valid_err,))

                # early stopping
                if (len(history_errs) > patience and
                        valid_err >= valid_history[:-patience].min()):
                    bad_counter += 1
                    if bad_counter > patience:
                        print('Early Stop!')
                        break
    except KeyboardInterrupt:
        print('Training interupted')

    end_time = time.time()

    # Restore the best parameters (or snapshot the current ones when no
    # validation pass ever ran) before the final evaluation.
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    train_err = pred_error(f_pred, train)
    valid_err = pred_error(f_pred, valid)
    test_err = pred_error(f_pred, test)
    print('Train  %s Valid  %s Test  %s' % (train_err, valid_err, test_err))

    if dumppath:
        numpy.savez(dumppath,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)

    elapsed = end_time - start_time
    print('The code run for %d epochs, with %f sec/epochs' %
          (epochs_run, elapsed / (1. * max(epochs_run, 1))))
    sys.stderr.write('Training took %.1fs\n' % elapsed)
    return train_err, valid_err, test_err
def train_lstm(
    dim_proj=None,
    xdim=None,
    ydim=None,
    patience=10,  # Number of epoch to wait before early stop if no progress
    max_epochs=500,  # The maximum number of epoch to run
    dispFreq=10,  # Display to stdout the training progress every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    # n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very
                         # hard to use, not recommended (probably needs
                         # momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    dumppath='bctc_model.npz',  # The best model will be saved there
    validFreq=5000,  # Compute the validation error after this number of update.
    saveFreq=10000,  # Save the parameters after every saveFreq updates
    maxlen=None,  # Sequence longer then this get ignored
    batch_size=100,  # The batch size during training.
    valid_batch_size=100,  # The batch size used for validation/test set.
    dataset=None,
    # Parameter for extra option
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worst test error
                       # This frequently need a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test example.
    scaling=1,
):
    """Train on random song windows with validation-based early stopping.

    The data is loaded with ``load_data_song`` (10% validation, 10% test) and
    the model is built with ``init_params`` / ``init_tparams`` /
    ``build_model``.  Every loop iteration performs one optimizer update on a
    window of at most ``batch_size`` consecutive frames from one random
    training song.  Progress is printed every ``dispFreq`` updates, a
    checkpoint is written to ``dumppath`` every ``saveFreq`` updates and the
    validation error is computed with ``pred_error`` every ``validFreq``
    updates.  The parameters with the lowest validation error are kept and
    restored before the final evaluation.  ``KeyboardInterrupt`` stops the
    loop but still runs the final evaluation and save.

    Notes:
        * ``encoder``, ``maxlen``, ``valid_batch_size``, ``noise_std``,
          ``use_dropout``, ``test_size`` and ``scaling`` are not read here;
          they only reach ``init_params`` / ``build_model`` via
          ``model_options``.
        * ``reload_model`` is used as the path handed to ``load_params``;
          passing ``True`` falls back to the legacy ``'lstm_model.npz'``.

    Returns:
        tuple: ``(train_err, valid_err, test_err)`` as computed by
        ``pred_error``.
    """
    # Snapshot of the call arguments.  This must stay the first statement so
    # that no other local variable leaks into the options dictionary.
    model_options = locals().copy()
    print('model options %s' % (model_options,))

    print('Loading data')
    # the dataset is organized as:
    # X - n_songs * n_timesteps * dim_proj (dim_proj = 24 for chromagram based dataset)
    # y - n_songs * n_timesteps * 1
    train, valid, test = load_data_song(dataset=dataset,
                                        valid_portion=0.1,
                                        test_portion=0.1)
    print('data loaded')

    model_options['xdim'] = xdim
    model_options['dim_proj'] = dim_proj
    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        # Honour the caller-supplied path (it used to be ignored in favour of
        # a hard-coded file name); a bare ``True`` keeps the legacy file.
        reload_path = 'lstm_model.npz' if reload_model is True else reload_model
        load_params(reload_path, params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    # the model takes input of:
    # x -- n_timesteps * dim_proj * n_samples (in a simpler case, n_samples = 1 in ctc)
    # y -- n_timesteps * 1 * n_samples (in a simpler case, n_samples = 1 in ctc)
    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams,
                                                               model_options)

    if decay_c > 0.:
        # L2 penalty on the 'U' weights, scaled by decay_c.
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, y], cost, name='f_cost')

    # list(...) keeps this working when dict.values() returns a view.
    grads = T.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, y], grads, name='f_grad')

    lr = T.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, y, cost)

    print('Optimization')
    print('%d train examples' % len(train[0]))
    print('%d valid examples' % len(valid[0]))
    print('%d test examples' % len(test[0]))

    history_errs = []
    best_p = None
    # Was initialised under the misspelt name ``bad_count`` and therefore
    # only came into existence as a side effect of the first validation.
    bad_counter = 0
    uidx = 0  # the number of update done
    epochs_run = 0  # stays 0 when max_epochs <= 0 (``eidx`` would be unbound)
    start_time = time.time()

    try:
        for eidx in range(max_epochs):
            epochs_run = eidx + 1

            # Pick a random song, then a random window inside it.
            song_idx = numpy.random.randint(0, len(train[0]))
            song_len = len(train[0][song_idx])
            window = batch_size
            # Halve the window until it is strictly shorter than the song.
            # The ``window > 0`` guard ends the loop for an empty song, which
            # used to spin forever.
            while window > 0 and song_len <= window:
                window //= 2
            if window < 1:
                # A song with fewer than two frames cannot supply a window.
                continue
            start = numpy.random.randint(0, song_len - window)

            uidx += 1
            use_noise.set_value(1.)

            # Select the frames for this minibatch.
            x_batch = train[0][song_idx][start:start + window]
            y_batch = train[1][song_idx][start:start + window]

            batch_cost = f_grad_shared(x_batch, y_batch)
            f_update(lrate)

            # NOTE(review): there is no NaN/Inf abort on the cost here (the
            # check was commented out) -- confirm that this is intended.

            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch  %s Update  %s Cost  %s' %
                      (eidx, uidx, batch_cost))

            if dumppath and numpy.mod(uidx, saveFreq) == 0:
                sys.stdout.write('Saving... ')
                # save the best param set to date (best_p)
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(dumppath, history_errs=history_errs, **params)
                print('Done')

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_err = pred_error(f_pred, valid)
                # The second column is a placeholder; the test error is not
                # computed during training.
                history_errs.append([valid_err, 1])
                valid_history = numpy.array(history_errs)[:, 0]

                # keep the parameters only if the validation error is the
                # lowest seen so far
                if valid_err <= valid_history.min():
                    best_p = unzip(tparams)
                    bad_counter = 0

                # Formatted explicitly: print ('Valid', err) emits a tuple
                # repr under the Python 2 print statement.
                print('Valid %s' % (valid_err,))

                # early stopping
                if (len(history_errs) > patience and
                        valid_err >= valid_history[:-patience].min()):
                    bad_counter += 1
                    if bad_counter > patience:
                        print('Early Stop!')
                        break
    except KeyboardInterrupt:
        print('Training interupted')

    end_time = time.time()

    # Restore the best parameters (or snapshot the current ones when no
    # validation pass ever ran) before the final evaluation.
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    train_err = pred_error(f_pred, train)
    valid_err = pred_error(f_pred, valid)
    test_err = pred_error(f_pred, test)
    print('Train  %s Valid  %s Test  %s' % (train_err, valid_err, test_err))

    if dumppath:
        numpy.savez(dumppath,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)

    elapsed = end_time - start_time
    print('The code run for %d epochs, with %f sec/epochs' %
          (epochs_run, elapsed / (1. * max(epochs_run, 1))))
    sys.stderr.write('Training took %.1fs\n' % elapsed)
    return train_err, valid_err, test_err
def train_lstm(
    dim_proj=None,
    xdim=None,
    ydim=None,
    patience=10,  # Number of epoch to wait before early stop if no progress
    n_epochs=500,  # The maximum number of epoch to run
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    # n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very
                         # hard to use, not recommended (probably needs
                         # momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    trainpath='../data/cv/',
    trainlist='../cvlist/JK-ch-1234-songwise.txt',
    validset='../data/cv/C-ch-songwise.mat',
    dumppath='../model/blstmrnn_model.npz',  # The best model will be saved there
    batch_size=100,  # The batch size during training.
    # Parameter for extra option
    noise_std=0.,
    earlystop=True,
    dropout=True,  # if False slightly faster, but worst test error
                   # This frequently need a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
):
    """Train the model on windows drawn in a class-balanced order.

    The data is loaded with ``load_data_song`` and the model is built with
    ``init_params`` / ``init_tparams`` / ``build_model``.  ``balanced`` is
    asked for a per-class table of ``(song index, position)`` pairs; the
    loop walks through a random permutation of the ``ydim`` classes
    (reshuffled every ``ydim`` iterations), picks a random pair of the
    current class and performs one optimizer update on the window of at most
    ``batch_size`` frames starting at that position.  Classes without any
    pair are skipped.

    Every 100 updates the validation error is computed with ``pred_error``;
    whenever it improves, the current parameters are written to ``dumppath``
    together with the training history and the best validation loss.
    Training ends after ``n_epochs`` loop iterations or, when ``earlystop``
    is true, as soon as the patience budget is used up.  A final checkpoint
    is always written to ``dumppath``.

    Notes:
        * The ``patience`` argument is recorded in ``model_options`` but the
          early-stopping budget is recomputed from the training-set size, so
          its value is otherwise ignored.
        * ``encoder``, ``noise_std`` and ``dropout`` are not read here; they
          only reach ``init_params`` / ``build_model`` via ``model_options``.
        * ``reload_model`` is used as the path handed to ``load_params``;
          passing ``True`` falls back to the legacy ``'lstm_model.npz'``.

    Returns:
        None.
    """
    # Snapshot of the call arguments.  This must stay the first statement so
    # that no other local variable leaks into the options dictionary.
    model_options = locals().copy()
    print('model options %s' % (model_options,))

    print('Loading data')
    # the dataset is organized as:
    # X - n_songs * n_timesteps * dim_proj (dim_proj = 24 for chromagram based dataset)
    # y - n_songs * n_timesteps * 1
    train, valid = load_data_song(trainpath=trainpath,
                                  trainlist=trainlist,
                                  validset=validset)
    print('data loaded')

    model_options['xdim'] = xdim
    model_options['dim_proj'] = dim_proj
    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        # Honour the caller-supplied path (it used to be ignored in favour of
        # a hard-coded file name); a bare ``True`` keeps the legacy file.
        reload_path = 'lstm_model.npz' if reload_model is True else reload_model
        load_params(reload_path, params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    # the model takes input of:
    # x -- n_timesteps * dim_proj * n_samples (in a simpler case, n_samples = 1 in ctc)
    # y -- n_timesteps * 1 * n_samples (in a simpler case, n_samples = 1 in ctc)
    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams,
                                                               model_options)

    if decay_c > 0.:
        # L2 penalty on the 'U' weights, scaled by decay_c.
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, y], cost, name='f_cost')

    # list(...) keeps this working when dict.values() returns a view.
    grads = T.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, y], grads, name='f_grad')

    lr = T.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, y, cost)

    print('Optimization')
    print('%d train examples' % len(train[0]))
    print('%d valid examples' % len(valid[0]))

    best_validation_loss = numpy.inf
    # 5000 is the assumed average number of frames per track and 10 is an
    # additional scaling factor.
    n_train_batches = len(train[0]) * 5000 // batch_size // 10
    # look at this many iterations regardless
    patience = min(10 * n_train_batches, 15000)
    patience_increase = 1.3  # wait this much longer when a new best is found
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.996
    validation_frequency = 100  # note here we manually set the validation freq
    training_history = []
    iteration = 0
    best_iter = 0
    start_time = time.time()

    # SP is a (songidx, pos) dictionary ordered by class label.
    # It has ydim entries.
    SP = balanced(ydim, train[1])
    classorder = numpy.random.permutation(ydim)

    for epoch in range(n_epochs):
        # Walk through the classes in a random order that is reshuffled at
        # the start of every pass over the ydim classes.
        class_pos = epoch % ydim
        if class_pos == 0:
            classorder = numpy.random.permutation(ydim)
        class_label = classorder[class_pos]

        # Nothing to draw from when the class has no (song, position) pair.
        if not SP[class_label]:
            continue

        pick = numpy.random.randint(len(SP[class_label]))
        (song_idx, start) = SP[class_label][pick]

        iteration += 1
        use_noise.set_value(1.)

        # Select the frames for this minibatch; slicing already clips the
        # window at the end of the song.
        x_batch = train[0][song_idx][start:start + batch_size]
        y_batch = train[1][song_idx][start:start + batch_size]

        f_grad_shared(x_batch, y_batch)
        f_update(lrate)

        if numpy.mod(iteration, validation_frequency) == 0:
            use_noise.set_value(0.)
            this_validation_loss = pred_error(f_pred, valid)
            training_history.append([iteration, this_validation_loss])
            print('epoch %i, validation error %f %%' %
                  (epoch, this_validation_loss * 100.))
            print('iter = %d' % iteration)
            print('patience = %d' % patience)

            if this_validation_loss < best_validation_loss:
                # improve patience if loss improvement is good enough
                if (this_validation_loss <
                        best_validation_loss * improvement_threshold):
                    patience = max(patience, iteration * patience_increase)

                # Record the new best *before* saving so that the checkpoint
                # stores the loss that belongs to the saved parameters.
                best_validation_loss = this_validation_loss
                best_iter = iteration
                params = unzip(tparams)
                numpy.savez(dumppath,
                            training_history=training_history,
                            best_validation_loss=best_validation_loss,
                            **params)
                print('best_validation_loss %f' % best_validation_loss)

        if earlystop and patience <= iteration:
            print('early-stopping')
            break

    end_time = time.time()

    # final save (params holds the best parameters seen, or the initial ones
    # when validation never improved)
    numpy.savez(dumppath,
                training_history=training_history,
                best_validation_loss=best_validation_loss,
                **params)

    # ``iteration`` is 1-based, so best_iter is reported as recorded.
    print(('Optimization complete with best validation score of %f %%, '
           'obtained at iteration %i, ') %
          (best_validation_loss * 100., best_iter))
    elapsed_minutes = (end_time - start_time) / 60.
    sys.stderr.write('The fine tuning code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % elapsed_minutes + '\n')