def load_data( maxlen=3000 ):
    ''' Load dataset '''
    train, valid, test = imdb.load_data()
    tr_inp, _, tr_targ = imdb.prepare_data( train[0], train[1], maxlen=maxlen )
    te_inp, _, te_targ = imdb.prepare_data( test[0], test[1], maxlen=maxlen )
    v_inp, _, v_targ = imdb.prepare_data( valid[0], valid[1], maxlen=maxlen )

    train = shuffle( np.transpose( tr_inp ), reformat( np.asarray( tr_targ ), 2 ) )
    test = shuffle( np.transpose( te_inp ), reformat( np.asarray( te_targ ), 2 ) )
    valid = shuffle( np.transpose( v_inp ), reformat( np.asarray( v_targ ), 2 ) )

    print "Train shape : {}, {}".format( train[0].shape, train[1].shape )
    print "Test shape  : {}, {}".format( test[0].shape, test[1].shape )
    print "Valid shape : {}, {}".format( valid[0].shape, valid[1].shape )

    imdb_dict = pickle.load( open( 'imdb.dict.pkl', 'rb' ) )
    return train, test, valid, imdb_dict
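The shuffle and reformat helpers above are project-specific and not shown in this example. A minimal sketch of what they plausibly do, assuming reformat one-hot encodes the integer labels and shuffle permutes inputs and targets in unison (both implementations are assumptions, not the original code):

import numpy as np

def reformat(labels, n_classes):
    # Hypothetical helper: one-hot encode integer labels into (n_samples, n_classes).
    return (np.arange(n_classes) == labels[:, None]).astype(np.float32)

def shuffle(inputs, targets):
    # Hypothetical helper: apply the same random permutation to inputs and targets.
    perm = np.random.permutation(len(inputs))
    return inputs[perm], targets[perm]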
def main(unused_args):
    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05,
                                        maxlen=maxlen)
    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)
    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    #eval_config.batch_size = 1
    #eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op)
            print("Training Accuracy = %.4f, time = %.3f seconds\n" %
                  (train_acc, time.time() - start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)
            test_acc = run_epoch(session, mtest, test, tf.no_op())
            print("Test Accuracy = %.4f\n" % test_acc)
def generate_data(self):
    '''Load the dataset and generate the train and valid sets.'''
    print("Loading data...")
    train, valid, _ = load_data(path=self.path)
    self.X_train, self.X_mask_train, self.Y_train = prepare_data(train[0], train[1],
                                                                 maxlen=self.maxlen)
    self.X_valid, self.X_mask_valid, self.Y_valid = prepare_data(valid[0], valid[1],
                                                                 maxlen=self.maxlen)
    del train, valid

    print(len(self.X_train), 'train sequences')
    print(len(self.X_valid), 'valid sequences')

    print("Pad sequences (samples x time)")
    self.X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
    self.X_valid = sequence.pad_sequences(self.X_valid, maxlen=self.maxlen)
    print('X_train shape:', self.X_train.shape)
    print('X_valid shape:', self.X_valid.shape)
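As a quick sanity check on the padding step, sequence.pad_sequences (presumably Keras's keras.preprocessing.sequence.pad_sequences) pads or truncates each sequence to maxlen, by default on the left:

from keras.preprocessing import sequence

# Shorter sequences are zero-padded on the left; longer ones are truncated from the left.
sequence.pad_sequences([[1, 2, 3], [4, 5]], maxlen=4)
# -> array([[0, 1, 2, 3],
#           [0, 0, 4, 5]], dtype=int32)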
def main(unused_args):
    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05,
                                        maxlen=maxlen)
    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)
    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op)
            print("Training Accuracy = %.4f, time = %.3f seconds\n" %
                  (train_acc, time.time() - start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)
            test_acc = run_epoch(session, mtest, test, tf.no_op())
            print("Test Accuracy = %.4f\n" % test_acc)
print('Build model ')
X, Mask, Y, \
    cost, err, \
    train_function, valid_function, predict_function = build_model(vocab_size=vocab_size,
                                                                   embsize=embsize,
                                                                   hiddensize=hiddensize)

print('Training ')
for eidx in range(max_epochs):
    kf = get_minibatches_idx(len(train[0]), mini_batch_size, shuffle=True)
    costs = []
    errs = []
    for _, train_index in kf:
        # Select the random examples for this minibatch
        y = [train[1][t] for t in train_index]
        x = [train[0][t] for t in train_index]

        # Get the data in numpy.ndarray format.
        # This swaps the axes!
        # Returns something of shape (minibatch maxlen, n samples).
        x, mask, y = imdb.prepare_data(x, y)

        cost, err = train_function(x, mask, y)
        if np.isnan(cost) or np.isnan(err):
            continue
        costs.append(float(cost))
        errs.append(float(err))

    costs = np.array(costs)
    errs = np.array(errs)
    print "Epoch {0}: Cost {1} Err {2}".format(eidx, np.mean(costs), np.mean(errs))
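Several of these examples lean on a get_minibatches_idx helper to chop the training indices into minibatches. A minimal sketch consistent with how it is used above (yielding (batch_index, index_array) pairs, last batch possibly smaller); this is an approximation of the Theano LSTM tutorial helper, not a verbatim copy:

import numpy as np

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    # Split the indices 0..n-1 into consecutive minibatches,
    # optionally shuffling them first.
    idx_list = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    if start != n:
        minibatches.append(idx_list[start:])
    return list(zip(range(len(minibatches)), minibatches))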
def train_lstm(
    dim_proj=128,  # Word embedding dimension and number of LSTM hidden units.
    patience=10,  # Number of epochs to wait before early stop if no progress.
    max_epochs=50,  # The maximum number of epochs to run.
    dispFreq=10,  # Display the training progress to stdout every N updates.
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop).
    n_words=10000,  # Vocabulary size.
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use
                         # and not recommended (it probably needs momentum and a
                         # decaying learning rate).
    encoder='lstm',  # TODO: can be removed, must be lstm.
    saveto='save/lstm_model.npz',  # The best model will be saved there.
    validFreq=370,  # Compute the validation error after this number of updates.
    saveFreq=50,  # Save the parameters after every saveFreq updates.
    maxlen=100,  # Sequences longer than this get ignored.
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for the validation/test set.
    dataset='imdb',

    # Parameters for extra options
    noise_std=0.,
    use_dropout=True,  # If False, slightly faster but worse test error.
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
):
    # Model options
    model_options = locals().copy()
    print("model options", model_options)

    #load_data, prepare_data = get_dataset(dataset)

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05,
                                        maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep a random
        # selection of examples, so we shuffle the indices first.
        idx = np.arange(len(test[0]))
        np.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = np.max(train[1]) + 1
    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano Shared Variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = T.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = T.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = T.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format.
                # This swaps the axes!
                # Returns something of shape (minibatch maxlen, n samples).
                x, mask, y = imdb.prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if saveto and np.mod(uidx, saveFreq) == 0:
                    print('Saving...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print('Done')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, imdb.prepare_data, train, kf)
                    valid_err = pred_error(f_pred, imdb.prepare_data, valid, kf_valid)
                    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interrupted")

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, imdb.prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, imdb.prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)

    if saveto:
        np.savez(saveto, train_err=train_err,
                 valid_err=valid_err, test_err=test_err,
                 history_errs=history_errs, **best_p)

    print('The code ran for %d epochs, with %f sec/epoch' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    print(('Training took %.1fs' % (end_time - start_time)), file=sys.stderr)
    return train_err, valid_err, test_err
def prepare_data_sp(x, y, maxlen=None):
    # Transpose the padded inputs and mask from (maxlen, n_samples) to (n_samples, maxlen).
    x, mask, y = imdb.prepare_data(x, y, maxlen)
    return (x.transpose(), mask.transpose(), y)
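prepare_data_sp transposes because imdb.prepare_data lays its outputs out as (maxlen, n_samples), as the minibatch loops above note. A rough numpy sketch of that padding-and-masking behaviour (an approximation under the name prepare_data_sketch, not the exact tutorial helper):

import numpy as np

def prepare_data_sketch(seqs, labels, maxlen=None):
    # Optionally drop sequences longer than maxlen, then zero-pad the rest
    # into a (max_len, n_samples) matrix plus a matching float mask.
    if maxlen is not None:
        kept = [(s, l) for s, l in zip(seqs, labels) if len(s) < maxlen]
        if not kept:
            return None, None, None
        seqs, labels = zip(*kept)
    lengths = [len(s) for s in seqs]
    max_len, n_samples = max(lengths), len(seqs)
    x = np.zeros((max_len, n_samples), dtype='int64')
    mask = np.zeros((max_len, n_samples), dtype='float32')
    for i, s in enumerate(seqs):
        x[:lengths[i], i] = s
        mask[:lengths[i], i] = 1.
    return x, mask, np.asarray(labels)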
train_function, valid_function, predict_function = build_model(vocab_size=vocab_size,
                                                               embsize=embsize,
                                                               hiddensize=hiddensize)

print('Training ')
for eidx in range(max_epochs):
    kf = get_minibatches_idx(len(train[0]), mini_batch_size, shuffle=True)
    costs = []
    errs = []
    for _, train_index in kf:
        # Select the random examples for this minibatch
        y = [train[1][t] for t in train_index]
        x = [train[0][t] for t in train_index]

        # Get the data in numpy.ndarray format.
        # This swaps the axes!
        # Returns something of shape (minibatch maxlen, n samples).
        x, mask, y = imdb.prepare_data(x, y)

        cost, err = train_function(x, mask, y)
        if np.isnan(cost) or np.isnan(err):
            continue
        costs.append(float(cost))
        errs.append(float(err))

    costs = np.array(costs)
    errs = np.array(errs)
    print "Epoch {0}: Cost {1} Err {2}".format(eidx, np.mean(costs), np.mean(errs))
from imdb import load_data, prepare_data
import numpy as np
import pickle as pkl

train, valid, test = load_data(n_words=10, valid_portion=0.05)

x = [train[0][t] for t in range(0, len(train[0]))]
y = [train[1][t] for t in range(0, len(train[1]))]
x, mask, y = prepare_data(x, y)
y = np.array(y)

# Expand each word index into a one-hot vector of size n_words (10 here).
feat_train = np.zeros((x.shape[0], x.shape[1], 10))
for i in range(0, x.shape[0]):
    print "num: " + str(i)
    for j in range(0, x.shape[1]):
        feat_train[i][j][x[i][j]] = 1

np.save("data/feats_train.npy", feat_train)
np.save("data/labels_train.npy", y)
np.save("data/mask_train.npy", mask)
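The nested loops above build the one-hot features one element at a time. If the dense (maxlen, n_samples, n_words) array fits in memory anyway, the same tensor can be produced in a single vectorised step; a sketch, assuming x holds integer indices smaller than n_words:

import numpy as np

n_words = 10
# x has shape (maxlen, n_samples); indexing an identity matrix with it
# yields the (maxlen, n_samples, n_words) one-hot tensor directly.
feat_train = np.eye(n_words, dtype=np.float32)[x]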