def main():
    """Entry point: load pre-trained word2vec features, split off a dev set, and train the CNN."""
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('data/GoogleNews-vectors-negative300.bin')
    embeddings = data_helpers.load_pretrained_word2vec('data/rt.vec')
    features, labels = data_helpers.load_data_with_word2vec(embeddings)

    # Deterministic shuffle so repeated runs produce the same split.
    np.random.seed(10)
    order = np.random.permutation(np.arange(len(labels)))
    features, labels = features[order], labels[order]

    # Hold out the last 1000 shuffled samples as the dev set.
    train_x, dev_x = features[:-1000], features[-1000:]
    train_y, dev_y = labels[:-1000], labels[-1000:]
    print('Train/Dev split: %d/%d' % (len(train_y), len(dev_y)))
    print('train shape:', train_x.shape)
    print('dev shape:', dev_x.shape)

    # Add a singleton channel axis: (samples, 1, sentence_len, embed_dim) for the convolution input.
    train_x = np.reshape(train_x, (train_x.shape[0], 1, train_x.shape[1], train_x.shape[2]))
    dev_x = np.reshape(dev_x, (dev_x.shape[0], 1, dev_x.shape[1], dev_x.shape[2]))

    num_embed = train_x.shape[-1]
    sentence_size = train_x.shape[2]
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)

    batch_size = 50
    model = setup_cnn_model(mx.gpu(1), batch_size, sentence_size, num_embed, dropout=0.5)
    train_cnn(model, train_x, train_y, dev_x, dev_y, batch_size)
def main():
    """Train the CNN sentence classifier on pre-trained word2vec features.

    Loads the rt.vec embeddings, reshapes the data to a
    (samples, 1, sentence_len, embed_dim) tensor, holds out the last 1000
    shuffled samples as a dev set, and trains on GPU 0 with batch size 50.
    """
    # Fix: Python 2 `print` statements are a SyntaxError under Python 3 and
    # inconsistent with the sibling blocks in this file; converted to the
    # print() function with identical output.
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('data/GoogleNews-vectors-negative300.bin')
    word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
    x, y = data_helpers.load_data_with_word2vec(word2vec)

    # Randomly shuffle data with a fixed seed so the split is reproducible.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set: last 1000 shuffled samples form the dev set.
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)

    # Reshape to (samples, channels=1, sentence_len, embed_dim) for convolution input.
    x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1], x_train.shape[2]))
    x_dev = np.reshape(x_dev, (x_dev.shape[0], 1, x_dev.shape[1], x_dev.shape[2]))

    num_embed = x_train.shape[-1]
    sentence_size = x_train.shape[2]
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)

    batch_size = 50
    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, dropout=0.5)
    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    """Build train/valid NDArrayIters for the sentence-classification CNN.

    Returns (train, valid, sentence_size, embed_size, vocab_size);
    vocab_size is -1 when pre-trained word2vec features are used.
    """
    logger.info('Loading data...')
    if pre_trained_word2vec:
        w2v = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(w2v)
        # Insert a channel axis: (samples, 1, sentence_len, embed_dim) for convolution input.
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size, sentence_size, vocab_size = x.shape[-1], x.shape[2], -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size, sentence_size, vocab_size = num_embed, x.shape[1], len(vocab)

    # Deterministic shuffle before splitting.
    np.random.seed(10)
    perm = np.random.permutation(np.arange(len(y)))
    x, y = x[perm], y[perm]

    # Last 1000 shuffled samples become the validation set.
    x_train, x_dev = x[:-1000], x[-1000:]
    y_train, y_dev = y[:-1000], y[-1000:]

    logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    logger.info('train shape: %(shape)s', {'shape': x_train.shape})
    logger.info('valid shape: %(shape)s', {'shape': x_dev.shape})
    logger.info('sentence max words: %(shape)s', {'shape': sentence_size})
    logger.info('embedding size: %(msg)s', {'msg': embed_size})
    logger.info('vocab size: %(msg)s', {'msg': vocab_size})

    train = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)
    return train, valid, sentence_size, embed_size, vocab_size
def data_iter(batch_size, num_embed, pre_trained_word2vec=False): """Construct data iter Parameters ---------- batch_size: int num_embed: int pre_trained_word2vec: boolean identify the pre-trained layers or not Returns ---------- train_set: DataIter Train DataIter valid: DataIter Valid DataIter sentences_size: int array dimensions embedded_size: int array dimensions vocab_size: int array dimensions """ print('Loading data...') if pre_trained_word2vec: word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec') x, y = data_helpers.load_data_with_word2vec(word2vec) # reshape for convolution input x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2])) embedded_size = x.shape[-1] sentences_size = x.shape[2] vocabulary_size = -1 else: x, y, vocab, vocab_inv = data_helpers.load_data() embedded_size = num_embed sentences_size = x.shape[1] vocabulary_size = len(vocab) # randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # split train/valid set x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:] y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:] print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev))) print('train shape:', x_train.shape) print('valid shape:', x_dev.shape) print('sentence max words', sentences_size) print('embedding size', embedded_size) print('vocab size', vocabulary_size) train_set = mx.io.NDArrayIter( x_train, y_train, batch_size, shuffle=True) valid = mx.io.NDArrayIter( x_dev, y_dev, batch_size) return train_set, valid, sentences_size, embedded_size, vocabulary_size
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    """Load the dataset and wrap it in mx.io.NDArrayIter train/valid iterators.

    When pre_trained_word2vec is True, features come from data/rt.vec and
    vocab_size is reported as -1; otherwise a vocabulary is built and
    num_embed is used as the embedding size.
    """
    print('Loading data...')
    if not pre_trained_word2vec:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size = num_embed
        sentence_size = x.shape[1]
        vocab_size = len(vocab)
    else:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # Reshape to (samples, 1, sentence_len, embed_dim) for convolution input.
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size = x.shape[-1]
        sentence_size = x.shape[2]
        vocab_size = -1

    # Seeded shuffle keeps the train/valid split stable across runs.
    np.random.seed(10)
    shuffled = np.random.permutation(np.arange(len(y)))
    x, y = x[shuffled], y[shuffled]

    # The last 1000 shuffled samples serve as the validation split.
    x_train, y_train = x[:-1000], y[:-1000]
    x_dev, y_dev = x[-1000:], y[-1000:]

    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentence_size)
    print('embedding size', embed_size)
    print('vocab size', vocab_size)

    train = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)
    return (train, valid, sentence_size, embed_size, vocab_size)