import numpy as np
import mxnet as mx

import data_helpers


def main():
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('data/GoogleNews-vectors-negative300.bin')
    word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
    x, y = data_helpers.load_data_with_word2vec(word2vec)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)

    # reshape for convolution input: (batch, channel=1, sentence_len, embed_dim)
    x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1], x_train.shape[2]))
    x_dev = np.reshape(x_dev, (x_dev.shape[0], 1, x_dev.shape[1], x_dev.shape[2]))

    num_embed = x_train.shape[-1]
    sentence_size = x_train.shape[2]
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)

    # setup_cnn_model and train_cnn are defined elsewhere in this script
    batch_size = 50
    cnn_model = setup_cnn_model(mx.gpu(1), batch_size, sentence_size, num_embed, dropout=0.5)
    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
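# data_helpers is not included in this excerpt. As a rough sketch of the
# interface main() assumes, load_pretrained_word2vec could parse a plain-text
# vector file such as 'data/rt.vec' (one word per line, followed by its
# embedding values) into a {word: np.ndarray} dict. The file layout here is an
# assumption, not something the source confirms.
def load_pretrained_word2vec_sketch(path):
    """Hypothetical loader: each line holds a word and whitespace-separated floats."""
    vectors = {}
    with open(path) as vec_file:
        for line in vec_file:
            parts = line.rstrip().split()
            if len(parts) < 2:
                continue  # skip headers or blank lines
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return vectors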
import logging

logger = logging.getLogger(__name__)


def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    logger.info('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size = x.shape[-1]
        sentence_size = x.shape[2]
        vocab_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size = num_embed
        sentence_size = x.shape[1]
        vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    logger.info('Train/Valid split: %d/%d', len(y_train), len(y_dev))
    logger.info('train shape: %s', x_train.shape)
    logger.info('valid shape: %s', x_dev.shape)
    logger.info('sentence max words: %d', sentence_size)
    logger.info('embedding size: %d', embed_size)
    logger.info('vocab size: %d', vocab_size)

    train = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)
    return train, valid, sentence_size, embed_size, vocab_size
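# Note: the logger.info calls above emit nothing unless logging is configured
# by the entry point; a minimal setup (placed in the script's __main__ block)
# would be:
#
#   logging.basicConfig(level=logging.INFO,
#                       format='%(asctime)s %(levelname)s %(message)s')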
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    """Construct the train/valid data iterators.

    Parameters
    ----------
    batch_size : int
        Number of examples per batch.
    num_embed : int
        Embedding size to use when no pre-trained vectors are given.
    pre_trained_word2vec : bool
        Whether to load pre-trained word2vec embeddings.

    Returns
    -------
    train_set : DataIter
        Training DataIter.
    valid : DataIter
        Validation DataIter.
    sentences_size : int
        Maximum number of words per sentence.
    embedded_size : int
        Word-embedding dimensionality.
    vocabulary_size : int
        Vocabulary size (-1 when pre-trained vectors are used).
    """
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embedded_size = x.shape[-1]
        sentences_size = x.shape[2]
        vocabulary_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embedded_size = num_embed
        sentences_size = x.shape[1]
        vocabulary_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentences_size)
    print('embedding size', embedded_size)
    print('vocab size', vocabulary_size)

    train_set = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)
    return train_set, valid, sentences_size, embedded_size, vocabulary_size
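# A minimal sketch of how the iterators returned by data_iter could drive
# training through MXNet's Module API. The symbol `sym` stands in for the CNN
# definition, which is outside this excerpt; its loss layer is assumed to be
# named 'softmax' so that NDArrayIter's default label name 'softmax_label'
# matches. Optimizer and learning rate here are illustrative choices, not
# values taken from the source.
def train_with_module_sketch(sym, batch_size=50, num_epoch=10):
    train_set, valid, _, _, _ = data_iter(batch_size, num_embed=300,
                                          pre_trained_word2vec=True)
    mod = mx.mod.Module(symbol=sym, context=mx.cpu())
    mod.fit(train_set,
            eval_data=valid,
            eval_metric='acc',
            optimizer='rmsprop',
            optimizer_params={'learning_rate': 0.0005},
            num_epoch=num_epoch,
            batch_end_callback=mx.callback.Speedometer(batch_size, 50))
    return mod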
        # tail of the dev-set evaluation loop
        m.cnn_exec.forward(is_train=False)
        num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
        num_total += len(batchY)

    dev_acc = num_correct * 100 / float(num_total)
    print('Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f '
          '--- Dev Accuracy thus far: %.3f'
          % (iteration, train_time, train_acc, dev_acc), file=logs)


if __name__ == '__main__':
    mix_model = mix_cnn()
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('/Users/guo/TrainData/google300/GoogleNews-vectors-negative300.bin')
    word2vec = data_helpers.load_pretrained_word2vec('VecForMR_.txt')
    sentences, labels = data_helpers.load_data_and_labels()
    sentences_padded = data_helpers.pad_sentences(sentences)
    x, y = data_helpers.build_input_data_with_word2vec(sentences_padded, labels, word2vec)

    mix_model.dic = data_helpers.buildGram(sentences, min1=6, min2=7)
    mix_model.initTheta()
    x_sent, mix_model.idf = data_helpers.buildDocsTFIDF(mix_model.dic, sentences)
    x_sent = np.array(x_sent)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))

    # hold out fold `cv` of ten as the test split
    cv = 1
    cv_length = len(y) // 10  # integer division so the slice bounds stay ints
    sample_test = shuffle_indices[cv_length * cv:cv_length * (cv + 1)]
    sample_train = np.concatenate((shuffle_indices[:cv_length * cv],
                                   shuffle_indices[cv_length * (cv + 1):]))
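# For reference, the same index arithmetic extended to all ten folds; a sketch
# only (the script above fixes cv = 1 rather than looping).
def ten_fold_indices(num_examples):
    """Yield (train_indices, test_indices) for each of ten folds."""
    indices = np.random.permutation(np.arange(num_examples))
    fold = num_examples // 10
    for cv in range(10):
        test = indices[fold * cv:fold * (cv + 1)]
        train = np.concatenate((indices[:fold * cv],
                                indices[fold * (cv + 1):]))
        yield train, test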