import numpy as np
import theano
from keras.preprocessing.sequence import pad_sequences
import conv_net_sentence  # Kim (2014) CNN-for-sentence-classification training code

# NOTE: read_RoB_data, to_token_indices, _get_init_vectors and
# load_trained_w2v_model are module-local helpers defined elsewhere;
# assumed sketches of the latter two appear further below.


def evaluate(self, embedding, train_data, validation_data, test_data, num_classes):
    """
    Evaluates the 'embedding' using a convolutional neural network for
    NLP (from Yoon Kim [2014]) on the given dataset.

    Parameters
    ----------
    embedding : An embedding which implements the Embedding interface
    train_data : A tuple of lists (docs, y) that constitutes the training data
    validation_data : A tuple of lists (docs, y) that constitutes the validation data
    test_data : A tuple of lists (docs, y) that constitutes the test data
    num_classes : The number of target classes (unused here; the output
        layer is hard-coded to 2 units)

    Returns
    -------
    A float, with the top accuracy achieved
    """
    # Load dataset; list() keeps the zipped (doc, y) pairs concatenable below
    train_set = list(zip(*train_data))
    validation_set = list(zip(*validation_data))
    test_set = list(zip(*test_data))

    # Longest document (in whitespace-separated tokens) across all three splits
    longest_doc = 0
    for (doc, y) in train_set + validation_set + test_set:
        l = len(doc.split(" "))
        if l > longest_doc:
            longest_doc = l

    # Train CNN
    perf = conv_net_sentence.train_conv_net(
        datasets=(train_set, validation_set, test_set),
        embedding=embedding,
        longest_doc=longest_doc,
        lr_decay=0.95,
        filter_hs=[3, 4, 5],
        conv_non_linear="relu",
        hidden_units=[100, 2],
        shuffle_batch=True,
        n_epochs=self.n_epochs,
        sqr_norm_lim=9,
        non_static=False,
        batch_size=self.batch_size,
        dropout_rate=[0.5],
    )
    print("perf: " + str(perf))
    return perf
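# A minimal usage sketch for evaluate(); everything here is hypothetical:
# the CNNEvaluator wrapper (assumed to expose n_epochs and batch_size
# attributes), the `embedding` object (anything implementing the Embedding
# interface), and the toy documents.
#
#   evaluator = CNNEvaluator(n_epochs=10, batch_size=50)
#   train_data = (["drug beat placebo", "no effect observed"], [1, 0])  # (docs, y)
#   validation_data = (["treatment group improved"], [1])
#   test_data = (["control group unchanged"], [0])
#   best_acc = evaluator.evaluate(embedding, train_data, validation_data,
#                                 test_data, num_classes=2)
#   print("best accuracy: " + str(best_acc))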
def RoB_CNN_theano(maxlen=2000):
    '''
    Process data for CNN classification via the theano implementation (Kim, 2014)
    '''
    # read in data (this also fits a vectorizer for us)
    train_docs, y_train, vectorizer = read_RoB_data(
        path="train-Xy-Random-sequence-generation.txt",
        y_tuples=False, max_features=50000)
    vocab_size = len(vectorizer.vocabulary_)
    print("vocabulary size of training data: " + str(vocab_size))

    test_docs, y_test = read_RoB_data(
        path="test-Xy-Random-sequence-generation.txt",
        fit_vectorizer=False, y_tuples=False)

    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # convert raw texts to word indices
    X_train = to_token_indices(train_docs, vectorizer)
    X_test = to_token_indices(test_docs, vectorizer)

    # read in pretrained word vectors
    wv = load_trained_w2v_model()
    wv_dim = wv.vector_size
    print("dimension of word embedding: " + str(wv_dim))

    # set initial word vectors for all token indices; prepend a zero
    # vector as the 0th word embedding (index 0 is the padding token)
    init_vectors, unk_vecs = _get_init_vectors(vectorizer, wv)
    W = np.vstack((np.zeros(wv_dim), init_vectors))
    W = W.astype(theano.config.floatX)
    # should be (vocab_size + 1) x wv_dim, e.g. 50001 x 200
    print("dimension of W matrix is: " + str(W.shape))

    # zero-pad sequences, with extra slack for the widest filter
    filter_heights = [3, 4, 5]
    pad_len = maxlen + 2 * (max(filter_heights) - 1)
    X_train = pad_sequences(X_train, maxlen=pad_len, padding="post", dtype=np.int32)
    X_test = pad_sequences(X_test, maxlen=pad_len, padding="post", dtype=np.int32)
    print('X_train shape: ', X_train.shape)
    print('X_test shape: ', X_test.shape)

    # Kim's code expects datasets to contain train and test matrices,
    # *with the labels as the last entries!*
    X_y_train = np.array(np.hstack((X_train, np.matrix(y_train).T)))
    X_y_test = np.array(np.hstack((X_test, np.matrix(y_test).T)))
    datasets = [X_y_train, X_y_test]

    # @TMP setting these to small values
    n_filters = 100  # number of feature maps per filter height
    batch_size = 50
    n_epochs = 20
    perf = conv_net_sentence.train_conv_net(datasets, W,
                                            img_w=W.shape[1],
                                            lr_decay=0.95,
                                            filter_hs=filter_heights,
                                            conv_non_linear="relu",
                                            hidden_units=[n_filters, 2],
                                            shuffle_batch=True,
                                            n_epochs=n_epochs,
                                            sqr_norm_lim=9,
                                            non_static=True,
                                            batch_size=batch_size,
                                            dropout_rate=[0.5])
    return perf
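# _get_init_vectors is referenced above but not shown. Below is a minimal
# sketch of a plausible implementation, assuming a scikit-learn vectorizer
# (with a term -> index `vocabulary_` dict) and a gensim KeyedVectors-style
# `wv`; the uniform(-0.25, 0.25) init for unknown words is an assumption,
# not necessarily what the real helper does.
def _get_init_vectors(vectorizer, wv):
    """Sketch (assumed): one row per vocabulary term, ordered by the
    vectorizer's integer indices. Terms missing from the pretrained model
    get small random vectors, returned separately for inspection."""
    wv_dim = wv.vector_size
    init_vectors = np.zeros((len(vectorizer.vocabulary_), wv_dim))
    unk_vecs = {}
    for term, idx in vectorizer.vocabulary_.items():
        if term in wv:
            init_vectors[idx] = wv[term]
        else:
            # assumed init scheme for out-of-vocabulary terms
            unk_vecs[term] = np.random.uniform(-0.25, 0.25, wv_dim)
            init_vectors[idx] = unk_vecs[term]
    return init_vectors, unk_vecs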
def RoB_CNN_theano(maxlen=4000):
    '''
    Process data for CNN classification via the theano implementation (Kim, 2014)
    '''
    # read in data (this also fits a vectorizer for us)
    train_docs, y_train, vectorizer = read_RoB_data(
        path="train-Xy-Random-sequence-generation.txt",
        y_tuples=False, max_features=50000)
    vocab_size = len(vectorizer.vocabulary_)

    test_docs, y_test = read_RoB_data(
        path="test-Xy-Random-sequence-generation.txt",
        fit_vectorizer=False, y_tuples=False)

    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # map to word indices
    X_train = to_token_indices(train_docs, vectorizer)
    X_test = to_token_indices(test_docs, vectorizer)

    # read in pretrained word vectors
    wv = load_trained_w2v_model()
    wv_dim = wv.vector_size

    # set initial word vectors for all token indices; as in the variant
    # above, prepend a zero vector as the 0th embedding (padding index)
    init_vectors, unk_vecs = _get_init_vectors(vectorizer, wv)
    W = np.vstack((np.zeros(wv_dim), init_vectors)).astype(np.float32)

    # zero-pad sequences, with extra slack for the widest filter
    # @TMP trying wider filters; default was [3, 4, 5]
    # filter_heights = [3, 4, 5]
    filter_heights = [9, 10, 11]
    pad_len = maxlen + 2 * (max(filter_heights) - 1)
    X_train = pad_sequences(X_train, maxlen=pad_len, padding="post", dtype=np.int32)
    X_test = pad_sequences(X_test, maxlen=pad_len, padding="post", dtype=np.int32)
    print('X_train shape: ', X_train.shape)

    # Kim's code expects datasets to contain train and test matrices,
    # *with the labels as the last entries!*
    X_y_train = np.array(np.hstack((X_train, np.matrix(y_train).T)))
    X_y_test = np.array(np.hstack((X_test, np.matrix(y_test).T)))
    datasets = [X_y_train, X_y_test]

    # @TMP setting these to small values
    n_filters = 100  # number of feature maps per filter height
    batch_size = 25
    n_epochs = 10
    perf = conv_net_sentence.train_conv_net(datasets, W,
                                            img_w=W.shape[1],
                                            lr_decay=0.95,
                                            filter_hs=filter_heights,
                                            conv_non_linear="relu",
                                            hidden_units=[n_filters, 2],
                                            shuffle_batch=True,
                                            n_epochs=n_epochs,
                                            sqr_norm_lim=9,
                                            non_static=True,
                                            batch_size=batch_size,
                                            dropout_rate=[0.25])
    return perf
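# Both variants above assume to_token_indices maps each document to a list
# of integer ids with 0 reserved for padding, which is why a zero row is
# prepended to W. A minimal sketch under that assumption (the +1 shift and
# the dropping of out-of-vocabulary tokens are assumptions, not the
# confirmed behavior of the real helper):
def to_token_indices(docs, vectorizer):
    """Sketch (assumed): convert raw texts to lists of token ids, shifting
    the vectorizer's vocabulary indices by +1 so id 0 stays free for
    pad_sequences' padding (and lines up with W's zero row)."""
    analyze = vectorizer.build_analyzer()
    vocab = vectorizer.vocabulary_
    return [[vocab[tok] + 1 for tok in analyze(doc) if tok in vocab]
            for doc in docs]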