Example #1
    def evaluate(self, embedding, train_data, validation_data, test_data, num_classes):
        """

        Evaluates the 'embedding' using a convolutional neural network for NLP (from Yoon Kim [2014]) on 'dataset'

        Parameters
        ----------
        embedding      :     An embedding which implements the Embedding interface
        train_data     ;     A tuple of lists (docs, y) that constitutes the training data
        validation_data:     A tuple of lists (docs, y) that constitutes the validation data

        Returns        :     A float, with the top accuracy achieved
        -------

        """

        # Load dataset
        train_set = list(zip(*train_data))
        validation_set = list(zip(*validation_data))
        test_set = list(zip(*test_data))

        # Length (in tokens) of the longest document across all three splits
        longest_doc = max(len(doc.split(" "))
                          for (doc, _) in train_set + validation_set + test_set)

        # Train CNN
        perf = conv_net_sentence.train_conv_net(
            datasets=(train_set, validation_set, test_set),
            embedding=embedding,
            longest_doc=longest_doc,
            lr_decay=0.95,
            filter_hs=[3, 4, 5],
            conv_non_linear="relu",
            hidden_units=[100, 2],
            shuffle_batch=True,
            n_epochs=self.n_epochs,
            sqr_norm_lim=9,
            non_static=False,
            batch_size=self.batch_size,
            dropout_rate=[0.5],
        )
        print "perf: " + str(perf)

        return perf
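The 'embedding' argument above is only described as an object implementing the Embedding interface. For orientation, here is a minimal, hypothetical stand-in for such an object; the class, attribute, and method names are illustrative assumptions rather than the project's actual interface. It reserves index 0 for padding and backs lookups with a NumPy matrix, which is the general shape of data the CNN code consumes.

import numpy as np

class RandomEmbedding(object):
    """Hypothetical Embedding stand-in: a vocabulary plus a weight matrix."""

    def __init__(self, words, dim=200, seed=1234):
        rng = np.random.RandomState(seed)
        # index 0 is reserved for padding, so real words start at 1
        self.vocab = {w: i + 1 for i, w in enumerate(words)}
        self.dim = dim
        self.W = np.vstack((np.zeros((1, dim)),
                            rng.uniform(-0.25, 0.25, (len(words), dim))))

    def lookup(self, word):
        # unknown words fall back to the zero/padding vector
        return self.W[self.vocab.get(word, 0)]

emb = RandomEmbedding(["random", "sequence", "generation"])
print(emb.lookup("random").shape)  # (200,)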
Example #2

def RoB_CNN_theano(maxlen=2000):
    '''
    Process data for CNN classification via the 
    theano implementation (Kim, 2014)
    '''

    ###
    # read in data (this also fits a vectorizer for us)
    train_docs, y_train, vectorizer = read_RoB_data(
        path="train-Xy-Random-sequence-generation.txt",
        y_tuples=False,
        max_features=50000)
    vocab_size = len(vectorizer.vocabulary_)
    print("vocabulary size of training data: " + str(vocab_size))
    test_docs, y_test = read_RoB_data(
        path="test-Xy-Random-sequence-generation.txt",
        fit_vectorizer=False,
        y_tuples=False)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # map to word indices
    X_train = to_token_indices(train_docs, vectorizer)  # convert raw texts to word indices
    X_test = to_token_indices(test_docs, vectorizer)

    # read in pretrained word vectors
    wv = load_trained_w2v_model()
    wv_dim = wv.vector_size
    print("dimension of word embedding: " + str(wv_dim))
    # set initial word vectors for all token indices
    init_vectors, unk_vecs = _get_init_vectors(vectorizer, wv)
    W = init_vectors.astype(theano.config.floatX)
    # add a zero vector as the 0th word embedding
    W = np.vstack((np.zeros(wv_dim), W))
    W = W.astype(theano.config.floatX)
    print("dimension of W matrix is: " + str(W.shape))  # should be 50001 x 200
    # zero-pad sentences so the widest filter can be applied at the document edges
    # @TMP only one set of filters
    filter_heights = [3, 4, 5]
    pad_len = maxlen + 2 * (max(filter_heights) - 1)
    X_train = pad_sequences(X_train,
                            maxlen=pad_len,
                            padding="post",
                            dtype=np.int32)  # X_train holds padded word indices
    X_test = pad_sequences(X_test,
                           maxlen=pad_len,
                           padding="post",
                           dtype=np.int32)
    print('X_train shape: ', X_train.shape)
    print('X_test shape: ', X_test.shape)
    '''
    Kim's code expects datasets to contain train and test
    matrices, *with the labels as the last entries!*
    (See the toy illustration of this layout after this function.)
    '''
    X_y_train = np.array(np.hstack((X_train, np.matrix(y_train).T)))
    X_y_test = np.array(np.hstack((X_test, np.matrix(y_test).T)))
    datasets = [X_y_train, X_y_test]

    # @TMP setting these to small values
    n_filters = 100  # number of feature maps per height
    batch_size = 50
    n_epochs = 20
    perf = conv_net_sentence.train_conv_net(datasets,
                                            W,
                                            img_w=W.shape[1],
                                            lr_decay=0.95,
                                            filter_hs=filter_heights,
                                            conv_non_linear="relu",
                                            hidden_units=[n_filters, 2],
                                            shuffle_batch=True,
                                            n_epochs=n_epochs,
                                            sqr_norm_lim=9,
                                            non_static=True,
                                            batch_size=batch_size,
                                            dropout_rate=[0.5])

    return perf
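The np.hstack calls in this function build the layout described in the comment above: each row of X_y_train / X_y_test is a padded sequence of word indices with the class label appended as the last column, which is what Kim's train_conv_net expects. A tiny self-contained illustration with made-up numbers (only NumPy is needed):

import numpy as np

X = np.array([[3, 7, 12, 0, 0],
              [5, 2, 0, 0, 0]], dtype=np.int32)  # padded word indices (toy values)
y = np.array([1, 0])                             # class labels
X_y = np.array(np.hstack((X, np.matrix(y).T)))   # same construction as above
print(X_y)
# [[ 3  7 12  0  0  1]
#  [ 5  2  0  0  0  0]]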
Example #3
def RoB_CNN_theano(maxlen=4000):
    '''
    Process data for CNN classification via the 
    theano implementation (Kim, 2014)
    '''
    
    ###
    # read in data (this also fits a vectorizer for us)
    train_docs, y_train, vectorizer = read_RoB_data(path="train-Xy-Random-sequence-generation.txt", 
                                        y_tuples=False, max_features=50000)
    vocab_size = len(vectorizer.vocabulary_) 
    test_docs, y_test = read_RoB_data(path="test-Xy-Random-sequence-generation.txt", 
                                        fit_vectorizer=False,
                                        y_tuples=False)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # map to word indices
    X_train = to_token_indices(train_docs, vectorizer)
    X_test  = to_token_indices(test_docs, vectorizer)

    # read in pretrained word vectors
    wv = load_trained_w2v_model()
    wv_dim = wv.vector_size 
    # set initial word vectors for all token indices
    init_vectors, unk_vecs = _get_init_vectors(vectorizer, wv)
    W = init_vectors.astype(np.float32) 

    # zero-pad sentences (see the note on pad_len after this function)
    ## @TMP only one set of filters
    #filter_heights = [3,4,5]
    filter_heights = [9,10,11]
    pad_len = maxlen + 2*(max(filter_heights)-1)
    X_train = pad_sequences(X_train, maxlen=pad_len, padding="post", dtype=np.int32)
    X_test  = pad_sequences(X_test, maxlen=pad_len, padding="post", dtype=np.int32)
    print('X_train shape: ', X_train.shape)

    ''' 
    Kim's code expects datasets to contain train and test 
    matrices, *with the labels as the last entries!*
    '''
    X_y_train = np.array(np.hstack((X_train, np.matrix(y_train).T)))
    X_y_test  = np.array(np.hstack((X_test, np.matrix(y_test).T)))
    datasets = [X_y_train, X_y_test]

    # @TMP setting these to small values 
    n_filters  = 100 # number of feature maps per height
    batch_size = 25 
    n_epochs   = 10 
    perf = conv_net_sentence.train_conv_net(datasets,
                      W,
                      img_w = W.shape[1],
                      lr_decay=0.95,
                      filter_hs=filter_heights,
                      conv_non_linear="relu",
                      hidden_units=[n_filters,2], 
                      shuffle_batch=True, 
                      n_epochs=n_epochs, 
                      sqr_norm_lim=9,
                      non_static=True,
                      batch_size=batch_size, 
                      dropout_rate=[0.25])
    
    return perf
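A note on the padding arithmetic used in both RoB_CNN_theano variants: pad_len = maxlen + 2 * (max(filter_heights) - 1) mirrors Kim's preprocessing, where a filter of height h needs h - 1 extra positions at each end of a document to be applied at the boundaries; with maxlen=4000 and filter heights [9, 10, 11] this gives 4000 + 2 * 10 = 4020 columns. Assuming pad_sequences is Keras's keras.preprocessing.sequence.pad_sequences (the keyword arguments used above match its signature), a toy run with small numbers:

import numpy as np
from keras.preprocessing.sequence import pad_sequences  # assumed source of pad_sequences

filter_heights = [9, 10, 11]
maxlen = 6                                   # toy value standing in for 4000
pad_len = maxlen + 2 * (max(filter_heights) - 1)
print(pad_len)                               # 6 + 2 * 10 = 26

seqs = [[4, 8, 15], [16, 23, 42, 7, 1]]      # made-up word indices
padded = pad_sequences(seqs, maxlen=pad_len, padding="post", dtype=np.int32)
print(padded.shape)                          # (2, 26); zeros fill the tail of each row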