Example #1
    def __init__(self, batch_size, vocab_size, left_context, right_context,
            emb_size, k, unigram, l1_weight=0, l2_weight=0, nce_seed=2345):
        self.name = 'vLBL'
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.left_context = left_context
        self.right_context = right_context
        self.context_size = self.left_context + self.right_context
        self.emb_size = emb_size
        self.k = k
        self.unigram = unigram
        self.p_n = debug_print(theano.shared(value=unigram, name='noise_probab'),
                'noise')

        self.l1_weight = l1_weight
        self.l2_weight = l2_weight
        self.nce_seed = nce_seed

        # Create context and target embeddings
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                floatX, np.random.RandomState(1234))
        self.R = theano.shared(value=rand_values, name='R')
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                floatX, np.random.RandomState(4321))
        self.Q = theano.shared(value=rand_values, name='Q')
        b_values = zero_value((self.vocab_size,), dtype=floatX)
        self.bias = theano.shared(value=b_values, name='bias')

        # The learning rates are created the first time set_learning_rate is
        # called.
        self.lr = None
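
For context, a minimal instantiation sketch (assuming the enclosing class is named vLBL, as `self.name` suggests, and that `unigram` is a normalized numpy vector over the vocabulary used as the NCE noise distribution; the counts below are made up):

    import numpy as np

    vocab_size = 10000
    counts = np.random.randint(1, 100, size=vocab_size).astype('float64')  # hypothetical corpus counts
    unigram = counts / counts.sum()        # normalized noise distribution for NCE

    model = vLBL(batch_size=100, vocab_size=vocab_size,
                 left_context=2, right_context=2,
                 emb_size=100, k=5, unigram=unigram)
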
Example #2
    def __init__(self, b_values=None, **kwargs):
        super(BiasedHiddenLayer, self).__init__(**kwargs)

        if b_values is None:
            output_dim = self.weights.eval().shape[1]
            b_values = zero_value((output_dim, ), dtype=floatX)

        self.bias = theano.shared(value=b_values, name='bias_' + self.name)
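
The `if b_values is None` branch sizes the bias from the base layer's shared weight matrix; a standalone sketch of that pattern (the weight shape and names here are hypothetical, not the repository's):

    import numpy as np
    import theano

    weights = theano.shared(np.zeros((50, 20), dtype=theano.config.floatX), name='W')
    output_dim = weights.eval().shape[1]                    # 20 output units -> 20 biases
    b_values = np.zeros((output_dim,), dtype=theano.config.floatX)
    bias = theano.shared(value=b_values, name='bias_demo')  # one bias per output unit
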
Example #3
    def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=10, useAllSamples=0, ktop=4, filter_size=[7,5],
                    L2_weight=0.00005, dropout_p=0.8, useEmb=0, task=2, corpus=1, dataMode=3, maxSentLength=60, sentEm_length=48, window=3, 
                    k=5, nce_seeds=2345, only_left_context=False, vali_cost_list_length=20, embedding_size=48, train_scheme=1):
        self.ini_learning_rate=learning_rate
        self.n_epochs=n_epochs
        self.nkerns=nkerns
        self.batch_size=batch_size
        self.useAllSamples=useAllSamples
        
        self.ktop=ktop
        self.filter_size=filter_size
        self.L2_weight=L2_weight
        self.dropout_p=dropout_p
        self.useEmb=useEmb
        self.task=task
        self.corpus=corpus
        self.dataMode=dataMode
        self.maxSentLength=maxSentLength
        self.kmax=self.maxSentLength/2+5
        self.sentEm_length=sentEm_length
        self.window=window
        self.k=k
        self.only_left_context=only_left_context
        if self.only_left_context:
            self.context_size=self.window
        else:
            self.context_size=2*self.window
        self.nce_seed=nce_seeds
        self.embedding_size=0
        self.train_scheme=train_scheme
        
        root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
        wiki_path="/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized"
        embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
        embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
        datasets, unigram, train_lengths, dev_lengths, word_count=load_model_for_training(wiki_path, root+str(self.task)+'classes/'+str(self.corpus)+'train.txt', root+str(self.task)+'classes/'+str(self.corpus)+'dev.txt',self.maxSentLength, self.dataMode, self.train_scheme)

        self.datasets=datasets
        self.embedding_size=embedding_size
        self.vocab_size=word_count
        rand_values=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(1234))
        rand_values[0]=numpy.array(numpy.zeros(self.embedding_size))
        self.embeddings_R=theano.shared(value=rand_values)                                                    
        rand_values=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(4321))
        rand_values[0]=numpy.array(numpy.zeros(self.embedding_size))
        self.embeddings_Q=theano.shared(value=rand_values)   
        self.unigram=unigram
        self.p_n=theano.shared(value=self.unigram)
        self.train_lengths=train_lengths
        self.dev_lengths=dev_lengths
        b_values = zero_value((len(unigram),), dtype=theano.config.floatX)
        self.bias = theano.shared(value=b_values, name='bias')
        self.vali_cost_list_length=vali_cost_list_length
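
Both embedding tables above are allocated with vocab_size+1 rows and row 0 is zeroed, which suggests index 0 serves as a padding token whose embedding contributes nothing to a context sum; a minimal numpy/Theano illustration of that convention:

    import numpy
    import theano

    vocab_size, embedding_size = 5, 4
    rand_values = numpy.random.RandomState(1234).randn(vocab_size + 1, embedding_size) \
        .astype(theano.config.floatX)
    rand_values[0] = numpy.zeros(embedding_size)   # row 0 reserved for padding, stays zero
    embeddings = theano.shared(value=rand_values)
    # embeddings.get_value()[0] is the all-zero padding vector
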
Example #4
    def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=20, useAllSamples=0, kmax=30, ktop=4, filter_size=[7,5],
                    L2_weight=0.00005, dropout_p=0.8, useEmb=0, task=2, corpus=1, dataMode=3, maxSentLength=60, sentEm_length=48, window=3,
                    k=5, nce_seeds=2345, only_left_context=False, vali_cost_list_length=20):
        self.ini_learning_rate=learning_rate
        self.n_epochs=n_epochs
        self.nkerns=nkerns
        self.batch_size=batch_size
        self.useAllSamples=useAllSamples
        self.kmax=kmax
        self.ktop=ktop
        self.filter_size=filter_size
        self.L2_weight=L2_weight
        self.dropout_p=dropout_p
        self.useEmb=useEmb
        self.task=task
        self.corpus=corpus
        self.dataMode=dataMode
        self.maxSentLength=maxSentLength
        self.sentEm_length=sentEm_length
        self.window=window
        self.k=k
        self.only_left_context=only_left_context
        if self.only_left_context:
            self.context_size=self.window
        else:
            self.context_size=2*self.window
        self.nce_seed=nce_seeds
        self.embedding_size=0

        root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
        embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
        embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
        datasets, embedding_size, embeddings_R, embeddings_Q, unigram, train_lengths, dev_lengths, test_lengths=read_data_WP(root+str(self.task)+'classes/'+str(self.corpus)+'train.txt', root+str(self.task)+'classes/'+str(self.corpus)+'dev.txt', root+str(self.task)+'classes/'+str(self.corpus)+'test.txt', embeddingPath,self.maxSentLength, self.useEmb, self.dataMode)
        self.datasets=datasets
        self.embedding_size=embedding_size
        self.embeddings_R=embeddings_R
        self.embeddings_Q=embeddings_Q
        self.unigram=unigram
        self.p_n=theano.shared(value=self.unigram)
        self.train_lengths=train_lengths
        self.dev_lengths=dev_lengths
        self.test_lengths=test_lengths
        b_values = zero_value((len(unigram),), dtype=theano.config.floatX)
        self.bias = theano.shared(value=b_values, name='bias')
        self.vali_cost_list_length=vali_cost_list_length
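
`p_n` is the shared unigram noise distribution that noise-contrastive estimation compares the model against; with k noise samples per data point, NCE classifies each word via p_model / (p_model + k * p_n). A hedged numpy sketch of that quantity (the scores are made up; this is not the repository's NCE code):

    import numpy as np

    k = 5
    p_n = np.array([0.4, 0.3, 0.2, 0.1])        # hypothetical unigram noise probabilities
    scores = np.array([1.2, 0.1, -0.5, 0.3])    # hypothetical unnormalized model scores
    p_model = np.exp(scores)                    # treated as unnormalized probabilities in NCE
    p_data = p_model / (p_model + k * p_n)      # probability each word came from data, not noise
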
Example #5
    def __init__(self,
                 batch_size,
                 vocab_size,
                 left_context,
                 right_context,
                 emb_size,
                 k,
                 unigram,
                 l1_weight=0,
                 l2_weight=0,
                 nce_seed=2345):
        self.name = 'vLBL'
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.left_context = left_context
        self.right_context = right_context
        self.context_size = self.left_context + self.right_context
        self.emb_size = emb_size
        self.k = k
        self.unigram = unigram
        self.p_n = debug_print(
            theano.shared(value=unigram, name='noise_probab'), 'noise')

        self.l1_weight = l1_weight
        self.l2_weight = l2_weight
        self.nce_seed = nce_seed

        # Create context and target embeddings
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                                          floatX, np.random.RandomState(1234))
        self.R = theano.shared(value=rand_values, name='R')
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                                          floatX, np.random.RandomState(4321))
        self.Q = theano.shared(value=rand_values, name='Q')
        b_values = zero_value((self.vocab_size, ), dtype=floatX)
        self.bias = theano.shared(value=b_values, name='bias')

        # The learning rates are created the first time set_learning_rate is
        # called.
        self.lr = None
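
The closing comment says learning rates are created lazily on the first call to set_learning_rate; a sketch of that lazy-initialization pattern, under the assumption that the rate is stored as a Theano shared scalar (the class and method shown are illustrative, not the repository's actual implementation):

    import numpy as np
    import theano

    class LazyLR(object):
        def __init__(self):
            self.lr = None                   # allocated on first use

        def set_learning_rate(self, value):
            value = np.asarray(value, dtype=theano.config.floatX)
            if self.lr is None:              # first call: create the shared scalar
                self.lr = theano.shared(value, name='lr')
            else:                            # later calls: update it in place
                self.lr.set_value(value)
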
Example #6
    def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=10, useAllSamples=0, ktop=4, filter_size=[7,5],
                    L2_weight=0.00005, dropout_p=0.8, useEmb=0, task=2, corpus=1, dataMode=3, maxSentLength=600, sentEm_length=48, window=3, 
                    k=5, nce_seeds=2345, only_left_context=False, vali_cost_list_length=20, context_embedding_size=48, train_scheme=1, max_size=10):
        self.write_file_name_suffix='_nk'+str(nkerns[0])+'&'+str(nkerns[1])+'_bs'+str(batch_size)+'_fs'+str(filter_size[0])+'&'+str(filter_size[1])\
        +'_maxSL'+str(maxSentLength)+'_window'+str(window)+'_noise'+str(k)+'_wait'+str(vali_cost_list_length)+'_conEm'+str(context_embedding_size)\
        +'_maxS'+str(max_size)
        #print self.write_file_name_suffix
        #exit(0)
        
        self.ini_learning_rate=learning_rate
        self.n_epochs=n_epochs
        self.nkerns=nkerns
        self.batch_size=batch_size
        self.useAllSamples=useAllSamples
        
        self.ktop=ktop
        self.filter_size=filter_size
        self.L2_weight=L2_weight
        self.dropout_p=dropout_p
        self.useEmb=useEmb
        self.task=task
        self.corpus=corpus
        self.dataMode=dataMode
        self.maxSentLength=maxSentLength
        self.kmax=self.maxSentLength/2+5
        self.sentEm_length=sentEm_length
        self.window=window
        self.k=k
        self.only_left_context=only_left_context
        if self.only_left_context:
            self.context_size=self.window
        else:
            self.context_size=2*self.window
        self.nce_seed=nce_seeds
        self.context_embedding_size=context_embedding_size
        self.train_scheme=train_scheme
        '''
        root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
        wiki_path="/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized"
        embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
        embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
        '''
        self.max_size=max_size
        datasets, unigram, train_lengths, target_lengths, trigram_count, context_matrix, target_matrix, target_id2word, id2trigram=yinwikireformat3(self.maxSentLength, self.window, self.max_size)
        #exit(0)

        self.datasets=datasets
        self.context_matrix=context_matrix
        self.target_matrix=target_matrix
        self.trigram_size=trigram_count
        #print 'trigram_size is: '+str(trigram_count)
        self.target_id2word=target_id2word
        self.id2trigram=id2trigram
        '''
        self.target_embedding_size=200                                  
        rand_values=random_value_normal((len(target_id2word), self.target_embedding_size), theano.config.floatX, numpy.random.RandomState(4321))
        #rand_values[0]=numpy.array(numpy.zeros(self.embedding_size))
        self.embeddings_Q=theano.shared(value=rand_values)  
        ''' 
        #self.embeddings_Q=self.load_glove(target_id2word) # target embedding matrix
        self.target_embedding_size=0
        embed_R, embed_Q=self.new_load_glove(target_id2word)
        self.embeddings_Q=theano.shared(value=embed_Q)
        print 'target_embedding_size: '+str(self.target_embedding_size)
        
        #print 'self.embeddings_Q:'
        #print self.embeddings_Q.get_value()
        #rand_values=random_value_normal((self.trigram_size+1, self.context_embedding_size), theano.config.floatX, numpy.random.RandomState(1234))
        embed_R[0]=numpy.array(numpy.zeros(self.context_embedding_size))
        self.embeddings_R=theano.shared(value=embed_R)  
        #print 'self.embeddings_R:'
        #print self.embeddings_R.get_value()

        self.unigram=unigram   # is still a np.array()
        #print 'unigram:'
        #print self.unigram
        self.p_n=theano.shared(value=self.unigram)
        self.train_lengths=train_lengths
        '''
        print 'train_lengths:'
        print train_lengths
        '''
        self.target_lengths=target_lengths
        '''
        print 'target_lengths:'
        print self.target_lengths
        '''
        b_values = zero_value((len(unigram),), dtype=theano.config.floatX)
        self.bias = theano.shared(value=b_values, name='bias')
        self.vali_cost_list_length=vali_cost_list_length
Example #7
    def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=10, useAllSamples=True, ktop=4, filter_size=[7,5],
                    L2_weight=0.00005, useEmb=0, maxSentLength=60, sentEm_length=48, window=3, 
                    k=5, nce_seeds=2345, only_left_context=False, wait_iter=20, embedding_size=48, newd=[100, 100], train_file_style=1, from_scratch=False, stop=1e-2):
        self.write_file_name_suffix='_lr'+str(learning_rate)+'_nk'+str(nkerns[0])+'&'+str(nkerns[1])+'_bs'+str(batch_size)+'_fs'+str(filter_size[0])+'&'+str(filter_size[1])\
        +'_maxSL'+str(maxSentLength)+'_win'+str(window)+'_noi'+str(k)+'_wait'+str(wait_iter)+'_wdEm'+str(embedding_size)\
        +'_stEm'+str(sentEm_length)+'_ts'+str(from_scratch)+'_newd'+str(newd[0])+'&'+str(newd[1])+'_trFi'+str(train_file_style)+'stop'+str(stop)
        model_options = locals().copy()
        print "model options", model_options
        self.ini_learning_rate=learning_rate
        self.n_epochs=n_epochs
        self.nkerns=nkerns
        self.batch_size=batch_size
        self.useAllSamples=useAllSamples
        
        self.ktop=ktop
        self.filter_size=filter_size
        self.L2_weight=L2_weight
        self.useEmb=useEmb
        self.maxSentLength=maxSentLength
        self.kmax=self.maxSentLength/2+5
        self.sentEm_length=sentEm_length
        self.window=window
        self.k=k
        self.only_left_context=only_left_context
        if self.only_left_context:
            self.context_size=self.window
        else:
            self.context_size=2*self.window
        self.nce_seed=nce_seeds
        self.embedding_size=0
        self.train_file_style=train_file_style
        #we define "train_file_style" as: 0 (wiki), 11(sent_train), 12 (senti_dev), 13 (senti_test)
        
        senti_trainfile="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2train.txt"
        senti_devfile="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2dev.txt"
        senti_testfile="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2test.txt"
        wiki_path="/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized"
        embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
        embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
        root='/mounts/data/proj/wenpeng/Thang/'
        if self.train_file_style !=0:
            datasets, unigram, train_lengths, word_count, self.id2word=load_training_file(senti_trainfile,self.maxSentLength,self.train_file_style)
        elif self.train_file_style == 0:
            #datasets, unigram, train_lengths, word_count, self.id2word=load_training_file(root+'train.txt',self.maxSentLength,self.train_file_style)
            datasets, unigram, train_lengths, dev_lengths, word_count, self.id2word=load_data_for_training(root+'train.txt', root+'dev_dev93.txt',self.maxSentLength)

        self.datasets=datasets
        self.embedding_size=embedding_size
        self.vocab_size=word_count
        self.rand_values_R=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(1234))
        self.rand_values_R[0]=numpy.array(numpy.zeros(self.embedding_size))
                                                           
        self.rand_values_Q=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(4321))
        self.rand_values_Q[0]=numpy.array(numpy.zeros(self.embedding_size))
          
        self.from_scratch=from_scratch
        if not self.from_scratch:
            self.load_pretrained_embeddings()
        self.embeddings_R=theano.shared(value=self.rand_values_R) 
        self.embeddings_Q=theano.shared(value=self.rand_values_Q) 
        self.unigram=unigram # we use the average of unigram as probability of new word in dev set
        self.extend_unigram=numpy.append(unigram, [sum(unigram)/len(unigram)])
        #print 'unigram, p_n length:', len(unigram), len(self.extend_unigram)
        self.p_n=theano.shared(value=self.extend_unigram)
        self.train_lengths=train_lengths
        self.vali_lengths=dev_lengths
        b_values = zero_value((len(unigram)+1,), dtype=theano.config.floatX)#the last bias is for new words in dev data
        #print 'bias length:', len(b_values)
        self.bias = theano.shared(value=b_values, name='bias')
        self.wait_iter=wait_iter
        self.newd=newd
        self.stop=stop
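
The extend_unigram step above appends the average unigram probability as one extra slot, so that words unseen in training (out-of-vocabulary dev words) share a single "new word" index, matching the len(unigram)+1 bias vector created after it; a small numpy illustration:

    import numpy

    unigram = numpy.array([0.5, 0.3, 0.2])
    extend_unigram = numpy.append(unigram, [sum(unigram) / len(unigram)])
    # -> [0.5, 0.3, 0.2, 0.3333...]; the final entry is the noise probability
    #    assigned to any word not seen during training.
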
Example #8
    def __init__(self,
                 learning_rate=0.2,
                 n_epochs=2000,
                 nkerns=[6, 14],
                 batch_size=10,
                 useAllSamples=0,
                 ktop=4,
                 filter_size=[7, 5],
                 L2_weight=0.00005,
                 dropout_p=0.8,
                 useEmb=0,
                 task=2,
                 corpus=1,
                 dataMode=3,
                 maxSentLength=60,
                 sentEm_length=48,
                 window=3,
                 k=5,
                 nce_seeds=2345,
                 only_left_context=False,
                 vali_cost_list_length=20,
                 embedding_size=48,
                 train_scheme=1):
        self.ini_learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.nkerns = nkerns
        self.batch_size = batch_size
        self.useAllSamples = useAllSamples

        self.ktop = ktop
        self.filter_size = filter_size
        self.L2_weight = L2_weight
        self.dropout_p = dropout_p
        self.useEmb = useEmb
        self.task = task
        self.corpus = corpus
        self.dataMode = dataMode
        self.maxSentLength = maxSentLength
        self.kmax = self.maxSentLength / 2 + 5
        self.sentEm_length = sentEm_length
        self.window = window
        self.k = k
        self.only_left_context = only_left_context
        if self.only_left_context:
            self.context_size = self.window
        else:
            self.context_size = 2 * self.window
        self.nce_seed = nce_seeds
        self.embedding_size = 0
        self.train_scheme = train_scheme

        root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
        wiki_path = "/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized"
        embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
        embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
        datasets, unigram, train_lengths, dev_lengths, word_count = load_model_for_training(
            wiki_path, root + str(self.task) + 'classes/' + str(self.corpus) +
            'train.txt',
            root + str(self.task) + 'classes/' + str(self.corpus) + 'dev.txt',
            self.maxSentLength, self.dataMode, self.train_scheme)

        self.datasets = datasets
        self.embedding_size = embedding_size
        self.vocab_size = word_count
        rand_values = random_value_normal(
            (self.vocab_size + 1, self.embedding_size), theano.config.floatX,
            numpy.random.RandomState(1234))
        rand_values[0] = numpy.array(numpy.zeros(self.embedding_size))
        self.embeddings_R = theano.shared(value=rand_values)
        rand_values = random_value_normal(
            (self.vocab_size + 1, self.embedding_size), theano.config.floatX,
            numpy.random.RandomState(4321))
        rand_values[0] = numpy.array(numpy.zeros(self.embedding_size))
        self.embeddings_Q = theano.shared(value=rand_values)
        self.unigram = unigram
        self.p_n = theano.shared(value=self.unigram)
        self.train_lengths = train_lengths
        self.dev_lengths = dev_lengths
        b_values = zero_value((len(unigram), ), dtype=theano.config.floatX)
        self.bias = theano.shared(value=b_values, name='bias')
        self.vali_cost_list_length = vali_cost_list_length