def __init__(self, batch_size, vocab_size, left_context, right_context,
             emb_size, k, unigram, l1_weight=0, l2_weight=0, nce_seed=2345):
    self.name = 'vLBL'
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.left_context = left_context
    self.right_context = right_context
    self.context_size = self.left_context + self.right_context
    self.emb_size = emb_size
    self.k = k
    self.unigram = unigram
    self.p_n = debug_print(theano.shared(value=unigram, name='noise_probab'),
                           'noise')
    self.l1_weight = l1_weight
    self.l2_weight = l2_weight
    self.nce_seed = nce_seed

    # Create context (R) and target (Q) embeddings.
    rand_values = random_value_normal((self.vocab_size, self.emb_size),
                                      floatX, np.random.RandomState(1234))
    self.R = theano.shared(value=rand_values, name='R')
    rand_values = random_value_normal((self.vocab_size, self.emb_size),
                                      floatX, np.random.RandomState(4321))
    self.Q = theano.shared(value=rand_values, name='Q')
    b_values = zero_value((self.vocab_size,), dtype=floatX)
    self.bias = theano.shared(value=b_values, name='bias')

    # The learning rates are created the first time set_learning_rate is
    # called.
    self.lr = None
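# Usage sketch (illustrative; not part of the original source). The vLBL
# constructor above stores k and a unigram noise distribution because the
# model is trained with noise-contrastive estimation (Mnih & Teh, 2012):
# each true target word is contrasted against k samples drawn from p_n.
# All values below are hypothetical.
import numpy as np

vocab_size = 10000
counts = np.random.randint(1, 100, size=vocab_size).astype('float32')
unigram = counts / counts.sum()  # empirical unigram distribution P_n
model = vLBL(batch_size=100, vocab_size=vocab_size, left_context=2,
             right_context=2, emb_size=100, k=5, unigram=unigram,
             l2_weight=1e-5)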
def __init__(self, b_values=None, **kwargs):
    super(BiasedHiddenLayer, self).__init__(**kwargs)
    if b_values is None:
        # Infer the output dimensionality from the weight matrix.
        output_dim = self.weights.eval().shape[1]
        b_values = zero_value((output_dim,), dtype=floatX)
    self.bias = theano.shared(value=b_values, name='bias_' + self.name)
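# The snippets in this file call zero_value and random_value_normal without
# defining them. A minimal sketch of what they plausibly look like, assuming
# numpy-backed initializers (the 0.01 scale is a guess, not taken from the
# original code):
import numpy as np

def zero_value(shape, dtype):
    # All-zero initializer, used for bias vectors.
    return np.zeros(shape, dtype=dtype)

def random_value_normal(shape, dtype, rng):
    # Gaussian initializer for embedding matrices; rng is a
    # numpy.random.RandomState, so initialization is reproducible.
    return np.asarray(rng.normal(loc=0.0, scale=0.01, size=shape),
                      dtype=dtype)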
def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14],
             batch_size=10, useAllSamples=0, ktop=4, filter_size=[7, 5],
             L2_weight=0.00005, dropout_p=0.8, useEmb=0, task=2, corpus=1,
             dataMode=3, maxSentLength=60, sentEm_length=48, window=3, k=5,
             nce_seeds=2345, only_left_context=False,
             vali_cost_list_length=20, embedding_size=48, train_scheme=1):
    self.ini_learning_rate = learning_rate
    self.n_epochs = n_epochs
    self.nkerns = nkerns
    self.batch_size = batch_size
    self.useAllSamples = useAllSamples
    self.ktop = ktop
    self.filter_size = filter_size
    self.L2_weight = L2_weight
    self.dropout_p = dropout_p
    self.useEmb = useEmb
    self.task = task
    self.corpus = corpus
    self.dataMode = dataMode
    self.maxSentLength = maxSentLength
    self.kmax = self.maxSentLength / 2 + 5
    self.sentEm_length = sentEm_length
    self.window = window
    self.k = k
    self.only_left_context = only_left_context
    if self.only_left_context:
        self.context_size = self.window
    else:
        self.context_size = 2 * self.window
    self.nce_seed = nce_seeds
    self.train_scheme = train_scheme

    root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    wiki_path = "/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    datasets, unigram, train_lengths, dev_lengths, word_count = load_model_for_training(
        wiki_path,
        root + str(self.task) + 'classes/' + str(self.corpus) + 'train.txt',
        root + str(self.task) + 'classes/' + str(self.corpus) + 'dev.txt',
        self.maxSentLength, self.dataMode, self.train_scheme)
    self.datasets = datasets
    self.embedding_size = embedding_size
    self.vocab_size = word_count

    # Context (R) and target (Q) embeddings; row 0 is zeroed out.
    rand_values = random_value_normal(
        (self.vocab_size + 1, self.embedding_size),
        theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0] = numpy.zeros(self.embedding_size)
    self.embeddings_R = theano.shared(value=rand_values)
    rand_values = random_value_normal(
        (self.vocab_size + 1, self.embedding_size),
        theano.config.floatX, numpy.random.RandomState(4321))
    rand_values[0] = numpy.zeros(self.embedding_size)
    self.embeddings_Q = theano.shared(value=rand_values)

    self.unigram = unigram
    self.p_n = theano.shared(value=self.unigram)
    self.train_lengths = train_lengths
    self.dev_lengths = dev_lengths
    b_values = zero_value((len(unigram),), dtype=theano.config.floatX)
    self.bias = theano.shared(value=b_values, name='bias')
    self.vali_cost_list_length = vali_cost_list_length
def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14],
             batch_size=20, useAllSamples=0, kmax=30, ktop=4,
             filter_size=[7, 5], L2_weight=0.00005, dropout_p=0.8, useEmb=0,
             task=2, corpus=1, dataMode=3, maxSentLength=60, sentEm_length=48,
             window=3, k=5, nce_seeds=2345, only_left_context=False,
             vali_cost_list_length=20):
    self.ini_learning_rate = learning_rate
    self.n_epochs = n_epochs
    self.nkerns = nkerns
    self.batch_size = batch_size
    self.useAllSamples = useAllSamples
    self.kmax = kmax
    self.ktop = ktop
    self.filter_size = filter_size
    self.L2_weight = L2_weight
    self.dropout_p = dropout_p
    self.useEmb = useEmb
    self.task = task
    self.corpus = corpus
    self.dataMode = dataMode
    self.maxSentLength = maxSentLength
    self.sentEm_length = sentEm_length
    self.window = window
    self.k = k
    self.only_left_context = only_left_context
    if self.only_left_context:
        self.context_size = self.window
    else:
        self.context_size = 2 * self.window
    self.nce_seed = nce_seeds

    root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    datasets, embedding_size, embeddings_R, embeddings_Q, unigram, \
        train_lengths, dev_lengths, test_lengths = read_data_WP(
            root + str(self.task) + 'classes/' + str(self.corpus) + 'train.txt',
            root + str(self.task) + 'classes/' + str(self.corpus) + 'dev.txt',
            root + str(self.task) + 'classes/' + str(self.corpus) + 'test.txt',
            embeddingPath, self.maxSentLength, self.useEmb, self.dataMode)
    self.datasets = datasets
    self.embedding_size = embedding_size
    self.embeddings_R = embeddings_R
    self.embeddings_Q = embeddings_Q
    self.unigram = unigram
    self.p_n = theano.shared(value=self.unigram)
    self.train_lengths = train_lengths
    self.dev_lengths = dev_lengths
    self.test_lengths = test_lengths
    b_values = zero_value((len(unigram),), dtype=theano.config.floatX)
    self.bias = theano.shared(value=b_values, name='bias')
    self.vali_cost_list_length = vali_cost_list_length
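# Every constructor in this file stores k, p_n, and a per-word bias because
# training uses noise-contrastive estimation: the model learns to separate
# the true target word from k noise words drawn from p_n. A minimal numpy
# sketch of the standard NCE loss (Mnih & Teh, 2012); this is the textbook
# formulation, not code copied from this repository:
import numpy as np

def nce_loss(s_target, s_noise, pn_target, pn_noise, k):
    # s_target: model score of the true word; s_noise: scores of the k
    # noise words; pn_*: their probabilities under the noise distribution.
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    pos = np.log(sigmoid(s_target - np.log(k * pn_target)))
    neg = np.sum(np.log(1.0 - sigmoid(s_noise - np.log(k * pn_noise))))
    return -(pos + neg)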
def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14],
             batch_size=10, useAllSamples=0, ktop=4, filter_size=[7, 5],
             L2_weight=0.00005, dropout_p=0.8, useEmb=0, task=2, corpus=1,
             dataMode=3, maxSentLength=600, sentEm_length=48, window=3, k=5,
             nce_seeds=2345, only_left_context=False,
             vali_cost_list_length=20, context_embedding_size=48,
             train_scheme=1, max_size=10):
    # Encode the key hyperparameters in the output file name so that runs
    # with different settings do not overwrite each other.
    self.write_file_name_suffix = '_nk' + str(nkerns[0]) + '&' + str(nkerns[1]) \
        + '_bs' + str(batch_size) \
        + '_fs' + str(filter_size[0]) + '&' + str(filter_size[1]) \
        + '_maxSL' + str(maxSentLength) + '_window' + str(window) \
        + '_noise' + str(k) + '_wait' + str(vali_cost_list_length) \
        + '_conEm' + str(context_embedding_size) + '_maxS' + str(max_size)
    self.ini_learning_rate = learning_rate
    self.n_epochs = n_epochs
    self.nkerns = nkerns
    self.batch_size = batch_size
    self.useAllSamples = useAllSamples
    self.ktop = ktop
    self.filter_size = filter_size
    self.L2_weight = L2_weight
    self.dropout_p = dropout_p
    self.useEmb = useEmb
    self.task = task
    self.corpus = corpus
    self.dataMode = dataMode
    self.maxSentLength = maxSentLength
    self.kmax = self.maxSentLength / 2 + 5
    self.sentEm_length = sentEm_length
    self.window = window
    self.k = k
    self.only_left_context = only_left_context
    if self.only_left_context:
        self.context_size = self.window
    else:
        self.context_size = 2 * self.window
    self.nce_seed = nce_seeds
    self.context_embedding_size = context_embedding_size
    self.train_scheme = train_scheme
    self.max_size = max_size

    datasets, unigram, train_lengths, target_lengths, trigram_count, \
        context_matrix, target_matrix, target_id2word, id2trigram = \
        yinwikireformat3(self.maxSentLength, self.window, self.max_size)
    self.datasets = datasets
    self.context_matrix = context_matrix
    self.target_matrix = target_matrix
    self.trigram_size = trigram_count
    self.target_id2word = target_id2word
    self.id2trigram = id2trigram

    # Target embedding matrix Q, loaded from pre-trained GloVe vectors;
    # new_load_glove is expected to fill in self.target_embedding_size.
    # Alternative: self.embeddings_Q = self.load_glove(target_id2word)
    self.target_embedding_size = 0
    embed_R, embed_Q = self.new_load_glove(target_id2word)
    self.embeddings_Q = theano.shared(value=embed_Q)
    print 'target_embedding_size: ' + str(self.target_embedding_size)

    # Context embedding matrix R; row 0 is zeroed out.
    embed_R[0] = numpy.zeros(self.context_embedding_size)
    self.embeddings_R = theano.shared(value=embed_R)

    self.unigram = unigram  # still a np.array()
    self.p_n = theano.shared(value=self.unigram)
    self.train_lengths = train_lengths
    self.target_lengths = target_lengths
    b_values = zero_value((len(unigram),), dtype=theano.config.floatX)
    self.bias = theano.shared(value=b_values, name='bias')
    self.vali_cost_list_length = vali_cost_list_length
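# The nkerns / filter_size / ktop / kmax hyperparameters above are
# consistent with a convolutional sentence model that uses (dynamic) k-max
# pooling: keep the k largest activations of each feature map in their
# original order. This is an inference from the parameter names, sketched
# here for clarity:
import numpy as np

def k_max_pooling(feature_map, k):
    # feature_map: 1-D activations over sentence positions.
    if len(feature_map) <= k:
        return feature_map
    idx = np.sort(np.argsort(feature_map)[-k:])  # top-k positions, in order
    return feature_map[idx]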
def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14],
             batch_size=10, useAllSamples=True, ktop=4, filter_size=[7, 5],
             L2_weight=0.00005, useEmb=0, maxSentLength=60, sentEm_length=48,
             window=3, k=5, nce_seeds=2345, only_left_context=False,
             wait_iter=20, embedding_size=48, newd=[100, 100],
             train_file_style=1, from_scratch=False, stop=1e-2):
    self.write_file_name_suffix = '_lr' + str(learning_rate) \
        + '_nk' + str(nkerns[0]) + '&' + str(nkerns[1]) \
        + '_bs' + str(batch_size) \
        + '_fs' + str(filter_size[0]) + '&' + str(filter_size[1]) \
        + '_maxSL' + str(maxSentLength) + '_win' + str(window) \
        + '_noi' + str(k) + '_wait' + str(wait_iter) \
        + '_wdEm' + str(embedding_size) + '_stEm' + str(sentEm_length) \
        + '_ts' + str(from_scratch) + '_newd' + str(newd[0]) + '&' + str(newd[1]) \
        + '_trFi' + str(train_file_style) + 'stop' + str(stop)
    model_options = locals().copy()
    print "model options", model_options

    self.ini_learning_rate = learning_rate
    self.n_epochs = n_epochs
    self.nkerns = nkerns
    self.batch_size = batch_size
    self.useAllSamples = useAllSamples
    self.ktop = ktop
    self.filter_size = filter_size
    self.L2_weight = L2_weight
    self.useEmb = useEmb
    self.maxSentLength = maxSentLength
    self.kmax = self.maxSentLength / 2 + 5
    self.sentEm_length = sentEm_length
    self.window = window
    self.k = k
    self.only_left_context = only_left_context
    if self.only_left_context:
        self.context_size = self.window
    else:
        self.context_size = 2 * self.window
    self.nce_seed = nce_seeds
    self.train_file_style = train_file_style

    # train_file_style codes: 0 (wiki), 11 (senti_train), 12 (senti_dev),
    # 13 (senti_test).
    senti_trainfile = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2train.txt"
    senti_devfile = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2dev.txt"
    senti_testfile = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2test.txt"
    wiki_path = "/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    root = '/mounts/data/proj/wenpeng/Thang/'
    if self.train_file_style != 0:
        datasets, unigram, train_lengths, word_count, self.id2word = load_training_file(
            senti_trainfile, self.maxSentLength, self.train_file_style)
        dev_lengths = None  # no separate dev split on this path
    else:
        datasets, unigram, train_lengths, dev_lengths, word_count, self.id2word = load_data_for_training(
            root + 'train.txt', root + 'dev_dev93.txt', self.maxSentLength)
    self.datasets = datasets
    self.embedding_size = embedding_size
    self.vocab_size = word_count

    # Context (R) and target (Q) embeddings; row 0 is zeroed out. When not
    # training from scratch, pre-trained embeddings overwrite the random
    # initialization before the shared variables are created.
    self.rand_values_R = random_value_normal(
        (self.vocab_size + 1, self.embedding_size),
        theano.config.floatX, numpy.random.RandomState(1234))
    self.rand_values_R[0] = numpy.zeros(self.embedding_size)
    self.rand_values_Q = random_value_normal(
        (self.vocab_size + 1, self.embedding_size),
        theano.config.floatX, numpy.random.RandomState(4321))
    self.rand_values_Q[0] = numpy.zeros(self.embedding_size)
    self.from_scratch = from_scratch
    if not self.from_scratch:
        self.load_pretrained_embeddings()
    self.embeddings_R = theano.shared(value=self.rand_values_R)
    self.embeddings_Q = theano.shared(value=self.rand_values_Q)

    self.unigram = unigram
    # Use the average unigram probability for new words in the dev set.
    self.extend_unigram = numpy.append(unigram, [sum(unigram) / len(unigram)])
    self.p_n = theano.shared(value=self.extend_unigram)
    self.train_lengths = train_lengths
    self.vali_lengths = dev_lengths
    # The last bias entry is for new words in the dev data.
    b_values = zero_value((len(unigram) + 1,), dtype=theano.config.floatX)
    self.bias = theano.shared(value=b_values, name='bias')
    self.wait_iter = wait_iter
    self.newd = newd
    self.stop = stop
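# What the extend_unigram / bias construction above achieves, on a toy
# distribution: one extra slot holding the mean unigram probability stands
# in for any dev-set word outside the training vocabulary (note the result
# is no longer normalized):
import numpy as np

unigram = np.array([0.5, 0.3, 0.2])
extend_unigram = np.append(unigram, [unigram.sum() / len(unigram)])
# extend_unigram -> [0.5, 0.3, 0.2, 0.3333...]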