def shared_dataset(data_xy, borrow=True):
    """Load a dataset into Theano shared variables.

    Storing the dataset in shared variables lets Theano copy it into GPU
    memory once (when the code runs on a GPU).  Copying a minibatch every
    time it is needed (the default behaviour when the data is not in a
    shared variable) is slow and would noticeably degrade performance.

    Parameters
    ----------
    data_xy : tuple
        ``(data_x, data_y)``.  ``data_x`` is handed to ``sparse.shared``
        after ``astype(floatX)``; ``data_y`` holds the integer class labels
        (they are used as column indices below).
    borrow : bool
        Forwarded to every ``shared`` constructor.

    Returns
    -------
    tuple
        ``(shared_x, y, y1)`` where ``y`` is the label vector cast to
        ``int32`` and ``y1`` is the ``{-1, 1}`` one-hot label matrix cast
        to ``int32``.
    """
    data_x, data_y = data_xy
    shared_x = sparse.shared(data_x.astype(theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
                             borrow=borrow)

    # One-hot encoded labels as {-1, 1}.
    labels = np.asarray(data_y)
    n_samples = labels.shape[0]
    # Counting distinct labels alone is only correct when the labels are
    # exactly 0..K-1; a gap (e.g. {0, 2}) made the fancy indexing below
    # raise IndexError.  Make the matrix wide enough for the largest label
    # while keeping the previous width whenever that one was sufficient.
    n_classes = len(np.unique(labels))
    if n_samples:
        n_classes = max(n_classes, int(labels.max()) + 1)
    y1 = -1 * np.ones((n_samples, n_classes))
    y1[np.arange(n_samples), labels] = 1
    shared_y1 = theano.shared(np.asarray(y1, dtype=theano.config.floatX),
                              borrow=borrow)

    # When storing data on the GPU it has to be stored as floats, therefore
    # the labels are stored as ``floatX`` as well (``shared_y`` does exactly
    # that).  During the computations they are needed as ints (labels are
    # used as indices, which makes no sense for floats), so instead of
    # returning ``shared_y`` directly it is cast to int.  This little hack
    # lets us get around the issue.
    return shared_x, T.cast(shared_y, 'int32'), T.cast(shared_y1, 'int32')
def _build_mask(self):
    """Build ``self.big_mask``, a sparse ``v_dim`` x ``v_dim`` 0/1 matrix.

    Walks ``self.v_ranges`` group by group; every entry of a group claims
    the next free row of the matrix and sets the column(s) named by that
    entry to 1.
    NOTE(review): presumably each entry is the column index (or index
    range) of one visible unit's group -- confirm against where
    ``v_ranges`` is built.
    """
    mask = np.zeros((self.v_dim, self.v_dim), dtype=theano.config.floatX)
    row = 0
    for group in self.v_ranges:
        for columns in group:
            mask[row, columns] = 1
            row += 1
    # Dense alternative kept for reference:
    # self.big_mask = theano.shared(big_m, name='big_mask')
    # The mask is stored as a sparse (CSC) shared variable instead.
    self.big_mask = sparse.shared(sp.csc_matrix(mask), name='big_mask')
def run(jobman, debug=False): expstart = time.time() hp = jobman.state if not os.path.exists('files/'): os.mkdir('files/') # Symbolic variables s_bow = T.matrix() s_idx = T.iscalar() s_tf = T.scalar() s_posit = T.matrix() #theano.sparse.csr_matrix() s_negat = T.matrix() #theano.sparse.csr_matrix() sentences = cPickle.load( open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl')) senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl')) gsubset = cPickle.load( open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten( ).tolist() hashtab = dict(zip(gsubset, range(len(gsubset)))) tfidf_data = numpy.load('/scratch/rifaisal/data/guten/guten_tfidf.npy' ).item().tocsr().astype('float32') #tfidf = cPickle.load(open('/scratch/rifaisal/repos/senna/gutentokenizer.pkl')) senna = numpy.array(senna)[gsubset].tolist() s_valid = theano.sparse.csr_matrix() validsentence = sentences[10000:10010] nsent = len(sentences) nsenna = len(senna) # Layers embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity) H = ae(i_size=hp['embedsize'] * hp['wsize'], h_size=hp['hsize'], e_act=T.tanh) L = logistic(i_size=hp['hsize'], h_size=1, act=identity) S = logistic(i_size=hp['embedsize'], h_size=nsenna, act=T.nnet.softmax) valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act=identity) valid_embedding.params['weights'] = sp.shared( value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value( borrow=True))) valid_embedding.params['bias'] = embedding.params['e_bias'] lr = hp['lr'] h_size = hp['hsize'] bs = hp['bs'] posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape( (1, hp['embedsize'] * hp['wsize'])) negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape( (hp['nneg'], hp['embedsize'] * hp['wsize'])) valid_embed = sp.dot(s_valid, valid_embedding.params['weights']).reshape( (nsenna, hp['embedsize'] * hp['wsize'])) posit_score = L.encode(H.encode(posit_embed)) negat_score = 
L.encode(H.encode(negat_embed)) valid_score = L.encode(H.encode(valid_embed)) C = (negat_score - posit_score.flatten() + hp['margin']) s_bow_pred = S.encode(embedding.encode(s_bow)) pred = s_tf * nllsoft(s_bow_pred, s_idx) CC = (rect(C)).mean() + hp['lambda'] * pred opt = theano.function( [s_posit, s_negat, s_bow, s_idx, s_tf], [(rect(C)).mean(), pred], updates=dict( S.update(CC, lr) + L.update(CC, lr) + H.update(CC, lr) + embedding.update_norm(CC, lr))) #validfct = theano.function([s_valid],valid_score) def saveexp(): save(embedding, fname + 'embedding.pkl') save(H, fname + 'hidden.pkl') save(L, fname + 'logistic.pkl') delta = hp['wsize'] / 2 rest = hp['wsize'] % 2 freq_idx = cPickle.load( open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000] freq_idx = [hashtab[idx] for idx in freq_idx] fname = '' for e in range(hp['epoch']): c = [] r = [] count = 1 for i in range(nsent): rsent = numpy.random.randint(nsent - 1) nword = len(sentences[rsent]) if nword < hp['wsize'] + 2: continue pidx = numpy.random.randint(low=delta, high=nword - delta) pchunk = sentences[rsent][pidx - delta:pidx + delta + rest] nchunk = [] st = sentences[rsent][pidx - delta:pidx] en = sentences[rsent][pidx + 1:pidx + delta + rest] rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], )) nchunk = [] for j in range(hp['nneg']): nchunk += en + [rndidx[j]] + st assert len(nchunk) == len(pchunk) * hp['nneg'] tfidf_chunk = tfidf_data[rsent:rsent + 1].toarray() #pdb.set_trace() tfidf_value = tfidf_chunk[0, sentences[rsent][pidx]] tfidf_chunk[0, sentences[rsent][pidx]] = 0. tfidx = sentences[rsent][ pidx] # numpy.zeros(tfidf_chunk.shape).astype('float32') #tfidx[0,sentences[rsent][pidx]] = 1. 
p, n, b, iidx, tfval = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna), tfidf_chunk, tfidx, tfidf_value) count += tfval != 0 l, g = opt(p, n, b, iidx, tfval) c = c c.append(l) r.append(g) """ if (time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (i+1)%(20*hp['freq']) == 0 and debug==False: valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize']) hp['mrk'] = mrk jobman.save() saveexp() print 'Random Valid Mean rank',mrk """ if (i + 1) % hp['freq'] == 0 or debug: hp['score'] = numpy.array(c).sum() / (numpy.array(c) > 0).sum() hp['pred'] = numpy.array(r).sum() / float(count) hp['e'] = e hp['i'] = i print '' print e, i, 'NN Score:', hp['score'], 'Reconstruction:', hp[ 'pred'] if debug != True: ne = knn( freq_idx, embedding.params['e_weights'].get_value(borrow=True)) open('files/' + fname + 'nearest.txt', 'w').write(display(ne, senna)) saveexp() sys.stdout.flush() jobman.save() saveexp()
def run(jobman,debug = False): expstart = time.time() hp = jobman.state if not os.path.exists('files/'): os.mkdir('files/') # Symbolic variables s_bow = T.matrix() s_posit = T.matrix()#theano.sparse.csr_matrix() s_negat = T.matrix()#theano.sparse.csr_matrix() sentences = cPickle.load(open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl')) senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl')) gsubset = cPickle.load(open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten().tolist() hashtab = dict( zip( gsubset, range( len( gsubset)))) senna = numpy.array(senna)[gsubset].tolist() s_valid = theano.sparse.csr_matrix() validsentence = sentences[-10:] sentences = sentences[:-10] nsent = len(sentences) nsenna = len(senna) # Layers embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = T.nnet.sigmoid) H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = rect, d_act = hardtanh) L = logistic(i_size = hp['hsize'], h_size = 1) valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = T.nnet.sigmoid) valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) valid_embedding.params['bias'] = embedding.params['e_bias'] lr = hp['lr'] h_size = hp['hsize'] bs = hp['bs'] posit_embed = embedding.encode(s_posit).reshape((1,hp['embedsize']*hp['wsize'])) negat_embed = embedding.encode(s_negat).reshape((hp['nneg'],hp['embedsize']*hp['wsize'])) valid_embed = valid_embedding.encode(s_valid).reshape((nsenna,hp['embedsize']*hp['wsize'])) posit_score = L.encode(H.encode(posit_embed)) negat_score = L.encode(H.encode(negat_embed)) valid_score = L.encode(H.encode(valid_embed)) C = (negat_score - posit_score.flatten() + hp['margin']) rec = embedding.reconstruct(s_bow, loss='ce') CC = (rect(C)).mean() + hp['lambda'] * rec opt = theano.function([s_posit, s_negat, s_bow], [C.mean(),rec], updates = dict( L.update(CC,lr) + H.update(CC,lr) + 
embedding.update(CC,lr)) ) validfct = theano.function([s_valid],valid_score) def saveexp(): save(embedding,fname+'embedding.pkl') save(H,fname+'hidden.pkl') save(L,fname+'logistic.pkl') print 'Saved successfully' delta = hp['wsize']/2 rest = hp['wsize']%2 freq_idx = cPickle.load(open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000] freq_idx = [ hashtab[idx] for idx in freq_idx ] fname = '' for e in range(hp['epoch']): c = [] r = [] for i in range(nsent): rsent = numpy.random.randint(nsent-1) nword = len(sentences[rsent]) if nword < hp['wsize'] + 2: continue pidx = numpy.random.randint(low = delta, high = nword-delta) pchunk = sentences[rsent][pidx-delta:pidx+delta+rest] nchunk = [] st = sentences[rsent][pidx-delta:pidx] en = sentences[rsent][pidx+1:pidx+delta+rest] rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],)) nchunk = [] for j in range(hp['nneg']): nchunk += en + [rndidx[j]] + st assert len(nchunk) == len(pchunk)*hp['nneg'] p, n, b = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna), idx2vec(sentences[rsent],nsenna)) l,g = opt(p,n,b) c.append(l) r.append(g) if (time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (i+1)%(50*hp['freq']) == 0: valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize']) hp['mrk'] = mrk hp['e'] = e hp['i'] = i jobman.save() saveexp() print 'Random Valid Mean rank',mrk if i%hp['freq'] == 0: hp['score'] = numpy.array(c).mean() hp['rec'] = numpy.array(r).mean() print e,i,'NN Score:', hp['score'], 'Reconstruction:', hp['rec'] ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True)) open('files/'+fname+'nearest.txt','w').write(display(ne,senna)) saveexp() sys.stdout.flush() jobman.save() save()
def run(jobman, debug=False): expstart = time.time() hp = jobman.state if not os.path.exists('files/'): os.mkdir('files/') # Symbolic variables s_posit = T.matrix() s_negat = T.matrix() idx_start = T.lscalar() idx_stop = T.lscalar() s_valid = theano.sparse.csr_matrix() w2i = cPickle.load( open( '/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl' )) i2w = dict((v, k) for k, v in w2i.iteritems()) i2w[0] = 'UNK' senna = [i2w[i] for i in range(len(i2w.keys()))] nsenna = len(senna) embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity) H = ae(i_size=hp['embedsize'] * hp['wsize'], h_size=hp['hsize'], e_act=T.tanh) L = logistic(i_size=hp['hsize'], h_size=1, act=identity) del H.params['d_bias'] del embedding.params['d_bias'] del embedding.params['e_bias'] minsize = hp['minsize'] maxsize = hp['maxsize'] dsize = maxsize - minsize + 1 H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros( (dsize, hp['hsize'])), dtype=theano.config.floatX), name='e_bias') path = hp['loadpath'] if path: load(embedding, path + '/embedding.pkl') #load(H,path+'/hidden.pkl') #load(L,path+'/logistic.pkl') hp['embedsize'] = embedding.params['e_weights'].get_value( borrow=True).shape[1] #hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1] jobman.save() H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros( (dsize, hp['hsize'])), dtype=theano.config.floatX), name='e_bias') valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act=identity) valid_embedding.params['weights'] = sp.shared( value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value( borrow=True))) lr = hp['lr'] h_size = hp['hsize'] bs = hp['bs'] posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape( (1, hp['embedsize'] * hp['wsize'])) negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape( (hp['nneg'], hp['embedsize'] * hp['wsize'])) valid_embed = sp.dot(s_valid, valid_embedding.params['weights']).reshape( (nsenna, 
hp['embedsize'] * hp['wsize'])) posit_embed_left = T.concatenate([ posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']], T.zeros_like(posit_embed[:, idx_stop * hp['embedsize']:]) ], axis=1) negat_embed_left = T.concatenate([ negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']], T.zeros_like(negat_embed[:, idx_stop * hp['embedsize']:]) ], axis=1) posit_embed_right = T.concatenate([ T.zeros_like(posit_embed[:, :idx_start * hp['embedsize']]), posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']] ], axis=1) negat_embed_right = T.concatenate([ T.zeros_like(negat_embed[:, :idx_start * hp['embedsize']]), negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']] ], axis=1) posit_embed = T.concatenate([ T.zeros_like(posit_embed[:, :idx_start * hp['embedsize']]), posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']], T.zeros_like(posit_embed[:, idx_stop * hp['embedsize']:]) ], axis=1) negat_embed = T.concatenate([ T.zeros_like(negat_embed[:, :idx_start * hp['embedsize']]), negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']], T.zeros_like(negat_embed[:, idx_stop * hp['embedsize']:]) ], axis=1) #posit_embed = ifelse(T.eq(idx_start, 0), posit_embed_left, posit_embed) #posit_embed = ifelse(T.eq(idx_stop, hp['maxsize']), posit_embed_right, posit_embed) #negat_embed = ifelse(T.eq(idx_start, 0), negat_embed_left, negat_embed) #negat_embed = ifelse(T.eq(idx_stop, hp['maxsize']), negat_embed_right, negat_embed) Hposit = T.tanh( T.dot(posit_embed, H.params['e_weights']) + H.params['e_bias'][idx_stop - idx_start - minsize, :]) Hnegat = T.tanh( T.dot(negat_embed, H.params['e_weights']) + H.params['e_bias'][idx_stop - idx_start - minsize, :]) posit_score = L.encode(Hposit) negat_score = L.encode(Hnegat) valid_score = L.encode(H.encode(valid_embed)) C = (negat_score - posit_score.flatten() + hp['margin']) CC = (rect(C)).mean() opt = theano.function([s_posit, s_negat, idx_start, idx_stop], 
(rect(C)).mean(), updates=dict( L.update(CC, lr) + H.update(CC, lr) + embedding.update_norm(CC, lr))) validfct = theano.function([s_valid], valid_score) def saveexp(): save(embedding, fname + 'embedding.pkl') save(H, fname + 'hidden.pkl') save(L, fname + 'logistic.pkl') delta = hp['wsize'] / 2 rest = hp['wsize'] % 2 freq_idx = cPickle.load( open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl' ))[:2000] fname = '' validsentence = [ ] # cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/valid_debug.pkl')) tseenwords = not debug for e in range(hp['epoch']): hp['split'] = numpy.random.randint(45) sentences = cPickle.load( open( '/mnt/scratch/bengio/bengio_group/data/gutenberg/ints_50000/split' + str(hp['split']) + '.pkl')) nsent = len(sentences) bigc = [] bigr = [] seen_words = 0 for i, s in enumerate(sentences): nword = len(s) seen_words += nword tseenwords += nword if nword < hp['maxsize'] + 2: continue rndsize = numpy.random.randint(low=hp['minsize'] + 1, high=hp['maxsize'] - 1) idxsta = numpy.random.randint(low=1, high=hp['maxsize'] - rndsize) idxsto = idxsta + rndsize print 'r', rndsize, 'b', idxsta, 'e', idxsto, 'shape', H.params[ 'e_bias'].get_value().shape c = [] r = [] if debug: print ' *** Processing document', i, 'with', nword, sys.stdout.flush() for j in range(delta, nword - delta): nd = rndsize / 2 rd = rndsize % 2 pchunk = s[j - delta:j + delta + rest] nchunk = [] rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], )) nchunk = [] for kk in range(hp['nneg']): tmpchunk = copy.copy(pchunk) tmpchunk[idxsta + nd] = rndidx[kk] nchunk += tmpchunk assert len(nchunk) == len(pchunk) * hp['nneg'] p, n = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna)) l = opt(p, n, idxsta, idxsto) c.append(l) if debug: print '.', break if debug: print '' bigc += [numpy.array(c).sum()] if 0: #(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']): tseenwords = 0 valid_embedding.params['weights'] = sp.shared( 
value=scipy.sparse.csr_matrix( embedding.params['e_weights'].get_value(borrow=True))) mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize']) hp['mrk'] = mrk jobman.save() saveexp() print 'Random Valid Mean rank', mrk if seen_words > hp['freq'] or debug: seen_words = 0 hp['score'] = numpy.array(bigc).mean() hp['e'] = e hp['i'] = i print '' print e, i, 'NN Score:', hp['score'] if not debug: ne = knn( freq_idx, embedding.params['e_weights'].get_value(borrow=True)) open('files/' + fname + 'nearest.txt', 'w').write(display(ne, senna)) saveexp() sys.stdout.flush() jobman.save() saveexp()
def run(jobman,debug = False): expstart = time.time() hp = jobman.state if not os.path.exists('files/'): os.mkdir('files/') # Symbolic variables s_posit = T.matrix() s_negat = T.matrix() idx_start = T.lscalar() idx_stop = T.lscalar() s_valid = theano.sparse.csr_matrix() w2i = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl')) i2w = dict( (v,k) for k,v in w2i.iteritems() ) i2w[0] = 'UNK' senna = [ i2w[i] for i in range(len(i2w.keys())) ] nsenna = len(senna) embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity) H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh) L = logistic(i_size = hp['hsize'], h_size = 1, act = identity) del H.params['d_bias'] del embedding.params['d_bias'] del embedding.params['e_bias'] minsize = hp['minsize'] maxsize = hp['maxsize'] dsize = maxsize - minsize +1 H.params['e_bias'] = theano.shared( numpy.array(numpy.zeros((dsize,hp['hsize'])),dtype=theano.config.floatX),name='e_bias') path = hp['loadpath'] if path: load(embedding,path+'/embedding.pkl') #load(H,path+'/hidden.pkl') #load(L,path+'/logistic.pkl') hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1] #hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1] jobman.save() H.params['e_bias'] = theano.shared( numpy.array(numpy.zeros((dsize,hp['hsize'])),dtype=theano.config.floatX),name='e_bias') valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity) valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) lr = hp['lr'] h_size = hp['hsize'] bs = hp['bs'] posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize'])) negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize'])) valid_embed = 
sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize'])) posit_embed_left = T.concatenate([posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']], T.zeros_like(posit_embed[:,idx_stop*hp['embedsize']:]) ],axis=1) negat_embed_left = T.concatenate([negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']], T.zeros_like(negat_embed[:,idx_stop*hp['embedsize']:]) ],axis=1) posit_embed_right = T.concatenate([ T.zeros_like(posit_embed[:,:idx_start*hp['embedsize']]), posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']]],axis=1) negat_embed_right = T.concatenate([ T.zeros_like(negat_embed[:,:idx_start*hp['embedsize']]), negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']]],axis=1) posit_embed = T.concatenate([ T.zeros_like(posit_embed[:,:idx_start*hp['embedsize']]), posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']], T.zeros_like(posit_embed[:,idx_stop*hp['embedsize']:]) ],axis=1) negat_embed = T.concatenate([ T.zeros_like(negat_embed[:,:idx_start*hp['embedsize']]), negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']], T.zeros_like(negat_embed[:,idx_stop*hp['embedsize']:]) ],axis=1) #posit_embed = ifelse(T.eq(idx_start, 0), posit_embed_left, posit_embed) #posit_embed = ifelse(T.eq(idx_stop, hp['maxsize']), posit_embed_right, posit_embed) #negat_embed = ifelse(T.eq(idx_start, 0), negat_embed_left, negat_embed) #negat_embed = ifelse(T.eq(idx_stop, hp['maxsize']), negat_embed_right, negat_embed) Hposit = T.tanh(T.dot(posit_embed,H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize,:]) Hnegat = T.tanh(T.dot(negat_embed,H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize,:]) posit_score = L.encode(Hposit) negat_score = L.encode(Hnegat) valid_score = L.encode(H.encode(valid_embed)) C = (negat_score - posit_score.flatten() + hp['margin']) CC = (rect(C)).mean() opt = theano.function([s_posit, s_negat, idx_start, idx_stop], 
(rect(C)).mean(), updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) ) validfct = theano.function([s_valid],valid_score) def saveexp(): save(embedding,fname+'embedding.pkl') save(H,fname+'hidden.pkl') save(L,fname+'logistic.pkl') delta = hp['wsize']/2 rest = hp['wsize']%2 freq_idx = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'))[:2000] fname = '' validsentence = []# cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/valid_debug.pkl')) tseenwords = not debug for e in range(hp['epoch']): hp['split'] = numpy.random.randint(45) sentences = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/ints_50000/split'+str(hp['split'])+'.pkl')) nsent = len(sentences) bigc = [] bigr = [] seen_words = 0 for i,s in enumerate(sentences): nword = len(s) seen_words += nword tseenwords += nword if nword < hp['maxsize'] + 2: continue rndsize = numpy.random.randint(low=hp['minsize']+1,high=hp['maxsize']-1) idxsta = numpy.random.randint(low=1, high=hp['maxsize']-rndsize) idxsto = idxsta+rndsize print 'r',rndsize,'b',idxsta,'e',idxsto,'shape',H.params['e_bias'].get_value().shape c =[] r =[] if debug: print ' *** Processing document',i,'with',nword, sys.stdout.flush() for j in range(delta,nword-delta): nd = rndsize/2 rd = rndsize%2 pchunk = s[j-delta:j+delta+rest] nchunk = [] rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],)) nchunk = [] for kk in range(hp['nneg']): tmpchunk = copy.copy(pchunk) tmpchunk[idxsta+nd] = rndidx[kk] nchunk += tmpchunk assert len(nchunk) == len(pchunk)*hp['nneg'] p, n = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna)) l = opt(p,n, idxsta, idxsto) c.append(l) if debug: print '.', break if debug: print '' bigc += [numpy.array(c).sum()] if 0:#(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']): tseenwords = 0 valid_embedding.params['weights'] = sp.shared(value = 
scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize']) hp['mrk'] = mrk jobman.save() saveexp() print 'Random Valid Mean rank',mrk if seen_words > hp['freq'] or debug: seen_words = 0 hp['score'] = numpy.array(bigc).mean() hp['e'] = e hp['i'] = i print '' print e,i,'NN Score:', hp['score'] if not debug: ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True)) open('files/'+fname+'nearest.txt','w').write(display(ne,senna)) saveexp() sys.stdout.flush() jobman.save() saveexp()
# Encode the RESOURCE column of the test data as a single-column matrix
# through the already-fitted ``ohe`` transformer.
resource_column = test_data["RESOURCE"].reshape((test_data.shape[0], 1))
test_model_mat = ohe.transform(resource_column)

# Theano expects its configured float type.
float_x = theano.config.floatX
train_model_mat = train_model_mat.astype(float_x)
test_model_mat = test_model_mat.astype(float_x)

# Hard-coded split sizes; their sum must match the rows being split.
n_train = 22000
n_valid = 5000
n_test = 5769

# One marker per row (0 = train, 1 = valid, 2 = test), shuffled so that each
# row is assigned to a subset at random.
train_i = np.zeros(n_train)
valid_i = np.ones(n_valid)
test_i = np.full(n_test, 2.0)
perm = np.random.permutation(np.concatenate([train_i, valid_i, test_i]))

train_mask = perm == 0
valid_mask = perm == 1
test_mask = perm == 2

# All three subsets (including the "test" one) are carved out of the
# training matrix and the ACTION column of ``train_data``.
train_set_x = sparse.shared(train_model_mat[np.flatnonzero(train_mask)])
train_set_y = shared(train_data.ACTION[train_mask].astype("int32"))
valid_set_x = sparse.shared(train_model_mat[np.flatnonzero(valid_mask)])
valid_set_y = shared(train_data.ACTION[valid_mask].astype("int32"))
test_set_x = sparse.shared(train_model_mat[np.flatnonzero(test_mask)])
test_set_y = shared(train_data.ACTION[test_mask].astype("int32"))

datasets = [
    (train_set_x, train_set_y),
    (valid_set_x, valid_set_y),
    (test_set_x, test_set_y),
]

rbm = test_rbm(datasets)
# dbn = train_dbn(datasets, batch_size = 10, pretraining_epochs = 100, training_epochs = 1000)
# pred_set_x = sparse.shared(test_model_mat)
# pred_proba, _ = dbn.build_prediction_functions(pred_set_x, batch_size = 100)
def run(jobman,debug = False): expstart = time.time() hp = jobman.state if not os.path.exists('files/'): os.mkdir('files/') # Symbolic variables s_bow = T.matrix() s_idx = T.iscalar() s_tf = T.scalar() s_posit = T.matrix()#theano.sparse.csr_matrix() s_negat = T.matrix()#theano.sparse.csr_matrix() sentences = cPickle.load(open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl')) senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl')) gsubset = cPickle.load(open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten().tolist() hashtab = dict( zip( gsubset, range( len( gsubset)))) tfidf_data = numpy.load('/scratch/rifaisal/data/guten/guten_tfidf.npy').item().tocsr().astype('float32') #tfidf = cPickle.load(open('/scratch/rifaisal/repos/senna/gutentokenizer.pkl')) senna = numpy.array(senna)[gsubset].tolist() s_valid = theano.sparse.csr_matrix() validsentence = sentences[10000:10010] nsent = len(sentences) nsenna = len(senna) # Layers embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity) H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh) L = logistic(i_size = hp['hsize'], h_size = 1, act = identity) S = logistic(i_size = hp['embedsize'], h_size = nsenna, act= T.nnet.softmax) valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity) valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) valid_embedding.params['bias'] = embedding.params['e_bias'] lr = hp['lr'] h_size = hp['hsize'] bs = hp['bs'] posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize'])) negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize'])) valid_embed = sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize'])) posit_score = L.encode(H.encode(posit_embed)) negat_score = 
L.encode(H.encode(negat_embed)) valid_score = L.encode(H.encode(valid_embed)) C = (negat_score - posit_score.flatten() + hp['margin']) s_bow_pred = S.encode(embedding.encode(s_bow)) pred = s_tf * nllsoft(s_bow_pred,s_idx) CC = (rect(C)).mean() + hp['lambda'] * pred opt = theano.function([s_posit, s_negat, s_bow, s_idx, s_tf], [(rect(C)).mean(),pred], updates = dict( S.update(CC,lr) + L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) ) #validfct = theano.function([s_valid],valid_score) def saveexp(): save(embedding,fname+'embedding.pkl') save(H,fname+'hidden.pkl') save(L,fname+'logistic.pkl') delta = hp['wsize']/2 rest = hp['wsize']%2 freq_idx = cPickle.load(open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000] freq_idx = [ hashtab[idx] for idx in freq_idx ] fname = '' for e in range(hp['epoch']): c = [] r = [] count = 1 for i in range(nsent): rsent = numpy.random.randint(nsent-1) nword = len(sentences[rsent]) if nword < hp['wsize'] + 2: continue pidx = numpy.random.randint(low = delta, high = nword-delta) pchunk = sentences[rsent][pidx-delta:pidx+delta+rest] nchunk = [] st = sentences[rsent][pidx-delta:pidx] en = sentences[rsent][pidx+1:pidx+delta+rest] rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],)) nchunk = [] for j in range(hp['nneg']): nchunk += en + [rndidx[j]] + st assert len(nchunk) == len(pchunk)*hp['nneg'] tfidf_chunk = tfidf_data[rsent:rsent+1].toarray() #pdb.set_trace() tfidf_value = tfidf_chunk[0,sentences[rsent][pidx]] tfidf_chunk[0,sentences[rsent][pidx]] = 0. tfidx = sentences[rsent][pidx] # numpy.zeros(tfidf_chunk.shape).astype('float32') #tfidx[0,sentences[rsent][pidx]] = 1. 
p, n, b, iidx, tfval = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna), tfidf_chunk, tfidx, tfidf_value ) count += tfval!=0 l,g = opt(p,n,b, iidx, tfval) c = c c.append(l) r.append(g) """ if (time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (i+1)%(20*hp['freq']) == 0 and debug==False: valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize']) hp['mrk'] = mrk jobman.save() saveexp() print 'Random Valid Mean rank',mrk """ if (i+1)%hp['freq'] == 0 or debug: hp['score'] = numpy.array(c).sum() / (numpy.array(c)>0).sum() hp['pred'] = numpy.array(r).sum()/float(count) hp['e'] = e hp['i'] = i print '' print e,i,'NN Score:', hp['score'], 'Reconstruction:', hp['pred'] if debug != True: ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True)) open('files/'+fname+'nearest.txt','w').write(display(ne,senna)) saveexp() sys.stdout.flush() jobman.save() saveexp()
def run(jobman,debug = False): expstart = time.time() hp = jobman.state if not os.path.exists('files/'): os.mkdir('files/') # Symbolic variables s_posit = T.matrix() s_negat = T.matrix() s_valid = theano.sparse.csr_matrix() #vocab = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl')) #senna = cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/WestburyLab.wikicorp.201004_vocab30k.pkl')) w2i = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/merged_word2idx.pkl')) i2w = dict( (v,k) for k,v in w2i.iteritems() ) i2w[0] = 'UNK' senna = [ i2w[i] for i in range(len(i2w.keys())) ] nsenna = len(senna) embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity) H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh) L = logistic(i_size = hp['hsize'], h_size = 1, act = identity) path = hp['loadpath'] if path: load(embedding,path+'/embedding.pkl') load(H,path+'/hidden.pkl') load(L,path+'/logistic.pkl') hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1] hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1] jobman.save() valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity) valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) valid_embedding.params['bias'] = embedding.params['e_bias'] lr = hp['lr'] h_size = hp['hsize'] bs = hp['bs'] posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize'])) negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize'])) valid_embed = sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize'])) posit_score = L.encode(H.encode(posit_embed)) negat_score = L.encode(H.encode(negat_embed)) valid_score = L.encode(H.encode(valid_embed)) C = (negat_score - posit_score.flatten() + hp['margin']) CC = 
(rect(C)).mean() opt = theano.function([s_posit, s_negat], (rect(C)).mean(), updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) ) #validfct = theano.function([s_valid],valid_score) def saveexp(): save(embedding,fname+'embedding.pkl') save(H,fname+'hidden.pkl') save(L,fname+'logistic.pkl') delta = hp['wsize']/2 rest = hp['wsize']%2 #freq_idx = range(29000,30000) freq_idx = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/sorted_vocab.pkl'))[:2000] fname = '' #validsentence = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/valid.pkl')) tseenwords = not debug for e in range(hp['epoch']): hp['split'] = numpy.random.randint(45) sentences = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/split'+str(hp['split'])+'.pkl')) nsent = len(sentences) bigc = [] bigr = [] seen_words = 0 for i,s in enumerate(sentences): nword = len(s) seen_words += nword tseenwords += nword if nword < hp['wsize'] + 2: continue c =[] r =[] if debug: print ' *** Processing document',i,'with',nword, sys.stdout.flush() for j in range(delta,nword-delta): pchunk = s[j-delta:j+delta+rest] nchunk = [] st = s[j-delta:j] en = s[j+1:j+delta+rest] rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],)) nchunk = [] for kk in range(hp['nneg']): nchunk += st + [rndidx[kk]] + en assert len(nchunk) == len(pchunk)*hp['nneg'] p, n = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna)) l = opt(p,n) c.append(l) if debug: print '.', break if debug: print '' bigc += [numpy.array(c).sum()] if 0:#(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']): tseenwords = 0 valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True))) mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize']) hp['mrk'] = mrk jobman.save() saveexp() print 'Random Valid Mean rank',mrk if seen_words > hp['freq'] or debug: seen_words = 0 hp['score'] = numpy.array(bigc).mean() 
hp['e'] = e hp['i'] = i print '' print e,i,'NN Score:', hp['score'] if not debug: ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True)) open('files/'+fname+'nearest.txt','w').write(display(ne,senna)) saveexp() sys.stdout.flush() jobman.save() saveexp()