import os
import sys
import time
import cPickle

import numpy
import theano
import theano.sparse
import theano.tensor as T

# Project-local helpers assumed importable from elsewhere in this repo:
# idx2spmat, cae, ae, logistic, identity, rect, load, save, wsim, knn, display.


def error(sentences, scorefct, embeddings, wsize):
    """Mean rank of the true centre word among all vocabulary candidates.

    For every window of width `wsize`, score each of the `embeddings`
    vocabulary words substituted at the centre position and record the
    rank of the word actually observed there; lower is better.
    """
    delta = wsize / 2  # half-window (integer division under Python 2)
    rest = wsize % 2
    rank = []
    for i, s in enumerate(sentences):
        nword = len(s)
        if nword < wsize + 2:
            continue
        for j in range(delta, nword - delta):
            # One candidate window per vocabulary entry, word k at the centre.
            chunks = []
            st = s[j - delta:j]
            en = s[j + 1:j + delta + 1]
            for k in range(embeddings):
                chunks += st + [k] + en
            score = scorefct(idx2spmat(chunks, embeddings))
            # Rank candidates from highest to lowest score, then record the
            # position of the true centre word s[j] in that ordering.
            sortedscore = numpy.argsort(score.flatten())[::-1]
            rank += [numpy.argwhere(sortedscore == s[j]).flatten()[0]]
    return numpy.mean(rank)
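# `idx2spmat` is a project-local helper that is not defined in this file. Below
# is a minimal sketch of what it is assumed to do, based on how it is called
# here: turn a flat list of word indices into a one-hot sparse matrix with one
# row per word. The name and exact layout are assumptions, not the project's
# actual implementation.
import scipy.sparse


def idx2spmat_sketch(idx, vsize):
    """Return a (len(idx), vsize) one-hot CSR matrix; row r is hot at idx[r]."""
    data = numpy.ones(len(idx), dtype='float32')
    rows = numpy.arange(len(idx))
    return scipy.sparse.csr_matrix((data, (rows, idx)),
                                   shape=(len(idx), vsize))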
def run(jobman, debug=False):
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'):
        os.mkdir('files/')

    # Symbolic variables.
    s_posit = T.matrix()
    s_negat = T.matrix()
    s_valid = theano.sparse.csr_matrix()

    # Vocabulary: index 0 is reserved for unknown words.
    w2i = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl'))
    i2w = dict((v, k) for k, v in w2i.iteritems())
    i2w[0] = 'UNK'
    senna = [i2w[i] for i in range(len(i2w))]
    nsenna = len(senna)

    # Model: a contractive auto-encoder produces the word embeddings, a tanh
    # auto-encoder layer reads the concatenated window, and a linear unit
    # scores it.
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'], h_size=hp['hsize'], e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)

    path = hp['loadpath']
    if path:
        load(embedding, path + '/embedding.pkl')
        load(H, path + '/hidden.pkl')
        load(L, path + '/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1]
        hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    # Drop the biases that are not trained so the updates below skip them.
    del H.params['d_bias']
    del embedding.params['d_bias']
    del embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    # Row 0 of s_valid holds the positive window; the next hp['nneg'] rows
    # hold the corrupted ones.
    posit_embed = theano.sparse.dot(s_valid, embedding.params['e_weights']).reshape(
        (1 + hp['nneg'], hp['embedsize'] * hp['wsize']))
    # posit_embed = T.dot(s_valid, embedding.params['e_weights']).reshape((1 + hp['nneg'], hp['embedsize'] * hp['wsize']))
    # negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'], hp['embedsize'] * hp['wsize']))
    posit_score = L.encode(H.encode(posit_embed))
    # negat_score = L.encode(H.encode(negat_embed))

    # Pairwise hinge ranking loss: every corrupted window should score at
    # least hp['margin'] below the positive one.
    C = posit_score[1:] - posit_score[0] + hp['margin']
    CC = rect(C).mean()

    opt = theano.function(
        [s_valid], rect(C).mean(),
        updates=dict(L.update(CC, lr) + H.update(CC, lr) +
                     embedding.update_norm(CC, lr)))

    def saveexp():
        save(embedding, fname + 'embedding.pkl')
        save(H, fname + 'hidden.pkl')
        save(L, fname + 'logistic.pkl')

    delta = hp['wsize'] / 2  # half-window (integer division under Python 2)
    rest = hp['wsize'] % 2
    wsimtester = wsim()
    freq_idx = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'))[:1000]
    fname = ''
    tseenwords = not debug

    for e in range(hp['epoch']):
        # Pick one of the 285 pre-tokenized Gutenberg splits at random.
        hp['split'] = numpy.random.randint(285)
        sentences = cPickle.load(open(
            '/mnt/scratch/bengio/bengio_group/data/gutenberg/small_ints_50000/split'
            + str(hp['split']) + '.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []
        E = embedding.params['e_weights'].get_value(borrow=True)
        seen_words = 0
        for i, s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword
            if nword < hp['wsize'] + 2:
                continue
            c = []
            r = []
            if debug:
                print ' *** Processing document', i, 'with', nword,
                sys.stdout.flush()
            for j in range(delta, nword - delta):
                cstart = time.time()
                pchunk = s[j - delta:j + delta + rest]
                st = s[j - delta:j]
                en = s[j + 1:j + delta + rest]
                # Corrupt the centre word with hp['nneg'] random vocabulary
                # indices to build the negative windows.
                rndidx = numpy.random.randint(nsenna, size=(hp['nneg'],))
                nchunk = []
                for kk in range(hp['nneg']):
                    nchunk += st + [rndidx[kk]] + en
                # assert len(nchunk) == len(pchunk) * hp['nneg']
                pn = idx2spmat(pchunk + nchunk, nsenna)
                # assert pn[0, pchunk[0]] == 1
                ctime = time.time() - cstart
                tstart = time.time()
                l = opt(pn)
                c.append(l)
                if debug:
                    print '.',
                    break
            if debug:
                print ''
            bigc += [numpy.array(c).sum()]
            if tseenwords > hp['wsimfreq'] or debug:
                # Periodic word-similarity evaluation (Spearman correlation).
                hp['wsim'] = wsimtester.score(embedding.params['e_weights'].get_value(borrow=True))
                print i, 'WordSim Sp Corr:', hp['wsim']
                sys.stdout.flush()
                hp['score'] = numpy.array(bigc).mean()
                hp['e'] = e
                hp['i'] = i
                print e, i, 'NN Score:', hp['score']
                tseenwords = 0
                jobman.save()
            if seen_words > hp['freq'] or debug:
                # Periodic checkpoint: dump nearest neighbours of the most
                # frequent words and save the model.
                seen_words = 0
                ne = knn(freq_idx, embedding.params['e_weights'].get_value(borrow=True))
                open('files/' + fname + 'nearest.txt', 'w').write(display(ne, i2w))
                saveexp()
                sys.stdout.flush()
                jobman.save()
    saveexp()
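# A minimal stand-in for the jobman handle that `run` expects: an object with
# a `.state` dict of hyperparameters and a `.save()` method. The class name
# and every hyperparameter value below are illustrative assumptions, not the
# settings used in the original experiments, and the hard-coded data paths
# above must exist for training to actually run.
class FakeJobman(object):
    def __init__(self, state):
        self.state = state

    def save(self):
        pass  # a real jobman would persist self.state here


if __name__ == '__main__':
    state = {'embedsize': 50, 'wsize': 5, 'hsize': 100, 'lr': 1e-3,
             'bs': 1, 'nneg': 10, 'margin': 1.0, 'epoch': 5,
             'freq': 100000, 'wsimfreq': 50000, 'loadpath': None}
    run(FakeJobman(state), debug=True)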