def evaluate_baseline_method(method, item_dataset, min_supports, dataset_name):
    for min_support in min_supports:
        item_sets, exc_time = method(item_dataset, min_support)
        io.save([item_sets, exc_time],
                method.__name__ + "_minSup=" + str(min_support),
                "freq_sets/" + dataset_name,
                loc="_results")
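# Hypothetical usage sketch: `apriori`, `item_dataset`, and "retail" are
# placeholders, not defined above. Any mining function with the
# (item_dataset, min_support) -> (item_sets, exc_time) signature can be passed.
evaluate_baseline_method(apriori, item_dataset,
                         min_supports=[0.1, 0.05, 0.01],
                         dataset_name="retail")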
def save_spn(spn,
             const_time,
             dataset_name,
             rdc_threshold,
             min_instances_slice,
             value_dict=None,
             nrows=None):
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    name = "rdc=" + str(rdc_threshold) + "_mis=" + str(min_instances_slice)
    if nrows:
        name = name + "_n=" + np.format_float_scientific(
            nrows, precision=0, trim='-')
    io.save([spn, value_dict, const_time], name, dataset_name, loc="_spns")
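# A saved SPN can presumably be reloaded under the same naming convention, as
# evaluate_spn1_method below does; rdc_threshold and min_instances_slice must
# be the values used at save time (illustrative sketch only).
ident = "rdc=" + str(rdc_threshold) + "_mis=" + str(min_instances_slice)
spn, value_dict, const_time = io.load(ident, dataset_name, loc="_spns")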
def evaluate_spn1_method(rdc_thresholds,
                         min_instances_slices,
                         min_supports,
                         dataset_name,
                         binary_positive=True):
    for rdc_threshold in rdc_thresholds:
        for min_instances_slice in min_instances_slices:
            loc = "_spns"
            ident = "rdc=" + str(rdc_threshold) + "_mis=" + str(
                min_instances_slice)
            # save_spn stores [spn, value_dict, const_time]; unpack all three.
            spn, value_dict, const_time = io.load(ident, dataset_name, loc)
            for min_support in min_supports:
                item_sets, exc_time = methods.spn1(
                    spn, min_support, binary_positive=binary_positive)
                io.save([item_sets, exc_time],
                        "spn1_" + ident + "_minSup=" + str(min_support),
                        "freq_sets/" + dataset_name,
                        loc="_results")
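# Hypothetical driver: the grids must match the identifiers under which the
# SPNs were stored by save_spn (all values here are illustrative only).
evaluate_spn1_method(rdc_thresholds=[0.1, 0.2, 0.3],
                     min_instances_slices=[0.01, 0.05],
                     min_supports=[0.1, 0.05, 0.01],
                     dataset_name="retail")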
def saveexp():
    save(embedding, fname + 'embedding.pkl')
    save(H, fname + 'hidden.pkl')
    save(L, fname + 'logistic.pkl')
    print 'Saved successfully'
def run(jobman, debug=False):
    expstart = time.time()
    hp = jobman.state

    # Symbolic variables
    s_bow = T.matrix()
    s_posit = T.matrix()  # theano.sparse.csr_matrix()
    s_negat = T.matrix()  # theano.sparse.csr_matrix()

    sentences = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))
    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten().tolist()
    hashtab = dict(zip(gsubset, range(len(gsubset))))
    senna = numpy.array(senna)[gsubset].tolist()

    s_valid = theano.sparse.csr_matrix()

    validsentence = sentences[-10:]
    sentences = sentences[:-10]
    nsent = len(sentences)
    nsenna = len(senna)

    # Layers
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=T.nnet.sigmoid)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=rect,
           d_act=hardtanh)
    L = logistic(i_size=hp['hsize'], h_size=1)
    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=T.nnet.sigmoid)
    # The validation embedding shares the trained encoder parameters.
    valid_embedding.params['weights'] = embedding.params['e_weights']
    valid_embedding.params['bias'] = embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = embedding.encode(s_posit).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = embedding.encode(s_negat).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    valid_embed = valid_embedding.encode(s_valid).reshape(
        (nsenna, hp['embedsize'] * hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    valid_score = L.encode(H.encode(valid_embed))

    # Margin ranking cost plus a weighted reconstruction penalty.
    C = (negat_score - posit_score.flatten() + hp['margin'])
    rec = embedding.reconstruct(s_bow, loss='ce')
    CC = (rect(C)).mean() + hp['lambda'] * rec

    opt = theano.function([s_posit, s_negat, s_bow], [C.mean(), rec],
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update(CC, lr)))
    validfct = theano.function([s_valid], valid_score)

    def saveexp():
        save(embedding, fname + 'embedding.pkl')
        save(H, fname + 'hidden.pkl')
        save(L, fname + 'logistic.pkl')
        print 'Saved successfully'

    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    freq_idx = cPickle.load(
        open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx = [hashtab[idx] for idx in freq_idx]
    fname = sys.argv[0] + '_'

    for e in range(hp['epoch']):
        c = []
        r = []
        for i in range(nsent):
            rsent = numpy.random.randint(nsent - 1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue
            pidx = numpy.random.randint(low=delta, high=nword - delta)
            pchunk = sentences[rsent][pidx - delta:pidx + delta + rest]
            st = sentences[rsent][pidx - delta:pidx]
            en = sentences[rsent][pidx + 1:pidx + delta + rest]
            rndidx = numpy.random.randint(nsenna, size=(hp['nneg'],))
            # Build hp['nneg'] negative windows by swapping in a random word.
            nchunk = []
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st
            assert len(nchunk) == len(pchunk) * hp['nneg']
            p, n, b = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna),
                       idx2vec(sentences[rsent], nsenna))
            l, g = opt(p, n, b)
            c.append(l)
            r.append(g)
            if (time.time() - expstart) > (3600 * 24 * 6 + 3600 * 20) or \
                    (i + 1) % (50 * hp['freq']) == 0:
                mrk = evaluation.error(validsentence, validfct, nsenna,
                                       hp['wsize'])
                hp['mrk'] = mrk
                hp['e'] = e
                hp['i'] = i
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank', mrk
            if i % hp['freq'] == 0:
                hp['score'] = numpy.array(c).mean()
                hp['rec'] = numpy.array(r).mean()
                print e, i, 'NN Score:', hp['score'], 'Reconstruction:', hp['rec']
                ne = knn(freq_idx,
                         embedding.params['e_weights'].get_value(borrow=True))
                open('files/' + fname + 'nearest.txt', 'w').write(
                    display(ne, senna))
                saveexp()
    sys.stdout.flush()
    jobman.save()
    saveexp()  # final checkpoint
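# A minimal sketch of the idx2mat / idx2vec helpers assumed by run() above
# (their real definitions are not shown here): idx2mat one-hot encodes a
# window of token indices, idx2vec builds a binary bag-of-words row.
import numpy

def idx2mat(indices, vocab_size):
    # (len(indices), vocab_size) one-hot matrix, one row per token.
    m = numpy.zeros((len(indices), vocab_size), dtype='float32')
    m[numpy.arange(len(indices)), indices] = 1.0
    return m

def idx2vec(indices, vocab_size):
    # (1, vocab_size) binary bag-of-words vector for a whole sentence.
    v = numpy.zeros((1, vocab_size), dtype='float32')
    v[0, indices] = 1.0
    return v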
def saveexp():
    save(embedding, fname + 'embedding.pkl')
    save(H, fname + 'hidden.pkl')
    save(L, fname + 'logistic.pkl')
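# A plausible sketch of the save()/load() helpers used throughout (assumed,
# not shown above): thin cPickle wrappers over a layer's shared-variable
# parameter dict, matching the layer.params[...] access pattern seen here.
import cPickle

def save(layer, path):
    # Persist raw parameter arrays rather than the Theano graph.
    params = dict((k, v.get_value()) for k, v in layer.params.items())
    with open(path, 'wb') as f:
        cPickle.dump(params, f, protocol=-1)

def load(layer, path):
    with open(path, 'rb') as f:
        params = cPickle.load(f)
    for k, v in params.items():
        layer.params[k].set_value(v)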
def run(jobman, debug=False):
    hp = jobman.state

    # Symbolic variables
    s_posit = T.matrix()  # theano.sparse.csr_matrix()
    s_negat = T.matrix()  # theano.sparse.csr_matrix()
    s_valid = theano.sparse.csr_matrix()

    sentences = cPickle.load(
        open('/data/lisatmp2/rifaisal/guten_subset_idx.pkl'))
    validsentence = sentences[-10:]
    sentences = sentences[:-10]

    senna = cPickle.load(open('/data/lisatmp2/rifaisal/senna.pkl'))
    gsubset = cPickle.load(
        open('/data/lisatmp2/rifaisal/guten_vocab_subset.pkl')).flatten().tolist()
    hashtab = dict(zip(gsubset, range(len(gsubset))))
    senna = numpy.array(senna)[gsubset].tolist()

    nsent = len(sentences)
    nsenna = len(senna)

    # Layers
    embedding = logistic(i_size=nsenna, h_size=hp['embedsize'], act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=rect,
           d_act=hardtanh)
    L = logistic(i_size=hp['hsize'], h_size=1)  # , act = identity)
    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=identity)
    #valid_embedding.params['weights'].set_value(embedding.params['weights'].get_value(borrow=True))
    #valid_embedding.params['bias'].set_value(embedding.params['bias'].get_value(borrow=True))

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = embedding.encode(s_posit).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = embedding.encode(s_negat).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    #valid_embed = valid_embedding.encode(s_valid).reshape(
    #    (nsenna, hp['embedsize'] * hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    #valid_score = L.encode(H.encode(valid_embed))

    # Margin ranking cost only (no reconstruction term in this variant).
    C = (negat_score - posit_score.flatten() + hp['margin'])
    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat],
                          C.mean(),
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update_norm(CC, lr)))

    #validfct = theano.function([s_valid], valid_score)
    #print 'Random Valid Mean rank', evaluation.error(validsentence, validfct, nsenna, hp['wsize'])

    # Warm-start from a previously trained dense experiment.
    #load(valid_embedding, 'files/gutensubsetdense_exp.py_embedding.pkl')
    load(embedding, 'files/gutensubsetdense_exp.py_embedding.pkl')
    load(H, 'files/gutensubsetdense_exp.py_hidden.pkl')
    load(L, 'files/gutensubsetdense_exp.py_logistic.pkl')

    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    freq_idx = cPickle.load(
        open('/data/lisatmp2/rifaisal/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx = [hashtab[idx] for idx in freq_idx]
    fname = sys.argv[0] + '_'

    for e in range(hp['epoch']):
        c = []
        for i in range(nsent):
            rsent = numpy.random.randint(nsent - 1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue
            pidx = numpy.random.randint(low=delta, high=nword - delta)
            pchunk = sentences[rsent][pidx - delta:pidx + delta + rest]
            st = sentences[rsent][pidx - delta:pidx]
            en = sentences[rsent][pidx + 1:pidx + delta + rest]
            rndidx = numpy.random.randint(nsenna, size=(hp['nneg'],))
            # Build hp['nneg'] negative windows by swapping in a random word.
            nchunk = []
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st
            assert len(nchunk) == len(pchunk) * hp['nneg']
            #start = time.time()
            p, n = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna))
            #print 'Select row:', time.time() - start,
            #start = time.time()
            c.append(opt(p, n))
            #print 'grad up:', time.time() - start
            if i % hp['freq'] == 0:
                print e, i, numpy.array(c).mean(0)
                ne = knn(freq_idx,
                         embedding.params['weights'].get_value(borrow=True))
                save(embedding, fname + 'embedding.pkl')
                save(H, fname + 'hidden.pkl')
                save(L, fname + 'logistic.pkl')
                sys.stdout.flush()
                open('files/' + fname + 'nearest.txt',
                     'w').write(display(ne, senna))
                #print 'Valid Mean rank', evaluation.error(validsentence, validfct, nsenna, hp['wsize'])

    save(embedding, fname + 'embedding.pkl')
    save(H, fname + 'hidden.pkl')
    save(L, fname + 'logistic.pkl')
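# A plausible sketch of the activation helpers used by both run() variants
# (assumed, not shown above): rect is a rectifier, hardtanh clips to [-1, 1],
# and identity passes its input through; all are elementwise Theano expressions.
import theano.tensor as T

def rect(x):
    return T.maximum(0., x)

def hardtanh(x):
    return T.clip(x, -1., 1.)

def identity(x):
    return x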