Example #1
def evaluate_baseline_method(method, item_dataset, min_supports, dataset_name):
    for min_support in min_supports:
        item_sets, exc_time = method(item_dataset, min_support)
        io.save([item_sets, exc_time],
                method.__name__ + "_minSup=" + str(min_support),
                "freq_sets/" + dataset_name,
                loc="_results")
Example #2
def save_spn(
    spn,
    const_time,
    dataset_name,
    rdc_threshold,
    min_instances_slice,
    value_dict=None,
    nrows=None,
):
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    name = "rdc=" + str(rdc_threshold) + "_mis=" + str(min_instances_slice)
    if nrows:
        name = name + "_n=" + np.format_float_scientific(
            nrows, precision=0, trim='-')
    io.save([spn, value_dict, const_time], name, dataset_name, loc="_spns")
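A minimal usage sketch; learn_spn is a hypothetical stand-in for whatever learner produced spn, assumed here to take the same two hyperparameters:

import time

start = time.time()
spn = learn_spn(train_data, rdc_threshold=0.3, min_instances_slice=100)  # assumed helper
save_spn(spn, time.time() - start, "adult", 0.3, 100, nrows=len(train_data))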
Example #3
def evaluate_spn1_method(rdc_thresholds,
                         min_instances_slices,
                         min_supports,
                         dataset_name,
                         binary_positive=True):

    for rdc_threshold in rdc_thresholds:
        for min_instances_slice in min_instances_slices:

            loc = "_spns"
            ident = "rdc=" + str(rdc_threshold) + "_mis=" + str(
                min_instances_slice)
            spn, const_time = io.load(ident, dataset_name, loc)

            for min_support in min_supports:
                item_sets, exc_time = methods.spn1(
                    spn, min_support, binary_positive=binary_positive)
                io.save([item_sets, exc_time],
                        "spn1_" + ident + "_minSup=" + str(min_support),
                        "freq_sets/" + dataset_name,
                        loc="_results")
Example #4
def saveexp():
    save(embedding, fname + 'embedding.pkl')
    save(H, fname + 'hidden.pkl')
    save(L, fname + 'logistic.pkl')
    print 'Saved successfully'
Example #5
def run(jobman, debug=False):
    expstart = time.time()
    hp = jobman.state

    # Symbolic variables
    s_bow = T.matrix()
    s_posit = T.matrix()  #theano.sparse.csr_matrix()
    s_negat = T.matrix()  #theano.sparse.csr_matrix()

    sentences = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_subset_idx.pkl'))

    senna = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    gsubset = cPickle.load(
        open('/scratch/rifaisal/data/guten/guten_vocab_subset.pkl')).flatten(
        ).tolist()
    hashtab = dict(zip(gsubset, range(len(gsubset))))

    senna = numpy.array(senna)[gsubset].tolist()
    s_valid = theano.sparse.csr_matrix()

    validsentence = sentences[-10:]
    sentences = sentences[:-10]

    nsent = len(sentences)
    nsenna = len(senna)

    # Layers

    embedding = cae(i_size=nsenna,
                    h_size=hp['embedsize'],
                    e_act=T.nnet.sigmoid)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=rect,
           d_act=hardtanh)
    L = logistic(i_size=hp['hsize'], h_size=1)

    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=T.nnet.sigmoid)
    valid_embedding.params['weights'] = embedding.params['e_weights']
    valid_embedding.params['bias'] = embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = embedding.encode(s_posit).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = embedding.encode(s_negat).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    valid_embed = valid_embedding.encode(s_valid).reshape(
        (nsenna, hp['embedsize'] * hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    valid_score = L.encode(H.encode(valid_embed))

    # Margin ranking cost: the positive window should outscore every
    # corrupted window by at least hp['margin'].
    C = (negat_score - posit_score.flatten() + hp['margin'])

    rec = embedding.reconstruct(s_bow, loss='ce')
    CC = (rect(C)).mean() + hp['lambda'] * rec

    opt = theano.function([s_posit, s_negat, s_bow], [C.mean(), rec],
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update(CC, lr)))

    validfct = theano.function([s_valid], valid_score)

    def saveexp():
        save(embedding, fname + 'embedding.pkl')
        save(H, fname + 'hidden.pkl')
        save(L, fname + 'logistic.pkl')
        print 'Saved successfully'

    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    freq_idx = cPickle.load(
        open('/scratch/rifaisal/data/guten/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx = [hashtab[idx] for idx in freq_idx]

    fname = sys.argv[0] + '_'

    for e in range(hp['epoch']):
        c = []
        r = []
        for i in range(nsent):
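            # Sample a random sentence, take a hp['wsize'] window around a
            # random centre position, and build hp['nneg'] corrupted windows
            # whose centre word is swapped for a random vocabulary index.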
            rsent = numpy.random.randint(nsent - 1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue

            pidx = numpy.random.randint(low=delta, high=nword - delta)
            pchunk = sentences[rsent][pidx - delta:pidx + delta + rest]
            nchunk = []
            st = sentences[rsent][pidx - delta:pidx]
            en = sentences[rsent][pidx + 1:pidx + delta + rest]
            rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], ))
            nchunk = []
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st

            assert len(nchunk) == len(pchunk) * hp['nneg']

            p, n, b = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna),
                       idx2vec(sentences[rsent], nsenna))

            l, g = opt(p, n, b)
            c.append(l)
            r.append(g)

            # Checkpoint after ~6 days 20 hours of wall time, or every
            # 50*hp['freq'] updates, scoring the held-out sentences.
            if (time.time() - expstart) > (3600 * 24 * 6 + 3600 * 20) or (
                    i + 1) % (50 * hp['freq']) == 0:
                mrk = evaluation.error(validsentence, validfct, nsenna,
                                       hp['wsize'])
                hp['mrk'] = mrk
                hp['e'] = e
                hp['i'] = i
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank', mrk

            if i % hp['freq'] == 0:
                hp['score'] = numpy.array(c).mean()
                hp['rec'] = numpy.array(r).mean()
                print e, i, 'NN Score:', hp['score'], 'Reconstruction:', hp[
                    'rec']

                ne = knn(freq_idx,
                         embedding.params['e_weights'].get_value(borrow=True))
                open('files/' + fname + 'nearest.txt',
                     'w').write(display(ne, senna))

                saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()  # final checkpoint after the last epoch
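Both this and the following variant train with a pairwise hinge ranking criterion: each corrupted window should score at least hp['margin'] below the positive window. A minimal numpy sketch of that loss, outside Theano:

import numpy as np

def hinge_rank_loss(posit_score, negat_scores, margin=1.0):
    # rect(x) = max(x, 0): only margin violations contribute to the loss.
    return np.maximum(negat_scores - posit_score + margin, 0.0).mean()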
Example #6
def run(jobman, debug=False):
    hp = jobman.state

    # Symbolic variables

    s_posit = T.matrix()  #theano.sparse.csr_matrix()
    s_negat = T.matrix()  #theano.sparse.csr_matrix()

    s_valid = theano.sparse.csr_matrix()

    sentences = cPickle.load(
        open('/data/lisatmp2/rifaisal/guten_subset_idx.pkl'))

    validsentence = sentences[-10:]
    sentences = sentences[:-10]
    senna = cPickle.load(open('/data/lisatmp2/rifaisal/senna.pkl'))
    gsubset = cPickle.load(
        open('/data/lisatmp2/rifaisal/guten_vocab_subset.pkl')).flatten(
        ).tolist()
    hashtab = dict(zip(gsubset, range(len(gsubset))))

    senna = numpy.array(senna)[gsubset].tolist()

    nsent = len(sentences)
    nsenna = len(senna)

    # Layers
    embedding = logistic(i_size=nsenna, h_size=hp['embedsize'], act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=rect,
           d_act=hardtanh)
    L = logistic(i_size=hp['hsize'], h_size=1)  #, act = identity)

    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=identity)
    #valid_embedding.params['weights'].set_value(embedding.params['weights'].get_value(borrow=True))
    #valid_embedding.params['bias'].set_value(embedding.params['bias'].get_value(borrow=True))

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = embedding.encode(s_posit).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = embedding.encode(s_negat).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    #valid_embed = valid_embedding.encode(s_valid).reshape((nsenna,hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    #valid_score = L.encode(H.encode(valid_embed))

    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat],
                          C.mean(),
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update_norm(CC, lr)))

    #validfct = theano.function([s_valid],valid_score)

    #print 'Random Valid Mean rank',evaluation.error(validsentence, validfct, nsenna, hp['wsize'])

    #load(valid_embedding,'files/gutensubsetdense_exp.py_embedding.pkl')
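    # Warm-start the embedding, hidden, and scoring layers from checkpoints
    # previously written by gutensubsetdense_exp.py.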
    load(embedding, 'files/gutensubsetdense_exp.py_embedding.pkl')
    load(H, 'files/gutensubsetdense_exp.py_hidden.pkl')
    load(L, 'files/gutensubsetdense_exp.py_logistic.pkl')

    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    freq_idx = cPickle.load(
        open('/data/lisatmp2/rifaisal/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx = [hashtab[idx] for idx in freq_idx]

    fname = sys.argv[0] + '_'

    for e in range(hp['epoch']):
        c = []
        for i in range(nsent):
            rsent = numpy.random.randint(nsent - 1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue

            pidx = numpy.random.randint(low=delta, high=nword - delta)
            pchunk = sentences[rsent][pidx - delta:pidx + delta + rest]
            nchunk = []
            st = sentences[rsent][pidx - delta:pidx]
            en = sentences[rsent][pidx + 1:pidx + delta + rest]
            rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], ))
            nchunk = []
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st

            assert len(nchunk) == len(pchunk) * hp['nneg']
            #start = time.time()
            p, n = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna))
            #print 'Select row:',time.time()-start,
            #start = time.time()
            c.append(opt(p, n))
            #print 'grad up:',time.time()-start

            if i % hp['freq'] == 0:
                print e, i, numpy.array(c).mean(0)
                ne = knn(freq_idx,
                         embedding.params['weights'].get_value(borrow=True))
                save(embedding, fname + 'embedding.pkl')
                save(H, fname + 'hidden.pkl')
                save(L, fname + 'logistic.pkl')
                sys.stdout.flush()
                open('files/' + fname + 'nearest.txt',
                     'w').write(display(ne, senna))

    #print 'Valid Mean rank',evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
    save(embedding, fname + 'embedding.pkl')
    save(H, fname + 'hidden.pkl')
    save(L, fname + 'logistic.pkl')
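Unlike Example #5, this variant uses a plain linear (identity-activation) embedding instead of the contractive auto-encoder, drops the reconstruction term from the cost (CC is the hinge term alone, and the embedding is updated through update_norm), and warm-starts from the gutensubsetdense_exp.py checkpoints instead of training from scratch.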