def test_classify():
    """Smoke-test the SPN classification helpers on the titanic model.

    Loads a pre-trained SPN, runs classification with explicit evidence
    ranges, with no evidence, and over the whole titanic dataframe, printing
    each result.
    """
    from util import io
    from data import real_data

    location = "_spns"
    model_id = "rdc=" + str(0.3) + "_mis=" + str(0.1)
    spn, _ = io.load(model_id, "titanic", location)
    value_dict = real_data.get_titanic_value_dict()

    # Three evidence rows: only feature 1 is constrained in each.
    query_ranges = np.array(
        [[None, NominalRange([1]), None, None, None, None, None, None],
         [None, NominalRange([0]), None, None, None, None, None, None],
         [None, NominalRange([0]), None, None, None, None, None, None]])
    result = fn.classifies(spn, target_id=0, ranges=query_ranges,
                           value_dict=value_dict)
    print(result)

    # Classification with no evidence at all.
    result = fn.classify(spn, target_id=0)
    print(result)

    # Map discrete feature labels to their numeric codes, then classify the
    # full dataframe.
    df, _ = real_data.get_titanic()
    codes = {v[1]: v[2] for _, v in value_dict.items() if v[0] == "discrete"}
    df = df.replace(codes)

    predictions = fn.classify_dataset(spn,
                                      target_id=0,
                                      df=df,
                                      transform=True,
                                      value_dict=value_dict)
    print(predictions)
def test_mpe_old():
    """Smoke-test MPE inference and dataset classification on the titanic SPN.

    Loads a pre-trained SPN, runs SPFlow-based MPE over the raw data, then
    classifies the label-encoded dataframe, printing both results.
    """
    from util import io
    from data import real_data

    location = "_spns"
    model_id = "rdc=" + str(0.3) + "_mis=" + str(0.1)
    spn, _ = io.load(model_id, "titanic", location)
    value_dict = real_data.get_titanic_value_dict()

    # MPE on the raw titanic values via the SPFlow backend.
    raw_df, _ = real_data.get_titanic()
    result = fn.mpe_spflow(spn, 0, raw_df.values)
    print(result)

    # Re-load the data, map discrete feature labels to numeric codes, and
    # classify the whole dataframe.
    df, _ = real_data.get_titanic()
    codes = {v[1]: v[2] for _, v in value_dict.items() if v[0] == "discrete"}
    df = df.replace(codes)

    predictions = fn.classify_dataset(spn,
                                      target_id=0,
                                      df=df,
                                      transform=True,
                                      value_dict=value_dict)
    print(predictions)
Exemple #3
0
def score(jobman,path):
    """Score learned word embeddings against human word-similarity ratings.

    Loads the embedding model found under ``path`` and, for every rated word
    pair, computes a model similarity as the negative squared Euclidean
    distance between the two embedding rows.  Prints the Spearman rank
    correlation between model similarities and the human scores.
    (Python 2 code: uses the ``print`` statement.)
    """
    hp = jobman.state
    # One-hot input size the embedding layer was built with.
    nsenna = 30000

    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity)
    load(embedding,path+'/embedding.pkl')

    words = parse_data()
    scores = []   # unused in this function
    esims = []    # embedding-space similarities
    msim = []     # unused in this function
    hsim = []     # human similarity ratings
    Em = embedding.params['e_weights'].get_value(borrow=True)
    for i,(w1,w2,s) in enumerate(words):
        sys.stdout.flush()

        # Embedding rows for the two words (w1/w2 are presumably vocabulary
        # indices — confirm against parse_data()).
        w1em = Em[w1]
        w2em = Em[w2]

        # Negative squared L2 distance: larger means more similar.
        esim = -((w1em - w2em)**2).sum()
        esims.append(esim)
        hsim.append(s)

    print 'Embeddings:',scipy.stats.spearmanr(numpy.array(hsim), numpy.array(esims))[0]
Exemple #4
0
def score(jobman, path):
    """Score learned word embeddings against human word-similarity ratings.

    Same procedure as the other ``score`` variant: load the embedding from
    ``path``, compute a negative squared Euclidean distance for each rated
    word pair, and print the Spearman correlation with the human ratings.
    (Python 2 code: uses the ``print`` statement.)
    """
    hp = jobman.state
    # One-hot input size the embedding layer was built with.
    nsenna = 30000

    embedding = cae(i_size=nsenna, h_size=hp["embedsize"], e_act=identity)
    load(embedding, path + "/embedding.pkl")

    words = parse_data()
    scores = []   # unused in this function
    esims = []    # embedding-space similarities
    msim = []     # unused in this function
    hsim = []     # human similarity ratings
    Em = embedding.params["e_weights"].get_value(borrow=True)
    for i, (w1, w2, s) in enumerate(words):
        sys.stdout.flush()

        w1em = Em[w1]
        w2em = Em[w2]

        # Negative squared L2 distance: larger means more similar.
        esim = -((w1em - w2em) ** 2).sum()
        esims.append(esim)
        hsim.append(s)

    print "Embeddings:", scipy.stats.spearmanr(numpy.array(hsim), numpy.array(esims))[0]
Exemple #5
0
def score(jobman, path):
    """Score a context-aware similarity model against human ratings.

    Loads an embedding, a hidden autoencoder layer and a logistic layer from
    ``path``, builds a theano function that encodes a window of word
    embeddings, and for each sentence pair computes two similarities:
    one from the concatenated hidden encoding + embedding ("model") and one
    from the raw embeddings alone.  Prints the Spearman correlation of each
    against the mean human score.  (Python 2 code: ``print`` statement and
    integer division via ``/``.)
    """
    hp = jobman.state
    # One-hot input size the embedding layer was built with.
    nsenna = 30000

    PATH = "/scratch/rifaisal/msrtest/test/"
    # Half-window size and remainder for odd window widths
    # (Python 2: '/' on ints is floor division).
    delta = hp["wsize"] / 2
    rest = hp["wsize"] % 2
    sent = T.matrix()

    embedding = cae(i_size=nsenna, h_size=hp["embedsize"], e_act=identity)
    H = ae(i_size=hp["embedsize"] * hp["wsize"], h_size=hp["hsize"], e_act=T.tanh)
    L = logistic(i_size=hp["hsize"], h_size=1, act=identity)

    load(embedding, path + "/embedding.pkl")
    load(H, path + "/hidden.pkl")
    load(L, path + "/logistic.pkl")

    # Encode one window of one-hot rows into the hidden representation.
    posit_embed = T.dot(sent, embedding.params["e_weights"]).reshape((1, hp["embedsize"] * hp["wsize"]))
    posit_score = H.encode(posit_embed)
    scoreit = theano.function([sent], posit_score)
    sentences = parse_data()
    scores = []   # unused in this function
    esims = []    # embedding-only similarities
    msim = []     # model (hidden + embedding) similarities
    hsim = []     # mean human scores
    Em = embedding.params["e_weights"].get_value(borrow=True)
    for i, (sc, w1, w2, c1, c2) in enumerate(sentences):
        sys.stdout.flush()

        # Pad both contexts with index 29999 (presumably a padding/OOV
        # token — confirm against the vocabulary) so windows near the edges
        # stay in range.
        c1 = [29999] * 10 + c1 + [29999] * 10
        c2 = [29999] * 10 + c2 + [29999] * 10

        # Context windows centred on each occurrence position.
        w1seqs = [c1[10 + idx - delta : 10 + idx + delta + rest] for idx in w1]
        w2seqs = [c2[10 + idx - delta : 10 + idx + delta + rest] for idx in w2]

        c = []

        w1em = Em[c1[10 + w1[0]]]
        w2em = Em[c2[10 + w2[0]]]

        # Hidden encoding of the first window concatenated with the raw
        # embedding of the target word.
        w1sc = numpy.concatenate([scoreit(idx2mat(w1seqs[0], nsenna)).flatten(), Em[c1[10 + w1[0]]]])
        w2sc = numpy.concatenate([scoreit(idx2mat(w2seqs[0], nsenna)).flatten(), Em[c2[10 + w2[0]]]])

        # Fetched but unused below.
        metric = L.params["weights"].get_value(borrow=True).flatten()

        # Negative squared L2 distances: larger means more similar.
        sim = -(((w1sc - w2sc)) ** 2).sum()
        esim = -((w1em - w2em) ** 2).sum()

        msim.append(sim)
        esims.append(esim)
        hsim.append(numpy.mean(sc))

    print "Model:", scipy.stats.spearmanr(numpy.array(hsim), numpy.array(msim))[
        0
    ], ", Embeddings:", scipy.stats.spearmanr(numpy.array(hsim), numpy.array(esims))[0]
Exemple #6
0
def evaluate_spn1_method(rdc_thresholds,
                         min_instances_slices,
                         min_supports,
                         dataset_name,
                         binary_positive=True):
    """Run the spn1 frequent-itemset method over a hyper-parameter grid.

    For every (rdc_threshold, min_instances_slice) pair the corresponding
    pre-trained SPN is loaded, then spn1 is executed for each minimum
    support and its item sets plus execution time are saved under
    ``freq_sets/<dataset_name>``.
    """
    storage = "_spns"
    for rdc in rdc_thresholds:
        for mis in min_instances_slices:
            model_id = "rdc=" + str(rdc) + "_mis=" + str(mis)
            spn, _ = io.load(model_id, dataset_name, storage)

            for support in min_supports:
                item_sets, exc_time = methods.spn1(
                    spn, support, binary_positive=binary_positive)
                io.save([item_sets, exc_time],
                        "spn1_" + model_id + "_minSup=" + str(support),
                        "freq_sets/" + dataset_name,
                        loc="_results")
Exemple #7
0
def run(jobman, debug=False):
    """Train a word-embedding ranking model on the Gutenberg corpus.

    Builds an embedding + hidden + logistic stack in theano, then iterates
    over pickled sentence splits, scoring each positive word window against
    ``hp['nneg']`` negative windows (centre word replaced by a random index)
    with a margin ranking loss.  Periodically evaluates word similarity,
    dumps nearest neighbours, and checkpoints the model under ``files/``.
    (Python 2 code: ``print`` statement, ``iteritems``, integer ``/``.)
    """
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists("files/"):
        os.mkdir("files/")

    # Symbolic variables
    s_posit = T.matrix()
    s_negat = T.matrix()
    s_valid = theano.sparse.csr_matrix()

    # Vocabulary: word->index pickle, inverted to index->word, with 0 = UNK.
    w2i = cPickle.load(open("/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl"))
    i2w = dict((v, k) for k, v in w2i.iteritems())
    i2w[0] = "UNK"
    senna = [i2w[i] for i in range(len(i2w.keys()))]

    nsenna = len(senna)

    embedding = cae(i_size=nsenna, h_size=hp["embedsize"], e_act=identity)
    H = ae(i_size=hp["embedsize"] * hp["wsize"], h_size=hp["hsize"], e_act=T.tanh)
    L = logistic(i_size=hp["hsize"], h_size=1, act=identity)

    path = hp["loadpath"]

    # Optionally warm-start from a previous checkpoint and record the
    # actual layer sizes back into the job state.
    if path:
        load(embedding, path + "/embedding.pkl")
        load(H, path + "/hidden.pkl")
        load(L, path + "/logistic.pkl")
        hp["embedsize"] = embedding.params["e_weights"].get_value(borrow=True).shape[1]
        hp["hsize"] = H.params["e_weights"].get_value(borrow=True).shape[1]
        jobman.save()

    # Drop unused decoder/encoder biases so they are not updated.
    del H.params["d_bias"]
    del embedding.params["d_bias"]
    del embedding.params["e_bias"]

    lr = hp["lr"]
    h_size = hp["hsize"]
    bs = hp["bs"]

    # One sparse batch carries the positive window plus nneg negatives.
    posit_embed = theano.sparse.dot(s_valid, embedding.params["e_weights"]).reshape(
        (1 + hp["nneg"], hp["embedsize"] * hp["wsize"])
    )
    # posit_embed = T.dot(s_valid, embedding.params['e_weights']).reshape((1+hp['nneg'],hp['embedsize']*hp['wsize']))
    # negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    # negat_score = L.encode(H.encode(negat_embed))

    # Margin ranking loss: row 0 is the positive window, rows 1: negatives.
    C = posit_score[1:] - posit_score[0] + hp["margin"]

    CC = (rect(C)).mean()

    opt = theano.function(
        [s_valid], (rect(C)).mean(), updates=dict(L.update(CC, lr) + H.update(CC, lr) + embedding.update_norm(CC, lr))
    )

    def saveexp():
        # Checkpoint all three layers under the current fname prefix.
        save(embedding, fname + "embedding.pkl")
        save(H, fname + "hidden.pkl")
        save(L, fname + "logistic.pkl")

    # Half-window and remainder (Python 2: '/' on ints floors).
    delta = hp["wsize"] / 2
    rest = hp["wsize"] % 2

    wsimtester = wsim()
    freq_idx = cPickle.load(open("/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl"))[:1000]

    fname = ""

    tseenwords = not debug
    for e in range(hp["epoch"]):
        # Pick a random corpus split each epoch.
        hp["split"] = numpy.random.randint(285)
        sentences = cPickle.load(
            open("/mnt/scratch/bengio/bengio_group/data/gutenberg/small_ints_50000/split" + str(hp["split"]) + ".pkl")
        )
        nsent = len(sentences)
        bigc = []
        bigr = []
        E = embedding.params["e_weights"].get_value(borrow=True)
        seen_words = 0
        for i, s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            # Skip sentences too short to hold a full window.
            if nword < hp["wsize"] + 2:
                continue
            c = []
            r = []
            if debug:
                print " *** Processing document", i, "with", nword,
                sys.stdout.flush()
            for j in range(delta, nword - delta):
                cstart = time.time()
                pchunk = s[j - delta : j + delta + rest]
                nchunk = []
                st = s[j - delta : j]
                en = s[j + 1 : j + delta + rest]
                # Each negative keeps the context but swaps in a random
                # centre word.
                rndidx = numpy.random.randint(nsenna, size=(hp["nneg"],))
                nchunk = []
                for kk in range(hp["nneg"]):
                    nchunk += st + [rndidx[kk]] + en

                # assert len(nchunk) == len(pchunk)*hp['nneg']
                # p, n  = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna))
                pn = idx2spmat(pchunk + nchunk, nsenna)
                # assert pn[0,pchunk[0]] == 1
                ctime = time.time() - cstart
                tstart = time.time()

                l = opt(pn)
                c.append(l)

                if debug:
                    print ".",
                    break

            if debug:
                print ""

            bigc += [numpy.array(c).sum()]

            # Periodic word-similarity evaluation and state save.
            if tseenwords > hp["wsimfreq"] or debug:
                hp["wsim"] = wsimtester.score(embedding.params["e_weights"].get_value(borrow=True))
                print i, "WordSim Sp Corr:", hp["wsim"]
                sys.stdout.flush()
                hp["score"] = numpy.array(bigc).mean()
                hp["e"] = e
                hp["i"] = i
                print e, i, "NN Score:", hp["score"]
                tseenwords = 0
                jobman.save()

            # Periodic nearest-neighbour dump and checkpoint.
            if seen_words > hp["freq"] or debug:
                seen_words = 0
                ne = knn(freq_idx, embedding.params["e_weights"].get_value(borrow=True))
                open("files/" + fname + "nearest.txt", "w").write(display(ne, i2w))
                saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()
Exemple #8
0
def run(jobman, debug=False):
    """Train a dense embedding ranking model on a Gutenberg vocabulary subset.

    Loads pickled sentences, restricts the vocabulary to a subset, then
    trains an embedding + hidden (rect/hardtanh) + logistic stack with a
    margin ranking loss: a positive window versus ``hp['nneg']`` windows
    whose centre word is replaced by a random index.  Checkpoints and a
    nearest-neighbour dump are written every ``hp['freq']`` sentences.
    (Python 2 code: ``print`` statement, integer ``/``.)
    """
    hp = jobman.state

    # Symbolic variables

    s_posit = T.matrix()  #theano.sparse.csr_matrix()
    s_negat = T.matrix()  #theano.sparse.csr_matrix()

    s_valid = theano.sparse.csr_matrix()

    sentences = cPickle.load(
        open('/data/lisatmp2/rifaisal/guten_subset_idx.pkl'))

    # Hold out the last 10 sentences for validation.
    validsentence = sentences[-10:]
    sentences = sentences[:-10]
    senna = cPickle.load(open('/data/lisatmp2/rifaisal/senna.pkl'))
    gsubset = cPickle.load(
        open('/data/lisatmp2/rifaisal/guten_vocab_subset.pkl')).flatten(
        ).tolist()
    # Map original vocabulary indices to positions in the subset.
    hashtab = dict(zip(gsubset, range(len(gsubset))))

    senna = numpy.array(senna)[gsubset].tolist()

    nsent = len(sentences)
    nsenna = len(senna)

    # Layers
    embedding = logistic(i_size=nsenna, h_size=hp['embedsize'], act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=rect,
           d_act=hardtanh)
    L = logistic(i_size=hp['hsize'], h_size=1)  #, act = identity)

    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=identity)
    #valid_embedding.params['weights'].set_value(embedding.params['weights'].get_value(borrow=True))
    #valid_embedding.params['bias'].set_value(embedding.params['bias'].get_value(borrow=True))

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    # One positive window versus nneg negative windows.
    posit_embed = embedding.encode(s_posit).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = embedding.encode(s_negat).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    #valid_embed = valid_embedding.encode(s_valid).reshape((nsenna,hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    #valid_score = L.encode(H.encode(valid_embed))

    # Margin ranking loss; gradients flow through the rectified mean CC.
    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat],
                          C.mean(),
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update_norm(CC, lr)))

    #validfct = theano.function([s_valid],valid_score)

    #print 'Random Valid Mean rank',evaluation.error(validsentence, validfct, nsenna, hp['wsize'])

    # Warm-start from an earlier experiment's checkpoints.
    #load(valid_embedding,'files/gutensubsetdense_exp.py_embedding.pkl')
    load(embedding, 'files/gutensubsetdense_exp.py_embedding.pkl')
    load(H, 'files/gutensubsetdense_exp.py_hidden.pkl')
    load(L, 'files/gutensubsetdense_exp.py_logistic.pkl')

    # Half-window and remainder (Python 2: '/' on ints floors).
    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    freq_idx = cPickle.load(
        open('/data/lisatmp2/rifaisal/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx = [hashtab[idx] for idx in freq_idx]

    fname = sys.argv[0] + '_'

    for e in range(hp['epoch']):
        c = []
        for i in range(nsent):
            # Sample a random sentence each step.
            rsent = numpy.random.randint(nsent - 1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue

            pidx = numpy.random.randint(low=delta, high=nword - delta)
            pchunk = sentences[rsent][pidx - delta:pidx + delta + rest]
            nchunk = []
            st = sentences[rsent][pidx - delta:pidx]
            en = sentences[rsent][pidx + 1:pidx + delta + rest]
            rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], ))
            nchunk = []
            # NOTE(review): negatives are built as en + word + st, i.e. the
            # context halves appear swapped relative to pchunk — confirm
            # whether this ordering is intentional.
            for j in range(hp['nneg']):
                nchunk += en + [rndidx[j]] + st

            assert len(nchunk) == len(pchunk) * hp['nneg']
            #start = time.time()
            p, n = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna))
            #print 'Select row:',time.time()-start,
            #start = time.time()
            c.append(opt(p, n))
            #print 'grad up:',time.time()-start

            # Periodic logging, nearest-neighbour dump and checkpoint.
            if i % hp['freq'] == 0:
                print e, i, numpy.array(c).mean(0)
                ne = knn(freq_idx,
                         embedding.params['weights'].get_value(borrow=True))
                save(embedding, fname + 'embedding.pkl')
                save(H, fname + 'hidden.pkl')
                save(L, fname + 'logistic.pkl')
                sys.stdout.flush()
                open('files/' + fname + 'nearest.txt',
                     'w').write(display(ne, senna))

    #print 'Valid Mean rank',evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
    save(embedding, fname + 'embedding.pkl')
    save(H, fname + 'hidden.pkl')
    save(L, fname + 'logistic.pkl')
Exemple #9
0
def msrerror(vocab, jobman):
    """Evaluate a trained window-scoring model on the MSR sentence test.

    Scores every candidate sentence by the mean model score of its word
    windows, writes the scores in the Holmes LM format, then pipes them
    through the test set's perl scripts (bestof5.pl, score.pl) and prints
    the resulting accuracy figures.  (Python 2 code: ``print`` statement,
    ``iteritems``, integer ``/``.)
    """
    hp = jobman.state
    # One-hot input size the embedding layer was built with.
    nsenna = 30000

    PATH = '/scratch/rifaisal/msrtest/test/'
    # Half-window and remainder (Python 2: '/' on ints floors).
    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2
    sent = T.matrix()

    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)

    path = hp['loadpath']

    load(embedding, path + '/embedding.pkl')
    load(H, path + '/hidden.pkl')
    load(L, path + '/logistic.pkl')

    # Compiled function: one window of one-hot rows -> scalar model score.
    posit_embed = T.dot(sent, embedding.params['e_weights']).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    posit_score = L.encode(H.encode(posit_embed))
    fct = theano.function([sent], posit_score)
    sentences = idxdataset(vocab)
    scores = []
    for i, s in enumerate(sentences):
        print i,
        sys.stdout.flush()
        nword = len(s)
        if nword < hp['wsize'] + 2:
            # Too short for a full window: pad with index 29999 (presumably
            # a padding/OOV token — confirm).  Note nword is not updated, so
            # the loop below may still be empty.
            #print i,'Failure'
            s += [29999] * 3
        c = []
        for j in range(delta, nword - delta):
            pchunk = s[j - delta:j + delta + rest]
            p = idx2mat(pchunk, nsenna)
            l = fct(p)
            c.append(l)
        if not len(c):
            print 'pas bim'
            scores.append(0)
        else:
            scores.append(numpy.mean(c))
        #if i%5 == 0:
        #    print scores[i-5:i]

    # Write one "<sentence>\t<score>" line per candidate.
    score_template = open(PATH + 'data/Holmes.lm_format.questions.txt')
    score_output = open('energy.lm_output.txt', 'w')
    sentencelist = score_template.readlines()
    for sc, sentence in zip(scores, sentencelist):
        score_output.write(sentence.split('\n')[0] + '\t' + str(sc) + '\n')
    score_output.close()

    # Pick the best of each 5 candidates with the test set's perl script.
    pipebestof5 = subprocess.Popen(
        ['perl', PATH + 'bestof5.pl', './energy.lm_output.txt'],
        stdout=subprocess.PIPE)
    energyanswer = open('./energy.answers', 'w')

    for line in pipebestof5.stdout:
        energyanswer.write(line)

    energyanswer.close()

    # Score the answers against the reference answers.
    pipescore = subprocess.Popen([
        'perl', PATH + 'score.pl', './energy.answers',
        PATH + 'data/Holmes.lm_format.answers.txt'
    ],
                                 stdout=subprocess.PIPE)
    legend = ['correct', '%correct', 'valid', 'test']
    out = zip(legend,
              [r.split('\n')[0] for r in pipescore.stdout.readlines()[-4:]])
    res = dict(out)
    res = dict((k, float(v)) for k, v in res.iteritems())
    print res
    print out
Exemple #10
0
                        help='Dropout applied to decoder outputs and'
                        ' hidden states')

    parser.add_argument('--teacher_forcing', type=float, default=1.0,
                        help='Ratio of decoder`s own predictions and true'
                        ' target values used during training.')

    parser.add_argument('--bidirectional', action='store_true', default=False,
                        help='Set encoder to compute forward and backward'
                        ' hidden states.')

    args = parser.parse_args()

    if validate_args(args):

        lang, datasets = io.load(args.data)
        vocab = {
            'src': Vocab(lang['vocab']['src']),
            'tgt': Vocab(lang['vocab']['tgt']),
            'stack': Vocab(lang['vocab']['stack']),
            'operator': Vocab(lang['vocab']['operator'])
        }

        log = Logger(out_path=args.out)
        line = log.add_text('')
        log.start()

        logger = {
            'log': log,
            'line': line
        }
        return all_diffs     
    
    else:
        raise Exception("Unknown attribute-type: " + str(value_info[0]))








if __name__ == '__main__':

    from util import io
    from data import real_data

    # Load the pre-trained titanic SPN and its value dictionary.
    loc = "_spns"
    ident = "rdc=" + str(0.3) + "_mis=" + str(0.1)
    spn, _ = io.load(ident, "titanic", loc)
    value_dict = real_data.get_titanic_value_dict()
    # Marginalize keeping all eight features (effectively a no-op filter).
    spn = fn.marg(spn, keep=[0,1,2,3,4,5,6,7])
    #spn = example_spns.get_credit_spn()

    # Print plain and weighted feature importances for feature 1.
    fis = feature_importance(spn, 1, value_dict=value_dict)
    print(fis)

    fis = feature_importance_weighted(spn, 1, value_dict=value_dict)
    print(fis)

Exemple #12
0
def visualize_stats(dataset_name,
                    min_supports,
                    gt_method="apriori1",
                    other_methods=[
                        "fpgrowth2", "spn1_rdc=0.3_mis=0.1",
                        "spn1_rdc=0.3_mis=0.01", "spn1_rdc=0.2_mis=0.1",
                        "spn1_rdc=0.2_mis=0.01", "spn1_rdc=0.1_mis=0.1",
                        "spn1_rdc=0.1_mis=0.01"
                    ]):
    """Compare frequent-itemset methods against a ground-truth method.

    Loads the saved item sets of ``gt_method`` and every entry of
    ``other_methods`` for each minimum support, compares them (shared,
    missing, and new item sets plus support errors), and renders one
    bar/box plot per statistic into ``stats.pdf`` before showing the figure.

    NOTE: the mutable default for ``other_methods`` is safe here because the
    list is never mutated.
    """
    folder = "freq_sets/" + dataset_name
    # One sub-dict per plotted statistic; keys below are method names.
    vis_dict = {
        "item_sets": {},
        "n_items": {},
        "exc_times": {},
        "same_items": {},
        "same_errors": {},
        "missing_items": {},
        "missing_probs": {},
        "new_items": {},
        "new_probs": {}
    }

    # gt_method must come first so its item sets exist when the other
    # methods are compared against it below.
    for method in [gt_method] + other_methods:
        vis_dict["item_sets"][method] = []
        vis_dict["n_items"][method] = []
        vis_dict["exc_times"][method] = []
        vis_dict["same_items"][method] = []
        vis_dict["missing_items"][method] = []
        vis_dict["new_items"][method] = []

        vis_dict["same_errors"][method] = []
        vis_dict["new_probs"][method] = []
        vis_dict["missing_probs"][method] = []

        for i, min_sup in enumerate(min_supports):
            method_itemsets, method_exc_time = io.load(method + "_minSup=" +
                                                       str(min_sup),
                                                       folder,
                                                       loc="_results")
            vis_dict["item_sets"][method].append(method_itemsets)
            vis_dict["n_items"][method].append(len(method_itemsets))
            vis_dict["exc_times"][method].append(method_exc_time)

            # Compare to the ground-truth item sets at the same support.
            same_sets, missing_sets, different_sets = __compare_itemsets(
                vis_dict["item_sets"][gt_method][i], method_itemsets)
            vis_dict["same_items"][method].append(len(same_sets))
            vis_dict["missing_items"][method].append(len(missing_sets))
            vis_dict["new_items"][method].append(len(different_sets))

            # Per-itemset support differences / supports for the box plots.
            vis_dict["same_errors"][method].append(
                [s1 - s2 for (_, s1, s2) in same_sets])
            vis_dict["new_probs"][method].append(
                [sup for (sup, _) in different_sets])
            vis_dict["missing_probs"][method].append(
                [sup for (sup, _) in missing_sets])

    # One subplot row per statistic (raw item sets are not plotted).
    plot_names = list(vis_dict.keys())
    plot_names.remove("item_sets")

    ncols = 1
    nrows = len(plot_names)
    figsize_x = 12
    figsize_y = 20
    _, axes = plt.subplots(nrows,
                           ncols,
                           figsize=(figsize_x, figsize_y),
                           squeeze=False,
                           sharex=False)

    # Grouped bars: one group per support, one bar per method.
    r = np.arange(len(min_supports))
    n_methods = 1 + len(other_methods)
    barWidth = 1 / (1 + n_methods)

    for j, plot_name in enumerate(plot_names):

        if plot_name in ["same_errors", "new_probs", "missing_probs"]:

            # Distribution-valued statistics are drawn as box plots.
            for i, (name, vals) in enumerate(vis_dict[plot_name].items()):
                #positions = np.arange(len(method_box_plots[i]))+ (i/(2 + len(method_box_plots))) - (len(method_box_plots)/2)/len(method_box_plots) + 1.5/len(method_box_plots)
                axes[j][0].boxplot(vals,
                                   positions=r + i * barWidth,
                                   widths=barWidth)

            axes[j][0].set_title(plot_name)
            axes[j][0].set_xlabel('min support')
            axes[j][0].set_xlim(-0.25, len(min_supports))
            axes[j][0].set_xticks(np.arange(0, len(min_supports)))
            axes[j][0].set_xticklabels(
                [str(min_sup) for min_sup in min_supports])
            axes[j][0].legend()
        else:

            # Scalar statistics are drawn as grouped bars.
            for i, (name, vals) in enumerate(vis_dict[plot_name].items()):
                axes[j][0].bar(r + i * barWidth,
                               vals,
                               width=barWidth,
                               label=name)
            axes[j][0].set_title(plot_name)
            axes[j][0].set_xlabel('min support')

            # Execution times vary over orders of magnitude.
            if plot_name == "exc_times":
                axes[j][0].set_yscale('log')

            axes[j][0].set_xlim(-0.25, len(min_supports))
            axes[j][0].set_xticks(np.arange(0, len(min_supports)))
            axes[j][0].set_xticklabels(
                [str(min_sup) for min_sup in min_supports])
            axes[j][0].legend()

    plt.tight_layout()

    plt.savefig("stats.pdf")

    plt.show()
Exemple #13
0
def run(jobman,debug = False):
    """Train a variable-window embedding ranking model on Gutenberg splits.

    Like the other ``run`` variants, but samples a random sub-window size
    each sentence (between ``hp['minsize']`` and ``hp['maxsize']``), masks
    the embeddings outside the sampled [idx_start, idx_stop) span with
    zeros, and selects a size-specific hidden bias row.  Negatives replace
    one word inside the window with a random index.  Checkpoints and
    nearest-neighbour dumps are written periodically under ``files/``.
    (Python 2 code: ``print`` statement, ``iteritems``, integer ``/``.)
    """
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_posit = T.matrix()
    s_negat = T.matrix()
    idx_start = T.lscalar()
    idx_stop = T.lscalar()
    s_valid = theano.sparse.csr_matrix()



    # Vocabulary: word->index pickle, inverted to index->word, with 0 = UNK.
    w2i = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl'))
    i2w = dict( (v,k) for k,v in w2i.iteritems() )
    i2w[0] = 'UNK'
    senna = [ i2w[i] for i in range(len(i2w.keys())) ]


    nsenna = len(senna)

    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity)
    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh)
    L = logistic(i_size = hp['hsize'], h_size = 1, act = identity)

    # Drop unused decoder/encoder biases so they are not updated.
    del H.params['d_bias']
    del embedding.params['d_bias']
    del embedding.params['e_bias']
    minsize = hp['minsize']
    maxsize = hp['maxsize']

    # Number of distinct sub-window sizes; one hidden bias row per size.
    dsize = maxsize - minsize +1

    H.params['e_bias'] = theano.shared( numpy.array(numpy.zeros((dsize,hp['hsize'])),dtype=theano.config.floatX),name='e_bias')

    path = hp['loadpath']

    # Optionally warm-start the embedding from a previous checkpoint.
    if path:
        load(embedding,path+'/embedding.pkl')
        #load(H,path+'/hidden.pkl')
        #load(L,path+'/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1]
        #hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    # NOTE(review): e_bias is re-created here, discarding the shared
    # variable created above — confirm the duplication is intentional.
    H.params['e_bias'] = theano.shared( numpy.array(numpy.zeros((dsize,hp['hsize'])),dtype=theano.config.floatX),name='e_bias')
    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity)
    valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))


    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))
    valid_embed = sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize']))

    # Left/right-masked variants (currently unused; see the commented-out
    # ifelse selection below).
    posit_embed_left = T.concatenate([posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                  T.zeros_like(posit_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)

    negat_embed_left = T.concatenate([negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                   T.zeros_like(negat_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)

    posit_embed_right = T.concatenate([ T.zeros_like(posit_embed[:,:idx_start*hp['embedsize']]),
                                  posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']]],axis=1)

    negat_embed_right = T.concatenate([ T.zeros_like(negat_embed[:,:idx_start*hp['embedsize']]),
                                   negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']]],axis=1)



    # Zero out embedding columns outside the sampled [idx_start, idx_stop)
    # sub-window.
    posit_embed = T.concatenate([ T.zeros_like(posit_embed[:,:idx_start*hp['embedsize']]),
                                  posit_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                  T.zeros_like(posit_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)

    negat_embed = T.concatenate([ T.zeros_like(negat_embed[:,:idx_start*hp['embedsize']]),
                                   negat_embed[:,idx_start*hp['embedsize']:idx_stop*hp['embedsize']],
                                   T.zeros_like(negat_embed[:,idx_stop*hp['embedsize']:]) ],axis=1)


    #posit_embed = ifelse(T.eq(idx_start, 0), posit_embed_left, posit_embed)
    #posit_embed = ifelse(T.eq(idx_stop, hp['maxsize']), posit_embed_right, posit_embed)

    #negat_embed = ifelse(T.eq(idx_start, 0), negat_embed_left, negat_embed)
    #negat_embed = ifelse(T.eq(idx_stop, hp['maxsize']), negat_embed_right, negat_embed)

    # Hidden layer with a bias row selected by the sampled window size.
    Hposit = T.tanh(T.dot(posit_embed,H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize,:])
    Hnegat = T.tanh(T.dot(negat_embed,H.params['e_weights']) + H.params['e_bias'][idx_stop-idx_start-minsize,:])
    posit_score = L.encode(Hposit)
    negat_score = L.encode(Hnegat)
    valid_score = L.encode(H.encode(valid_embed))

    # Margin ranking loss; gradients flow through the rectified mean CC.
    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat, idx_start, idx_stop],
                          (rect(C)).mean(),
                          updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) )

    validfct = theano.function([s_valid],valid_score)

    def saveexp():
        # Checkpoint all three layers under the current fname prefix.
        save(embedding,fname+'embedding.pkl')
        save(H,fname+'hidden.pkl')
        save(L,fname+'logistic.pkl')

    # Half-window and remainder (Python 2: '/' on ints floors).
    delta = hp['wsize']/2
    rest = hp['wsize']%2

    freq_idx = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'))[:2000]
    fname = ''
    validsentence = []# cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/valid_debug.pkl'))
    tseenwords = not debug
    for e in range(hp['epoch']):
        # Pick a random corpus split each epoch.
        hp['split'] = numpy.random.randint(45)
        sentences = cPickle.load(open('/mnt/scratch/bengio/bengio_group/data/gutenberg/ints_50000/split'+str(hp['split'])+'.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []

        seen_words = 0
        for i,s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            if nword < hp['maxsize'] + 2:
                continue
            # Sample the sub-window size and position for this sentence.
            rndsize = numpy.random.randint(low=hp['minsize']+1,high=hp['maxsize']-1)
            idxsta = numpy.random.randint(low=1, high=hp['maxsize']-rndsize)
            idxsto = idxsta+rndsize

            print 'r',rndsize,'b',idxsta,'e',idxsto,'shape',H.params['e_bias'].get_value().shape

            c =[]
            r =[]
            if debug:
                print ' *** Processing document',i,'with',nword,
                sys.stdout.flush()
            for j in range(delta,nword-delta):
                nd = rndsize/2
                rd = rndsize%2
                pchunk = s[j-delta:j+delta+rest]
                nchunk = []

                rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],))
                nchunk = []
                # Each negative copies the positive window and replaces one
                # word inside the sub-window with a random index.
                for kk in range(hp['nneg']):
                    tmpchunk = copy.copy(pchunk)
                    tmpchunk[idxsta+nd] = rndidx[kk]
                    nchunk += tmpchunk
                assert len(nchunk) == len(pchunk)*hp['nneg']
                p, n  = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna))
                l = opt(p,n, idxsta, idxsto)
                c.append(l)

                if debug:
                    print '.',
                    break


            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            # Disabled validation branch (condition is a constant 0).
            if 0:#(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']):
                tseenwords = 0
                valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank',mrk

            # Periodic logging, nearest-neighbour dump and checkpoint.
            if seen_words > hp['freq'] or debug:
                seen_words = 0
                hp['score'] = numpy.array(bigc).mean() 
                hp['e'] = e
                hp['i'] = i
                print ''
                print e,i,'NN Score:', hp['score']

                if not debug:
                    ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True))
                    open('files/'+fname+'nearest.txt','w').write(display(ne,senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()
Exemple #14
0
def run(jobman, debug=False):
    """Train a margin-ranking window language model with variable window sizes.

    For each sentence position, a positive word window is scored against
    hp['nneg'] negative windows in which one word is replaced by a random
    vocabulary index; training minimizes a hinge loss with margin
    hp['margin'].  Hyper-parameters and progress live in ``jobman.state``
    (presumably a jobman/jobdispatch experiment object — confirm against the
    scheduler in use).

    Side effects: creates 'files/', periodically writes pickled model
    parameters and a nearest-neighbour dump, and calls ``jobman.save()``.

    Python 2 code (print statements, cPickle, iteritems).
    """
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_posit = T.matrix()
    s_negat = T.matrix()
    idx_start = T.lscalar()
    idx_stop = T.lscalar()
    s_valid = theano.sparse.csr_matrix()

    # Vocabulary: pickled word->index map; index 0 is reserved for the
    # unknown-word token.  senna[i] is the word string for index i.
    w2i = cPickle.load(
        open(
            '/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl'
        ))
    i2w = dict((v, k) for k, v in w2i.iteritems())
    i2w[0] = 'UNK'
    senna = [i2w[i] for i in range(len(i2w.keys()))]

    nsenna = len(senna)

    # Layers: contractive auto-encoder embedding, hidden auto-encoder, and a
    # linear scoring layer on top.
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)

    # Drop the bias parameters this model does not train.
    del H.params['d_bias']
    del embedding.params['d_bias']
    del embedding.params['e_bias']
    minsize = hp['minsize']
    maxsize = hp['maxsize']

    # Number of distinct window sizes; each size gets its own row of the
    # hidden-layer encoder bias (indexed by idx_stop - idx_start - minsize).
    dsize = maxsize - minsize + 1

    H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros(
        (dsize, hp['hsize'])),
                                                   dtype=theano.config.floatX),
                                       name='e_bias')

    path = hp['loadpath']

    # Optionally warm-start the embedding from a previous run and record the
    # loaded embedding size back into the state.
    if path:
        load(embedding, path + '/embedding.pkl')
        #load(H,path+'/hidden.pkl')
        #load(L,path+'/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(
            borrow=True).shape[1]
        #hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    # NOTE(review): this recreates the same per-window-size e_bias as above;
    # the second assignment looks redundant unless loading is expected to
    # have replaced it — confirm.
    H.params['e_bias'] = theano.shared(numpy.array(numpy.zeros(
        (dsize, hp['hsize'])),
                                                   dtype=theano.config.floatX),
                                       name='e_bias')
    # Sparse read-only copy of the embedding used for validation scoring.
    valid_embedding = sparse.supervised.logistic(i_size=nsenna,
                                                 h_size=hp['embedsize'],
                                                 act=identity)
    valid_embedding.params['weights'] = sp.shared(
        value=scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(
            borrow=True)))

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    # Embed the positive window (1 row) and the hp['nneg'] negative windows.
    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape(
        (hp['nneg'], hp['embedsize'] * hp['wsize']))
    valid_embed = sp.dot(s_valid, valid_embedding.params['weights']).reshape(
        (nsenna, hp['embedsize'] * hp['wsize']))

    # Left/right-masked variants: zero out the embedding columns outside
    # [idx_start, idx_stop).  Built but only used through the commented
    # ifelse switches below.
    posit_embed_left = T.concatenate([
        posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(posit_embed[:, idx_stop * hp['embedsize']:])
    ],
                                     axis=1)

    negat_embed_left = T.concatenate([
        negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(negat_embed[:, idx_stop * hp['embedsize']:])
    ],
                                     axis=1)

    posit_embed_right = T.concatenate([
        T.zeros_like(posit_embed[:, :idx_start * hp['embedsize']]),
        posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']]
    ],
                                      axis=1)

    negat_embed_right = T.concatenate([
        T.zeros_like(negat_embed[:, :idx_start * hp['embedsize']]),
        negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']]
    ],
                                      axis=1)

    # Keep only the sub-window [idx_start, idx_stop), zero-padded on both
    # sides, so the same fixed-width graph handles every window size.
    posit_embed = T.concatenate([
        T.zeros_like(posit_embed[:, :idx_start * hp['embedsize']]),
        posit_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(posit_embed[:, idx_stop * hp['embedsize']:])
    ],
                                axis=1)

    negat_embed = T.concatenate([
        T.zeros_like(negat_embed[:, :idx_start * hp['embedsize']]),
        negat_embed[:, idx_start * hp['embedsize']:idx_stop * hp['embedsize']],
        T.zeros_like(negat_embed[:, idx_stop * hp['embedsize']:])
    ],
                                axis=1)

    #posit_embed = ifelse(T.eq(idx_start, 0), posit_embed_left, posit_embed)
    #posit_embed = ifelse(T.eq(idx_stop, hp['maxsize']), posit_embed_right, posit_embed)

    #negat_embed = ifelse(T.eq(idx_start, 0), negat_embed_left, negat_embed)
    #negat_embed = ifelse(T.eq(idx_stop, hp['maxsize']), negat_embed_right, negat_embed)

    # Hidden representation: shared weights, but a window-size-specific bias row.
    Hposit = T.tanh(
        T.dot(posit_embed, H.params['e_weights']) +
        H.params['e_bias'][idx_stop - idx_start - minsize, :])
    Hnegat = T.tanh(
        T.dot(negat_embed, H.params['e_weights']) +
        H.params['e_bias'][idx_stop - idx_start - minsize, :])
    posit_score = L.encode(Hposit)
    negat_score = L.encode(Hnegat)
    valid_score = L.encode(H.encode(valid_embed))

    # Hinge ranking loss: negatives should score below the positive by at
    # least hp['margin']; rect(...) clips the satisfied pairs to zero.
    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    # One SGD step over all three layers; returns the mean hinge loss.
    opt = theano.function([s_posit, s_negat, idx_start, idx_stop],
                          (rect(C)).mean(),
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update_norm(CC, lr)))

    validfct = theano.function([s_valid], valid_score)

    def saveexp():
        # Persist all three layers under the current file-name prefix.
        save(embedding, fname + 'embedding.pkl')
        save(H, fname + 'hidden.pkl')
        save(L, fname + 'logistic.pkl')

    # Half-width of the context window; 'rest' compensates for odd sizes.
    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    # Most frequent word indices, used for the nearest-neighbour dump.
    freq_idx = cPickle.load(
        open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'
             ))[:2000]
    fname = ''
    validsentence = [
    ]  # cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/valid_debug.pkl'))
    tseenwords = not debug
    for e in range(hp['epoch']):
        # Pick one of the 45 pre-pickled corpus splits at random per epoch.
        hp['split'] = numpy.random.randint(45)
        sentences = cPickle.load(
            open(
                '/mnt/scratch/bengio/bengio_group/data/gutenberg/ints_50000/split'
                + str(hp['split']) + '.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []

        seen_words = 0
        for i, s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            # Skip sentences too short to hold the largest window.
            if nword < hp['maxsize'] + 2:
                continue
            # Sample a window size and its position offsets for this sentence.
            rndsize = numpy.random.randint(low=hp['minsize'] + 1,
                                           high=hp['maxsize'] - 1)
            idxsta = numpy.random.randint(low=1, high=hp['maxsize'] - rndsize)
            idxsto = idxsta + rndsize

            print 'r', rndsize, 'b', idxsta, 'e', idxsto, 'shape', H.params[
                'e_bias'].get_value().shape

            c = []
            r = []
            if debug:
                print ' *** Processing document', i, 'with', nword,
                sys.stdout.flush()
            for j in range(delta, nword - delta):
                nd = rndsize / 2
                rd = rndsize % 2
                # Positive window centred at j; negatives replace the word at
                # idxsta + nd with a random vocabulary index.
                pchunk = s[j - delta:j + delta + rest]
                nchunk = []

                rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], ))
                nchunk = []
                for kk in range(hp['nneg']):
                    tmpchunk = copy.copy(pchunk)
                    tmpchunk[idxsta + nd] = rndidx[kk]
                    nchunk += tmpchunk
                assert len(nchunk) == len(pchunk) * hp['nneg']
                # idx2mat turns index lists into one-hot matrices.
                p, n = (idx2mat(pchunk, nsenna), idx2mat(nchunk, nsenna))
                l = opt(p, n, idxsta, idxsto)
                c.append(l)

                if debug:
                    print '.',
                    break

            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            # Disabled validation branch (kept for reference); the original
            # condition is preserved in the trailing comment.
            if 0:  #(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']):
                tseenwords = 0
                valid_embedding.params['weights'] = sp.shared(
                    value=scipy.sparse.csr_matrix(
                        embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna,
                                       hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank', mrk

            # Periodic checkpoint: record the running score and dump nearest
            # neighbours of the frequent words.
            if seen_words > hp['freq'] or debug:
                seen_words = 0
                hp['score'] = numpy.array(bigc).mean()
                hp['e'] = e
                hp['i'] = i
                print ''
                print e, i, 'NN Score:', hp['score']

                if not debug:
                    ne = knn(
                        freq_idx,
                        embedding.params['e_weights'].get_value(borrow=True))
                    open('files/' + fname + 'nearest.txt',
                         'w').write(display(ne, senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()
        raise Exception(
            "Not implemented for other than discrete or numeric ...: " +
            str(value_dict[target_id][0]))
    return conds, labels


if __name__ == '__main__':

    from simple_spn.example import example_spns
    from util import io

    # Demo entry point: load a pre-trained SPN and render expectation-based
    # visualizations as PDFs.
    spn = example_spns.get_gender_spn()

    loc = "_spns"
    # Identifier encodes the learning hyper-parameters the SPN was saved with
    # (rdc threshold and min-instances-slice).
    ident = "rdc=" + str(0.3) + "_mis=" + str(0.1)
    # NOTE(review): this overwrites the gender SPN loaded just above —
    # presumably a leftover from experimentation; confirm which model the
    # demo is meant to use.
    spn, value_dict, _ = io.load(ident, "titanic", loc)
    # Marginalize the SPN down to the listed variable ids.
    spn = fn.marg(spn, keep=[0, 1, 2, 3, 6])

    #visualize_sub_populations(spn)
    #visualize_overall_distribution(spn, value_dict=value_dict, save_path="overall_visualization.pdf")
    #visualize_target_based_overall_distribution_single(spn, 0, value_dict=value_dict, save_path="overall_visualization_target_based.pdf")
    #visualize_target_based_overall_distribution_compact(spn, 0, value_dict=value_dict, save_path="overall_visualization_target_based_compact.pdf")
    visualize_expected_sub_populations(spn,
                                       value_dict=value_dict,
                                       save_path="expectation_line_plot.pdf")
    # Same plot, conditioned on target variable 0.
    visualized_target_based_expected_sub_populations(
        spn,
        0,
        value_dict=value_dict,
        save_path="target_based_expectation_line_plot.pdf")
    pass
Exemple #16
0
def run(jobman,debug = False):
    """Train a fixed-window margin-ranking language model on a Gutenberg subset.

    Positive windows are sampled from random sentences; each is ranked
    against hp['nneg'] corrupted windows whose centre word is replaced by a
    random vocabulary index.  State and hyper-parameters come from
    ``jobman.state`` (presumably a jobman experiment object — confirm).

    Side effects: loads a previous checkpoint from 'files/', periodically
    saves all three layers and writes a nearest-neighbour dump.

    Python 2 code (print statements, cPickle).
    """
    hp = jobman.state

    # Symbolic variables

    s_posit = T.matrix()#theano.sparse.csr_matrix()
    s_negat = T.matrix()#theano.sparse.csr_matrix()

    s_valid = theano.sparse.csr_matrix()

    # Corpus of index-encoded sentences; last 10 held out for validation.
    sentences = cPickle.load(open('/data/lisatmp2/rifaisal/guten_subset_idx.pkl'))

    validsentence = sentences[-10:]
    sentences = sentences[:-10]
    senna = cPickle.load(open('/data/lisatmp2/rifaisal/senna.pkl'))
    # Restrict the vocabulary to the Gutenberg subset; hashtab maps original
    # vocabulary indices to positions within the subset.
    gsubset = cPickle.load(open('/data/lisatmp2/rifaisal/guten_vocab_subset.pkl')).flatten().tolist()
    hashtab = dict( zip( gsubset, range( len( gsubset))))    

    senna = numpy.array(senna)[gsubset].tolist()

    nsent = len(sentences)
    nsenna = len(senna)

    # Layers
    embedding = logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity)
    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = rect, d_act = hardtanh)
    L = logistic(i_size = hp['hsize'],  h_size = 1)#, act = identity)

    # Sparse mirror of the embedding used only by the (disabled) validation path.
    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity)
    #valid_embedding.params['weights'].set_value(embedding.params['weights'].get_value(borrow=True))
    #valid_embedding.params['bias'].set_value(embedding.params['bias'].get_value(borrow=True))


    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    # Embed the positive window (1 row) and the hp['nneg'] negative windows.
    posit_embed = embedding.encode(s_posit).reshape((1,hp['embedsize']*hp['wsize']))
    negat_embed = embedding.encode(s_negat).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))
    #valid_embed = valid_embedding.encode(s_valid).reshape((nsenna,hp['embedsize']*hp['wsize']))


    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    #valid_score = L.encode(H.encode(valid_embed))

    # Hinge ranking loss with margin; gradients flow through the clipped
    # mean CC while the function reports the unclipped mean C.
    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    opt = theano.function([s_posit, s_negat], 
                          C.mean(), 
                          updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) )

    #validfct = theano.function([s_valid],valid_score)

    #print 'Random Valid Mean rank',evaluation.error(validsentence, validfct, nsenna, hp['wsize'])

    # Warm-start from a previous experiment's checkpoints.
    #load(valid_embedding,'files/gutensubsetdense_exp.py_embedding.pkl')
    load(embedding,'files/gutensubsetdense_exp.py_embedding.pkl')
    load(H,'files/gutensubsetdense_exp.py_hidden.pkl')
    load(L,'files/gutensubsetdense_exp.py_logistic.pkl')

    # Half-width of the context window; 'rest' compensates for odd sizes.
    delta = hp['wsize']/2
    rest = hp['wsize']%2

    # Most frequent words (remapped into the subset) for the neighbour dump.
    freq_idx = cPickle.load(open('/data/lisatmp2/rifaisal/gutten_sorted_vocab.pkl'))[:1000]
    freq_idx =  [ hashtab[idx] for idx in freq_idx ]

    fname = sys.argv[0]+'_'
    

    for e in range(hp['epoch']):
        c = []
        for i in range(nsent):
            # Sample a random sentence, then a random window position in it.
            rsent = numpy.random.randint(nsent-1)
            nword = len(sentences[rsent])
            if nword < hp['wsize'] + 2:
                continue

            pidx = numpy.random.randint(low = delta, high = nword-delta)
            pchunk = sentences[rsent][pidx-delta:pidx+delta+rest]
            nchunk = []
            st = sentences[rsent][pidx-delta:pidx]
            en = sentences[rsent][pidx+1:pidx+delta+rest]
            rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],))
            nchunk = []
            for j in range(hp['nneg']):
                # NOTE(review): the corrupted window is assembled as
                # suffix + random word + prefix, the reverse of the
                # st + [w] + en order used by sibling scripts — confirm
                # whether this ordering is intentional.
                nchunk += en + [rndidx[j]] + st


            assert len(nchunk) == len(pchunk)*hp['nneg']
            #start = time.time()
            # idx2mat turns index lists into one-hot matrices.
            p, n = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna))
            #print 'Select row:',time.time()-start,
            #start = time.time()
            c.append(opt(p,n))
            #print 'grad up:',time.time()-start

            # Periodic checkpoint + nearest-neighbour dump.
            if i%hp['freq'] == 0:
                print e,i, numpy.array(c).mean(0)
                ne = knn(freq_idx,embedding.params['weights'].get_value(borrow=True))
                save(embedding,fname+'embedding.pkl')
                save(H,fname+'hidden.pkl')
                save(L,fname+'logistic.pkl')
                sys.stdout.flush()
                open('files/'+fname+'nearest.txt','w').write(display(ne,senna))

    #print 'Valid Mean rank',evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
    # Final save after all epochs.
    save(embedding,fname+'embedding.pkl')
    save(H,fname+'hidden.pkl')
    save(L,fname+'logistic.pkl')
Exemple #17
0
def load_spn(dataset_name, rdc_threshold, min_instances_slice, nrows=None):
    """Load a previously learned SPN from the local "_spns" store.

    The file identifier encodes the learning hyper-parameters, matching the
    naming scheme used when the SPN was saved, e.g. "rdc=0.3_mis=0.1" or
    "rdc=0.3_mis=0.1_n=1.e+05" when a row count was given.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset folder inside the "_spns" store.
    rdc_threshold : float
        RDC independence-test threshold used at learning time.
    min_instances_slice : float or int
        Minimum-instances-per-slice setting used at learning time.
    nrows : int, optional
        Number of training rows; when given it is appended to the identifier
        in trimmed scientific notation (e.g. 100000 -> "1.e+05").

    Returns
    -------
    Whatever ``io.load`` returns for this identifier (the SPN plus metadata).
    """
    fname = "rdc=" + str(rdc_threshold) + "_mis=" + str(min_instances_slice)
    if nrows:
        # trim='-' drops trailing zeros/decimal point from the mantissa so the
        # identifier stays compact.
        fname = fname + '_n=' + np.format_float_scientific(
            nrows, precision=0, trim='-')
    return io.load(fname, dataset_name, "_spns")
    # Removed: ~30 lines of unreachable code that followed this return (an
    # unrelated T10I4D itemset-mining script fragment pasted into the body).
Exemple #19
0
def msrerror(vocab,jobman):
    """Evaluate a trained window model on the MSR sentence-completion test.

    Scores every window of every test sentence with the loaded model, sums
    per-sentence scores into 'energy.lm_output.txt', then shells out to the
    benchmark's perl scripts (bestof5.pl, score.pl) to pick answers and
    compute accuracy.  Hyper-parameters come from ``jobman.state``.

    Python 2 code (print statements, iteritems).
    """
    hp = jobman.state
    # Fixed vocabulary size of the trained model.
    nsenna = 30000

    PATH = '/scratch/rifaisal/msrtest/test/'
    # Half-width of the context window; 'rest' compensates for odd sizes.
    delta = hp['wsize']/2
    rest = hp['wsize']%2
    sent = T.matrix()

    # Rebuild the model architecture and load trained parameters from
    # hp['loadpath'].
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity)
    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh)
    L = logistic(i_size = hp['hsize'], h_size = 1, act = identity)

    path = hp['loadpath']

    load(embedding,path+'/embedding.pkl')
    load(H,path+'/hidden.pkl')
    load(L,path+'/logistic.pkl')

    # Compiled scorer: one-hot window matrix -> scalar model score.
    posit_embed = T.dot(sent, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize']))
    posit_score = L.encode(H.encode(posit_embed))
    fct = theano.function([sent],posit_score)
    sentences = idxdataset(vocab)
    scores = []
    for i,s in enumerate(sentences):
        print i,
        sys.stdout.flush()
        nword = len(s)
        if nword < hp['wsize'] + 2:
            #print i,'Failure'
            # Pad too-short sentences with index 29999 (the last vocabulary
            # slot — presumably a padding/unknown token; confirm).
            s += [29999]*3
        c =[]
        # Score every full window position in the sentence.
        for j in range(delta,nword-delta):
            pchunk = s[j-delta:j+delta+rest]
            p = idx2mat(pchunk,nsenna)
            l = fct(p)
            c.append(l)
        # Sentences yielding no windows get score 0.
        if not len(c):
            print 'pas bim'
            scores.append(0)
        else:
            scores.append(numpy.mean(c))
        #if i%5 == 0:
        #    print scores[i-5:i]

    # Write one "<question>\t<score>" line per test sentence, in the order
    # of the benchmark's question file.
    score_template = open(PATH+'data/Holmes.lm_format.questions.txt')
    score_output = open('energy.lm_output.txt','w')
    sentencelist = score_template.readlines()
    for sc,sentence in zip(scores, sentencelist):
        score_output.write(sentence.split('\n')[0]+'\t'+str(sc)+'\n')
    score_output.close()

    # bestof5.pl picks the highest-scoring candidate per question.
    pipebestof5 = subprocess.Popen(['perl', PATH+'bestof5.pl','./energy.lm_output.txt'],stdout=subprocess.PIPE)
    energyanswer = open('./energy.answers','w')

    for line in pipebestof5.stdout: energyanswer.write(line)

    energyanswer.close()

    # score.pl compares the chosen answers to the gold answers; its last four
    # output lines hold the summary figures parsed below.
    pipescore = subprocess.Popen(['perl', PATH+'score.pl','./energy.answers',PATH+'data/Holmes.lm_format.answers.txt'],stdout=subprocess.PIPE)
    legend = ['correct','%correct','valid','test']
    out = zip(legend,[ r.split('\n')[0] for r in  pipescore.stdout.readlines()[-4:] ])
    res = dict(out)
    res = dict( (k,float(v)) for k,v in res.iteritems())
    print res
    print out
Exemple #20
0
def run(jobman, debug=False):
    """Train a margin-ranking window language model with sparse one-hot input.

    The positive window and its hp['nneg'] corrupted variants (centre word
    replaced by a random index) are stacked into a single sparse matrix and
    scored in one pass; row 0 is the positive window.  Periodically reports
    a WordSim correlation and dumps nearest neighbours.  Hyper-parameters
    come from ``jobman.state`` (presumably a jobman experiment object).

    Python 2 code (print statements, cPickle, iteritems).
    """
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables (s_posit/s_negat are unused in this variant; the
    # stacked sparse matrix s_valid carries both positive and negatives).
    s_posit = T.matrix()
    s_negat = T.matrix()
    s_valid = theano.sparse.csr_matrix()

    # Vocabulary: pickled word->index map; index 0 is the unknown token.
    w2i = cPickle.load(
        open(
            '/mnt/scratch/bengio/bengio_group/data/gutenberg/merged_word2idx.pkl'
        ))
    i2w = dict((v, k) for k, v in w2i.iteritems())
    i2w[0] = 'UNK'
    senna = [i2w[i] for i in range(len(i2w.keys()))]

    nsenna = len(senna)

    # Layers: contractive auto-encoder embedding, hidden auto-encoder, and a
    # linear scoring layer.
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)

    path = hp['loadpath']

    # Optionally warm-start all three layers and sync the layer sizes back
    # into the state from the loaded weights.
    if path:
        load(embedding, path + '/embedding.pkl')
        load(H, path + '/hidden.pkl')
        load(L, path + '/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(
            borrow=True).shape[1]
        hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    # Drop the bias parameters this model does not train.
    del H.params['d_bias']
    del embedding.params['d_bias']
    del embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']
    bs = hp['bs']

    # Row 0 is the positive window, rows 1..nneg the corrupted ones.
    posit_embed = theano.sparse.dot(s_valid,
                                    embedding.params['e_weights']).reshape(
                                        (1 + hp['nneg'],
                                         hp['embedsize'] * hp['wsize']))
    #posit_embed = T.dot(s_valid, embedding.params['e_weights']).reshape((1+hp['nneg'],hp['embedsize']*hp['wsize']))
    #negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    #negat_score = L.encode(H.encode(negat_embed))

    # Hinge ranking loss: corrupted rows vs. the positive row 0.
    C = (posit_score[1:] - posit_score[0] + hp['margin'])

    CC = (rect(C)).mean()

    # One SGD step over all three layers; returns the mean hinge loss.
    opt = theano.function([s_valid], (rect(C)).mean(),
                          updates=dict(
                              L.update(CC, lr) + H.update(CC, lr) +
                              embedding.update_norm(CC, lr)))

    def saveexp():
        # Persist all three layers under the current file-name prefix.
        save(embedding, fname + 'embedding.pkl')
        save(H, fname + 'hidden.pkl')
        save(L, fname + 'logistic.pkl')

    # Half-width of the context window; 'rest' compensates for odd sizes.
    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2

    # WordSim evaluator and frequent-word list for the neighbour dump.
    wsimtester = wsim()
    freq_idx = cPickle.load(
        open('/mnt/scratch/bengio/bengio_group/data/gutenberg/sorted_vocab.pkl'
             ))[:1000]

    fname = ''

    tseenwords = not debug
    for e in range(hp['epoch']):
        # Pick one of the 285 pre-pickled corpus splits at random per epoch.
        hp['split'] = numpy.random.randint(285)
        sentences = cPickle.load(
            open(
                '/mnt/scratch/bengio/bengio_group/data/gutenberg/small_ints_50000/split'
                + str(hp['split']) + '.pkl'))
        nsent = len(sentences)
        bigc = []
        bigr = []
        E = embedding.params['e_weights'].get_value(borrow=True)
        seen_words = 0
        for i, s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            # Skip sentences too short to hold a full window.
            if nword < hp['wsize'] + 2:
                continue
            c = []
            r = []
            if debug:
                print ' *** Processing document', i, 'with', nword,
                sys.stdout.flush()
            for j in range(delta, nword - delta):
                cstart = time.time()
                # Positive window centred at j; negatives keep the context
                # (st/en) and replace the centre word with a random index.
                pchunk = s[j - delta:j + delta + rest]
                nchunk = []
                st = s[j - delta:j]
                en = s[j + 1:j + delta + rest]
                rndidx = numpy.random.randint(nsenna, size=(hp['nneg'], ))
                nchunk = []
                for kk in range(hp['nneg']):
                    nchunk += st + [rndidx[kk]] + en

                #assert len(nchunk) == len(pchunk)*hp['nneg']
                #p, n  = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna))
                # Build one sparse one-hot matrix holding positive + negatives.
                pn = idx2spmat(pchunk + nchunk, nsenna)
                #assert pn[0,pchunk[0]] == 1
                ctime = time.time() - cstart
                tstart = time.time()

                l = opt(pn)
                c.append(l)

                if debug:
                    print '.',
                    break

            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            # Periodic WordSim evaluation + progress save.
            if tseenwords > hp['wsimfreq'] or debug:
                hp['wsim'] = wsimtester.score(
                    embedding.params['e_weights'].get_value(borrow=True))
                print i, 'WordSim Sp Corr:', hp['wsim']
                sys.stdout.flush()
                hp['score'] = numpy.array(bigc).mean()
                hp['e'] = e
                hp['i'] = i
                print e, i, 'NN Score:', hp['score']
                tseenwords = 0
                jobman.save()

            # Periodic checkpoint + nearest-neighbour dump.
            if seen_words > hp['freq'] or debug:
                seen_words = 0
                ne = knn(freq_idx,
                         embedding.params['e_weights'].get_value(borrow=True))
                open('files/' + fname + 'nearest.txt',
                     'w').write(display(ne, i2w))
                saveexp()
                sys.stdout.flush()
                jobman.save()

    saveexp()
Exemple #21
0
def demo_visualize_density():
    """Demo: render density visualizations for two pre-trained SPNs.

    Loads the "p_value_test" and "titanic" SPNs from the local "_spns"
    store and writes a series of overall and target-based density plots as
    PDFs into _plots/interpretability/blackbox/.
    """
    # To (re)create the p-value SPN first:
    #data, parametric_types = real_data.get_p_value_dataset()
    #learn_SPN.create_parametric_spns(data, parametric_types, [0.3], [0.01], folder="p_value_test")

    # All plots go next to this file, three levels up in _plots/.
    plot_dir = os.path.dirname(os.path.realpath(
        __file__)) + "/../../../_plots/interpretability/blackbox/"
    store = "_spns"
    # Identifier encodes the learning hyper-parameters of the saved SPNs.
    model_id = "rdc=" + str(0.3) + "_mis=" + str(0.01)

    # ----- p-value test SPN ------------------------------------------------
    spn, _ = io.load(model_id, "p_value_test", store)
    value_dict = real_data.get_p_value_test_value_dict()

    # Unconditioned density over all variables.
    visualize_density(spn,
                      value_dict,
                      rang=None,
                      max_density=10,
                      save_path=plot_dir + "density1.pdf")

    # Density conditioned on the last variable taking value 0.
    cond = [None] * 5 + [NominalRange([0])]
    visualize_density(spn,
                      value_dict,
                      rang=cond,
                      max_density=10,
                      save_path=plot_dir + "density2.pdf")

    # Target-based density for variable 5, unconditioned.
    visualize_density_target(spn,
                             5,
                             value_dict,
                             rang=None,
                             max_density=10,
                             save_path=plot_dir + "density3.pdf")

    # ----- titanic SPN -----------------------------------------------------
    spn, _ = io.load(model_id, "titanic", store)
    value_dict = real_data.get_titanic_value_dict()

    visualize_density(spn,
                      value_dict,
                      max_density=0.1,
                      save_path=plot_dir + "density5.pdf")

    visualize_density_target(spn,
                             0,
                             value_dict,
                             max_density=0.1,
                             save_path=plot_dir + "density6.pdf")

    visualize_density_target(spn,
                             2,
                             value_dict,
                             max_density=0.1,
                             save_path=plot_dir + "density7.pdf")

    # Built exactly as in the original demo but never passed to the call
    # below; kept so the demo's behavior is unchanged.
    unused_cond = [None] * 2 + [NominalRange([0])] + [None] * 5
    visualize_density_target(spn,
                             0,
                             value_dict,
                             max_density=0.1,
                             save_path=plot_dir + "density8.pdf")
Exemple #22
0
def score(jobman, path):
    """Evaluate trained embeddings on a word-similarity dataset.

    For each test item (human score, target word positions, two contexts),
    computes a model similarity (negative squared distance between
    hidden+embedding feature vectors) and a raw-embedding similarity, then
    prints Spearman correlations of both against the human scores.

    Parameters: ``jobman`` supplies hyper-parameters via ``jobman.state``;
    ``path`` is the directory holding the trained pickles.

    Python 2 code (print statements).
    """
    hp = jobman.state
    # Fixed vocabulary size of the trained model.
    nsenna = 30000

    # NOTE(review): PATH is assigned but never used in this function.
    PATH = '/scratch/rifaisal/msrtest/test/'
    # Half-width of the context window; 'rest' compensates for odd sizes.
    delta = hp['wsize'] / 2
    rest = hp['wsize'] % 2
    sent = T.matrix()

    # Rebuild the architecture and load the trained parameters.
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act=identity)
    H = ae(i_size=hp['embedsize'] * hp['wsize'],
           h_size=hp['hsize'],
           e_act=T.tanh)
    L = logistic(i_size=hp['hsize'], h_size=1, act=identity)

    load(embedding, path + '/embedding.pkl')
    load(H, path + '/hidden.pkl')
    load(L, path + '/logistic.pkl')

    # Compiled feature extractor: one-hot window -> hidden representation.
    posit_embed = T.dot(sent, embedding.params['e_weights']).reshape(
        (1, hp['embedsize'] * hp['wsize']))
    posit_score = H.encode(posit_embed)
    scoreit = theano.function([sent], posit_score)
    sentences = parse_data()
    scores = []
    esims = []
    msim = []
    hsim = []
    Em = embedding.params['e_weights'].get_value(borrow=True)
    for i, (sc, w1, w2, c1, c2) in enumerate(sentences):
        sys.stdout.flush()

        # Pad both contexts with index 29999 (last vocabulary slot —
        # presumably a padding token; confirm) so any window fits.
        c1 = [29999] * 10 + c1 + [29999] * 10
        c2 = [29999] * 10 + c2 + [29999] * 10

        # Context windows around each occurrence of the target words
        # (offsets shifted by the 10-token padding).
        w1seqs = [c1[10 + idx - delta:10 + idx + delta + rest] for idx in w1]
        w2seqs = [c2[10 + idx - delta:10 + idx + delta + rest] for idx in w2]

        c = []

        # Raw embedding vectors of the first occurrence of each target word.
        w1em = Em[c1[10 + w1[0]]]
        w2em = Em[c2[10 + w2[0]]]

        # Model feature = hidden representation of the window concatenated
        # with the target word's embedding.
        w1sc = numpy.concatenate([
            scoreit(idx2mat(w1seqs[0], nsenna)).flatten(), Em[c1[10 + w1[0]]]
        ])
        w2sc = numpy.concatenate([
            scoreit(idx2mat(w2seqs[0], nsenna)).flatten(), Em[c2[10 + w2[0]]]
        ])

        # NOTE(review): 'metric' is computed but never used below.
        metric = L.params['weights'].get_value(borrow=True).flatten()

        # Negative squared Euclidean distance as similarity.
        sim = -(((w1sc - w2sc))**2).sum()
        esim = -((w1em - w2em)**2).sum()

        msim.append(sim)
        esims.append(esim)
        hsim.append(numpy.mean(sc))

    # Spearman rank correlation of model/embedding similarities vs. human scores.
    print 'Model:', scipy.stats.spearmanr(
        numpy.array(hsim),
        numpy.array(msim))[0], ', Embeddings:', scipy.stats.spearmanr(
            numpy.array(hsim), numpy.array(esims))[0]
0
def run(jobman,debug = False):
    """Train the embedding + hidden + logistic window-scoring model.

    Ranking (hinge) objective: each positive window of real text must score
    at least ``margin`` above ``nneg`` corrupted windows whose centre word is
    replaced by a random vocabulary index.  All hyper-parameters come from
    ``jobman.state``; parameters are periodically checkpointed via
    ``saveexp()``.  With ``debug=True`` only one window per sentence is
    processed and checkpointing/knn dumps are skipped.
    """
    expstart = time.time()
    hp = jobman.state

    if not os.path.exists('files/'): os.mkdir('files/')

    # Symbolic variables
    s_posit = T.matrix()  # one positive window (one-hot rows)
    s_negat = T.matrix()  # nneg corrupted windows stacked
    s_valid = theano.sparse.csr_matrix()  # sparse input for the (disabled) validation path

    #vocab = cPickle.load(open('/scratch/rifaisal/data/guten/senna.pkl'))
    #senna = cPickle.load(open('/scratch/rifaisal/data/wiki_april_2010/WestburyLab.wikicorp.201004_vocab30k.pkl'))
    # Rebuild the vocabulary list from the word->index map; index 0 is UNK.
    w2i = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/merged_word2idx.pkl'))
    i2w = dict( (v,k) for k,v in w2i.iteritems() )
    i2w[0] = 'UNK'
    senna = [ i2w[i] for i in range(len(i2w.keys())) ]

    nsenna = len(senna)
    
    embedding = cae(i_size=nsenna, h_size=hp['embedsize'], e_act = identity)
    H = ae(i_size = hp['embedsize']*hp['wsize'], h_size=hp['hsize'], e_act = T.tanh)
    L = logistic(i_size = hp['hsize'], h_size = 1, act = identity)
 
    path = hp['loadpath']
 
    if path:
        # Warm-start from a checkpoint and sync the layer sizes actually found
        # on disk back into the hyper-parameters.
        load(embedding,path+'/embedding.pkl')
        load(H,path+'/hidden.pkl')
        load(L,path+'/logistic.pkl')
        hp['embedsize'] = embedding.params['e_weights'].get_value(borrow=True).shape[1]
        hp['hsize'] = H.params['e_weights'].get_value(borrow=True).shape[1]
        jobman.save()

    # Sparse mirror of the embedding layer, used only by the disabled
    # validation branch below; shares the bias with the dense embedding.
    valid_embedding = sparse.supervised.logistic(i_size=nsenna, h_size=hp['embedsize'], act = identity)
    valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
    valid_embedding.params['bias'] = embedding.params['e_bias']

    lr = hp['lr']
    h_size = hp['hsize']  # NOTE(review): unused local
    bs = hp['bs']  # NOTE(review): unused local

    # Window score = logistic(L) over tanh hidden layer (H) over the
    # concatenated word embeddings of the window.
    posit_embed = T.dot(s_posit, embedding.params['e_weights']).reshape((1,hp['embedsize']*hp['wsize']))
    negat_embed = T.dot(s_negat, embedding.params['e_weights']).reshape((hp['nneg'],hp['embedsize']*hp['wsize']))
    valid_embed = sp.dot(s_valid,valid_embedding.params['weights']).reshape((nsenna,hp['embedsize']*hp['wsize']))

    posit_score = L.encode(H.encode(posit_embed))
    negat_score = L.encode(H.encode(negat_embed))
    valid_score = L.encode(H.encode(valid_embed))

    # Hinge ranking loss: each negative should score at least `margin`
    # below the positive window.
    C = (negat_score - posit_score.flatten() + hp['margin'])

    CC = (rect(C)).mean()

    # One SGD step over all three parameter sets per call; the embedding uses
    # a norm-constrained update (update_norm).
    opt = theano.function([s_posit, s_negat],
                          (rect(C)).mean(),
                          updates = dict( L.update(CC,lr) + H.update(CC,lr) + embedding.update_norm(CC,lr)) )

    #validfct = theano.function([s_valid],valid_score)

    def saveexp():
        # Checkpoint all three parameter sets (fname is '' -> cwd-relative names).
        save(embedding,fname+'embedding.pkl')
        save(H,fname+'hidden.pkl')
        save(L,fname+'logistic.pkl')


    delta = hp['wsize']/2  # left-context size (Python 2 integer division)
    rest = hp['wsize']%2   # one extra right slot when the window size is odd
    #freq_idx = range(29000,30000)
    freq_idx = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/sorted_vocab.pkl'))[:2000]
    fname = ''
    #validsentence = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/valid.pkl'))
    tseenwords = not debug  # bool used as an int counter (True == 1)
    for e in range(hp['epoch']):
        # Pick one of the 45 corpus shards at random for this epoch.
        hp['split'] = numpy.random.randint(45)
        sentences = cPickle.load(open('/scratch/rifaisal/data/gutenberg_aistats/split'+str(hp['split'])+'.pkl'))
        nsent = len(sentences)  # NOTE(review): unused local
        bigc = []
        bigr = []  # NOTE(review): never filled

        seen_words = 0
        for i,s in enumerate(sentences):
            nword = len(s)
            seen_words += nword
            tseenwords += nword

            if nword < hp['wsize'] + 2:
                continue  # sentence too short to yield a full window
            c =[]
            r =[]  # NOTE(review): unused
            if debug:
                print ' *** Processing document',i,'with',nword,
                sys.stdout.flush()
            for j in range(delta,nword-delta):
                # Positive window centred at j; negatives replace the centre
                # word with nneg random vocabulary indices.
                pchunk = s[j-delta:j+delta+rest]
                nchunk = []
                st = s[j-delta:j]
                en = s[j+1:j+delta+rest]
                rndidx = numpy.random.randint(nsenna, size = (hp['nneg'],))
                nchunk = []
                for kk in range(hp['nneg']):
                    nchunk += st + [rndidx[kk]] + en

                assert len(nchunk) == len(pchunk)*hp['nneg']
                # One-hot encode both batches and take a single SGD step.
                p, n  = (idx2mat(pchunk,nsenna), idx2mat(nchunk,nsenna))
                l = opt(p,n)
                c.append(l)

                if debug:
                    print '.',
                    break


            if debug:
                print ''

            bigc += [numpy.array(c).sum()]

            # Validation path is disabled (`if 0:`); it references the
            # commented-out validsentence/validfct above, so it would raise
            # NameError if re-enabled without restoring them.
            if 0:#(time.time() - expstart) > ( 3600 * 24 * 6 + 3600*20) or (tseenwords)>(10*hp['freq']):
                tseenwords = 0
                valid_embedding.params['weights'] = sp.shared(value = scipy.sparse.csr_matrix(embedding.params['e_weights'].get_value(borrow=True)))
                mrk = evaluation.error(validsentence, validfct, nsenna, hp['wsize'])
                hp['mrk'] = mrk
                jobman.save()
                saveexp()
                print 'Random Valid Mean rank',mrk


            # Progress report + checkpoint roughly every `freq` seen words.
            if seen_words > hp['freq'] or debug:
                seen_words = 0
                hp['score'] = numpy.array(bigc).mean() 
                hp['e'] = e
                hp['i'] = i
                print ''
                print e,i,'NN Score:', hp['score']

                if not debug:
                    # Dump nearest neighbours of the 2000 most frequent words
                    # as a qualitative sanity check on the embeddings.
                    ne = knn(freq_idx,embedding.params['e_weights'].get_value(borrow=True))
                    open('files/'+fname+'nearest.txt','w').write(display(ne,senna))
                    saveexp()
                sys.stdout.flush()
                jobman.save()
                
    saveexp()