def test_auto_mask_within_seq2vec(self):
        """Encoder outputs must stay constant across fully-masked trailing steps.

        Assembles a (batsize, seqlen+seqblank, 1+wordlen+wordblank) batch of
        word ids + char ids in which the last `seqblank` sequence positions and
        the last `wordblank` characters of every word carry the mask id (-1),
        then checks that consecutive per-step encoder outputs agree exactly on
        (and only on) the masked tail.
        """
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        batsize, seqlen, seqblank = 11, 3, 2
        wordlen, wordblank = 3, 2
        numchars, numwords = 20, 100
        encdim, embdim, innerdim = 4, 50, 2

        # word ids: real steps followed by masked (-1) steps
        word_ids = np.random.randint(0, numwords, (batsize, seqlen, 1))
        word_pad = np.full((batsize, seqblank, 1), -1, dtype="int32")
        word_ids = np.concatenate([word_ids, word_pad], axis=1)
        # char ids: real chars padded with -1 per word, plus fully masked steps
        char_ids = np.random.randint(0, numchars, (batsize, seqlen, wordlen))
        char_pad = np.full((batsize, seqlen, wordblank), -1, dtype="int32")
        char_ids = np.concatenate([char_ids, char_pad], axis=2)
        step_pad = np.full((batsize, seqblank, wordlen + wordblank), -1,
                           dtype="int32")
        char_ids = np.concatenate([char_ids, step_pad], axis=1)
        # last axis layout: [word id | char ids]
        data = np.concatenate([word_ids, char_ids], axis=2)

        wordemb = WordEncoderPlusGlove(numchars=numchars,
                                       numwords=numwords,
                                       encdim=encdim,
                                       embdim=embdim,
                                       maskid=-1,
                                       embtrainfrac=0)
        rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim,
                                             innerdim,
                                             bidir=False)
        enc = Seq2Vec(wordemb, rnn, maskid=-1)
        enc.enc.with_outputs()
        finalpred, pred = enc.predict(data)
        # consecutive step outputs coincide exactly from the first masked step on
        for step in range(1, pred.shape[1]):
            self.assertEqual(
                np.allclose(pred[:, step - 1, :], pred[:, step, :]),
                step >= seqlen)
    def test_auto_mask_within_seq2vec(self):
        """Check that trailing masked (-1) positions leave encoder outputs unchanged.

        Builds a (batsize, seqlen+seqblank, 1+wordlen+wordblank) batch where the
        last `seqblank` sequence steps and the last `wordblank` characters per
        word are the mask id (-1), then asserts that consecutive per-step
        encoder outputs are identical on exactly the masked region.

        NOTE(review): this duplicates the method name defined earlier in the
        file; the later definition shadows the earlier one in the test class.
        """
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        batsize = 11
        seqlen = 3
        seqblank = 2
        wordlen = 3
        wordblank = 2
        numchars = 20
        numwords = 100
        encdim = 4
        embdim = 50
        innerdim = 2

        # word ids: real steps followed by masked (-1) steps
        worddata = np.random.randint(0, numwords, (batsize, seqlen, 1))
        worddatablank = np.zeros((batsize, seqblank, 1)).astype("int32") - 1
        worddata = np.concatenate([worddata, worddatablank], axis=1)
        # char ids: real chars padded with -1 per word, plus fully masked steps
        chardata = np.random.randint(0, numchars, (batsize, seqlen, wordlen))
        charblank = np.zeros((batsize, seqlen, wordblank)).astype("int32") - 1
        chardata = np.concatenate([chardata, charblank], axis=2)
        charblankblank = np.zeros((batsize, seqblank, wordlen+wordblank)).astype("int32") - 1
        chardata = np.concatenate([chardata, charblankblank], axis=1)
        # last axis layout: [word id | char ids]
        data = np.concatenate([worddata, chardata], axis=2)

        wordemb = WordEncoderPlusGlove(numchars=numchars, numwords=numwords, encdim=encdim, embdim=embdim,
                                       maskid=-1,
                                       embtrainfrac=0)
        rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim, innerdim, bidir=False)
        enc = Seq2Vec(wordemb, rnn, maskid=-1)
        # BUGFIX: was a bare attribute access (`enc.enc.with_outputs`), a no-op;
        # it must be *called* so predict() also returns the per-step outputs
        # unpacked below (matches the earlier copy of this test).
        enc.enc.with_outputs()
        finalpred, pred = enc.predict(data)
        # consecutive step outputs coincide exactly from the first masked step on
        i = 1
        while i < pred.shape[1]:
            self.assertEqual(np.allclose(pred[:, i-1, :], pred[:, i, :]), i >= seqlen)
            i += 1
def run(epochs=10,
        numbats=100,
        numsam=10000,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.wordchar.pkl",
        embdim=50,
        encdim=50,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        keepmincount=5,
        sameenc=False,
        memaddr="dot",
        memattdim=100,
        layers=1,
        embtrainfrac=0.0,
        mem=False,
        membidir=False,
        memlayers=1,
        sharedwordenc=False):
    """ Memory match-based glove-based word-level relation classification.

    Loads the SimpleQuestions word+char data, maps word ids into GloVe index
    space, builds a char+word Seq2Vec question encoder and either a simple
    softmax decoder or (mem=True) a memory-addressed decoder over encoded
    relation names, trains with adagrad + L2 + gradient-norm clipping, and
    prints test-set evaluation results.  (Python 2: uses print statements.)

    NOTE(review): numsam, keepmincount and sameenc are accepted but never
    used in this body.
    """

    (traindata, traingold), (validdata, validgold), (testdata, testgold), worddic, chardic, entdic\
        = readdata(datap)

    # get words from relation names, update word dic
    memdata = getmemdata(entdic, worddic, chardic)

    # get glove and transform word mats to glove index space
    # d2g maps worddic indices -> glove indices; applied only to the word-id
    # column (index 0 of the last axis), char columns pass through untouched
    d2g, newdic, glove = getdic2glove(worddic,
                                      dim=embdim,
                                      trainfrac=embtrainfrac)
    traindata, validdata, testdata, memdata = \
        [np.concatenate([np.vectorize(d2g)(x[..., 0]).reshape(x.shape[:2] + (1,)), x[..., 1:]], axis=2)
         for x in [traindata, validdata, testdata, memdata]]

    print traindata.shape, testdata.shape
    #embed()

    numwords = max(worddic.values()) + 1  # don't use this, use glove
    numchars = max(chardic.values()) + 1
    numrels = max(entdic.values()) + 1

    # halve per-direction size when bidirectional so total width stays innerdim
    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    wordemb = WordEncoderPlusGlove(numchars=numchars,
                                   encdim=encdim,
                                   embdim=embdim,
                                   maskid=-1,
                                   embtrainfrac=embtrainfrac)
    # NOTE(review): lastdim (final encoder output size) is never used below;
    # the decoders are built from innerdim instead — confirm this is intended.
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim + encdim,
                                         encinnerdim,
                                         bidir=bidir)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)

    if mem:
        memembdim = embdim
        memencdim = encdim
        # NOTE(review): innerdim (an int so far) is rebound to a *list* here
        # and later passed as memdim to MemVec2Idx — verify that is expected.
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        if not sharedwordenc:
            memwordemb = WordEncoderPlusGlove(numchars=numchars,
                                              encdim=encdim,
                                              embdim=embdim,
                                              maskid=-1,
                                              embtrainfrac=embtrainfrac)
        else:
            # reuse the question-side word encoder for the memory entries
            memwordemb = wordemb
        memrnn, memlastdim = SimpleSeq2Vec.makernu(memembdim + memencdim,
                                                   innerdim,
                                                   bidir=membidir)
        memenc = Seq2Vec(memwordemb, memrnn, maskid=-1)
        # resolve the memory addressing scheme name to its class
        if memaddr is None or memaddr == "dot":
            memaddr = DotMemAddr
        elif memaddr == "lin":
            memaddr = LinearGateMemAddr
        dec = MemVec2Idx(memenc,
                         memdata,
                         memdim=innerdim,
                         memaddr=memaddr,
                         memattdim=memattdim)
    else:
        dec = SimpleVec2Idx(indim=innerdim, outdim=numrels)

    m = Seq2Idx(enc, dec)

    # adagrad + L2 regularization + gradient-norm clipping, cross-entropy loss;
    # validation tracks accuracy and cross-entropy, keeping the best model
    m = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()\
        .validate_on([validdata], validgold).accuracy().cross_entropy().takebest()\
        .train(numbats=numbats, epochs=epochs)

    pred = m.predict(testdata)
    print pred.shape
    evalres = evaluate(np.argmax(pred, axis=1), testgold)
    print str(evalres) + "%"
# Beispiel #4 ("Example #4" — stray separator text from a code-example
# aggregation, followed by a stray "0"; commented out so the module parses)
def run(
        epochs=10,
        numbats=100,
        numsam=10000,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.wordchar.pkl",
        embdim=50,
        encdim=50,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        keepmincount=5,
        sameenc=False,
        memaddr="dot",
        memattdim=100,
        layers=1,
        embtrainfrac=0.0,
        mem=False,
        membidir=False,
        memlayers=1,
        sharedwordenc=False
        ):
    """ Memory match-based glove-based word-level relation classification.

    Same pipeline as the other `run` copy in this file: read SimpleQuestions
    data, remap word ids to GloVe index space, encode questions with a
    char+word Seq2Vec, decode to a relation index either directly or via a
    memory over encoded relation names, train, and print test evaluation.
    (Python 2: uses print statements.)

    NOTE(review): numsam, keepmincount and sameenc are accepted but unused;
    this duplicate definition shadows the earlier `run` when the module loads.
    """

    (traindata, traingold), (validdata, validgold), (testdata, testgold), worddic, chardic, entdic\
        = readdata(datap)

    # get words from relation names, update word dic
    memdata = getmemdata(entdic, worddic, chardic)

    # get glove and transform word mats to glove index space
    # d2g is applied to the word-id column only; char columns pass through
    d2g, newdic, glove = getdic2glove(worddic, dim=embdim, trainfrac=embtrainfrac)
    traindata, validdata, testdata, memdata = \
        [np.concatenate([np.vectorize(d2g)(x[..., 0]).reshape(x.shape[:2] + (1,)), x[..., 1:]], axis=2)
         for x in [traindata, validdata, testdata, memdata]]

    print traindata.shape, testdata.shape
    #embed()

    numwords = max(worddic.values()) + 1    # don't use this, use glove
    numchars = max(chardic.values()) + 1
    numrels = max(entdic.values()) + 1

    # halve per-direction size when bidirectional so total width stays innerdim
    if bidir:
        encinnerdim = [innerdim/2]*layers
    else:
        encinnerdim = [innerdim]*layers

    wordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim, embdim=embdim, maskid=-1, embtrainfrac=embtrainfrac)
    # NOTE(review): lastdim is never used below; decoders are sized from
    # innerdim instead — confirm this is intended.
    rnn, lastdim = SimpleSeq2Vec.makernu(embdim+encdim, encinnerdim, bidir=bidir)
    enc = Seq2Vec(wordemb, rnn, maskid=-1)

    if mem:
        memembdim = embdim
        memencdim = encdim
        # NOTE(review): innerdim (an int so far) is rebound to a *list* here
        # and later passed as memdim to MemVec2Idx — verify that is expected.
        if membidir:
            innerdim = [innerdim/2]*memlayers
        else:
            innerdim = [innerdim]*memlayers
        if not sharedwordenc:
            memwordemb = WordEncoderPlusGlove(numchars=numchars, encdim=encdim, embdim=embdim, maskid=-1,
                                       embtrainfrac=embtrainfrac)
        else:
            # reuse the question-side word encoder for the memory entries
            memwordemb = wordemb
        memrnn, memlastdim = SimpleSeq2Vec.makernu(memembdim+memencdim, innerdim, bidir=membidir)
        memenc = Seq2Vec(memwordemb, memrnn, maskid=-1)
        # resolve the memory addressing scheme name to its class
        if memaddr is None or memaddr == "dot":
            memaddr = DotMemAddr
        elif memaddr == "lin":
            memaddr = LinearGateMemAddr
        dec = MemVec2Idx(memenc, memdata, memdim=innerdim, memaddr=memaddr, memattdim=memattdim)
    else:
        dec = SimpleVec2Idx(indim=innerdim, outdim=numrels)

    m = Seq2Idx(enc, dec)

    # adagrad + L2 + gradient-norm clipping, cross-entropy loss; validation
    # tracks accuracy and cross-entropy and keeps the best model
    m = m.train([traindata], traingold).adagrad(lr=lr).l2(wreg).grad_total_norm(1.0).cross_entropy()\
        .validate_on([validdata], validgold).accuracy().cross_entropy().takebest()\
        .train(numbats=numbats, epochs=epochs)

    pred = m.predict(testdata)
    print pred.shape
    evalres = evaluate(np.argmax(pred, axis=1), testgold)
    print str(evalres) + "%"