def test_seqdecatt(  # seems to work
            wreg=0.00001,  # TODO: regularization other than 0.0001 first stagnates, then goes down
            epochs=50,
            numbats=20,
            lr=0.1,
            statedim=50,
            encdim=50,
            attdim=50,
            startsym=0,
    ):
        # get words
        vocsize = 27
        embdim = 50
        lm = Glove(embdim, 2000)
        allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
        words = allwords[1000:]
        vwords = allwords[:1000]
        data = words2ints(words)
        sdata = shiftdata(data)
        vdata = words2ints(vwords)
        svdata = shiftdata(vdata)
        testneglogprob = 17

        testpred = ["the", "alias", "mock", "test", "stalin", "allahuakbar", "python", "pythonista",
                    " " * (data.shape[1])]
        testpred = words2ints(testpred)
        print testpred

        block = SimpleSeqEncDecAtt(inpvocsize=vocsize, outvocsize=vocsize, encdim=encdim, decdim=statedim,
                                   attdim=attdim, inconcat=False)

        s = SeqEncDecSearch(block)
        pred, probs = s.decode(testpred, startsym, testpred.shape[1])
        print ints2words(pred), probs
Beispiel #2
0
def run_seqdecatt(  # seems to work
    wreg=0.00001,
    epochs=50,
    numbats=50,
    lr=0.1,
    statedim=50,
    encdim=50,
    attdim=50,
    numwords=5000,
):
    # get words
    vocsize = 28
    lm = Glove(50, numwords)
    allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    #embed()
    invwords = [word[::-1] for word in allwords]
    data = words2ints(allwords)
    idata = words2ints(invwords)
    startsym = 0

    golddata = data

    #golddata = idata

    print data[:10]
    print shiftdata(data, startsym)[:10]

    testwords = [
        "the", "alias", "mock", "test", "stalin", "allahuakbar", "python",
        "pythonista"
    ]
    testpred = words2ints(testwords)

    block = SimpleSeqEncDecAtt(inpvocsize=vocsize,
                               outvocsize=vocsize,
                               encdim=encdim,
                               decdim=statedim,
                               attdim=attdim,
                               inconcat=False,
                               bidir=False,
                               statetrans=None)
    block.train([data, shiftdata(golddata, startsym)], golddata).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg) \
        .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(2) \
        .train(numbats=numbats, epochs=epochs)

    s = SeqEncDecSearch(block)
    pred, probs = s.decode(testpred, startsym, testpred.shape[1])
    print ints2words(pred), probs
Beispiel #3
0
    def test_seqdecatt(  # seems to work
        wreg=0.00001,  # TODO: regularization other than 0.0001 first stagnates, then goes down
        epochs=50,
        numbats=20,
        lr=0.1,
        statedim=50,
        encdim=50,
        attdim=50,
        startsym=0,
    ):
        # get words
        vocsize = 27
        embdim = 50
        lm = Glove(embdim, 2000)
        allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
        words = allwords[1000:]
        vwords = allwords[:1000]
        data = words2ints(words)
        sdata = shiftdata(data)
        vdata = words2ints(vwords)
        svdata = shiftdata(vdata)
        testneglogprob = 17

        testpred = [
            "the", "alias", "mock", "test", "stalin", "allahuakbar", "python",
            "pythonista", " " * (data.shape[1])
        ]
        testpred = words2ints(testpred)
        print testpred

        block = SimpleSeqEncDecAtt(inpvocsize=vocsize,
                                   outvocsize=vocsize,
                                   encdim=encdim,
                                   decdim=statedim,
                                   attdim=attdim,
                                   inconcat=False)

        s = SeqEncDecSearch(block)
        pred, probs = s.decode(testpred, startsym, testpred.shape[1])
        print ints2words(pred), probs
Beispiel #4
0
def run_seqdecatt(  # seems to work
        wreg=0.00001,
        epochs=50,
        numbats=50,
        lr=0.1,
        statedim=50,
        encdim=50,
        attdim=50,
        numwords=5000,
    ):
    # get words
    vocsize = 28
    lm = Glove(50, numwords)
    allwords = filter(lambda x: re.match("^[a-z]+$", x), lm.D.keys())
    #embed()
    invwords = [word[::-1] for word in allwords]
    data = words2ints(allwords)
    idata = words2ints(invwords)
    startsym = 0

    golddata = data

    #golddata = idata

    print data[:10]
    print shiftdata(data, startsym)[:10]

    testwords = ["the", "alias", "mock", "test", "stalin", "allahuakbar", "python", "pythonista"]
    testpred = words2ints(testwords)

    block = SimpleSeqEncDecAtt(inpvocsize=vocsize, outvocsize=vocsize, encdim=encdim, decdim=statedim, attdim=attdim, inconcat=False, bidir=False, statetrans=None)
    block.train([data, shiftdata(golddata, startsym)], golddata).seq_cross_entropy().grad_total_norm(1.0).adagrad(lr=lr).l2(wreg) \
        .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(2) \
        .train(numbats=numbats, epochs=epochs)

    s = SeqEncDecSearch(block)
    pred, probs = s.decode(testpred, startsym, testpred.shape[1])
    print ints2words(pred), probs