Example #1
import os
import pickle

import numpy as np

# SimpleSeqTransducer, getdatamatrix and atiseval are assumed to be
# importable from the surrounding project.


def run(p="../../../data/atis/atis.pkl", wordembdim=100, innerdim=200,
        lr=0.05, numbats=100, epochs=20, validinter=1, wreg=0.0003, depth=1):
    # resolve the data path relative to this file and load the ATIS splits
    p = os.path.join(os.path.dirname(__file__), p)
    train, test, dics = pickle.load(open(p, "rb"))  # open pickles in binary mode
    word2idx = dics["words2idx"]
    table2idx = dics["tables2idx"]
    label2idx = dics["labels2idx"]
    label2idxrev = {v: k for k, v in label2idx.items()}
    train = zip(*train)     # regroup parallel lists into per-example (words, tables, labels) tuples
    test = zip(*test)
    print "%d training examples, %d test examples" % (len(train), len(test))
    #tup2text(train[0], word2idx, table2idx, label2idx)
    maxlen = 0
    for tup in train + test:
        maxlen = max(len(tup[0]), maxlen)

    # +2: data indices are shifted up by one so that 0 can act as padding,
    # so the largest index appearing in the matrices is max(...) + 1
    numwords = max(word2idx.values()) + 2
    numlabels = max(label2idx.values()) + 2

    # get training data
    traindata = getdatamatrix(train, maxlen, 0).astype("int32")
    traingold = getdatamatrix(train, maxlen, 2).astype("int32")
    trainmask = (traindata > 0).astype("float32")

    # test data
    testdata = getdatamatrix(test, maxlen, 0).astype("int32")
    testgold = getdatamatrix(test, maxlen, 2).astype("int32")
    testmask = (testdata > 0).astype("float32")

    # sanity check: evaluating the gold labels against themselves
    # should produce perfect scores
    res = atiseval(testgold - 1, testgold - 1, label2idxrev)
    print res

    # define model
    innerdim = [innerdim] * depth
    m = SimpleSeqTransducer(indim=numwords, embdim=wordembdim, innerdim=innerdim, outdim=numlabels)
    # alternative baseline models, kept for reference:
    #m = StupidAtis(inpembdim=wordembdim, indim=numwords, outdim=numlabels)
    #m = StupidAtisNative(inpembdim=wordembdim, indim=numwords, outdim=numlabels)
    #m = StupidAtisScanMod(inpembdim=wordembdim, indim=numwords, outdim=numlabels)
    #m = StupidAtisScanModNative(inpembdim=wordembdim, indim=numwords, outdim=numlabels)

    # training
    # masked variant of the training call, kept for reference:
    #m.train([traindata, trainmask], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\
    #    .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter)\
    #    .train(numbats, epochs)

    m.train([traindata], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\
        .split_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter)\
        .train(numbats, epochs)

    # predict after training: per-timestep label distributions,
    # argmax-decoded, then shifted back by 1 to undo the padding offset
    testpredprobs = m.predict(testdata, testmask)
    testpred = np.argmax(testpredprobs, axis=2) - 1
    #testpred = testpred * testmask
    #print np.vectorize(lambda x: label2idxrev[x] if x > -1 else " ")(testpred)

    evalres = atiseval(testpred, testgold - 1, label2idxrev)
    print evalres
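
getdatamatrix is defined elsewhere in the project. As a point of reference, here is a minimal sketch of what such a padding helper could look like, assuming each example is a tuple of index sequences and that indices are shifted up by 1 so 0 can serve as padding (consistent with the traindata > 0 masks and the - 1 shift on predictions above):

import numpy as np

def getdatamatrix_sketch(tuples, maxlen, col):
    # hypothetical reconstruction, not the project's actual implementation:
    # stack column `col` of every example tuple into a (numexamples, maxlen)
    # int32 matrix, shifting indices by +1 and left-aligning with 0-padding
    out = np.zeros((len(tuples), maxlen), dtype="int32")
    for i, tup in enumerate(tuples):
        seq = np.asarray(tup[col], dtype="int32") + 1   # reserve 0 for padding
        out[i, :len(seq)] = seq
    return out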
Example #3
# numpy and the TestCase wrapper below are reconstructed so the snippet is
# self-contained; the original test class name is not shown in the source,
# so the one here is hypothetical.
import numpy as np
from unittest import TestCase


class SimpleSeqTransducerTest(TestCase):  # hypothetical class name

    def test_output_shape(self):
        # settings
        batsize = 10
        seqlen = 5
        invocsize = 50
        inembdim = 50
        innerdim = 11
        outvocsize = 17

        # data: random token matrices (traingold is unused here; the test
        # only checks the prediction shape)
        traindata = np.random.randint(0, invocsize, (batsize, seqlen))
        traingold = np.random.randint(0, outvocsize, (batsize, seqlen))

        # model
        m = SimpleSeqTransducer(indim=invocsize, embdim=inembdim,
                                innerdim=innerdim, outdim=outvocsize)

        # one distribution over the output vocabulary per input position
        pred = m.predict(traindata)
        self.assertEqual(pred.shape, (batsize, seqlen, outvocsize))
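
The (batsize, seqlen, outvocsize) shape asserted here is exactly what Example #1 consumes at prediction time. A minimal numpy sketch of decoding such an output back to label indices, assuming index 0 is padding as in Example #1 (the arrays are made up):

import numpy as np

probs = np.random.rand(10, 5, 17)            # stand-in prediction tensor
probs /= probs.sum(axis=2, keepdims=True)    # normalize per position

labels = np.argmax(probs, axis=2) - 1        # undo the +1 padding offset
print(labels.shape)                          # -> (10, 5)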
Example #5
import numpy as np
from IPython import embed

# readdata, VectorEmbed, Eye, SimpleSeqTransducer and GenClass are assumed
# to be importable from the surrounding project.


def run(epochs=50,
        numbats=25,
        lr=0.1,
        layers=1,
        embdim=100,
        encdim=200,
        bidir=False,
        wordlevel=False,        # False: character-level, True: word-level
        maxlen=75,
        maxwordlen=15,
        ):
    mode = "word" if wordlevel else "char"
    (traindata, traingold), (testdata, testgold), dic = \
        readdata("../../../data/hatespeech/train.csv",
                 "../../../data/hatespeech/test.csv",
                 mode=mode, maxlen=maxlen)

    revdic = {v: k for k, v in dic.items()}
    def pp(s):      # decode an index sequence back to readable text
        print "".join([revdic[x] if x in revdic else "<???>" for x in s])

    embed()     # drop into IPython for interactive data inspection

    # data stats (scaled by 100 so the "%" in the message is accurate)
    print "class distribution in train: {}% positive".format(100. * np.sum(traingold) / traingold.size)
    print "class distribution in test: {}% positive".format(100. * np.sum(testgold) / testgold.size)

    wordemb = VectorEmbed(indim=len(dic), dim=embdim)
    clasemb = VectorEmbed(indim=2, dim=embdim)
    encdim = [encdim] * layers
    # identity input "embedding" of width 2 * embdim, presumably because
    # GenClass concatenates the word and class embeddings before the encoder
    enc = SimpleSeqTransducer(inpemb=Eye(embdim*2), innerdim=encdim,
                              outdim=len(dic))

    m = GenClass(wordemb, clasemb, enc)


    # teacher-forcing inputs: prepend a start symbol (index 1) and drop the
    # last token, so the model at position t predicts traindata[:, t]
    straindata = np.ones((traindata.shape[0], 1), dtype="int32")
    straindata = np.concatenate([straindata, traindata[:, :-1]], axis=1)

    m = m.train([straindata, traingold], traindata)\
        .adadelta(lr=lr).grad_total_norm(1.0).seq_cross_entropy()\
        .split_validate(6, random=True).seq_cross_entropy().seq_accuracy()\
        .train(numbats=numbats, epochs=epochs)

    #enc.save("hatemodel.{}.Emb{}D.Enc{}D.{}L.model".format(mode, embdim, encdim, layers))


    # class-conditional scoring: run the trained generative model once
    # conditioned on class 0 and once on class 1, then compare sequence
    # negative log-likelihoods
    stestdata = np.ones((testdata.shape[0], 1), dtype="int32")
    stestdata = np.concatenate([stestdata, testdata[:, :-1]], axis=1)
    negpreds = m.predict(stestdata, np.zeros_like(testgold))  # (batsize, seqlen, vocsize)
    pospreds = m.predict(stestdata, np.ones_like(testgold))
    # gather, for every example and position, the probability the model
    # assigned to the token that actually occurs there
    negprobs = negpreds[
        np.arange(negpreds.shape[0])[:, None],
        np.arange(negpreds.shape[1])[None, :],
        testdata]
    posprobs = pospreds[
        np.arange(pospreds.shape[0])[:, None],
        np.arange(pospreds.shape[1])[None, :],
        testdata]
    negprobs = np.sum(-np.log(negprobs), axis=1)    # NLL under class 0
    posprobs = np.sum(-np.log(posprobs), axis=1)    # NLL under class 1
    pred = posprobs < negprobs   # the class with the lower NLL wins; True means positive
    embed()     # inspect predictions interactively
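
The fancy indexing above is the crux of this generative classifier: preds[rows, cols, testdata] picks, for example i and position j, the probability the model assigned to the token that actually appears at testdata[i, j]. A self-contained toy sketch of the same scoring rule (all arrays and sizes here are made up):

import numpy as np

batsize, seqlen, vocsize = 4, 3, 5
rng = np.random.RandomState(0)

# stand-ins for the per-class next-token distributions
negpreds = rng.dirichlet(np.ones(vocsize), size=(batsize, seqlen))
pospreds = rng.dirichlet(np.ones(vocsize), size=(batsize, seqlen))
tokens = rng.randint(0, vocsize, (batsize, seqlen))   # observed sequences

rows = np.arange(batsize)[:, None]   # broadcast together to (batsize, seqlen)
cols = np.arange(seqlen)[None, :]
neg_nll = -np.log(negpreds[rows, cols, tokens]).sum(axis=1)
pos_nll = -np.log(pospreds[rows, cols, tokens]).sum(axis=1)

pred = pos_nll < neg_nll             # the class with the lower NLL wins
print(pred)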