Beispiel #1
0
 def apply(self, x):  # (batsize, outerseqlen, innerseqlen)
     """Run the encoder stack over the innermost dim, one level per encoder.

     Each pass encodes the last dimension via EncLastDim and coarsens the
     mask by one level (a position survives if any of its inner positions
     was unmasked).  No mask is built when self.maskid is None.
     """
     out = x
     mask = None if self.maskid is None else T.neq(x, self.maskid)
     for encoder in self.encoders:
         out = EncLastDim(encoder)(out, mask=mask)
         if self.maskid is not None:
             mask = T.sum(mask, axis=-1) > 0
         else:
             mask = None
     return out
Beispiel #2
0
 def __init__(self,
              lenc,
              renc,
              # NOTE(review): this default block is built once at def time and
              # shared by all instances — presumably stateless; confirm asblock
              aggregator=asblock(lambda x: T.sum(x, axis=1)),
              **kw):
     """Pairwise sequence match scorer.

     :param lenc: encoder for the left input
     :param renc: encoder for the right input
     :param aggregator: block reducing per-step scores over axis 1
         (default: sum); stored on self.agg before delegating to the base
     :param kw: passed through to the superclass constructor
     """
     self.agg = aggregator
     super(SeqMatchScore, self).__init__(lenc, renc, **kw)
Beispiel #3
0
 def apply(self, x, mask=None, weights=None):
     """Encode x and attention-pool it into `numouts` summary vectors.

     The first `numouts` feature columns of the encoding are turned into
     per-output attention weights (softmax over the sequence); the
     remaining columns are the states that get weighted-summed.
     Modes: "concat" joins the outputs on axis 1, "seq" stacks them as a
     length-`numouts` sequence; any other mode returns the raw encoding.
     """
     ret = self.enc(x, mask=mask,
                    weights=weights)  # (batsize, seqlen, lastdim)
     outs = []
     # prefer the explicitly passed mask, else the one the encoder attached;
     # guard both None and a missing .mask attribute (previously an
     # unconditional `ret.mask` access that broke when no mask was attached,
     # unlike the bidir variant of this method)
     mask = mask if mask is not None else getattr(ret, "mask", None)
     for i in range(self.numouts):
         selfweights = Softmax()(ret[:, :, i])  # (batsize, seqlen)
         if mask is not None:
             selfweights *= mask  # apply mask
         selfweights = selfweights / T.sum(selfweights, axis=1).dimshuffle(
             0, "x")  # renormalize
         weightedstates = ret[:, :, self.numouts:] * selfweights.dimshuffle(
             0, 1, "x")
         out = T.sum(weightedstates, axis=1)  # (batsize, lastdim)
         outs.append(out)
     if self.mode == "concat":
         ret = T.concatenate(outs, axis=1)
     elif self.mode == "seq":
         outs = [out.dimshuffle(0, "x", 1) for out in outs]
         ret = T.concatenate(outs, axis=1)
     return ret
Beispiel #4
0
 def apply(self, x, mask=None, weights=None):
     """Encode x and attention-pool it into `numouts` summary vectors,
     with optional bidirectional handling.

     The first `numouts` feature columns of the encoding provide per-output
     attention logits (forward and backward halves are added when bidir);
     the remaining columns are the states that get weighted-summed.
     Modes: "concat" joins outputs on axis 1, "seq" stacks them as a
     length-`numouts` sequence; any other mode returns the raw encoding.
     """
     ret = self.enc(x, mask=mask,
                    weights=weights)  # (batsize, seqlen, lastdim)
     outs = []
     # apply mask    (SeqEncoder should attach mask to outvar if all_outputs()
     # prefer the explicitly passed mask, else one the encoder attached
     mask = mask if mask is not None else ret.mask if hasattr(
         ret, "mask") else None
     if self.bidir:
         # split feature dim into forward/backward halves
         # NOTE(review): Python 2 `/` — integer floor division on the
         # symbolic shape; assumes lastdim is even — confirm
         mid = ret.shape[2] / 2
         ret1 = ret[:, :, :mid]
         ret2 = ret[:, :, mid:]
         ret = ret1
     for i in range(self.numouts):
         selfweights = ret[:, :, i]  # (batsize, seqlen)
         if self.bidir:
             # add backward-direction logits before the softmax
             selfweights += ret2[:, :, i]
         selfweights = Softmax()(selfweights)
         if mask is not None:
             selfweights *= mask  # apply mask
         selfweights = selfweights / T.sum(selfweights, axis=1).dimshuffle(
             0, "x")  # renormalize
         weightedstates = ret[:, :, self.numouts:] * selfweights.dimshuffle(
             0, 1, "x")
         if self.bidir:
             # same attention weights applied to the backward states
             weightedstates2 = ret2[:, :,
                                    self.numouts:] * selfweights.dimshuffle(
                                        0, 1, "x")
             weightedstates = T.concatenate(
                 [weightedstates, weightedstates2], axis=2)
         out = T.sum(weightedstates, axis=1)  # (batsize, lastdim)
         outs.append(out)
     if self.mode == "concat":
         ret = T.concatenate(outs, axis=1)
     elif self.mode == "seq":
         outs = [out.dimshuffle(0, "x", 1) for out in outs]
         ret = T.concatenate(outs, axis=1)
     return ret
Beispiel #5
0
 def apply(self, x, mask=None):  # (batsize, seqlen, dim)
     """Pool over the sequence axis (axis -2) with optional masking.

     Supported modes: "max", "sum", "avg"; anything else raises.
     Masked positions are excluded from max (set to -inf) and zeroed for
     sum/avg.  Rows whose mask is all-zero are forced to zero output, and
     the reduced row-level mask is attached to the result as `.mask`.
     """
     mask = x.mask if mask is None else mask
     if mask is not None:
         assert (mask.ndim == x.ndim - 1)
         # broadcast mask over the feature dim: (..., seqlen, dim)
         realm = T.tensordot(mask, T.ones((x.shape[-1], )), 0)
         if self.mode == "max":
             # masked positions become -inf so max ignores them
             # NOTE(review): at kept positions this branch evaluates
             # np.infty * 0 = nan; T.switch selects x there, but the nan
             # may still poison gradients — confirm
             x = T.switch(realm, x, np.infty * (realm - 1))
         else:
             x = x * realm
     if self.mode == "max":
         ret = T.max(x, axis=-2)
     elif self.mode == "sum":
         ret = T.sum(x, axis=-2)
     elif self.mode == "avg":
         # NOTE(review): divides by the full sequence length, not the
         # number of unmasked positions — confirm this is intended
         ret = T.sum(x, axis=-2) / x.shape[-2]
     else:
         raise Exception("unknown pooling mode: {:3s}".format(self.mode))
     # ret: (batsize, dim)
     if mask is not None:
         # row is valid if it had at least one unmasked position
         mask = 1 * (T.sum(mask, axis=-1) > 0)
         # zero out rows that were fully masked
         ret = T.switch(T.tensordot(mask, T.ones((x.shape[-1], )), 0), ret,
                        T.zeros_like(ret))
         ret.mask = mask
     return ret
Beispiel #6
0
 def apply(self, x):
     """Encode a (batsize, seqlen, wordlen) tensor of ids.

     With a word-level embedder (l2emb), column 0 of the last dim is the
     word id and the rest are character ids: character encodings and word
     embeddings are concatenated per word.  Without it, the whole last dim
     is character-encoded.  A word-level mask is attached to the word
     vectors before the top-level encoder runs.
     """
     if self.l2emb is not None:
         l1tensor = x[:, :, 1:]
         l1encs = EncLastDim(self.l1enc)(l1tensor)
         l2mat = x[:, :, 0]
         assert (l2mat.ndim == 2)
         l2embs = self.l2emb(l2mat)
         l2vecs = T.concatenate([l1encs, l2embs], axis=2)
         wmask = T.neq(l2mat,
                       self.maskid) if self.maskid is not None else None
     else:
         l2vecs = EncLastDim(self.l1enc)(x)
         # a word is real if it has at least one non-mask character; the
         # previous T.eq(...) marked words *containing* mask characters,
         # inverting the mask relative to the branch above; also guard
         # maskid being None like the branch above does
         wmask = T.gt(T.sum(T.neq(x, self.maskid), axis=2),
                      0) if self.maskid is not None else None
     l2vecs.mask = wmask
     fenc = self.l2enc(l2vecs)
     return fenc  #, wmask #mask for debug
Beispiel #7
0
 def apply(self, data, weights):  # data: (batsize, seqlen, elem_dim)
     """Weighted sum of the sequence elements.

     :param data: (batsize, seqlen, elem_dim)
     :param weights: (batsize, seqlen), one weight per element
     :returns: (batsize, elem_dim)
     """
     expanded = weights.dimshuffle(0, 1, 'x')  # (batsize, seqlen, 1)
     return T.sum(data * expanded, axis=1)
Beispiel #8
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    usetypes=False,
    evalsplits=50,
    cosine=False,
    loadmodel=False,
):
    """Train and evaluate a SeqMatchScore ranking model (Python 2 script).

    Loads data with readdata, builds a CustomSeq2Pair question encoder and
    a CustomEntEnc entity/relation encoder, trains with negative sampling
    and a (hinge) ranking objective unless `loadmodel` is set, evaluates
    with CustomRankSearch + FullRankEval, and appends the results to
    <scriptname>.results/<i>.res.
    """
    if debug:  # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        preeval = True
        #specemb = 100
        margin = 1.
        evalsplits = 1
        #usetypes=True
        #mode = "charword"
        #checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")

    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    emb = VectorEmbed(numwords, embdim)

    subjenc = EntEnc(
        SimpleSeq2Vec(invocsize=numwords,
                      inpembdim=embdim,
                      innerdim=decinnerdim,
                      maskid=maskid,
                      bidir=membidir))

    numentembs = len(np.unique(entmat[:, 0]))
    repsplit = entmat[relstarts, 0]
    if specids:  # include vectorembedder
        subjenc = EntEmbEnc(subjenc, numentembs, specemb)
    predenc = VectorEmbed(indim=numents - relstarts + 1,
                          dim=subjenc.outdim,
                          init="zero")
    entenc = CustomEntEnc(subjenc, predenc, repsplit)

    inpenc = CustomSeq2Pair(inpemb=emb,
                            encdim=encinnerdim,
                            scadim=encinnerdim,
                            enclayers=layers,
                            scalayers=layers,
                            bidir=bidir,
                            maskid=maskid,
                            outdim=subjenc.outdim)

    # adjust params for enc/dec construction
    # encinnerdim[-1] += specemb
    # innerdim[-1] += specemb

    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    if sumhingeloss:
        scorerkwargs["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(inpenc, entenc, **scorerkwargs)

    class PreProc(object):
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    transf = PreProc(entmat)

    class NegIdxGen(object):
        def __init__(self, rng, midsplit):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        # negative sample per example: one random entity id (below the
        # split) and one random relation id (above the split)
        def __call__(self, datas, gold):
            entrand = np.random.randint(self.min, self.midsplit,
                                        (gold.shape[0], 1))
            relrand = np.random.randint(self.midsplit, self.max,
                                        (gold.shape[0], 1))
            ret = np.concatenate([entrand, relrand], axis=1)
            return datas, ret.astype("int32")

    #embed()

    # ranking objective: plain score difference, hinge, or per-sequence
    # summed hinge (the latter pairs with aggregator=identity above)
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    if sumhingeloss:  #
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    # embed()
    # eval
    if preeval:
        tt.tick("pre-evaluating")
        s = CustomRankSearch(inpenc,
                             entenc,
                             scorer.s,
                             scorer.agg,
                             relstarts=relstarts)
        eval = FullRankEval()
        pred, scores = s.search(testdata,
                                testgold.shape[1],
                                candata=entmat,
                                canids=canids,
                                split=evalsplits,
                                transform=transf.f,
                                debug=printpreds)
        evalres = eval.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")

    if not loadmodel:
        tt.tick("training")
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numents, relstarts)).negrate(negrate).objective(obj) \
            .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
            .validate_on([validdata, validgold]) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained")
        scorer.save("customfullrank.scorer.save")
    else:
        scorer = SeqMatchScore.load("customfullrank.scorer.save")

    # eval
    tt.tick("evaluating")

    s = CustomRankSearch(inpenc,
                         entenc,
                         scorer.s,
                         scorer.agg,
                         relstarts=relstarts)
    eval = FullRankEval()
    pred, scores = s.search(testdata,
                            testgold.shape[1],
                            candata=entmat,
                            canids=canids,
                            split=evalsplits,
                            transform=transf.f,
                            debug=printpreds)
    if printpreds:
        print pred
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = eval.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")

    # save
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    # find the first unused results slot (0..999)
    for i in xrange(1000):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
Beispiel #9
0
 def __init__(self, lenc, renc,
              aggregator=asblock(lambda x: T.sum(x, axis=1)), **kw):
     """Pairwise sequence match scorer.

     :param lenc: encoder for the left input
     :param renc: encoder for the right input
     :param aggregator: block reducing per-step scores over axis 1
         (default: sum).  NOTE(review): the default block is built once at
         def time and shared by all instances — presumably stateless;
         confirm asblock
     :param kw: passed through to the superclass constructor
     """
     self.agg = aggregator
     super(SeqMatchScore, self).__init__(lenc, renc, **kw)
Beispiel #10
0
def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    balancednegidx=False,
    usetypes=False,
    evalsplits=50,
    relembrep=False,
):
    """Train and evaluate an attention encoder-decoder ranking model
    (Python 2 script).

    Loads data with readdata, builds a SimpleSeqEncDecAtt over the
    question with an entity/relation encoder as output embedding, trains a
    SeqMatchScore with negative sampling and a (hinge) ranking objective,
    evaluates with SeqEncDecRankSearch + FullRankEval, and appends the
    results to <scriptname>.results/<i>.res.
    """
    if debug:  # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        #preeval = True
        specemb = 100
        margin = 1.
        balancednegidx = True
        evalsplits = 1
        relembrep = True
        #usetypes=True
        #mode = "charword"
        #checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")

    #embed()

    # restrict gold to subject-only or predicate-only prediction if asked
    if subjpred is True and predpred is False:
        traingold = traingold[:, [0]]
        validgold = validgold[:, [0]]
        testgold = testgold[:, [0]]
    if predpred is True and subjpred is False:
        traingold = traingold[:, [1]]
        validgold = validgold[:, [1]]
        testgold = testgold[:, [1]]

    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim / 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

    entenc = EntEnc(
        SimpleSeq2Vec(indim=numwords,
                      inpembdim=memembdim,
                      innerdim=decinnerdim,
                      maskid=maskid,
                      bidir=membidir))

    numentembs = len(np.unique(entmat[:, 0]))
    if specids:  # include vectorembedder
        entenc = EntEmbEnc(entenc, numentembs, specemb)
    if relembrep:
        repsplit = entmat[relstarts, 0]
        entenc = EntEncRep(entenc, numentembs, repsplit)

        # adjust params for enc/dec construction
        #encinnerdim[-1] += specemb
        #innerdim[-1] += specemb

    encdec = SimpleSeqEncDecAtt(inpvocsize=numwords,
                                inpembdim=embdim,
                                encdim=encinnerdim,
                                bidir=bidir,
                                outembdim=entenc,
                                decdim=decinnerdim,
                                vecout=True,
                                statetrans="matdot")

    scorerargs = ([encdec, SeqUnroll(entenc)], {
        "argproc": lambda x, y, z: ((x, y), (z, )),
        "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim)
    })
    if sumhingeloss:
        scorerargs[1]["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1])

    #scorer.save("scorer.test.save")

    # TODO: below this line, check and test
    class PreProc(object):
        def __init__(self, entmat):
            self.f = PreProcE(entmat)

        def __call__(self, encdata, decsg,
                     decgold):  # gold: idx^(batsize, seqlen)
            return (encdata, self.f(decsg), self.f(decgold)), {}

    class PreProcE(object):
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            return self.em[x]

    transf = PreProc(entmat)

    class NegIdxGen(object):
        def __init__(self, rng, midsplit=None):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(
            self, datas, sgold, gold
        ):  # the whole target sequence is corrupted, corruption targets the whole set of entities and relations together
            if self.midsplit is None or not balancednegidx:
                return datas, sgold, np.random.randint(
                    self.min, self.max, gold.shape).astype("int32")
            else:
                # balanced corruption: draw from entity and relation id
                # ranges and mix them 50/50 per position
                entrand = np.random.randint(self.min, self.midsplit,
                                            gold.shape)
                relrand = np.random.randint(self.midsplit, self.max,
                                            gold.shape)
                mask = np.random.randint(0, 2, gold.shape)
                ret = entrand * mask + relrand * (1 - mask)
                return datas, sgold, ret.astype("int32")

    # ranking objective: plain score difference, hinge, or per-sequence
    # summed hinge (the latter pairs with aggregator=identity above)
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    if sumhingeloss:  #
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    traingoldshifted = shiftdata(traingold)
    validgoldshifted = shiftdata(validgold)

    #embed()
    # eval
    if preeval:
        tt.tick("pre-evaluating")
        s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg)
        eval = FullRankEval()
        pred, scores = s.decode(testdata,
                                testgold.shape[1],
                                candata=entmat,
                                canids=canids,
                                split=evalsplits,
                                transform=transf.f,
                                debug=printpreds)
        evalres = eval.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")

    negidxgenargs = ([numents], {"midsplit": relstarts})
    if debug:
        pass
        #negidxgenargs = ([numents], {})

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \
        .negsamplegen(NegIdxGen(*negidxgenargs[0], **negidxgenargs[1])).negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgoldshifted, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    #scorer.save("scorer.test.save")

    # eval
    tt.tick("evaluating")
    s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg)
    eval = FullRankEval()
    pred, scores = s.decode(testdata,
                            testgold.shape[1],
                            candata=entmat,
                            canids=canids,
                            split=evalsplits,
                            transform=transf.f,
                            debug=printpreds)
    if printpreds:
        print pred
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = eval.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")

    # save
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    # find the first unused results slot (0..999)
    for i in xrange(1000):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
Beispiel #11
0
 def rec(x_t, crit):
     """One scan step: concat element and criterion, project through
     self.W, and sum to a per-example score."""
     # x_t: (batsize, elem_dim), crit: (batsize, crit_dim)
     joined = T.concatenate([x_t, crit], axis=1)
     projected = T.dot(joined, self.W)  # (batsize, innerdim)
     return T.sum(projected, axis=1)  # (batsize, )
Beispiel #12
0
 def f(self, s):
     """Double s and attach two column-sum nodes as extra outputs
     (keys "b" and "c")."""
     doubled = s * 2
     extras = {"b": T.sum(doubled, axis=0), "c": T.sum(doubled, axis=0)}
     doubled.push_extra_outs(extras)
     return doubled
Beispiel #13
0
 def apply(self, x):
     """Project x through self.W; expose the per-row sums ("b") and their
     grand total ("c") as extra outputs."""
     projected = T.dot(x, self.W)
     row_sums = T.sum(projected, axis=1)
     total = T.sum(row_sums, axis=0)
     projected.push_extra_outs({"b": row_sums, "c": total})
     return projected
Beispiel #14
0
 def rec(x_t, crit):
     """Scan step: score x_t against crit through self.W."""
     # x_t: (batsize, elem_dim), crit: (batsize, crit_dim)
     score_terms = T.dot(T.concatenate([x_t, crit], axis=1),
                         self.W)  # (batsize, innerdim)
     return T.sum(score_terms, axis=1)  # (batsize, )
Beispiel #15
0
def run(
        epochs=50,
        mode="char",    # "char" or "word" or "charword"
        numbats=100,
        lr=0.1,
        wreg=0.000001,
        bidir=False,
        layers=1,
        encdim=200,
        decdim=400,
        embdim=100,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        preeval=False,
        sumhingeloss=False,
        checkdata=False,        # starts interactive shell for data inspection
        printpreds=False,
        subjpred=False,
        predpred=False,
        specemb=-1,
        balancednegidx=False,
        usetypes=False,
    ):
    """Train and evaluate an attention encoder-decoder ranking model
    (Python 2 script; earlier variant without relembrep/evalsplits).

    Loads data with readdata, builds a SimpleSeqEncDecAtt over the
    question with a SimpleSeq2Vec entity encoder as output embedding,
    trains a SeqMatchScore with negative sampling and a (hinge) ranking
    objective, evaluates with SeqEncDecRankSearch + FullRankEval, and
    appends the results to <scriptname>.results/<i>.res.
    """
    if debug:       # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        #preeval = True
        specemb = 100
        margin = 1.
        balancednegidx = True
        #usetypes=True
    # load the right file
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=specids, usetypes=usetypes)
    entmat = entmat.astype("int32")

    #embed()

    # restrict gold to subject-only or predicate-only prediction if asked
    if subjpred is True and predpred is False:
        traingold = traingold[:, [0]]
        validgold = validgold[:, [0]]
        testgold = testgold[:, [0]]
    if predpred is True and subjpred is False:
        traingold = traingold[:, [1]]
        validgold = validgold[:, [1]]
        testgold = testgold[:, [1]]


    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}
        def p(xids):
            return (" " if mode == "word" else "").join([rwd[xid] if xid > -1 else "" for xid in xids])
        embed()

    reventdic = {v: k for k, v in entdic.items()}
    revworddic = {v: k for k, v in worddic.items()}
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim/2]*memlayers
    else:
        decinnerdim = [decdim]*memlayers

    entenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=memembdim,
                         innerdim=decinnerdim,
                         maskid=-1,
                         bidir=membidir)

    if specids:     # include vectorembedder
        numentembs = len(np.unique(entmat[:, 0]))
        entenc = EntEmbEnc(entenc, numentembs, specemb)
        # adjust params for enc/dec construction
        #encinnerdim[-1] += specemb
        #innerdim[-1] += specemb

    encdec = SimpleSeqEncDecAtt(inpvocsize=numwords, inpembdim=embdim,
                    encdim=encinnerdim, bidir=bidir, outembdim=entenc,
                    decdim=decinnerdim, vecout=True, statetrans="matdot")

    scorerargs = ([encdec, SeqUnroll(entenc)],
                  {"argproc": lambda x, y, z: ((x, y), (z,)),
                   "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim)})
    if sumhingeloss:
        scorerargs[1]["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1])

    #scorer.save("scorer.test.save")

    # TODO: below this line, check and test
    class PreProc(object):
        def __init__(self, entmat):
            self.f = PreProcE(entmat)

        def __call__(self, encdata, decsg, decgold):        # gold: idx^(batsize, seqlen)
            return (encdata, self.f(decsg), self.f(decgold)), {}

    class PreProcE(object):
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            return self.em[x]

    transf = PreProc(entmat)

    class NegIdxGen(object):
        def __init__(self, rng, midsplit=None):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(self, datas, sgold, gold):    # the whole target sequence is corrupted, corruption targets the whole set of entities and relations together
            if self.midsplit is None or not balancednegidx:
                return datas, sgold, np.random.randint(self.min, self.max, gold.shape).astype("int32")
            else:
                # balanced corruption: draw from entity and relation id
                # ranges and mix them 50/50 per position
                entrand = np.random.randint(self.min, self.midsplit, gold.shape)
                relrand = np.random.randint(self.midsplit, self.max, gold.shape)
                mask = np.random.randint(0, 2, gold.shape)
                ret = entrand * mask + relrand * (1 - mask)
                return datas, sgold, ret.astype("int32")

    # !!! MASKS ON OUTPUT SHOULD BE IMPLEMENTED FOR VARIABLE LENGTH OUTPUT SEQS
    # ranking objective: plain score difference, hinge, or per-sequence
    # summed hinge (the latter pairs with aggregator=identity above)
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    if sumhingeloss:    #
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    traingoldshifted = shiftdata(traingold)
    validgoldshifted = shiftdata(validgold)

    #embed()
    # eval
    if preeval:
        tt.tick("pre-evaluating")
        s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg)
        eval = FullRankEval()
        pred, scores = s.decode(testdata, 0, testgold.shape[1],
                                candata=entmat, canids=canids,
                                transform=transf.f, debug=printpreds)
        evalres = eval.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")

    negidxgenargs = ([numents], {"midsplit": relstarts})
    if debug:
        pass
        #negidxgenargs = ([numents], {})

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \
        .negsamplegen(NegIdxGen(*negidxgenargs[0], **negidxgenargs[1])).negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgoldshifted, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    #scorer.save("scorer.test.save")

    # eval
    tt.tick("evaluating")
    s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg)
    eval = FullRankEval()
    pred, scores = s.decode(testdata, 0, testgold.shape[1],
                            candata=entmat, canids=canids,
                            transform=transf.f, debug=printpreds)
    if printpreds:
        print pred
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = eval.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")

    # save
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    # find the first unused results slot (0..99)
    for i in xrange(100):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
Beispiel #16
0
def run(
    negsammode="closest",  # "close" or "random"
    usetypes=True,
    mode="concat",  # "seq" or "concat" or "multi" or "multic" or "bino"
    glove=True,
    embdim=100,
    charencdim=100,
    charembdim=50,
    encdim=400,
    bidir=False,
    layers=1,
    charenc="rnn",  # "cnn" or "rnn"
    margin=0.5,
    lr=0.1,
    numbats=700,
    epochs=15,
    gradnorm=1.0,
    wreg=0.0001,
    loadmodel="no",
    debug=False,
    debugtest=False,
    forcesubjincl=False,
    randsameval=0,
    numtestcans=5,
    multiprune=-1,
    checkdata=False,
    testnegsam=False,
    testmodel=False,
    sepcharembs=False,
):
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    (subjmat, relmat), (subjdic, reldic), worddic, \
    subjinfo, (testsubjcans, relsperent) = readdata(debug=debug,
                                                    numtestcans=numtestcans if numtestcans > 0 else None)

    if usetypes:
        print "building type matrix"
        typmat = buildtypmat(subjmat, subjinfo, worddic)
        subjmat = np.concatenate([typmat, subjmat], axis=1)
        typlen = typmat.shape[1]

    relsamplespace = None
    subjsamplespace = None
    if negsammode == "closest" or negsammode == "close":
        relsamplespace, revind = buildrelsamplespace(relmat, worddic)
        subjsamplespace = loadsubjsamplespace()
    tt.tock("data loaded")

    if checkdata:
        embed()

    numwords = max(worddic.values()) + 1
    numsubjs = max(subjdic.values()) + 1
    numrels = max(reldic.values()) + 1
    maskid = -1
    numchars = 256

    nsrelsperent = relsperent if negsammode == "closest" else None

    if testnegsam:
        nig = NegIdxGen(numsubjs - 1,
                        numrels - 1,
                        relclose=relsamplespace,
                        subjclose=subjsamplespace,
                        relsperent=nsrelsperent)
        embed()

    if mode == "seq" or mode == "multi":
        decdim = encdim
    elif mode == "concat" or mode == "multic" or mode == "bino":
        decdim = encdim / 2
    else:
        raise Exception("unrecognized mode")

    print "{} mode: {} decdim".format(mode, decdim)

    # defining model
    if glove:
        wordemb = Glove(embdim).adapt(worddic)
    else:
        wordemb = WordEmb(dim=embdim, indim=numwords)

    charemb = VectorEmbed(indim=numchars, dim=charembdim)
    charemb2 = VectorEmbed(indim=numchars, dim=charembdim)
    if charenc == "cnn":
        print "using CNN char encoder"
        charenc = CNNSeqEncoder(inpemb=charemb,
                                innerdim=[charencdim] * 2,
                                maskid=maskid,
                                stride=1)
    elif charenc == "rnn":
        print "using RNN char encoder"
        charenc = RNNSeqEncoder(inpemb=charemb, innerdim=charencdim) \
            .maskoptions(maskid, MaskMode.AUTO)
    else:
        raise Exception("no other character encoding modes available")

    if bidir:
        encdim = encdim / 2

    if mode != "bino":
        if mode == "multi" or mode == "multic":
            wordenc = \
                SimpleSeq2MultiVec(inpemb=False, inpembdim=wordemb.outdim + charencdim,
                                   innerdim=encdim, bidir=bidir, numouts=2, mode="seq")
        else:
            encdim = [encdim] * layers
            wordenc = RNNSeqEncoder(inpemb=False,
                                    inpembdim=wordemb.outdim + charencdim,
                                    innerdim=encdim,
                                    bidir=bidir).maskoptions(MaskMode.NONE)

        question_encoder = TwoLevelEncoder(l1enc=charenc,
                                           l2emb=wordemb,
                                           l2enc=wordenc,
                                           maskid=maskid)

    else:
        question_encoder = BinoEncoder(charenc=charenc,
                                       wordemb=wordemb,
                                       maskid=maskid,
                                       scadim=100,
                                       encdim=encdim / 2,
                                       bidir=bidir,
                                       enclayers=layers,
                                       outdim=decdim,
                                       scabidir=True)

    # encode predicate on word level
    predemb = SimpleSeq2Vec(inpemb=wordemb,
                            innerdim=decdim,
                            maskid=maskid,
                            bidir=False,
                            layers=1)

    #predemb.load(relmat)

    scharemb = charemb2 if sepcharembs else charemb
    if usetypes:
        # encode subj type on word level
        subjtypemb = SimpleSeq2Vec(inpemb=wordemb,
                                   innerdim=int(np.ceil(decdim * 1. / 2)),
                                   maskid=maskid,
                                   bidir=False,
                                   layers=1)
        # encode subject on character level
        charbidir = True
        charencinnerdim = int(np.floor(decdim * 1. / 2))
        charenclayers = 1
        if charbidir:
            charencinnerdim /= 2
            charenclayers = 2
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=charencinnerdim,
                                maskid=maskid,
                                bidir=charbidir,
                                layers=charenclayers)
        subjemb = TypedSubjBlock(typlen, subjemb, subjtypemb)
    else:
        # encode subject on character level
        subjemb = SimpleSeq2Vec(inpemb=scharemb,
                                innerdim=decdim,
                                maskid=maskid,
                                bidir=False,
                                layers=1)
    #subjemb.load(subjmat)
    if testmodel:
        embed()
    # package
    if mode == "seq":
        lb = SeqLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "concat":
        lb = ConcatLeftBlock(question_encoder)
        rb = RightBlock(subjemb, predemb)
    elif mode == "multi" or mode == "multic":
        lb = MultiLeftBlock(question_encoder, mode)
        rb = RightBlock(subjemb, predemb)
    elif mode == "bino":
        lb = question_encoder
        rb = RightBlock(subjemb, predemb)
    else:
        raise Exception("unrecognized mode")
    scorer = SeqMatchScore(lb,
                           rb,
                           scorer=CosineDistance(),
                           aggregator=lambda x: x,
                           argproc=lambda x, y, z: ((x, ), (y, z)))

    obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    class PreProc(object):
        """Batch transform: maps gold (subj idx, rel idx) pairs to the
        corresponding rows of the subject / relation id matrices, passing
        the question data through unchanged."""

        def __init__(self, subjmat, relmat):
            # .ef / .rf are read externally (e.g. enttrans / reltrans below)
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, data, gold):  # gold: idxs-(batsize, 2)
            subjrows = self.ef(gold[:, 0])[0][0]  # subject id rows
            relrows = self.rf(gold[:, 1])[0][0]   # relation id rows
            return (data, subjrows, relrows), {}

    class PreProcE(object):
        """Like PreProc but for entity-side-only input: maps an index pair
        (subj idx, rel idx) to the matching matrix rows, without question
        data."""

        def __init__(self, subjmat, relmat):
            self.ef = PreProcEnt(subjmat)
            self.rf = PreProcEnt(relmat)

        def __call__(self, x):
            pair = (self.ef(x[:, 0])[0][0],   # subject rows
                    self.rf(x[:, 1])[0][0])   # relation rows
            return pair, {}

    class PreProcEnt(object):
        """Holds an id matrix wrapped in Val and slices rows by index.

        Returns the framework's (args, kwargs) transform convention:
        a one-element args tuple and an empty kwargs dict.
        """

        def __init__(self, mat):
            # NOTE(review): Val presumably wraps mat as a graph constant -- confirm
            self.entmat = Val(mat)

        def __call__(self, x):
            rows = self.entmat[x]
            return (rows, ), {}

    transf = PreProc(subjmat, relmat)

    if debug:
        embed()

    if epochs > 0 and loadmodel == "no":
        tt.tick("training")
        saveid = "".join([str(np.random.randint(0, 10)) for i in range(4)])
        print("CHECKPOINTING AS: {}".format(saveid))
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numsubjs-1, numrels-1,
                                    relclose=relsamplespace,
                                    subjclose=subjsamplespace,
                                    relsperent=nsrelsperent)) \
            .objective(obj).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm) \
            .validate_on([validdata, validgold]) \
            .autosavethis(scorer, "fullrank{}.model".format(saveid)) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained").tick()

        # saving
        #scorer.save("fullrank{}.model".format(saveid))
        print("SAVED AS: {}".format(saveid))

    if loadmodel is not "no":
        tt.tick("loading model")
        m = SeqMatchScore.load("fullrank{}.model".format(loadmodel))
        #embed()
        lb = m.l
        subjemb = m.r.subjenc
        predemb = m.r.predenc
        tt.tock("loaded model")

    # evaluation
    predictor = CustomPredictor(
        questionencoder=lb,
        entityencoder=subjemb,
        relationencoder=predemb,
        #mode=mode,
        enttrans=transf.ef,
        reltrans=transf.rf,
        debug=debugtest,
        subjinfo=subjinfo)

    tt.tick("predicting")
    if forcesubjincl:  # forces the intended subject entity to be among candidates
        for i in range(len(testsubjcans)):
            if testgold[i, 0] not in testsubjcans[i]:
                testsubjcans[i].append(testgold[i, 0])

    if randsameval > 0:  # generate random sampling eval data
        testsubjcans = np.random.randint(0, numsubjs,
                                         (testgold.shape[0], randsameval))
        testrelcans = np.random.randint(0, numrels,
                                        (testgold.shape[0], randsameval))
        testsubjcans = np.concatenate([testgold[:, 0:1], testsubjcans], axis=1)
        testrelcans = np.concatenate([testgold[:, 1:2], testrelcans], axis=1)
        testsubjcans = testsubjcans.tolist()
        testrelcans = testrelcans.tolist()
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relcans=testrelcans)
    else:
        prediction = predictor.predict(testdata,
                                       entcans=testsubjcans,
                                       relsperent=relsperent,
                                       multiprune=multiprune)
    tt.tock("predicted")
    tt.tick("evaluating")
    evalmat = prediction == testgold
    subjacc = np.sum(evalmat[:, 0]) * 1. / evalmat.shape[0]
    predacc = np.sum(evalmat[:, 1]) * 1. / evalmat.shape[0]
    totalacc = np.sum(np.sum(evalmat, axis=1) == 2) * 1. / evalmat.shape[0]
    print "Test results ::::::::::::::::"
    print "Total Acc: \t {}".format(totalacc)
    print "Subj Acc: \t {}".format(subjacc)
    print "Pred Acc: \t {}".format(predacc)
    tt.tock("evaluated")

    def subjinspect(subjrank, gold):
        """Format a ranked subject-candidate list for console inspection.

        Each (idx, score) pair becomes (label, score): indices present in
        subjinfo get name, joined aliases and relation count; the gold index
        is prefixed with "GOLD - ". Unknown indices pass through unchanged.
        """
        formatted = []
        for idx, score in subjrank:
            if idx in subjinfo:
                prefix = "GOLD - " if gold == idx else "       "
                label = (prefix + subjinfo[idx][0] + " (" +
                         " ".join(subjinfo[idx][1]) + ")" +
                         str(subjinfo[idx][3]) + " rels")
                formatted.append((label, score))
            else:
                formatted.append((idx, score))
        return formatted

    def inspectboth(hidecorrect=False, hidenotincan=False):
        """Walk joint subject+predicate rankings per test question.

        NOTE(review): this inspector appears unfinished -- it applies the
        skip filters but never prints or returns anything; compare with
        inspectsubjs / inspectpreds, which do the actual dumping.
        """
        rwd = {v: k for k, v in worddic.items()}  # id -> word (currently unused)
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]  # gold subject index
            predx = testgold[i, 1]  # gold predicate index
            subjrank = predictor.subjranks[i]
            predrank = predictor.relranks[i]
            # skip questions where both top-ranked candidates are correct
            if hidecorrect and subjx == subjrank[0][0] and predrank[0][
                    0] == predx:
                continue
            # optionally skip questions whose gold subject is not a candidate
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue

    def inspectsubjs(hidecorrect=False,
                     hidenotincan=False,
                     shownotincan=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.subjranks)):
            subjx = testgold[i, 0]
            subjrank = predictor.subjranks[i]
            if subjx == subjrank[0][0] and hidecorrect:  # only look for errors
                continue
            if subjx not in [k for k, v in subjrank]:
                if hidenotincan:
                    continue
            if shownotincan and subjx in [k for k, v in subjrank]:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i, wordids2string(
                    testdata[i, :, 0], rwd), "{} ({}) - {} rels --- {}".format(
                        *([
                            subjinfo[subjx][0], subjinfo[subjx][1],
                            subjinfo[subjx][3], subjinfo[subjx][2]
                        ] if subjx in
                          subjinfo else ["<UNK>", "<UNK>", "<UNK>", "<UNK>"])))
            inspres = subjinspect(subjrank, subjx)
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    def inspectpreds(hidecorrect=False):
        rwd = {v: k for k, v in worddic.items()}
        for i in range(len(predictor.relranks)):
            relx = testgold[i, 1]
            subjx = testgold[i, 0]
            relrank = predictor.relranks[i]
            if relx == relrank[0][0] and hidecorrect:
                continue
            print "test question {}: {} \t GOLD: {}".format(
                i, wordids2string(testdata[i, :, 0], rwd),
                wordids2string(relmat[relx, :], rwd))
            inspres = [(("GOLD - " if relx == x else "        ") +
                        wordids2string(relmat[x], rwd), y) for x, y in relrank]
            i = 1
            for inspre in inspres:
                print "{}:\t{}\t{}".format(i, inspre[1], inspre[0])
                if i % 50 == 0:
                    inp()
                i += 1
            inp()

    embed()