Example #1
# NOTE: only standard-library, numpy, Theano and IPython imports are listed
# here; the project-specific components used below (readdata, ticktock,
# VectorEmbed, EntEnc, SimpleSeq2Vec, EntEmbEnc, CustomEntEnc, CustomSeq2Pair,
# DotDistance, CosineDistance, SeqMatchScore, Val, CustomRankSearch,
# FullRankEval) are assumed to come from the surrounding project.
import os
import sys

import numpy as np
import theano.tensor as T
from IPython import embed  # interactive shell used for data inspection


def run(
    epochs=50,
    mode="char",  # "char" or "word" or "charword"
    numbats=1000,
    lr=0.1,
    wreg=0.000001,
    bidir=False,
    layers=1,
    encdim=200,
    decdim=200,
    embdim=100,
    negrate=1,
    margin=1.,
    hingeloss=False,
    debug=False,
    preeval=False,
    sumhingeloss=False,
    checkdata=False,  # starts interactive shell for data inspection
    printpreds=False,
    subjpred=False,
    predpred=False,
    specemb=-1,
    usetypes=False,
    evalsplits=50,
    cosine=False,
    loadmodel=False,
):
    if debug:  # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        preeval = True
        #specemb = 100
        margin = 1.
        evalsplits = 1
        #usetypes=True
        #mode = "charword"
        #checkdata = True
    # load the right file
    maskid = -1
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids, wordmat, chardic\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=True,
                   usetypes=usetypes, maskid=maskid)
    entmat = entmat.astype("int32")

    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}

        def p(xids):
            return (" " if mode == "word" else "").join(
                [rwd[xid] if xid > -1 else "" for xid in xids])

        embed()

    print(traindata.shape, traingold.shape, testdata.shape, testgold.shape)

    tt.tock("data loaded")

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [encdim // 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim // 2] * memlayers
    else:
        decinnerdim = [decdim] * memlayers

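    # encoders: a token embedder for the question, a sequence encoder over
    # entity labels (rows of entmat) for subjects, a plain embedding for
    # relations, and a combined entity/relation encoder split at repsplit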
    emb = VectorEmbed(numwords, embdim)

    subjenc = EntEnc(
        SimpleSeq2Vec(invocsize=numwords,
                      inpembdim=embdim,
                      innerdim=decinnerdim,
                      maskid=maskid,
                      bidir=membidir))

    numentembs = len(np.unique(entmat[:, 0]))
    repsplit = entmat[relstarts, 0]
    if specids:  # include vectorembedder
        subjenc = EntEmbEnc(subjenc, numentembs, specemb)
    predenc = VectorEmbed(indim=numents - relstarts + 1,
                          dim=subjenc.outdim,
                          init="zero")
    entenc = CustomEntEnc(subjenc, predenc, repsplit)

    inpenc = CustomSeq2Pair(inpemb=emb,
                            encdim=encinnerdim,
                            scadim=encinnerdim,
                            enclayers=layers,
                            scalayers=layers,
                            bidir=bidir,
                            maskid=maskid,
                            outdim=subjenc.outdim)

    # adjust params for enc/dec construction
    # encinnerdim[-1] += specemb
    # innerdim[-1] += specemb

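    # score question encodings against candidate entity/relation encodings
    # with a dot product (or cosine distance); with sumhingeloss the
    # per-position scores are left unaggregated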
    dist = DotDistance() if not cosine else CosineDistance()
    scorerkwargs = {"argproc": lambda x, y: ((x, ), (y, )), "scorer": dist}
    if sumhingeloss:
        scorerkwargs["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(inpenc, entenc, **scorerkwargs)

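    # preprocessors that replace id inputs by rows of a lookup matrix
    # (entmat for gold entity/relation ids, wordmat for word-level input)
    # before they reach the encoders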
    class PreProc(object):
        def __init__(self, entmat, wordmat=None):
            self.f = PreProcE(entmat)
            self.w = PreProcL(wordmat) if wordmat is not None else wordmat

        def __call__(self, encdata, decgold):  # gold: idx^(batsize, seqlen)
            if self.w is not None:
                encdata = self.w(encdata)[0][0]
            if self.f is not None:
                decgold = self.f(decgold)[0][0]
            return (encdata, decgold), {}

    class PreProcE(object):
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    class PreProcL(object):
        def __init__(self, wordmat):
            self.em = Val(wordmat)

        def __call__(self, x):
            ret = self.em[x]
            return (ret, ), {}

    transf = PreProc(entmat)

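    # negative sample generator: for every gold (subject, relation) pair,
    # draw a random entity id from [0, midsplit) and a random relation id
    # from [midsplit, rng)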
    class NegIdxGen(object):
        def __init__(self, rng, midsplit):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(self, datas, gold):
            entrand = np.random.randint(self.min, self.midsplit,
                                        (gold.shape[0], 1))
            relrand = np.random.randint(self.midsplit, self.max,
                                        (gold.shape[0], 1))
            ret = np.concatenate([entrand, relrand], axis=1)
            return datas, ret.astype("int32")

    #embed()

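    # ranking objectives: plain score difference, hinge loss with a margin,
    # or the hinge loss summed over the output positions (sumhingeloss)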
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.inf)
    if sumhingeloss:
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.inf), axis=1)

    # embed()
    # eval
    if preeval:
        tt.tick("pre-evaluating")
        s = CustomRankSearch(inpenc,
                             entenc,
                             scorer.s,
                             scorer.agg,
                             relstarts=relstarts)
        evaluator = FullRankEval()
        pred, scores = s.search(testdata,
                                testgold.shape[1],
                                candata=entmat,
                                canids=canids,
                                split=evalsplits,
                                transform=transf.f,
                                debug=printpreds)
        evalres = evaluator.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")

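    # train the scorer with negative sampling, adagrad and L2 regularization,
    # or load a previously saved scorer instead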
    if not loadmodel:
        tt.tick("training")
        nscorer = scorer.nstrain([traindata, traingold]).transform(transf) \
            .negsamplegen(NegIdxGen(numents, relstarts)).negrate(negrate).objective(obj) \
            .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
            .validate_on([validdata, validgold]) \
            .train(numbats=numbats, epochs=epochs)
        tt.tock("trained")
        scorer.save("customfullrank.scorer.save")
    else:
        scorer = SeqMatchScore.load("customfullrank.scorer.save")

    # eval
    tt.tick("evaluating")

    s = CustomRankSearch(inpenc,
                         entenc,
                         scorer.s,
                         scorer.agg,
                         relstarts=relstarts)
    evaluator = FullRankEval()
    pred, scores = s.search(testdata,
                            testgold.shape[1],
                            candata=entmat,
                            canids=canids,
                            split=evalsplits,
                            transform=transf.f,
                            debug=printpreds)
    if printpreds:
        print(pred)
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = evaluator.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")

    # save
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    for i in range(1000):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
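
A minimal invocation sketch (hypothetical parameter values; run() is the function defined above, and in the original project it would typically be driven by command-line arguments rather than called directly):

if __name__ == "__main__":
    # hypothetical direct call; any keyword argument of run() can be
    # overridden here
    run(epochs=50, mode="char", numbats=1000, lr=0.1, bidir=False)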