Example #1
def readdata_word(trainp, testp, maxlen=100, masksym=-1):
    tt = ticktock("data reader")

    def readdataset(p, wdic, maxlen=100):
        dataret = []
        goldret = []
        toolong = 0
        realmaxlen = 0
        with open(p) as f:
            data = csv.reader(f, delimiter=",")
            for row in data:
                rowelems = tokenize(row[2])
                realmaxlen = max(realmaxlen, len(rowelems))
                if len(rowelems) > maxlen:
                    toolong += 1
                for rowelem in set(rowelems):
                    if rowelem not in wdic:
                        wdic[rowelem] = len(wdic)
                dataret.append([wdic[x] for x in rowelems])
                goldret.append(row[0])
        print "{} comments were too long".format(toolong)
        maxlen = min(maxlen, realmaxlen)
        datamat = np.ones((len(dataret) - 1, maxlen)).astype("int32") * masksym  # minus 1: row 0 of the CSV is the header
        for i in range(1, len(dataret)):
            datamat[i - 1, :min(len(dataret[i]), maxlen)] = dataret[i][:min(len(dataret[i]), maxlen)]
        return datamat, np.asarray(goldret[1:], dtype="int32"), wdic

    tt.tick("reading data")
    traindata, traingold, wdic = readdataset(trainp, {}, maxlen=maxlen)
    testdata, testgold, wdic = readdataset(testp, wdic=wdic, maxlen=maxlen)
    tt.tock("data read")
    return (traindata, traingold), (testdata, testgold), wdic
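The padding scheme above can be checked in isolation. A minimal sketch with hypothetical token ids: variable-length id lists are written into a fixed-width int32 matrix pre-filled with the mask symbol, and anything beyond maxlen is truncated.

import numpy as np

seqs = [[3, 1, 4], [1, 5], [9, 2, 6, 5]]  # hypothetical token-id lists
masksym, maxlen = -1, 3
mat = np.ones((len(seqs), maxlen), dtype="int32") * masksym
for i, seq in enumerate(seqs):
    mat[i, :min(len(seq), maxlen)] = seq[:maxlen]
print(mat)  # [[3 1 4], [1 5 -1], [9 2 6]]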
Example #2
def buildsamplespace(entmat, wd, maskid=-1):
    tt = ticktock("samplespace")
    tt.tick("making sample space")
    #rwd = {v: k for k, v in wd.items()}
    entmatm = sparse.dok_matrix((entmat.shape[0], np.max(entmat) + 1))
    posblacklist = {0: {wd["base"], wd["user"]}}
    blacklist = set([wd[x] for x in "default domain of by the in at s this for with type".split()])
    #revin = {k: set() for k in np.unique(entmat)}
    #revinm = sparse.dok_matrix((np.max(entmat), entmat.shape[0]))
    samdic = {k: set() for k in range(entmat.shape[0])}     # from ent ids to sets of ent ids
    #samdic = np.zeros((entmat.shape[0], entmat.shape[0]))
    for i in range(entmat.shape[0]):
        for j in range(entmat.shape[1]):
            w = entmat[i, j]
            if w == maskid:     # beginning of padding
                break
            if j in posblacklist:
                if w in posblacklist[j]:
                    continue
            if w in blacklist:
                continue
            entmatm[i, w] = 1
            #for oe in revin[w]:     # other entities already in revind
            #    samdic[oe].add(i)
            #    samdic[i].add(oe)
            #revin[w].add(i)
            #revinm[w, i] = 1
    samdicm = entmatm.dot(entmatm.T)
    for i in range(samdicm.shape[0]):
        samdic[i] = list(np.argwhere(samdicm[i, :])[:, 1])
    tt.tock("made sample space")
    return samdic, entmatm.T
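The key step above is `entmatm.dot(entmatm.T)`: with entities as rows and word ids as columns, entry (i, j) of the product counts the words entities i and j share, so the nonzeros in row i form entity i's sample space. A toy sketch with a hypothetical incidence matrix:

import numpy as np
from scipy import sparse

entmatm = sparse.dok_matrix((3, 4))        # 3 entities, 4 word ids
entmatm[0, 1] = 1; entmatm[0, 2] = 1       # entity 0 mentions words 1 and 2
entmatm[1, 2] = 1                          # entity 1 mentions word 2
entmatm[2, 3] = 1                          # entity 2 mentions word 3
co = entmatm.dot(entmatm.T).toarray()      # (i, j) = number of shared words
print(np.nonzero(co[0])[0])                # entities sharing a word with 0 -> [0 1]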
Example #3
 def __init__(self,
              model,
              canenc,
              scorer,
              agg,
              relstarts=0,
              *buildargs,
              **kw):
     super(CustomRankSearch, self).__init__(**kw)
     self.model = model
     self.scorer = scorer
     self.canenc = canenc
     self.agg = agg
     self.tt = ticktock("RankSearch")
     self.ott = ticktock("RankSearch")
     self.relstarts = relstarts
Example #4
 def load(p):
     tt = ticktock("SubjectSearch")
     tt.tick("loading")
     d = {}
     l = []
     k = None
     with open(p) as f:
         for line in f:
             if line[:2] == "::":
                 if k is None:
                     assert (l == [])
                 else:
                     d[k] = l
                     l = []
                 k = line[2:-1]
             else:
                 splits = line[:-1].split("\t")
                 le = dict(
                     zip("fb_id triplecount type_id type_name".split(),
                         [splits[0], int(splits[1])] + splits[2:]))
                 l.append(le)
     d[k] = l
     tt.tock("loaded")
     ret = SubjectSearch(subjinfop=d, revind=SubjectSearch.buildrevindex(d))
     return ret
Example #5
 def build(self, p):
     i = 0
     tt = ticktock("builder")
     tt.tick("building")
     for line in open(p):
         sline = line[:-1].split("\t")
         fb_id = sline[0]
         triplecount = int(sline[1]) + int(sline[2])
         name = self.processor.processline(sline[3])
         type_id = sline[4]
         type_id = type_id if type_id != "<UNK>" else None
         type_name = " ".join(tokenize(sline[5]))
         type_name = type_name if type_name != " ".join(
             tokenize("<UNK>")) else None
         if name not in self.indexdict:
             self.indexdict[name] = []
         self.indexdict[name].append({
             "fb_id": fb_id,
             "triplecount": triplecount,
             "type_id": type_id,
             "type_name": type_name
         })
         i += 1
         if i % 1000 == 0:
             tt.live("{}k".format(i // 1000))
     tt.tock("built")
Example #6
def readdata_char(trainp, testp, maxlen=1000, masksym=-1):
    tt = ticktock("data reader")
    def readdataset(p):
        dataret = []
        goldret = []
        toolong = 0
        with open(p) as f:
            data = csv.reader(f, delimiter=",")
            for row in data:
                if len(row[2]) > maxlen:
                    toolong += 1
                dataret.append([ord(x) for x in row[2]])
                goldret.append(row[0])
        print "{} comments were too long".format(toolong)
        datamat = np.ones((len(dataret)-1, maxlen)).astype("int32") * masksym  # minus 1: row 0 of the CSV is the header
        for i in range(1, len(dataret)):
            datamat[i-1, :min(len(dataret[i]), maxlen)] = dataret[i][:min(len(dataret[i]), maxlen)]
        return datamat, np.asarray(goldret[1:], dtype="int32")
    tt.tick("reading data")
    traindata, traingold = readdataset(trainp)
    testdata, testgold = readdataset(testp)
    allchars = set(list(np.unique(traindata))).union(set(list(np.unique(testdata))))
    chardic = dict(zip(list(allchars), range(len(allchars))))
    chardic[masksym] = masksym
    traindata = np.vectorize(lambda x: chardic[x])(traindata)
    testdata = np.vectorize(lambda x: chardic[x])(testdata)
    chardic = {chr(k): v for k, v in chardic.items() if k != masksym}
    tt.tock("data read")
    return (traindata, traingold), (testdata, testgold), chardic
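The remapping at the end compresses raw character ordinals into a dense id range while leaving the mask symbol untouched. A small sketch of that step alone:

import numpy as np

datamat = np.array([[104, 105, -1]], dtype="int32")  # ords of "hi", mask = -1
allchars = set(int(x) for x in np.unique(datamat)) - {-1}
chardic = dict(zip(sorted(allchars), range(len(allchars))))
chardic[-1] = -1                                     # mask maps to itself
print(np.vectorize(lambda x: chardic[x])(datamat))   # [[0 1 -1]]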
Example #7
def wordmat2charmat(wordmat,
                    worddic=None,
                    rwd=None,
                    maxlen=100,
                    raretoken="<RARE>",
                    maskid=-1):
    assert (worddic is not None or rwd is not None)
    assert (not (worddic is not None and rwd is not None))
    tt = ticktock("wordmat2charmat")
    tt.tick("transforming word mat to char mat")
    toolong = 0
    charmat = maskid * np.ones((wordmat.shape[0], maxlen), dtype="int32")
    if rwd is None:
        rwd = {v: (k if k != raretoken else " ") for k, v in worddic.items()}
    else:
        rwd = dict([(k, (v if v != raretoken else " "))
                    for k, v in rwd.items()])
    realmaxlen = 0
    for i in range(wordmat.shape[0]):
        s = wordids2string(wordmat[i], rwd, maskid=maskid)
        s = s[:min(len(s), maxlen)]
        realmaxlen = max(len(s), realmaxlen)
        if len(s) > maxlen:
            toolong += 1
        charmat[i, :len(s)] = [ord(ch) for ch in s]
        tt.progress(i, wordmat.shape[0], live=True)
    if realmaxlen < maxlen:
        charmat = charmat[:, :realmaxlen]
    if toolong > 0:
        print "{} too long".format(toolong)
    tt.tock("transformed")
    return charmat
Example #8
def loadsubjinfo(entinfp, entdic, cachep=None):  #"subjinfo.cache.pkl"):
    tt = ticktock("subjinfoloader")

    def make():
        tt.tick("making subject info from file")
        subjinf = {}
        c = 0
        for line in open(entinfp):
            subjuri, subjc, objc, subjname, typuri, typname = line[:-1].split("\t")
            subjinf[entdic[subjuri]] = (subjname, typname.lower().split(),
                                        typuri, subjc, objc)
            if c % 1000 == 0:
                tt.live(str(c))
            c += 1
        tt.tock("made subject info from file")
        return subjinf

    if cachep is not None:
        if os.path.isfile(cachep):  # load
            tt.tick("loading cached subject info")
            subjinfo = pickle.load(open(cachep))
            tt.tock("loaded cached subject info")
        else:  # make  and dump
            subjinfo = make()
            tt.tick("dumping subject info in cache")
            pickle.dump(subjinfo, open(cachep, "w"))
            tt.tock("dumped subject info in cache")
    else:  # just make
        subjinfo = make()
    return subjinfo
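The cache handling above is the usual make-or-load pattern; a self-contained sketch of just that pattern (hypothetical build function and cache path), using binary mode so it also works on Python 3:

import os
import pickle

def cached(build, cachep=None):
    # load the pickled object if a cache file exists, else build and dump it
    if cachep is not None and os.path.isfile(cachep):
        with open(cachep, "rb") as f:
            return pickle.load(f)
    obj = build()
    if cachep is not None:
        with open(cachep, "wb") as f:
            pickle.dump(obj, f)
    return obj

subjinfo = cached(lambda: {"m.123": ("name", ["type"])}, cachep=None)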
Example #9
def loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords,
                numchars):
    tt = ticktock("fblexdataloader")
    tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()

    indata = FreebaseEntFeedsMaker(fblexpath,
                                   gd,
                                   ed,
                                   numwords=numwords,
                                   numchars=numchars,
                                   unkwordid=wordoffset - 1)
    datanuments = max(indata.goldfeed) + 1
    tt.tick()
    print "max entity id+1: %d" % datanuments
    indata.trainfeed[0:9000]  # access a slice to trigger the feed's transformation
    tt.tock("transformed")
    #embed()

    traindata = indata.trainfeed
    golddata = indata.goldfeed

    return traindata, golddata, vocnuments, vocnumwords, datanuments
Example #10
def loaddata(numtestcans=5):
    tt = ticktock("dataloader")
    tt.tick("loading data")
    p = "../../../../data/simplequestions/clean/datamat.word.fb2m.pkl"
    entinfp = "../../../../data/simplequestions/clean/subjs-counts-labels-types.fb2m.tsv"
    x = pickle.load(open(p))
    tt.tock("datamat loaded")
    worddic = x["worddic"]
    entdic = x["entdic"]
    entmat = x["entmat"]
    numents = x["numents"]
    traindata, traingold = x["train"]
    validdata, validgold = x["valid"]
    testdata, testgold = x["test"]
    traingold[:, 1] -= numents
    validgold[:, 1] -= numents
    testgold[:, 1] -= numents

    rwd = {v: k for k, v in worddic.items()}

    subjdic = {k: v for k, v in entdic.items() if v < numents}
    reldic = {k: v - numents for k, v in entdic.items() if v >= numents}

    subjinfo = loadsubjinfo(entinfp, subjdic)
    testsubjcans = loadsubjtestcans(numcans=numtestcans)
    testrelcans, relspersubj = loadreltestcans(testgold, subjdic, reldic)
    return testgold, testsubjcans, relspersubj
Example #11
def gencans(data,
            top=50,
            exact=True,
            rwd=None,
            ed=None,
            host=None,
            index=None):
    idx = SimpleQuestionsLabelIndex(host=host, index=index)
    # transform data using worddic and search
    sentences = []
    cans = []
    tt = ticktock("candidate generator")
    tt.tick("generating cans")
    for i in range(data.shape[0]):
        sentence = " ".join(
            map(lambda x: rwd[x], filter(lambda x: x in rwd, data[i, :])))
        sentences.append(sentence)
        searchres = idx.searchsentence(sentence, exact=exact, top=top)
        scans = map(lambda (x, (y, z)): ed[x], searchres.items())
        if i % 10 == 0:
            tt.live("%d of %d" % (i, data.shape[0]))
        cans.append(scans)
    tt.stoplive()
    tt.tock("generated cans")
    return cans
Example #12
def loadreltestcans(
        testgold,
        subjdic,
        reldic,
        relsperentp="../../../../data/simplequestions/allrelsperent.dmp"):
    tt = ticktock("test rel can loader")
    testsubjs = testgold[:, 0]
    relsperent = {}  #{k: ([], []) for k in set(list(testsubjs))}
    tt.tick("loading rel test cans")
    for line in open(relsperentp):
        subj, relsout, relsin = line[:-1].split("\t")
        if subj in subjdic:
            relsperent[subjdic[subj]] = (
                [reldic[x]
                 for x in relsout.split(" ")] if relsout != "" else [],
                [reldic[x] for x in relsin.split(" ")] if relsin != "" else [])
        #if subj in subjdic and subjdic[subj] in relsoftestsubjs:
        #    relsoftestsubjs[subjdic[subj]] = (
        #        [reldic[x] for x in relsout.split(" ")] if relsout != "" else [],
        #        [reldic[x] for x in relsin.split(" ")] if relsin != "" else []
        #    )
    tt.tock("test cans loaded")
    relsoftestexamples = [(relsperent[x][0], relsperent[x][1])
                          for x in testsubjs]
    return relsoftestexamples, relsperent
Example #13
 def __init__(self,
              model,
              canenc,
              scorer,
              agg,
              beamsize=1,
              *buildargs,
              **kw):
     super(SeqEncDecRankSearch, self).__init__(**kw)
     self.model = model
     self.beamsize = beamsize
     self.mu = SeqEncDecPredictor(model, *buildargs)
     self.scorer = scorer
     self.canenc = canenc
     self.agg = agg
     self.tt = ticktock("RankSearch")
     self.ott = ticktock("RankSearch")
Example #14
def loadsubjtestcans(p="../../../../data/simplequestions/clean/testcans{}.pkl",
                     numcans=None):
    tt = ticktock("test subjects candidate loader")
    tt.tick("loading candidates")
    p = p.format(
        "{}c".format(numcans)) if numcans is not None else p.format("")
    ret = pickle.load(open(p))
    tt.tock("candidates loaded")
    return ret
Example #15
 def oldpredict(self, data, entcans, relsperent):
     tt = ticktock("predictor")
     tt.tick("computing question encodings")
     qencodings = self.qenc.predict(data)  # (numsam, encdim)
     tt.tock("computed question encodings")
     tt.tick("predicting")
     ret = np.zeros((data.shape[0], 2), dtype="int32")
     if self.mode == "concat":
         mid = qencodings.shape[1] / 2
         qencforent = qencodings[:, :mid]
         qencforrel = qencodings[:, mid:]
     elif self.mode == "seq":
         qencforent = qencodings[:, :]
         qencforrel = qencodings[:, :]
     else:
         raise Exception("unrecognized mode")
     for i in range(qencodings.shape[0]):
         # predict subject
         if len(entcans[i]) == 0:
             bestsubj = -1
         elif len(entcans[i]) == 1:
             bestsubj = entcans[i][0]
         else:
             entembs = self.eenc.predict.transform(self.enttrans)(
                 entcans[i])
             entscoresi = np.tensordot(qencforent[i], entembs, axes=(0, 1))
             scoredentcans = sorted(zip(entcans[i], entscoresi),
                                    key=lambda (x, y): y,
                                    reverse=True)
             bestsubj = scoredentcans[0][0]
             if self.debug:
                 embed()
         ret[i, 0] = bestsubj
         # predict relation
          relcans = relsperent[ret[i, 0]][0] if ret[i, 0] in relsperent else []
         if len(relcans) == 0:
             bestrel = -1
         elif len(relcans) == 1:
             bestrel = relcans[0]
         else:
             if self.debug:
                 embed()
             relembs = self.renc.predict.transform(self.reltrans)(relcans)
             relscoresi = np.tensordot(qencforrel[i], relembs, axes=(0, 1))
             scoredrelcans = sorted(zip(relcans, relscoresi),
                                    key=lambda (x, y): y,
                                    reverse=True)
             bestrel = scoredrelcans[0][0]
         ret[i, 1] = bestrel
         if self.debug:
             embed()
         tt.progress(i, qencodings.shape[0], live=True)
     tt.tock("predicted")
     return ret
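The candidate scoring inside the loop is a plain dot product of one question encoding against a stack of candidate embeddings; `np.tensordot(q, embs, axes=(0, 1))` is equivalent to `embs.dot(q)`. A toy check:

import numpy as np

qvec = np.random.rand(5)                          # one question encoding
cands = np.random.rand(3, 5)                      # 3 candidate embeddings
scores = np.tensordot(qvec, cands, axes=(0, 1))   # shape (3,)
assert np.allclose(scores, cands.dot(qvec))
best = sorted(zip(range(3), scores), key=lambda t: t[1], reverse=True)[0][0]
print(best)                                       # index of the highest-scoring candidate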
Example #16
def run(
    epochs=10,
    batsize=100,
    lr=0.1,
    embdim=200,
    encdim=300,
    layers=1,
    type="rnn",  # rnn or cnn
    clean=False,
    rarefreq=4,
    p="../../data/simplequestions/datamat.word.mem.fb2m.pkl",
):
    # load data for classification
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        entdic, entmat, worddic, numents = readdata(p, clean=clean, rarefreq=rarefreq)
    numrels = len(entdic)
    traingold = np_utils.to_categorical(traingold, nb_classes=numrels)
    validgold = np_utils.to_categorical(validgold, nb_classes=numrels)
    testgold = np_utils.to_categorical(testgold, nb_classes=numrels)
    tt.tock("loaded data")
    # model
    tt.tick("building model")
    m = Sequential()
    m.add(Embedding(len(worddic) + 1, embdim, mask_zero=True))
    if type == "rnn":
        print("doing RNN")
        for i in range(layers - 1):
            m.add(GRU(encdim, return_sequences=True))
        m.add(GRU(encdim, return_sequences=False))
    elif type == "cnn":
        print("doing CNN")
        for i in range(layers):
            m.add(Convolution1D(encdim, encdim))
        m.add(GlobalMaxPooling1D())
    m.add(Dense(len(entdic)))
    m.add(Activation("softmax"))

    m.compile(loss="categorical_crossentropy",
              optimizer=Adadelta(lr=lr),
              metrics=["accuracy"])
    tt.tock("built model")
    tt.tick("training")
    m.fit(traindata,
          traingold,
          nb_epoch=epochs,
          batch_size=batsize,
          validation_data=(validdata, validgold))
    tt.tock("trained")
    tt.tick("testing")
    score, acc = m.evaluate(testdata, testgold, batch_size=batsize)
    print("Score: {}\nAccuracy: {}".format(score, acc))
    tt.tock("tested")
Example #17
 def eval(self,
          data,
          gold,
          transform=None,
          savep=None
          ):  # data: wordidx^(batsize, seqlen), gold: entidx^(batsize)
     # generate candidates
     if os.path.isfile("testcans.pkl"):
         cans = self.loadcans("testcans.pkl")
     else:
         cans = gencans(data,
                        host=self.host,
                        index=self.index,
                        rwd=self.rwd,
                        ed=self.ed)  # list of lists of entidx
         pickle.dump(cans, open("testcans.pkl", "w"))
     assert len(cans) == data.shape[0] == gold.shape[0]
     #        embed()
     predictor = self.scorer.predict.transform(transform)
     tt = ticktock("evaluator")
     tt.tick("evaluating...")
     nocans = 0
     nogoldcan = 0
     tosave = {}
     for i in range(data.shape[0]):
         numcans = len(cans[i])
         if gold[i] not in cans[i]:
             nogoldcan += 1
         predinp = [
             np.repeat(np.expand_dims(data[i, :], axis=0), numcans, axis=0),
             np.asarray(cans[i], dtype="int32")
         ]
         #print predinp, "%d/%d" % (i, data.shape[0]), numcans
         if numcans > 0:
             predinpscores = predictor(*predinp)  # (numcans,)
             ranking = sorted(zip(cans[i], list(predinpscores)),
                              key=lambda (x, y): y,
                              reverse=True)
             tosave[i] = (gold[i], ranking)
             for metric in self.metrics:
                 metric.accumulate([gold[i]], ranking)
         else:
             nocans += 1
         if i % 100 == 0:
             tt.live("evaluated: %.2f%%" % (i * 100. / data.shape[0]))
     tt.tock("evaluated")
     if savep is not None:
         tt.tick("saving")
         pickle.dump(tosave, open(savep, "w"))
         tt.tock("saved")
     print "no cans for %d questions" % nocans
     print "gold not among cans for %d questions" % nogoldcan
     return self.metrics
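The `predinp` construction turns one question and its candidate list into a scoring batch by repeating the question row once per candidate. In isolation, with hypothetical ids:

import numpy as np

row = np.array([3, 1, 4], dtype="int32")   # one question as word ids
cans = [7, 8]                              # hypothetical candidate entity ids
predinp = [np.repeat(np.expand_dims(row, axis=0), len(cans), axis=0),
           np.asarray(cans, dtype="int32")]
print(predinp[0].shape, predinp[1].shape)  # (2, 3) (2,)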
Example #18
    def test_ns_training(self):
        num = 2000
        self.expshape = (num, 50)
        Glove.defaultpath = "../../../data/glove/miniglove.%dd.txt"
        self.glove = Glove(self.expshape[1], self.expshape[0])
        self.cemb = VectorEmbed(indim=self.expshape[0] + 1,
                                dim=self.expshape[1])
        self.assertRaises(Exception, self.glove.block.predict, [num + 1])
        self.assertRaises(Exception, self.cemb.predict, [num + 1])

        m = MatchScore(self.glove.block, self.cemb, scorer=CosineDistance())
        mg = MatchScore(self.glove.block,
                        self.glove.block)  # TODO factor out matchscore tests
        idxs = np.arange(num + 1)

        # glove against glove
        self.assertTrue(
            np.allclose(mg.predict([num, 100], [num, 100]), [
                np.linalg.norm(self.glove % num)**2,
                np.linalg.norm(self.glove % 100)**2
            ]))

        class NegIdxGen():
            def __init__(self, num):
                self.n = num

            def __call__(self, l, r):
                return l, np.random.randint(0, self.n, r.shape)

        vdata = np.arange(num)
        negrate = 5

        def obj(p, n):
            return n - p
        m, err, verr, _, _ = m.nstrain([idxs, idxs]).negsamplegen(NegIdxGen(num+1)).negrate(negrate)\
            .adagrad(lr=0.1).objective(obj) \
            .validate_on([vdata, vdata]).extvalid(geteval(m.predict, num, negrate)).validinter(30) \
            .train(numbats=50, epochs=29, returnerrors=True)
        #.writeresultstofile("testingresultswriter.tsv") \

        tdata = np.arange(num)
        tt = ticktock("eval")
        tt.tick()
        mrr, recat1, recat10 = geteval(m.predict, num, 1)(tdata)
        tt.tock("evaluated test data")
        print "%.4f MRR,\t%.4f MR@10,\t%.4f MR@1" % (mrr, recat10, recat1)
        self.assertGreater(mrr, 0.85)
        self.assertGreater(recat10, 0.9)
        print verr
        self.assertTrue(
            np.allclose(np.asarray([mrr, recat1, recat10]),
                        np.asarray(verr[-1][1:])))
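The training objective `obj(p, n) = n - p` rewards positive scores above sampled negatives; the margin variant used in later examples clips at zero once the positive leads by the margin. Numerically:

import numpy as np

p = np.array([2.0, 0.5])   # scores of true pairs
n = np.array([1.0, 1.5])   # scores of sampled negatives
print(n - p)                             # plain ranking loss: [-1.  1.]
print((n - p + 1.0).clip(0, np.infty))   # margin-1 variant:   [0.  2.]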
Example #19
 def searchwordmat(self, wordmat, wd, top=5):
     cans = []
     rwd = {v: k for k, v in wd.items()}
     tt = ticktock("wordmatsearcher")
     tt.tick("started searching")
     for i in range(wordmat.shape[0]):
         sentence = wordids2string(wordmat[i], rwd=rwd)
         #ssentence.replace(" '", "")
         res = self.searchsentence(sentence, top=top)
         cans.append([r["fb_id"] for r in res])
         tt.progress(i, wordmat.shape[0], live=True)
     tt.tock("done searching")
     return cans
Example #20
def randgen(entcans, relsperent):
    tt = ticktock("randgen")
    tt.tick("generating")
    mat = np.zeros((len(entcans), 2), dtype="int32")
    for i in range(mat.shape[0]):
        cans = entcans[i]
        mat[i, 0] = random.sample(cans, 1)[0] if len(cans) > 0 else -1  # only those appearing as subject
        cans = relsperent[mat[i, 0]][0] if mat[i, 0] >= 0 else []
        mat[i, 1] = random.sample(cans, 1)[0] if len(cans) > 0 else -1  # only outgoing relations of predicted subject
    tt.tock("generated")
    return mat
Example #21
def loaddata(worddic, fbentdicp, fblexpath, wordoffset, numwords):
    tt = ticktock("fblexdataloader") ; tt.tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()

    indata = FBSeqFeedsMaker(fblexpath, ed, worddic, numwords=numwords)
    datanuments = np.max(indata.goldfeed)+1
    tt.tick()
    indata.trainfeed[0:9000]  # access a slice to trigger the feed's transformation
    tt.tock("transformed")
    #embed()

    traindata = indata.trainfeed
    golddata = indata.goldfeed + 1  # no entity = id 0

    return traindata, golddata, vocnuments, len(worddic)+1, datanuments+1, ed
Example #22
def readdata(p="../../../../data/simplequestions/clean/datamat.word.fb2m.pkl",
             relsperentp="../../../../data/simplequestions/allrelsperent.dmp",
             wordchar=False):
    tt = ticktock("dataloader")
    tt.tick("loading datamat")
    x = pickle.load(open(p))
    tt.tock("datamat loaded")
    worddic = x["worddic"]
    entdic = x["entdic"]
    numents = x["numents"]
    entmat = x["entmat"]
    traindata, traingold = x["train"]
    validdata, validgold = x["valid"]
    testdata, testgold = x["test"]
    testsubjs = testgold[:, 0]
    testsubjsrels = {k: ([], []) for k in set(list(testsubjs))}

    if wordchar:
        traindata = wordmat2wordchartensor(traindata, worddic=worddic)
        validdata = wordmat2wordchartensor(validdata, worddic=worddic)
        testdata = wordmat2wordchartensor(testdata, worddic=worddic)

    tt.tick("loading test cans")
    for line in open(relsperentp):
        subj, relsout, relsin = line[:-1].split("\t")
        if subj in entdic and entdic[subj] in testsubjsrels:
            testsubjsrels[entdic[subj]] = (
                [entdic[x] for x in relsout.split(" ")] if relsout != "" else [],
                [entdic[x] for x in relsin.split(" ")] if relsin != "" else []
            )
    tt.tock("test cans loaded")

    # select and offset mats
    traingold = traingold[:, 1] - numents
    validgold = validgold[:, 1] - numents
    testgold = testgold[:, 1] - numents
    entmat = entmat[numents:, :]
    # select and offset entdic
    entdic = {k: v - numents for k, v in entdic.items() if v >= numents}
    # make testrelcans with new idx space
    testrelcans = [([y - numents for y in testsubjsrels[x][0]],
                    [y - numents for y in testsubjsrels[x][1]])
                   for x in testsubjs]

    return (traindata, traingold), (validdata, validgold), (testdata, testgold),\
           worddic, entdic, entmat, testrelcans
Example #23
def run(
    epochs=100,
    lr=0.5,
    wreg=0.0001,
    numbats=100,
    fblexpath="../../data/freebase/labelsrevlex.map.sample",
    glovepath="../../data/glove/glove.6B.50d.txt",
    fbentdicp="../../data/freebase/entdic.all.map",
    numwords=10,
    numchars=30,
    wordembdim=50,
    wordencdim=100,
    innerdim=300,
    wordoffset=1,
    validinter=3,
    gradnorm=1.0,
    validsplit=100,
):
    tt = ticktock("fblextransrun")

    traindata, golddata, vocnuments, vocnumwords, datanuments = \
        loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars)

    tt.tock("made data").tick()

    # define model
    m = FBBasicCompositeEncoder(
        wordembdim=wordembdim,
        wordencdim=wordencdim,
        innerdim=innerdim,
        outdim=datanuments,
        numchars=128,  # ASCII
        numwords=vocnumwords,
    )

    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)

    # train model   TODO
    tt.tick("training")
    m.train([traindata], golddata).adagrad(lr=lr).grad_total_norm(gradnorm).cross_entropy()\
        .autovalidate(splits=validsplit, random=True).validinter(validinter).accuracy()\
        .train(numbats, epochs)
    #embed()
    tt.tock("trained").tick("predicting")
    print m.predict(traindata).shape
    tt.tock("predicted sample")
Example #24
def run(
        epochs=100,
        lr=0.5,
        wreg=0.0001,
        numbats=100,
        fblexpath="../../data/freebase/labelsrevlex.map.sample",
        glovepath="../../data/glove/glove.6B.50d.txt",
        fbentdicp="../../data/freebase/entdic.all.map",
        numwords=10,
        numchars=30,
        wordembdim=50,
        wordencdim=100,
        innerdim=300,
        wordoffset=1,
        validinter=3,
        gradnorm=1.0,
        validsplit=100,
    ):
    tt = ticktock("fblextransrun")

    traindata, golddata, vocnuments, vocnumwords, datanuments = \
        loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars)

    tt.tock("made data").tick()

    # define model
    m = FBBasicCompositeEncoder(
        wordembdim=wordembdim,
        wordencdim=wordencdim,
        innerdim=innerdim,
        outdim=datanuments,
        numchars=128,               # ASCII
        numwords=vocnumwords,
    )

    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)

    # train model   TODO
    tt.tick("training")
    m.train([traindata], golddata).adagrad(lr=lr).grad_total_norm(gradnorm).cross_entropy()\
        .autovalidate(splits=validsplit, random=True).validinter(validinter).accuracy()\
        .train(numbats, epochs)
    #embed()
    tt.tock("trained").tick("predicting")
    print m.predict(traindata).shape
    tt.tock("predicted sample")
Example #25
 def __init__(self,
              questionencoder=None,
              entityencoder=None,
              relationencoder=None,
              enttrans=None,
              reltrans=None,
              debug=False,
              subjinfo=None):
     self.qenc = questionencoder
     self.eenc = entityencoder
     self.renc = relationencoder
     #self.mode = mode
     self.enttrans = enttrans
     self.reltrans = reltrans
     self.debug = debug
     self.subjinfo = subjinfo
     self.qencodings = None
     self.tt = ticktock("predictor")
Example #26
def loaddata(worddic, fbentdicp, fblexpath, wordoffset, numwords):
    tt = ticktock("fblexdataloader")
    tt.tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()

    indata = FBSeqFeedsMaker(fblexpath, ed, worddic, numwords=numwords)
    datanuments = np.max(indata.goldfeed) + 1
    tt.tick()
    indata.trainfeed[0:9000]  # access a slice to trigger the feed's transformation
    tt.tock("transformed")
    #embed()

    traindata = indata.trainfeed
    golddata = indata.goldfeed + 1  # no entity = id 0

    return traindata, golddata, vocnuments, len(worddic) + 1, datanuments + 1, ed
Example #27
def loaddata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    tt = ticktock("fblexdataloader") ; tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()

    indata = FreebaseSeqFeedMakerEntidxs(fblexpath, gd, ed, numwords=numwords, numchars=numchars, unkwordid=wordoffset - 1)
    datanuments = np.max(indata.goldfeed)+1
    tt.tick()
    indata.trainfeed[0:9000]  # access a slice to trigger the feed's transformation
    tt.tock("transformed")
    #embed()

    traindata = indata.trainfeed
    golddata = indata.goldfeed + 1  # no entity = id 0

    return traindata, golddata, vocnuments, vocnumwords, datanuments+1, ed, gd
Example #28
def loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    tt = ticktock("fblexdataloader") ; tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()

    indata = FreebaseEntFeedsMaker(fblexpath, gd, ed, numwords=numwords, numchars=numchars, unkwordid=wordoffset - 1)
    datanuments = max(indata.goldfeed)+1
    tt.tick()
    print "max entity id+1: %d" % datanuments
    indata.trainfeed[0:9000]  # access a slice to trigger the feed's transformation
    tt.tock("transformed")
    #embed()

    traindata = indata.trainfeed
    golddata = indata.goldfeed

    return traindata, golddata, vocnuments, vocnumwords, datanuments
Example #29
def gencans(data, top=50, exact=True, rwd=None, ed=None, host=None, index=None):
    idx = SimpleQuestionsLabelIndex(host=host, index=index)
    # transform data using worddic and search
    sentences = []
    cans = []
    tt = ticktock("candidate generator")
    tt.tick("generating cans")
    for i in range(data.shape[0]):
        sentence = " ".join(
                        map(lambda x: rwd[x],
                            filter(lambda x: x in rwd, data[i, :])))
        sentences.append(sentence)
        searchres = idx.searchsentence(sentence, exact=exact, top=top)
        scans = map(lambda (x, (y, z)): ed[x], searchres.items())
        if i % 10 == 0:
            tt.live("%d of %d" % (i, data.shape[0]))
        cans.append(scans)
    tt.stoplive()
    tt.tock("generated cans")
    return cans
Example #30
 def eval(self, data, gold, transform=None, savep=None):     # data: wordidx^(batsize, seqlen), gold: entidx^(batsize)
     # generate candidates
     if os.path.isfile("testcans.pkl"):
         cans = self.loadcans("testcans.pkl")
     else:
         cans = gencans(data, host=self.host, index=self.index, rwd=self.rwd, ed=self.ed)           # list of lists of entidx
         pickle.dump(cans, open("testcans.pkl", "w"))
     assert len(cans) == data.shape[0] == gold.shape[0]
     #        embed()
     predictor = self.scorer.predict.transform(transform)
     tt = ticktock("evaluator")
     tt.tick("evaluating...")
     nocans = 0
     nogoldcan = 0
     tosave = {}
     for i in range(data.shape[0]):
         numcans = len(cans[i])
         if gold[i] not in cans[i]:
             nogoldcan += 1
         predinp = [np.repeat(np.expand_dims(data[i, :], axis=0), numcans, axis=0),
                    np.asarray(cans[i], dtype="int32")]
         #print predinp, "%d/%d" % (i, data.shape[0]), numcans
         if numcans > 0:
             predinpscores = predictor(*predinp)      # (numcans,)
             ranking = sorted(zip(cans[i], list(predinpscores)),
                              key=lambda (x, y): y, reverse=True)
             tosave[i] = (gold[i], ranking)
             for metric in self.metrics:
                 metric.accumulate([gold[i]], ranking)
         else:
             nocans += 1
         if i % 100 == 0:
             tt.live("evaluated: %.2f%%" % (i*100./data.shape[0]))
     tt.tock("evaluated")
     if savep is not None:
         tt.tick("saving")
         pickle.dump(tosave, open(savep, "w"))
         tt.tock("saved")
     print "no cans for %d questions" % nocans
     print "gold not among cans for %d questions" % nogoldcan
     return self.metrics
Example #31
 def load(self, entdic):
     self.trainingdata = []
     self.golddata = []
     tt = ticktock(self.__class__.__name__)
     tt.tick("loading kgraph")
     with open(self.path) as f:
         c = 0
         for line in f:
             ns = line[:-1].split("\t")
              if len(ns) != 2:
                 print line, c
                 continue
             sf, fb = ns
             self.trainingdata.append(self._process_sf(sf, self.numwords, self.numchars))
             entids = self._process_ent(fb, entdic)
             self.golddata.append(entids)
             if c % 1e6 == 0:
                 tt.tock("%.0fM" % (c/1e6)).tick()
             c += 1
     self.golddata = np.asarray(self.golddata, dtype="int32")
     self.trainingdata = np.array(self.trainingdata)
Example #32
 def load(self, entdic):
     self.trainingdata = []
     self.golddata = []
     tt = ticktock(self.__class__.__name__)
     tt.tick("loading kgraph")
     with open(self.path) as f:
         c = 0
         for line in f:
             ns = line[:-1].split("\t")
              if len(ns) != 2:
                 print line, c
                 continue
             sf, fb = ns
             self.trainingdata.append(self._process_sf(sf, self.numwords))
             entids = self._process_ent(fb, entdic)
             self.golddata.append(entids)
             if c % 1e6 == 0:
                 tt.tock("%.0fM" % (c / 1e6)).tick()
             c += 1
     self.golddata = np.asarray(self.golddata, dtype="int32")
     self.trainingdata = np.array(self.trainingdata)
Example #33
def run(epochs=10,
        numbats=700,
        lr=0.1,
        embdim=200,
        encdim=300,
        layers=1,
        clean=False,
        rarefreq=4,
        glove=0,
        type="rnn",  # rnn or cnn
        p="../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        ):
    # load data for classification
    tt = ticktock("script")
    tt.tick("loading data")
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
        entdic, entmat, worddic, numents = readdata(p, clean=clean, rarefreq=rarefreq)
    tt.tock("loaded data")
    # model
    tt.tick("building model")


    m = SimpleSeq2Idx(indim=len(worddic)+1, inpembdim=embdim, numclasses=len(entdic),
                      innerdim=encdim, maskid=0, layers=layers)
    tt.tock("built model")
    tt.tick("training")
    m.train([traindata], traingold).adadelta(lr=lr).cross_entropy().grad_total_norm(1.)\
        .validate_on([validdata], validgold).cross_entropy().accuracy().takebest()\
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")
    tt.tick("testing")
    preds = m.predict(testdata)
    preds = np.argmax(preds, axis=1)
    acc = preds == testgold
    acc = np.sum(acc) * 1.0 / testdata.shape[0]
    print("Accuracy: {}".format(acc))
    tt.tock("tested")
Example #34
def loaddata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    tt = ticktock("fblexdataloader")
    tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()

    indata = FreebaseSeqFeedMakerEntidxs(fblexpath,
                                         gd,
                                         ed,
                                         numwords=numwords,
                                         numchars=numchars,
                                         unkwordid=wordoffset - 1)
    datanuments = np.max(indata.goldfeed) + 1
    tt.tick()
    indata.trainfeed[0:9000]  # access a slice to trigger the feed's transformation
    tt.tock("transformed")
    #embed()

    traindata = indata.trainfeed
    golddata = indata.goldfeed + 1  # no entity = id 0

    return traindata, golddata, vocnuments, vocnumwords, datanuments + 1, ed, gd
Example #35
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
    embdim=100,
    innerdim=200,
    wreg=0.00005,
    bidir=False,
    mem=False,
    membidir=False,
    memlayers=1,
    layers=1,
    testfirst=False,
    rankingloss=False,
    rlmargin=1.,
    charlevel=False,
    pool=False,
    resultsave=False,
    resultsavep="subjdetns.res.pkl",
):

    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat\
        = readdata(datap, charlevel)

    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [innerdim / 2] * layers
    else:
        encinnerdim = [innerdim] * layers

    # question representation:
    # encodes question sequence to vector
    # let's try to embed chars too <-- embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=embdim,
                         innerdim=encinnerdim,
                         maskid=-1,
                         bidir=bidir,
                         pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim / 2] * memlayers
        else:
            innerdim = [innerdim] * memlayers
        memembdim = embdim
        #embed chars too <-- meminpemb = None if charlevel else qenc.inpemb  # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb  # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords,
                             inpembdim=memembdim,
                             inpemb=meminpemb,
                             innerdim=innerdim,
                             maskid=-1,
                             bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)  # batched dot

    # trainer config preparation
    class PreProcf(object):
        def __init__(self, entmat):
            self.em = Val(entmat)  # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):  # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):  # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max,
                                            gold.shape).astype("int32")

    if testfirst:
        eval = SubjRankEval(scorer,
                            worddic=worddic,
                            entdic=entdic,
                            metrics=[ClassAccuracy(),
                                     RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()
    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)

    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer,
                        worddic=worddic,
                        entdic=entdic,
                        metrics=[
                            ClassAccuracy(),
                            RecallAt(1),
                            RecallAt(2),
                            RecallAt(5),
                            RecallAt(10)
                        ])

    evalres = eval.eval(testdata,
                        testgold,
                        transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
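The `PreProcf` transform above swaps a batch of gold entity ids for the corresponding rows of `entmat`, so the scorer's right-hand input is the entity's label as word ids rather than its bare id. The indexing step by itself, on toy data:

import numpy as np

entmat = np.array([[2, 5, -1],
                   [7, -1, -1]], dtype="int32")  # label word ids per entity
gold = np.array([1, 0])                          # gold entity ids for a batch
print(entmat[gold, :])                           # rows fed to the label encoder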
Example #36
def run(
    epochs=10,
    numbats=100,
    negrate=1,
    lr=0.1,
    embdim=50,
    encdim=50,
    wreg=0.00005,
    marginloss=False,
    margin=1.0,
    cosine=False,
    bidir=False,
):
    tt = ticktock("script")
    # get glove words
    g = Glove(encdim)
    words = g.D.keys()
    maxwordlen = 0
    for word in words:
        maxwordlen = max(maxwordlen, len(word))
    chars = set("".join(words))
    chars.add(" ")
    print "{} words, maxlen {}, {} characters in words".format(len(words), maxwordlen, len(chars))
    # get char word matrix
    chardic = dict(zip(chars, range(len(chars))))
    pickle.dump(chardic, open("glove2c2w.chardic.pkl", "w"))
    charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
    charwordmat[0, 0] = chardic[" "]
    for i in range(0, len(words)):
        word = words[i]
        charwordmat[i + 1, : len(word)] = [chardic[x] for x in word]
    print charwordmat[0]
    # encode characters
    cwenc = SimpleSeq2Vec(
        indim=len(chars), inpembdim=embdim, innerdim=encdim / 2 if bidir else encdim, maskid=-1, bidir=bidir
    )
    dist = CosineDistance() if cosine else EuclideanDistance()  # DotDistance()
    print "using " + str(dist)
    scorer = MatchScore(cwenc, g.block, scorer=dist)

    """
    scorer.train([charwordmat, np.arange(len(words)+1)], np.ones((charwordmat.shape[0],), dtype="int32") * (-1 if cosine else 1))\
        .linear_objective().adagrad(lr=lr).l2(wreg)\
        .train(numbats=numbats, epochs=epochs)

    #embed()
    """

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if marginloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    else:
        obj = lambda p, n: n - p

    nscorer = (
        scorer.nstrain([charwordmat, np.arange(len(words) + 1)])
        .negsamplegen(NegIdxGen(len(words)))
        .negrate(negrate)
        .objective(obj)
        .adagrad(lr=lr)
        .l2(wreg)
        .train(numbats=numbats, epochs=epochs)
    )

    cwenc.save("glove2c2w.block")
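The char-word matrix built above maps every word to a row of character ids padded with -1, with row 0 reserved for a space-only dummy word. A compact sketch of the same construction on a toy vocabulary:

import numpy as np

words = ["cat", "go"]                            # hypothetical vocabulary
chars = sorted(set("".join(words)) | {" "})
chardic = dict(zip(chars, range(len(chars))))
maxwordlen = max(len(w) for w in words)
charwordmat = -np.ones((len(words) + 1, maxwordlen), dtype="int32")
charwordmat[0, 0] = chardic[" "]                 # row 0: the empty/space word
for i, word in enumerate(words):
    charwordmat[i + 1, :len(word)] = [chardic[c] for c in word]
print(charwordmat)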
Example #37
def run(
    epochs=100,
    lr=0.01,
    wreg=0.0001,
    numbats=10,
    fbdatapath="../../data/mfqa/mfqa.tsv.sample.small",
    fblexpath="../../data/mfqa/mfqa.labels.idx.map",
    glovepath="../../data/glove/glove.6B.50d.txt",
    fbentdicp="../../data/mfqa/mfqa.dic.map",
    numwords=20,
    numchars=30,
    wordembdim=50,
    wordencdim=100,
    entembdim=100,
    innerdim=200,
    attdim=200,
    wordoffset=1,
    validinter=1,
    gradnorm=1.0,
    validsplit=1,
    vocnumwordsres=50e3,
    model="mem",
):
    tt = ticktock("fblextransrun")

    traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \
        loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars)
    outdata = shiftdata(golddata)
    tt.tock("made data").tick()
    entids, lexdata = load_lex_data(fblexpath, datanuments, worddic)
    if "mem" in model:
        print lexdata.shape
        print datanuments
        #embed()
        if "att" in model:
            print "model with attention AND memory"
            m = FBSeqCompEncMemDecAtt(
                wordembdim=wordembdim,
                wordencdim=wordencdim,
                entembdim=entembdim,
                innerdim=innerdim,
                outdim=datanuments,
                numchars=128,  # ASCII
                numwords=vocnumwords,
                memdata=[entids, lexdata],
                attdim=attdim,
                memaddr=GeneralDotMemAddr,
            )
        else:
            m = FBSeqCompositeEncMemDec(
                wordembdim=wordembdim,
                wordencdim=wordencdim,
                entembdim=entembdim,
                innerdim=innerdim,
                outdim=datanuments,
                numchars=128,  # ASCII
                numwords=vocnumwords,
                memdata=[entids, lexdata],
                attdim=attdim,
                memaddr=LinearGateMemAddr,
            )
    elif model == "lex":  # for testing purposes
        print lexdata.shape
        print datanuments
        #vocnumwords = 4000
        #exit()
        #embed()
        m = FBMemMatch(
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords,
            memdata=[entids, lexdata],
            attdim=attdim,
        )

    elif model == "nomem":
        m = FBSeqCompositeEncDec(  # compiles, errors go down
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords)
    else:
        m = None
        print "no such model"
    reventdic = {}
    for k, v in entdic.items():
        reventdic[v] = k

    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)
    tt.tock("model defined")
    if model == "lex":  # for testing purposes
        tt.tick("predicting")
        print lexdata[1:5].shape, entids[1:5].shape
        #print lexdata[1:5]
        print entids[1:5]
        pred = m.predict(lexdata[1:5])
        print pred.shape
        print np.argmax(pred, axis=1) - 1
        print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)(
            np.argmax(pred, axis=1) - 1)
        tt.tock("predicted sample")
        tt.tick("training")
        m.train([lexdata[1:151]], entids[1:151]).adagrad(lr=lr).cross_entropy().grad_total_norm(0.5)\
            .split_validate(5, random=True).validinter(validinter).accuracy()\
            .train(numbats, epochs)
    else:
        #embed()
        tt.tick("predicting")
        print traindata[:5].shape, outdata[:5].shape
        pred = m.predict(traindata[:5], outdata[:5])
        print np.argmax(pred, axis=2) - 1
        print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
        tt.tock("predicted sample")

        tt.tick("training")
        m.train([traindata, outdata], golddata).adagrad(lr=lr).grad_total_norm(gradnorm).seq_cross_entropy()\
            .split_validate(splits=5, random=False).validinter(validinter).seq_accuracy().seq_cross_entropy()\
            .train(numbats, epochs)
        #embed()

        tt.tock("trained").tick("predicting")
        pred = m.predict(traindata[:50], outdata[:50])
        print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
        tt.tock("predicted sample")
Example #38
def run(
        epochs=100,
        epochsp=10,
        lr=0.03,
        wreg=0.001,
        numbats=10,
        fbdatapath="../../data/mfqa/mfqa.tsv.sample.small",
        fblexpath="../../data/mfqa/mfqa.labels.idx.map",
        glovepath="../../data/glove/glove.6B.50d.txt",
        fbentdicp="../../data/mfqa/mfqa.dic.map",
        numwords=20,
        numchars=30,
        wordembdim=50,
        wordencdim=100,
        entembdim=100,
        innerdim=400,
        attdim=200,
        wordoffset=1,
        validinter=1,
        gradnorm=1.0,
        validsplit=5,
        vocnumwordsres=50e3,
        model="nomem",
    ):
    tt = ticktock("fblextransrun")

    traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \
        loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars)
    tt.tock("made data").tick()
    entids, lexdata = load_lex_data(fblexpath, datanuments, worddic)

    # manual split # TODO: do split in feeder
    splitpoint = int(traindata.shape[0]*(1. - 1./validsplit))
    print splitpoint
    validdata = traindata[splitpoint:]
    validgold = golddata[splitpoint:]
    traindata = traindata[:splitpoint]
    golddata = golddata[:splitpoint]

    if "att" in model:
        m = FBSeqCompEncDecAtt(
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            attdim=attdim,
            numwords=vocnumwords
        )
    else:
        m = FBSeqCompositeEncDec(  # compiles, errors go down
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords
        )

    reventdic = {}
    for k, v in entdic.items():
        reventdic[v] = k

    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)
    tt.tock("model defined")
    if "lex" in model:
        tt.tick("predicting lexicon")
        print lexdata[1:5].shape, entids[1:5].shape, golddata[:5].shape
        #print lexdata[1:5]
        #print entids[:5]; exit()
        pred = m.predict(lexdata[1:5], np.zeros((entids[1:5].shape[0], 1), dtype="int32"))
        print pred.shape
        print np.argmax(pred, axis=2)-1
        print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)(np.argmax(pred, axis=2)-1)
        tt.tock("predicted sample")
        tt.tick("training")
        lextrain = lexdata
        print lextrain.shape
        lexgold = entids.reshape((entids.shape[0], 1))
        print lexgold.shape
        lexgoldshifted = shiftdata(lexgold)
        m.train([lextrain, lexgoldshifted], lexgold).adagrad(lr=lr).seq_cross_entropy().grad_total_norm(gradnorm)\
            .autovalidate(validsplit, random=True).validinter(validinter).seq_accuracy().seq_cross_entropy()\
            .train(numbats, epochsp)

        tt.tick("predicting")
        print lexdata[1:5].shape, entids[1:5].shape, golddata[:5].shape
        # print lexdata[1:5]
        # print entids[:5]; exit()
        pred = m.predict(lexdata[1:5], np.zeros((entids[1:5].shape[0], 1), dtype="int32"))
        print pred.shape
        print np.argmax(pred, axis=2) - 1
        print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)(np.argmax(pred, axis=2) - 1)
        tt.tock("predicted sample")

        m.fixO(lr=0.01)

    # embed()
    outdata = shiftdata(golddata)

    tt.tick("predicting")
    print traindata[:5].shape, outdata[:5].shape
    #print golddata[:5]  ; exit()
    pred = m.predict(traindata[:5], outdata[:5])
    print np.argmax(pred, axis=2) - 1
    print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")

    tt.tick("training")
    m.train([traindata, outdata], golddata).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm).seq_cross_entropy() \
        .validate_on([validdata, shiftdata(validgold)], validgold).validinter(validinter).seq_accuracy().seq_cross_entropy() \
        .train(numbats, epochs)
    # embed()

    tt.tock("trained").tick("predicting")
    pred = m.predict(traindata[:50], outdata[:50])
    print np.argmax(pred, axis=2) - 1
    #print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")
Example #39
def run(
        epochs=10,
        numbats=100,
        negrate=1,
        lr=0.1,
        datap="../../../data/simplequestions/datamat.word.mem.fb2m.pkl",
        embdim=100,
        innerdim=200,
        wreg=0.00005,
        bidir=False,
        mem=False,
        membidir=False,
        memlayers=1,
        layers=1,
        testfirst=False,
        rankingloss=False,
        rlmargin=1.,
        charlevel=False,
        pool=False,
        resultsave=False,
        resultsavep="subjdetns.res.pkl",
        ):

    tt = ticktock("script")
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat\
        = readdata(datap, charlevel)

    print entmat.shape
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [innerdim/2]*layers
    else:
        encinnerdim = [innerdim]*layers

    # question representation:
    # encodes question sequence to vector
    # let's try to embed chars too <-- embdim = None if charlevel else embdim
    qenc = SimpleSeq2Vec(indim=numwords,
                        inpembdim=embdim,
                        innerdim=encinnerdim,
                        maskid=-1,
                        bidir=bidir,
                        pool=pool)

    # entity representation:
    if mem:
        # encodes label to vector
        if membidir:
            innerdim = [innerdim/2]*memlayers
        else:
            innerdim = [innerdim]*memlayers
        memembdim = embdim
        #embed chars too <-- meminpemb = None if charlevel else qenc.inpemb  # share embeddings
        #memembdim = None if charlevel else memembdim
        meminpemb = qenc.inpemb     # also chars are embedded and embeddings are always shared
        lenc = SimpleSeq2Vec(indim=numwords,
                                inpembdim=memembdim,
                                inpemb=meminpemb,
                                innerdim=innerdim,
                                maskid=-1,
                                bidir=membidir)
    else:
        # embeds entity id to vector
        lenc = VectorEmbed(indim=numents, dim=innerdim)

    # question-entity score computation:
    scorer = MatchScore(qenc, lenc)       # batched dot

    # trainer config preparation
    class PreProcf(object):
        def __init__(self, entmat):
            self.em = Val(entmat)                # entmat: idx[word]^(numents, len(ent.name))

        def __call__(self, datas, gold):    # gold: idx^(batsize, )
            return (datas, self.em[gold, :]), {}

    class NegIdxGen(object):
        def __init__(self, rng):
            self.min = 0
            self.max = rng

        def __call__(self, datas, gold):    # gold: idx^(batsize,)
            return datas, np.random.randint(self.min, self.max, gold.shape).astype("int32")

    if testfirst:
        eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic, metrics=[ClassAccuracy(), RecallAt(5)])
        evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat))
        for e in evalres:
            print e
        tt.msg("tested dummy")
        sys.exit()
    #embed()
    # trainer config and training
    obj = lambda p, n: n - p
    if rankingloss:
        obj = lambda p, n: (n - p + rlmargin).clip(0, np.infty)

    nscorer = scorer.nstrain([traindata, traingold]).transform(PreProcf(entmat))\
        .negsamplegen(NegIdxGen(numents)).negrate(negrate).objective(obj)\
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0)\
        .validate_on([validdata, validgold])\
        .train(numbats=numbats, epochs=epochs)

    # evaluation
    eval = SubjRankEval(scorer, worddic=worddic, entdic=entdic, metrics=[ClassAccuracy(), RecallAt(1), RecallAt(2), RecallAt(5), RecallAt(10)])

    evalres = eval.eval(testdata, testgold, transform=PreProcf(entmat),
                        savep=None if not resultsave else resultsavep)
    for evalre in evalres:
        print evalre
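
A minimal numpy sketch (made-up scores) of the two objectives configured above: the plain difference pushes positive scores above sampled negatives, while the ranking-loss variant only penalizes pairs that violate the margin (rlmargin=1.):

import numpy as np

pos = np.array([2.0, 0.5, 1.0])     # scores of gold question-entity pairs
neg = np.array([1.0, 1.5, 0.8])     # scores of sampled negatives

obj_plain = lambda p, n: n - p
obj_ranking = lambda p, n: (n - p + 1.).clip(0, np.infty)

print(obj_plain(pos, neg))      # [-1.   1.  -0.2]
print(obj_ranking(pos, neg))    # [ 0.   2.   0.8]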
Example #40
def run(
        epochs=100,
        lr=0.03,
        wreg=0.0001,
        numbats=10,
        fbdatapath="../../data/mfqa/mfqa.tsv.sample.small",
        fblexpath="../../data/mfqa/mfqa.labels.idx.map",
        fbentdicp="../../data/mfqa/mfqa.dic.map",
        numwords=20,
        wordembdim=50,
        entembdim=101,
        innerdim=100,
        attdim=100,
        wordoffset=1,
        validinter=1,
        gradnorm=1.0,
        validsplit=5,
        model="lex",
    ):
    tt = ticktock("fblextransrun")

    worddic = makeworddict(fblexpath, fbdatapath)

    traindata, golddata, vocnuments, vocnumwords, datanuments, entdic = \
        loaddata(worddic, fbentdicp, fbdatapath, wordoffset, numwords)
    tt.tock("made data").tick()
    entids, lexdata = load_lex_data(fblexpath, datanuments, worddic)


    # manual split # TODO: do split in feeder
    splitpoint = int(traindata.shape[0]*(1. - 1./validsplit))
    print splitpoint
    validdata = traindata[splitpoint:]
    validgold = golddata[splitpoint:]
    traindata = traindata[:splitpoint]
    golddata = golddata[:splitpoint]

    print traindata.shape, golddata.shape
    print validdata.shape, validgold.shape

    if "lex" in model:      # append lexdata
        traindata = np.concatenate([traindata, lexdata], axis=0)
        print traindata.shape
        entids = entids.reshape((entids.shape[0], 1))
        golddata = np.concatenate([golddata, np.concatenate([entids, np.zeros_like(entids, dtype="int32")], axis=1)], axis=0)
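        # every lexicon entry is thus appended as an extra training pair:
        # the entity's label words as encoder input, (entity id, 0) as the
        # two-token target sequence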
        print golddata.shape
    #exit()
    m = FBSeqSimpEncDecAtt(
        wordembdim=wordembdim,
        entembdim=entembdim,
        innerdim=innerdim,
        attdim=attdim,
        outdim=datanuments,
        numwords=vocnumwords,
    )
    tt.tock("model defined")

    reventdic = {}
    for k, v in entdic.items():
        reventdic[v] = k


    # embed()
    outdata = shiftdata(golddata)

    tt.tick("predicting")
    print traindata[:5].shape, outdata[:5].shape
    #print golddata[:5]  ; exit()
    pred = m.predict(traindata[:5], outdata[:5])
    print np.argmax(pred, axis=2) - 1
    print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")

    tt.tick("training")
    m.train([traindata, outdata], golddata).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm).seq_cross_entropy() \
        .validate_on([validdata, shiftdata(validgold)], validgold).validinter(validinter).seq_accuracy().seq_cross_entropy() \
        .train(numbats, epochs)
    # embed()

    tt.tock("trained").tick("predicting")
    pred = m.predict(validdata, shiftdata(validgold))
    print np.argmax(pred, axis=2) - 1
    #print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")
Example #41
 def __init__(self, model, canenc, scorer, agg, beamsize=1, *buildargs, **kw):
     super(SeqEncDecRankSearch, self).__init__(model, beamsize, *buildargs, **kw)
     self.scorer = scorer
     self.canenc = canenc
     self.agg = agg
     self.tt = ticktock("RankSearch")
Example #42
def run(
        epochs=50,
        mode="char",    # "char" or "word" or "charword"
        numbats=100,
        lr=0.1,
        wreg=0.000001,
        bidir=False,
        layers=1,
        encdim=200,
        decdim=400,
        embdim=100,
        negrate=1,
        margin=1.,
        hingeloss=False,
        debug=False,
        preeval=False,
        sumhingeloss=False,
        checkdata=False,        # starts interactive shell for data inspection
        printpreds=False,
        subjpred=False,
        predpred=False,
        specemb=-1,
        balancednegidx=False,
        usetypes=False,
    ):
    if debug:       # debug settings
        sumhingeloss = True
        numbats = 10
        lr = 0.02
        epochs = 10
        printpreds = True
        whatpred = "all"
        if whatpred == "pred":
            predpred = True
        elif whatpred == "subj":
            subjpred = True
        #preeval = True
        specemb = 100
        margin = 1.
        balancednegidx = True
        #usetypes=True
    # load the right file
    tt = ticktock("script")
    specids = specemb > 0
    tt.tick()
    (traindata, traingold), (validdata, validgold), (testdata, testgold), \
    worddic, entdic, entmat, relstarts, canids\
        = readdata(mode, testcans="testcans.pkl", debug=debug, specids=specids, usetypes=usetypes)
    entmat = entmat.astype("int32")

    #embed()

    if subjpred is True and predpred is False:
        traingold = traingold[:, [0]]
        validgold = validgold[:, [0]]
        testgold = testgold[:, [0]]
    if predpred is True and subjpred is False:
        traingold = traingold[:, [1]]
        validgold = validgold[:, [1]]
        testgold = testgold[:, [1]]


    if checkdata:
        rwd = {v: k for k, v in worddic.items()}
        red = {v: k for k, v in entdic.items()}
        def p(xids):
            return (" " if mode == "word" else "").join([rwd[xid] if xid > -1 else "" for xid in xids])
        embed()

    reventdic = {v: k for k, v in entdic.items()}
    revworddic = {v: k for k, v in worddic.items()}
    print traindata.shape, traingold.shape, testdata.shape, testgold.shape

    tt.tock("data loaded")

    # *data: matrix of word ids (-1 filler), example per row
    # *gold: vector of true entity ids
    # entmat: matrix of word ids (-1 filler), entity label per row, indexes according to *gold
    # *dic: from word/ent-fbid to integer id, as used in data

    numwords = max(worddic.values()) + 1
    numents = max(entdic.values()) + 1
    print "%d words, %d entities" % (numwords, numents)

    if bidir:
        encinnerdim = [encdim / 2] * layers
    else:
        encinnerdim = [encdim] * layers

    memembdim = embdim
    memlayers = layers
    membidir = bidir
    if membidir:
        decinnerdim = [decdim/2]*memlayers
    else:
        decinnerdim = [decdim]*memlayers

    entenc = SimpleSeq2Vec(indim=numwords,
                         inpembdim=memembdim,
                         innerdim=decinnerdim,
                         maskid=-1,
                         bidir=membidir)

    if specids:     # include vectorembedder
        numentembs = len(np.unique(entmat[:, 0]))
        entenc = EntEmbEnc(entenc, numentembs, specemb)
        # adjust params for enc/dec construction
        #encinnerdim[-1] += specemb
        #innerdim[-1] += specemb

    encdec = SimpleSeqEncDecAtt(inpvocsize=numwords, inpembdim=embdim,
                    encdim=encinnerdim, bidir=bidir, outembdim=entenc,
                    decdim=decinnerdim, vecout=True, statetrans="matdot")

    scorerargs = ([encdec, SeqUnroll(entenc)],
                  {"argproc": lambda x, y, z: ((x, y), (z,)),
                   "scorer": GenDotDistance(decinnerdim[-1], entenc.outdim)})
    if sumhingeloss:
        scorerargs[1]["aggregator"] = lambda x: x  # no aggregation of scores
    scorer = SeqMatchScore(*scorerargs[0], **scorerargs[1])

    #scorer.save("scorer.test.save")

    # TODO: below this line, check and test
    class PreProc(object):
        def __init__(self, entmat):
            self.f = PreProcE(entmat)

        def __call__(self, encdata, decsg, decgold):        # gold: idx^(batsize, seqlen)
            return (encdata, self.f(decsg), self.f(decgold)), {}

    class PreProcE(object):
        def __init__(self, entmat):
            self.em = Val(entmat)

        def __call__(self, x):
            return self.em[x]

    transf = PreProc(entmat)

    class NegIdxGen(object):
        def __init__(self, rng, midsplit=None):
            self.min = 0
            self.max = rng
            self.midsplit = midsplit

        def __call__(self, datas, sgold, gold):    # the whole target sequence is corrupted, corruption targets the whole set of entities and relations together
            if self.midsplit is None or not balancednegidx:
                return datas, sgold, np.random.randint(self.min, self.max, gold.shape).astype("int32")
            else:
                entrand = np.random.randint(self.min, self.midsplit, gold.shape)
                relrand = np.random.randint(self.midsplit, self.max, gold.shape)
                mask = np.random.randint(0, 2, gold.shape)
                ret = entrand * mask + relrand * (1 - mask)
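                # mask is 0/1 per position: where mask==1 the negative comes
                # from the entity range [min, midsplit), elsewhere from the
                # relation range [midsplit, max), so negatives are balanced
                # between entities and relations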
                return datas, sgold, ret.astype("int32")

    # !!! MASKS ON OUTPUT SHOULD BE IMPLEMENTED FOR VARIABLE LENGTH OUTPUT SEQS
    obj = lambda p, n: n - p
    if hingeloss:
        obj = lambda p, n: (n - p + margin).clip(0, np.infty)
    if sumhingeloss:
        obj = lambda p, n: T.sum((n - p + margin).clip(0, np.infty), axis=1)

    traingoldshifted = shiftdata(traingold)
    validgoldshifted = shiftdata(validgold)

    #embed()
    # eval
    if preeval:
        tt.tick("pre-evaluating")
        s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg)
        eval = FullRankEval()
        pred, scores = s.decode(testdata, 0, testgold.shape[1],
                                candata=entmat, canids=canids,
                                transform=transf.f, debug=printpreds)
        evalres = eval.eval(pred, testgold, debug=debug)
        for k, evalre in evalres.items():
            print("{}:\t{}".format(k, evalre))
        tt.tock("pre-evaluated")

    negidxgenargs = ([numents], {"midsplit": relstarts})
    if debug:
        pass
        #negidxgenargs = ([numents], {})

    tt.tick("training")
    nscorer = scorer.nstrain([traindata, traingoldshifted, traingold]).transform(transf) \
        .negsamplegen(NegIdxGen(*negidxgenargs[0], **negidxgenargs[1])).negrate(negrate).objective(obj) \
        .adagrad(lr=lr).l2(wreg).grad_total_norm(1.0) \
        .validate_on([validdata, validgoldshifted, validgold]) \
        .train(numbats=numbats, epochs=epochs)
    tt.tock("trained")

    #scorer.save("scorer.test.save")

    # eval
    tt.tick("evaluating")
    s = SeqEncDecRankSearch(encdec, entenc, scorer.s, scorer.agg)
    eval = FullRankEval()
    pred, scores = s.decode(testdata, 0, testgold.shape[1],
                            candata=entmat, canids=canids,
                            transform=transf.f, debug=printpreds)
    if printpreds:
        print pred
    debugarg = "subj" if subjpred else "pred" if predpred else False
    evalres = eval.eval(pred, testgold, debug=debugarg)
    for k, evalre in evalres.items():
        print("{}:\t{}".format(k, evalre))
    tt.tock("evaluated")

    # save
    basename = os.path.splitext(os.path.basename(__file__))[0]
    dirname = basename + ".results"
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    savenamegen = lambda i: "{}/{}.res".format(dirname, i)
    savename = None
    for i in xrange(100):
        savename = savenamegen(i)
        if not os.path.exists(savename):
            break
        savename = None
    if savename is None:
        raise Exception("exceeded number of saved results")
    with open(savename, "w") as f:
        f.write("{}\n".format(" ".join(sys.argv)))
        for k, evalre in evalres.items():
            f.write("{}:\t{}\n".format(k, evalre))
Example #43
def run(
    epochs=100,
    lr=0.03,
    wreg=0.0001,
    numbats=10,
    fbdatapath="../../data/mfqa/mfqa.tsv.sample",
    fblexpath="../../data/mfqa/mfqa.labels.idx.map",
    glovepath="../../data/glove/glove.6B.50d.txt",
    fbentdicp="../../data/mfqa/mfqa.dic.map",
    numwords=20,
    numchars=30,
    wordembdim=50,
    wordencdim=50,
    entembdim=101,
    innerdim=100,
    attdim=100,
    wordoffset=1,
    validinter=1,
    gradnorm=1.0,
    validsplit=5,
    vocnumwordsres=50e3,
    model="nomem",
):
    tt = ticktock("fblextransrun")

    traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \
        loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars)
    tt.tock("made data").tick()
    entids, lexdata = load_lex_data(fblexpath, datanuments, worddic)

    # manual split # TODO: do split in feeder
    splitpoint = int(traindata.shape[0] * (1. - 1. / validsplit))
    print splitpoint
    validdata = traindata[splitpoint:]
    validgold = golddata[splitpoint:]
    traindata = traindata[:splitpoint]
    golddata = golddata[:splitpoint]

    print traindata.shape, golddata.shape
    print validdata.shape, validgold.shape

    if "lex" in model:  # append lexdata
        traindata = np.concatenate([traindata, lexdata], axis=0)
        print traindata.shape
        entids = entids.reshape((entids.shape[0], 1))
        golddata = np.concatenate([
            golddata,
            np.concatenate(
                [entids, np.zeros_like(entids, dtype="int32")], axis=1)
        ],
                                  axis=0)
        print golddata.shape
    #exit()

    if "att" in model:
        m = FBSeqCompEncDecAtt(
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords,
            attdim=attdim,
        )

    else:
        m = FBSeqCompositeEncDec(
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords,
        )

    reventdic = {}
    for k, v in entdic.items():
        reventdic[v] = k

    prelex = "lex" in model

    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)
    tt.tock("model defined")

    # embed()
    outdata = shiftdata(golddata)

    tt.tick("predicting")
    print traindata[:5].shape, outdata[:5].shape
    #print golddata[:5]  ; exit()
    pred = m.predict(traindata[:5], outdata[:5])
    print np.argmax(pred, axis=2) - 1
    print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")

    tt.tick("training")
    m.train([traindata, outdata], golddata).adagrad(lr=lr).l2(wreg).grad_total_norm(gradnorm).seq_cross_entropy() \
        .validate_on([validdata, shiftdata(validgold)], validgold).validinter(validinter).seq_accuracy().seq_cross_entropy() \
        .train(numbats, epochs)
    # embed()

    tt.tock("trained").tick("predicting")
    pred = m.predict(validdata, shiftdata(validgold))
    print np.argmax(pred, axis=2) - 1
    #print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2) - 1)
    tt.tock("predicted sample")
Example #44
def run(
        epochs=100,
        lr=0.01,
        wreg=0.0001,
        numbats=10,
        fbdatapath="../../data/mfqa/mfqa.tsv.sample.small",
        fblexpath="../../data/mfqa/mfqa.labels.idx.map",
        glovepath="../../data/glove/glove.6B.50d.txt",
        fbentdicp="../../data/mfqa/mfqa.dic.map",
        numwords=20,
        numchars=30,
        wordembdim=50,
        wordencdim=100,
        entembdim=100,
        innerdim=200,
        attdim=200,
        wordoffset=1,
        validinter=1,
        gradnorm=1.0,
        validsplit=1,
        vocnumwordsres=50e3,
        model="mem",
    ):
    tt = ticktock("fblextransrun")

    traindata, golddata, vocnuments, vocnumwords, datanuments, entdic, worddic = \
        loaddata(glovepath, fbentdicp, fbdatapath, wordoffset, numwords, numchars)
    outdata = shiftdata(golddata)
    tt.tock("made data").tick()
    entids, lexdata = load_lex_data(fblexpath, datanuments, worddic)
    if "mem" in model:
        print lexdata.shape
        print datanuments
        #embed()
        if "att" in model:
            print "model with attention AND memory"
            m = FBSeqCompEncMemDecAtt(
                wordembdim=wordembdim,
                wordencdim=wordencdim,
                entembdim=entembdim,
                innerdim=innerdim,
                outdim=datanuments,
                numchars=128,               # ASCII
                numwords=vocnumwords,
                memdata=[entids, lexdata],
                attdim=attdim,
                memaddr=GeneralDotMemAddr,
            )
        else:
            m = FBSeqCompositeEncMemDec(
                wordembdim=wordembdim,
                wordencdim=wordencdim,
                entembdim=entembdim,
                innerdim=innerdim,
                outdim=datanuments,
                numchars=128,               # ASCII
                numwords=vocnumwords,
                memdata=[entids, lexdata],
                attdim=attdim,
                memaddr=LinearGateMemAddr,
            )
    elif model=="lex":          # for testing purposes
        print lexdata.shape
        print datanuments
        #vocnumwords = 4000
        #exit()
        #embed()
        m = FBMemMatch(
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords,
            memdata=[entids, lexdata],
            attdim=attdim,
        )

    elif model=="nomem":
        m = FBSeqCompositeEncDec(           # compiles, errors go down
            wordembdim=wordembdim,
            wordencdim=wordencdim,
            entembdim=entembdim,
            innerdim=innerdim,
            outdim=datanuments,
            numchars=128,
            numwords=vocnumwords
        )
    else:
        raise Exception("no such model: {}".format(model))
    reventdic = {}
    for k, v in entdic.items():
        reventdic[v] = k

    #wenc = WordEncoderPlusGlove(numchars=numchars, numwords=vocnumwords, encdim=wordencdim, embdim=wordembdim)
    tt.tock("model defined")
    if model == "lex": # for testing purposes
        tt.tick("predicting")
        print lexdata[1:5].shape, entids[1:5].shape
        #print lexdata[1:5]
        print entids[1:5]
        pred = m.predict(lexdata[1:5])
        print pred.shape
        print np.argmax(pred, axis=1)-1
        print np.vectorize(lambda x: reventdic[x] if x in reventdic else None)(np.argmax(pred, axis=1)-1)
        tt.tock("predicted sample")
        tt.tick("training")
        m.train([lexdata[1:151]], entids[1:151]).adagrad(lr=lr).cross_entropy().grad_total_norm(0.5)\
            .split_validate(5, random=True).validinter(validinter).accuracy()\
            .train(numbats, epochs)
    else:
        #embed()
        tt.tick("predicting")
        print traindata[:5].shape, outdata[:5].shape
        pred = m.predict(traindata[:5], outdata[:5])
        print np.argmax(pred, axis=2)-1
        print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2)-1)
        tt.tock("predicted sample")

        tt.tick("training")
        m.train([traindata, outdata], golddata).adagrad(lr=lr).grad_total_norm(gradnorm).seq_cross_entropy()\
            .split_validate(splits=5, random=False).validinter(validinter).seq_accuracy().seq_cross_entropy()\
            .train(numbats, epochs)
        #embed()

        tt.tock("trained").tick("predicting")
        pred = m.predict(traindata[:50], outdata[:50])
        print np.vectorize(lambda x: reventdic[x])(np.argmax(pred, axis=2)-1)
        tt.tock("predicted sample")