def loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    """Load lexical training data from glove + entity dictionaries.

    Builds a word dict from the glove file, an entity dict from `fbentdicp`,
    then wraps `fblexpath` in a FreebaseEntFeedsMaker.

    Returns (traindata, golddata, vocnuments, vocnumwords, datanuments).
    """
    tt = ticktock("fblexdataloader")
    tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()
    indata = FreebaseEntFeedsMaker(fblexpath, gd, ed, numwords=numwords,
                                   numchars=numchars, unkwordid=wordoffset - 1)
    # np.max for consistency with the other loaders in this file; builtin
    # max() on a multi-dimensional goldfeed would compare rows, not elements.
    datanuments = np.max(indata.goldfeed) + 1
    tt.tick()
    # single-argument print(...) form: same output on Python 2, valid on Python 3
    print("max entity id+1: %d" % datanuments)
    # slicing appears to force the (lazy) transform — result intentionally
    # discarded; TODO confirm against FreebaseEntFeedsMaker
    indata.trainfeed[0:9000]
    tt.tock("transformed")
    #embed()
    traindata = indata.trainfeed
    golddata = indata.goldfeed
    return traindata, golddata, vocnuments, vocnumwords, datanuments
def test_fb_datafeed_mfqa_shape(self):
    """FreebaseSeqFeedMaker on the mfqa sample yields the expected feed shapes."""
    here = os.path.dirname(__file__)
    wordmap, _ = getglovedict(os.path.join(here, "../data/glove/miniglove.50d.txt"))
    entmap, _ = getentdict(os.path.join(here, "../data/mfqa/mfqa.dic.map"), top=50)
    samplepath = os.path.join(here, "../data/mfqa/mfqa.tsv.sample")
    maker = FreebaseSeqFeedMaker(samplepath, wordmap, entmap, numwords=20, numchars=30)
    # char columns (everything past column 0) stay within ASCII range
    self.assertLessEqual(np.max(maker.trainfeed[:][:, :, 1:]), 128)
    self.assertEqual(maker.worddic, wordmap)
    self.assertEqual(maker.trainfeed[0:5].shape, (5, 20, 31))
    self.assertEqual(maker.goldfeed[0:5].shape, (5, 2))
def test_fb_datafeed_shape(self):
    """FreebaseSeqFeedMaker on the freebase sample yields the expected feed shapes."""
    base = os.path.dirname(__file__)
    glovedic, _ = getglovedict(os.path.join(base, "../data/glove/miniglove.50d.txt"))
    entdic, _ = getentdict(os.path.join(base, "../data/freebase/entdic.small.map"), top=50)
    sample = os.path.join(base, "../data/freebase/labelsrevlex.map.sample")
    maker = FreebaseSeqFeedMaker(sample, glovedic, entdic, numwords=10, numchars=30)
    self.assertEqual(maker.worddic, glovedic)
    # 5 examples, 10 words, 1 word-id column + 30 char columns
    self.assertEqual(maker.trainfeed[0:5].shape, (5, 10, 31))
    self.assertEqual(maker.goldfeed[0:5].shape, (5, 1))
def test_fb_datafeed_shape(self):
    """Shapes of train/gold feeds built from the small freebase sample."""
    def _data(rel):
        # resolve a data file relative to this test module
        return os.path.join(os.path.dirname(__file__), rel)

    worddic, _ = getglovedict(_data("../data/glove/miniglove.50d.txt"))
    entdic, _ = getentdict(_data("../data/freebase/entdic.small.map"), top=50)
    feed = FreebaseSeqFeedMaker(_data("../data/freebase/labelsrevlex.map.sample"),
                                worddic, entdic, numwords=10, numchars=30)
    self.assertEqual(feed.worddic, worddic)
    self.assertEqual(feed.trainfeed[0:5].shape, (5, 10, 31))
    self.assertEqual(feed.goldfeed[0:5].shape, (5, 1))
def test_fb_datafeed_mfqa_shape(self):
    """Shapes and char-value range of feeds built from the mfqa sample."""
    def _data(rel):
        # resolve a data file relative to this test module
        return os.path.join(os.path.dirname(__file__), rel)

    worddic, _ = getglovedict(_data("../data/glove/miniglove.50d.txt"))
    entdic, _ = getentdict(_data("../data/mfqa/mfqa.dic.map"), top=50)
    feed = FreebaseSeqFeedMaker(_data("../data/mfqa/mfqa.tsv.sample"),
                                worddic, entdic, numwords=20, numchars=30)
    self.assertLessEqual(np.max(feed.trainfeed[:][:, :, 1:]), 128)  # ASCII range
    self.assertEqual(feed.worddic, worddic)
    self.assertEqual(feed.trainfeed[0:5].shape, (5, 20, 31))
    self.assertEqual(feed.goldfeed[0:5].shape, (5, 2))
def loaddata(worddic, fbentdicp, fblexpath, wordoffset, numwords):
    """Load entity dict and sequence feeds for a pre-built word dictionary.

    Returns (traindata, golddata, vocnuments, numwords+1, datanuments+1, entdic).
    """
    timer = ticktock("fblexdataloader")
    timer.tick()
    entdic, vocnuments = getentdict(fbentdicp, offset=0)
    timer.tock("loaded %d entdic" % len(entdic)).tick()
    feeds = FBSeqFeedsMaker(fblexpath, entdic, worddic, numwords=numwords)
    datanuments = np.max(feeds.goldfeed) + 1
    timer.tick()
    # slicing appears to force the (lazy) transform; result discarded on purpose
    feeds.trainfeed[0:9000]
    timer.tock("transformed")
    #embed()
    # gold ids are shifted by one so id 0 can mean "no entity"
    return (feeds.trainfeed, feeds.goldfeed + 1, vocnuments,
            len(worddic) + 1, datanuments + 1, entdic)
def loaddata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    """Load glove word dict, entity dict and entidx sequence feeds.

    Returns (traindata, golddata, vocnuments, vocnumwords, datanuments+1, ed, gd).
    """
    timer = ticktock("fblexdataloader")
    timer.tick()
    glovedic, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    timer.tock("loaded %d worddic" % len(glovedic)).tick()
    entdic, vocnuments = getentdict(fbentdicp, offset=0)
    timer.tock("loaded %d entdic" % len(entdic)).tick()
    feeds = FreebaseSeqFeedMakerEntidxs(fblexpath, glovedic, entdic,
                                        numwords=numwords, numchars=numchars,
                                        unkwordid=wordoffset - 1)
    datanuments = np.max(feeds.goldfeed) + 1
    timer.tick()
    # slicing appears to force the (lazy) transform; result discarded on purpose
    feeds.trainfeed[0:9000]
    timer.tock("transformed")
    #embed()
    # gold ids are shifted by one so id 0 can mean "no entity"
    return (feeds.trainfeed, feeds.goldfeed + 1, vocnuments, vocnumwords,
            datanuments + 1, entdic, glovedic)
def loaddata(worddic, fbentdicp, fblexpath, wordoffset, numwords):
    """Build train/gold feeds from an entity dict and an existing word dict."""
    tt = ticktock("fblexdataloader")
    tt.tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()
    indata = FBSeqFeedsMaker(fblexpath, ed, worddic, numwords=numwords)
    numents_in_data = np.max(indata.goldfeed) + 1
    tt.tick()
    indata.trainfeed[0:9000]  # touch a slice; appears to trigger the transform
    tt.tock("transformed")
    #embed()
    traindata = indata.trainfeed
    golddata = indata.goldfeed + 1     # no entity = id 0
    vocnumwords = len(worddic) + 1
    return traindata, golddata, vocnuments, vocnumwords, numents_in_data + 1, ed
def test_fb_datafeed_validosplit(self):
    """osplit(split=1) keeps feed classes, ndim and trailing axes intact."""
    here = os.path.dirname(__file__)
    gdict, _ = getglovedict(os.path.join(here, "../data/glove/miniglove.50d.txt"))
    edict, _ = getentdict(os.path.join(here, "../data/freebase/entdic.small.map"), top=50)
    samplepath = os.path.join(here, "../data/freebase/labelsrevlex.map.sample")
    maker = FreebaseEntFeedsMaker(samplepath, gdict, edict, numwords=10, numchars=30)
    self.assertEqual(maker.worddic, gdict)
    feeder = DataFeeder(*([maker.trainfeed] + [maker.goldfeed]))
    splits = 1
    splitfeeder = feeder.osplit(split=splits, random=False)
    for whole, part in zip(feeder.feeds, splitfeeder.feeds):
        self.assertEqual(whole.__class__, part.__class__)
        self.assertEqual(whole.ndim, part.ndim)
        # first axis shrinks to ceil(n / splits); remaining axes are unchanged
        self.assertEqual(part.shape[0],
                         int(math.ceil(1. * whole.shape[0] / splits)))
        for axis in range(1, len(whole.shape)):
            self.assertEqual(whole.shape[axis], part.shape[axis])
def loadlexdata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    """Load lexical training data: glove word dict, entity dict and ent feeds.

    Returns (traindata, golddata, vocnuments, vocnumwords, datanuments).
    """
    tt = ticktock("fblexdataloader")
    tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()
    indata = FreebaseEntFeedsMaker(fblexpath, gd, ed, numwords=numwords,
                                   numchars=numchars, unkwordid=wordoffset - 1)
    # np.max for consistency with the other loaders; builtin max() on a
    # multi-dimensional goldfeed would compare rows, not elements.
    datanuments = np.max(indata.goldfeed) + 1
    tt.tick()
    # single-argument print(...) form: same output on Python 2, valid on Python 3
    print("max entity id+1: %d" % datanuments)
    # slicing appears to force the (lazy) transform — result intentionally
    # discarded; TODO confirm against FreebaseEntFeedsMaker
    indata.trainfeed[0:9000]
    tt.tock("transformed")
    #embed()
    traindata = indata.trainfeed
    golddata = indata.goldfeed
    return traindata, golddata, vocnuments, vocnumwords, datanuments
def test_fb_datafeed_validosplit(self):
    """A split=1 osplit must mirror every feed's class, ndim and shape."""
    def _data(rel):
        # resolve a data file relative to this test module
        return os.path.join(os.path.dirname(__file__), rel)

    worddic, _ = getglovedict(_data("../data/glove/miniglove.50d.txt"))
    entdic, _ = getentdict(_data("../data/freebase/entdic.small.map"), top=50)
    feed = FreebaseEntFeedsMaker(_data("../data/freebase/labelsrevlex.map.sample"),
                                 worddic, entdic, numwords=10, numchars=30)
    self.assertEqual(feed.worddic, worddic)
    feeder = DataFeeder(*([feed.trainfeed] + [feed.goldfeed]))
    splits = 1
    split_feeder = feeder.osplit(split=splits, random=False)
    for original, taken in zip(feeder.feeds, split_feeder.feeds):
        self.assertEqual(original.__class__, taken.__class__)
        self.assertEqual(original.ndim, taken.ndim)
        expected_rows = int(math.ceil(1. * original.shape[0] / splits))
        self.assertEqual(taken.shape[0], expected_rows)
        # all axes after the first must be untouched by the split
        for axis in range(1, len(original.shape)):
            self.assertEqual(original.shape[axis], taken.shape[axis])
def loaddata(glovepath, fbentdicp, fblexpath, wordoffset, numwords, numchars):
    """Build entidx sequence feeds from glove and entity dictionaries."""
    tt = ticktock("fblexdataloader")
    tt.tick()
    gd, vocnumwords = getglovedict(glovepath, offset=wordoffset)
    tt.tock("loaded %d worddic" % len(gd)).tick()
    ed, vocnuments = getentdict(fbentdicp, offset=0)
    tt.tock("loaded %d entdic" % len(ed)).tick()
    indata = FreebaseSeqFeedMakerEntidxs(
        fblexpath, gd, ed,
        numwords=numwords, numchars=numchars, unkwordid=wordoffset - 1)
    numents_in_data = np.max(indata.goldfeed) + 1
    tt.tick()
    indata.trainfeed[0:9000]  # touch a slice; appears to trigger the transform
    tt.tock("transformed")
    #embed()
    traindata = indata.trainfeed
    golddata = indata.goldfeed + 1     # no entity = id 0
    return (traindata, golddata, vocnuments, vocnumwords,
            numents_in_data + 1, ed, gd)
def test_getentdic(self):
    """Entity dict capped at top=50 reports 52 as both maxid and highest value."""
    dicpath = os.path.join(os.path.dirname(__file__),
                           "../data/freebase/entdic.small.map")
    entdic, maxid = getentdict(dicpath, top=50)
    self.assertEqual(maxid, 52)
    self.assertEqual(max(entdic.values()), maxid)
def test_getentdic(self):
    """getentdict(top=50) on the small map: maxid equals the largest dict value."""
    mapping, highest = getentdict(
        os.path.join(os.path.dirname(__file__),
                     "../data/freebase/entdic.small.map"),
        top=50)
    self.assertEqual(highest, 52)
    self.assertEqual(max(mapping.values()), highest)