def run(options):
    """Train an HPYP model on one integer file and print a loss for another.

    The training sequence is read from ``options.train_file`` and inserted
    into a freshly built model; the test sequence from ``options.test_file``
    is then appended to the same underlying vector and scored.

    Attributes read from ``options``:
        alpha: passed to ``lp.SimpleParameters`` together with DISCOUNTS.
        train_file, test_file: paths handed to ``lp.pushIntFileToVec``.
        prediction: 2 selects ``HPYPModel.BELOW``, 1 selects
            ``HPYPModel.FRAGMENT``, anything else ``HPYPModel.ABOVE``;
            the value 3 additionally switches scoring from
            ``predictSequence`` to ``computeLosses`` on the test range.
        inference: 1 runs BURN_IN_SAMPLES Gibbs sweeps before scoring;
            2 (when prediction != 3) runs the burn-in and then averages
            the predictions of PREDICT_SAMPLES further sweeps.

    Progress messages go to stderr; the final loss is printed to stdout.
    Returns None.
    """

    def log(message):
        # Same text and trailing newline as a redirected print statement.
        sys.stderr.write(message + "\n")

    def burn_in(sampler):
        # Takes the model as an argument rather than closing over it, so
        # the explicit `del` at the end of run() stays legal.
        for sweep in xrange(BURN_IN_SAMPLES):
            log("Burn in iteration %i" % (sweep, ))
            sampler.runGibbsSampler()

    # Wrapped objects are created in a fixed order: restaurant first, then
    # the node manager built on its factory, then the parameters.
    restaurant = lp.ReinstantiatingCompactRestaurant()
    node_manager = lp.SimpleNodeManager(restaurant.getFactory())
    discount_vec = lp.VectorDouble(DISCOUNTS)
    params = lp.SimpleParameters(discount_vec, options.alpha)

    symbols = lp.VectorInt()
    lp.pushIntFileToVec(options.train_file, symbols)
    log("Train seq length: %i" % (symbols.size(), ))

    # Initialise the model, then insert the training observations into it
    # (using the particle filter, per the original author's note).
    hpyp = lp.HPYPModel(symbols, node_manager, restaurant, params, NUM_TYPES)
    hpyp.computeLosses(0, symbols.size())

    # Append the test observations to the underlying sequence; the test
    # range is [test_start, test_end).
    test_start = symbols.size()
    lp.pushIntFileToVec(options.test_file, symbols)
    test_end = symbols.size()
    log("Test seq length: %i" % (test_end - test_start, ))

    mode_by_option = {
        2: lp.HPYPModel.BELOW,
        1: lp.HPYPModel.FRAGMENT,
    }
    predict_mode = mode_by_option.get(options.prediction, lp.HPYPModel.ABOVE)
    scores_by_prediction = options.prediction != 3

    if options.inference == 1:
        burn_in(hpyp)

    # NOTE(review): this single-pass loss is assumed to be computed for
    # every inference mode (it is overwritten below for mode 2) -- confirm
    # against the intended layout.
    if scores_by_prediction:
        loss = float(
            lp.prob2loss(
                hpyp.predictSequence(test_start, test_end, predict_mode)))
    else:
        loss = float(np.mean(hpyp.computeLosses(test_start, test_end)))

    if options.inference == 2 and scores_by_prediction:
        # One row of per-position predictions per posterior sample.
        sampled = np.zeros((PREDICT_SAMPLES, test_end - test_start))
        burn_in(hpyp)
        for draw in xrange(PREDICT_SAMPLES):
            log("Prediction iteration %i" % (draw, ))
            hpyp.runGibbsSampler()
            sampled[draw, :] = hpyp.predictSequence(test_start, test_end,
                                                    predict_mode)
        # Average over samples first, then take the mean of -log2.
        loss = float(np.mean(-np.log2(np.mean(sampled, 0))))

    print(loss)

    # make sure destructors are called in correct order
    del hpyp
    del node_manager
def buildModel(fn):
    """Build a byte-level SM model from the given file.

    The contents of ``fn`` are loaded with ``libplump.pushCharFileToVec``
    and inserted into a new ``libplump.HPYPModel`` over 256 symbol types.

    Nothing is returned: the restaurant, node manager, parameters,
    sequence and model are stored in the module-level globals
    ``restaurant``, ``nodeManager``, ``parameters``, ``seq`` and ``model``.
    """
    global restaurant, nodeManager, parameters
    global seq, model

    # Other restaurant implementations that were tried here:
    # SimpleFullRestaurant, HistogramRestaurant, KneserNeyRestaurant,
    # StirlingCompactRestaurant.
    restaurant = libplump.ReinstantiatingCompactRestaurant()
    nodeManager = libplump.SimpleNodeManager(restaurant.getFactory())
    parameters = libplump.SimpleParameters(DISCOUNTS, CONCENTRATION)

    seq = libplump.VectorInt()
    libplump.pushCharFileToVec(fn, seq)

    # One symbol type per possible byte value.
    byte_types = 256
    model = libplump.HPYPModel(seq, nodeManager, restaurant, parameters,
                               byte_types)

    # Run the model over the whole loaded sequence; the returned losses
    # are not used.
    model.computeLosses(0, seq.size())
# Restaurant implementations that can be swapped in for `restaurant`:
#   libplump.FractionalRestaurant()
#   libplump.HistogramRestaurant()
#   libplump.KneserNeyRestaurant()
#   libplump.ReinstantiatingCompactRestaurant()
#   libplump.StirlingCompactRestaurant()
# NOTE(review): `restaurant` is not assigned in this section; it is
# presumably created earlier in the script -- confirm.
nodeManager = libplump.SimpleNodeManager(restaurant.getFactory())
parameters = libplump.SimpleParameters()

# Small hand-written sequence over three symbol types.  Inputs tried
# previously: libplump.vectori(range(10)) and
# libplump.VectorInt(map(ord,'oacac')) with numTypes = max(seq).
seq = libplump.VectorInt([0, 1, 2, 1, 2])
numTypes = 3

model = libplump.HPYPModel(seq, nodeManager, restaurant, parameters,
                           numTypes)
print(model.computeLosses(0, len(seq)))

# One Gibbs sweep per sequence element, dumping the model before each.
for i in range(seq.size()):
    print(model.toString())
    model.runGibbsSampler()

print("Predictions after training:")
for i in range(len(seq)):
    # Earlier variant printed model.predict(0,i,i) instead.
    dist = model.predictiveDistribution(0, i)
    # Print the distribution followed by its total.
    print("%s %s" % (dist, sum(dist)))

# save model to file
serializer = libplump.Serializer("model.dump")
serializer.saveNodesAndPayloads(nodeManager, restaurant.getFactory())