Beispiel #1
0
 def __call__(self, data):
   """Accumulate marginal differences over a batch and yield them per feature.

   This is a generator: nothing is processed until it is iterated.

   Args:
     data: iterable of (key, value) pairs. Each value is a string holding
       two serialized forests separated by "****" (sentence forest first,
       oracle forest second), with "\\t\\t\\t" standing in for newlines.
       The key is ignored.

   Yields:
     (feature, value) pairs for every feature in the accumulated vector,
     which holds the sum of (example marginals - oracle marginals) plus a
     "log_likelihood" entry summing (example partition - oracle partition).

   Side effects:
     Increments self.processed once per successfully processed value and
     writes "skipping sent" to stderr for values that do not split into
     exactly two parts.
   """
   vec = Vector()
   for _key, val in data:
     splits = val.split("****")
     if len(splits) != 2:
       # Malformed record: report it and move on rather than aborting the batch.
       sys.stderr.write("skipping sent\n")
       continue
     sent, oracle = splits

     # Records are stored on a single line; restore the real line breaks
     # before handing the text to the forest loader.
     sent_text = sent.replace("\t\t\t", "\n")
     oracle_text = oracle.replace("\t\t\t", "\n")
     sent_forest = next(Forest.load(StringIO(sent_text), True, lm=None))
     oracle_forest = next(Forest.load(StringIO(oracle_text), True, lm=None))

     # Check both forests. The previous `assert a, b` form only tested the
     # first one and used the second as the failure message.
     assert sent_forest, "failed to load sentence forest"
     assert oracle_forest, "failed to load oracle forest"

     example_marg, example_partition = fast_inside_outside.collect_marginals(
         sent_forest, self.weights)
     oracle_marg, oracle_partition = fast_inside_outside.collect_marginals(
         oracle_forest, self.weights)

     vec += example_marg - oracle_marg
     vec["log_likelihood"] += example_partition - oracle_partition
     self.processed += 1

   for feat in vec:
     yield feat, vec[feat]
Beispiel #2
0
    def compute_marginals(self, forest, oracle_forest):
        """Compute the marginals of a -lm forest and of its oracle forest.

        Both forests are passed to fast_inside_outside.collect_marginals
        together with self.weights.

        Args:
            forest: the forest for the example.
            oracle_forest: the oracle forest for the same example.

        Returns:
            A tuple (example_marg, oracle_marg, log_div) where example_marg
            and oracle_marg are the first values returned by
            collect_marginals for forest and oracle_forest respectively, and
            log_div is the oracle partition value minus the example
            partition value (the second values returned by
            collect_marginals).
        """
        example_marg, partition = fast_inside_outside.collect_marginals(
            forest, self.weights)
        oracle_marg, oracle_partition = fast_inside_outside.collect_marginals(
            oracle_forest, self.weights)

        # NOTE(review): the result of bestparse is not used here. The call is
        # kept in case it has side effects on oracle_forest -- confirm and
        # remove if it does not.
        oracle_forest.bestparse(self.weights, use_min=False)

        return example_marg, oracle_marg, oracle_partition - partition  # log div