Esempi in Python per InsideComputation, esempi in Python per inside.InsideComputation

Esempio n. 1

0

Mostra file

File: evaluation.py Progetto: alexc17/syntheticpcfg

def conditional_unlabeled_kld(target, hypothesis, samples=1000, verbose=False):
    """
    Monte Carlo estimate of the KLD between conditional tree distributions.

    Trees are sampled from ``target``.  For each sampled tree ``t`` with
    yield ``w`` the bracketed log probability of ``t`` and the inside log
    probability of ``w`` are computed under both grammars, and the quantity
    ``(log p(t) - log q(t)) - (log p(w) - log q(w))`` is averaged over
    ``samples`` draws, i.e. the tree KLD term minus the string KLD term.

    When ``verbose`` is true the four log probabilities of every sample are
    logged at INFO level.
    """
    target_inside = inside.InsideComputation(target)
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)
    accumulated = 0.0

    for _ in range(samples):
        tree = tree_sampler.sample_tree()
        sentence = utility.collect_yield(tree)

        # Target grammar first, then hypothesis, tree before string.
        p_tree = target_inside.inside_bracketed_log_probability(tree)
        p_string = target_inside.inside_log_probability(sentence)
        q_tree = hypothesis_inside.inside_bracketed_log_probability(tree)
        q_string = hypothesis_inside.inside_log_probability(sentence)

        tree_gap = p_tree - q_tree
        string_gap = p_string - q_string
        accumulated += tree_gap - string_gap

        if verbose:
            logging.info("%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f",
                         sentence, p_tree, p_string, q_tree, q_string)

    return accumulated / samples

Esempio n. 2

0

Mostra file

File: evaluation.py Progetto: alexc17/syntheticpcfg

def labeled_exact_match(target,
                        hypothesis,
                        samples=1000,
                        test_viterbi=False,
                        verbose=False):
    """
    Proportion of samples whose hypothesis Viterbi parse matches the reference
    tree once the hypothesis nonterminals have been relabeled.

    The relabeling map comes from ``best_nonterminal_rmap(target, hypothesis,
    samples)``.  The reference tree is the tree sampled from ``target`` or,
    when ``test_viterbi`` is true, the Viterbi parse of its yield under
    ``target``.

    Strings the hypothesis cannot parse are logged as warnings and count as
    mismatches, since the denominator is always ``samples``.  When ``verbose``
    is true, mismatching tree pairs are logged and printed.

    SLOW
    """
    # The target chart is only needed when comparing against target Viterbi parses.
    if test_viterbi:
        inside_target = inside.InsideComputation(target)
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    total = 0.0
    ntmap = best_nonterminal_rmap(target, hypothesis, samples)
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if test_viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            relabeled_tree = utility.relabel_tree(th, ntmap)
            if relabeled_tree == t:
                total += 1
            elif verbose:
                logging.info("Mismatch in trees with parse of %s", s)
                print(relabeled_tree)
                print(t)
        except utility.ParseFailureException:
            # The message needs a placeholder for its argument; without one
            # the logging module reports a formatting error instead of
            # emitting the record.
            logging.warning("Parse failure %s", s)
    return total / samples

Esempio n. 3

0

Mostra file

File: evaluation.py Progetto: alexc17/syntheticpcfg

def bracketed_match(target,
                    hypothesis,
                    test_viterbi=False,
                    samples=1000,
                    verbose=False,
                    exact_match=True):
    """
    Unlabeled agreement between hypothesis Viterbi parses and reference trees.

    For each of ``samples`` trees drawn from ``target`` the yield is parsed
    with ``hypothesis``.  The reference is the sampled tree or, when
    ``test_viterbi`` is true, the Viterbi parse of the yield under ``target``.
    Each pair is scored with ``utility.zero_one_unlabeled`` when
    ``exact_match`` is true and ``utility.microaveraged_unlabeled`` otherwise;
    both return a ``(numerator, denominator)`` pair and the result is the sum
    of numerators divided by the sum of denominators.

    Strings the hypothesis cannot parse are logged as warnings and contribute
    to neither sum.  If no sample contributes, the final division raises
    ``ZeroDivisionError``.
    """
    inside_target = inside.InsideComputation(target)
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    total = 0.0
    ttotal = 0.0
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if test_viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            if exact_match:
                num, denom = utility.zero_one_unlabeled(t, th)
            else:
                num, denom = utility.microaveraged_unlabeled(t, th)
            total += num
            ttotal += denom
            if verbose and num < denom:
                logging.info("Mismatch (%d / %d) with string %s", num, denom,
                             s)
        except utility.ParseFailureException:
            # The message needs a placeholder for its argument; without one
            # the logging module reports a formatting error instead of
            # emitting the record.
            logging.warning("Parse failure %s", s)

    return total / ttotal

Esempio n. 4

0

Mostra file

File: pcfg.py Progetto: alexc17/syntheticpcfg

    def estimate_communicability(self,
                                 samples=1000,
                                 max_length=100,
                                 sampler=None):
        """
        Return two Monte Carlo estimates of the communicability.

        Trees are drawn from ``sampler`` (a fresh ``Sampler(self)`` when none
        is supplied); samples whose yield is longer than ``max_length`` are
        discarded.  For each retained sample the yield is Viterbi-parsed.

        Returns a pair ``(same / n, ratio / n)`` where ``n`` is the number of
        retained samples:

        * the fraction of samples whose Viterbi parse equals the sampled tree;
        * the mean of ``exp(lpd - lps)``, with ``lpd`` the log probability of
          the Viterbi derivation and ``lps`` the inside log probability of the
          string.  The second one is, per the original author, better in most
          cases.

        Raises ``ZeroDivisionError`` if no sample is retained.
        """
        # Always go through ``mysampler``: dereferencing the raw ``sampler``
        # argument fails when the caller relies on the default of None.
        mysampler = Sampler(self) if sampler is None else sampler
        insider = inside.InsideComputation(self)
        same = 0.0
        ratio = 0.0
        n = 0
        for _ in range(samples):
            t = mysampler.sample_tree()
            s = collect_yield(t)
            if len(s) <= max_length:
                n += 1
                mapt = insider.viterbi_parse(s)
                if t == mapt:
                    same += 1
                lpd = self.log_probability_derivation(mapt)
                lps = insider.inside_log_probability(s)
                ratio += math.exp(lpd - lps)
        return (same / n, ratio / n)

Esempio n. 5

0

Mostra file

	def string_density(self, length, samples):
		"""
		Estimate the proportion of strings of the given length that are in the grammar.

		Draws ``samples`` trees with ``self.sample(length)`` (intended to be
		uniform over derivations), counts the parses of each sampled yield,
		and scales the mean reciprocal parse count by
		``self.get_total(length) / self.vocab ** length``.

		Raises ``ValueError`` if a sampled string has zero parses.
		"""
		derivation_count = self.get_total(length)
		string_count = 1.0 * self.vocab ** length
		chart = inside.InsideComputation(self.grammar)

		reciprocal_sum = 0.0
		for _ in range(samples):
			sampled_yield = collect_yield(self.sample(length))
			parse_count = chart.count_parses(sampled_yield)
			if parse_count == 0:
				raise ValueError("Generated a string which cannot be parsed.")
			reciprocal_sum += 1.0 / parse_count

		mean_reciprocal = reciprocal_sum / samples
		return (derivation_count / string_count) * mean_reciprocal

Esempio n. 6

0

Mostra file

File: evaluation.py Progetto: alexc17/syntheticpcfg

def nonterminal_contingency_table(target,
                                  hypothesis,
                                  samples=1000,
                                  robust=False):
    """
    Count co-occurrences of target and hypothesis node labels.

    Each of ``samples`` trees drawn from ``target`` is parsed with
    ``hypothesis.bracketed_viterbi_parse``; the two trees are then walked in
    parallel and every ``(target_label, hypothesis_label)`` pair at matching
    nodes is tallied.

    A ``utility.ParseFailureException`` is re-raised unless ``robust`` is
    true, in which case it is logged and the sample is skipped.

    Returns the ``Counter`` of label pairs.
    """
    pair_counts = Counter()
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    def tally(left, right):
        # Nodes are indexable with the label at position 0; nodes of length
        # three carry two subtrees at positions 1 and 2.
        assert len(left) == len(right)
        pair_counts[(left[0], right[0])] += 1
        if len(left) == 3:
            for child in (1, 2):
                tally(left[child], right[child])

    for _ in range(samples):
        gold = tree_sampler.sample_tree()
        try:
            predicted = hypothesis_inside.bracketed_viterbi_parse(gold)
            tally(gold, predicted)
        except utility.ParseFailureException as e:
            if not robust:
                raise e
            logging.info("Parse failure while doing the bracketed parse.")
    return pair_counts

Esempio n. 7

0

Mostra file

File: evaluation.py Progetto: alexc17/syntheticpcfg

def bracketed_kld(target, hypothesis, samples=1000, verbose=False):
    """
    Monte Carlo estimate of the KLD between bracketed-tree distributions.

    Samples ``samples`` trees from ``target`` and averages the difference
    between the bracketed log probability under ``target`` and under
    ``hypothesis``.  When ``verbose`` is true every sample is logged at INFO
    level.  FAST.
    """
    target_inside = inside.InsideComputation(target)
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    divergence_sum = 0.0
    for index in range(samples):
        tree = tree_sampler.sample_tree()
        log_p = target_inside.inside_bracketed_log_probability(tree)
        log_q = hypothesis_inside.inside_bracketed_log_probability(tree)
        if verbose:
            logging.info("Sample %d %s, target %f, hypothesis %f",
                         index, tree, log_p, log_q)
        divergence_sum += log_p - log_q
    return divergence_sum / samples

Esempio n. 8

0

Mostra file

	def string_density_crude(self, length, samples):
		"""
		Estimate string density by direct sampling.

		Builds ``samples`` strings of the given length by drawing each symbol
		with ``self.rng.choice`` from the grammar's terminals, and returns the
		fraction of those strings that have at least one parse.
		"""
		alphabet = list(self.grammar.terminals)
		chart = inside.InsideComputation(self.grammar)

		def random_string():
			return tuple(self.rng.choice(alphabet) for _ in range(length))

		hits = sum(
			1 for _ in range(samples)
			if chart.count_parses(random_string()) > 0
		)
		return hits / float(samples)

Esempio n. 9

0

Mostra file

File: evaluation.py Progetto: alexc17/syntheticpcfg

def preterminal_contingency_table(target, hypothesis, samples=1000):
    """
    Count co-occurrences of target and hypothesis preterminals.

    Each of ``samples`` trees drawn from ``target`` is reduced to its
    preterminal sequence with ``utility.tree_to_preterminals``; its yield is
    Viterbi-parsed with ``hypothesis`` and that parse is reduced the same way.
    The two sequences are zipped and every ``(target, hypothesis)`` pair is
    tallied.

    Strings the hypothesis cannot parse are logged as warnings and skipped.

    Returns the ``Counter`` of preterminal pairs.
    """
    counter = Counter()
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    for _ in range(samples):
        t = sampler.sample_tree()
        tut = utility.tree_to_preterminals(t)
        s = utility.collect_yield(t)
        try:
            th = inside_hypothesis.viterbi_parse(s)
        except utility.ParseFailureException:
            # The message needs a placeholder for its argument; without one
            # the logging module reports a formatting error instead of
            # emitting the record.
            logging.warning("Parse failure %s", s)
            continue
        tpt = utility.tree_to_preterminals(th)
        for a, b in zip(tut, tpt):
            counter[(a, b)] += 1
    return counter

Esempio n. 10

0

Mostra file

    def train_unary_once(self, my_pcfg, a, max_length):
        """
        Run one re-estimation pass over strings made only of the symbol ``a``.

        For every length from 1 up to ``max_length`` (capped at the last index
        of ``self.length_distribution.weights``) with a positive weight, the
        string ``a`` repeated that many times is passed to
        ``add_posteriors`` together with the weight; the returned value times
        the weight is added to the running total.

        The accumulated posteriors replace ``my_pcfg.parameters`` and the
        grammar is normalised in place.

        Returns ``(my_pcfg, total)``.
        """
        weights = self.length_distribution.weights
        longest = min(max_length, len(weights) - 1)
        expected_counts = defaultdict(float)
        chart = inside.InsideComputation(my_pcfg)

        log_prob_sum = 0
        for length in range(1, longest + 1):
            weight = weights[length]
            if not weight > 0:
                continue
            unary_string = (a,) * length
            log_prob_sum += weight * chart.add_posteriors(
                unary_string, expected_counts, weight)

        logging.info("UNARY LP %f", log_prob_sum)
        my_pcfg.parameters = expected_counts
        my_pcfg.normalise()
        return my_pcfg, log_prob_sum

Esempio n. 11

0

Mostra file

File: pcfg.py Progetto: alexc17/syntheticpcfg

    def monte_carlo_entropy(self, n, sampler=None):
        """
        Use a Monte Carlo approximation of three entropies of this grammar.

        ``n`` trees are drawn from ``sampler`` (a fresh ``Sampler(self)`` when
        none is supplied).  For each tree the negative log probability of its
        yield, of its bracketing, and of the full labeled derivation is
        accumulated.

        Returns the tuple ``(string entropy, unlabeled tree entropy,
        labeled tree (derivation) entropy)``, each averaged over ``n``.
        """
        string_entropy = 0
        unlabeled_tree_entropy = 0
        labeled_tree_entropy = 0
        if sampler is None:
            sampler = Sampler(self)
        insidec = inside.InsideComputation(self)
        for _ in range(n):
            tree = sampler.sample_tree()
            sentence = collect_yield(tree)
            derivation_lp = self.log_probability_derivation(tree)
            bracketed_lp = insidec.inside_bracketed_log_probability(tree)
            string_lp = insidec.inside_log_probability(sentence)
            # Each accumulator takes the log probability of the object it is
            # named after; previously the string and derivation terms were
            # crossed, so the first and third results were swapped.
            string_entropy -= string_lp
            unlabeled_tree_entropy -= bracketed_lp
            labeled_tree_entropy -= derivation_lp
        return string_entropy / n, unlabeled_tree_entropy / n, labeled_tree_entropy / n

Esempio n. 12

0

Mostra file

File: pcfg.py Progetto: alexc17/syntheticpcfg

    def __init__(self,
                 pcfg,
                 cache_size=SAMPLE_CACHE_SIZE,
                 max_depth=SAMPLE_MAX_DEPTH,
                 random=None):
        """
        Set up the sampling indices for ``pcfg``.

        ``random`` is the random state handed to every ``Multinomial``; a new
        ``numpy.random.RandomState()`` is created when it is omitted.  The
        grammar must be normalised (checked with an assertion).  One
        ``Multinomial`` is built per nonterminal, and an
        ``inside.InsideComputation`` for the grammar is kept on the instance.
        """
        rng = numpy.random.RandomState() if random is None else random
        assert pcfg.is_normalised()

        # Build the samplers in sorted nonterminal order so that runs are
        # reproducible.
        self.multinomials = {}
        for nt in sorted(pcfg.nonterminals):
            self.multinomials[nt] = Multinomial(pcfg, nt, cache_size, rng)

        self.start = pcfg.start
        self.max_depth = max_depth
        self.insider = inside.InsideComputation(pcfg)
        self.mypcfg = pcfg

Esempio n. 13

0

Mostra file

File: pcfg.py Progetto: alexc17/syntheticpcfg

    def estimate_ambiguity(self, samples=1000, max_length=100, sampler=None):
        """
        Monte Carlo estimate of the conditional entropy H(tree|string).

        Draws ``samples`` trees from ``sampler`` (a fresh ``Sampler(self)``
        when none is supplied), ignores those whose yield is longer than
        ``max_length``, and averages the inside log probability of the string
        minus the log probability of the sampled derivation over the retained
        samples.
        """
        mysampler = Sampler(self) if sampler is None else sampler
        insider = inside.InsideComputation(self)

        gap_sum = 0.0
        kept = 0.0
        for _ in range(samples):
            tree = mysampler.sample_tree()
            sentence = collect_yield(tree)
            if len(sentence) <= max_length:
                string_lp = insider.inside_log_probability(sentence)
                derivation_lp = self.log_probability_derivation(tree)
                gap_sum += string_lp - derivation_lp
                kept += 1
        return gap_sum / kept

Esempio n. 14

0

Mostra file

File: evaluation.py Progetto: alexc17/syntheticpcfg

def test_coverage(target, hypothesis, samples=1000):
    """
    Fraction of trees sampled from ``target`` that ``hypothesis`` can parse.

    As an optimisation the bracketed parse of the sampled tree is attempted
    first; only if that fails is the plain yield parsed.  A sample counts as
    covered when either attempt succeeds.
    """
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    covered = 0.0
    for _ in range(samples):
        tree = tree_sampler.sample_tree()
        try:
            hypothesis_inside.bracketed_viterbi_parse(tree)
        except utility.ParseFailureException:
            # Fall back to parsing the unbracketed string.
            try:
                sentence = utility.collect_yield(tree)
                hypothesis_inside.viterbi_parse(sentence)
            except utility.ParseFailureException:
                continue
        covered += 1
    return covered / samples

Esempio n. 15

0

Mostra file

File: sample_corpus.py Progetto: alexc17/syntheticpcfg

                    action="store_true")

## Other options: control output format, what probs are calculated.

args = parser.parse_args()

mypcfg = pcfg.load_pcfg_from_file(args.inputfilename)

if args.seed:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)
else:
    prng = RandomState()

mysampler = pcfg.Sampler(mypcfg, random=prng)
insider = inside.InsideComputation(mypcfg)

with open(args.outputfilename, 'w') as outf:
    i = 0
    while i < args.n:
        tree = mysampler.sample_tree()
        # defatul is string.
        s = utility.collect_yield(tree)
        if not args.maxlength or len(s) <= args.maxlength:
            if not args.omitprobs:

                lpt = mypcfg.log_probability_derivation(tree)
                lpb = insider._bracketed_log_probability(tree)[mypcfg.start]
                if args.omitinside:
                    outf.write("%e %e " % (lpt, lpb))
                else: