Esempio n. 1
0
def conditional_unlabeled_kld(target, hypothesis, samples=1000, verbose=False):
    """
    Estimate the KL divergence between the conditional distributions over
    unlabeled trees given a string: D( P(unlabeled tree|w) || Q(unlabeled tree|w) ).

    Computed as the difference between the bracketed-tree KLD and the string
    KLD, estimated by Monte Carlo sampling of trees from the target grammar.
    """
    target_inside = inside.InsideComputation(target)
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    running_total = 0.0
    for _ in range(samples):
        tree = tree_sampler.sample_tree()
        sentence = utility.collect_yield(tree)

        p_tree = target_inside.inside_bracketed_log_probability(tree)
        q_tree = hypothesis_inside.inside_bracketed_log_probability(tree)
        p_string = target_inside.inside_log_probability(sentence)
        q_string = hypothesis_inside.inside_log_probability(sentence)

        running_total += (p_tree - q_tree) - (p_string - q_string)
        if verbose:
            logging.info("%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f",
                         sentence, p_tree, p_string, q_tree, q_string)
    return running_total / samples
Esempio n. 2
0
def labeled_exact_match(target,
                        hypothesis,
                        samples=1000,
                        test_viterbi=False,
                        verbose=False):
    """
    Estimate the proportion of sampled trees whose viterbi parse under the
    hypothesis grammar equals the target tree, up to a relabeling of the
    hypothesis nonterminals.

    If test_viterbi is true, compare against the viterbi parse under the
    target grammar instead of the sampled tree itself.

    SLOW
    """
    if test_viterbi:
        inside_target = inside.InsideComputation(target)
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    total = 0.0
    # Mapping used to relabel hypothesis nonterminals to target nonterminals.
    ntmap = best_nonterminal_rmap(target, hypothesis, samples)
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if test_viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            relabeled_tree = utility.relabel_tree(th, ntmap)
            if relabeled_tree == t:
                total += 1
            elif verbose:
                logging.info("Mismatch in trees with parse of %s", s)
                print(relabeled_tree)
                print(t)
        except utility.ParseFailureException:
            # Fix: the lazy logging argument needs a %s placeholder, otherwise
            # the logging module raises a string-formatting error internally.
            logging.warning("Parse failure %s", s)
    return total / samples
Esempio n. 3
0
def bracketed_match(target,
                    hypothesis,
                    test_viterbi=False,
                    samples=1000,
                    verbose=False,
                    exact_match=True):
    """
    Estimate the proportion of trees whose viterbi parse has the same shape
    (unlabeled bracketing) as the original tree.

    If test_viterbi is true, compare against the viterbi parse wrt the target
    grammar rather than the sampled tree. If exact_match is true, score whole
    trees zero/one; otherwise microaverage over brackets.
    """
    inside_target = inside.InsideComputation(target)
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    total = 0.0
    ttotal = 0.0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if test_viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            if exact_match:
                num, denom = utility.zero_one_unlabeled(t, th)
            else:
                num, denom = utility.microaveraged_unlabeled(t, th)
            total += num
            ttotal += denom
            if verbose and num < denom:
                logging.info("Mismatch (%d / %d) with string %s", num, denom,
                             s)
        except utility.ParseFailureException:
            # Fix: the lazy logging argument needs a %s placeholder, otherwise
            # the logging module raises a string-formatting error internally.
            logging.warning("Parse failure %s", s)

    # NOTE(review): raises ZeroDivisionError if every sample fails to parse.
    return total / ttotal
Esempio n. 4
0
    def string_density(self, length, samples):
        """
        Return an estimate of the proportion of strings of length n that are
        in the grammar. Samples uniformly from the derivations, computes the
        number of derivations of each sampled string, and divides.
        """
        derivations = self.get_total(length)
        strings = 1.0 * self.vocab**length
        parser = wcfg.InsideComputation(self.grammar)

        harmonic_sum = 0.0
        for _ in range(samples):
            tree = self.sample(length)
            word = collect_yield(tree)
            parses = parser.count_parses(word)
            if parses == 0:
                raise ValueError("Generated a string which cannot be parsed.")
            # Accumulate 1/parses: the harmonic mean corrects for strings
            # being sampled in proportion to their number of derivations.
            harmonic_sum += 1.0 / parses

        mean_inverse = harmonic_sum / samples
        return (derivations / strings) * mean_inverse
Esempio n. 5
0
def monte_carlo(target, f, samples, max_length, seed=None):
    """
    Sample trees from the target grammar and call f(tree, index) on each
    sample whose yield is no longer than max_length.

    seed: optional integer seed for reproducible sampling.
    """
    # numpy.random.RandomState(None) seeds from OS entropy, so the explicit
    # None branch (which also used the buggy `seed == None` test) is redundant.
    rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) <= max_length:
            f(t, i)
Esempio n. 6
0
def estimate_bijection(target_pcfg,
                       hypothesis_pcfg,
                       samples=1000,
                       seed=None,
                       max_length=math.inf,
                       verbose=False):
    """
    Estimate a bijection from target nonterminals to hypothesis nonterminals
    by sampling trees from the target, bracketed-viterbi-parsing them with the
    hypothesis, counting co-occurring nonterminal pairs, and solving the
    resulting assignment problem with the Hungarian algorithm.

    Returns a dict mapping each target nonterminal to a hypothesis nonterminal.
    """
    ## Essential assumption: the two grammars have the same number of
    ## nonterminals, so a bijection exists.
    assert len(target_pcfg.nonterminals) == len(hypothesis_pcfg.nonterminals)
    # Fix: compare to None with `is`, not `==` (PEP 8).
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target_pcfg, random=rng)
    insider = wcfg.InsideComputation(hypothesis_pcfg)
    n = len(target_pcfg.nonterminals)

    # c[target_nt][hypothesis_nt] = co-occurrence count.
    c = defaultdict(Counter)

    for _ in range(samples):
        tree = sampler.sample_tree()

        s = utility.collect_yield(tree)
        if len(s) <= max_length:
            try:
                learned = insider.bracketed_viterbi_parse(tree)
                collect_nonterminal_pairs(tree, learned, c)
            except utility.ParseFailureException:
                print("Failed", s)

    # NOTE(review): raises ValueError if no sample parsed (c is empty).
    maximum_value = max(max(c2.values()) for nt, c2 in c.items())

    ## Hungarian algorithm minimises cost, so convert counts to costs by
    ## subtracting from the maximum count.
    cost_matrix = np.zeros((n, n))
    target_list = list(target_pcfg.nonterminals)
    hypothesis_list = list(hypothesis_pcfg.nonterminals)
    for i, a in enumerate(target_list):
        for j, b in enumerate(hypothesis_list):
            count = c[a][b]
            cost_matrix[i, j] = maximum_value - count
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    answer = {}
    for i, j in zip(row_ind, col_ind):
        answer[target_list[i]] = hypothesis_list[j]
    return answer
Esempio n. 7
0
def labeled_exact_match(target,
                        hypothesis,
                        samples=1000,
                        max_length=30,
                        viterbi=False,
                        verbose=False,
                        seed=None):
    """
    Proportion of trees whose viterbi parse is exactly the target tree
    (identical nonterminals, no relabeling).
    Target has to be a pcfg; hypothesis can be any WCFG.

    Samples longer than or equal to max_length are skipped; parse failures
    count against the score.
    """
    # Fix: compare to None with `is`, not `==` (PEP 8).
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    if viterbi:
        inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)

    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        # NOTE(review): `>=` also skips yields exactly max_length long,
        # unlike the `>` used by similar functions here — confirm intent.
        if len(s) >= max_length:
            continue
        n += 1
        if viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            relabeled_tree = th
            if relabeled_tree == t:
                total += 1
            elif verbose:
                logging.info("Mismatch in trees with parse of %s", s)
                print(relabeled_tree)
                print(t)
        except utility.ParseFailureException:
            # Treat a parse failure as a mismatch (not counted in total).
            print("Parse failure of %s " % s)
    return total / n
Esempio n. 8
0
def preterminal_contingency_table(target, hypothesis, samples=1000):
    """
    Build a contingency table of preterminal pairs.

    Samples trees from the target, viterbi-parses their yields with the
    hypothesis, and counts (target_preterminal, hypothesis_preterminal)
    pairs position by position. Returns a Counter keyed by those pairs.
    Parse failures are logged and skipped.
    """
    counter = Counter()
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    for i in range(samples):
        t = sampler.sample_tree()
        tut = utility.tree_to_preterminals(t)
        s = utility.collect_yield(t)
        try:
            th = inside_hypothesis.viterbi_parse(s)
        except utility.ParseFailureException:
            # Fix: the lazy logging argument needs a %s placeholder, otherwise
            # the logging module raises a string-formatting error internally.
            logging.warning("Parse failure %s", s)
            continue
        tpt = utility.tree_to_preterminals(th)
        for a, b in zip(tut, tpt):
            counter[(a, b)] += 1
    return counter
Esempio n. 9
0
def string_kld(target, hypothesis, samples=1000, verbose=False):
    """
    Monte Carlo estimate of the string KL divergence D(P_target || Q_hypothesis):
    sample trees from the target and average log P(w) - log Q(w) over their yields.
    """
    target_inside = inside.InsideComputation(target)
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    running_total = 0.0
    for index in range(samples):
        tree = tree_sampler.sample_tree()
        sentence = utility.collect_yield(tree)
        target_lp = target_inside.inside_log_probability(sentence)
        hypothesis_lp = hypothesis_inside.inside_log_probability(sentence)
        if verbose:
            logging.info("Sample %d %s, target %f, hypothesis %f", index,
                         tree, target_lp, hypothesis_lp)
        running_total += target_lp - hypothesis_lp
    return running_total / samples
Esempio n. 10
0
def conditional_kld(target,
                    hypothesis,
                    samples=1000,
                    verbose=False,
                    max_length=math.inf,
                    seed=None):
    """
    Estimate the KL divergence between the conditional probability
    distributions over trees given a string: D( P(tree|w) || Q(tree|w) ).

    This is the difference between the string KLD and the tree KLD,
    estimated by sampling trees from the target.

    Target must be a pcfg; hypothesis can be an arbitrary WCFG, not even
    convergent, with the same nonterminals. Samples longer than max_length
    are skipped; the average is over the samples actually used.
    """
    # Fix: compare to None with `is`, not `==` (PEP 8).
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            if verbose:
                print("Skipping", len(s))
            continue
        n += 1
        ptree = target.log_probability_derivation(t)
        pstring = inside_target.inside_log_probability(s)

        qtree = hypothesis.log_probability_derivation(t)
        qstring = inside_hypothesis.inside_log_probability(s)

        total += (ptree - qtree) - (pstring - qstring)
        if verbose:
            print("%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f" %
                  (s, ptree, pstring, qtree, qstring))
    # NOTE(review): raises ZeroDivisionError if every sample exceeds max_length.
    return total / n
Esempio n. 11
0
def test_coverage(target, hypothesis, samples=1000):
    """
    Sample trees from the target and return the fraction whose strings can
    be parsed by the hypothesis.

    Optimisation: try the (cheaper) bracketed parse of the tree first, and
    only fall back to parsing the flat string when that fails.
    """
    parser = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    parsed = 0.0
    for _ in range(samples):
        tree = tree_sampler.sample_tree()
        try:
            parser.bracketed_viterbi_parse(tree)
            parsed += 1
        except utility.ParseFailureException:
            try:
                sentence = utility.collect_yield(tree)
                parser.viterbi_parse(sentence)
                parsed += 1
            except utility.ParseFailureException:
                pass
    return parsed / samples
Esempio n. 12
0
# Sample args.n trees (optionally length-filtered) from mypcfg and write them,
# with optional log-probabilities, to args.outputfilename.
# NOTE(review): a seed of 0 is falsy, so `if args.seed:` silently ignores it —
# confirm whether seed 0 should be honoured.
if args.seed:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)
else:
    prng = RandomState()

mysampler = pcfg.Sampler(mypcfg, random=prng)
insider = inside.InsideComputation(mypcfg)

with open(args.outputfilename, 'w') as outf:
    i = 0
    # Rejection-sample until args.n trees pass the length filter.
    while i < args.n:
        tree = mysampler.sample_tree()
        # default output is the string yield.
        s = utility.collect_yield(tree)
        if not args.maxlength or len(s) <= args.maxlength:
            if not args.omitprobs:
                # Prefix each line with derivation / bracketed (and optionally
                # inside/string) log-probabilities in scientific notation.
                lpt = mypcfg.log_probability_derivation(tree)
                lpb = insider._bracketed_log_probability(tree)[mypcfg.start]
                if args.omitinside:
                    outf.write("%e %e " % (lpt, lpb))
                else:
                    lps = insider.inside_log_probability(s)
                    outf.write("%e %e %e " % (lpt, lpb, lps))
            if args.yieldonly:
                outf.write(" ".join(s) + "\n")
            else:
                outf.write(utility.tree_to_string(tree) + "\n")
            # Only count trees that passed the length filter.
            i += 1
Esempio n. 13
0
    def f(t, i):
        """
        Score one sampled tree t against every hypothesis parser and several
        baselines, accumulating results into the enclosing `scores` counter.

        Keys have the form "<target>:<system>:<labeling>:<metric>" where
        target is "original" (the sampled tree) or "viterbi" (the gold-grammar
        viterbi parse), system is "hypothesis<k>"/"leftbranch"/"rightbranch"/
        "random"/"gold viterbi", labeling is "labeled"/"unlabeled", and metric
        is "exact_match"/"microaveraged".

        NOTE(review): the parameter `i` (sample index) is shadowed by the
        enumerate variable in the comprehension below and is otherwise unused.
        """
        nonlocal scores
        s = utility.collect_yield(t)
        # Denominators are counted even when parsing fails below.
        scores['trees_denominator'] += 1
        scores['labeled_denominator'] += utility.count_labeled(t)
        scores['unlabeled_denominator'] += utility.count_unlabeled(t)

        gold_viterbi = inside_target.viterbi_parse(s)

        try:
            ## Viterbi/nonviterbi
            ## target, hyppothesis,left,right,random
            ## labeled unlabeled
            ## exact match / microaveraged
            hypo_viterbis = [
                (inside_hypothesis.viterbi_parse(s), "hypothesis%d" % i)
                for i, inside_hypothesis in enumerate(inside_hypotheses)
            ]
            # Baseline parses of the same yield.
            lb = baselines.make_left_branch(s)
            rb = baselines.make_right_branch(s)
            rand = baselines.make_random_labeled(s)

            for target_tree, label1 in [(t, "original"),
                                        (gold_viterbi, "viterbi")]:
                for eval_tree, label2 in hypo_viterbis + [
                    (lb, "leftbranch"), (rb, "rightbranch"), (rand, "random"),
                    (gold_viterbi, "gold viterbi")
                ]:
                    scores[
                        label1 + ":" + label2 +
                        ":labeled:exact_match"] += 1 if target_tree == eval_tree else 0
                    scores[
                        label1 + ":" + label2 +
                        ":unlabeled:exact_match"] += 1 if utility.unlabeled_tree_equal(
                            target_tree, eval_tree) else 0
                    # Microaveraged scores add numerators only; the shared
                    # denominators were accumulated above.
                    (x, n) = utility.microaveraged_labeled(
                        target_tree, eval_tree)
                    scores[label1 + ":" + label2 +
                           ":labeled:microaveraged"] += x
                    (x, n) = utility.microaveraged_unlabeled(
                        target_tree, eval_tree)
                    scores[label1 + ":" + label2 +
                           ":unlabeled:microaveraged"] += x

            # (x,n) = utility.microaveraged_labeled(t, hypo_viterbi)
            # if hypo_viterbi == t:
            # 	scores['labeled_exact_match'] += 1
            # if hypo_viterbi == gold_viterbi:
            # 	scores['labeled_exact_match_viterbi'] += 1
            # hvu = utility.tree_to_unlabeled_tree(hypo_viterbi)
            # goldu = utility.tree_to_unlabeled_tree(t)
            # if hvu == goldu:
            # 	scores['unlabeled_exact_match'] += 1
            # scores['labeled'] += x
            # scores['labeled_denominator'] += n
            # (x,n) = utility.microaveraged_unlabeled(t, hypo_viterbi)
            # scores['unlabeled'] += x
            # scores['unlabeled_denominator'] += n
            # ## Now some baselines.

            ## exact match
        except utility.ParseFailureException:
            # Parse failure: denominators above still count this sample.
            print("Parse failure of %s " % s)