Example #1
    parser.add_argument(
        '--outgrammars',
        type=str,
        nargs='*',
        help='output filenames of grammars to be unkified (optional; default just adds .unk to the end of each input filename)')

    args = parser.parse_args()
    unk = args.unk
    if args.data:
        print("Unkifying using file.")
        myunk = Unkifier(args.filename, args.unk, args.mincount)
        frequent = myunk.frequent
    else:
        print("Using first pcfg, ", args.ingrammars[0])
        target = wcfg.load_wcfg_from_file(args.ingrammars[0])
        if args.minexpectation > 0:
            frequent = set(target.frequent_terminals(args.minexpectation))
        elif args.vocabsize > 0:
            frequent = target.most_frequent_terminals(args.vocabsize)
            rarest = frequent[-1]
            print("rarest", rarest, "expectation",
                  target.terminal_expectations()[rarest])
            frequent = set(frequent)
        else:
            raise ValueError("No option set")

    if args.outgrammars and len(args.outgrammars) == len(args.ingrammars):
        outg = args.outgrammars
    else:
        outg = [f + ".unk" for f in args.ingrammars]
Example #2
#convert_wcfg_to_pcfg.py

import utility
import wcfg
import argparse

parser = argparse.ArgumentParser(
    description='Convert a grammar from a potentially inconsistent BUWCFG to a PCFG that defines the same conditional distribution of trees given strings.'
)
parser.add_argument('input', type=str, help='filename of input grammar')
parser.add_argument('output', type=str, help='filename of output grammar')

args = parser.parse_args()

mywcfg = wcfg.load_wcfg_from_file(args.input)

if not mywcfg.is_convergent():
    print("Renormalising divergent WCFG")
    mywcfg = mywcfg.renormalise_divergent_wcfg2()
    print(mywcfg.check_local_normalisation())
    print(mywcfg.compute_partition_function_fp())

    #print(mywcfg.compute_partition_function_fast())
assert mywcfg.is_convergent()
mywcfg.renormalise()
mywcfg.store(args.output)
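For intuition, the renormalisation step follows the standard construction (often attributed to Chi, 1999): compute each nonterminal's partition value Z(A), the total weight of trees rooted at A, then scale every rule by the Z values of the nonterminals involved. A minimal self-contained sketch on a toy convergent grammar; the dict-based representation is illustrative, not the wcfg module's:

from collections import defaultdict

# rule (lhs, rhs) -> weight; lowercase symbols are terminals
rules = {
    ("S", ("S", "S")): 0.4,
    ("S", ("a",)): 0.5,
}

def partition(rules, iters=100):
    # fixed point of Z(A) = sum over rules A -> rhs of w * prod Z(B),
    # where B ranges over the nonterminals in rhs
    nts = {lhs for lhs, _ in rules}
    Z = {A: 0.0 for A in nts}
    for _ in range(iters):
        newZ = defaultdict(float)
        for (lhs, rhs), w in rules.items():
            term = w
            for sym in rhs:
                if sym in nts:
                    term *= Z[sym]
            newZ[lhs] += term
        Z = dict(newZ)
    return Z

def renormalise(rules):
    # p(A -> rhs) = w(A -> rhs) * prod Z(B) / Z(A); sums to 1 per lhs
    nts = {lhs for lhs, _ in rules}
    Z = partition(rules)
    return {(lhs, rhs): w / Z[lhs] * __import__("math").prod(
                Z[sym] for sym in rhs if sym in nts)
            for (lhs, rhs), w in rules.items()}

print(renormalise(rules))  # the two S-rules' probabilities sum to 1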
Example #3
parser.add_argument(
    '--min_count_nmf',
    type=int,
    default=100,
    help="Minimum frequency of words that can be considered to be anchors for nonterminals (default 100)")

parser.add_argument('--verbose',
                    action="store_true",
                    help="Print out some useful information")

args = parser.parse_args()
ll = locallearner.LocalLearner(args.input)
if args.cheat:
    target_pcfg = wcfg.load_wcfg_from_file(args.cheat)
    n = len(target_pcfg.nonterminals)
    print(f"Number of nonterminals {n}")
    ll.nonterminals = n

else:
    ll.nonterminals = args.nonterminals

ll.seed = args.seed
ll.number_clusters = args.number_clusters
ll.min_count_nmf = args.min_count_nmf

kernels = ll.find_kernels(verbose=args.verbose)
with open(args.output, 'w') as outf:
    json.dump(kernels, outf)
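For context, --min_count_nmf exists because the context distribution of a rare word cannot be estimated reliably enough for it to serve as an anchor. A rough sketch of that frequency filter (illustrative names; the real locallearner internals are not shown here):

from collections import Counter

def candidate_anchors(corpus, min_count=100):
    # corpus: iterable of tokenised sentences; keep only words frequent
    # enough for their context vectors (the NMF input) to be reliable
    counts = Counter(w for sentence in corpus for w in sentence)
    return {w for w, c in counts.items() if c >= min_count}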
Example #4
parser.add_argument("--seed", help="Choose random seed", type=int)
parser.add_argument("--maxlength",
                    help="limit samples to this length",
                    type=int)
parser.add_argument("--omitprobs",
                    help="don't compute probabilities",
                    action="store_true")
parser.add_argument("--yieldonly",
                    help="just output the yield",
                    action="store_true")

## Other options: control output format, what probs are calculated.

args = parser.parse_args()

mypcfg = wcfg.load_wcfg_from_file(args.inputfilename)

if args.seed is not None:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)
else:
    prng = RandomState()

mysampler = wcfg.Sampler(mypcfg, random=prng)
insider = wcfg.InsideComputation(mypcfg)

with open(args.outputfilename, 'w') as outf:
    i = 0
    while i < args.n:
        tree = mysampler.sample_tree()
        # default is string.
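        # --- sketch of a possible completion: the original snippet is truncated
        # here; collect_yield is a hypothetical helper, and inside_log_probability
        # is an assumed API of InsideComputation, so treat this as illustrative ---
        s = collect_yield(tree)
        if args.maxlength and len(s) > args.maxlength:
            continue  # resample strings that exceed the length limit
        line = " ".join(s)
        if not (args.yieldonly or args.omitprobs):
            line += "\t%f" % insider.inside_log_probability(s)
        outf.write(line + "\n")
        i += 1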
Example #5
parser.add_argument('--json', type=str, help='location of the output json file if needed.')
parser.add_argument('--seed', help='Choose random seed', type=int)
parser.add_argument('--length', type=int, default=10, help='length to measure the string density at.')
parser.add_argument('--samples', type=int, default=1000, help='samples to measure the string density.')
parser.add_argument('--maxlength', type=int, default=20, help='limit on the length of strings when measuring ambiguity.')

args = parser.parse_args()


if args.seed is not None:
	random.seed(args.seed)
	numpy.random.seed(args.seed)

verbose = False
result_dict = {}
target_pcfg = wcfg.load_wcfg_from_file(args.input)
target_ambiguity = target_pcfg.estimate_ambiguity(samples = args.samples, maxlength = args.maxlength)
result_dict["ambiguity"] =  target_ambiguity
print("Target grammar ambiguity H( tree | word): %e" % target_ambiguity)
## Now try string denisyt using a sensible approach.
us = uniformsampler.UniformSampler(target_pcfg, args.length)
sd = us.string_density(args.length,args.samples)
print("String density: %e" % sd)
result_dict["string density"] = sd


naivesd = target_pcfg.estimate_string_density(args.length, args.samples)
print("Naive String density: %e" % naivesd)
result_dict["naive string density"] = naivesd

try:
Example #6
parser.add_argument("--seed", help="Choose random seed", type=int)
parser.add_argument("--verbose",
                    help="Print useful information",
                    action="store_true")

parser.add_argument('input', type=str, help='location of the target pcfg.')
parser.add_argument('output', type=str, help='location of the output wcfg.')

args = parser.parse_args()

if args.seed is not None:
    random.seed(args.seed)
    numpy.random.seed(args.seed)

oracle_learner.N_SAMPLES = args.nsamples
oracle_learner.MAX_SAMPLES = args.maxsamples

i = args.input
target_pcfg = wcfg.load_wcfg_from_file(i)
logging.info("Loaded")
output_wcfg = args.output

ol = oracle_learner.OracleLearner(target_pcfg)
og = ol.test()
if og:
    og.store(output_wcfg)
else:
    ## create an empty file
    logging.warning("Error: Unanchored, empty wcfg")
    open(output_wcfg, 'a').close()
Example #7
parser.add_argument("--batchsize",type=int,default=10000,help="Samples in batch size (default 10000)")
parser.add_argument("--maxbatches",type=float,default=math.inf,help="Number of batches (default is all of them)")
parser.add_argument("--alpha",type=float,default=0.75,help="Alpha parameter eta_k = (k+2)^{-alpha}, default 0.75")


import glob
args = parser.parse_args()
bsz = args.batchsize
batches = args.maxbatches
alpha = args.alpha
epochs = args.epochs

## Maybe set the parameters intelligently with respect to the size of the data etc.
## Create temporary directory

mywcfg = wcfg.load_wcfg_from_file(args.grammar)


tmpdir = mkdtemp()

#with TemporaryDirectory() as tmpdir:

print("Creating temp directory ", tmpdir)

## Convert file to MJIO format


mjio_filename1 = tmpdir + "/igrammar.mjio"

mjio_counts = tmpdir + "/ogrammar.counts"
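For reference, the step-size schedule named in the --alpha help text is simply:

def eta(k, alpha=0.75):
    # learning rate for batch k: eta_k = (k + 2)^(-alpha)
    return (k + 2) ** -alpha

Smaller alpha keeps the step sizes large for longer; alpha = 0.75 sits in the usual (0.5, 1] range for stochastic approximation.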
Example #8
import wcfg
import argparse
import evaluation
import logging

parser = argparse.ArgumentParser(description='Map the labels of the first grammar onto the second grammar and save it')

parser.add_argument('target', type=str, help='filename of grammar with the right nonterminals')
parser.add_argument('hypothesis', type=str, help='filename of grammar to be relabeled.')

parser.add_argument('output', type=str, help='filename of output grammar (isomorphic to the hypothesis)')
parser.add_argument('--samples', type=int, default=1000, help='Number of samples to use (default 1000)')
parser.add_argument('--verbose', action="store_true",  help='Print out relabelling')

args = parser.parse_args()

target = wcfg.load_wcfg_from_file(args.target)
hypothesis = wcfg.load_wcfg_from_file(args.hypothesis)

minn = min(target.nonterminal_expectations().values())
if args.samples < 10 / minn:
	logging.warning("May be too few samples to estimate the bijection reliably, since the minimum nonterminal expectation is %f", minn)
mapping = evaluation.estimate_bijection(target, hypothesis, args.samples)

if args.verbose:
	for a,b in mapping.items():
		print(a,"->",b)
## relabel is given the inverse of the estimated mapping
output = hypothesis.relabel({a: b for b, a in mapping.items()})

output.store(args.output)
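One plausible way to realise estimate_bijection, for intuition only (the evaluation module's actual method is not shown here): parse the same sampled sentences with both grammars, count how often a target label and a hypothesis label cover the same span, and match labels greedily by count:

from collections import Counter

def greedy_bijection(cooccurrence):
    # cooccurrence: Counter over (target_label, hypothesis_label) pairs,
    # e.g. gathered from aligned spans of parses of shared sample sentences
    mapping, used = {}, set()
    for (a, b), _ in cooccurrence.most_common():
        if a not in mapping and b not in used:
            mapping[a] = b
            used.add(b)
    return mapping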
Example #9
import argparse
import evaluation
import utility
import math
import sys
import json
import wcfg
from collections import defaultdict

parser = argparse.ArgumentParser(description='Evaluate kernels against a target pcfg')

parser.add_argument('target', type=str,help='filename of original (gold) grammar')
parser.add_argument('kernels', help='filename of kernels')
parser.add_argument('--json', help='filename of json file')
args = parser.parse_args()
scores = {}
target = wcfg.load_wcfg_from_file(args.target)

with open(args.kernels, 'r') as inf:
	for line in inf:
		## each line holds one JSON value; the last line read is the one used
		kernels = json.loads(line)

results = set()
product = 1.0
te = target.terminal_expectations()
for a in kernels:
	if a == 'S':
		print("Skipping S")
		results.add('S')
	else:
		x = target.find_best_lhs(a)
Example #10

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Estimate hyperparameters for a learner from a PCFG. Don't use this on a grammar you are going to learn, of course."
    )
    parser.add_argument('input', type=str, help='filename of input grammar')
    #	parser.add_argument('output', type=str, help='filename of output json file')
    parser.add_argument("--posterior",
                        help="Kernel posterior (default 0.9)",
                        default=0.9,
                        type=float)

    args = parser.parse_args()
    target = wcfg.load_wcfg_from_file(args.input)
    te = target.terminal_expectations()
    pe = target.production_expectations()
    ## Hyperparams
    threshold = args.posterior
    ## Samples needed
    ## Number of clusters
    ## Min count

    ## Number of clusters.

    number_clusters = 2 * len(target.nonterminals)

    result = {}
    result["kernel"] = True