def pcfg_data_likelihood(cfg_path, weights, data, counts, epsilon=1e-10):
    """Compute the log-likelihood of the real programs dataset 
    using PCFG with user-specified weights.

    @param cfg_path: string
                     path to PCFG dump
    @param weights: np.array
                    parameters of CFG.
    @param data: list of code segments
                 each code segment is a list of strings (space-sep)
    @param counts: each data point is not weighted equally
                   we weight by occurrence
    @param epsilon: default to use for empty trees [default: 1e-10]
    @return log_lik: float
                     log likelihood of dataset.
    """
    # space of possible integers (some of the language
    # requires a countably infinite number of possibilities).
    # we only care about encoding the real program-space so
    # we only explicitly model the integers in the real set.
    integer_domain = get_integer_domain(data)
    pcfg = build_pcfg(cfg_path, weights, integer_domain, True)
    parser = ViterbiParser(pcfg)
    log_like = 0
    missing = 0
    for i, (code, cnt) in enumerate(zip(data, counts)):
        generator = parser.parse(code)
        tree = next(generator, None) if generator is not None else None
        if tree is not None:
            ll = tree.logprob()
        else:  # this program is not covered by the pCFG
            ll = np.log(epsilon)
            missing += 1
        # weight each program's log-probability by its occurrence count
        log_like += ll * cnt
    return log_like
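
A minimal usage sketch for pcfg_data_likelihood; the data-loading helpers are the ones imported in sanity_test further down, and the dataset identifiers are illustrative only:

def example_likelihood():
    # Assumes the same helpers as sanity_test (get_codeorg_data_root,
    # get_pcfg_params, get_pcfg_path, load_real_asts) are importable here.
    data_root = get_codeorg_data_root(1, 'raw')
    theta = get_pcfg_params(1, author='teacher', random=False)
    cfg_path = get_pcfg_path(1, author='teacher')
    data, counts = load_real_asts(data_root, 1, True)
    return pcfg_data_likelihood(cfg_path, theta, data, counts)
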
def main(config):
    grammar_string = parse_induced_grammar(config.grammar)

    if config.output:
        with open(config.output, 'w') as f:
            f.write(grammar_string)
    grammar = PCFG.fromstring(grammar_string)
    grammar._start = Nonterminal('TOP')  # Not sure whether this is allowed or breaks things

    if config.textfile:
        # Create directory for parse trees if it does not already exist
        if not os.path.exists(config.output_parse):
            os.makedirs(config.output_parse)
        parser = ViterbiParser(grammar)
        with open(config.textfile, 'r') as f:
            lines = f.read().splitlines() 
        for i, line in enumerate(lines):
            if i==config.number_parses:
                break
            print(f"Parsing sentence {i+1}")
            sent = line.split()
            for t in parser.parse(sent):
                TreeView(t)._cframe.print_to_file(f"{config.output_parse}/tree_{i}")
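
The config object above is only known to expose grammar, output, textfile, output_parse and number_parses; a possible argparse wiring (the flag names and defaults here are assumptions, not part of the original script) would be:

if __name__ == '__main__':
    import argparse
    # Hypothetical CLI; only the attribute names used by main() are taken
    # from the code above, the flags and defaults are guesses.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--grammar', required=True)
    arg_parser.add_argument('--output', default=None)
    arg_parser.add_argument('--textfile', default=None)
    arg_parser.add_argument('--output_parse', default='parse_trees')
    arg_parser.add_argument('--number_parses', type=int, default=10)
    main(arg_parser.parse_args())
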
Example #3
def parse_command(self, seqs, keep=3):
    non_terminals = get_nonterminals(self._pcfg)
    viterbi = ViterbiParser(self._pcfg)
    for seq, id in seqs:
        curr_trees = []
        for parse_option in get_parse_options(seq, non_terminals):
            try:
                for t in viterbi.parse(parse_option):
                    curr_trees.append((t, parse_option))
            except ValueError:
                print(parse_option)
        print(curr_trees)
        curr_trees = sorted(curr_trees, key=lambda tree: -tree[0].prob())
        print(seq, sum([tree[0].prob() for tree in curr_trees]),
              len(curr_trees))
        if keep != -1:
            curr_trees = curr_trees[:keep]
        print('now', len(curr_trees))
        for tree, parse_option in curr_trees:
            self._parsed_trees.append((parse_option, tree, id))
    print(len(seqs), len(self._parsed_trees))
    trees = [(tree[0], tree[1], tree[2]) for tree in self._parsed_trees]
    output_files = []
    for i, (option, tree, ind) in enumerate(trees):
        a = save_tree(tree,
                      None,
                      'parse{}'.format(i),
                      postscript=False,
                      prob=tree.prob(),
                      csb_id=ind)
        output_files.append(a)
    merge_pdfs(output_files, 'merged_parse.pdf')
Example #4
def parse(parser: ViterbiParser, sentence):
    start_time = time.time()
    parser.trace(trace=1)
    for tree in parser.parse(sentence):
        print(tree)
        print(
            f"Time elapsed for sentence of length {len(sentence)}: {time.time() - start_time}"
        )
Example #5
def parsing(sample, g):
    from nltk.parse.viterbi import ViterbiParser
    from nltk.draw.tree import draw_trees
    parser = ViterbiParser(g)
    for s in sample:
        print(" ".join(s))
        # ViterbiParser.parse returns an iterator of trees; take the best one
        t = next(parser.parse(s), None)
        if t is not None:
            print(t.logprob())
Example #6
def parse_treebank(parser: ViterbiParser, sentences):
    start_time = time.time()
    parser.trace(trace=1)
    for sentence in treebank.parsed_sents(sentences[:3]):
        tokens = sentence.leaves()
        for tree in parser.parse(tokens):
            print(tree)
            print(
                f"Time elapsed for sentence of length {len(tokens)}: {time.time() - start_time}"
            )
def test_PCFG(grammar, shapes=False):
    ''' Test whether the grammar can parse a sentence '''
    #sent = [i.replace("'","") for i in TERMINALS[:5]]
    #sent = "in the middle center is a green square".split()
    if not shapes:
        sent = "2 2 2 12 2 12 2 2 12 2".split()
    else:
        sent = "in the middle center is a green square".split()
    sr = ViterbiParser(grammar)
    for t in sr.parse(sent):
        t.draw()
Example #8
def apply_adaptor_grammar(target, source, env):
    """Apply an existing adaptor grammar model to new data.

    One of py-cfg's outputs is essentially a PCFG: this builder formats this a bit, then
    loads it as an NLTK PCFG, which is then applied to the provided word list to get new
    segmentations.  Note: the NLTK implementation is very slow, you may want to look into
    using one of Mark Johnson's other code bases, "cky.tbz", which is very fast and accepts
    a similar format to the py-cfg output.

    Sources: py-cfg grammar file, word list
    Targets: segmented word list
    """
    rules = {}
    nonterminals = set()
    with meta_open(source[0].rstr()) as ifd:
        for line in ifd:
            m = re.match(r"^(\S+)\s(\S+) --> (.*)$", line)
            if m:
                count = float(m.group(1))
                lhs = m.group(2)
                nonterminals.add(lhs)
                rhs = tuple(m.group(3).strip().split())
                rules[lhs] = rules.get(lhs, {})
                rules[lhs][rhs] = count
            else:
                m = re.match(r"^\((\S+)#\d+ (.*)$", line)
                if not m:
                    continue
                lhs = m.group(1)
                rhs = tuple(re.sub(r"\(\S+", "", m.group(2)).replace(")", "").strip().split())
                rules[lhs] = rules.get(lhs, {})
                rules[lhs][rhs] = rules[lhs].get(rhs, 0) + 1
    productions = []
    for lhs, rhss in rules.items():
        total = sum(rhss.values())
        for rhs, c in rhss.items():
            mrhs = []
            for x in rhs:
                if x in nonterminals:
                    mrhs.append(Nonterminal(x))
                else:
                    mrhs.append(x)
            productions.append(ProbabilisticProduction(Nonterminal(lhs), mrhs, prob=(float(c) / total)))
    pcfg = PCFG(Nonterminal("Word"), productions)
    parser = ViterbiParser(pcfg)
    with meta_open(source[1].rstr()) as ifd:
        items = [l.strip().split() for l in ifd]
    with meta_open(target[0].rstr(), "w") as ofd:
        parsed = parser.parse_sents(items)
        for tree in [next(x) for x in parsed]:
            # leaves are hex-encoded code points; "^^^"/"$$$" are boundary markers
            toks = ["".join(chr(int(y, base=16))
                            for y in x.leaves() if y not in ["^^^", "$$$"])
                    for x in tree]
            toks = [z for z in toks if z != ""]
            if len(toks) == 1:
                ofd.write("%s\n" % (toks[0]))
            else:
                ofd.write(" ".join(["%s+" % toks[0]] + ["+%s+" % x for x in toks[1:-1]] + ["+%s" % toks[-1]]) + "\n")
    return None
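
For reference, a quick way to check the rule-line format the first regex above expects (the sample line is illustrative; real py-cfg output may differ in detail):

import re

sample = "12.0 Word --> Syllable Word"
m = re.match(r"^(\S+)\s(\S+) --> (.*)$", sample)
print(m.groups())  # ('12.0', 'Word', 'Syllable Word')
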
Example #9
def run_parser(corpus):
    """
	Runs the parser on a corpus.
	@param corpus: List of lists with input tokens
	"""
    for sentence in corpus:
        grammar = getGrammar(sentence)
        parser = Parser(grammar)
        sent = splitSentence(sentence)
        tree = parser.parse(sent)
        # tree.draw()
        # print tree.pprint(margin=30)
        extractDepParse(tree, sentence)
Example #10
def overgeneration_coverage(pcfg, L, num_samples):
    """
    Test the overgeneration coverage with num_samples random messages of length L.
    Returns the percentage of successful parses.
    """
    parser = ViterbiParser(pcfg)

    parse_total = 0  # Total number of messages tried to parse
    parse_success = 0  # Total number successfully parsed

    # Get the random messages
    vocabulary = get_terminals(pcfg)
    for i in range(0, num_samples):
        message = sample_message(L, vocabulary)
        parse_total += 1
        try:
            if parser.parse_one(message):
                parse_success += 1
        except ValueError:
            continue
    return parse_success / parse_total * 100
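
The get_terminals and sample_message helpers used above are not shown in this example; plausible stand-ins (assumptions, not the original implementations) look like this:

import random

def get_terminals(pcfg):
    # any right-hand-side symbol that is a plain string is a terminal
    return {sym for prod in pcfg.productions()
            for sym in prod.rhs() if isinstance(sym, str)}

def sample_message(L, vocabulary):
    # uniformly sample L tokens with replacement
    return [random.choice(list(vocabulary)) for _ in range(L)]
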
def sanity_test():
    """Unit Test to make sure this stuff is working.
    This function should NOT break.
    """
    from ..rubric_utils.load_params import (
        get_pcfg_params,
        get_pcfg_path,
        get_codeorg_data_root,
    )

    data_root = get_codeorg_data_root(1, 'raw')
    theta = get_pcfg_params(1, author='teacher', random=False)
    cfg_path = get_pcfg_path(1, author='teacher')

    data, counts = load_real_asts(data_root, 1, True)
    integer_domain = get_integer_domain(data)
    # CKY parser for p-cfgs...
    pcfg = build_pcfg(cfg_path, theta, integer_domain, False)
    parser = ViterbiParser(pcfg)
    generator = parser.parse(['Move', '(', '50', ')'])
    tree = next(generator)
    # print(tree.logprob())
    print(tree)
Example #12
def main():
    # train = treebank.fileids()[:190]
    test = treebank.fileids()[190:]  # last treebank files held out for testing

    # original grammar
    # pcfg = induce_grammar(train)
    # pickle.dump(pcfg, open("grammar.pcfg", 'wb'))

    # load grammar
    # pcfg : PCFG = pickle.load(open("grammar.pcfg", 'rb'))

    # fill in missing words
    # missing_words = get_missing_words(pcfg, test)
    # pcfg_unk = fill_missing_words(pcfg, missing_words)

    # pickle.dump(pcfg_unk, open("grammar_unk.pcfg", 'wb'))

    # load unk grammar
    pcfg_unk: PCFG = pickle.load(open("grammar_unk.pcfg", 'rb'))

    # use unk grammar on test sentences
    parser = ViterbiParser(pcfg_unk)
    parse_treebank(parser, test)
Example #13
def analyse_viterbi(pcfg, messages):
    """
        Infers the Viterbi parses of the fixed induction set, split induction set and evaluation set
        Writes parses to txt file
        Computes message likelihood, tree diversity and evaluation coverage
        Writes these properties to a pickle file
        Returns a list of strings for summarized properties
        """

    # Get terminals
    prods_lexical = [
        prod for prod in pcfg.productions() if type(prod.rhs()[0]) == str
    ]
    terminals = set([prod.rhs()[0] for prod in prods_lexical])

    # Compute message likelihoods and tree depth
    parser = ViterbiParser(pcfg)
    message_count = len(messages)
    message_count_quarter = int(np.ceil(message_count / 4))
    lines_parse = []
    trees = []
    tree_depths = []
    logprobs = []
    failed_parses = []
    parsed_count_weighted = 0
    for i, sent in enumerate(messages):
        sent = list(sent)
        if all(sym in terminals for sym in sent):
            tree_list = list(parser.parse(sent))
            # if the message can be parsed, tree_list contains exactly one tree
            if len(tree_list) == 1:
                tree = tree_list[0]
                parse = to_parse_string(tree)
                trees.append(parse)
                tree_depths.append(tree_depth(tree))
                # convert the tree's natural-log probability to log base 2
                # (for description length)
                logprobs.append(tree.logprob() / np.log(2))
            else:
                parse = "NO_PARSE"
                logprobs.append(None)
                tree_depths.append(None)
                failed_parses.append(sent)
        else:
            parse = "NO_PARSE"
            logprobs.append(None)
            tree_depths.append(None)
            failed_parses.append(sent)

    # Compute final statistics
    parsed_count = len(ignore_none(logprobs))
    unparsed_count = message_count - parsed_count

    # Collect evaluation information (of unique messages)
    eval_stats = {
        'log2likelihoods': logprobs,  # corresponds to {data: frequencies}
        'unparsed_count': unparsed_count,
        'parsed_count': parsed_count,
        'failedparses': failed_parses,
    }

    # Evaluation coverage
    coverage = parsed_count / len(messages)
    eval_stats['coverage'] = coverage * 100
    eval_stats['average_log2likelihood'] = (
        mean(ignore_none(logprobs)) if parsed_count else float('nan')
    )

    return eval_stats
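
A quick way to exercise analyse_viterbi on a toy grammar (illustrative only, and assuming the module-level helpers it relies on, such as to_parse_string, tree_depth, ignore_none and mean, are available):

from nltk import PCFG

toy_pcfg = PCFG.fromstring("""
    S -> A B [0.7] | A A [0.3]
    A -> 'a' [1.0]
    B -> 'b' [1.0]
""")
stats = analyse_viterbi(toy_pcfg, [('a', 'b'), ('a', 'a'), ('a', 'c')])
print(stats['coverage'], stats['average_log2likelihood'])
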
Example #14
def pcfg_learn(treebank, n):
    # NOTE: signature and outer loop inferred from the pcfg_learn(treebank, 400)
    # call below; the original function header was missing here.
    productions = list()
    for i in range(n):
        for tree in treebank.parsed_sents()[:i+1]:
            chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
            prod_gen = tree_to_productions(tree)
            tree_to_append = next(prod_gen)
            while tree_to_append:
                productions.append(tree_to_append)
                try:
                    tree_to_append = next(prod_gen)
                except Exception as e:
                    tree_to_append = False
    productions = get_productions(productions)
    return PCFG(Nonterminal('S'), productions)


pcfg_training = pcfg_learn(treebank, 400)
parser = ViterbiParser(pcfg_training)


def get_list_of_labelled_constituents(parse_tree, lst=None, first_index=None, last_index=None):
    if lst is None:
        return get_list_of_labelled_constituents(parse_tree, list(), 0, len(parse_tree.leaves()) - 1)
    if not len(lst):
        # record the root constituent on the initial call
        lst.append((parse_tree.label(), first_index, last_index))
    if len(list(parse_tree.subtrees())) == 1:
        return lst
    for child in parse_tree:
        labelled_constituents = (child.label(), first_index, first_index + len(child.leaves()) - 1)
        lst.append(labelled_constituents)
        get_list_of_labelled_constituents(child, lst, first_index, first_index + len(child.leaves()) - 1)
        first_index += len(child.leaves())
    return lst
Example #15
def main(test=False):
    """
    makes a big dumb PTB CFG, and ShiftReduceParser, and a ViterbiParser, and
    serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0 ## ARBITRARY
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):

        productions = []
        add_dict = {}

        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):

            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:

                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:

                    # remove all tags from the Nonterminals on the RHS
                    if type(item) == nltk.grammar.Nonterminal:
                        rhs.append(preprocess_nt(item))

                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')

                    # items not in dictionary replaced with <unk>
                    # dictionary requires lower
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')

                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')

                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())

                production._rhs = tuple(rhs)

                if not is_key(add_dict, production.unicode_repr()):
                    add_dict[production.unicode_repr()] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            f.write(pickle.dumps(grammar))

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time

        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(viterbi_parser))
    else:
        # load the previously pickled parser so test mode below still works
        with open('parsers/viterbi_parser.pkl', 'rb') as f:
            viterbi_parser = pickle.load(f)

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time

        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(shift_reduce_parser))
    else:
        with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
            shift_reduce_parser = pickle.load(f)

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        for sample in [1, 23, 20330, 20332, 443]:

            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi      = {:.2f} sec for {} words'.format(
                t2-t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2-t1, len(data[sample].split())))
Example #16
import pickle
import time

from nltk.corpus import treebank
from nltk.parse import ViterbiParser

# Benchmarks the speed of the Viterbi parser


def parse(parser: ViterbiParser, sentence):
    start_time = time.time()
    parser.trace(trace=1)
    for tree in parser.parse(sentence):
        print(tree)
        print(
            f"Time elapsed for sentence of length {len(sentence)}: {time.time() - start_time}"
        )


# load unk grammar
pcfg_unk = pickle.load(open("grammar_unk.pcfg", 'rb'))

# use unk grammar on test sentences
parser = ViterbiParser(pcfg_unk)

# one sentence
test = treebank.fileids()[190:]
first_sentence = None
for sentence in treebank.parsed_sents(test):
    first_sentence = sentence.leaves()
    break

for i in range(1, 7):
    parse(parser, first_sentence[:i])
Example #17
def parse(parser: ViterbiParser, sentence):
    for tree in parser.parse(sentence):
        yield tree