def main(config):
    """Induce a PCFG from config.grammar, optionally dump it, then
    Viterbi-parse sentences from config.textfile and render each tree
    to a file under config.output_parse."""
    grammar_text = parse_induced_grammar( config.grammar )

    if config.output:
        with open(config.output, 'w') as out:
            out.write(grammar_text)

    grammar = PCFG.fromstring( grammar_text )
    grammar._start = Nonterminal('TOP') # Not sure whether this is allowed or breaks things

    if config.textfile:
        # Create directory for parse_trees if it does not already exist
        if not os.path.exists(config.output_parse):
            os.makedirs(config.output_parse)

        parser = ViterbiParser(grammar)
        with open(config.textfile, 'r') as src:
            sentences = src.read().splitlines()
        for idx, sentence in enumerate(sentences):
            # Stop after the configured number of parses.
            if idx == config.number_parses:
                break
            print(f"Parsing sentence {idx+1}")
            tokens = sentence.split()
            for tree in parser.parse(tokens):
                TreeView(tree)._cframe.print_to_file(f"{config.output_parse}/tree_{idx}")
def pcfg_data_likelihood(cfg_path, weights, data, counts, epsilon=1e-10):
    """Compute the log-likelihood of the real programs dataset
    using PCFG with user-specified weights.

    @param cfg_path: string
                     path to PCFG dump
    @param weights: np.array
                    parameters of CFG.
    @param data: list of code segments
                 each code segment is a list of strings (space-sep)
    @param counts: each data point is not weighted equally
                   we weight by occurrence
    @param epsilon: probability assigned to programs not covered by the
                    PCFG [default: 1e-10]
    @return log_lik: float
                     log likelihood of dataset.
    """
    # space of possible integers (some of the language
    # requires a countably infinite number of possiblilities.
    # we only care about encoding the real program-space so
    # we only explicitly model the integers in the real set.
    integer_domain = get_integer_domain(data)
    pcfg = build_pcfg(cfg_path, weights, integer_domain, True)
    parser = ViterbiParser(pcfg)
    log_like = 0
    missing = 0
    for code, cnt in zip(data, counts):
        # BUGFIX: ViterbiParser.parse() returns a generator and is never
        # None, so the original `if generator is not None` test always took
        # the success branch yet never accumulated into log_like (the only
        # accumulation sat in the unreachable else branch). Probe the
        # generator with next() and catch the failure cases explicitly.
        try:
            tree = next(parser.parse(code))
            ll = tree.logprob()
        except (StopIteration, ValueError):
            # this program is not covered by the pCFG: back off to epsilon
            ll = np.log(epsilon)
            missing += 1
        # weight each program's log-likelihood by its occurrence count
        log_like += ll * cnt
    return log_like
Beispiel #3
0
 def parse_command(self, seqs, keep=3):
     """Viterbi-parse each (sequence, id) pair, keep the `keep` most
     probable trees per sequence, render every kept tree, and merge the
     renderings into 'merged_parse.pdf'.

     keep=-1 retains all candidate trees.
     """
     nonterms = get_nonterminals(self._pcfg)
     parser = ViterbiParser(self._pcfg)
     for seq, seq_id in seqs:
         candidates = []
         for option in get_parse_options(seq, nonterms):
             try:
                 for tree in parser.parse(option):
                     candidates.append((tree, option))
             except ValueError:
                 print(option)
         print(candidates)
         # Most probable first.
         candidates.sort(key=lambda pair: -pair[0].prob())
         print(seq, sum([pair[0].prob() for pair in candidates]),
               len(candidates))
         if keep != -1:
             candidates = candidates[:keep]
         print('now', len(candidates))
         for tree, option in candidates:
             self._parsed_trees.append((option, tree, seq_id))
     print(len(seqs), len(self._parsed_trees))
     records = [(opt, tr, csb) for opt, tr, csb in self._parsed_trees]
     outputs = []
     for idx, (opt, tr, csb) in enumerate(records):
         rendered = save_tree(tr,
                              None,
                              'parse{}'.format(idx),
                              postscript=False,
                              prob=tr.prob(),
                              csb_id=csb)
         outputs.append(rendered)
     merge_pdfs(outputs, 'merged_parse.pdf')
Beispiel #4
0
def parse(parser: ViterbiParser, sentence):
    """Viterbi-parse `sentence`, printing every tree plus elapsed time."""
    started = time.time()
    parser.trace(trace=1)
    for derivation in parser.parse(sentence):
        print(derivation)
        print(
            f"Time elapsed for sentence of length {len(sentence)}: {time.time() - started}"
        )
Beispiel #5
0
def parsing(sample, g):
    """Viterbi-parse each token sequence in `sample` under grammar `g`,
    printing the sentence and, when a parse exists, its log probability.

    @param sample: iterable of token lists
    @param g: an nltk PCFG grammar
    """
    from nltk.parse.viterbi import ViterbiParser
    from nltk.draw.tree import draw_trees
    parser = ViterbiParser(g)
    for s in sample:
        # BUGFIX: the original used Python 2 print statements, which are
        # syntax errors under Python 3. Single-argument call form prints
        # identically under both Python 2 and Python 3.
        print(" ".join(s))
        t = parser.parse(s)
        # NOTE(review): in modern NLTK, parse() returns an iterator (always
        # truthy) rather than a tree, so t.logprob() assumes the old
        # tree-returning API -- confirm the NLTK version in use.
        if t:
            print(t.logprob())
Beispiel #6
0
def parse_treebank(parser: ViterbiParser, sentences):
    """Re-parse the leaves of the first three treebank sentences with
    `parser`, printing each tree and the time elapsed so far."""
    t0 = time.time()
    parser.trace(trace=1)
    for gold in treebank.parsed_sents(sentences[:3]):
        tokens = gold.leaves()
        for tree in parser.parse(tokens):
            print(tree)
            print(
                f"Time elapsed for sentence of length {len(tokens)}: {time.time() - t0}"
            )
def test_PCFG(grammar, shapes=False):
    ''' Test whether the grammar can parse a sentence.

    With shapes=True the probe sentence is a natural-language shape
    description; otherwise it is a numeric token sequence. Each parse
    tree found is drawn in a window.
    '''
    text = "in the middle center is a green square" if shapes else "2 2 2 12 2 12 2 2 12 2"
    sent = text.split()
    viterbi = ViterbiParser(grammar)
    for tree in viterbi.parse(sent):
        tree.draw()
Beispiel #8
0
def run_parser(corpus):
    """
	Runs the parser on a corpus.
	@param corpus: List of lists with input tokens
	"""
    for raw_sentence in corpus:
        # Build a per-sentence grammar, parse, then extract dependencies.
        sentence_grammar = getGrammar(raw_sentence)
        sentence_parser = Parser(sentence_grammar)
        tokens = splitSentence(raw_sentence)
        parse_tree = sentence_parser.parse(tokens)
        extractDepParse(parse_tree, raw_sentence)
def sanity_test():
    """Unit Test to make sure this stuff is working.
    This function should NOT break.
    """
    from ..rubric_utils.load_params import (
        get_pcfg_params,
        get_pcfg_path,
        get_codeorg_data_root,
    )

    root = get_codeorg_data_root(1, 'raw')
    params = get_pcfg_params(1, author='teacher', random=False)
    grammar_path = get_pcfg_path(1, author='teacher')

    programs, program_counts = load_real_asts(root, 1, True)
    domain = get_integer_domain(programs)
    # CKY parser for p-cfgs...
    grammar = build_pcfg(grammar_path, params, domain, False)
    cky = ViterbiParser(grammar)
    best_tree = next(cky.parse(['Move', '(', '50', ')']))
    print(best_tree)
Beispiel #10
0
def analyse_viterbi(pcfg, messages):
    """Viterbi-parse each message under `pcfg` and collect statistics.

    Every message built only from the grammar's terminals is parsed with
    a ViterbiParser; successful parses contribute a parse string, a tree
    depth and a base-2 log likelihood, while failures are recorded.

    @param pcfg: an nltk PCFG with weighted productions
    @param messages: sequence of symbol sequences
    @return eval_stats: dict with per-message log2 likelihoods (None for
                        failures), parsed/unparsed counts, the failed
                        messages, coverage in percent, and the average
                        log2 likelihood (NaN when nothing parsed).
    """

    # Terminals are the RHS heads of lexical productions (plain strings).
    lexical_rules = [
        prod for prod in pcfg.productions() if type(prod.rhs()[0]) == str
    ]
    terminals = set([prod.rhs()[0] for prod in lexical_rules])

    # Compute message likelihoods and tree depth.
    parser = ViterbiParser(pcfg)
    message_count = len(messages)
    trees = []
    tree_depths = []
    logprobs = []
    failed_parses = []
    for sent in messages:
        sent = list(sent)
        parsed = False
        # Only attempt a parse when every symbol is in the vocabulary.
        if all(sym in terminals for sym in sent):
            tree_list = list(parser.parse(sent))
            # if the message can be parsed, tree_list contains one tree
            if len(tree_list) == 1:
                tree = tree_list[0]
                trees.append(to_parse_string(tree))
                tree_depths.append(tree_depth(tree))
                # convert natural logarithm from tree to log base 2 for
                # description length
                logprobs.append(tree.logprob() / np.log(2))
                parsed = True
        if not parsed:
            # Unknown symbol or no Viterbi parse: record the failure once.
            logprobs.append(None)
            tree_depths.append(None)
            failed_parses.append(sent)

    # Compute final statistics over the messages that actually parsed.
    valid_logprobs = ignore_none(logprobs)
    parsed_count = len(valid_logprobs)
    unparsed_count = message_count - parsed_count

    # Collect evaluation information (of unique messages)
    eval_stats = {
        'log2likelihoods': logprobs,  # one entry per message (None = failed)
        'unparsed_count': unparsed_count,
        'parsed_count': parsed_count,
        'failedparses': failed_parses,
    }

    # Evaluation coverage as a percentage of all messages.
    coverage = parsed_count / message_count
    eval_stats['coverage'] = coverage * 100
    # BUGFIX: the original averaged `logprobs` directly, but that list
    # contains None for every failed parse, so mean() raised TypeError
    # (and the `or float('nan')` guard could never fire, since mean raises
    # on empty input instead of returning a falsy value). Average only the
    # successful parses and fall back to NaN explicitly.
    eval_stats['average_log2likelihood'] = (
        mean(valid_logprobs) if valid_logprobs else float('nan')
    )

    return eval_stats
Beispiel #11
0
def parse(parser: ViterbiParser, sentence):
    """Yield every parse tree the parser derives for `sentence`."""
    yield from parser.parse(sentence)