def pcfg_data_likelihood(cfg_path, weights, data, counts, epsilon=1e-10):
    """Compute the log-likelihood of the real programs dataset using a PCFG
    with user-specified weights.

    @param cfg_path: string
                     path to PCFG dump
    @param weights: np.array
                    parameters of the CFG
    @param data: list of code segments;
                 each code segment is a list of strings (space-separated)
    @param counts: each data point is not weighted equally;
                   we weight by occurrence
    @param epsilon: probability assigned to programs the PCFG cannot parse
                    [default: 1e-10]
    @return log_lik: float
                     log-likelihood of the dataset
    """
    # Space of possible integers: parts of the language require a countably
    # infinite number of possibilities. We only care about encoding the real
    # program space, so we only explicitly model the integers in the real set.
    integer_domain = get_integer_domain(data)
    pcfg = build_pcfg(cfg_path, weights, integer_domain, True)
    parser = ViterbiParser(pcfg)

    log_like = 0
    missing = 0
    for code, cnt in zip(data, counts):
        try:
            tree = next(parser.parse(code))
            ll = tree.logprob()
        except (ValueError, StopIteration):
            # this program is not covered by the PCFG
            ll = np.log(epsilon)
            missing += 1
        log_like += ll * cnt
    return log_like
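# Hypothetical sketch (not part of the snippet above): one way the assumed
# `get_integer_domain` helper could look, if `data` is a list of token lists
# and integer literals appear as plain digit strings.
def get_integer_domain(data):
    """Collect the set of integer tokens that actually occur in the dataset."""
    domain = set()
    for code in data:
        for token in code:
            if token.isdigit() or (token.startswith('-') and token[1:].isdigit()):
                domain.add(int(token))
    return domain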
def main(config):
    grammar_string = parse_induced_grammar(config.grammar)

    if config.output:
        with open(config.output, 'w') as f:
            f.write(grammar_string)

    grammar = PCFG.fromstring(grammar_string)
    grammar._start = Nonterminal('TOP')  # Not sure whether this is allowed or breaks things

    if config.textfile:
        # Create directory for parse trees if it does not already exist
        if not os.path.exists(config.output_parse):
            os.makedirs(config.output_parse)

        parser = ViterbiParser(grammar)
        with open(config.textfile, 'r') as f:
            lines = f.read().splitlines()
        for i, line in enumerate(lines):
            if i == config.number_parses:
                break
            print(f"Parsing sentence {i + 1}")
            sent = line.split()
            for t in parser.parse(sent):
                TreeView(t)._cframe.print_to_file(f"{config.output_parse}/tree_{i}")
def parse_command(self, seqs, keep=3):
    non_terminals = get_nonterminals(self._pcfg)
    viterbi = ViterbiParser(self._pcfg)
    for seq, id in seqs:
        curr_trees = []
        for parse_option in get_parse_options(seq, non_terminals):
            try:
                for t in viterbi.parse(parse_option):
                    curr_trees.append((t, parse_option))
            except ValueError:
                print(parse_option)
        print(curr_trees)
        curr_trees = sorted(curr_trees, key=lambda tree: -tree[0].prob())
        print(seq, sum([tree[0].prob() for tree in curr_trees]), len(curr_trees))
        if keep != -1:
            curr_trees = curr_trees[:keep]
            print('now', len(curr_trees))
        for tree, parse_option in curr_trees:
            self._parsed_trees.append((parse_option, tree, id))
    print(len(seqs), len(self._parsed_trees))

    trees = [(tree[0], tree[1], tree[2]) for tree in self._parsed_trees]
    output_files = []
    for i, (option, tree, ind) in enumerate(trees):
        a = save_tree(tree, None, 'parse{}'.format(i), postscript=False,
                      prob=tree.prob(), csb_id=ind)
        output_files.append(a)
    merge_pdfs(output_files, 'merged_parse.pdf')
def parse(parser: ViterbiParser, sentence):
    start_time = time.time()
    parser.trace(trace=1)
    for tree in parser.parse(sentence):
        print(tree)
    print(
        f"Time elapsed for sentence of length {len(sentence)}: {time.time() - start_time}"
    )
def parsing(sample, g):
    from nltk.parse.viterbi import ViterbiParser
    from nltk.draw.tree import draw_trees

    parser = ViterbiParser(g)
    for s in sample:
        print(" ".join(s))
        # parse_one returns the single best tree, or None if s cannot be parsed
        t = parser.parse_one(s)
        if t:
            print(t.logprob())
def parse_treebank(parser: ViterbiParser, sentences):
    start_time = time.time()
    parser.trace(trace=1)
    for sentence in treebank.parsed_sents(sentences[:3]):
        tokens = sentence.leaves()
        for tree in parser.parse(tokens):
            print(tree)
        print(
            f"Time elapsed for sentence of length {len(tokens)}: {time.time() - start_time}"
        )
def test_PCFG(grammar, shapes=False):
    '''Test whether the grammar can parse a sentence.'''
    # sent = [i.replace("'", "") for i in TERMINALS[:5]]
    # sent = "in the middle center is a green square".split()
    if not shapes:
        sent = "2 2 2 12 2 12 2 2 12 2".split()
    else:
        sent = "in the middle center is a green square".split()
    sr = ViterbiParser(grammar)
    for t in sr.parse(sent):
        t.draw()
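# Self-contained illustration (not from the original code) of the NLTK API that
# test_PCFG relies on: build a toy PCFG with PCFG.fromstring, then let
# ViterbiParser yield the most probable tree for a tokenized sentence.
from nltk import PCFG
from nltk.parse import ViterbiParser

toy_grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> 'the' N [1.0]
    VP -> V NP [1.0]
    N -> 'square' [0.6] | 'dog' [0.4]
    V -> 'sees' [1.0]
""")
toy_parser = ViterbiParser(toy_grammar)
for tree in toy_parser.parse("the dog sees the square".split()):
    print(tree)            # bracketed parse tree
    print(tree.logprob())  # log probability of the parse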
def apply_adaptor_grammar(target, source, env):
    """Apply an existing adaptor grammar model to new data.

    One of py-cfg's outputs is essentially a PCFG: this builder formats it a
    bit, then loads it as an NLTK PCFG, which is then applied to the provided
    word list to get new segmentations.

    Note: the NLTK implementation is very slow; you may want to look into
    using one of Mark Johnson's other code bases, "cky.tbz", which is very
    fast and accepts a similar format to the py-cfg output.

    Sources: py-cfg grammar file, word list
    Targets: segmented word list
    """
    rules = {}
    nonterminals = set()
    with meta_open(source[0].rstr()) as ifd:
        for line in ifd:
            m = re.match(r"^(\S+)\s(\S+) --> (.*)$", line)
            if m:
                count = float(m.group(1))
                lhs = m.group(2)
                nonterminals.add(lhs)
                rhs = tuple(m.group(3).strip().split())
                rules[lhs] = rules.get(lhs, {})
                rules[lhs][rhs] = count
            else:
                m = re.match(r"^\((\S+)#\d+ (.*)$", line)
                lhs = m.group(1)
                rhs = tuple(re.sub(r"\(\S+", "", m.group(2)).replace(")", "").strip().split())
                rules[lhs] = rules.get(lhs, {})
                rules[lhs][rhs] = rules[lhs].get(rhs, 0) + 1
    productions = []
    for lhs, rhss in rules.items():
        total = sum(rhss.values())
        for rhs, c in rhss.items():
            mrhs = []
            for x in rhs:
                if x in nonterminals:
                    mrhs.append(Nonterminal(x))
                else:
                    mrhs.append(x)
            productions.append(ProbabilisticProduction(Nonterminal(lhs), mrhs, prob=(float(c) / total)))
    pcfg = PCFG(Nonterminal("Word"), productions)
    parser = ViterbiParser(pcfg)
    with meta_open(source[1].rstr()) as ifd:
        items = [l.strip().split() for l in ifd]
    with meta_open(target[0].rstr(), "w") as ofd:
        parsed = parser.parse_sents(items)
        for tree in [next(x) for x in parsed]:
            toks = [z for z in ["".join([chr(int(y, base=16)) for y in x.leaves() if y not in ["^^^", "$$$"]]) for x in tree] if z != ""]
            if len(toks) == 1:
                ofd.write("%s\n" % (toks[0]))
            else:
                ofd.write(" ".join(["%s+" % toks[0]] + ["+%s+" % x for x in toks[1:-1]] + ["+%s" % toks[-1]]) + "\n")
    return None
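# Illustration only (not from the original code): example lines in the two
# formats that the regular expressions above are written to accept. The first
# carries an explicit rule count; the second is a bracketed analysis whose
# rule uses are tallied.
example_lines = [
    "12.0 Word --> Syll Syll",           # r"^(\S+)\s(\S+) --> (.*)$"
    "(Word#3 (Syll 0041) (Syll 0042))",  # r"^\((\S+)#\d+ (.*)$"
]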
def run_parser(corpus):
    """Runs the parser on a corpus.

    @param corpus: list of lists with input tokens
    """
    for sentence in corpus:
        grammar = getGrammar(sentence)
        parser = Parser(grammar)
        sent = splitSentence(sentence)
        tree = parser.parse(sent)
        # tree.draw()
        # print(tree.pprint(margin=30))
        extractDepParse(tree, sentence)
def overgeneration_coverage(pcfg, L, num_samples):
    """Test the overgeneration coverage with num_samples random messages of
    message length L. Returns the percentage of successful parses.
    """
    parser = ViterbiParser(pcfg)
    parse_total = 0    # total number of messages we tried to parse
    parse_success = 0  # total number successfully parsed

    # Get the random messages
    vocabulary = get_terminals(pcfg)
    for i in range(0, num_samples):
        message = sample_message(L, vocabulary)
        parse_total += 1
        try:
            if parser.parse_one(message):
                parse_success += 1
        except ValueError:
            continue

    return parse_success / parse_total * 100
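# Hypothetical sketches (not from the original code) of the two helpers that
# overgeneration_coverage assumes: get_terminals gathers the terminal
# vocabulary from the grammar's lexical productions, and sample_message draws
# a random message of length L from that vocabulary.
import random


def get_terminals(pcfg):
    """Return the set of terminal symbols appearing on any production RHS."""
    terminals = set()
    for prod in pcfg.productions():
        for sym in prod.rhs():
            if isinstance(sym, str):
                terminals.add(sym)
    return terminals


def sample_message(L, vocabulary):
    """Sample L terminals uniformly at random, with replacement."""
    return [random.choice(sorted(vocabulary)) for _ in range(L)]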
def sanity_test():
    """Unit test to make sure this stuff is working.

    This function should NOT break.
    """
    from ..rubric_utils.load_params import (
        get_pcfg_params,
        get_pcfg_path,
        get_codeorg_data_root,
    )

    data_root = get_codeorg_data_root(1, 'raw')
    theta = get_pcfg_params(1, author='teacher', random=False)
    cfg_path = get_pcfg_path(1, author='teacher')

    data, counts = load_real_asts(data_root, 1, True)
    integer_domain = get_integer_domain(data)

    # CKY parser for PCFGs...
    pcfg = build_pcfg(cfg_path, theta, integer_domain, False)
    parser = ViterbiParser(pcfg)
    generator = parser.parse(['Move', '(', '50', ')'])
    tree = next(generator)
    # print(tree.logprob())
    print(tree)
def main():
    # train = treebank.fileids()[:190]
    test = treebank.fileids()[190:]  # held-out test files

    # original grammar
    # pcfg = induce_grammar(train)
    # pickle.dump(pcfg, open("grammar.pcfg", 'wb'))

    # load grammar
    # pcfg: PCFG = pickle.load(open("grammar.pcfg", 'rb'))

    # fill in missing words
    # missing_words = get_missing_words(pcfg, test)
    # pcfg_unk = fill_missing_words(pcfg, missing_words)
    # pickle.dump(pcfg_unk, open("grammar_unk.pcfg", 'wb'))

    # load unk grammar
    pcfg_unk: PCFG = pickle.load(open("grammar_unk.pcfg", 'rb'))

    # use unk grammar on test sentences
    parser = ViterbiParser(pcfg_unk)
    parse_treebank(parser, test)
def analyse_viterbi(pcfg, messages):
    """Infers the Viterbi parses of the fixed induction set, split induction
    set and evaluation set.

    Writes parses to a txt file, computes message likelihood, tree diversity
    and evaluation coverage, and writes these properties to a pickle file.
    Returns a dictionary of evaluation statistics.
    """
    # Get terminals
    prods_lexical = [
        prod for prod in pcfg.productions() if type(prod.rhs()[0]) == str
    ]
    terminals = set([prod.rhs()[0] for prod in prods_lexical])

    # Compute message likelihoods and tree depth
    parser = ViterbiParser(pcfg)
    message_count = len(messages)
    message_count_quarter = int(np.ceil(message_count / 4))
    lines_parse = []
    trees = []
    tree_depths = []
    logprobs = []
    failed_parses = []
    parsed_count_weighted = 0
    for i, sent in enumerate(messages):
        sent = list(sent)
        if all(sym in terminals for sym in sent):
            tree_list = list(parser.parse(sent))
            if len(tree_list) == 1:
                # if the message can be parsed, tree_list contains one tree
                tree = tree_list[0]
                parse = to_parse_string(tree)
                trees.append(parse)
                tree_depths.append(tree_depth(tree))
                # convert natural logarithm from tree to log base 2
                # for description length
                logprobs.append(tree.logprob() / np.log(2))
            else:
                parse = "NO_PARSE"
                logprobs.append(None)
                tree_depths.append(None)
                failed_parses.append(sent)
        else:
            parse = "NO_PARSE"
            logprobs.append(None)
            tree_depths.append(None)
            failed_parses.append(sent)

    # Compute final statistics
    parsed_count = len(ignore_none(logprobs))
    unparsed_count = message_count - parsed_count

    # Collect evaluation information (of unique messages)
    eval_stats = {
        'log2likelihoods': logprobs,  # corresponds to {data: frequencies}
        'unparsed_count': unparsed_count,
        'parsed_count': parsed_count,
        'failedparses': failed_parses,
    }

    # Evaluation coverage
    coverage = parsed_count / len(messages)
    eval_stats['coverage'] = coverage * 100
    eval_stats['average_log2likelihood'] = mean(logprobs) or float('nan')

    return eval_stats
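# Hypothetical sketches (not from the original code) of three small helpers
# that analyse_viterbi assumes: tree depth as NLTK tree height, the parse as a
# single-line bracketed string, and ignore_none filtering out the None
# placeholders recorded for failed parses.
def tree_depth(tree):
    return tree.height()


def to_parse_string(tree):
    return tree.pformat(margin=float('inf'))


def ignore_none(values):
    return [v for v in values if v is not None]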
def pcfg_learn(treebank, i):
    # NOTE: the original snippet starts mid-function; the signature and the
    # `productions` initializer are reconstructed from the call below.
    productions = []
    for tree in treebank.parsed_sents()[:i + 1]:
        chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1,
                            childChar='|', parentChar='^')
        prod_gen = tree_to_productions(tree)
        tree_to_append = next(prod_gen)
        while tree_to_append:
            productions.append(tree_to_append)
            try:
                tree_to_append = next(prod_gen)
            except Exception:
                tree_to_append = False
    productions = get_productions(productions)
    return PCFG(Nonterminal('S'), productions)


pcfg_training = pcfg_learn(treebank, 400)
parser = ViterbiParser(pcfg_training)


def get_list_of_labelled_constituents(parse_tree, lst=None, first_index=None, last_index=None):
    if lst is None:
        return get_list_of_labelled_constituents(parse_tree, list(), 0,
                                                 len(parse_tree.leaves()) - 1)
    if not len(lst):
        lst = [(parse_tree.label(), first_index, last_index)]
    if len(list(parse_tree.subtrees())) == 1:
        return [(parse_tree.label(), first_index, last_index)]
    else:
        for child in parse_tree:
            labelled_constituents = (child.label(), first_index,
                                     first_index + len(child.leaves()) - 1)
            get_list_of_labelled_constituents(child, lst, first_index,
                                              first_index + len(child.leaves()) - 1)
            first_index += len(child.leaves())
            last_index += len(child.leaves())
def main(test=False):
    """Makes a big dumb PTB CFG, a ShiftReduceParser, and a ViterbiParser,
    and serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0  # ARBITRARY
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):
        productions = []
        add_dict = {}

        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):

            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:

                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:
                    # remove all tags from the Nonterminals on the RHS
                    if type(item) == nltk.grammar.Nonterminal:
                        rhs.append(preprocess_nt(item))
                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')
                    # items not in dictionary replaced with <unk>
                    # dictionary requires lower
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')
                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')
                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())
                production._rhs = tuple(rhs)

                if not is_key(add_dict, repr(production)):
                    add_dict[repr(production)] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            f.write(pickle.dumps(grammar))

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time
        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(viterbi_parser))

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time
        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(shift_reduce_parser))

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        for sample in [1, 23, 20330, 20332, 443]:
            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))
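# Hypothetical sketches (not from the original code) of the small helpers that
# main() assumes: preprocess_nt keeps only the primary tag of a Nonterminal,
# is_number tests for numeric tokens, and is_key is a plain membership test.
from nltk import Nonterminal


def preprocess_nt(nt):
    # keep the tag before any '-', '|' or '^' annotations
    symbol = str(nt).split('-')[0].split('|')[0].split('^')[0]
    return Nonterminal(symbol)


def is_number(token):
    try:
        float(token.replace(',', ''))
        return True
    except ValueError:
        return False


def is_key(dictionary, key):
    return key in dictionary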
import pickle
import time


# Benchmarks the speed of the Viterbi parser
def parse(parser: ViterbiParser, sentence):
    start_time = time.time()
    parser.trace(trace=1)
    for tree in parser.parse(sentence):
        print(tree)
    print(
        f"Time elapsed for sentence of length {len(sentence)}: {time.time() - start_time}"
    )


# load unk grammar
pcfg_unk = pickle.load(open("grammar_unk.pcfg", 'rb'))

# use unk grammar on test sentences
parser = ViterbiParser(pcfg_unk)

# one sentence
test = treebank.fileids()[190:]
first_sentence = None
for sentence in treebank.parsed_sents(test):
    first_sentence = sentence.leaves()
    break

for i in range(1, 7):
    parse(parser, first_sentence[:i])
def parse(parser: ViterbiParser, sentence):
    for tree in parser.parse(sentence):
        yield tree
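# Example use of the generator-style wrapper above (assumes a `parser` and a
# tokenized `sentence` are already defined, e.g. as in the benchmark snippet):
# take only the single best tree without materializing the rest.
best_tree = next(parse(parser, sentence), None)
if best_tree is not None:
    print(best_tree)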