def get_productions(treebank):
    """
    Collect the CNF production rules found in a treebank.

    Every tree is parsed, has its unary chains collapsed and is binarised
    into Chomsky normal form; the productions of the resulting tree are then
    grouped by left-hand side.

    :param treebank: list where each entry is a str holding one bracketed
        tree from the input treebank file
    :return: tuple ``(rules, start)``. ``rules`` maps each LHS string to a
        list of RHS strings (the text after ``" -> "`` in the printed
        production); duplicates are deliberately kept so that rule
        frequencies can be counted later. ``start`` is the start symbol of
        the grammar.
    """
    # The start symbol is the root label of the first tree: first
    # whitespace-separated token, minus its leading "(".
    first_token = treebank[0].split()[0]
    start = first_token[1:]

    rules = {}
    for bracketed in treebank:
        parsed = tree.Tree.fromstring(bracketed, remove_empty_top_bracketing=True)

        # Work on copies so each transformation stage has its own tree.
        collapsed = deepcopy(parsed)
        treetransforms.collapse_unary(collapsed)
        binarised = deepcopy(collapsed)
        treetransforms.chomsky_normal_form(binarised)

        # Group the printed productions by left-hand side, keeping repeats.
        for production in binarised.productions():
            sides = str(production).split(" -> ")
            rules.setdefault(sides[0], []).append(sides[1])

    return rules, start
def get_grammar(cls, train_trees, starting_symb='SENT'):
    """
    Return the grammar computed from the training set.

    Each training tree has its unary chains collapsed and is binarised
    (horizontal Markov order 2); the productions of all trees are pooled and
    a PCFG is induced from them.

    Inputs:
    -------
    train_trees (list): List of trees used for training. The trees are
                        transformed in place by the NLTK tree transforms.
    starting_symb (str): The root symbol of the grammar
    """
    pooled_productions = []
    for training_tree in train_trees:
        # Remove unary rules, then binarise the tree.
        treetransforms.collapse_unary(training_tree)
        treetransforms.chomsky_normal_form(training_tree, horzMarkov=2)
        pooled_productions.extend(training_tree.productions())

    root = Nonterminal(starting_symb)
    pcfg = induce_pcfg(root, pooled_productions)

    # NOTE(review): the return value of this call is discarded. In current
    # NLTK releases CFG.chomsky_normal_form() appears to return a new grammar
    # instead of modifying the receiver -- confirm whether this call is
    # still needed.
    pcfg.chomsky_normal_form()
    return pcfg
def preprocess(y):
    """
    Normalise the tree ``y`` and relabel it.

    The tree is passed through NLTK's unary collapsing (with
    ``collapsePOS=True``) and Chomsky-normal-form binarisation (horizontal
    Markov order 2, one level of parent annotation), then handed to
    ``traverse_tree`` and ``change_labels``. Finally the root label is
    overwritten with ``4``. Nothing is returned; the function is called for
    its effect on ``y``.
    """
    treetransforms.collapse_unary(y, collapsePOS=True)
    treetransforms.chomsky_normal_form(y, horzMarkov=2, vertMarkov=1)
    # NOTE(review): traverse_tree is defined elsewhere; what it computes is
    # not visible from here.
    traverse_tree(y)
    # we got the modified tree so we need to calculate the scores
    change_labels(y)
    # NOTE(review): the meaning of the constant 4 is not visible from here --
    # confirm against the label scheme used by change_labels.
    y._label = 4
def trained_pcfg():
    """
    Return a PCFG, loading it from ``pcfgcache.pkl`` when that file exists.

    If the cache file is missing, the grammar is induced from the parsed
    sentences of the Penn Treebank WSJ corpus (each tree is unary-collapsed
    and binarised first), written to ``pcfgcache.pkl`` and returned.
    Progress messages are printed along the way.
    """
    try:
        with open("pcfgcache.pkl", 'rb') as cache_file:
            print("Loading the PCFG...")
            # NOTE: unpickling runs arbitrary code; the cache file must be
            # one this program wrote itself.
            cached_grammar = pickle.load(cache_file)
            print("Loaded!")
            return cached_grammar
    except FileNotFoundError:
        print("Training the PCFG...")

        # Penn Treebank v3: WSJ
        ptb = LazyCorpusLoader(
            'ptb',
            CategorizedBracketParseCorpusReader,
            r'wsj/\d\d/wsj_\d\d\d\d.mrg',
            cat_file='allcats.txt',
            tagset='wsj',
        )

        # Default to the sample treebank; the full corpus replaces it below.
        corpus = treebank
        # Search for nltk_data/corpora/ptb and place all the wsj/XX/*.mrg
        # files in it to use the full treebank.
        useFullTreeBank = True
        if useFullTreeBank:
            corpus = ptb

        productions = []
        for count, sentence_tree in enumerate(corpus.parsed_sents()):
            # check progress of training
            if count % 200 == 0:
                print(count)
            collapse_unary(sentence_tree, True)
            chomsky_normal_form(sentence_tree)
            productions.extend(sentence_tree.productions())

        induced = grammar.induce_pcfg(grammar.Nonterminal('S'), productions)
        print("Trained!")

        print("Writing the PCFG...")
        with open("pcfgcache.pkl", 'wb') as cache_out:
            pickle.dump(induced, cache_out, pickle.HIGHEST_PROTOCOL)
        print("Write successful!")
        return induced
def chomsky_normal_form(self):
    """
    Replace ``self.parsed_sentences`` with Chomsky-normal-form copies.

    Each sentence tree is deep-copied, has its unary chains collapsed and is
    binarised. The stored originals are never modified. Conversion is
    best-effort: a sentence whose conversion raises is left out of the
    result, but the failure is logged (with traceback) instead of being
    dropped silently, and a summary count is logged at the end.

    Returns nothing; the result is stored on ``self.parsed_sentences``.
    """
    import logging

    logger = logging.getLogger(__name__)

    converted = []
    skipped = 0
    for index, parsed_sentence in enumerate(self.parsed_sentences):
        try:
            # One private copy is enough: both transforms work in place.
            cnf_tree = deepcopy(parsed_sentence)
            treetransforms.collapse_unary(cnf_tree)
            treetransforms.chomsky_normal_form(cnf_tree)
        except Exception:
            # Keep the original best-effort behaviour, but leave a trace.
            skipped += 1
            logger.warning(
                "Skipping sentence %d: conversion to CNF failed",
                index,
                exc_info=True,
            )
        else:
            converted.append(cnf_tree)

    if skipped:
        logger.warning(
            "%d sentence(s) dropped during CNF conversion", skipped
        )
    self.parsed_sentences = converted
def demo():
    """
    A demonstration showing how each tree transform can be used.

    Parses one WSJ sentence, applies unary collapsing, CNF conversion (with
    and without parent annotation / horizontal smoothing), converts the
    annotated tree back, prints the original and round-tripped bracketings
    and finally draws all intermediate trees.
    """
    from copy import deepcopy

    from nltk import tree, treetransforms
    from nltk.draw.tree import draw_trees

    # original tree from WSJ bracketed text
    sentence = """(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"""
    t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    treetransforms.collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and
    # horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results
    # comparable)
    original = deepcopy(parentTree)
    treetransforms.un_chomsky_normal_form(original)

    # convert tree back to bracketed text; pformat() returns the string,
    # whereas pprint() prints it and returns None
    sentence2 = original.pformat()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
def demo():
    """
    A demonstration showing how each tree transform can be used.

    Parses one WSJ sentence, applies unary collapsing, CNF conversion (with
    and without parent annotation / horizontal smoothing), converts the
    annotated tree back, prints the original and round-tripped bracketings
    and finally draws all intermediate trees.
    """
    from nltk.draw.tree import draw_trees
    from nltk import tree, treetransforms
    from copy import deepcopy

    # original tree from WSJ bracketed text
    sentence = """(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"""
    # Tree.parse was replaced by Tree.fromstring in NLTK 3
    t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    treetransforms.collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and
    # horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results
    # comparable)
    original = deepcopy(parentTree)
    treetransforms.un_chomsky_normal_form(original)

    # convert tree back to bracketed text; pformat() returns the string,
    # whereas pprint() prints it and returns None
    sentence2 = original.pformat()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
def get_trees(cls, path_to_dataset, train_split=0.8):
    """
    Split a bracketed corpus into train, evaluation and test trees.

    The corpus is read one sentence per line, functional labels are stripped
    and the sentences are split in file order (no shuffling): the first
    ``train_split`` fraction is used for training and the remainder is
    halved into an evaluation part followed by a test part. Evaluation and
    test trees are unary-collapsed and binarised (horizontal Markov order
    2); training trees are returned as parsed.

    Inputs:
    -------
    path_to_dataset (str): The path to the corpus to be split
    train_split (float): Proportion of sentences used for training

    Returns:
    --------
    tuple: ``(train_trees, eval_trees, test_trees)`` -- note the order.
    """
    sentences = []

    print('Collecting training, test and evaluation trees')

    with open(path_to_dataset) as f:
        for sentence in f:
            # Removes functional labels
            sent = re.sub(r'-\w+\ ', " ", sentence)
            sentences.append(sent.rstrip())

    # Split Train / (Test + Eval)
    train_sent, test_sent = train_test_split(
        sentences,
        train_size=train_split,
        test_size=1 - train_split,
        shuffle=False,
    )
    # Split Test / Eval
    eval_sent, test_sent = train_test_split(test_sent, train_size=0.5, shuffle=False)

    print(f'The total number of sentences {len(sentences)}')
    print(f'Number of train sentences {len(train_sent)} -- {round(100 * len(train_sent) / len(sentences), 1)} %')
    print(f'Number of test sentences {len(test_sent)} -- {round(100 * len(test_sent) / len(sentences), 1) } %')
    print(f'Number of evaluation sentences {len(eval_sent)} -- {round(100 * len(eval_sent) / len(sentences), 1)} %')

    train_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in train_sent]
    test_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in test_sent]
    eval_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in eval_sent]

    # Normalise every held-out tree. Iterating over both lists in full (not
    # zip) matters: zip stops at the shorter list, which left the last tree
    # of the longer split untransformed whenever the two splits differed in
    # length.
    for held_out_tree in test_trees + eval_trees:
        # Remove unary rules
        treetransforms.collapse_unary(held_out_tree)
        # Transform to CNF
        treetransforms.chomsky_normal_form(held_out_tree, horzMarkov=2)

    return train_trees, eval_trees, test_trees
def transform_sst_to_acd_trees(sents, tag='train', filter_len=20):
    """
    Convert sentences into binarised agglomeration trees, serialised as strings.

    For every sentence of at most ``filter_len`` items, the module-level
    ``model`` predicts a class, ``agg.agglomerate`` builds the hierarchy for
    that prediction, and the resulting tree is unary-collapsed,
    left-factored into CNF and serialised with ``tree_to_str``. Longer
    sentences are skipped. Progress is printed for every 10th input index,
    and the results so far are checkpointed to ``tmp.json`` whenever a
    processed sentence has an index divisible by 100.

    :param sents: sequence of sentences; each is presumably a list of word
        strings (every element is lower-cased) -- confirm against the caller
    :param tag: label used only in the progress message
    :param filter_len: sentences longer than this are skipped
    :return: list with one ``tree_to_str`` result per processed sentence
    """
    # base parameters
    sweep_dim = 1  # how large chunks of text should be considered (1 for words)
    method = 'cd'  # build_up, break_down, cd
    percentile_include = 99.5  # keep this very high so we don't add too many words at once
    num_iters = 25  # maximum number of iterations (rarely reached)

    new_data = []
    started_at = time.time()
    for s_i, sent in enumerate(sents):
        if s_i % 10 == 0:
            print("phase {} time {} processed {}".format(
                tag, time.time() - started_at, s_i))
        if len(sent) > filter_len:
            continue

        # prepare inputs
        sent = [w.lower() for w in sent]
        batch = batch_from_str_list(sent)
        scores_all = model(batch).data.numpy()[0]  # predict
        label_pred = np.argmax(scores_all)  # get predicted class

        # agglomerate
        lists = agg.agglomerate(
            model, batch, percentile_include, method, sweep_dim,  # only works for sweep_dim = 1
            sent, label_pred, num_iters=num_iters
        )  # see agg_1d.agglomerate to understand what this dictionary contains
        lists = agg.collapse_tree(lists)  # don't show redundant joins

        # gather tree
        children = comp_to_tree(lists, sent)

        # unary-collapse the tree, then binarize it (on a copy of the root)
        binarized = deepcopy(children[0])
        treetransforms.collapse_unary(binarized)
        chomsky_normal_form(binarized, factor='left')
        new_data.append(tree_to_str(binarized))

        if s_i % 100 == 0:
            # Close the checkpoint file deterministically so the data is
            # flushed and the handle is not leaked.
            with open('tmp.json', 'w') as checkpoint:
                json.dump(new_data, checkpoint)
    return new_data