def preprocess(y):
    """Normalize a parse tree in place for model consumption.

    Collapses unary chains (including POS nodes), binarizes to CNF with
    horizontal Markovization of order 2 and one level of parent annotation,
    then recomputes scores and rewrites labels via the module helpers.

    :param y: an nltk.Tree; mutated in place, nothing is returned.
    """
    treetransforms.collapse_unary(y, collapsePOS=True)
    treetransforms.chomsky_normal_form(y, horzMarkov=2, vertMarkov=1)
    traverse_tree(y)  # we got the modified tree so we need to calculate the scores
    change_labels(y)
    # NOTE(review): root label is forced to 4 — presumably a class id
    # (e.g. most-positive sentiment); confirm against the label scheme.
    y._label = 4
def tree2matrix(tree, cnf=True):
    """Convert a parse tree into a square label matrix over its leaves.

    Row i describes the constituents whose leftmost leaf is leaf i: for each
    ancestor reached from leaf i only through leftmost children, that
    ancestor's label is written in the column of its constituent's last
    leaf. All other cells hold '<pad>'.

    :param tree: an nltk.Tree.
    :param cnf: when True, operate on a CNF-binarized copy of the tree.
    :return: (leaves, matrix, node_label, leaf_label) where node_label /
        leaf_label are the sets of suffixed labels seen at internal nodes
        and pre-terminals respectively.
    """
    # Fixed: removed the pointless add('<pad>') followed by remove('<pad>')
    # on both label sets — a literal no-op in the original.
    cnf_tree = deepcopy(tree)
    if cnf:
        treetransforms.chomsky_normal_form(cnf_tree)

    node_label = set()
    leaf_label = set()
    leaves = cnf_tree.leaves()
    leaves_position = [cnf_tree.leaf_treeposition(i) for i in range(len(leaves))]

    matrix = []
    for i, leaf_i in enumerate(leaves_position):
        row = ['<pad>'] * len(leaves)
        # Walk up from the leaf; the ancestor at depth k has this leaf as
        # its leftmost leaf exactly when the remaining path is all zeros.
        for k in range(len(leaf_i) - 1, -1, -1):
            if set(leaf_i[k:]) == {0}:
                tree_at_k = cnf_tree[leaf_i[:k]]
                label_k = tree_at_k.label()
                if k == len(leaf_i) - 1:
                    leaf_label.add(label_k + "_leaf_label")
                else:
                    node_label.add(label_k + "_node_label")
                # Mark the span end: column of the constituent's last leaf.
                row[i + len(tree_at_k.leaves()) - 1] = label_k
        matrix.append(row)

    return leaves, matrix, node_label, leaf_label
def get_productions(treebank):
    """
    Returns CNF rules of grammar as derived from the input treebank.

    Rules are stored in a dictionary with LHS:[RHS, RHS, RHS...] format and
    the [RHS] list DOES include duplicates for future probability
    calculations. Also returns the start symbol of the grammar.

    :param treebank: list where each entry is a str value representing a
        tree from the input treebank file
    :return: (rules, start) — the LHS -> [RHS, ...] dict and the start symbol.
    """
    # grab the top of the tree/start symbol for the grammar:
    # use first tree treebank[0], split on whitespace, use root node [0], omit "(" [1:]
    start = treebank[0].split()[0][1:]
    rules = {}
    for sentence in treebank:
        # convert the tree to CNF using NLTK (unary collapse, then binarize)
        t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)
        collapsed_tree = deepcopy(t)
        treetransforms.collapse_unary(collapsed_tree)
        cnf_tree = deepcopy(collapsed_tree)
        treetransforms.chomsky_normal_form(cnf_tree)
        # collect LHS -> RHS pairs, keeping duplicates for later counts;
        # setdefault avoids the original's double dict lookup.
        for raw_rule in cnf_tree.productions():
            lhs, rhs = str(raw_rule).split(" -> ", 1)
            rules.setdefault(lhs, []).append(rhs)
    # rules now contains LHS:[RHS, RHS, RHS...] pairs
    return rules, start
def check_binary(tree_string):
    """Assert that the tree encoded in *tree_string* is already binary:
    applying the CNF transform must leave it unchanged (up to whitespace)."""
    parsed = Tree.fromstring(tree_string)
    binarized = deepcopy(parsed)
    treetransforms.chomsky_normal_form(binarized)
    # Normalize whitespace so formatting differences don't matter.
    before = ' '.join(str(parsed).split())
    after = ' '.join(str(binarized).split())
    assert before == after, f'Different: {before} ####### {after}'
def get_grammar(cls, train_trees, starting_symb='SENT'):
    """
    Return a PCFG induced from the training trees.

    Note: the input trees are transformed (unary collapse + CNF) in place.

    Inputs:
    -------
    train_trees (list): List of trees used for training
    starting_symb (str): The root symbol of the induced grammar
    """
    productions = []
    # Chomsky Normal Form
    for tree in train_trees:
        # Remove unary rules
        treetransforms.collapse_unary(tree)
        # Transform to CNF (horizontal Markovization of order 2)
        treetransforms.chomsky_normal_form(tree, horzMarkov=2)
        # Collect the tree's productions
        productions += tree.productions()
    # Define the root symbol
    SENT = Nonterminal(starting_symb)
    # Compute the grammar using PCFG induction
    grammar = induce_pcfg(SENT, productions)
    # NOTE(review): in NLTK, CFG.chomsky_normal_form() returns a NEW grammar;
    # the return value is discarded here — confirm whether the induced
    # grammar was meant to be replaced.
    grammar.chomsky_normal_form()
    return grammar
def trained_pcfg():
    """Return a PCFG trained on the WSJ treebank, cached in pcfgcache.pkl.

    Loads the pickled grammar when the cache file exists; otherwise trains
    one from the (full or sample) treebank, writes the cache, and returns it.

    :return: the induced PCFG grammar object.
    """
    try:
        # Fast path: load a previously trained grammar from the cache.
        # Renamed handle from `input` — it shadowed the builtin.
        with open("pcfgcache.pkl", 'rb') as cache_in:
            print("Loading the PCFG...")
            gram = pickle.load(cache_in)
            print("Loaded!")
            return gram
    except FileNotFoundError:
        print("Training the PCFG...")
        ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ
            'ptb', CategorizedBracketParseCorpusReader,
            r'wsj/\d\d/wsj_\d\d\d\d.mrg',
            cat_file='allcats.txt', tagset='wsj')
        productions = []
        tb = treebank
        # Search for nltk_data/corpora/ptb and place all the wsj/XX/*.mrg files in
        useFullTreeBank = True
        n = 0  # progress counter for training
        if useFullTreeBank:
            tb = ptb
        for t in tb.parsed_sents():
            if n % 200 == 0:
                print(n)
            collapse_unary(t, True)
            chomsky_normal_form(t)
            n += 1
            for p in t.productions():
                productions.append(p)
        gram = grammar.induce_pcfg(grammar.Nonterminal('S'), productions)
        print("Trained!")
        print("Writing the PCFG...")
        # Renamed handle from `output` — it shadowed the builtin name.
        with open("pcfgcache.pkl", 'wb') as cache_out:
            pickle.dump(gram, cache_out, -1)
        print("Write successful!")
        return gram
def prepapre_sentiment_data():
    """Parse the rt-polarity sentences, binarize the parses, and pickle
    (trees, labels) to hynguyen.pickle — label 0 for negative sentences,
    1 for positive ones.

    Sentences whose parse fails are skipped (deliberate best effort).
    """
    with open("../data/rt-polarity.neg.txt", mode="r") as f:
        neg_sent = f.readlines()
    with open("../data/rt-polarity.pos.txt", mode="r") as f:
        pos_sent = f.readlines()
    sents_label = [0] * len(neg_sent) + [1] * len(pos_sent)
    trees = []
    labels = []
    count = 0
    for sent, label in zip(neg_sent + pos_sent, sents_label):
        try:
            count += 1
            if count % 200 == 0:
                print(count)  # progress
            a = list(parser.raw_parse(sent))
            hytree = a[0]
            chomsky_normal_form(hytree)
            trees.append(hytree[0])
            labels.append(label)
        # Fixed: was a bare `except:` — that also swallowed SystemExit and
        # KeyboardInterrupt. Keep the best-effort skip, but only for
        # ordinary exceptions (parse failures etc.).
        except Exception:
            continue
    with open("hynguyen.pickle", mode="wb") as f:
        pickle.dump((trees, labels), f)
def generate_data_v2(tree, cnf=True):
    """Flatten a parse tree (breadth-first) into parallel model inputs.

    :param tree: an nltk.Tree.
    :param cnf: when True, binarize a copy of the tree and pad its leaves
        before traversal; otherwise traverse a plain copy.
    :return: (input_node, input_label, input_index, bf_tree, bf_lst_tree)
        where input_index[i] is the [first, last] leaf span of node i.
    :raises RecursionError: if the tree is too deep to copy.
    :raises IndexError: if a pre-terminal has no leaf (malformed tree) —
        previously caught and re-raised around a block of commented-out
        debug prints, which has been removed; propagation is unchanged.
    """
    if cnf:
        cnfTree = deepcopy(tree)
        treetransforms.chomsky_normal_form(cnfTree)
        try:
            pad_cnf_tree = deepcopy(cnfTree)
        except RecursionError as e:
            print(f'Error copy tree')
            raise e
        padding_leaves(pad_cnf_tree)
    else:
        pad_cnf_tree = deepcopy(tree)

    bf_tree, bf_lst_tree, bf_meta = bft(pad_cnf_tree)
    input_node = []
    input_label = []
    input_index = []
    leaves_location = [
        pad_cnf_tree.leaf_treeposition(i)
        for i in range(len(pad_cnf_tree.leaves()))
    ]
    for i in range(len(bf_lst_tree)):
        if len(bf_tree[i].leaves()) > 1:
            # Internal node: '|' marks an artificial node introduced by the
            # CNF binarization. Both branches contribute '<pad>' as label.
            if '|' in bf_tree[i].label():
                input_node.append("SPLIT_NODE_node_label")
            else:
                input_node.append(bf_tree[i].label() + "_node_label")
            input_label.append('<pad>')
        else:
            # Pre-terminal: record its label and the (prefix-stripped) token.
            input_label.append(bf_tree[i].label() + "_leaf_label")
            input_node.append(bf_tree[i].leaves()[0][7:])
        # Leaf span of this node, as indices into the padded tree's leaves.
        first_leaf = deepcopy(bf_lst_tree[i])
        first_leaf.extend(bf_tree[i].leaf_treeposition(0))
        first_leaf = leaves_location.index(tuple(first_leaf))
        last_leaf = first_leaf + len(bf_tree[i].leaves()) - 1
        input_index.append([first_leaf, last_leaf])
    return input_node, input_label, input_index, bf_tree, bf_lst_tree
def get_test_dataset(treebank, n):
    """Collect CNF-transformed trees from the treebank as a test set.

    NOTE(review): the nested loop re-reads parsed_sents()[:i+1] for every
    outer index i, so the same leading trees are CNF-transformed and
    appended repeatedly — the result contains duplicates and the 400 offset
    never selects trees 400..n+399. Confirm whether
    parsed_sents()[400:n+400] was the intent.

    :param treebank: an NLTK corpus reader with parsed_sents().
    :param n: number of outer iterations (i runs over 400..n+399).
    :return: the first 50 collected trees, or all of them if fewer.
    """
    test_trees = list()
    for i in range(400, n+400):
        for tree in treebank.parsed_sents()[:i+1]:
            # Right-factored CNF with Markovization, in place.
            chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
            test_trees.append(tree)
            if len(test_trees) == 50:
                return test_trees
    return test_trees
def chomsky_normal_form(self):
    """Replace self.parsed_sentences with unary-collapsed, CNF-binarized
    copies of each sentence tree.

    Sentences whose transform raises are silently dropped (deliberate
    best-effort behavior, preserved from the original).
    """
    # Fixed local typo: 'chomsky_parsed_senteces' -> 'chomsky_parsed_sentences'.
    chomsky_parsed_sentences = []
    for parsed_sentence in self.parsed_sentences:
        try:
            collapsed = deepcopy(parsed_sentence)
            treetransforms.collapse_unary(collapsed)
            cnf_tree = deepcopy(collapsed)
            treetransforms.chomsky_normal_form(cnf_tree)
            chomsky_parsed_sentences.append(cnf_tree)
        except Exception:
            # Skip trees the transform cannot handle.
            pass
    self.parsed_sentences = chomsky_parsed_sentences
def demo():
    """
    Walk each tree transform over a WSJ example: unary collapse, plain CNF,
    markovized CNF with parent annotation, and back-conversion — then show
    that the round trip reproduces the original bracketing.
    """
    from copy import deepcopy

    from nltk import tree, treetransforms
    from nltk.draw.tree import draw_trees

    # original tree from WSJ bracketed text
    sentence = """(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"""
    wsj_tree = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsed = deepcopy(wsj_tree)
    treetransforms.collapse_unary(collapsed)

    # convert the tree to CNF
    cnf = deepcopy(collapsed)
    treetransforms.chomsky_normal_form(cnf)

    # CNF with parent annotation (one level) and horizontal smoothing of order two
    markovized = deepcopy(collapsed)
    treetransforms.chomsky_normal_form(markovized, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results comparable)
    restored = deepcopy(markovized)
    treetransforms.un_chomsky_normal_form(restored)

    # convert tree back to bracketed text and compare
    sentence2 = restored.pprint()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(wsj_tree, collapsed, cnf, markovized, restored)
def demo():
    """
    A demonstration showing how each tree transform can be used.

    Modernized: Python 2 print statements converted to print() calls (the
    rest of the file is Python 3), and the removed Tree.parse API replaced
    with Tree.fromstring.
    """
    from nltk.draw.tree import draw_trees
    from nltk import tree, treetransforms
    from copy import deepcopy

    # original tree from WSJ bracketed text
    sentence = """(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"""
    # Tree.parse was removed from NLTK; fromstring is the current API.
    t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    treetransforms.collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results comparable)
    original = deepcopy(parentTree)
    treetransforms.un_chomsky_normal_form(original)

    # convert tree back to bracketed text
    sentence2 = original.pprint()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
def pcfg_learn(treebank, n):
    """Induce a PCFG (start symbol S) from the leading treebank trees.

    NOTE(review): the nested loop re-processes parsed_sents()[:i+1] for
    every i, so early trees are transformed and counted repeatedly —
    confirm this weighting is intended.

    :param treebank: an NLTK corpus reader with parsed_sents().
    :param n: number of outer iterations.
    :return: an nltk PCFG.
    """
    productions = list()
    for i in range(n):
        for tree in treebank.parsed_sents()[:i+1]:
            chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1,
                                childChar='|', parentChar='^')
            # Fixed: the original drove the generator with next() — the
            # first next() sat outside the try, so an empty generator raised
            # an unhandled StopIteration, and the `while tree_to_append:`
            # loop would stop at any falsy production. extend() consumes
            # the generator safely and completely.
            productions.extend(tree_to_productions(tree))
    productions = get_productions(productions)
    return PCFG(Nonterminal('S'), productions)
def get_trees(cls, path_to_dataset, train_split=0.8):
    """
    Return the train, test and eval sets as lists of trees.

    Inputs:
    -------
    path_to_dataset (str): The path to the corpus to be split
    train_split (float): Proportion of training data

    Returns:
    --------
    (train_trees, eval_trees, test_trees): the train trees untouched,
    the test/eval trees unary-collapsed and CNF-binarized in place.
    """
    sentences = []
    print('Collecting training, test and evaluation trees')
    with open(path_to_dataset) as f:
        for sentence in f:
            # Removes functional labels
            sent = re.sub(r'-\w+\ ', " ", sentence)
            sentences.append(sent.rstrip())
    # Split Train / (Test + Eval)
    train_sent, test_sent = train_test_split(sentences, train_size=train_split,
                                             test_size=1 - train_split, shuffle=False)
    # Split Test / Eval
    eval_sent, test_sent = train_test_split(test_sent, train_size=0.5, shuffle=False)
    print(f'The total number of sentences {len(sentences)}')
    print(f'Number of train sentences {len(train_sent)} -- {round(100 * len(train_sent) / len(sentences), 1)} %')
    print(f'Number of test sentences {len(test_sent)} -- {round(100 * len(test_sent) / len(sentences), 1) } %')
    print(f'Number of evaluation sentences {len(eval_sent)} -- {round(100 * len(eval_sent) / len(sentences), 1)} %')
    train_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in train_sent]
    test_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in test_sent]
    eval_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in eval_sent]
    # Fixed: zip(test_trees, eval_trees) truncated to the shorter list, so
    # when the two splits differed in size the last tree of the longer one
    # was never transformed. Transform each list in full instead.
    for trees in (test_trees, eval_trees):
        for t in trees:
            # Remove unary rules
            treetransforms.collapse_unary(t)
            # Transform to CNF
            treetransforms.chomsky_normal_form(t, horzMarkov=2)
    return train_trees, eval_trees, test_trees
def extract_information(ori_bin_line):
    """Parse the sentence from a binary-sentiment tree line with CoreNLP,
    binarize and clean the constituency parse, and return a record with the
    label, raw sentence, and the encoded tree.

    :param ori_bin_line: bracketed sentiment-tree string (e.g. an SST line).
    :return: dict with keys 'label', 'sentence', 'constituency_tree_encoding'.
    :raises Exception: re-raised from get_tree_encodings after printing
        diagnostics for the failing line.
    """
    # Fixed: the body referenced an undefined global `line`; bind it to the
    # parameter so the function actually uses its input.
    line = ori_bin_line
    label = get_label(line)
    sentence = ' '.join(Tree.fromstring(line).flatten())
    parsed = list(get_corenlp_parser().parse_text(sentence))[0]
    cnfTree = deepcopy(parsed)
    treetransforms.chomsky_normal_form(cnfTree)
    pad_cnf_tree = deepcopy(cnfTree)
    padding_leaves(pad_cnf_tree)
    binary_tree = clean_node(pad_cnf_tree)  # pad_cnf_tree
    tree = remove_atnodeset_single_nodeset(binary_tree)
    tree = remove_leaf_label(tree)
    tree_string = ' '.join(str(tree).split())
    # Strip node labels (which may look like NP|<JJ-NN>) down to bare '('.
    tree_string_replace = re.sub(r'\([\w<>\|\.:\'`$,-]+\s', ' ( ', tree_string)
    try:
        tree_enc = get_tree_encodings(tree_string_replace)
    except Exception as e:
        # Dump diagnostics for the failing line before re-raising.
        tree.pretty_print()
        print(f'get_tree_encodings for line')
        print(line)
        print(tree_string)
        print(tree_string_replace)
        print()
        tree_from_string(line).pretty_print()
        sentiment_tree_enc = get_tree_encodings(line)
        print(sentiment_tree_enc)
        raise e
    obj = {
        "label": label,
        "sentence": sentence,
        "constituency_tree_encoding": tree_enc
    }
    return obj
def generate_all_productions(records, productions_file):
    """Extract markovized CNF productions from every bracketed tree in
    *records* and write them, preceded by a %start header, to
    *productions_file*.

    :param records: iterable of bracketed-tree strings; entries of length
        <= 2 are skipped.
    :param productions_file: path of the grammar file to write.
    """
    productions = []
    # NOTE(review): if no record is longer than 2 chars, `tree` stays a list
    # and tree.label() below fails — preserved from the original.
    tree = []
    for line in records:
        if len(line) > 2:
            tree = Tree.fromstring(line)  # generate tree from string
            parent_tree = deepcopy(tree)
            # parent annotation (vertMarkov=3) and horizontal smoothing of order two
            treetransforms.chomsky_normal_form(parent_tree, horzMarkov=2, vertMarkov=3)
            # Fixed: `productions = productions + ...` copied the whole list
            # on every record (quadratic); extend appends in place.
            productions.extend(parent_tree.productions())
    with open(productions_file, 'w') as prod_write:  # write productions to a file
        prod_write.write('%start ' + str(tree.label()) + '\n')  # explicitly set the start marker
        for prod in productions:
            prod_write.write(str(prod) + "\n")
    print("Production rules generated")
def pcfg_cnf_learn(treebank, n):
    """Yield the first *n* parsed sentences of *treebank*, each filtered and
    then binarized in place to right-factored, markovized CNF."""
    for parsed in treebank.parsed_sents()[:n]:
        filtered = filter_tree(parsed)
        chomsky_normal_form(
            filtered,
            factor='right',
            horzMarkov=1,
            vertMarkov=1,
            childChar='|',
            parentChar='^',
        )
        yield filtered
# Modernized: Python 2 print statements converted to print() (the rest of
# the file is Python 3).
print(x_axis)
print(y_axis)
import matplotlib.pyplot as plt
plt.title("Accuracy per distance")
plt.scatter(x_axis, y_axis, c="blue", marker='*', label="accuracy index")
plt.scatter(x_axis_labeled, y_axis_labeled, c="red", marker='o', label="accuracy label", alpha=0.5)
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
plt.show()

if __name__ == '__main__':
    treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')
    trees = treebank.parsed_sents()
    trees = trees[:5]
    cleaned_trees = [filter_tree(tree) for tree in trees]
    for t in cleaned_trees:
        # Fixed: the loop body called chomsky_normal_form(tree, ...) with the
        # stale comprehension variable instead of the loop variable `t`, so
        # the cleaned trees were never binarized.
        chomsky_normal_form(t, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
    parser, pcfg = get_parser(cleaned_trees)
    eval_trees(cleaned_trees, parser, pcfg)
    print("----------- Reporting Per Label -----------")
    print(ACCURACY_PER_LABEL)
    print(len(ACCURACY_PER_LABEL))
    for item in ACCURACY_PER_LABEL:
        print(item, "--- total -------> ", ACCURACY_PER_LABEL[item]['total'])
        print(item, "--- precision ---> ", ACCURACY_PER_LABEL[item]['matches']/float(ACCURACY_PER_LABEL[item]['total']))
    print('&'*100)
    print(ACCURACY_PER_DISTANCE)
    calculate_accuracy_per_distance()
neg_sent.append(f.readline()) neg_sent.append(f.readline()) with open("data/rt-polarity.pos.txt",mode="r") as f: pos_sent.append(f.readline()) pos_sent.append(f.readline()) pos_sent.append(f.readline()) trees = [] labels = [0]*3 + [1]*3 sents = pos_sent + neg_sent for sent in sents: a = list(parser.raw_parse(sent)) hytree = a[0] chomsky_normal_form(hytree) trees.append(hytree[0]) rnn = RecursiveNeuralNetworl(embsize=300,mnb_size=6,wordvector=wordvector) trees[0].pretty_print() for tree,label in zip(trees,labels): root_node, softmax_layer, cost, pred = rnn.forward(tree,label) print("correct {0}, predict {1}, cost {2}".format(label,pred,cost)) # if have_1_child # child_have_1_child_and_unicode
def preprocess_trees_cnf(trees):
    """Preprocess each tree and binarize it in place to markovized CNF.

    :param trees: iterable of nltk.Tree objects, mutated in place.
    """
    for tree in trees:
        preprocess_tree(tree)
        # Fixed: factor=1 is not a valid value — NLTK's chomsky_normal_form
        # expects 'left' or 'right'. Use 'right', matching every other CNF
        # call in this codebase.
        chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1,
                            childChar='|', parentChar='^')
def convert_tree_to_cnf(tree):
    """Binarize *tree* in place and overwrite every non-empty internal node
    label with '0'.

    :param tree: an nltk.Tree, mutated in place.
    """
    treetransforms.chomsky_normal_form(tree)
    for ind in tree.treepositions():
        t = tree[ind]
        # Fixed: treepositions() also yields leaf positions, where tree[ind]
        # is a plain str — any non-empty token passed the len() test and then
        # crashed on str.set_label. Skip leaves explicitly.
        if not isinstance(t, str) and len(t) != 0:
            tree[ind].set_label('0')