def print_trees(li):
    """Print a numbered probability line for each tree, then draw them all.

    Args:
        li: list of tuples of form (TREE, PROBABILITY).
    """
    # enumerate replaces the original manual counter (n = 1; n = n + 1).
    for n, tup in enumerate(li, start=1):
        print("Tree " + str(n) + ": " + str(tup[1]))
    # Open one NLTK window showing every tree at once.
    draw_trees(*[tup[0] for tup in li])
# hun_dp_fcfg.py
from nltk import load_parser
from nltk.draw import draw_trees

if __name__ == '__main__':
    # Feature-based CFG for the demo ('hun_dp' — presumably Hungarian
    # determiner phrases; confirm against data/hun_dp.fcfg).
    parser = load_parser('file:data/hun_dp.fcfg')
    # Pre-tokenized test inputs: bare nouns and article + noun pairs.
    sentences = [
        ['ember'],
        ['kutya'],
        ['egy', 'kutya'],
        ['egy', 'ember'],
        ['az', 'ember'],
        ['a', 'kutya'],
    ]
    # Collect every parse of every sentence and draw them in one window.
    trees = [tree for sent in sentences for tree in parser.parse(sent)]
    draw_trees(*trees)
def parse(sentence, cfg):
    """Parse a whitespace-tokenized sentence with a CFG and draw all parses.

    Args:
        sentence: sentence as a single string; split on whitespace.
        cfg: the grammar, as a CFG source string.
    """
    # parse_cfg, TD_STRATEGY and get_parse_list were removed in NLTK 3;
    # CFG.fromstring + ChartParser(...).parse(...) is the current API.
    # NOTE(review): the original requested the top-down strategy; ChartParser's
    # default strategy may build the chart differently but yields the same parses.
    from nltk import CFG
    grammar = CFG.fromstring(cfg)
    parser = ChartParser(grammar)
    words = sentence.split()
    draw.draw_trees(*parser.parse(words))
def main():
    """Build a PCFG from a treebank and interactively parse stdin sentences.

    Reads the corpus at TREEPATH, strips derived (dashed) word types, splits
    the corpus 80/10/10 into train/dev/test, converts sentences to Chomsky
    Normal Form, extracts a PCFG, then reads sentences from stdin and draws
    the most probable CKY parse tree of each.

    Relies on module-level names defined elsewhere in this file: TREEPATH,
    getTypes, trimm_derived_rules, convert_to_CNF, extractPCFGu, P_CKY_u,
    build_tree_u, stdin_to_words, Tree.
    """
    import sys
    print(sys.executable)

    # --- Clean the corpus --------------------------------------------------
    print('Cleaning the corpus')
    with open(TREEPATH, 'r') as f:
        corpus = f.read()
    # (The original also split the raw corpus on '( (SENT (' into a local
    # that was never read before being overwritten; that dead code is removed.)
    # Get all possible word types in the corpus.
    type_word = getTypes(corpus)
    # Remove all types with a dash, and write the result to a txt file.
    corpus_out = trimm_derived_rules(corpus, type_word)
    with open('corpus_out.txt', 'w') as f:
        f.write(corpus_out)
    with open('corpus_out.txt', 'r') as f:
        text = f.readlines()

    # --- Create train/dev/test splits --------------------------------------
    print('Reading corpus')
    prop_train = 0.8
    prop_test = 0.1
    prop_dev = 0.1
    # Deterministic identity "shuffle"; swap in np.random.permutation(len(text))
    # for a randomized split.
    rand_idx = range(len(text))
    idx_end_train = int(prop_train * len(text))
    idx_start_test = int((1 - prop_test) * len(text))
    corpus_train = [text[t] for t in rand_idx[:idx_end_train]]
    corpus_dev = [text[t] for t in rand_idx[idx_end_train:idx_start_test]]
    corpus_test = [text[t] for t in rand_idx[idx_start_test:]]
    print('Training corpus ' + '(' + str(prop_train * 100) + '%) :' + str(len(corpus_train)))
    print('Development corpus ' + '(' + str(prop_dev * 100) + '%) :' + str(len(corpus_dev)))
    print('Test corpus ' + '(' + str(prop_test * 100) + '%) :' + str(len(corpus_test)))

    # --- Convert to CNF and extract the PCFG -------------------------------
    print('Converting all sentences to Chomsky Normal Format')
    print('Training set')
    corpus_train_rules = convert_to_CNF(corpus_train)
    print('Development set')
    corpus_dev_rules = convert_to_CNF(corpus_dev)
    print('Testing set')
    corpus_test_rules = convert_to_CNF(corpus_test)
    print('Done')
    print('Extracting the PCFG from the corpus')
    (grammar_parents, grammar_daughters, P) = extractPCFGu(
        corpus_train_rules, corpus_dev_rules, corpus_test_rules)
    # Total rule count: sum over .values() replaces the original
    # sum([len(d[s]) for s in list(d.keys())]).
    print('Found ' + str(sum(len(rules) for rules in grammar_daughters.values())) + ' rules')
    print('length grammar_daughter ' + str(len(grammar_daughters)))
    grammar = {'parents': grammar_parents, 'daughters': grammar_daughters}

    # --- Interactive parsing loop ------------------------------------------
    import nltk.draw as dr
    # Typo fixed in the prompt: "once sentence" -> "one sentence".
    print('Please write the text you would like to parse (one space between each word, one sentence per line)')
    print('Press ENTER twice to end')
    sentences = []
    while True:
        sentence = input()
        if sentence:
            sentences.append(sentence)
        else:
            break
    print('Found ' + str(len(sentences)) + ' sentences.')
    print('Now processing...')
    for i, sentence in enumerate(sentences):
        print('sentence ' + str(1 + i) + ': ' + sentence)
        words = stdin_to_words(sentence)
        # Reject any sentence containing a word absent from the corpus:
        # CKY would have no lexical rule for it and the parse would fail.
        fail = False
        for word in words:
            if corpus.find(' ' + word[1:-1] + ')') == -1:
                print(word[1:-1] + ' not found in corpus, algorithm will fail, moving on to the next sentence.')
                fail = True
        # The original wrapped this in `while fail is False: ...; fail = True`,
        # a loop that ran at most once; a plain `if` expresses the intent.
        if not fail:
            print('Computing Probabilistic CKY table')
            b, t = P_CKY_u(words, grammar, P)
            idx_max = 1
            print('Done')
            print('Building the most probable tree')
            tree = build_tree_u(words, grammar, b, t, 0, -1, idx_max)
            tr = Tree.fromstring('( ( SENT' + tree + '))')
            print('Drawing the tree: Please close the window to proceed to next sentence')
            dr.draw_trees(tr)
    print('Finished')
# hun_dp_fcfg.py from nltk import load_parser from nltk.draw import draw_trees if __name__ == '__main__': parser = load_parser('file:data/hun_dp.fcfg') sentences = [ ['ember'], ['kutya'], ['egy', 'kutya'], ['egy', 'ember'], ['az', 'ember'], ['a', 'kutya'] ] trees = [] for sentence in sentences: for tree in parser.parse(sentence): trees.append(tree) draw_trees(*trees)