Esempio n. 1
0
def print_trees(li):
    """Print a numbered probability line for each parse and draw all trees.

    Args:
        li: list of (TREE, PROBABILITY) tuples.
    """
    # enumerate(..., 1) replaces the hand-rolled counter; an f-string
    # replaces the str()+concatenation chain. Output is unchanged.
    for n, tup in enumerate(li, 1):
        print(f"Tree {n}: {tup[1]}")
    # Unpack just the trees into a single draw_trees window.
    draw_trees(*[tup[0] for tup in li])
Esempio n. 2
0
def print_trees(li):
    """Print "Tree <n>: <probability>" for every entry of *li*, then render
    all the trees together in one window.

    Args:
        li: list of (TREE, PROBABILITY) tuples.
    """
    for idx, tup in enumerate(li, start=1):
        # tup[1] is the probability attached to the parse tree tup[0].
        print(f"Tree {idx}: {tup[1]}")
    draw_trees(*[entry[0] for entry in li])
Esempio n. 3
0
# hun_dp_fcfg.py
from nltk import load_parser
from nltk.draw import draw_trees

if __name__ == '__main__':
    # Feature-based CFG for Hungarian determiner phrases.
    parser = load_parser('file:data/hun_dp.fcfg')

    # Pre-tokenized test inputs: bare nouns and determiner + noun pairs.
    sentences = [['ember'], ['kutya'], ['egy', 'kutya'], ['egy', 'ember'],
                 ['az', 'ember'], ['a', 'kutya']]

    # Collect every parse of every sentence, then render them all at once.
    trees = [tree for sentence in sentences for tree in parser.parse(sentence)]

    draw_trees(*trees)
Esempio n. 4
0
def parse(sentence, cfg):
    """Parse *sentence* with a top-down chart parser built from the grammar
    text *cfg*, and draw every resulting parse tree.
    """
    chart_parser = ChartParser(parse_cfg(cfg), TD_STRATEGY)
    tokens = sentence.split()
    draw.draw_trees(*chart_parser.get_parse_list(tokens))
Esempio n. 5
0
def main():
    """Build a PCFG from a treebank, then interactively parse user input.

    Pipeline: read the treebank at TREEPATH, strip derived (dashed) rule
    types, split the corpus 80/10/10 into train/dev/test, convert the splits
    to Chomsky Normal Form, extract a PCFG, and finally run probabilistic
    CKY on sentences typed at stdin, drawing the best tree for each.

    Relies on module-level helpers defined elsewhere in this project:
    TREEPATH, getTypes, trimm_derived_rules, convert_to_CNF, extractPCFGu,
    P_CKY_u, build_tree_u, stdin_to_words, Tree.
    """

    import sys
    print(sys.executable)

    print('Cleaning the corpus')
    # Raw treebank text; also searched later to check word coverage.
    with open(TREEPATH, 'r') as f:
        corpus = f.read()

    # Split on the sentence-opening bracket pattern, dropping the preamble.
    # NOTE(review): this value is shadowed by the interactive `sentences`
    # list further down and is never read — TODO confirm it can be removed.
    sentences = corpus.split('( (SENT (')[1:]

    # ### Get all possible word types in the corpus

    type_word = getTypes(corpus)

    # ### Remove all types with a dash, and write it in a txt file

    corpus_out = trimm_derived_rules(corpus, type_word)

    with open('corpus_out.txt', 'w') as f:
        f.write(corpus_out)

    # Re-read the cleaned corpus line by line (presumably one tree per
    # line — verify against trimm_derived_rules' output format).
    with open('corpus_out.txt', 'r') as f:
        text = f.readlines()

    # ### Create splits

    print('Reading corpus')

    # Train/dev/test proportions (80/10/10).
    prop_train = 0.8
    prop_test = 0.1
    prop_dev = 0.1

    # Deterministic split: random shuffling is deliberately disabled.
    #rand_idx = np.random.permutation(len(text))
    rand_idx = range(len(text))
    idx_end_train = int(prop_train * len(text))
    idx_start_test = int((1 - prop_test) * len(text))

    corpus_train = [text[t] for t in rand_idx[:idx_end_train]]
    corpus_dev = [text[t] for t in rand_idx[idx_end_train:idx_start_test]]
    corpus_test = [text[t] for t in rand_idx[idx_start_test:]]

    print('Training corpus ' + '(' + str(prop_train * 100) + '%) :' +
          str(len(corpus_train)))
    print('Development corpus ' + '(' + str(prop_dev * 100) + '%) :' +
          str(len(corpus_dev)))
    print('Test corpus ' + '(' + str(prop_test * 100) + '%) :' +
          str(len(corpus_test)))

    print('Converting all sentences to Chomsky Normal Format')

    print('Training set')
    corpus_train_rules = convert_to_CNF(corpus_train)
    print('Development set')
    corpus_dev_rules = convert_to_CNF(corpus_dev)
    print('Testing set')
    corpus_test_rules = convert_to_CNF(corpus_test)

    print('Done')

    print('Extracting the PCFG from the corpus')
    # NOTE(review): P is passed to P_CKY_u below — presumably the rule
    # probability table; confirm against extractPCFGu's return contract.
    (grammar_parents, grammar_daughters,
     P) = extractPCFGu(corpus_train_rules, corpus_dev_rules, corpus_test_rules)

    # Total rule count = sum of daughter-rule list lengths over all parents.
    print('Found ' + str(
        sum([
            len(grammar_daughters[s]) for s in list(grammar_daughters.keys())
        ])) + ' rules')
    print('length grammar_daughter ' + str(len(grammar_daughters)))
    grammar = {'parents': grammar_parents, 'daughters': grammar_daughters}

    # Imported here (not at top of file) so the corpus-processing phase
    # above can run without a GUI backend being available.
    import nltk.draw as dr

    print(
        'Please write the text you would like to parse (one space between each word, once sentence per line)'
    )
    print('Press ENTER twice to end')

    # Collect sentences from stdin until an empty line is entered.
    sentences = []
    while True:
        sentence = input()

        if sentence:
            sentences.append(sentence)
        else:
            break

    print('Found ' + str(len(sentences)) + ' sentences.')
    print('Now processing...')

    for i, sentence in enumerate(sentences):
        print('sentence ' + str(1 + i) + ': ' + sentence)
        words = stdin_to_words(sentence)
        fail = False
        # Skip sentences containing out-of-vocabulary words. word[1:-1]
        # strips what appear to be wrapping characters added by
        # stdin_to_words — TODO confirm against its implementation.
        for word in words:
            if corpus.find(' ' + word[1:-1] + ')') == -1:
                print(
                    word[1:-1] +
                    ' not found in corpus, algorithm will fail, moving on to the next sentence.'
                )
                fail = True
        # NOTE(review): this `while` executes at most once (fail is set to
        # True at the bottom) — it is effectively an `if not fail:` guard.
        while fail is False:
            print('Computing Probabilistic CKY table')
            b, t = P_CKY_u(words, grammar, P)
            idx_max = 1
            print('Done')

            print('Building the most probable tree')
            # Rebuild the best parse from the CKY back-pointers, then wrap
            # it in a SENT root so NLTK can read the bracketed string.
            tree = build_tree_u(words, grammar, b, t, 0, -1, idx_max)
            tr = Tree.fromstring('( ( SENT' + tree + '))')

            print(
                'Drawing the tree: Please close the window to proceed to next sentence'
            )
            dr.draw_trees(tr)

            fail = True

    print('Finished')
Esempio n. 6
0
# hun_dp_fcfg.py
from nltk import load_parser
from nltk.draw import draw_trees


if __name__ == '__main__':
    # Feature grammar for Hungarian determiner phrases.
    parser = load_parser('file:data/hun_dp.fcfg')

    # Pre-tokenized inputs: bare nouns plus determiner + noun combinations.
    sentences = [['ember'], ['kutya'], ['egy', 'kutya'],
                 ['egy', 'ember'], ['az', 'ember'], ['a', 'kutya']]

    # Accumulate every parse of every token list.
    trees = []
    for tokens in sentences:
        trees.extend(parser.parse(tokens))

    # One window showing all parse trees.
    draw_trees(*trees)