Example 1
from nltk.parse import ViterbiParser

# get_terminals and sample_message are assumed to be defined elsewhere in
# the module; an illustrative sketch of both follows this function.


def overgeneration_coverage(pcfg, L, num_samples):
    """
    Estimate overgeneration coverage by parsing num_samples random messages
    of length L built from the grammar's terminal vocabulary.
    Returns the percentage of successful parses.
    """
    parser = ViterbiParser(pcfg)

    parse_total = 0    # total number of messages attempted
    parse_success = 0  # total number successfully parsed

    # sample random messages over the grammar's terminals and try to parse each
    vocabulary = get_terminals(pcfg)
    for _ in range(num_samples):
        message = sample_message(L, vocabulary)
        parse_total += 1
        try:
            if parser.parse_one(message):
                parse_success += 1
        except ValueError:
            # raised when a token is not covered by the grammar
            continue
    return parse_success / parse_total * 100
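
The two helpers this snippet relies on are not shown. Below is a minimal sketch of plausible implementations, assuming get_terminals should collect the terminal symbols from the grammar's productions and sample_message should draw L tokens uniformly at random; the bodies are illustrative, not the original author's, and the toy grammar in the usage example is made up.

import random

from nltk import PCFG
from nltk.grammar import Nonterminal


def get_terminals(pcfg):
    """Collect every terminal that appears on the RHS of a production."""
    terminals = set()
    for production in pcfg.productions():
        for item in production.rhs():
            if not isinstance(item, Nonterminal):
                terminals.add(item)
    return sorted(terminals)


def sample_message(L, vocabulary):
    """Draw a message of L tokens uniformly at random from the vocabulary."""
    return [random.choice(vocabulary) for _ in range(L)]


# toy usage: of the 9 possible 2-token messages, only 'spot runs' and
# 'dogs runs' parse, so the reported coverage should be around 22%
toy = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> 'spot' [0.5] | 'dogs' [0.5]
    VP -> 'runs' [1.0]
""")
print(overgeneration_coverage(toy, L=2, num_samples=1000))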
Example 2
import os
import pickle
import time

import chainer
from nltk import FreqDist, Nonterminal, induce_pcfg
from nltk.corpus import ptb
from nltk.parse import ShiftReduceParser, ViterbiParser

# preprocess_nt, is_number and is_key are assumed to be defined elsewhere
# in the module; an illustrative sketch of all three follows this example.


def main(test=False):
    """
    Makes a big, dumb PTB-derived PCFG plus a ShiftReduceParser and a
    ViterbiParser, and serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0  # ARBITRARY; with 0, the frequency-based <unk> branch below never fires
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):

        productions = []
        add_dict = {}

        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):

            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:

                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:

                    # remove all tags from the Nonterminals on the RHS
                    if isinstance(item, Nonterminal):
                        rhs.append(preprocess_nt(item))

                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')

                    # items not in dictionary replaced with <unk>
                    # dictionary requires lower
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')

                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')

                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())

                production._rhs = tuple(rhs)

                # deduplicate productions via their string representation
                if not is_key(add_dict, str(production)):
                    add_dict[str(production)] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            pickle.dump(grammar, f)

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time

        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            pickle.dump(viterbi_parser, f)

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time

        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            pickle.dump(shift_reduce_parser, f)

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        # reload the parsers from disk: if the pickles already existed, the
        # blocks above were skipped and these names would otherwise be unbound
        with open('parsers/viterbi_parser.pkl', 'rb') as f:
            viterbi_parser = pickle.load(f)
        with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
            shift_reduce_parser = pickle.load(f)

        for sample in [1, 23, 20330, 20332, 443]:

            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi      = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))
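
As with the first example, preprocess_nt, is_number, and is_key are not shown. The sketch below is one plausible reading of the comments in the loop above; in particular, how the author's version treats the '|' annotations that chomsky_normal_form introduces is a guess, so treat it as illustrative only.

import re

from nltk.grammar import Nonterminal


def is_key(d, key):
    """True if key is present in the dictionary d."""
    return key in d


def is_number(s):
    """True if the token parses as a number; PTB numbers may contain commas."""
    try:
        float(str(s).replace(',', ''))
        return True
    except ValueError:
        return False


def preprocess_nt(nt):
    """Keep only the primary tag, e.g. NP-SBJ-2 -> NP. Symbols containing
    '|' (the CNF annotations added by chomsky_normal_form) are kept intact."""
    symbol = str(nt)
    if '|' in symbol:
        return Nonterminal(symbol)
    return Nonterminal(re.split(r'[-=]', symbol)[0])

Once main() has run, later sessions can unpickle a parser directly instead of re-inducing the grammar. A usage sketch follows; the sentence is made up and, to be covered by the grammar, input tokens have to be preprocessed the same way as the productions (lowercased, numbers mapped to N, out-of-vocabulary words mapped to <unk>).

import pickle

with open('parsers/viterbi_parser.pkl', 'rb') as f:
    viterbi_parser = pickle.load(f)

tokens = 'the company posted a loss of N million'.split()  # illustrative
try:
    tree = viterbi_parser.parse_one(tokens)
    if tree is not None:
        tree.pretty_print()
except ValueError:
    print('some tokens are not covered by the grammar')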