Example #1
 def __init__(self, camp, npc, pc, start, visualizer=None):
     self.camp = camp
     self.npc = npc
     self.pc = pc
     if not visualizer:
         visualizer = SimpleVisualizer()
     self.visualizer = visualizer
     self.root = None
     self.npc_offers = list()
     self.npc_grammar = grammar.Grammar()
     self.pc_grammar = grammar.Grammar()
     #self._get_dialogue_data()
     self.build(start)
Example #2
 def test_empty_words(self):
     grammar = gmr.Grammar(gmr.Rule('N', ['Nothing'], preterminal=True))
     words = []
     parser = psr.EarleyParser(grammar)
     trees = parser.parse(words)
     self.assertEqual(0, len(trees))
     self.assertEqual([], trees)
Example #3
 def test_empty_grammar(self):
     grammar = gmr.Grammar()
     words = ['Something']
     parser = psr.EarleyParser(grammar)
     trees = parser.parse(words)
     self.assertEqual(0, len(trees))
     self.assertEqual([], trees)
Example #4
    def __init__(self):
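        # Set the evolver's global run parameters, build and evaluate an initial
        # population against the BNF grammar, then create meshes and print stats.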
        self.generation = 0
        evolver.TIME = time.time()
        evolver.SAVE_BEST = True
        evolver.CODON_SIZE = 100
        evolver.ELITE_SIZE = 1
        evolver.POPULATION_SIZE = 35
        evolver.GENERATION_SIZE = 35
        evolver.FRONT_FOLDER = "frontData"
        evolver.GENERATIONS = 5
        evolver.DEFAULT_FIT = 100000000000000
        evolver.MUTATION_PROBABILITY = 0.015
        evolver.CROSSOVER_PROBABILITY = 0.7
        evolver.GRAMMAR_FILE = "grammars/jon_pylon10.bnf"
        evolver.FITNESS_FUNCTION = evolver.StructuralFitness()
        evolver.IMG_COUNTER = 0

        self.pop_size = evolver.POPULATION_SIZE
        self.grammar = grammar.Grammar(evolver.GRAMMAR_FILE)
        self.individuals = evolver.initialise_population(evolver.POPULATION_SIZE)
        for idx, indiv in enumerate(self.individuals):
            indiv.uid = idx
        self.selection = lambda x: evolver.tournament_selection(x, evolver.POPULATION_SIZE)
        evolver.evaluate_fitness(self.individuals, self.grammar,
                                 evolver.FITNESS_FUNCTION)
        self.best_ever = min(self.individuals)
        self.fronts = []
        self.individuals.sort()

        print "creating meshes"
        evolver.create_meshes(self.individuals)
        evolver.print_stats(1, self.individuals)
Example #5
def main():
    # gram1 = {"A": not_terminal.Not_terminal("A", [rule.Rule("B C"), rule.Rule("bad")]),
    #          "B": not_terminal.Not_terminal("B", [rule.Rule("big C boss"), rule.Rule("epsilon")]),
    #          "C": not_terminal.Not_terminal("C", [rule.Rule("cat"), rule.Rule("cow")])}

    # gram1 = {"S": not_terminal.Not_terminal("S", [rule.Rule("A uno B C"), rule.Rule("S dos")]),
    #          "A": not_terminal.Not_terminal("A", [rule.Rule("B C D"), rule.Rule("A tres"), rule.Rule("epsilon")]),
    #          "B": not_terminal.Not_terminal("B", [rule.Rule("D cuatro C tres"), rule.Rule("epsilon")]),
    #          "C": not_terminal.Not_terminal("C", [rule.Rule("cinco D B"), rule.Rule("epsilon")]),
    #          "D": not_terminal.Not_terminal("D", [rule.Rule("seis"), rule.Rule("epsilon")])}

    # gram1 = {"A": not_terminal.Not_terminal("A", [rule.Rule("ant C"), rule.Rule("B")]),
    #          "B": not_terminal.Not_terminal("B", [rule.Rule("cat C"), rule.Rule("C")]),
    #          "C": not_terminal.Not_terminal("C", [rule.Rule("D fat"), rule.Rule("D")]),
    #          "D": not_terminal.Not_terminal("D", [rule.Rule("B")])}

    # gram1 = {"A": not_terminal.Not_terminal("A", [rule.Rule("ant C"), rule.Rule("B")]),
    #          "B": not_terminal.Not_terminal("B", [rule.Rule("cat C"), rule.Rule("C")]),
    #          "C": not_terminal.Not_terminal("C", [rule.Rule("fat D"), rule.Rule("D")]),
    #          "D": not_terminal.Not_terminal("D", [rule.Rule("B")])}

    gramar = grammar.Grammar(gram1)

    # gramar = first(gramar) IMPORTANT
    # first(gramar)
    # print(first_aux(gramar, "S"))
    first(gramar)
Example #6
def main(fname):
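    # Tokenize the source file, build SLR(1) parse tables from the grammar,
    # parse the tokens into an AST, and write the generated code to <fname>.ll.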
    with open(fname, "r") as infile:
        tokens = scanner.scan(infile, rules)
    tokens.append(scanner.Symbol("$", "EOF", -1, -1, -1))
    #print(tokens)
    g = grammar.Grammar(grammar_dict)

    lr_parse_common.augment_grammar(g)

    for rule in g.rules:
        if rule.to_node is None:
            rule.to_node = lambda rule, children: ast.ASTNode(
                rule.lhs, children)

    kernel = slr1.LR0Item(g.rules[-1], 0)
    first_set = first_follow.get_first(g)
    follow = first_follow.get_follow(g, first_set)
    dfa = lr_parse_common.make_dfa(g, slr1.closure, kernel, first_set)
    action, goto_table = slr1.make_parse_table(dfa, follow, g)
    ast_root = lr_parse_common.parse(dfa, action, goto_table, tokens, g)

    print(ast.gen_ast_digraph(ast_root))

    gen_code = gen_ir.CodeGenVisitor(ast_root)
    gen_code.accept()
    with open(fname + ".ll", "w") as outfile:
        outfile.write(gen_code.get_code())
Example #7
 def test_initializer(self):
     grammar = gmr.Grammar(gmr.Rule('S', ['VP']),
                           gmr.Rule('VP', ['V']),
                           gmr.Rule('V', ['initialize'], preterminal=True))
     self.assertIn(gmr.Rule('S', ['VP']), grammar)
     self.assertIn(gmr.Rule('VP', ['V']), grammar)
     self.assertIn(gmr.Rule('V', ['initialize'], preterminal=True), grammar)
     self.assertEqual(3, len(grammar))
Example #8
 def test_regex_rule(self):
     grammar = gmr.Grammar(
         gmr.Rule('S', [gmr.Regex(r'[a-z]')], preterminal=True))
     words = ['hello']
     parser = psr.EarleyParser(grammar)
     trees = parser.parse(words)
     self.assertEqual(1, len(trees))
     self.assertEqual([['S', 'hello']], trees)
Example #9
def grammarEnumeration(grammarFile, number, debug = False):
    g = grammar.Grammar(grammarFile)
    classification = g.classifyFirstNStrings(number, debug = debug)
    notInLang = sorted([k for k, v in classification.items() if not v], key=lambda x: (len(x),x))
    isInLang = sorted([k for k, v in classification.items() if v], key=lambda x: (len(x), x))
    print("In Language:")
    print('\t' + str(isInLang))
    print("Not In Language:")
    print('\t' + str(notInLang))
Example #10
def isInGrammar(grammarFile, testString, debug = False):
    g = grammar.Grammar(grammarFile)
    alphabet = g.getAlphabet()
    tokens = tokenizer.Tokenizer(tokenizer.getTTLForAlphabet(alphabet), True)
    tokens.tokenize(testString)
    if g.isInLanguage(tokens, debug):
        print("Test String in Language!")
    else:
        print("Test String NOT in Language!")
Example #11
def lastgen(resultsfolder):
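    # Reload the last generation saved under results/<resultsfolder>,
    # rebuild the population and re-evaluate its fitness.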
    PATHNAME = "results/"+resultsfolder
    print "pathname", PATHNAME
    filename = get_last_gen(PATHNAME)
    lastgen = int(filename.rstrip('.dat').lstrip('gen'))
    print "the last generation was:", lastgen
    parsed_pop = parse_pop(PATHNAME+'/'+filename)
    sorted_pop = sort_pop(parsed_pop)
    
    BNF_GRAMMAR = GRAMMAR.Grammar(GRAMMAR_FILE)
    INDIVIDUALS = reinitialise_pop(sorted_pop)
    FITNESS_FUNCTION = bwbfitness.CFD_Fitness(debug=True,foampng=False)
    evaluate_fitness(INDIVIDUALS, BNF_GRAMMAR, FITNESS_FUNCTION)
Example #12
 def load(self, gramfile):
     self.grammar = grammar.Grammar(gramfile)
     self.gramlist = [os.path.basename(gramfile)]
     # take the basename of the gramfile as the gramlist entry
     for f in os.listdir(self.runtimedir):
         name, ext = os.path.splitext(f)
         if ext in ['.bgr']:
             os.unlink(os.path.join(self.runtimedir, f))
     with open(
             os.path.join(
                 self.runtimedir,
                 os.path.extsep.join([os.path.basename(gramfile), 'bgr'])),
             'wb') as o:
         cPickle.dump(self.grammar, o)
Example #13
def constraint_test():
    """ Verify constraint checking methods."""
    import itertools, sys

    show_analysis = False
    #Generated via grammar
    gr = grammar.Grammar('grammars/test_constraints.bnf')
    inputs = ([1 for _ in range(100)], [ i%3 for i in range(100)])
    for _input in inputs: 
        output = gr.generate(_input)
        azr = analyser.Analyser('test',output['phenotype'],True)
        try:
            azr.create_graph()
        except ValueError as e:
            print(__name__, "ERROR", _input, e)
            continue
        azr.parse_graph()
        azr.apply_stresses()
        azr.create_slf_file()
        azr.test_slf_file()
        azr.parse_results()
        azr.print_stresses()
        if show_analysis:
            azr.show_analysis()
            
    #Fixed generated
    lengths = (1000, 10000)
    levels = (5, 10)
    for length_idx, level_idx in itertools.permutations([0,1]):
        try:
            GRAPH = constrained_offset_graph(lengths[length_idx],
                                             levels[level_idx])
        except ValueError as e:
            print(__name__, "ERROR", lengths[length_idx], levels[level_idx], e)
            continue
        GRAPH.save_graph("pylon")
        print "nodes:", GRAPH.number_of_nodes()
        print "edges", GRAPH.number_of_edges()
    #will it blend?
        azr = analyser.Analyser('test',"moo",True)
        azr.my_graph = GRAPH
        azr.parse_graph()
        azr.apply_stresses()
        azr.create_slf_file()
        azr.test_slf_file()
        azr.parse_results()
        azr.print_stresses()
        if show_analysis:
            azr.show_analysis()
Example #14
 def test_ambiguity(self):
     grammar = gmr.Grammar(
         gmr.Rule('S', ['NP', 'VP']), gmr.Rule('NP', ['Det', 'Nominal']),
         gmr.Rule('NP', ['Det', 'Nominal', 'PP']),
         gmr.Rule('NP', ['Nominal']), gmr.Rule('VP', ['VP', 'PP']),
         gmr.Rule('VP', ['V', 'NP']), gmr.Rule('PP', ['Prep', 'NP']),
         gmr.Rule('Det', ['a'], preterminal=True),
         gmr.Rule('Nominal', ['I'], preterminal=True),
         gmr.Rule('Nominal', ['man'], preterminal=True),
         gmr.Rule('Nominal', ['telescope'], preterminal=True),
         gmr.Rule('V', ['saw'], preterminal=True),
         gmr.Rule('Prep', ['with'], preterminal=True))
     words = ['I', 'saw', 'a', 'man', 'with', 'a', 'telescope']
     parser = psr.EarleyParser(grammar)
     trees = parser.parse(words)
     self.assertEqual(2, len(trees))
     self.assertEqual(
         [
             # ... saw ... with a telescope
             [
                 'S', ['NP', ['Nominal', 'I']],
                 [
                     'VP',
                     [
                         'VP', ['V', 'saw'],
                         ['NP', ['Det', 'a'], ['Nominal', 'man']]
                     ],
                     [
                         'PP', ['Prep', 'with'],
                         ['NP', ['Det', 'a'], ['Nominal', 'telescope']]
                     ]
                 ]
             ],
             # ... man with a telescope
             [
                 'S', ['NP', ['Nominal', 'I']],
                 [
                     'VP', ['V', 'saw'],
                     [
                         'NP', ['Det', 'a'], ['Nominal', 'man'],
                         [
                             'PP', ['Prep', 'with'],
                             ['NP', ['Det', 'a'], ['Nominal', 'telescope']]
                         ]
                     ]
                 ]
             ]
         ],
         trees)
Example #15
    def __init__(self):
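        # ROS node setup: load the prohibited-word list and a sample grammar,
        # subscribe to speech-recognition results, and set up a publisher for
        # grammar_lu/results.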
        rospy.init_node("grammar_lu")

        with open("prohibited_words.txt", "r") as f:
            self.prohibited_words = [line.strip() for line in f.readlines()]

        self.pub_results = rospy.Publisher('grammar_lu/results',
                                           String,
                                           queue_size=10)
        rospy.Subscriber("google_speech/recres_nbest", String,
                         self.recog_callback)
        rospy.Subscriber("grammar_lu/grammar", String, self.set_gram)
        self.gram = grammar.Grammar()
        self.gram.load("grammar_sample.txt")

        rospy.spin()
Example #16
 def test_multiple_parses(self):
     grammar = gmr.Grammar(gmr.Rule('N', ['I'], preterminal=True),
                           gmr.Rule('V', ['made'], preterminal=True),
                           gmr.Rule('N', ['her'], preterminal=True),
                           gmr.Rule('V', ['duck'], preterminal=True),
                           gmr.Rule('N', ['duck'], preterminal=True),
                           gmr.Rule('S', ['N', 'V', 'N', 'V']),
                           gmr.Rule('S', ['N', 'V', 'N', 'N']))
     words = ['I', 'made', 'her', 'duck']
     parser = psr.EarleyParser(grammar)
     trees = parser.parse(words)
     self.assertEqual(2, len(trees))
     self.assertEqual(
         [['S', ['N', 'I'], ['V', 'made'], ['N', 'her'], ['V', 'duck']],
          ['S', ['N', 'I'], ['V', 'made'], ['N', 'her'], ['N', 'duck']]],
         trees)
Example #17
def main():
    gram_dict = {'start': 'S->CC', 'other': ['C->cC|d']}
    gram = grammar.Grammar(gram_dict['start'], gram_dict['other'])
    gram.normalize()
    all_items, raw_goto = get_lr1_relation(gram)
    action_dict, goto_dict = get_parse_table(gram, all_items, raw_goto)
    print 'action_dict'
    for from_set, edges in action_dict.iteritems():
        for token, to_set in edges.iteritems():
            print from_set, token, to_set
    print 'goto_dict'
    for from_set, edges in goto_dict.iteritems():
        for token, to_set in edges.iteritems():
            print from_set, '-------', token, '-----', to_set
    return
    for itm in get_lr1_relation(gram):
        print itm
Example #18
 def test_programming_language_parsing(self):
     grammar = gmr.Grammar(gmr.Rule('program',
                                    ['variable', 'operator', 'value']),
                           gmr.Rule('variable', [gmr.Regex(r'x')],
                                    preterminal=True),
                           gmr.Rule('operator', [gmr.Regex(r'[+\-=*/]')],
                                    preterminal=True),
                           gmr.Rule('value', [gmr.Regex(r'\d+')],
                                    preterminal=True),
                           distinguished_symbol='program')
     words = ['x', '=', '599993949']
     parser = psr.EarleyParser(grammar)
     trees = parser.parse(words)
     self.assertEqual([[
         'program', ['variable', 'x'], ['operator', '='],
         ['value', '599993949']
     ]], trees)
Example #19
def main():
    if len(sys.argv) < 2:
        print "Please set a random seed"
        exit()
    else:
        random.seed(sys.argv[1])
        PATHNAME = "results/run"+str(sys.argv[1])

    if os.path.exists(PATHNAME):
        print "path already exists! going to overwrite results"
        exit()
    else:
        os.makedirs(PATHNAME)
        
    BNF_GRAMMAR = GRAMMAR.Grammar(GRAMMAR_FILE)
    INDIVIDUALS = initialise_population(POPULATION_SIZE)
    LAST_POP = search_loop(GENERATIONS, INDIVIDUALS, BNF_GRAMMAR, tournament_selection, FITNESS_FUNCTION, PATHNAME)
Example #20
def main():
    cwd = os.getcwd()
    if cwd.startswith('/ichec/home'):
        print "Run it from the work dir!"
        exit()
    runstarttime = time.time()

    if os.path.exists(PATHNAME):
        print "path already exists! going to overwrite results"
        exit()
    else:
        os.makedirs(PATHNAME)

    BNF_GRAMMAR = GRAMMAR.Grammar(GRAMMAR_FILE)
    INDIVIDUALS = initialise_population(POPULATION_SIZE)
    LAST_POP = search_loop(GENERATIONS, INDIVIDUALS, BNF_GRAMMAR,
                           tournament_selection, FITNESS_FUNCTION, PATHNAME)
    endtime = time.time() - runstarttime
    print "total run took", endtime, "seconds"
Example #21
    def test_parse(self):
        grammar = gmr.Grammar(
            gmr.Rule('S', ['VP']), gmr.Rule('VP', ['V', 'NP']),
            gmr.Rule('NP', ['Det', 'Nominal']),
            gmr.Rule('Det', ['that'], preterminal=True),
            gmr.Rule('Nominal', ['flight'], preterminal=True),
            gmr.Rule('V', ['Book'], preterminal=True))

        words = ['Book', 'that', 'flight']

        parser = psr.EarleyParser(grammar)
        trees = parser.parse(words)

        self.assertEqual([[
            'S',
            [
                'VP', ['V', 'Book'],
                ['NP', ['Det', 'that'], ['Nominal', 'flight']]
            ]
        ]], trees)
Example #22
    def create_grammar(self, _input: str):
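        # Build a Grammar from lines of the form "HEAD -> alt1 | alt2";
        # any symbol that never appears as a head is treated as a terminal.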
        lines = _input.split('\n')
        non_terminals = []
        terminals = set()
        productions = {}

        for line in lines:
            line = line.split('->')
            head = line[0][:-1]
            body = line[1][1:]

            non_terminals.append(head)
            productions[head] = body.split(' | ')

        for body in productions.values():
            for production in body:
                symbols = production.split(' ')
                for symbol in symbols:
                    if symbol not in non_terminals:
                        terminals.add(symbol)

        start = non_terminals[0]
        non_terminals = set(non_terminals)
        return grammar.Grammar(non_terminals, terminals, start, productions)
Example #23
def set_cpp():
    return gr.Grammar(set_rules(), set_nonterm(), set_term(),
                      gr.Term("программа"))
Example #24
        print(file_)
        if file_ in [".DS_Store", ".DS_Storebinarized.txt"]:
            continue
        #print (file_)
        f = codecs.open(subfolder + file_, encoding="utf-8")
        for line in f:
            #print (line[:-1])
            try:
                t = tree(string=line[:-1])
                sent = t.sentence
                if "=" in sent:
                    print(sent)
                    continue
                # need to add the other level of "binarization"
                t.binarize()
                t.normalize()
            except Exception as e:
                print(file_, line)
                print(e)
            rules.extend([grammar.fromTreetoRule(x) for x in t.allRules()])

print("")

G = grammar.Grammar(rules)

print(len(G.nonterminalrules))
print(len(G.terminalrules))
print(len(G.symbols))

pickle.dump(G, codecs.open("grammarPennTree5.txt", "wb"))
Example #25
def grammarTest(inFile = '../ParserTongue/ebnf.ebnf'):
    ebnf = grammar.Grammar(inFile)
    for i, rule in enumerate(ebnf.getRuleList()):
        print("Rule " + str(i) + ":\n")
        print(str(rule))
        print()
Example #26
def grammarFileGenIters(grammarFile, iters, _debug = False):
    g = grammar.Grammar(grammarFile)
    grammarGenIters(g, iters, _debug)
Beispiel #27
0
def grammarFileGen(grammarFile, _debug = False):
    g = grammar.Grammar(grammarFile)
    grammarGen(g, _debug)
Beispiel #28
0
    def parse(self, input):
        """Read and parse automat."""
        self.index = 0
        self.str = input
        self._line = 1
        self._pos = 0
        self._charLine = 1
        self._charPos = 0

        while True:
            # wait for keyword
            token = self._getToken()
            self._tShould(token, ['id', ''])
            keyword = token.string

            if token.type == '':
                break

            token = self._getToken()
            self._tShould(token, ['='])
            if keyword == 'grammar':
                if self.grammar is not False:
                    raise ValueError("Grammar is defined twice in this file.",
                                     3)

                # new empty grammar
                self.grammar = grammar.Grammar()

                # wait for opening brackets
                token = self._getToken()
                self._tShould(token, ['('])
                token = self._getToken()
                self._tShould(token, ['{'])

                # load non-terminals
                self._loadIdsArr(self.grammar.addNonTerminal)

                # comma and opening bracket
                token = self._getToken()
                self._tShould(token, [','])
                token = self._getToken()
                self._tShould(token, ['{'])

                # load terminals
                self._loadCharArr(self.grammar.addTerminal)

                # comma and opening bracket
                token = self._getToken()
                self._tShould(token, [','])
                token = self._getToken()
                self._tShould(token, ['{'])

                # load rules
                self._loadGrammarRules()

                # comma and one character
                token = self._getToken()
                self._tShould(token, [','])
                token = self._getToken()
                self._tShould(token, ['id'])
                self.grammar.setStartSymbol(token.string)

                # closing bracket and comma - end of grammar
                token = self._getToken()
                self._tShould(token, [')'])

            elif keyword == 'automaton':
                if self.grammar is False:
                    raise ValueError(
                        "Automaton must " + " be defined after grammar.", 3)
                if self.automaton is not False:
                    raise ValueError("Automaton is defined twice.", 3)
                self.automaton = True

                # new empty automat
                aut = automat.Automat()

                # automat alphabet are terminals and nonterminals from grammar
                for symbol in self.grammar.nonterminals:
                    aut.addAlpha(symbol)
                for symbol in self.grammar.terminals:
                    aut.addAlpha(symbol)

                # wait for opening brackets
                token = self._getToken()
                self._tShould(token, ['('])
                token = self._getToken()
                self._tShould(token, ['{'])

                # load states
                self._loadIdsArr(aut.addState)

                # comma and opening bracket
                token = self._getToken()
                self._tShould(token, [','])
                token = self._getToken()
                self._tShould(token, ['{'])

                # load rules
                self._loadAutomatRules(aut)

                # comma and start state
                token = self._getToken()
                self._tShould(token, [','])
                token = self._getToken()
                if token.type != 'id':
                    raise ValueError("Missing automat start state", 3)
                else:
                    aut.setStart(token.string)

                # comma and opening bracket
                token = self._getToken()
                self._tShould(token, [','])
                token = self._getToken()
                self._tShould(token, ['{'])

                self._loadIdsArr(aut.setTerminating)

                # closing bracket and nothing
                token = self._getToken()
                self._tShould(token, [')'])
                if not self.aut:
                    self.aut = aut
                else:
                    aut.join(self.aut)
                    self.aut = aut
            elif keyword == 'precedence':
                if self.grammar is False:
                    raise ValueError(
                        "Precedence must be defined after" + " grammar.", 3)
                if self.prec is not False:
                    raise ValueError("Precedence is defined twice.", 3)

                self.prec = PrecedenceTable()
                token = self._getToken()
                self._tShould(token, ['('])
                while self.loadPrecedenceRules():
                    pass

            elif keyword == 'levels':
                if self.grammar is False:
                    raise ValueError(
                        "Levels must be defined after" + " grammar.", 3)
                if self.levels is not False:
                    raise ValueError("Levels are defined twice.", 3)
                self.levels = True

                token = self._getToken()
                self._tShould(token, ['{'])

                aut = automat.Automat()

                # automat alphabet are terminals and nonterminals from grammar
                for symbol in self.grammar.nonterminals:
                    aut.addAlpha(symbol)
                for symbol in self.grammar.terminals:
                    aut.addAlpha(symbol)

                start = '0'
                aut.addState(start)
                aut.setStart(start)
                stringNum = 0
                charNum = 0
                while True:
                    lastState = start
                    token = self._getToken()
                    self._tShould(token, ['}', ';', 'str'])
                    if token.type == '}':
                        break
                    else:
                        while True:
                            if token.type == ';':
                                aut.setTerminating(lastState)
                                break

                            newState = str(stringNum) + "-" + str(charNum)
                            aut.addState(newState)
                            aut.addRule(lastState, token.string, newState)
                            lastState = newState

                            token = self._getToken()
                            self._tShould(token, ['str', ';'])

                            charNum += 1
                    stringNum += 1
                if not self.aut:
                    self.aut = aut
                else:
                    self.aut.join(aut)

            else:
                raise ValueError("Undefined keyword '" + keyword + "'", 3)
        if self.grammar is False:
            raise ValueError("No grammar specified in grammar input file.", 3)
Example #29
def main():
	# extract args
	p = args.pickle
	verbose = args.very_verbose or args.verbose
	check = args.check
	ambiguous = args.ambiguous
	mlps = args.most_likely_productions
	lower_case = args.lower_case
	test = args.test
	non_terms_for_ml = mlps.split() if mlps and mlps.__class__ != bool else ['VP', 'S', 'NP', 'SBAR', 'PP']
	max_word_length = 15

	# loading grammar
	if p:
		if verbose:
			util.log_g("Loading grammar from pickle file %s" % (p))
		pkl_file = open(p, 'rb')
		G = pickle.load(pkl_file)
		pkl_file.close()
	else:
		if verbose:
			util.log_g("Loading grammar from treebank %s" % (args.treebank))
		f = open(args.treebank, 'r')
		G = grammar.Grammar(f, args.grammar_limit, verbose, lower_case)
		f.close()
		if args.save:
			output = open(args.save + '.pkl', 'wb')
			pickle.dump(G, output)
			output.close()
	if verbose: util.log_g("Grammar loaded.")
		
	# running checks and statistics
	if check:
		util.log_g("Testing probability consistencies.")
		util.log_g("Greatest divergence from unity: %0.20f." % max([abs(1 - i) for i in G.check_pcfg_sums()]))
	if check or ambiguous:
		util.log_g("Ambiguous word tests.")
		ambig = G.ambiguous()
		ambig_words = zip(*ambig)[0] if ambig else []
		if ambiguous and not ambiguous.__class__ == bool:
			for word in ambiguous.split():
				if word in ambig_words:
					util.log_g("'%s' is ambiguous." % (word))
					pprint.pprint(ambig[ambig_words.index(word)])
				else:
					util.log_g("'%s' is not ambiguous." % (word))
		else:
			util.log_g("4 randomly chosen syntactically ambiguous terminals:")
			pprint.pprint(ambig[0:4])
	if check or mlps:
		util.log_g("Most likely production for non-terminals %s:" % non_terms_for_ml)
		mlps = G.most_likely_productions(non_terms_for_ml)
		pprint.pprint(mlps)
		
	# running CYK
	if args.cyk:
		if args.cyk.__class__ == bool:
			util.log_p("Enter new line to exit.")
			while True:
				s = raw_input('Enter a sentence to parse: ')
				if len(s):
					if verbose:
						util.log_p("Start CYK")
					parse = cyk.CYK(G, s, verbose, lower_case)
					if verbose > 1:
						util.log_p("Covering productions:")
						pprint.pprint(parse.covering_productions())
						util.log_p("Covering productions string: %s" % parse.covering_productions_str())
					util.log_p("Viterbi Parse: %s" % parse.viterbi_parse())
				else:
					break
		else:
			f = open(args.cyk)
			limit = args.parser_test_limit
			start = args.parser_test_start
			i = 0
			if test:
				f_vit = open('viterbi_sentences.txt', 'w')
			else:
				f_cov = open('covering_productions.txt', 'w')
			for line in f:
				if limit and i >= limit:
					break
				i += 1
				if start and i < start:
					continue
				if max_word_length and len(line.split()) > max_word_length:
					out = "\n"
					if test:
						f_vit.write(out)
					else:
						f_cov.write(out)
				else:
					util.log_p("Sentence %d, parsing sentence: << %s >>" % (i, line.strip()))
					parse = cyk.CYK(G, line, verbose)
					# write parse results to output file
					if test:
						out = parse.viterbi_parse()
						if out == util.NOT_IN_GRAMMAR_ERROR:
							out = "\n"
						else:
							out += "\n"
						f_vit.write(out)
					else:
						out = parse.covering_productions_str()
						f_cov.write(out + "\n")
					if verbose:
						util.log_p("Wrote line: %s" % out)
					gc.collect() # collect cyk object
			f.close()
			if test:
				f_vit.close()
			else:
				f_cov.close()
Example #30
def parse (spec, filename='stdin'):
    '''Construct a new Grammar from the specification SPEC.'''

    def error (msg):
        '''Prints MSG to stderr and exits with a non-zero error code.'''
        print >>sys.stderr, 'Error: %s'% (msg)
        sys.exit (1)

    def checkpoint ():
        '''Create a parser checkpoint.'''
        return (lexer.checkpoint (), len (stack))

    def restore (checkpoint):
        '''Restore a parser checkpoint.'''
        lexer.restore (checkpoint[0])
        stack.__setslice__(0, len (stack), stack[0:checkpoint[1]])
        return True


    lexer = Tokenizer (spec, r'[ \n\r\t\v\f]+', r'//[^\n\r]*?(?:[\n\r]|$)')
    stack = []                          # semantic stack
    g = grammar.Grammar ()              # the grammar to build

    def G ():
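        # Spec layout: zero or more declarations, then "%%", then one or more
        # rules, terminated by end of input ("$").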
        while Declaration ():
            pass

        if not lexer.token ('%%'):
            error ('"%%" must separate declarations from rules')

        if not R ():
            error ('must have at least one rule')
        rule = stack.pop ()
        g.setStartSymbol (rule.lhs)
        g.addRule (rule)
        while R ():
            rule = stack.pop ()
            g.addRule (rule)

        return lexer.token ('$') != None

    def Declaration ():
        if AssocDecl ():     pass
        elif ImportDecl ():  pass
        elif IgnoreDecl ():  pass
        elif OptDecl():      pass
        else:                return False
        return True

    def AssocDecl ():
        if lexer.token ('%right'):   assoc = grammar.Grammar.RIGHT_ASSOCIATIVE
        elif lexer.token ('%left'):  assoc = grammar.Grammar.LEFT_ASSOCIATIVE
        else:                        return False

        if not Terminal ():
            error ('"associativity" decls require at least one operator')

        ops = [stack.pop ()]
        while Terminal ():
            ops.append (stack.pop ())

        g.declareOperatorAssocs (ops, assoc)
        return True

    def ImportDecl ():
        if not lexer.token ('%import'):  return False

        if not PyModuleName ():
            error ('"import" decls require a module name')

        module = stack.pop ()
        g.declareImport (module)
        return True

    def IgnoreDecl ():
        if not lexer.token ('%ignore'): return False

        if not Terminal ():
            error ('"ignore" decls require a terminal symbol')

        term = stack.pop ()
        g.declareIgnore (term)
        return True

    def OptDecl ():
        if not lexer.token ('%optional'): return False

        if not (Nonterminal () and Terminal ()):
            error ('invalid %optional decl')

        regex = stack.pop ()
        lhs = stack.pop ()
        g.declareOptional (lhs, regex)
        return True

    def R ():
        if not Nonterminal ():
            return False

        if not lexer.token (r'\->'):
            error ('rules LHSs must be followed by "->"')

        rule = grammar.Rule (stack.pop ())
        if not Production ():
            error ('rule "{0}" has no productions'.format(rule.lhs))
        (rhs, actions, prec, assoc, subsym) = stack.pop ()
        rule.addProduction (rhs=rhs, actions=actions, prec=prec, assoc=assoc, subsym=subsym)

        while lexer.token (r'\|'):
            if not Production ():
                error ('(%s) "|" must be followed by a production'% (rule.lhs))
            (rhs, actions, prec, assoc, subsym) = stack.pop ()
            rule.addProduction (rhs=rhs, actions=actions, prec=prec, assoc=assoc, subsym=subsym)

        if not lexer.token (';'):
            error ('(%s) rules must be ended by ";"'% (rule.lhs))

        stack.append (rule)
        return True

    def Production ():
        if not (EmptyProd () or NonEmptyProd ()):
            return False
        (rhs, prec, assoc, actions, subsym) = stack.pop ()

        action = None
        if Action ():
            action = stack.pop ()
        actions.append (action)
        if subsym and len(rhs) > 1:     # can't subparse a multi-nonterminal RHS...
            error ('"subparse" requires a one-element RHS, but you\'ve got %s' % rhs)
        stack.append ((rhs, actions, prec, assoc, subsym))
        return True

    def EmptyProd ():
        if not Epsilon ():
            return False
        stack.append (([stack.pop ()], -1, None, [None], False))
        return True

    def NonEmptyProd ():
        if not ActionSymbol ():
            return False
        sym, action = stack.pop ()
        rhs = [sym]
        actions = [action]
        prec = -1
        assoc = None
        subsym = False
        while ActionSymbol ():
            sym, action = stack.pop ()
            rhs.append (sym)
            actions.append (action)
        if PrecDecl ():
            prec = stack.pop ()
        elif TempAssocDecl ():
            assoc = stack.pop ()
        elif SubParse ():
            subsym = stack.pop ()
        stack.append ((rhs, prec, assoc, actions, subsym))
        return True

    def ActionSymbol ():
        cp = checkpoint ()
        action = None

        if Action ():
            action = stack.pop ()
        if not Symbol ():
            restore (cp)
            return False

        stack.append ((stack.pop (), action))
        return True

    def PrecDecl ():
        if not lexer.token ('%dprec'):
            return False
        if not Number ():
            error ('"dprec" decls require a numeric precedence')
        return True

    def TempAssocDecl ():
        if not lexer.token ('%prec'):
            return False
        if not Terminal ():
            error ('"prec" decls require a terminal')
        return True

    def SubParse ():
        if not lexer.token ('%subparse'):
            return False
        stack.append (True)
        return True

    def Symbol ():
        return Terminal () or Nonterminal ()

    def Terminal ():
        return String () or Regex ()

    def Nonterminal ():
        match = lexer.token (r'[a-zA-Z][a-zA-Z0-9_]*')
        if not match:
            return False
        stack.append (match)
        return True

    def String ():
        match = lexer.token (r'\'.*?\'')
        if not match:
            return False
        stack.append (re.compile (re.escape (match[1:-1])))
        return True

    def Regex ():
        match = lexer.token (r'/ (?: \\\\ | \\/ | [^/])* /')
        if not match:
            return False
        try:
            stack.append (re.compile (match[1:-1]))
        except:
            error ('invalid regular expression')
        return True

    def Epsilon ():
        if not lexer.token ('_'):
            return False
        stack.append (grammar.Grammar.EPSILON)
        return True

    def PyModuleName ():
        match = lexer.token (
            r'[a-zA-Z_][a-zA-Z0-9_]* (?: \. [a-zA-Z_][a-zA-Z0-9_]*)*')
        if not match:
            return False
        stack.append (match)
        return True

    def Action ():
        match = lexer.token (r'%\{ (?: . | [\n\r])*? %\}')
        if not match:
            return False
        stack.append (match[2:-2])
        return True

    def Number ():
        match = lexer.token (r'[0-9]+')
        if not match:
            return False
        try:
            stack.append (int (match))
        except:
            error ('number too large')
        return True

    # And finally, build and return a Grammar
    if not G ():  error ('invalid grammar')
    return g