# one for the rest of possessives # one for punctuation r"[-\w]+|'\w+|[^-\w\s]+", tokenstring, re.U # Use unicode classes, otherwise we would split # "são jaques" into ["s", "ão","jaques"] ) grammar=parse_grammar(""" S -> NP VP NP -> Det Nom | Nom | NP PP Det -> NP "'s" Nom -> N SRel | N VP -> Vi | Vt NP | VP PP PP -> Prep NP SRel -> Relpro VP Det -> 'a' | 'the' N -> 'fish' | 'frogs' | 'soup' | 'children' | 'books' Prep -> 'in' | 'for' Vt -> 'saw' | 'ate' | 'read' Vi -> 'fish' | 'swim' Relpro -> 'that' """) # Use this grammar for the rest of the assignment grammar2=parse_grammar([ "S -> Sdecl '.' | Simp '.' | Sq '?' ", "Sdecl -> NP VP", "Simp -> VP", "Sq -> Sqyn | Swhadv", "Sqyn -> Mod Sdecl | Aux Sdecl",
def tokenise(tokenstring):
    '''Split a string into a list of tokens, treating punctuation as
    separate tokens, and splitting contractions into their parts.

    So for example "I'm leaving." --> ["I","'m","leaving","."]

    :param tokenstring: the raw text to split
    :return: list of token strings, in order of appearance
    '''
    # Three sub-patterns, tried left to right:
    #   [-\w]+      words (and the first half of possessives)
    #   '\w+        contractions / the rest of possessives ("'m", "'s")
    #   [^-\w\s]+   anything else, i.e. runs of punctuation
    # re.U makes \w match unicode word characters; otherwise we would
    # split "são jaques" into ["s", "ão", "jaques"].  The previous
    # ASCII-only pattern also silently DROPPED non-matching characters
    # (digits, uppercase contractions, punctuation outside its class).
    return re.findall(r"[-\w]+|'\w+|[^-\w\s]+", tokenstring, re.U)

grammar = parse_grammar("""
S -> NP VP
NP -> Det Nom | Nom | NP PP
Det -> NP "'s"
Nom -> N SRel | N
VP -> Vi | Vt NP | VP PP
PP -> Prep NP
SRel -> Relpro VP
Det -> 'a' | 'the'
N -> 'fish' | 'frogs' | 'soup' | 'children' | 'books'
Prep -> 'in' | 'for'
Vt -> 'saw' | 'ate' | 'read'
Vi -> 'fish' | 'swim'
Relpro -> 'that'
""")

# print() with a single argument behaves the same on Python 2 and 3,
# and is consistent with the print(...) calls elsewhere in this file.
print(grammar)

chart = CKY(grammar)
# tokenise is fixed now, so use it instead of str.split as the
# original TODO comment requested ("the frogs swim" tokenises the
# same either way).
chart.parse(tokenise("the frogs swim"))
chart.pprint()

# Use this grammar for the rest of the assignment
# You will need three sub-patterns: # one for words and the first half of possessives # one for the rest of possessives # one for punctuation r'\b[a-zA-Z]+|\'?[a-zA-Z]+|[^ ]+', tokenstring) grammar = parse_grammar(""" S -> NP VP NP -> Det Nom | Nom | NP PP Det -> NP "'s" Nom -> N SRel | N VP -> Vi | Vt NP | VP PP PP -> Prep NP SRel -> Relpro VP Det -> 'a' | 'the' N -> 'fish' | 'frogs' | 'soup' | 'children' | 'books' Prep -> 'in' | 'for' Vt -> 'saw' | 'ate' | 'read' Vi -> 'fish' | 'swim' Relpro -> 'that' """) print(grammar) chart = CKY(grammar) chart.recognise("the frogs swim".split()) # Should use # tokenise(s) once that's fixed chart.pprint() # Q1: Uncomment this once you've completed Q1
mods = dict( zip(modNames[:len(modFilenames)], [importlib.import_module(n) for n in modFilenames])) except (ModuleNotFoundError, ImportError) as e: print("Filenames must be importable: %s" % e, file=sys.stderr) exit(2) from cfg_fix import parse_grammar, Tree grammar = parse_grammar(""" S -> NP VP NP -> Det Nom | Nom | NP PP Det -> NP "'s" Nom -> N SRel | N VP -> Vi | Vt NP | VP PP PP -> Prep NP SRel -> Relpro VP Det -> 'a' | 'the' N -> 'fish' | 'frogs' | 'soup' | 'children' | 'books' Prep -> 'in' | 'for' Vt -> 'saw' | 'ate' | 'read' Vi -> 'fish' | 'swim' Relpro -> 'that' """) #' chart = mods['cky'].CKY(grammar) def callCount(modName, methodName): # attempt to count calls to this method pat = re.compile('[^#]*\.' + methodName + '\(') return sum(1 for l in inspect.getsourcelines(mods[modName])[0] if pat.match(l))