def execute(text: str):
    """Parse *text* with the classic "groucho" ambiguity grammar and print
    every parse tree.

    :param text: sentence to tokenize and parse; its words must be covered
        by the grammar's lexicon (e.g. "I shot an elephant in my pajamas").
    """
    # Fixed typo ("grammer") and made the function actually use its
    # argument: the original tokenized a global SAMPLE_3 and then parsed an
    # unrelated hard-coded token list that this grammar cannot derive.
    groucho_grammar = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = ChartParser(groucho_grammar)
    tokens = word_tokenize(text)
    print(type(tokens))
    print(tokens)
    for tree in parser.parse(tokens):
        print(tree)
def generate_phrase(self, pool):
    """Generate a phrase from the grammar, filling the adj/n/det slots with
    words drawn from *pool*.

    :param pool: word pool exposing ``adjectives`` and ``comparisons``.
    :return: list of Word objects, or None if any step fails.
    """
    try:
        adj = choice(list(pool.adjectives))
        parser = ChartParser(self.grammar)
        gr = parser.grammar()
        phrase = self.produce(gr, gr.start())
        noun = choice(list(pool.comparisons[adj]))
        # Plural nouns take "the"; singular ones take their own article.
        if en.noun.plural(noun.name) == noun.name:
            article = "the"
        else:
            article = en.noun.article(noun.name).split(" ")[0]
        replace_words = {'adj': adj, 'n': noun, 'det': article}
        for pos in replace_words:
            while pos in phrase:
                try:
                    phrase = self.replace_pos(pos, replace_words[pos], phrase)
                except Exception:
                    # Narrowed from a bare except, which also swallowed
                    # KeyboardInterrupt/SystemExit.
                    return
        # Wrap remaining plain strings by position: index() would hit the
        # first duplicate, not necessarily the current slot.
        for i, w in enumerate(phrase):
            if not isinstance(w, Word):
                phrase[i] = Word(w)
        return phrase
    except Exception:
        # Best-effort generation: empty pools / missing keys yield None.
        return
def context_free_grammar():
    """Build a toy CFG, then parse and print every sentence in the
    module-level ``text`` (one sentence per line)."""
    cfg = CFG.fromstring("""\
    ################# Rules #################
    S -> NP VP
    S -> PP NP VP
    S -> Wh Aux NP VP
    NP -> ProperNoun | CC ProperNoun | N | ProperNoun NP | AP N | DET NP | N PP
    VP -> V | V NP | Adv VP | V NP VP
    AP -> Adj | Adj AP
    PP -> P NP | P NP VP
    ################# Lexicons #################
    N -> 'milk'| 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
    V -> 'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear'
    ProperNoun -> 'Bart' | 'Homer' | 'Lisa'
    Aux -> 'do' | 'does'
    CC -> 'and'
    Adj -> 'blue' | 'healthy' | 'green'
    DET -> 'a' | 'the'
    Adv -> 'always' | 'never'
    P -> 'in' | 'before' | 'on' | 'when'
    Wh -> 'when'
    """)
    cfparser = ChartParser(cfg)
    for sentence in text.splitlines():
        trees = cfparser.parse(sentence.split())
        print(sentence)
        for tree in trees:
            print(tree)
def parse_original_sentences(grammar):
    '''
    Uses given grammar to parse sentences from the file corpus.txt
    Writes the parse trees of each sentence in parsed_corpus.txt

    :param grammar: A context free grammar in the form of nltk.grammar.CFG
    :return: None (Output in parsed_corpus.txt)
    '''
    parser = ChartParser(grammar)
    # "with" guarantees both files are closed even if parsing raises;
    # the original leaked both handles on error.
    with open("corpus.txt", "r") as f_in, \
            open("parsed_corpus.txt", "w") as f_out:
        for count, line in enumerate(f_in, start=1):
            line = line.replace("didnt", "did not")
            s = "Tree {}:\n".format(count)
            # line[:-2] drops the trailing character + newline — presumably
            # the corpus lines end with punctuation; TODO confirm.
            sent = word_tokenize(line[:-2])
            for tree in parser.parse(sent):
                s += str(tree) + "\n\n"
                break  # only the first parse is written
            f_out.write(s)
    print(
        "Parsed form of original corpus sentences using this CFG can be found in parsed_corpus.txt\n"
    )
def generate_phrase(self, pool):
    """Generate a phrase from the grammar and fill each part-of-speech
    placeholder with a randomly chosen word from *pool*.

    :param pool: word pool exposing ``nouns``, ``verbs``, ``epithets`` and
        the emotional word lists.
    :return: list of Word objects, or None if any lookup/replacement fails.
    """
    parser = ChartParser(self.grammar)
    gr = parser.grammar()
    phrase = self.produce(gr, gr.start())
    noun = choice(list(pool.nouns))
    try:
        # Candidate replacements for every placeholder the grammar emits.
        replace_words = {
            'n': [noun],
            'v': [Word(self.conjugate(v.name)) for v in list(pool.verbs[noun])],
            'adj': pool.epithets[noun],
            'atv': [Word(self.conjugate(v, self.person)) for v in self.atv],
            'eva': [Word(self.conjugate(v, self.person)) for v in self.eva],
            'ej': pool.emotional_adjectives,
            'en': pool.emotional_nouns,
            'erb': pool.emotional_adverbs,
            'person': [Word(self.persons[self.person][0])],
            'pron': [Word(self.persons[self.person][1])]
        }
    except Exception:
        # The pool may have no entry for this noun; abort quietly.
        # (Narrowed from a bare except.)
        return
    for pos in replace_words:
        while pos in phrase:
            try:
                word = choice(replace_words[pos])
                phrase = self.replace_pos(pos, word, phrase)
            except Exception:
                return
    # Wrap remaining plain strings by position: index() would match the
    # first duplicate, not necessarily the current slot.
    for i, w in enumerate(phrase):
        if not isinstance(w, Word):
            phrase[i] = Word(w)
    return phrase
def recognizes(cfg, word):
    """
    cfg : a nltk.grammar.CFG instance
    word : a string with tokens separated with spaces.

    A parser is created at every call of this function.
    """
    tokens = word.split()
    parser = ChartParser(cfg)
    return _recognizes(parser, tokens)
def __init__(self, grammar):
    """Build the oracle's chart parser from the given grammar.

    :type grammar: CFG
    :param grammar: The grammar for this oracle
    """
    self._parser = ChartParser(grammar)
def parse_sentences(grammar):
    """Interactively read sentences, parse each one with *grammar* and print
    the resulting trees, until the user enters "Q"."""
    parser = ChartParser(grammar)
    while True:
        sent = input("Parse a sentence (Q to quit): ")
        if sent == "Q":
            break
        print_trees(parser.parse(word_tokenize(sent)))
def generate_name(G):
    """Generate a title-cased name by randomly expanding the CFG given as
    the grammar string *G*."""
    gr = ChartParser(CFG.fromstring(G)).grammar()
    tokens = produce(gr, gr.start())
    return ''.join(tokens).title()
def accepted_length(cfg, x):
    """
    Returns a list of every accepted word of a context-free grammar with a
    specific length
    """
    parser = ChartParser(cfg)
    terminals = _get_terminal_symbols(cfg)
    return [
        ' '.join(candidate)
        for candidate in product(terminals, repeat=x)
        if _recognizes(parser, candidate)
    ]
def generate_impacts_question(attr, impacts, phase):
    """Build a question dict about the impact attribute *attr* for the given
    *phase*, generating its text from the impacts grammar."""
    impact = get_attribute_name(attr, impacts)
    gr = ChartParser(generate_impacts_grammar(impact, phase)).grammar()
    text = ' '.join(produce(gr, gr.start()))
    return {
        'text': text,
        'answer': 0,
        'questionId': 0,
        'attrId': attr,
        'topicId': 4,
    }
def generate_entities_question(attr, entities, phase):
    """Build a question dict about the entity attribute *attr* for the given
    *phase*, generating its text from the entities grammar."""
    entity = get_attribute_name(attr, entities)
    gr = ChartParser(generate_entities_grammar(entity, phase)).grammar()
    text = ' '.join(produce(gr, gr.start()))
    return {
        'text': text,
        'answer': 0,
        'questionId': 0,
        'attrId': attr,
        'topicId': 3,
    }
def recognizesAll(cfg, words):
    """
    Returns a list of boolean values corresponding to
    [recognizes(cfg,w) for w in words].
    cfg : a nltk.grammar.CFG instance
    words must be a list of string with tokens separated with spaces.
    """
    parser = ChartParser(cfg)
    return [_recognizes(parser, w.split()) for w in words]
def get_productions(sentence, grammar):
    """Parse *sentence* with the CFG given by the string *grammar* and
    return the productions of its first parse tree.

    :raises IndexError: if the grammar cannot parse the sentence.
    """
    sent = sentence.split(' ')
    print(sent)
    cfgGrammar = CFG.fromstring(grammar)
    parser = ChartParser(cfgGrammar)
    # Only the first parse is ever used, so stop after one tree instead of
    # materialising every parse (ambiguous grammars can have very many).
    trees = []
    for tree in parser.parse(sent):
        trees.append(str(tree).replace("\n", " "))
        break
    t = Tree.fromstring(trees[0])
    return t.productions()
def accepted_under(cfg, length):
    """
    Returns a list of every accepted word of a context-free grammar under a
    given length.
    cfg : a nltk.grammar.CFG instance.
    """
    parser = ChartParser(cfg)
    terminals = _get_terminal_symbols(cfg)
    accepted = []
    for size in range(1, length):
        accepted.extend(
            ' '.join(candidate)
            for candidate in product(terminals, repeat=size)
            if _recognizes(parser, candidate)
        )
    return accepted
def generate_sources_question(attr, parent_attr, sources, phase):
    """Build a question dict about the source attribute *attr* (optionally
    qualified by *parent_attr*) for the given *phase*."""
    attr_id = attr  # renamed from ``id``, which shadowed the builtin
    attribute = get_attribute_name(attr, sources)
    attribute = analyze_numerus(attribute)
    if parent_attr is not None:
        parent_attr = get_attribute_name(parent_attr, sources)
    parser = ChartParser(
        generate_sources_grammar(attribute, parent_attr, phase))
    gr = parser.grammar()
    return {
        'text': ' '.join(produce(gr, gr.start())),
        'answer': 0,
        'questionId': 0,
        'attrId': attr_id,
        'topicId': 1,
    }
def main():
    """Interactive driver: print parse trees for the built-in test
    sentences, then repeatedly parse user input (a sentence, or the index of
    one of the sentences above) until the user enters "q"."""
    cfparser = ChartParser(cfg)
    for index, sent in enumerate(text, start=1):
        print_tree(sent, cfparser, index)
    # Typo fixed ("sentece" -> "sentence") in the user-facing prompt.
    prompt = "Input testing sentence or the number of the above one: (q to quit)"
    print(prompt)
    # Renamed from ``str``, which shadowed the builtin.
    user_input = sys.stdin.readline().strip()
    while user_input != "q":
        try:
            index = int(user_input)
            print_tree(text[index], cfparser, index)
        except IndexError:
            print("Index out of range. Please check.")
        except ValueError:
            # Not a number: treat the input as a sentence to parse.
            print_tree(user_input, cfparser, -1)
        print(prompt)
        user_input = sys.stdin.readline().strip()
def parse_blazon(blazon):
    """Tokenize, sanitise and parse a heraldic blazon description, returning
    a JSON-serialisable dict (or None when the parse fails)."""
    # Strip punctuation except '&', which is mapped to the word "and" below.
    discard = set(string.punctuation)
    discard.remove("&")
    cleaned = ''.join(c for c in blazon.lower() if c not in discard)

    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(cleaned)
    # Expand '1st', '2nd', ... to their full forms; '&' becomes "and".
    for index, item in enumerate(tokens):
        if item in abbr_to_full:
            tokens[index] = abbr_to_full[item]
        elif item == "&":
            tokens[index] = "and"

    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)

    # Construct grammar and parser
    with open('app/parser_cfg.txt') as f:
        parser = ChartParser(CFG.fromstring(f.read()))

    # Parse data into a tree; as before, the LAST parse produced wins.
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree

    if output_data is None:
        print("Error: Parse failed, please check input is of correct format.")
    else:
        # Convert Tree to dict to prepare it for JSON serialisation
        output_data = tree_to_dict(output_data)
        # A top-level tincture is renamed to "field".
        if "tincture" in output_data.keys():
            output_data["field"] = output_data["tincture"]
            output_data.pop("tincture")
    return output_data
def verify(self, grammar, tags):
    """ Verify tag sequence as grammatically correct or not """
    parser = ChartParser(grammar)
    try:
        # A single parse tree is enough to call the sequence valid.
        valid = any(True for _ in parser.parse(tags))
    except ValueError:
        print("This is a grammatical structure I don't understand yet.")
        return
    if valid:
        print("Valid")
        return True
    print("Invalid")
    return False
def generate_phrase(self):
    """Generate a phrase from the grammar, filling the adj/n/det slots from
    the blackboard's comparison pool.

    :return: list of Word objects, or None if a replacement fails.
    """
    # Only adjectives that actually have comparison nouns are eligible.
    adj = choice([a for a in self.blackboard.pool.comparisons
                  if len(self.blackboard.pool.comparisons[a]) > 0])
    parser = ChartParser(self.grammar)
    gr = parser.grammar()
    phrase = self.produce(gr, gr.start())
    noun = choice(list(self.blackboard.pool.comparisons[adj]))
    noun.name = en.singularize(noun.name)
    # First token of en.referenced(...) is the article ("a"/"an"/"the").
    article = en.referenced(noun.name).split(" ")[0]
    replace_words = {'adj': adj, 'n': noun, 'det': article}
    for pos in replace_words:
        while pos in phrase:
            try:
                phrase = self.replace_pos(pos, replace_words[pos], phrase)
            except Exception:
                # Narrowed from a bare except.
                return
    # Wrap remaining plain strings by position: index() would hit the first
    # duplicate, not necessarily the current slot.
    for i, w in enumerate(phrase):
        if not isinstance(w, Word):
            phrase[i] = Word(w)
    return phrase
def generate_phrase(self, pool):
    """Generate a question phrase (terminated by "?") from the grammar,
    filling the adj/n/be slots with words from *pool*.

    :param pool: word pool exposing ``nouns`` and ``epithets``.
    :return: list of Word objects, or None if generation fails.
    """
    noun = random.choice(list(pool.nouns))
    parser = ChartParser(self.grammar)
    gr = parser.grammar()
    phrase = self.produce(gr, gr.start())
    phrase.append("?")
    try:
        adj = choice(pool.epithets[noun])
    except Exception:
        # The noun may have no epithets; abort quietly.
        # (Narrowed from a bare except.)
        return
    replace_words = {'adj': adj, 'n': noun, 'be': self.conjugate("be")}
    for pos in replace_words:
        while pos in phrase:
            try:
                phrase = self.replace_pos(pos, replace_words[pos], phrase)
            except Exception:
                return
    # Wrap remaining plain strings by position: index() finds the first
    # duplicate, not necessarily the current slot.
    for i, w in enumerate(phrase):
        if not isinstance(w, Word):
            phrase[i] = Word(w)
    return phrase
def parse_sentences(grammar, sent):
    """Tokenize *sent* and return the iterator of its parse trees under
    *grammar*."""
    return ChartParser(grammar).parse(word_tokenize(sent))
Nominal -> NOUN | Nominal PP | ADJ Nominal | Nominal NOUN PP -> Prep NP AdvC -> CONJ S ProperNoun -> 'Bart' | 'Homer' | 'Lisa' CONJ -> 'and' | 'when' ADV -> 'always' | 'never' V -> 'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear' DET -> 'a' | 'the' NOUN -> 'milk' | 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table' ADJ -> 'blue' | 'healthy' | 'green' Prep -> 'in' | 'before' | 'on' WH -> 'when' Aux -> 'do' | 'does' """) cfparser = ChartParser(cfg) text = """ Bart laughs Homer laughed Bart and Lisa drink milk Bart wears blue shoes Lisa serves Bart a healthy green salad Homer serves Lisa Bart always drinks milk Lisa thinks Homer thinks Bart drinks milk Homer never drinks milk in the kitchen before midnight when Homer drinks milk Bart laughs when does Lisa drinks the milk on the table when do Lisa and Bart wear shoes """
NN -> 'milk' | 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table' Adjective -> 'blue' | 'healthy' | 'green' Determinant -> 'a' | 'the' Adverb -> 'always' | 'never' | 'before' | 'when' Preposition -> 'in' | 'on' """) #NP -> ProperNoun #ProperNoun -> 'Homer' | 'Bart' #VP -> V #V -> 'laughs' | 'laughed' | # Produce Trees for Step 2 # Init Parser cf_parser = ChartParser(cf_grammar) # Init Sentences to test correct_grammar_sents = """\ Bart laughs Homer laughed Bart and Lisa drink milk Bart wears blue shoes Lisa serves Bart a healthy green salad Homer serves Lisa Bart always drinks milk Lisa thinks Homer thinks Bart drinks milk Homer never drinks milk in the kitchen before midnight when Homer drinks milk Bart laughs when does Lisa drink the milk on the table when do Lisa and Bart wear shoes
import nltk
from nltk import ChartParser

# Load grammar.
grammar = nltk.data.load('labelgrammar.cfg')
parser = ChartParser(grammar)


def analyze_label(label):
    """Analyze a label using our CFG.

    Returns the list of parse trees, or None when no analysis is possible.
    """
    tokenized_label = label.split()
    try:
        # parse() returns a lazy iterator; materialise it once so we can
        # both print the trees and hand back something still usable.
        # (The original returned the iterator AFTER list() had exhausted
        # it, so callers always saw an empty result.)
        trees = list(parser.parse(tokenized_label))
        for tree in trees:
            print(tree)
        if len(trees) > 0:
            return trees
        print('No analysis possible')
        return None
    except ValueError as e:
        # ValueError has no .strerror (that is an OSError attribute, and
        # accessing it raised AttributeError here); print the message.
        print('No analysis possible:', e)
        return None
from nltk import data, ChartParser from nltk import pos_tag from nltk.corpus import inaugural data.clear_cache() G = data.load("file:mygrammar.cfg") RDP = ChartParser(G) # extract_short_sents :: Int?, Int?, Corpus?-> [[(String, String)]] def extract_short_sents(num=8, max_len=8, corpus=inaugural): li = [] num = num if num < len(corpus.fileids()) else len(corpus.fileids()) for i in range(num): for sent in corpus.sents(corpus.fileids()[i]): if len(sent) <= max_len: li.append(pos_tag(sent)) if len(li) / 3.0 == i: break return li # parse :: String -> ParseTree def parse(s): return RDP.parse(s.split()) if __name__ == "__main__": sents = [
def __init__(self, grammar_string):
    """Compile *grammar_string* into a CFG and set up the matching chart
    parser and tokenizer."""
    self.grammar = CFG.fromstring(grammar_string)
    self.parser = ChartParser(self.grammar)
    self.tokenizer = self._get_tokenizer()
def main():
    """Read a blazon description from INPUT_FILE, parse it with the CFG in
    parser_cfg.txt and write the resulting tree as JSON to OUTPUT_FILE
    (default: trees/<input-name>.esc)."""
    # Check arguments
    if len(sys.argv) == 1:
        print("Too few arguments\nUsage: $ python generate.py <INPUT_FILE> [OUTPUT_FILE]")
        sys.exit(0)
    elif len(sys.argv) > 3:
        print("Too many arguments\nUsage: $ python generate.py <INPUT_FILE> [OUTPUT_FILE]")
        sys.exit(0)

    # Initialise paths
    WORKING_DIR = sys.path[0]
    INPUT_FILE = os.path.join(WORKING_DIR, sys.argv[1])
    if len(sys.argv) == 3:
        OUTPUT_FILE = os.path.join(WORKING_DIR, sys.argv[2])
    else:
        # Extract base filename of input file, strip its extension and add
        # our own (.esc for escutcheon).
        OUTPUT_NAME = os.path.basename(INPUT_FILE)
        OUTPUT_NAME = "trees/" + os.path.splitext(OUTPUT_NAME)[0] + ".esc"
        OUTPUT_FILE = os.path.join(WORKING_DIR, OUTPUT_NAME)

    # Read input; drop punctuation except '&' (mapped to "and" below).
    with open(INPUT_FILE) as f:
        raw_data = f.read().lower()
    to_discard = set(string.punctuation)
    to_discard.remove("&")
    raw_data = ''.join(c for c in raw_data if c not in to_discard)

    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(raw_data)
    # Expand '1st', '2nd', ... to their full forms; '&' becomes "and".
    for index, item in enumerate(tokens):
        if item in abbr_to_full:
            tokens[index] = abbr_to_full[item]
        elif item == "&":
            tokens[index] = "and"

    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)

    # Construct grammar and parser
    with open('parser_cfg.txt') as f:
        parser = ChartParser(CFG.fromstring(f.read()))

    # Parse data into a tree (the last parse produced wins, as before)
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree

    if output_data is None:
        # BUG FIX: the original fell through and wrote "null" to the output
        # file even when the parse failed; now we stop here instead.
        print("Error: Parse failed, please check input is of correct format.")
        return

    # Convert Tree to dict to prepare it for JSON serialisation
    output_data = tree_to_dict(output_data)
    # A top-level tincture is renamed to "field"
    if "tincture" in output_data.keys():
        output_data["field"] = output_data["tincture"]
        output_data.pop("tincture")

    # Convert dict to JSON
    with open(OUTPUT_FILE, 'w+') as f:
        json.dump(output_data, f, indent=2)
def make_sentence(corpus, term_rules, *args, **kwargs):
    '''
    Generate sentences with random structure and word choice using a
    context-free grammar. The start point is taken from the sentence itself.

    Parameters
    ----------
    corpus : str
        a string containing the full, cleaned corpus
    term_rules : str
        a string containing all the terminal rules for the corpus
    maxdepth : int
        The maximum allowed recursion depth before throwing a ValueError
    fixed_grammar : bool
        Turn off the random sentence selection and used a fixed grammar
        instead.
    sample_sentence : str
        When fixed_grammar is turned on, this is the sentence that will be
        parsed. This can be finicky with grammars containing specially
        punctuated constructions like quotations or positions
    args[0] : dict()
        Optional: a dictionary of kgrams and their subsequent words. If this
        variable exists then cfgen will use this to pick the next words with
        conditional weighting (the presence of this argument turns on Markov
        text generation features.)

    Raises
    ------
    RuntimeError
        If no well-formed sentence could be produced in 30 attempts
        (previously this path crashed with a NameError on out_txt).
    '''
    markov_flag = bool(args)
    if markov_flag:
        kgram_dict = args[0]
    fixed_grammar = kwargs.pop('fixed_grammar', False)
    sample_sentence = kwargs.pop('sample_sentence', '')
    maxdepth = kwargs.pop('maxdepth', 25)
    if fixed_grammar and sample_sentence == '':
        warnings.warn('When using fixed_grammar, user should specify '
                      'the keyword argument "sample_sentence." Using a default simple sentence.')
        sample_sentence = 'The cow jumped over the moon.'

    # Hoisted out of the retry loop: the tokenizer never changes between
    # attempts, and loading the pickle is expensive.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    out_txt = None
    flag = False
    attempts = 0
    while not flag and attempts < 30:
        if has_parser and not fixed_grammar:
            rsent = choice(tokenizer.tokenize(corpus))
        elif fixed_grammar:
            rsent = sample_sentence
        else:
            # No parser available: fall back to a pre-approved sentence.
            print("Usage library being built")
            rsent = "The dog walked up the stairs slowly."
        parsed_syntax = parse_sentence(rsent)
        cfg_str = term_rules + parsed_syntax
        try:
            # The start symbol is the LHS of the first parsed production.
            startpt = nltk.grammar.Nonterminal(
                parsed_syntax[:parsed_syntax.find(' ->')])
            grammar = CFG.fromstring(cfg_str)
            gr = ChartParser(grammar).grammar()
            if markov_flag:
                out_txt = ' '.join(
                    produce_kgram(gr, startpt, kgram_dict,
                                  maxdepth=maxdepth, sent=[]))
            else:
                out_txt = ' '.join(produce(gr, startpt, maxdepth=maxdepth))
            flag = True
        except ValueError:
            warnings.warn(
                'Badly formed sentence encountered, resampling the corpus.')
            attempts = attempts + 1

    if out_txt is None:
        raise RuntimeError('No well-formed sentence generated in 30 attempts.')

    # now re-tag special characters
    for old, new in zip(replacements, to_replace):
        out_txt = out_txt.replace(old, new)
    return out_txt
def make_sentence(self, do_markov=True, **kwargs):
    '''
    Generate sentences with random structure and word choice using a
    context-free grammar. The start point is taken from the sentence itself.

    Parameters
    ----------
    do_markov : bool
        Toggle the Markov word selection on or off
    maxdepth : int
        The maximum allowed recursion depth before throwing a ValueError
    fixed_grammar : bool
        Turn off the random sentence selection and used a fixed grammar
        instead.
    sample_sentence : str
        When fixed_grammar is turned on, this is the sentence that will be
        parsed. This can be finicky with grammars containing specially
        punctuated constructions like quotations or positions

    Raises
    ------
    RuntimeError
        If no well-formed sentence could be produced in 30 attempts
        (previously this path crashed with a NameError on out_txt).
    '''
    corpus = self.corpus
    term_rules = self.term_rules
    markov_flag = (self.order > 0) and do_markov
    fixed_grammar = kwargs.pop('fixed_grammar', False)
    sample_sentence = kwargs.pop('sample_sentence', '')
    maxdepth = kwargs.pop('maxdepth', 25)
    if fixed_grammar and sample_sentence == '':
        warnings.warn('When using fixed_grammar, user should specify '
                      'the keyword argument "sample_sentence." Using a default simple sentence.')
        sample_sentence = 'The cow jumped over the moon.'

    # Hoisted out of the retry loop: the tokenizer never changes between
    # attempts, and loading the pickle is expensive.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    out_txt = None
    flag = False
    attempts = 0
    while not flag and attempts < 30:
        if has_parser and not fixed_grammar:
            rsent = choice(tokenizer.tokenize(corpus))
        elif fixed_grammar:
            rsent = sample_sentence
        else:
            # No parser available: fall back to a pre-approved sentence.
            warnings.warn(
                "Usage library being built, falling back to simple sentence"
            )
            rsent = "The dog walked up the stairs slowly."
        parsed_syntax = self.parse_sentence(rsent)
        cfg_str = term_rules + parsed_syntax
        try:
            # The start symbol is the LHS of the first parsed production.
            startpt = nltk.grammar.Nonterminal(
                parsed_syntax[:parsed_syntax.find(' ->')])
            grammar = CFG.fromstring(cfg_str)
            gr = ChartParser(grammar).grammar()
            if markov_flag:
                out_txt = ' '.join(
                    self.produce_kgram(gr, startpt,
                                       maxdepth=maxdepth, sent=[]))
            else:
                out_txt = ' '.join(
                    self.produce(gr, startpt, maxdepth=maxdepth))
            flag = True
        except ValueError:
            warnings.warn(
                'Badly formed sentence encountered, resampling the corpus.'
            )
            attempts += 1

    if out_txt is None:
        raise RuntimeError('No well-formed sentence generated in 30 attempts.')

    # now re-tag special characters
    for old, new in zip(replacements, to_replace):
        out_txt = out_txt.replace(old, new)
    return out_txt