def context_free_grammar(text: str):
    cfg = CFG.fromstring("""\
    ################# Rules #################
    S -> NP VP
    S -> PP NP VP
    S -> Wh Aux NP VP
    NP -> ProperNoun | CC ProperNoun | N | ProperNoun NP | AP N | DET NP | N PP
    VP -> V | V NP | Adv VP | V NP VP
    AP -> Adj | Adj AP
    PP -> P NP | P NP VP
    ################# Lexicon #################
    N -> 'milk' | 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
    V -> 'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear'
    ProperNoun -> 'Bart' | 'Homer' | 'Lisa'
    Aux -> 'do' | 'does'
    CC -> 'and'
    Adj -> 'blue' | 'healthy' | 'green'
    DET -> 'a' | 'the'
    Adv -> 'always' | 'never'
    P -> 'in' | 'before' | 'on' | 'when'
    Wh -> 'when'
    """)
    cfparser = ChartParser(cfg)
    # Parse each line of the input text with the grammar above.
    sents = text.splitlines()
    for sent in sents:
        parses = cfparser.parse(sent.split())
        print(sent)
        for tree in parses:
            print(tree)
def execute(text: str):
    groucho_grammar = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = ChartParser(groucho_grammar)
    # Tokenize the given text (expected to be a Groucho-style sentence such as
    # "I shot an elephant in my pajamas") and parse it with the grammar above.
    tokens = word_tokenize(text)
    print(type(tokens))
    print(tokens)
    for tree in parser.parse(tokens):
        print(tree)
def parse_original_sentences(grammar):
    '''
    Uses the given grammar to parse the sentences in corpus.txt.
    Writes the parse tree of each sentence to parsed_corpus.txt.

    :param grammar: A context free grammar in the form of nltk.grammar.CFG
    :return: None (output in parsed_corpus.txt)
    '''
    parser = ChartParser(grammar)
    f = open("corpus.txt", "r")
    f_write = open("parsed_corpus.txt", "w")
    lines = f.readlines()
    count = 1
    working = []
    for line in lines:
        line = line.replace("didnt", "did not")
        s = "Tree {}:\n".format(count)
        sent = word_tokenize(line[:-2])
        # Keep only the first parse for each sentence.
        for tree in parser.parse(sent):
            s += str(tree) + "\n\n"
            working.append(count)
            break
        count += 1
        f_write.write(s)
    f.close()
    f_write.close()
    print(
        "Parsed form of original corpus sentences using this CFG can be found in parsed_corpus.txt\n"
    )
class GrammarOracle(Oracle):
    """
    An oracle from a grammar.
    """

    def __init__(self, grammar):
        """
        Initialize from a CFG.

        :type grammar: CFG
        :param grammar: The grammar for this oracle
        """
        self._parser = ChartParser(grammar)

    def generates(self, sentence):
        """
        Decides whether the grammar generates the sentence.

        :type sentence: Sentence
        :param sentence: A sentence
        :rtype: bool
        :return: Whether the grammar generates the sentence
        """
        try:
            parses = self._parser.parse(sentence.get_words())
            return list(parses) != []
        except Exception:
            return False
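# A minimal usage sketch of GrammarOracle. The Sentence type is only described
# by the docstring above, so SimpleSentence and the toy grammar here are
# illustrative stand-ins, not part of the original project.
from nltk import CFG


class SimpleSentence:
    """Hypothetical stand-in exposing the get_words() interface used above."""

    def __init__(self, text):
        self._words = text.split()

    def get_words(self):
        return self._words


toy_grammar = CFG.fromstring("""
S -> NP VP
NP -> 'Bart' | 'Lisa'
VP -> 'laughs' | 'sleeps'
""")

oracle = GrammarOracle(toy_grammar)
print(oracle.generates(SimpleSentence("Bart laughs")))   # True
print(oracle.generates(SimpleSentence("laughs Bart")))   # False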
def parse_sentences(grammar):
    parser = ChartParser(grammar)
    sent = input("Parse a sentence (Q to quit): ")
    while sent != "Q":
        tokens = word_tokenize(sent)
        trees = parser.parse(tokens)
        print_trees(trees)
        sent = input("Parse a sentence (Q to quit): ")
def generate_parse_tree(sentence, grammar):
    # Tokenize the sentence, then generate the parse trees.
    tokens = word_tokenize(sentence)
    parser = ChartParser(grammar)
    try:
        return parser.parse(tokens)
    except Exception:
        # The sentence cannot be parsed with the given grammar (for example,
        # it contains words the grammar does not cover), so return a
        # placeholder error tree instead.
        return Tree('Error', ['Error'])
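# A short usage sketch of generate_parse_tree. The toy grammar and sentences
# are illustrative; the Error tree is the fallback returned when parsing fails.
from nltk import CFG

toy_grammar = CFG.fromstring("""
S -> NP VP
NP -> 'Lisa' | 'Bart'
VP -> V NP
V -> 'sees'
""")

trees = generate_parse_tree("Lisa sees Bart", toy_grammar)
for tree in trees:          # an iterator of parses on success
    print(tree)

bad = generate_parse_tree("Bart sees Milhouse", toy_grammar)
print(bad)                  # Tree('Error', ['Error']): 'Milhouse' is not covered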
def get_productions(sentence, grammar):
    trees = []
    sent = sentence.split(' ')
    print(sent)
    cfgGrammar = CFG.fromstring(grammar)
    parser = ChartParser(cfgGrammar)
    for tree in parser.parse(sent):
        trees.append(str(tree).replace("\n", " "))
    # print(trees[0])
    t = Tree.fromstring(trees[0])
    return t.productions()
def parse_blazon(blazon):
    blazon = blazon.lower()
    to_discard = set(string.punctuation)
    to_discard.remove("&")
    blazon = ''.join(c for c in blazon if c not in to_discard)

    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(blazon)

    # Replace instances of '1st', '2nd', etc. with their non-abbreviated forms
    for (index, item) in enumerate(tokens):
        if item in abbr_to_full:
            tokens[index] = abbr_to_full[item]
        elif item == "&":
            tokens[index] = "and"

    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)

    # Construct grammar and parser
    with open('app/parser_cfg.txt') as f:
        raw_cfg = f.read()
    parser_grammar = CFG.fromstring(raw_cfg)
    parser = ChartParser(parser_grammar)

    # Parse data into a tree (keep the last parse found)
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree

    if output_data is None:
        print("Error: Parse failed, please check input is of correct format.")
    else:
        # Convert Tree to dict to prepare it for JSON serialisation
        output_data = tree_to_dict(output_data)
        # If a tincture is in the top level of the dictionary, change its name to "field"
        if "tincture" in output_data.keys():
            output_data["field"] = output_data["tincture"]
            output_data.pop("tincture")

    return output_data
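# tree_to_dict is defined elsewhere in this project. Purely as a hypothetical
# sketch of what such a Tree-to-dict conversion might look like (not the real
# implementation): each nonterminal label becomes a key, leaves become values.
from nltk import Tree


def tree_to_dict_sketch(tree):
    result = {}
    for child in tree:
        if isinstance(child, Tree):
            if len(child) == 1 and not isinstance(child[0], Tree):
                # Preterminal node: label -> word
                result[child.label()] = child[0]
            else:
                # Recurse into nested phrases
                result[child.label()] = tree_to_dict_sketch(child)
        else:
            result[tree.label()] = child
    return result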
def verify(self, grammar, tags):
    """ Verify whether the tag sequence is grammatically correct or not """
    # rd_parser = RecursiveDescentParser(grammar)
    rd_parser = ChartParser(grammar)
    valid = False
    try:
        for tree in rd_parser.parse(tags):
            valid = True
            break
    except ValueError:
        print("This is a grammatical structure I don't understand yet.")
        return
    if valid:
        print("Valid")
        return True
    else:
        print("Invalid")
        return False
import nltk
from nltk import ChartParser

# Load grammar.
grammar = nltk.data.load('../../Grammar/full_grammar.cfg')
parser = ChartParser(grammar)

with open('human_chunks.txt') as f:
    noun_chunks = [line.strip().split() for line in f]

not_covered = []
for chunk in noun_chunks:
    try:
        result = parser.parse(chunk)
        print(f"Valid: {chunk}")
    except ValueError:
        print(f"Not covered: {chunk}")
        chunk = ' '.join(chunk) + '\n'
        not_covered.append(chunk)

with open("not-covered.txt", 'w') as f:
    f.writelines(not_covered)

num_chunks = len(noun_chunks)
num_covered = len(noun_chunks) - len(not_covered)
num_not_covered = len(not_covered)

print(f"Number of unique noun chunks: {num_chunks}")
print(f"Covered: {num_covered} ({(num_covered / num_chunks) * 100}%)")
print(
    f"Not covered: {num_not_covered} ({(num_not_covered / num_chunks) * 100}%)")
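# The ValueError branch above works because ChartParser.parse first checks that
# every input token is covered by the grammar. The same check can be made
# explicitly with CFG.check_coverage; a minimal sketch with a toy grammar:
from nltk import CFG, ChartParser

toy = CFG.fromstring("""
NP -> Det N
Det -> 'the'
N -> 'dog'
""")

try:
    toy.check_coverage(['the', 'cat'])   # raises ValueError: 'cat' not covered
except ValueError as e:
    print(e)

print(list(ChartParser(toy).parse(['the', 'dog']))[0])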
Sp -> P
Sa -> 'tells' 'you' 'that' | 'says' | 'says' 'that' | 'claims' | 'claims' 'that' | 'tells you'
St -> PG Is Class | PG Quant Is Class |
Quant -> Comp Count
Comp -> 'exactly'
Count -> 'one'
Not -> 'neither' | 'nor'
PG -> 'i' | PG PG | Not P | P | 'of' PG | PG 'and' PG
P -> 'zoey' | 'mel' | 'peggy' | 'zippy' | 'sue' | 'sally' | 'homer' | 'bozo' | 'marge' | 'zed' | 'alice' | 'ted' | 'bart' | 'bob' | 'betty'
Is -> 'is' 'a' | 'are'
Class -> Kni | Kna
Kni -> 'knight' | 'knights'
Kna -> 'knave' | 'knaves'
""")


def preprocess(sent):
    # Keep only lowercase letters and spaces, then split into tokens.
    return "".join([letter for letter in sent.lower()
                    if letter in "qwertyuiopasdfghjklzxcvbnm "]).split()


sents = ["Zoey tells you that mel is a Knave",
         "Mel says, `Neither Zoey nor I are knaves.'",
         "Peggy tells you that 'of Zippy and I, exactly one is a knight'."]
sents = [preprocess(sent) for sent in sents]

parser = ChartParser(kk_grammar)
for sent in sents:
    for tree in parser.parse(sent):
        print(tree)
def main():
    # Check arguments
    if len(sys.argv) == 1:
        print("Too few arguments\nUsage: $ python generate.py <INPUT_FILE> [OUTPUT_FILE]")
        sys.exit(0)
    elif len(sys.argv) > 3:
        print("Too many arguments\nUsage: $ python generate.py <INPUT_FILE> [OUTPUT_FILE]")
        sys.exit(0)

    # Initialise paths
    WORKING_DIR = sys.path[0]
    INPUT_FILE = os.path.join(WORKING_DIR, sys.argv[1])
    if len(sys.argv) == 3:
        OUTPUT_FILE = os.path.join(WORKING_DIR, sys.argv[2])
    else:
        # Extract base filename of input file
        OUTPUT_NAME = os.path.basename(INPUT_FILE)
        # Strip off file extension and add our own (.esc for escutcheon)
        OUTPUT_NAME = "trees/" + os.path.splitext(OUTPUT_NAME)[0] + ".esc"
        OUTPUT_FILE = os.path.join(WORKING_DIR, OUTPUT_NAME)

    # Read in input data
    with open(INPUT_FILE) as f:
        raw_data = f.read().lower()
    to_discard = set(string.punctuation)
    to_discard.remove("&")
    raw_data = ''.join(c for c in raw_data if c not in to_discard)

    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(raw_data)

    # Replace instances of '1st', '2nd', etc. with their non-abbreviated forms
    for (index, item) in enumerate(tokens):
        if item in abbr_to_full:
            tokens[index] = abbr_to_full[item]
        elif item == "&":
            tokens[index] = "and"

    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)

    # Construct grammar and parser
    with open('parser_cfg.txt') as f:
        raw_cfg = f.read()
    parser_grammar = CFG.fromstring(raw_cfg)
    parser = ChartParser(parser_grammar)

    # Parse data into a tree (keep the last parse found)
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree

    if output_data is None:
        print("Error: Parse failed, please check input is of correct format.")
    else:
        # Convert Tree to dict to prepare it for JSON serialisation
        output_data = tree_to_dict(output_data)
        # If a tincture is in the top level of the dictionary, change its name to "field"
        if "tincture" in output_data.keys():
            output_data["field"] = output_data["tincture"]
            output_data.pop("tincture")
        # Write the dict out as JSON
        with open(OUTPUT_FILE, 'w+') as f:
            json.dump(output_data, f, indent=2)
class GDev:

    # 1. We will create a grammar development tool.
    # Define a class called GDev. The __init__ method should take a
    # name (a string) as input, and store it in the member name.
    def __init__(self, name):
        self.name = name

    # 2. Define a method called load_grammar. It takes no arguments.
    # It expects the file name.cfg to exist, where name is the GDev name.
    # It loads a grammar from the file and stores it in the member grammar.
    def load_grammar(self):
        s = open(self.name + '.cfg').read()
        self.grammar = CFG.fromstring(s)

    # 3. Define a method called reload. It should call the method
    # load_grammar, even if the grammar has already been loaded before.
    # Then it should create a chart parser from the loaded grammar, and
    # store the parser in the member parser.
    def reload(self):
        self.load_sents()
        self.load_grammar()
        self.parser = ChartParser(self.grammar)

    # 4. Define a method called parse. It should take one argument, a string.
    # It should call word_tokenize on the sentence, and pass the result to
    # the parser. The parse method should return a single tree. If the parser
    # returns more than one tree, then parse should return just the first one.
    # If the parser does not return any trees, then parse should return None.
    def parse(self, s):
        try:
            return list(self.parser.parse(word_tokenize(s)))[0]
        except (IndexError, ValueError):
            return None

    # 5. Define a method called load_sents. It takes no arguments. It expects
    # the file name.sents to exist. The file should contain one sentence per
    # line. Each sentence is either good or bad: good sentences are ones that
    # the grammar ought to generate, and bad sentences are ones that the
    # grammar should not generate. If the first character on the line is '*',
    # the sentence is bad, and otherwise it is good. The load_sents method
    # should produce a list of pairs (good, s) where good is True for good
    # sentences and False for bad ones, and s is the sentence itself (not
    # including the '*'). The list of pairs should be stored in the member
    # sents. Create a file g1.sents containing the sentences Bob warbled, the
    # dog ate my telescope, and *Bob cat.
    def load_sents(self):
        self.sents = [(True, line.rstrip('\r\n'))
                      if line[0] != '*'
                      else (False, line.rstrip('\r\n')[1:])
                      for line in open(self.name + '.sents')]
        # print(self.sents)

    # 6. Define a method called parses. It should take no arguments.
    # It should iterate through the pairs (g, s) in sents, and it should
    # call parse on each sentence s in turn. For each sentence, it should
    # print an empty line, then the sentence, then the result of calling parse.
    def parses(self):
        for s in self.sents:
            print('\n' + s[1])
            print(self.parse(s[1]))

    # 7. Write a method called regress that takes no arguments. It should go
    # through the pairs (good, s) in sents. For each, it should call parse on s.
    # Define the prediction to be True if parse returns a tree, and False otherwise.
    # If the prediction equals good, then the prediction is correct, and otherwise
    # the prediction is wrong. For each pair, print out one line of output. The output
    # line should start with '!!' if the prediction is wrong and '  ' (two spaces) if
    # it is correct. Then print out a space. Then print '*' if good is False, and a
    # space if good is True. The output line ends with the sentence s.
    def regress(self):
        for s in self.sents:
            prediction = self.parse(s[1]) is not None
            if prediction != s[0]:
                print('!!' + ' ', end='')
            else:
                print('  ' + ' ', end='')
            if s[0] is False:
                print('*', end='')
            else:
                print(' ', end='')
            print(s[1])

    # 8. Finally, the __call__ method should simply call reload and regress.
    # The idea is to use the set of example sentences to drive grammar development.
    # One adds sentences, calls gd() to see which ones are being handled correctly
    # or not, and then one edits the grammar to fix the prediction errors. After
    # each file edit, one needs merely call gd() to see the revised grammar's
    # predictions on the sentences. (Making sure that new revisions do not break
    # things that previously worked correctly is known as regression testing.)
    def __call__(self):
        self.reload()
        self.regress()
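# A short usage sketch of the workflow described in comment 8, assuming that
# g1.cfg (the grammar) and g1.sents (starred/unstarred test sentences) exist
# in the working directory.
gd = GDev('g1')
gd()   # reloads g1.cfg and g1.sents, then prints one regression line per sentence
# After editing g1.cfg, simply call gd() again to re-run the regression test.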
class ContextFreeGrammarProcessor:

    def __init__(self, grammar_string):
        self.grammar = CFG.fromstring(grammar_string)
        self.parser = ChartParser(self.grammar)
        self.tokenizer = self._get_tokenizer()

    @property
    def start_index(self):
        return self.grammar.start()

    @property
    def last_index(self):
        return self.grammar.productions()[-1].lhs()

    @property
    def total_productions(self):
        return len(self.grammar.productions())

    @property
    def lhs(self):
        return list(expression.lhs() for expression in self.grammar.productions())

    @property
    def unique_lhs(self):
        return list(dict.fromkeys(self.lhs))

    @property
    def unique_lhs_dictionary(self):
        return {left_rule: idx for idx, left_rule in enumerate(self.unique_lhs)}

    @property
    def production_dictionary(self):
        return {production: idx for idx, production in enumerate(self.grammar.productions())}

    def get_masks(self):
        mask = np.zeros((len(self.unique_lhs), self.total_productions))
        for idx, symbol in enumerate(self.unique_lhs):
            mask[idx] = np.array([symbol == symbol_lhs for symbol_lhs in self.lhs], dtype=int)
        return mask

    def get_masks_idx(self):
        temp_mask = self.get_masks()
        res = [np.where(temp_mask[:, idx] == 1)[0][0] for idx in range(self.total_productions)]
        return np.array(res)

    def _get_tokenizer(self):
        # TODO: clean up the function and improve the logic
        long_tokens = list(filter(lambda symbol: len(symbol) > 1,
                                  self.grammar._lexical_index.keys()))
        replacements = ['$', '%', '^']  # , '&']
        assert len(long_tokens) == len(replacements)
        # for token in replacements:
        #     assert token not in self.grammar._lexical_index

        def tokenize(smiles):
            # Temporarily map multi-character terminals onto single placeholder
            # characters so the string can be split character by character.
            for idx, token in enumerate(long_tokens):
                smiles = smiles.replace(token, replacements[idx])
            tokens = []
            for token in smiles:
                try:
                    ix = replacements.index(token)
                    tokens.append(long_tokens[ix])
                except ValueError:
                    tokens.append(token)
            return tokens

        return tokenize

    def smile_to_production_seq(self, smile):
        production_seq = self.parser.parse(self.tokenizer(smile)).__next__().productions()
        return production_seq

    def to_one_hot(self, smile, max_depth=277):
        """
        Args:
            smile: str
                Molecule represented in the SMILES grammar
            max_depth: int
                Maximum number of productions used for composition of the SMILES string
        """
        smile_to_prod_idx = [self.production_dictionary[production]
                             for production in self.smile_to_production_seq(smile)]
        len_production_seq = len(smile_to_prod_idx)
        one_hot = np.zeros((max_depth, self.total_productions))
        one_hot[np.arange(len_production_seq), smile_to_prod_idx] = 1.
        one_hot[np.arange(len_production_seq, max_depth), -1] = 1.
        return one_hot

    def sample_using_masks(self, logit_matrix):
        """
        Implements Algorithm 1 from the GrammarVAE paper:
        https://arxiv.org/abs/1703.01925

        Args:
            logit_matrix: np.array
        """
        # Masks for selecting valid production rules.
        masks = self.get_masks()
        stack = list()
        # Initiate the stack with the start symbol (e.g. [smiles] for the SMILES CFG).
        stack.append(self.start_index)
        res = np.zeros_like(logit_matrix)
        eps = 1e-100
        idx = 0

        def pop_from_stack(stack_):
            try:
                res_ = stack_.pop()
            except IndexError:
                # The stack is empty, return the 'end' production rule: Nothing -> None.
                res_ = self.last_index
            return res_

        while idx < logit_matrix.shape[0]:
            # 1. Given the (continuous) logit vector, select a valid production rule.
            # Pop the last pushed non-terminal from the stack.
            key = pop_from_stack(stack)
            next_nonterminal = [self.unique_lhs_dictionary[key]]
            # Select the mask for the last non-terminal rule.
            mask = masks[next_nonterminal]
            # Mask the logit vector so that only valid right-hand sides can be sampled.
            masked_output = np.exp(logit_matrix[idx, :]) * mask + eps
            # Given the last non-terminal rule, sample a new valid production rule.
            sampled_output = np.argmax(np.random.gumbel(size=masked_output.shape)
                                       + np.log(masked_output), axis=-1)
            # 2. One-hot encode the newly sampled production rule.
            res[idx, sampled_output] = 1.0
            # 3. Identify all non-terminals in the RHS of the selected production.
            rhs = list()
            for idx_ in sampled_output:
                rhs.extend(list(filter(
                    lambda a: (type(a) == grammar.Nonterminal) and (str(a) != 'None'),
                    self.grammar.productions()[idx_].rhs())))
            # 4. Push the selected non-terminals onto the stack in reverse order.
            stack.extend(rhs[::-1])
            idx += 1
        return res

    def from_logit_to_production_seq(self, logit):
        one_hot_vec = self.sample_using_masks(logit)
        one_hot_to_production_seq = [self.grammar.productions()[one_hot_vec[idx].argmax()]
                                     for idx in range(one_hot_vec.shape[0])]
        return one_hot_to_production_seq
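# A hedged usage sketch of ContextFreeGrammarProcessor. The real grammar in the
# GrammarVAE setting is a SMILES CFG; the tiny made-up grammar below is only a
# stand-in, chosen so that it has exactly three multi-character terminals (the
# tokenizer asserts this) and ends with the Nothing -> None padding rule.
import numpy as np

toy_grammar = """
smiles -> chain
chain -> atom | atom chain
atom -> 'C' | 'Cl' | 'Br' | '@@'
Nothing -> None
"""

proc = ContextFreeGrammarProcessor(toy_grammar)
one_hot = proc.to_one_hot('CClBr', max_depth=20)
print(one_hot.shape)                       # (20, number of productions)

# Decoding direction: sample a production sequence from a random logit matrix.
logits = np.random.randn(20, proc.total_productions)
print(proc.from_logit_to_production_seq(logits)[:5])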
DET -> 'a' | 'the'
NOUN -> 'milk' | 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
ADJ -> 'blue' | 'healthy' | 'green'
Prep -> 'in' | 'before' | 'on'
WH -> 'when'
Aux -> 'do' | 'does'
""")

cfparser = ChartParser(cfg)

text = """
Bart laughs
Homer laughed
Bart and Lisa drink milk
Bart wears blue shoes
Lisa serves Bart a healthy green salad
Homer serves Lisa
Bart always drinks milk
Lisa thinks Homer thinks Bart drinks milk
Homer never drinks milk in the kitchen before midnight
when Homer drinks milk Bart laughs
when does Lisa drinks the milk on the table
when do Lisa and Bart wear shoes
"""

sents = text.splitlines()
for sent in sents:
    parses = cfparser.parse(sent.split())
    print(sent)
    for tree in parses:
        print(tree)
def parse_sentences(grammar, sent):
    parser = ChartParser(grammar)
    tokens = word_tokenize(sent)
    trees = parser.parse(tokens)
    return trees
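# A short usage sketch with an illustrative toy grammar. Note that
# parse_sentences returns a lazy iterator of trees, so it has to be consumed
# with a loop or list() before anything is printed.
from nltk import CFG

toy_grammar = CFG.fromstring("""
S -> NP VP
NP -> 'Homer' | 'Lisa'
VP -> V NP
V -> 'sees'
""")

for tree in parse_sentences(toy_grammar, "Homer sees Lisa"):
    tree.pretty_print()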
        if os.path.exists(path):
            grammar_path = path
            break
    if grammar_path is None:
        raise FileNotFoundError("Could not find ATIS grammar in nltk data path!")
    grammar_handle = open(grammar_path, encoding='ISO-8859-1')
else:
    grammar_handle = open(args.grammar_file)

grammar = CFG.fromstring(grammar_handle.read())
grammar_handle.close()
parser = ChartParser(grammar)

if args.command == 'check':
    try:
        trees = parser.parse(args.text.split(' '))
        try:
            tree = trees.__next__()
            print("The given sentence does conform to the grammar")
            if args.draw:
                tree.draw()
            sys.exit(0)
        except StopIteration:
            print("The given sentence does not conform to the grammar:\n"
                  " 'Could not find a valid dependency graph'")
            sys.exit(1)
    except ValueError as e:
        print(f"The given sentence does not conform to the grammar:\n '{e}'")
        sys.exit(1)
elif args.command == 'generate':
    sentences = []
class QueryParser(object):

    # PYPARSING preterminal definitions
    LBRACE = Suppress(Literal('('))
    RBRACE = Suppress(Literal(')'))
    WRD = Regex(r"[0-9a-zA-Z_\-\—\,\.\?\!\>\<\=\/\:\;\&\{\}\+]+")

    ABL = LBRACE + Suppress(Literal('ABL')) + WRD + RBRACE
    ABN = LBRACE + Suppress(Literal('ABN')) + WRD + RBRACE
    ABX = LBRACE + Suppress(Literal('ABX')) + WRD + RBRACE
    AP = LBRACE + Suppress(Literal('AP')) + WRD + RBRACE
    AT = LBRACE + Suppress(Literal('AT')) + WRD + RBRACE
    BE = LBRACE + Suppress(Literal('BE')) + WRD + RBRACE
    BED = LBRACE + Suppress(Literal('BED')) + WRD + RBRACE
    BEDZ = LBRACE + Suppress(Literal('BEDZ')) + WRD + RBRACE
    BEG = LBRACE + Suppress(Literal('BEG')) + WRD + RBRACE
    BEM = LBRACE + Suppress(Literal('BEM')) + WRD + RBRACE
    BEN = LBRACE + Suppress(Literal('BEN')) + WRD + RBRACE
    BER = LBRACE + Suppress(Literal('BER')) + WRD + RBRACE
    BEZ = LBRACE + Suppress(Literal('BEZ')) + WRD + RBRACE
    CC = LBRACE + Suppress(Literal('CC')) + WRD + RBRACE
    CD = LBRACE + Suppress(Literal('CD')) + WRD + RBRACE
    CS = LBRACE + Suppress(Literal('CS')) + WRD + RBRACE
    DO = LBRACE + Suppress(Literal('DO')) + WRD + RBRACE
    DOD = LBRACE + Suppress(Literal('DOD')) + WRD + RBRACE
    DOZ = LBRACE + Suppress(Literal('DOZ')) + WRD + RBRACE
    DT = LBRACE + Suppress(Literal('DT')) + WRD + RBRACE
    DTI = LBRACE + Suppress(Literal('DTI')) + WRD + RBRACE
    DTS = LBRACE + Suppress(Literal('DTS')) + WRD + RBRACE
    DTX = LBRACE + Suppress(Literal('DTX')) + WRD + RBRACE
    EX = LBRACE + Suppress(Literal('EX')) + WRD + RBRACE
    FW = LBRACE + Suppress(Literal('FW')) + WRD + RBRACE
    HL = LBRACE + Suppress(Literal('HL')) + WRD + RBRACE
    HV = LBRACE + Suppress(Literal('HV')) + WRD + RBRACE
    HVD = LBRACE + Suppress(Literal('HVD')) + WRD + RBRACE
    HVG = LBRACE + Suppress(Literal('HVG')) + WRD + RBRACE
    HVN = LBRACE + Suppress(Literal('HVN')) + WRD + RBRACE
    HVZ = LBRACE + Suppress(Literal('HVZ')) + WRD + RBRACE
    IN = LBRACE + Suppress(Literal('IN')) + WRD + RBRACE
    JJ = LBRACE + Suppress(Literal('JJ')) + WRD + RBRACE
    JJR = LBRACE + Suppress(Literal('JJR')) + WRD + RBRACE
    JJS = LBRACE + Suppress(Literal('JJS')) + WRD + RBRACE
    JJT = LBRACE + Suppress(Literal('JJT')) + WRD + RBRACE
    MD = LBRACE + Suppress(Literal('MD')) + WRD + RBRACE
    NC = LBRACE + Suppress(Literal('NC')) + WRD + RBRACE
    NN = LBRACE + Suppress(Literal('NN')) + WRD + RBRACE
    NNS = LBRACE + Suppress(Literal('NNS')) + WRD + RBRACE
    NP = LBRACE + Suppress(Literal('NP')) + WRD + RBRACE
    NPS = LBRACE + Suppress(Literal('NPS')) + WRD + RBRACE
    NR = LBRACE + Suppress(Literal('NR')) + WRD + RBRACE
    NRS = LBRACE + Suppress(Literal('NRS')) + WRD + RBRACE
    OD = LBRACE + Suppress(Literal('OD')) + WRD + RBRACE
    PN = LBRACE + Suppress(Literal('PN')) + WRD + RBRACE
    PPL = LBRACE + Suppress(Literal('PPL')) + WRD + RBRACE
    PPLS = LBRACE + Suppress(Literal('PPLS')) + WRD + RBRACE
    PPO = LBRACE + Suppress(Literal('PPO')) + WRD + RBRACE
    PPS = LBRACE + Suppress(Literal('PPS')) + WRD + RBRACE
    PPSS = LBRACE + Suppress(Literal('PPSS')) + WRD + RBRACE
    QL = LBRACE + Suppress(Literal('QL')) + WRD + RBRACE
    QLP = LBRACE + Suppress(Literal('QLP')) + WRD + RBRACE
    RB = LBRACE + Suppress(Literal('RB')) + WRD + RBRACE
    RBR = LBRACE + Suppress(Literal('RBR')) + WRD + RBRACE
    RBT = LBRACE + Suppress(Literal('RBT')) + WRD + RBRACE
    RN = LBRACE + Suppress(Literal('RN')) + WRD + RBRACE
    RP = LBRACE + Suppress(Literal('RP')) + WRD + RBRACE
    TL = LBRACE + Suppress(Literal('TL')) + WRD + RBRACE
    TO = LBRACE + Suppress(Literal('TO')) + WRD + RBRACE
    UH = LBRACE + Suppress(Literal('UH')) + WRD + RBRACE
    VB = LBRACE + Suppress(Literal('VB')) + WRD + RBRACE
    VBD = LBRACE + Suppress(Literal('VBD')) + WRD + RBRACE
    VBG = LBRACE + Suppress(Literal('VBG')) + WRD + RBRACE
    VBN = LBRACE + Suppress(Literal('VBN')) + WRD + RBRACE
    VBZ = LBRACE + Suppress(Literal('VBZ')) + WRD + RBRACE
    WDT = LBRACE + Suppress(Literal('WDT')) + WRD + RBRACE
    WPO = LBRACE + Suppress(Literal('WPO')) + WRD + RBRACE
    WPS = LBRACE + Suppress(Literal('WPS')) + WRD + RBRACE
    WQL = LBRACE + Suppress(Literal('WQL')) + WRD + RBRACE
    WRB = LBRACE + Suppress(Literal('WRB')) + WRD + RBRACE

    PRETERM = (ABL ^ ABN ^ ABX ^ AP ^ AT ^ BE ^ BED ^ BEDZ ^ BEG ^ BEM ^ BEN ^ BER ^ BEZ ^
               CC ^ CD ^ CS ^ DO ^ DOD ^ DOZ ^ DT ^ DTI ^ DTS ^ DTX ^ EX ^ FW ^ HL ^ HV ^
               HVD ^ HVG ^ HVN ^ HVZ ^ IN ^ JJ ^ JJR ^ JJS ^ JJT ^ MD ^ NC ^ NN ^ NNS ^
               NP ^ NPS ^ NR ^ NRS ^ OD ^ PN ^ PPL ^ PPLS ^ PPO ^ PPS ^ PPSS ^ QL ^ QLP ^
               RB ^ RBR ^ RBT ^ RN ^ RP ^ TL ^ TO ^ UH ^ VB ^ VBD ^ VBG ^ VBN ^ VBZ ^
               WDT ^ WPO ^ WPS ^ WQL ^ WRB)
    UKWORD = Group(LBRACE + Literal('WORD') + PRETERM + RBRACE)

    # PYPARSING - DSL primary entity
    company = Group(LBRACE + Literal('company') + OneOrMore(WRD) + RBRACE)
    entity = Group(LBRACE + Literal('entity') + OneOrMore(WRD) + RBRACE)
    relation = LBRACE + Literal('relation') + OneOrMore(WRD) + RBRACE
    attribute = LBRACE + Literal('attribute') + OneOrMore(WRD) + RBRACE
    CASHFLOW = LBRACE + Literal('CASHFLOW') + OneOrMore(WRD) + RBRACE
    BALANCESHEET = LBRACE + Literal('BALANCESHEET') + OneOrMore(WRD) + RBRACE
    INCOMESTMT = LBRACE + Literal('INCOMESTMT') + OneOrMore(WRD) + RBRACE
    REPORT = Group(LBRACE + Suppress(Literal('REPORT')) + (CASHFLOW ^ BALANCESHEET ^ INCOMESTMT) + RBRACE)
    DATE = Group(LBRACE + Literal('DATE') + WRD + RBRACE)
    RELATION = LBRACE + Suppress(Literal('RELATION')) + relation + RBRACE
    ATTRIBUTE = LBRACE + Suppress(Literal('ATTRIBUTE')) + attribute + RBRACE
    COMPANY = LBRACE + Suppress(Literal('COMPANY')) + company + RBRACE
    ENTITY = LBRACE + Suppress(Literal('ENTITY')) + entity + RBRACE
    GREATERTHAN = LBRACE + Literal('GREATERTHAN') + Suppress(WRD) + RBRACE
    LESSTHAN = LBRACE + Literal('LESSTHAN') + Suppress(WRD) + RBRACE
    EQUAL = LBRACE + Literal('EQUAL') + Suppress(WRD) + RBRACE
    GTEQUAL = LBRACE + Literal('GTEQUAL') + Suppress(WRD) + RBRACE
    LTEQUAL = LBRACE + Literal('LTEQUAL') + Suppress(WRD) + RBRACE
    USD = LBRACE + Literal('USD') + Suppress(Regex("[$]+")) + RBRACE
    UNIT = LBRACE + Literal('UNIT') + USD + RBRACE
    EQUALITY = LBRACE + Suppress(Literal('EQUALITY')) + (GREATERTHAN ^ LESSTHAN ^ EQUAL ^ GTEQUAL ^ LTEQUAL) + RBRACE
    QUANTITY = LBRACE + Suppress(Literal('QUANTITY')) + Optional(UNIT) + CD + RBRACE
    QUANTIFIER = LBRACE + Suppress(Literal('QUANTIFIER')) + EQUALITY + QUANTITY + RBRACE

    # PYPARSING - AST parsing rules
    FILTER = Group(LBRACE + Literal('FILTER') + (ATTRIBUTE ^ RELATION) + RBRACE)
    MODIFIER = Group(LBRACE + Literal('MODIFIER') + (DATE ^ QUANTIFIER) + RBRACE)
    FUNCTIONLIST = Forward()
    FUNCTION = LBRACE + Suppress(Literal('FUNCTION')) + FILTER + Optional(MODIFIER) + RBRACE
    FUNCTIONLIST << LBRACE + Suppress('FUNCTIONLIST') + FUNCTION + Optional(FUNCTIONLIST) + RBRACE
    SUBJECT = LBRACE + Suppress(Literal('SUBJECT')) + (ENTITY ^ COMPANY) + RBRACE
    FILTEROBJECT = Group(LBRACE + Literal('FILTEROBJECT') + REPORT + RBRACE)
    DSLI = Group(LBRACE + Literal('DSLI') + (SUBJECT ^ FUNCTION) + RBRACE)
    QBODY = Forward()
    QUERYOBJ = LBRACE + Suppress(Literal("QUERYOBJ")) + (DSLI ^ FILTEROBJECT ^ UKWORD) + RBRACE
    QBODY << LBRACE + Suppress(Literal('QBODY')) + QUERYOBJ + Optional(QBODY) + RBRACE
    IS = LBRACE + Suppress(Literal('IS')) + (BE ^ BED ^ BEDZ ^ BER ^ BEZ) + RBRACE
    WHICHQ = LBRACE + Suppress(Literal('WHICHQ')) + WPS + IS + QBODY + RBRACE
    HOWQ = LBRACE + Suppress(Literal('WHICHQ')) + WRB + IS + QBODY + RBRACE
    WHATQ = LBRACE + Suppress(Literal('WHICHQ')) + WDT + IS + QBODY + RBRACE
    QUESTION = Group(LBRACE + Suppress(Literal('QUESTION')) + (WHICHQ ^ HOWQ ^ WHATQ ^ QBODY) + RBRACE)
    QUERY = LBRACE + Suppress(Literal('QUERY')) + OneOrMore(QUESTION) + RBRACE
    DSLOBJ = Suppress(SkipTo(company ^ FILTER)) + (company ^ FILTER)

    def __init__(self, tokens):
        """Init parser with tokens and a parser built from the CFG

        :param tokens: tagged query tokens
        """
        self.tokens = tokens
        self.CFGParser = ChartParser(self.__getCFG())

    def _getAST(self):
        """Gets the words from the token list and passes them through the
        parser to build an AST

        :return nltk AST
        """
        parseTokens = [t[0] for t in self.tokens]
        ASTs = []
        try:
            syntaxTrees = self.CFGParser.parse(parseTokens)
            for tree in syntaxTrees:
                ASTs.append(tree)
                devLogger.info("AST generated: " + str(tree))
            if not len(ASTs):
                devLogger.warn("Did not generate any AST. AST list empty.")
        except Exception as e:
            devLogger.error("Could not parse tokens into AST: " + str(e))
        return ASTs

    def __getCFG(self):
        """Creates the CFG by combining the class defined rules, the standard
        preterminal rules for POS tags, and finally the POS-to-word rules for
        the given query

        :return nltk CFG
        """
        tg = tokenGrammar
        for t in self.tokens:
            tg += "\n" + t[1] + ' -> ' + "'" + t[0] + "'"
            devLogger.info("Preterminal added to grammar: " + str(t))
        return nltk.CFG.fromstring(tg)

    def parseAST(self):
        """Parses the NLTK AST into a DSL string and view filters

        :return (List(DSL String), List(Filter references))
        """
        ast = self._getAST()
        dslItems = []
        filterObjects = []
        # TODO: right now only consider the first AST. In future we will have
        # to pick the best AST.
        if len(ast) >= 1:
            astLimmited = ast[0]
        else:
            astLimmited = False
        if astLimmited:
            try:
                parsedAST = self.QUERY.parseString(astLimmited.pprint())
                devLogger.info("Parsed AST: " + str(parsedAST))
            except Exception as e:
                parsedAST = None
                devLogger.error("Could not parse AST: " + str(e))
            for parsed in (parsedAST.asList() if parsedAST is not None else []):
                filterObjects = [self.getFilterObjects(item) for item in parsed
                                 if item[0] == 'FILTEROBJECT']
                dslStr = DSLString(filterObjects)
                for item in parsed:
                    if item[0] == 'DSLI':
                        dslStr.addDSLI(item[1:])
                dslItems.append(dslStr.getString())
        if len(filterObjects) < 1:
            filterObjects = [DefaultDataFilter]
        devLogger.info('DSL query list is: ' + str(dslItems))
        devLogger.info('Filter reference list is: ' + str(filterObjects))
        return dslItems, filterObjects

    def getFilterObjects(self, parsedItem):
        """Links to the appropriate filter class

        :param parsedItem: List(List()) of parsed query items
        :return Filter reference
        """
        def filterSwitch(x):
            return {
                'CASHFLOW': CashFlowFilter,
                'BALANCESHEET': BalanceSheetFilter,
                'INCOMESTMT': IncomeStatementFilter,
            }.get(x, False)

        return filterSwitch(parsedItem[1][0])
# Det -> 'an' | 'my'
# N -> 'elephant' | 'pajamas' | 'cat' | 'dog'
# P -> 'in' | 'outside'
# ''')

# with open('corpus.txt') as f:
#     diff_test = f.read().splitlines()

# a = "Bob walked the telescope in John John saw Bob by a dog on my dog my dog in my elephant outside I killed an elephant I shot my pajamas in I outside an pajamas"
a = "an man on my cat shot Bob outside an pajamas outside Bob with my pajamas in my dog with my cat by an telescope"
# a = "I killed a pajamas by Mary"

sent = a.split(' ')
print(sent)

parser = ChartParser(grammarA)
print(parser.parse(sent))
for tree in parser.parse(sent):
    print(tree, "\n\n")

t = Tree.fromstring('''(S (NP (Det an) (N man) (PP (P on) (NP (Det my) (N cat))))
  (VP (VP (VP (V shot) (NP Bob))
          (PP (P outside) (NP (Det an) (N pajamas) (PP (P outside) (NP Bob)))))
      (PP (P with) (NP