Example #1
def context_free_grammar(text):
    cfg = CFG.fromstring("""\
    ################# Rules #################
    S -> NP VP
    S -> PP NP VP
    S -> Wh Aux NP VP 
    NP -> ProperNoun | CC ProperNoun | N | ProperNoun NP | AP N | DET NP | N PP    
    VP -> V | V NP | Adv VP | V NP VP
    AP -> Adj | Adj AP
    PP -> P NP | P NP VP
    
    ################# Lexicons ################# 
    N -> 'milk'| 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
    V -> 'laughs' | 'laughed' | 'drink' | 'wears' | 'serves' | 'drinks' | 'thinks' | 'wear'
    ProperNoun -> 'Bart' | 'Homer' | 'Lisa'
    Aux -> 'do' | 'does'
    CC -> 'and'
    Adj -> 'blue' | 'healthy' | 'green' 
    DET -> 'a' | 'the' 
    Adv -> 'always' | 'never' 
    P -> 'in' | 'before' | 'on' | 'when'
    Wh -> 'when'
    """)
    cfparser = ChartParser(cfg)
    sents = text.splitlines()
    for sent in sents:
        parses = cfparser.parse(sent.split())
        print(sent)
        for tree in parses:
            print(tree)
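A minimal driver for the function above, with the NLTK imports and sample input supplied here as assumptions (neither appears in the original):

# Hypothetical driver -- imports and sample text are assumptions.
from nltk import CFG
from nltk.parse import ChartParser

sample_text = "Bart laughs\nBart and Lisa drink milk"
context_free_grammar(sample_text)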
Example #2
def execute(text: str):
    groucho_grammar = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = ChartParser(groucho_grammar)

    tokens = word_tokenize(text)
    print(type(tokens))
    print(tokens)
    for tree in parser.parse(tokens):
        print(tree)
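A usage sketch for the classic ambiguous sentence this grammar targets; the imports are assumptions not shown in the original:

# Usage sketch -- the imports are assumptions.
from nltk import CFG
from nltk.parse import ChartParser
from nltk.tokenize import word_tokenize

execute("I shot an elephant in my pajamas")  # prints both parse trees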
Example #3
def parse_original_sentences(grammar):
    '''
    Uses given grammar to parse sentences from the file corpus.txt
    Writes the parse trees of each sentence in parsed_corpus.txt
    :param grammar: A context free grammar in the form of nltk.grammar.CFG
    :return: None (Output in parsed_corpus.txt)
    '''
    parser = ChartParser(grammar)
    with open("corpus.txt", "r") as f:
        lines = f.readlines()

    count = 1
    working = []
    with open("parsed_corpus.txt", "w") as f_write:
        for line in lines:
            line = line.replace("didnt", "did not")
            s = "Tree {}:\n".format(count)
            sent = word_tokenize(line[:-2])  # drop the trailing punctuation and newline
            for tree in parser.parse(sent):
                s += str(tree) + "\n\n"
                working.append(count)
                break
            count += 1
            f_write.write(s)

    print(
        "Parsed form of original corpus sentences using this CFG can be found in parsed_corpus.txt\n"
    )
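A sketch of how this might be invoked; the toy grammar is an assumption, and corpus.txt must exist alongside the script:

# Hypothetical invocation -- the grammar string is illustrative.
from nltk import CFG, word_tokenize
from nltk.parse import ChartParser

toy_grammar = CFG.fromstring("""
S -> NP VP
NP -> 'Bart'
VP -> 'laughs'
""")
parse_original_sentences(toy_grammar)  # reads corpus.txt, writes parsed_corpus.txt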
Example #4
class GrammarOracle(Oracle):
    """
    An oracle from a grammar.
    """
    def __init__(self, grammar):
        """
        Initialize from a CFG.

        :type grammar: CFG
        :param grammar: The grammar for this oracle
        """
        self._parser = ChartParser(grammar)

    def generates(self, sentence):
        """
        Decides whether the grammar generates the sentence.

        :type sentence: Sentence
        :param sentence: A sentence

        :rtype: bool
        :return: Whether the grammar generates the sentence
        """
        try:
            parses = self._parser.parse(sentence.get_words())
            return any(True for _ in parses)
        except ValueError:
            # ChartParser raises ValueError when a word is not covered by the grammar.
            return False
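`Oracle` and `Sentence` come from the surrounding project; minimal stand-ins to exercise `generates`, offered purely as illustrative assumptions (they would need to be defined before the class above in a real script):

# Illustrative stand-ins -- the real project types are not shown here.
class Oracle:
    pass

class Sentence:
    def __init__(self, words):
        self._words = words

    def get_words(self):
        return list(self._words)

# oracle = GrammarOracle(some_cfg)              # some_cfg: an nltk.CFG, assumed
# oracle.generates(Sentence(['Bart', 'laughs']))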
Example #5
def parse_sentences(grammar):
    parser = ChartParser(grammar)
    sent = input("Parse a sentence (Q to quit): ")
    while sent != "Q":
        tokens = word_tokenize(sent)
        trees = parser.parse(tokens)
        print_trees(trees)
        sent = input("Parse a sentence (Q to quit): ")
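`print_trees` is defined elsewhere in the source; a plausible version, offered only as an assumption, is:

def print_trees(trees):
    # Hypothetical helper -- prints each parse, or a notice when none exist.
    found = False
    for tree in trees:
        print(tree)
        found = True
    if not found:
        print("No parse found.")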
Example #6
def generate_parse_tree(sentence, grammar):
    # then generate the parse trees
    tokens = word_tokenize(sentence)
    parser = ChartParser(grammar)
    # print(type(grammar), type(parser))
    try:
        return parser.parse(tokens)
    except Exception:
        # print(f"Sentence '{sentence}' cannot be parsed using the given grammar.")
        return Tree('Error', ['Error'])
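Note the asymmetric return type: a lazy iterator of trees on success, a sentinel `Tree` on failure. A caller sketch, with the grammar assumed to be defined elsewhere:

# Hypothetical caller -- normalizes both return shapes into a list of trees.
result = generate_parse_tree("Bart laughs", grammar)  # `grammar`: an nltk.CFG, assumed
trees = [result] if isinstance(result, Tree) else list(result)
for t in trees:
    print(t)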
Example #7
def get_productions(sentence, grammar):
    trees = []
    sent = sentence.split(' ')
    print(sent)
    cfgGrammar = CFG.fromstring(grammar)

    parser = ChartParser(cfgGrammar)
    for tree in parser.parse(sent):
        trees.append(str(tree).replace("\n", " "))

    if not trees:
        return []
    # print(trees[0])
    t = Tree.fromstring(trees[0])
    return t.productions()
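A usage sketch with a toy grammar string (illustrative only):

# Hypothetical usage -- the grammar string is an assumption.
toy_grammar = """
S -> NP VP
NP -> 'Bart'
VP -> 'laughs'
"""
for production in get_productions("Bart laughs", toy_grammar):
    print(production)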
Example #8
def parse_blazon(blazon):
    blazon = blazon.lower()
    to_discard = set(string.punctuation)
    to_discard.remove("&")
    blazon = ''.join(c for c in blazon if c not in to_discard)
    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(blazon)

    # Replace instances of '1st', '2nd', etc with their non abbreviated forms
    for (index, item) in enumerate(tokens):
        if (item in abbr_to_full):
            tokens[index] = abbr_to_full[item]
        elif (item == "&"):
            tokens[index] = "and"

    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)

    # Construct grammar and parser
    with open('app/parser_cfg.txt') as f:
        raw_cfg = f.read()

    parser_grammar = CFG.fromstring(raw_cfg)
    parser = ChartParser(parser_grammar)

    # Parse data into tree
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree

    if (output_data is None):
        print("Error: Parse failed, please check input is of correct format.")
    else:
        # Convert Tree to dict to prepare it for JSON serialisation
        output_data = tree_to_dict(output_data)
        # If a tincture is in the top level of the dictionary, change its name to "field"
        if ("tincture" in output_data.keys()):
            output_data["field"] = output_data["tincture"]
            output_data.pop("tincture")
        # Return the dict, ready for JSON serialisation by the caller
        return output_data
Example #9
    def verify(self, grammar, tags):
        """ Verify tag sequence as grammatically correct or not """
        # rd_parser = RecursiveDescentParser(grammar)
        rd_parser = ChartParser(grammar)
        valid = False

        try:
            for tree in rd_parser.parse(tags):
                valid = True
                break
        except ValueError:
            print("This is a grammatical structure I don't understand yet.")
            return False

        if valid:
            print("Valid")
            return True
        else:
            print("Invalid")
            return False
Example #10
import nltk
from nltk import ChartParser

# Load grammar.
grammar = nltk.data.load('../../Grammar/full_grammar.cfg')
parser = ChartParser(grammar)

with open('human_chunks.txt') as f:
    noun_chunks = [line.strip().split() for line in f]

not_covered = []
for chunk in noun_chunks:
    try:
        parser.parse(chunk)  # raises ValueError when a word is not covered
        print(f"Covered: {chunk}")
    except ValueError:
        print(f"Not covered: {chunk}")
        chunk = ' '.join(chunk) + '\n'
        not_covered.append(chunk)

with open("not-covered.txt", 'w') as f:
    f.writelines(not_covered)

num_chunks = len(noun_chunks)
num_covered = len(noun_chunks) - len(not_covered)
num_not_covered = len(not_covered)
print(f"Number of unique noun chunks: {num_chunks}")
print(f"Covered: {num_covered} ({(num_covered/num_chunks) * 100}%)")
print(
    f"Not covered: {num_not_covered} ({(num_not_covered/num_chunks) * 100}%)")
Example #11
kk_grammar = CFG.fromstring("""
Sp -> P
Sa -> 'tells' 'you' 'that' | 'says' | 'says' 'that' | 'claims' | 'claims' 'that' | 'tells you'
St -> PG Is Class | PG Quant Is Class
Quant -> Comp Count
Comp -> 'exactly'
Count -> 'one'
Not -> 'neither' | 'nor' 
PG -> 'i' | PG PG | Not P | P | 'of' PG | PG 'and' PG
P -> 'zoey' | 'mel' | 'peggy' | 'zippy' | 'sue' | 'sally' | 'homer' | 'bozo' | 'marge' | 'zed' | 'alice' | 'ted' | 'bart' | 'bob' | 'betty'
Is -> 'is' 'a' | 'are'
Class -> Kni | Kna
Kni -> 'knight' | 'knights'
Kna -> 'knave' | 'knaves'
""")

def preprocess(sent):
    return "".join([letter for letter in sent.lower() if letter in "qwertyuiopasdfghjklzxcvbnm "]).split()

sents = ["Zoey tells you that mel is a Knave",
         "Mel says, `Neither Zoey nor I are knaves.'",
         "Peggy tells you that 'of Zippy and I, exactly one is a knight'."]
sents = [preprocess(sent) for sent in sents]
parser = ChartParser(kk_grammar)
for sent in sents:
    for tree in parser.parse(sent):
        print(tree)
Example #12
def main():
    # Check arguments
    if (len(sys.argv) == 1):
        print("Too few arguments\nUsage: $ python generate.py <INPUT_FILE> [OUTPUT_FILE]")
        sys.exit(1)
    elif (len(sys.argv) > 3):
        print("Too many arguments\nUsage: $ python generate.py <INPUT_FILE> [OUTPUT_FILE]")
        sys.exit(1)

    # Initialise paths
    WORKING_DIR = sys.path[0]
    INPUT_FILE = os.path.join(WORKING_DIR, sys.argv[1])

    if (len(sys.argv) == 3):
        OUTPUT_FILE = os.path.join(WORKING_DIR, sys.argv[2])
    else:
        # Extract base filename of input file
        OUTPUT_NAME = os.path.basename(INPUT_FILE)
        # Strip off file extension and add own (.esc for escutcheon)
        OUTPUT_NAME = "trees/" + os.path.splitext(OUTPUT_NAME)[0] + ".esc"
        OUTPUT_FILE = os.path.join(WORKING_DIR, OUTPUT_NAME)

    # Read in input data
    with open(INPUT_FILE) as f:
        raw_data = f.read().lower()

        to_discard = set(string.punctuation)
        to_discard.remove("&")

        raw_data = ''.join(c for c in raw_data if c not in to_discard)

    # Convert raw data to tokens to be parsed
    tokens = word_tokenize(raw_data)

    # Replace instances of '1st', '2nd', etc with their non abbreviated forms
    for (index, item) in enumerate(tokens):
        if (item in abbr_to_full):
            tokens[index] = abbr_to_full[item]
        elif (item == "&"):
            tokens[index] = "and"

    # Sanitise tokens
    tokens = disambiguate_colours(tokens)
    tokens = reorder(tokens)

    # Construct grammar and parser
    with open('parser_cfg.txt') as f:
        raw_cfg = f.read()

    parser_grammar = CFG.fromstring(raw_cfg)
    parser = ChartParser(parser_grammar)

    # Parse data into tree
    output_data = None
    for tree in parser.parse(tokens):
        output_data = tree

    if (output_data is None):
        print("Error: Parse failed, please check input is of correct format.")
    else:
        # Convert Tree to dict to prepare it for JSON serialisation
        output_data = tree_to_dict(output_data)

        # If a tincture is in the top level of the dictionary, change its name to "field"
        if ("tincture" in output_data.keys()):
            output_data["field"] = output_data["tincture"]
            output_data.pop("tincture")

        # Convert dict to JSON
        with open(OUTPUT_FILE, 'w+') as f:
            json.dump(output_data, f, indent=2)
Example #13
class GDev:
	# 1. We will create a grammar development tool.
	# Define a class called GDev. The __init__ method should take a
	# name (a string) as input, and store it in the member name.
	def __init__( self, name ):
		self.name = name
		return

	# 2. Define a method called load_grammar. It takes no arguments.
	# It expects the file name.cfg to exist, where name is the GDev name.
	# It loads a grammar from the file and stores it in the member grammar.
	def load_grammar( self ):
		s = open( self.name + '.cfg' ).read()
		self.grammar = CFG.fromstring(s)
		return

	# 3. Define a method called reload. It should call the method 
	# load_grammar, even if the grammar has already been loaded before.
	# Then it should create a chart parser from the loaded grammar, and
	# store the parser in the member parser. 
	def reload( self ):
		self.load_sents()
		self.load_grammar()
		self.parser = ChartParser( self.grammar )
		return

	# 4. Define a method called parse. It should take one argument, a string.
	# It should call word_tokenize on the sentence, and pass the result to
	# the parser. The parse method should return a single tree. If the parser
	# returns more than one tree, then parse should return just the first one.
	# If the parser does not return any trees, then parse should return None.
	def parse( self, s ):
		try:
			return list( self.parser.parse( word_tokenize( s ) ) )[0]
		except (IndexError, ValueError):
			# IndexError: no parses; ValueError: word not covered by the grammar.
			return None

	# 5. Define a method called load_sents. It takes no arguments. It expects
	# the file name.sents to exist. The file should contain one sentence per
	# line. Each sentence is either good or bad: good sentences are ones that
	# the grammar ought to generate, and bad sentences are ones that the
	# grammar should not generate. If the first character on the line is '*',
	# the sentence is bad, and otherwise it is good. The load_sents method
	# should produce a list of pairs (good, s) where good is True for good
	# sentences and False for bad ones, and s is the sentence itself (not
	# including the '*'). The list of pairs should be stored in the member
	# sents. Create a file g1.sents containing the sentences Bob warbled, the
	# dog ate my telescope, and *Bob cat.
	def load_sents( self ):
		self.sents = [ ( True, line.rstrip('\r\n') ) \
						if line[0] != '*' \
						else (False, line.rstrip('\r\n')[1:]) \
						for line in open(self.name + '.sents') ]
		# print( self.sents )


	# 6. Define a method called parses. It should take no arguments. 
	# It should iterate through the pairs (g,s) in sents, and it should 
	# call parse on each sentence s in turn. For each sentence, it should
	# print an empty line, then the sentence, then the result of calling parse.
	def parses( self ):
		for s in self.sents:
			print( '\n' + s[1] )
			print( self.parse( s[1] ) )

	# 7. Write a method called regress that takes no arguments. It should go
	# through the pairs (good, s) in sents. For each, it should call parse on s.
	# Define the prediction to be True if parse returns a tree, and False otherwise.
	# If the prediction equals good, then the prediction is correct, and otherwise
	# the prediction is wrong. For each pair, print out one line of output. The output
	# line should start with '!!' if the prediction is wrong and '  ' (two spaces) if
	# it is correct. Then print out a space. Then print '*' if good is False, and a
	# space if good is True. The output line ends with the sentence s.
	def regress( self ):
		prediction = False
		for s in self.sents:
			if self.parse( s[1] ) is not None: 
				prediction = True
			else:
				prediction = False
			if prediction != s[0]:
				print( '!!' + ' ' , end = '')
			else:
				print( '  ' + ' ' , end = '')
			if s[0] == False:
				print( '*' , end = '')
			else:
				print( ' ' , end = '')
			print( s[1] )

	# 8. Finally, the __call__ method should simply call reload and regress.
	# The idea is to use the set of example sentences to drive grammar development.
	# One adds sentences, calls gd() to see which ones are being handled correctly
	# or not, and then one edits the grammar to fix the prediction errors. After
	# each file edit, one needs merely call gd() to see the revised grammar's 
	# predictions on the sentences. (Making sure that new revisions do not break 
	# things that previously worked correctly is known as regression testing.)
	def __call__( self ):
		self.reload()
		self.regress()
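A usage sketch for the tool above, assuming files g1.cfg and g1.sents exist as described in the comments, plus the NLTK imports the class relies on:

# Hypothetical usage -- file names follow the comments above.
from nltk import CFG, word_tokenize
from nltk.parse import ChartParser

gd = GDev('g1')
gd()  # reloads g1.cfg / g1.sents and prints the regression results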
Example #14
class ContextFreeGrammarProcessor:
  def __init__(self, grammar_string):
    self.grammar = CFG.fromstring(grammar_string)
    self.parser = ChartParser(self.grammar)
    self.tokenizer = self._get_tokenizer()

  @property
  def start_index(self):
    return self.grammar.start()

  @property
  def last_index(self):
    return self.grammar.productions()[-1].lhs()

  @property
  def total_productions(self):
    return len(self.grammar.productions())

  @property
  def lhs(self):
    return list(expression.lhs() for expression in self.grammar.productions())

  @property
  def unique_lhs(self):
    return list(dict.fromkeys(self.lhs))

  @property
  def unique_lhs_dictionary(self):
    return {left_rule: idx for idx, left_rule in enumerate(self.unique_lhs)}
    
  @property
  def production_dictionary(self):
    return {production: idx for idx, production in enumerate(self.grammar.productions())}

  def get_masks(self):
    mask = np.zeros((len(self.unique_lhs), self.total_productions))
    for idx, symbol in enumerate(self.unique_lhs):
      mask[idx] = np.array([symbol == symbol_lhs for symbol_lhs in self.lhs], dtype=int)
    return mask 

  def get_masks_idx(self):
    temp_mask = self.get_masks()
    res = [np.where(temp_mask[:, idx]==1)[0][0] for idx in range(self.total_productions)]
    return np.array(res)
    
  def _get_tokenizer(self):
    #TODO cleanup the function --> improve the logic  
    long_tokens = list(filter(lambda symbol: len(symbol) > 1, self.grammar._lexical_index.keys()))
    replacements = ['$','%','^'] # ,'&']
    assert len(long_tokens) == len(replacements)
    # for token in replacements: 
    #     assert not cfg._lexical_index.has_key(token)
    
    def tokenize(smiles):
        for idx, token in enumerate(long_tokens):
            smiles = smiles.replace(token, replacements[idx])
        tokens = []
        for token in smiles:
            try:
                ix = replacements.index(token)
                tokens.append(long_tokens[ix])
            except ValueError:
                # token is not a placeholder; keep the single character as-is
                tokens.append(token)
        return tokens
    return tokenize

  def smile_to_production_seq(self, smile):
    production_seq = next(self.parser.parse(self.tokenizer(smile))).productions()
    return production_seq
  
  def to_one_hot(self, smile, max_depth=277):
    """
    Args:
      smile: str
        Molecule represented in SMILE grammar
      max_depth: int
        Maximum number of productions used for composition of the SMILE string 
    """
    smile_to_prod_idx = [self.production_dictionary[production] for production in self.smile_to_production_seq(smile)]
    len_production_seq = len(smile_to_prod_idx)
    one_hot = np.zeros((max_depth, self.total_productions))
    one_hot[np.arange(len_production_seq), smile_to_prod_idx] = 1.
    one_hot[np.arange(len_production_seq, max_depth),-1] = 1.
    return one_hot

  def sample_using_masks(self, logit_matrix):
    """
    Implements Algorithm 1 from GrammarVAE paper: https://arxiv.org/abs/1703.01925
    Args: 
      logit_matrix: np.array
    """
    # input: masks for selecting valid production rules 
    masks = self.get_masks()  

    stack = list()
    # initiate stack with the valid production rule (e.g. [smile] for SMILE CFG)
    stack.append(self.start_index)
    res = np.zeros_like(logit_matrix)
    eps = 1e-100
    idx = 0

    def pop_from_stack(stack_):
      try:
        res_ = stack_.pop()
      except IndexError:
        # the stack is empty, return 'end' production rule: Nothing -> None
        res_ = self.last_index
      return res_

    while idx < logit_matrix.shape[0]:  # the stack itself never becomes None
      #print('Iteration: {}'.format(idx))
      # 1. given (continuous) logit vector select valid production rule
      # pop the last pushed non-terminal production from the stack
      key = pop_from_stack(stack)
      #print(key)
      next_nonterminal = [self.unique_lhs_dictionary[key]]
      #print('Next nonterminal: {}'.format(next_nonterminal))
      # select the mask for the last non-terminal rule
      mask = masks[next_nonterminal]
      #print(mask)
      # mask the logit vector so that only valid right-hand sides can be sampled
      masked_output = np.exp(logit_matrix[idx,:])*mask + eps
      #print(masked_output)
      # given the last non-terminal rule, sample a new valid production rule
      sampled_output = np.argmax(np.random.gumbel(size=masked_output.shape) + np.log(masked_output), axis=-1)
      #print('Sampled output: {}'.format(sampled_output))
      # 2. one_hot encode the new sampled production rule 
      res[idx, sampled_output] = 1.0

      # 3. identify all non-terminals in RHS of selected production
      rhs = list()
      for idx_ in sampled_output: 
        rhs.extend(list(filter(lambda a: (type(a) == grammar.Nonterminal) and (str(a) != 'None'),
                     self.grammar.productions()[idx_].rhs())))
      #print(rhs)
      # 4. push the selected non-terminals onto the stack in reverse order
      stack.extend(rhs[::-1])
      idx += 1
      #print("stack: {}".format(stack))
    return res


  def from_logit_to_production_seq(self, logit):
    one_hot_vec = self.sample_using_masks(logit)
    one_hot_to_production_seq = [self.grammar.productions()[one_hot_vec[idx].argmax()] 
                                 for idx in range(one_hot_vec.shape[0])]
    return one_hot_to_production_seq
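The class above relies on numpy, NLTK's CFG, ChartParser, and grammar module being imported. A usage sketch with a toy SMILES-like grammar follows; the tokenizer asserts exactly three multi-character terminals, so the toy grammar is chosen to satisfy that, and every name below is illustrative:

# Illustrative setup -- imports and the toy grammar are assumptions.
import numpy as np
from nltk import CFG, grammar
from nltk.parse import ChartParser

toy_smiles_cfg = """
smiles -> chain
chain -> atom | atom chain
atom -> 'C' | 'N' | 'O' | 'Cl' | 'Br' | 'Si'
Nothing -> None
"""
processor = ContextFreeGrammarProcessor(toy_smiles_cfg)
one_hot = processor.to_one_hot('CClBr')
print(one_hot.shape)  # (277, processor.total_productions)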
Example #15
cfg = CFG.fromstring("""
DET        ->    'a' | 'the'
NOUN       ->    'milk' | 'shoes' | 'salad' | 'kitchen' | 'midnight' | 'table'
ADJ        ->    'blue' | 'healthy' | 'green'
Prep       ->    'in' | 'before' | 'on'
WH         ->    'when'
Aux        ->    'do' | 'does'
""")

cfparser = ChartParser(cfg)
text = """
Bart laughs
Homer laughed
Bart and Lisa drink milk
Bart wears blue shoes
Lisa serves Bart a healthy green salad
Homer serves Lisa
Bart always drinks milk
Lisa thinks Homer thinks Bart drinks milk
Homer never drinks milk in the kitchen before midnight
when Homer drinks milk Bart laughs
when does Lisa drinks the milk on the table
when do Lisa and Bart wear shoes
"""

sents = text.splitlines()
for sent in sents:
    parses = cfparser.parse(sent.split())
    print(sent)
    for tree in parses:
        print(tree)
Example #16
def parse_sentences(grammar, sent):
	parser = ChartParser(grammar)
	tokens = word_tokenize(sent)
	trees = parser.parse(tokens)
	return trees
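ChartParser.parse returns a lazy iterator, so the trees from this helper can be consumed only once; a caller that needs them twice should materialize the result (a sketch, with the grammar assumed):

# Hypothetical caller -- materializes the lazy iterator for reuse.
trees = list(parse_sentences(grammar, "Bart laughs"))  # `grammar`: an nltk.CFG, assumed
print(len(trees))
for t in trees:
    t.pretty_print()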
Example #17
            if os.path.exists(path):
                grammar_path = path
                break
        if grammar_path is None:
            raise FileNotFoundError("Could not find ATIS grammar in nltk data path!")
        grammar_handle = open(grammar_path, encoding='ISO-8859-1')
    else:
        grammar_handle = open(args.grammar_file)

    grammar = CFG.fromstring(grammar_handle.read())
    grammar_handle.close()
    parser = ChartParser(grammar)

    if args.command == 'check':
        try:
            trees = parser.parse(args.text.split(' '))
            try:
                tree = next(trees)
                print("The given sentence does conform to the grammar")
                if args.draw:
                    tree.draw()
                sys.exit(0)
            except StopIteration:
                print("The given sentence does not conform to the grammar:\n  'Could not find a valid dependency graph'")
                sys.exit(1)
        except ValueError as e:
            print(f"The given sentence does not conform to the grammar:\n  '{e}'")
            sys.exit(1)

    elif args.command == 'generate':
        sentences = []
Example #18
class QueryParser(object):
    #PYPARSING preterminal definitions
    LBRACE = Suppress(Literal('('))
    RBRACE = Suppress(Literal(')'))
    WRD = Regex(r"[0-9a-zA-Z_\-\—\,\.\?\!\>\<\=\/\:\;\&\{\}\+]+")
    ABL = LBRACE + Suppress(Literal('ABL')) + WRD + RBRACE
    ABN = LBRACE + Suppress(Literal('ABN')) + WRD + RBRACE
    ABX = LBRACE + Suppress(Literal('ABX')) + WRD + RBRACE
    AP = LBRACE + Suppress(Literal('AP')) + WRD + RBRACE
    AT = LBRACE + Suppress(Literal('AT')) + WRD + RBRACE
    BE = LBRACE + Suppress(Literal('BE')) + WRD + RBRACE
    BED = LBRACE + Suppress(Literal('BED')) + WRD + RBRACE
    BEDZ = LBRACE + Suppress(Literal('BEDZ')) + WRD + RBRACE
    BEG = LBRACE + Suppress(Literal('BEG')) + WRD + RBRACE
    BEM = LBRACE + Suppress(Literal('BEM')) + WRD + RBRACE
    BEN = LBRACE + Suppress(Literal('BEN')) + WRD + RBRACE
    BER = LBRACE + Suppress(Literal('BER')) + WRD + RBRACE
    BEZ = LBRACE + Suppress(Literal('BEZ')) + WRD + RBRACE
    CC = LBRACE + Suppress(Literal('CC')) + WRD + RBRACE
    CD = LBRACE + Suppress(Literal('CD')) + WRD + RBRACE
    CS = LBRACE + Suppress(Literal('CS')) + WRD + RBRACE
    DO = LBRACE + Suppress(Literal('DO')) + WRD + RBRACE
    DOD = LBRACE + Suppress(Literal('DOD')) + WRD + RBRACE
    DOZ = LBRACE + Suppress(Literal('DOZ')) + WRD + RBRACE
    DT = LBRACE + Suppress(Literal('DT')) + WRD + RBRACE
    DTI = LBRACE + Suppress(Literal('DTI')) + WRD + RBRACE
    DTS = LBRACE + Suppress(Literal('DTS')) + WRD + RBRACE
    DTX = LBRACE + Suppress(Literal('DTX')) + WRD + RBRACE
    EX = LBRACE + Suppress(Literal('EX')) + WRD + RBRACE
    FW = LBRACE + Suppress(Literal('FW')) + WRD + RBRACE
    HL = LBRACE + Suppress(Literal('HL')) + WRD + RBRACE
    HV = LBRACE + Suppress(Literal('HV')) + WRD + RBRACE
    HVD = LBRACE + Suppress(Literal('HVD')) + WRD + RBRACE
    HVG = LBRACE + Suppress(Literal('HVG')) + WRD + RBRACE
    HVN = LBRACE + Suppress(Literal('HVN')) + WRD + RBRACE
    HVZ = LBRACE + Suppress(Literal('HVZ')) + WRD + RBRACE
    IN = LBRACE + Suppress(Literal('IN')) + WRD + RBRACE
    JJ = LBRACE + Suppress(Literal('JJ')) + WRD + RBRACE
    JJR = LBRACE + Suppress(Literal('JJR')) + WRD + RBRACE
    JJS = LBRACE + Suppress(Literal('JJS')) + WRD + RBRACE
    JJT = LBRACE + Suppress(Literal('JJT')) + WRD + RBRACE
    MD = LBRACE + Suppress(Literal('MD')) + WRD + RBRACE
    NC = LBRACE + Suppress(Literal('NC')) + WRD + RBRACE
    NN = LBRACE + Suppress(Literal('NN')) + WRD + RBRACE
    NNS = LBRACE + Suppress(Literal('NNS')) + WRD + RBRACE
    NP = LBRACE + Suppress(Literal('NP')) + WRD + RBRACE
    NPS = LBRACE + Suppress(Literal('NPS')) + WRD + RBRACE
    NR = LBRACE + Suppress(Literal('NR')) + WRD + RBRACE
    NRS = LBRACE + Suppress(Literal('NRS')) + WRD + RBRACE
    OD = LBRACE + Suppress(Literal('OD')) + WRD + RBRACE
    PN = LBRACE + Suppress(Literal('PN')) + WRD + RBRACE
    PPL = LBRACE + Suppress(Literal('PPL')) + WRD + RBRACE
    PPLS = LBRACE + Suppress(Literal('PPLS')) + WRD + RBRACE
    PPO = LBRACE + Suppress(Literal('PPO')) + WRD + RBRACE
    PPS = LBRACE + Suppress(Literal('PPS')) + WRD + RBRACE
    PPSS = LBRACE + Suppress(Literal('PPSS')) + WRD + RBRACE
    QL = LBRACE + Suppress(Literal('QL')) + WRD + RBRACE
    QLP = LBRACE + Suppress(Literal('QLP')) + WRD + RBRACE
    RB = LBRACE + Suppress(Literal('RB')) + WRD + RBRACE
    RBR = LBRACE + Suppress(Literal('RBR')) + WRD + RBRACE
    RBT = LBRACE + Suppress(Literal('RBT')) + WRD + RBRACE
    RN = LBRACE + Suppress(Literal('RN')) + WRD + RBRACE
    RP = LBRACE + Suppress(Literal('RP')) + WRD + RBRACE
    TL = LBRACE + Suppress(Literal('TL')) + WRD + RBRACE
    TO = LBRACE + Suppress(Literal('TO')) + WRD + RBRACE
    UH = LBRACE + Suppress(Literal('UH')) + WRD + RBRACE
    VB = LBRACE + Suppress(Literal('VB')) + WRD + RBRACE
    VBD = LBRACE + Suppress(Literal('VBD')) + WRD + RBRACE
    VBG = LBRACE + Suppress(Literal('VBG')) + WRD + RBRACE
    VBN = LBRACE + Suppress(Literal('VBN')) + WRD + RBRACE
    VBZ = LBRACE + Suppress(Literal('VBZ')) + WRD + RBRACE
    WDT = LBRACE + Suppress(Literal('WDT')) + WRD + RBRACE
    WPO = LBRACE + Suppress(Literal('WPO')) + WRD + RBRACE
    WPS = LBRACE + Suppress(Literal('WPS')) + WRD + RBRACE
    WQL = LBRACE + Suppress(Literal('WQL')) + WRD + RBRACE
    WRB = LBRACE + Suppress(Literal('WRB')) + WRD + RBRACE
    PRETERM = ABL ^ ABN ^ ABX ^ AP ^ AT ^ BE ^ BED ^ BEDZ ^ BEG ^ BEM ^ BEN ^ BER ^ BEZ ^ CC ^ CD ^ CS ^ DO ^ DOD ^ DOZ ^ DT ^ DTI ^ DTS ^ DTX ^ EX ^ FW ^ HL ^ HV ^ HVD ^ HVG ^ HVN ^ HVZ ^ IN ^ JJ ^ JJR ^ JJS ^ JJT ^ MD ^ NC ^ NN ^ NNS ^ NP ^ NPS ^ NR ^ NRS ^ OD ^ PN ^ PPL ^ PPLS ^ PPO ^ PPS ^ PPSS ^ QL ^ QLP ^ RB ^ RBR ^ RBT ^ RN ^ RP ^ TL ^ TO ^ UH ^ VB ^ VBD ^ VBG ^ VBN ^ VBZ ^ WDT ^ WPO ^ WPS ^ WQL ^ WRB
    UKWORD = Group(LBRACE + Literal('WORD') + PRETERM + RBRACE)

    #PYPARSING - DSL primary entity
    company = Group(LBRACE + Literal('company') + OneOrMore(WRD) + RBRACE)
    entity = Group(LBRACE + Literal('entity') + OneOrMore(WRD) + RBRACE)
    relation = LBRACE + Literal('relation') + OneOrMore(WRD) + RBRACE
    attribute = LBRACE + Literal('attribute') + OneOrMore(WRD) + RBRACE
    CASHFLOW = LBRACE + Literal('CASHFLOW') + OneOrMore(WRD) + RBRACE
    BALANCESHEET = LBRACE + Literal('BALANCESHEET') + OneOrMore(WRD) + RBRACE
    INCOMESTMT = LBRACE + Literal('INCOMESTMT') + OneOrMore(WRD) + RBRACE
    REPORT = Group(LBRACE + Suppress(Literal('REPORT')) + (CASHFLOW ^ BALANCESHEET ^ INCOMESTMT) + RBRACE)
    DATE = Group(LBRACE + Literal('DATE') + WRD + RBRACE)
    RELATION = LBRACE + Suppress(Literal('RELATION')) + relation + RBRACE
    ATTRIBUTE = LBRACE + Suppress(Literal('ATTRIBUTE')) + attribute + RBRACE
    COMPANY = LBRACE + Suppress(Literal('COMPANY')) + company + RBRACE
    ENTITY = LBRACE + Suppress(Literal('ENTITY')) + entity + RBRACE
    GREATERTHAN = LBRACE + Literal('GREATERTHAN') + Suppress(WRD) + RBRACE
    LESSTHAN = LBRACE + Literal('LESSTHAN') + Suppress(WRD) + RBRACE
    EQUAL = LBRACE + Literal('EQUAL') + Suppress(WRD) + RBRACE
    GTEQUAL = LBRACE + Literal('GTEQUAL') + Suppress(WRD) + RBRACE
    LTEQUAL = LBRACE + Literal('LTEQUAL') + Suppress(WRD) + RBRACE
    USD = LBRACE + Literal('USD') + Suppress(Regex("[$]+")) + RBRACE
    UNIT = LBRACE + Literal('UNIT') + USD + RBRACE
    EQUALITY = LBRACE + Suppress(Literal('EQUALITY')) + (GREATERTHAN ^ LESSTHAN ^ EQUAL ^ GTEQUAL ^ LTEQUAL) + RBRACE
    QUANTITY = LBRACE + Suppress(Literal('QUANTITY')) + Optional(UNIT) + CD + RBRACE
    QUANTIFIER = LBRACE + Suppress(Literal('QUANTIFIER')) + EQUALITY + QUANTITY + RBRACE

    #PYPARSING - AST parsing rules
    FILTER = Group(LBRACE + Literal('FILTER') + (ATTRIBUTE ^ RELATION) + RBRACE)
    MODIFIER = Group(LBRACE + Literal('MODIFIER') + (DATE ^ QUANTIFIER) + RBRACE)
    FUNCTIONLIST = Forward()
    FUNCTION = LBRACE + Suppress(Literal('FUNCTION')) + FILTER + Optional(MODIFIER) + RBRACE
    FUNCTIONLIST << LBRACE + Suppress('FUNCTIONLIST') + FUNCTION + Optional(FUNCTIONLIST) + RBRACE
    SUBJECT = LBRACE + Suppress(Literal('SUBJECT')) + (ENTITY ^ COMPANY) + RBRACE
    FILTEROBJECT = Group(LBRACE + Literal('FILTEROBJECT') + REPORT + RBRACE)
    DSLI = Group(LBRACE + Literal('DSLI') + (SUBJECT ^ FUNCTION) + RBRACE)
    QBODY = Forward()
    QUERYOBJ = LBRACE + Suppress(Literal("QUERYOBJ")) + (DSLI ^ FILTEROBJECT ^ UKWORD) + RBRACE
    QBODY << LBRACE + Suppress(Literal('QBODY')) + QUERYOBJ + Optional(QBODY) + RBRACE
    IS = LBRACE + Suppress(Literal('IS')) + (BE ^ BED ^ BEDZ ^ BER ^ BEZ) + RBRACE
    WHICHQ = LBRACE + Suppress(Literal('WHICHQ')) + WPS + IS + QBODY + RBRACE
    HOWQ = LBRACE + Suppress(Literal('HOWQ')) + WRB + IS + QBODY + RBRACE
    WHATQ = LBRACE + Suppress(Literal('WHATQ')) + WDT + IS + QBODY + RBRACE
    QUESTION = Group(LBRACE + Suppress(Literal('QUESTION')) + (WHICHQ ^ HOWQ ^ WHATQ ^ QBODY) + RBRACE)
    QUERY = LBRACE + Suppress(Literal('QUERY')) + OneOrMore(QUESTION) + RBRACE

    DSLOBJ = Suppress(SkipTo(company ^ FILTER)) + (company ^ FILTER)

    def __init__(self, tokens):
        """init parser with tokens and parser build from CFG
        :param tokens: tagged query tokens
        """
        self.tokens = tokens
        self.CFGParser = ChartParser(self.__getCFG())

    def _getAST(self):
        """Gets the words from the token list and passes them
        through the parser to build an AST
        :return nltk AST
        """
        parseTokens = [t[0] for t in self.tokens]
        ASTs = []
        try:
            syntaxTrees = self.CFGParser.parse(parseTokens)
            for tree in syntaxTrees:
                ASTs.append(tree)
                devLogger.info("AST generated: " + str(tree))
            if not(len(ASTs)):
                devLogger.warn("Did not generate any AST. AST list empty.")
        except Exception as e:
            devLogger.error("Could not parse tokens into AST: " + str(e))
        return ASTs

    def __getCFG(self):
        """Creates the CFG by combining the class defined rules,
        the standard preterminal rules for POS tags -> e, and
        finally the POS to word rules for the given query
        :return nltk CFG
        """
        tg = tokenGrammar
        for t in self.tokens:
            tg += "\n" + t[1] + ' -> ' + "'" + t[0] + "'"
            devLogger.info("Preterminal added to grammar: " + str(t))
        return nltk.CFG.fromstring(tg)

    def parseAST(self):
        """Parses the NLTK AST into a DSL string and view filters
        :return (List(DSL String),List(Filter references))
        """
        ast = self._getAST()
        dslItems = []
        filterObjects = []

        #TODO right now only consider the first AST. In future we will have to pick the best AST
        if len(ast) >= 1:
            astLimmited = ast[0]
        else:
            astLimmited = False

        if astLimmited:
            try:
                parsedAST = self.QUERY.parseString(astLimmited.pprint())
                devLogger.info("Parsed AST: " + str(parsedAST))
            except Exception as e:
                parsedAST = []
                devLogger.error("Could not parse AST: " + str(e))
            for parsed in parsedAST.asList():
                filterObjects = [self.getFilterObjects(item) for item in parsed if item[0] == 'FILTEROBJECT']
                dslStr = DSLString(filterObjects)
                for item in parsed:
                    if item[0] == 'DSLI':
                        dslStr.addDSLI(item[1:])
                dslItems.append(dslStr.getString())

        if len(filterObjects) < 1:
            filterObjects = [DefaultDataFilter]

        devLogger.info('DSL query list is: ' + str(dslItems))
        devLogger.info('Filter reference list is: ' + str(filterObjects))
        return dslItems, filterObjects


    def getFilterObjects(self, parsedItem):
        """Links to the appropriate filter class
        :param parsedItems: List(List()) of parsed query items
        :return Filter reference
        """
        def filterSwitch(x):
            return {
                'CASHFLOW': CashFlowFilter,
                'BALANCESHEET': BalanceSheetFilter,
                'INCOMESTMT': IncomeStatementFilter,
            }.get(x, False)

        return filterSwitch(parsedItem[1][0])
Example #19
# Det -> 'an' | 'my'
# N -> 'elephant' | 'pajamas' | 'cat' | 'dog'
# P -> 'in' | 'outside'
# ''')

# with open('corpus.txt') as f:
#     diff_test = f.read().splitlines()

# a = "Bob walked the telescope in John John saw Bob by a dog on my dog my dog in my elephant outside I killed an elephant I shot my pajamas in I outside an pajamas"
a = "an man on my cat shot Bob outside an pajamas outside Bob with my pajamas in my dog with my cat by an telescope"
# a = "I killed a pajamas by Mary"
sent = a.split(' ')

print(sent)
parser = ChartParser(grammarA)
print(parser.parse(sent))  # prints the iterator object itself

for tree in parser.parse(sent):
    print(tree, "\n\n")

t = Tree.fromstring('''(S
  (NP (Det an) (N man) (PP (P on) (NP (Det my) (N cat))))
  (VP
    (VP
      (VP (V shot) (NP Bob))
      (PP
        (P outside)
        (NP (Det an) (N pajamas) (PP (P outside) (NP Bob)))))
    (PP
      (P with)
      (NP