def _slow_day19a(s):
    import nltk
    rules, received = s.split('\n\n')
    # Put rule 0 first so it becomes the grammar's start symbol.
    rules = sorted(rules.splitlines(), key=lambda x: not x.startswith('0: '))
    grammar = nltk.CFG.fromstring(
        line.replace(':', ' ->', 1) for line in rules)
    parser = nltk.ChartParser(grammar)
    result = 0
    for line in received.splitlines():
        trees = parser.parse(list(line))
        try:
            next(trees)
            result += 1
        except StopIteration:
            pass
    return result
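# Usage sketch for _slow_day19a, assuming Advent-of-Code day 19 style input
# (numbered rules, a blank line, then one message per line):
example = '0: 1 2\n1: "a"\n2: "b"\n\nab\nba'
print(_slow_day19a(example))  # prints 1: only "ab" derives from rule 0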
def defgrammar():
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    sent = "I shot an elephant".split()
    parser = nltk.ChartParser(grammar)
    for tree in parser.parse(sent):
        print(tree)
def sentence_parse_example():
    groucho_grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    sent = ["I", "shot", "an", "elephant", "in", "my", "pajamas"]
    parser = nltk.ChartParser(groucho_grammar)
    for tree in parser.parse(sent):  # nbest_parse was removed in NLTK 3
        print(tree)
def define_grammar_parse_result():
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    sent = "I shot an elephant".split()
    parser = nltk.ChartParser(grammar)
    for tree in parser.parse(sent):
        print(tree)
def another_test():
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> 'DT' 'NN'
    VP -> 'VB' | 'VBP'
    VP -> 'VB' 'NN'
    """)
    # Make your POS sentence into a list of tokens.
    sentence = "DT NN VB NN".split(" ")
    # Load the grammar into the ChartParser.
    cp = nltk.ChartParser(grammar)
    # Print every parse of the tag sequence licensed by the grammar.
    for tree in cp.parse(sentence):
        print(tree)
def encode(smiles):
    assert type(smiles) == list
    GCFG = zinc_grammar.GCFG
    tokenize = get_zinc_tokenizer(GCFG)
    tokens = map(tokenize, smiles)
    parser = nltk.ChartParser(GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    productions = GCFG.productions()
    prod_map = {prod: ix for ix, prod in enumerate(productions)}
    indices = [
        np.array([prod_map[prod] for prod in entry], dtype=int)
        for entry in productions_seq
    ]
    return indices
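# Hypothetical call, assuming zinc_grammar and get_zinc_tokenizer come from
# the grammar-VAE codebase and are importable here:
#
#     idx = encode(['CC(=O)O'])
#     # idx[0] lists the production-rule indices that, applied left to
#     # right, re-derive the SMILES string under GCFG.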
def __init__(self, weights_file, latent_rep_size=56):
    """ Load the (trained) zinc encoder/decoder, grammar model. """
    self._grammar = zinc_grammar
    self._model = models.model_zinc
    self.MAX_LEN = self._model.MAX_LEN
    self._productions = self._grammar.GCFG.productions()
    self._prod_map = {}
    for ix, prod in enumerate(self._productions):
        self._prod_map[prod] = ix
    self._parser = nltk.ChartParser(self._grammar.GCFG)
    self._tokenize = get_zinc_tokenizer(self._grammar.GCFG)
    self._n_chars = len(self._productions)
    self._lhs_map = {}
    for ix, lhs in enumerate(self._grammar.lhs_list):
        self._lhs_map[lhs] = ix
    self.vae = self._model.MoleculeVAE()
    self.vae.load(self._productions,
                  weights_file,
                  max_length=self.MAX_LEN,
                  latent_rep_size=latent_rep_size)
def parse_tree(data):
    # The grammar describes sequences of POS tags, so the tags must appear
    # as quoted terminals, and the parser is fed the tag sequence rather
    # than the (word, tag) pairs returned by nltk.pos_tag.
    grammar = nltk.CFG.fromstring("""
    S -> NP N
    S -> DT NP
    NP -> JJ NN
    NP -> NN NN
    NP -> DT NN
    NP -> JJ NP
    DT -> 'DT'
    JJ -> 'JJ'
    N -> 'NN'
    NN -> 'NN'
    """)
    cp = nltk.ChartParser(grammar)
    for d in data:
        text = d["text"]
        tags = [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(text))]
        for tree in cp.parse(tags):
            print(tree)
def process2(s):
    tokens = nltk.word_tokenize(s)
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = nltk.ChartParser(grammar)
    # The grammar's terminals are words, so parse the token list itself
    # (POS-tagged pairs could never match these terminals).
    return list(parser.parse(tokens))
def to_one_hot(smiles):
    """ Encode a list of smiles strings to one-hot vectors """
    assert type(smiles) == list
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(G.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [
        np.array([prod_map[prod] for prod in entry], dtype=int)
        for entry in productions_seq
    ]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
def __init__(self, weights_file, latent_rep_size=2):
    """ Load the (trained) equation encoder/decoder, grammar model. """
    self._grammar = the_grammar
    self._model = molecules.model_gr
    self.MAX_LEN = 15  # TODO: read from elsewhere
    self._productions = self._grammar.GCFG.productions()
    self._prod_map = {}
    for ix, prod in enumerate(self._productions):
        self._prod_map[prod] = ix
    self._parser = nltk.ChartParser(self._grammar.GCFG)
    self._tokenize = tokenize
    self._n_chars = len(self._productions)
    self._lhs_map = {}
    for ix, lhs in enumerate(self._grammar.lhs_list):
        self._lhs_map[lhs] = ix
    self.vae = self._model.MoleculeVAE()
    self.vae.load(self._productions,
                  weights_file,
                  max_length=self.MAX_LEN,
                  latent_rep_size=latent_rep_size)
def __init__(self, vae: EquationVaeTorch):
    """ Load the (trained) equation encoder/decoder, grammar model. """
    self._grammar = eq_grammar
    self._model = expr_model_pt
    self.MAX_LEN = 15
    self._productions = self._grammar.GCFG.productions()
    self._prod_map = {}
    for ix, prod in enumerate(self._productions):
        self._prod_map[prod] = ix
    self._parser = nltk.ChartParser(self._grammar.GCFG)
    self._tokenize = tokenize
    self._n_chars = len(self._productions)
    self._lhs_map = {}
    for ix, lhs in enumerate(self._grammar.lhs_list):
        self._lhs_map[lhs] = ix
    self.vae: EquationVaeTorch = vae
def __init__(self, boundaryEPs, operationalPEs, availableDomains):
    kernelGrammar = """
    S -> "IN" OPBLOCK
    OPBLOCK -> TBRANCH | NTBRANCH | TPBLOCK OPBLOCK | TPBLOCK EN
    ROPBLOCK -> INTBRANCH | TPBLOCK ROPBLOCK | TPBLOCK
    TPBLOCK -> PORDER | MASKPELEM
    PORDER -> "[" MASKPELEM NPELEM "]" POEXCEPTION | "[" MASKPELEM NPELEM "]"
    POEXCEPTION -> "(" PELEM PELEM ")" POEXCEPTION | "(" PELEM PELEM ")" | "(" PELEM PELEM "*" ")" POEXCEPTION | "(" PELEM PELEM "*" ")"
    TBRANCH -> TPBLOCK "{" OPBLOCK NEXTTBRANCH "}"
    NEXTTBRANCH -> "/" OPBLOCK NEXTTBRANCH | "/" OPBLOCK
    NTBRANCH -> TPBLOCK "{" ROPBLOCK NEXTNTBRANCH "}" OPBLOCK
    INTBRANCH -> TPBLOCK "{" ROPBLOCK NEXTNTBRANCH "}" ROPBLOCK
    NEXTNTBRANCH -> "/" ROPBLOCK NEXTNTBRANCH | "/" ROPBLOCK
    NPELEM -> MASKPELEM NPELEM | MASKPELEM
    MASKPELEM -> PELEM | PELEM "<" DOMAIN ">"
    """

    # Build the PELEM, EN and DOMAIN productions from the constructor arguments.
    grammarPELEM = 'PELEM -> ' + ' | '.join(f'"{PE}"' for PE in operationalPEs) + '\n'
    grammarEP = 'EN -> ' + ' | '.join(f'"{EN}"' for EN in boundaryEPs) + '\n'
    grammarDomain = 'DOMAIN ->'
    if len(availableDomains) != 0:
        grammarDomain += ' ' + ' | '.join(f'"{domain}"' for domain in availableDomains)

    self.__boundaryEPs = boundaryEPs
    self.__operationalPEs = operationalPEs
    self.__mainParser = nltk.ChartParser(
        nltk.CFG.fromstring(kernelGrammar + grammarPELEM + grammarEP + grammarDomain))
    self.__status = 0
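# For illustration (hypothetical arguments): with operationalPEs=['fw', 'dpi'],
# boundaryEPs=['ep1'] and availableDomains=[], the three generated lines are
#
#     PELEM -> "fw" | "dpi"
#     EN -> "ep1"
#     DOMAIN ->
#
# i.e. DOMAIN becomes an empty (epsilon) production when no domains are given.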
def date_parse(self, dates: Set[str]):
    # Supported formats:
    #   on 2020-12-12, on 2020/12/12
    #   2020-12-12, 2020/12/12
    #   December 12, 2020
    #   December 12
    #   December 2020
    #   on December 12th
    #   December 12th 2020
    #   December 12th, 2020
    #   December 12th
    #   the twelfth of December
    #   the 12th of December
    #   in 2020
    #   in December 2020
    DateParseCFG = nltk.CFG.fromstring("""
    DATE -> IN YEAR SEP MONTH_NUM SEP DAY | YEAR SEP MONTH_NUM SEP DAY | MONTH_STR DAY SEP YEAR | MONTH_STR DAY | MONTH_STR YEAR | IN MONTH_STR NN_NUM | MONTH_STR NN_NUM YEAR | MONTH_STR NN_NUM SEP YEAR | MONTH_STR NN_NUM | DT NN_STR IN MONTH_STR | DT NN_NUM IN MONTH_STR | IN YEAR | IN MONTH_STR YEAR
    SEP -> "/" | "-" | ","
    YEAR -> DIGIT DIGIT DIGIT DIGIT
    MONTH_NUM -> DIGIT | DIGIT DIGIT
    DAY -> DIGIT | DIGIT DIGIT
    DT -> "the"
    IN -> "of" | "in" | "on"
    NN_STR -> "first" | "second" | "third" | "fourth" | "fifth" | "sixth" | "seventh" | "eighth" | "ninth" | "tenth" | "eleventh" | "twelfth" | "thirteenth" | "fourteenth" | "fifteenth" | "sixteenth" | "seventeenth" | "eighteenth" | "nineteenth" | "twentieth" | "twenty-first" | "twenty-second" | "twenty-third" | "twenty-fourth" | "twenty-fifth" | "twenty-sixth" | "twenty-seventh" | "twenty-eighth" | "twenty-ninth" | "thirtieth" | "thirty-first"
    MONTH_STR -> "January" | "February" | "March" | "April" | "May" | "June" | "July" | "August" | "September" | "October" | "November" | "December"
    NN_NUM -> "1st" | "2nd" | "3rd" | "4th" | "5th" | "6th" | "7th" | "8th" | "9th" | "10th" | "11th" | "12th" | "13th" | "14th" | "15th" | "16th" | "17th" | "18th" | "19th" | "20th" | "21st" | "22nd" | "23rd" | "24th" | "25th" | "26th" | "27th" | "28th" | "29th" | "30th" | "31st"
    DIGIT -> "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
    """)
    date_parser = nltk.ChartParser(DateParseCFG)
    for date in dates:
        # Split on whitespace, isolate commas, and break tokens made up of
        # digits and separators (e.g. "2020-12-12") into single characters
        # so they match the DIGIT and SEP terminals.
        tokens = []
        for t in date.replace(',', ' , ').split():
            if all(ch.isdigit() or ch in '/-' for ch in t):
                tokens.extend(t)
            else:
                tokens.append(t)
        for tree in date_parser.parse(tokens):
            print(tree)
            tree.draw()
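# Hypothetical driver (date_parse is an instance method on some host class):
#
#     obj.date_parse({"December 12th, 2020", "on 2020-12-12"})
#
# "on 2020-12-12" tokenizes to ['on','2','0','2','0','-','1','2','-','1','2'],
# matching DATE -> IN YEAR SEP MONTH_NUM SEP DAY.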
def ambiguity():
    groucho_grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
    parser = nltk.ChartParser(groucho_grammar)
    for tree in parser.parse(sent):
        print(tree)
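# The classic Groucho Marx sentence is ambiguous under this grammar, so the
# loop above prints two trees: one attaching "in my pajamas" to the VP (the
# shooting happened in pajamas) and one attaching it inside the object NP
# (the elephant was in the pajamas).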
def __init__(self, grammar, device, hparams=stgs.VAE_HPARAMS):
    """
    Load trained encoder/decoder and grammar model
    :param grammar: A nas_grammar.Grammar object
    :param hparams: dict, hyperparameters for the VAE and the grammar model
    """
    self._grammar = grammar
    self.device = device
    self.hp = hparams
    self.max_len = self.hp['max_len']
    self._productions = self._grammar.GCFG.productions()
    self._prod_map = make_prod_map(grammar.GCFG)
    self._parser = nltk.ChartParser(grammar.GCFG)
    self._tokenize = make_tokenizer(grammar.GCFG)
    self._n_chars = len(self._productions)
    self._lhs_map = grammar.lhs_map
    self.vae = NA_VAE(self.hp)
    self.vae.eval()
def to_one_hot(smiles):
    """ Encode a list of smiles strings to one-hot vectors """
    assert type(smiles) == list
    prod_map = {}
    for ix, prod in enumerate(zinc_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokenize = molecule_vae.get_zinc_tokenizer(zinc_grammar.GCFG)
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int)
               for entry in productions_seq]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.int8)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
def get_grammar(self, tagged_sents):
    res = []
    grammar = nltk.data.load('file:grammar.cfg')
    rd = nltk.ChartParser(grammar)
    for tagged_sent in tagged_sents:
        parsed = False
        try:
            for tree in rd.parse(tagged_sent.split()):
                res.append(f"[color=00ff00]{tree}[/color]")
                parsed = True
        except ValueError:
            # Raised when a token is not covered by the grammar at all.
            pass
        if not parsed:
            res.append(
                "[color=ff0000]Sentence is grammatically wrong according to your grammar[/color]"
            )
    return res
def load(self, filepath):
    cfg_string = open(filepath).read()

    # parse from nltk
    cfg_grammar = nltk.CFG.fromstring(cfg_string)
    # self.cfg_parser = cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
    self.cfg_parser = cfg_parser = nltk.ChartParser(cfg_grammar)

    # our info for rule matching
    self.head_to_rules = head_to_rules = {}
    self.valid_tokens = valid_tokens = set()
    rule_ranges = {}
    total_num_rules = 0
    first_head = None
    for line in cfg_string.split('\n'):
        if len(line.strip()) > 0:
            head, rules = line.split('->')
            head = Nonterminal(head.strip())  # remove space
            rules = [_.strip() for _ in rules.split('|')]  # split and remove space
            rules = [
                tuple([
                    Nonterminal(_) if not _.startswith("'") else _[1:-1]
                    for _ in rule.split()
                ]) for rule in rules
            ]
            head_to_rules[head] = rules

            for rule in rules:
                for t in rule:
                    if isinstance(t, str):
                        valid_tokens.add(t)

            if first_head is None:
                first_head = head

            rule_ranges[head] = (total_num_rules, total_num_rules + len(rules))
            total_num_rules += len(rules)

    self.first_head = first_head
    self.rule_ranges = rule_ranges
    self.total_num_rules = total_num_rules
def to_one_hot(smiles):
    """ Encode a single smiles string to a one-hot vector """
    token = tokenize(smiles)
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    parse_tree = next(parser.parse(token))  # .next() was Python 2
    productions_seq = parse_tree.productions()
    indices = [prod_map[prod] for prod in productions_seq]
    one_hot = np.zeros(shape=(MAX_LEN, NRULES), dtype=np.float32)
    num_productions = len(indices)
    one_hot[np.arange(num_productions), indices] = 1.
    one_hot[np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
def to_one_hot(strs):
    """ Encode a list of strings to one-hot vectors """
    prod_map = {}
    for ix, prod in enumerate(toy_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokens = map(lambda x: x.split(), strs)
    parser = nltk.ChartParser(toy_grammar.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [
        np.array([prod_map[prod] for prod in entry], dtype=int)
        for entry in productions_seq
    ]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
def make_one_hot(cfg: nltk.CFG, tokenizer, prod_map, sents, max_len=25, n_chars=34):
    """
    Encodes a list of sentences (strings) into a one-hot vector representing
    the production rules used to generate it.
    """
    if not isinstance(sents, list):
        sents = [sents]
    tokens = list(map(tokenizer, sents))  # tokenize sentences
    parser = nltk.ChartParser(cfg)  # build the parser once, not per sentence
    parse_trees = [next(parser.parse(t)) for t in tokens]  # one parse tree per sentence
    prod_seq = [tree.productions() for tree in parse_trees]  # productions used in each tree
    # One index vector per sentence identifying the production rules it used.
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int)
               for entry in prod_seq]
    one_hot = np.zeros((len(indices), max_len, n_chars), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        # Fill the last column of unused production slots with 1, which
        # corresponds to the padding rule "Nothing -> None".
        one_hot[i][np.arange(num_productions, max_len), -1] = 1.
    return torch.tensor(one_hot)
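# Self-contained usage sketch with a hypothetical toy grammar whose last
# production plays the "Nothing -> None" padding role assumed above:
import nltk
import numpy as np
import torch

toy_cfg = nltk.CFG.fromstring("""
S -> 'a' S | 'b'
Nothing -> None
""")
toy_prod_map = {prod: ix for ix, prod in enumerate(toy_cfg.productions())}
oh = make_one_hot(toy_cfg, str.split, toy_prod_map, 'a a b',
                  max_len=10, n_chars=len(toy_cfg.productions()))
print(oh.shape)  # torch.Size([1, 10, 3]); rows 0-2 encode S->'a' S, S->'a' S, S->'b'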
def one():
    # Grammar
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | VP PP
    PP -> P NP
    NP -> Det N | Det N PP | "Путешественник"
    V -> "шел"
    Det -> "несколько" | "небольшими"
    N -> "недель" | "остановками"
    P -> "с"
    """)
    text = "Путешественник шел несколько недель с небольшими остановками"
    words = nltk.word_tokenize(text)
    # Parse trees
    parser = nltk.ChartParser(grammar)
    # Output
    print("\t\t" + text)
    for t in parser.parse(words):
        print(t)
def three():
    # Grammar
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | VP PP
    PP -> P NP
    NP -> Det N | Det N PP | "Он"
    V -> "бежал"
    Det -> "воспрянув" | "мокрому"
    N -> "асфальту" | "духом"
    P -> "по"
    """)
    text = "Он бежал воспрянув духом по мокрому асфальту"
    words = nltk.word_tokenize(text)
    # Parse trees
    parser = nltk.ChartParser(grammar)
    # Output
    print("\t\t" + text)
    for t in parser.parse(words):
        print(t)
def draw_1(s):
    words = fool.cut(s)[0]
    print(words)
    p = product_grammar(s)
    grammar = CFG.fromstring("""
    S -> NP L NP|NP vshi NP y|NP L P NP|NP L P NP F|NP vshi R|T vshi R
    NP -> nr nr| nr ude n| nr n|NP ude NP|NP NP|z ude n|a ude n|v ude n|nr|n|b ude|ns ude|ns|ns ude NP|m n|m q n|A\
    |d m|m|NP c NP|NP p NP
    VP -> v NP|v VP
    L ->vshi d vshi
    P ->p|vi p
    F ->f
    T ->t
    R ->r|r NP|r ude NP
    A ->a|d a|m q|d a ude
    """ + p)
    cp = nltk.ChartParser(grammar)
    for tree in cp.parse(words):
        print(tree)
def main():
    f = open('cfg_sentences.txt').readlines()
    sentences = ' '.join(f).replace('\n', '')
    text = nltk.word_tokenize(sentences)
    tagged_text = nltk.pos_tag(text)

    # Generate grammar from ruleset
    cfg_rules = generate_cfg_rules()
    grammar = CFG.fromstring(cfg_rules)

    # Display sentence trees if our grammar can parse the sentence
    chart_parser = nltk.ChartParser(grammar)
    print()
    print('Sentences from our input set that can be generated by our grammar: ')
    print('--------------------------------------------')
    for line in f:
        # Clean up input sentences
        line = line.replace('\n', '').lower()
        line = line.replace('.', '')
        sent = line.split()
        for tree in chart_parser.parse(sent):
            print(tree)

    translation_dict = generate_english_to_spanish()
    translated_sentences = []
    for line in f:
        translated_sentences.append(translate_sentence(line, translation_dict))
    print()
    print('Translated sentences: ')
    print('----------------------')
    for item in translated_sentences:
        print(item)

    bleu_score = calculate_bleu_score(translated_sentences)
    print()
    print('BLEU Score')
    print('----------')
    print("System BLEU Score:", bleu_score)
def perform_scg(sentence):
    gramma_string = (" SIGMA -> DELTA\n"
                     " DELTA -> S P C|S P C A|S P A|S P\n"
                     " S -> h |h m\n"
                     " C -> h m|h\n"
                     " P -> aux l| l \n"
                     " A -> Pre C \n"
                     " h ->" + noun_string + " \n"
                     " l ->" + verb_output + " \n"
                     " m -> 'náà' \n"
                     " aux -> 'n'\n"
                     " Pre -> 'ní'\n")
    gramma = CFG.fromstring(gramma_string)
    parser = nltk.ChartParser(gramma)
    try:
        lower_sentence = sentence.lower()
        ans = parser.parse(lower_sentence.split())
        output = " ".join(str(x) for x in list(ans))
    except ValueError as e:
        output = "Error : " + str(e)
    return output
def to_one_hot(transactions):
    """ Encode a list of transaction strings to one-hot vectors """
    assert type(transactions) == list
    prod_map = {}
    for ix, prod in enumerate(trans_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokens = [transaction.split() for transaction in transactions]
    parser = nltk.ChartParser(trans_grammar.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int)
               for entry in productions_seq]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
def perform_xbar(sentence):
    gramma_string = (" IP -> Spec IBAR \n"
                     " Spec -> NP \n"
                     " IBAR -> I VP\n"
                     " NP -> NBAR \n"
                     " NBAR -> N DP| N \n"
                     " VP -> VBAR \n"
                     " VBAR -> V| V NP \n"
                     " DP -> DBAR \n"
                     " DBAR -> D \n"
                     " N -> " + noun_string + " \n"
                     " V -> " + verb_output + " \n"
                     " D -> 'náà' \n")
    gramma = CFG.fromstring(gramma_string)
    parser = nltk.ChartParser(gramma)
    try:
        lower_sentence = sentence.lower()
        ans = parser.parse(lower_sentence.split())
        output = " ".join(str(x) for x in list(ans))
    except ValueError as e:
        output = "Error : " + str(e)
    return output
def validate(text):
    grammar = nltk.CFG.fromstring(grammar_str)
    parser = nltk.ChartParser(grammar)
    trees = parser.parse(list(text))
    valid = False
    answer = math_form = None
    for tree in trees:
        addition = tree[4].leaves()
        operation_string = ''.join(addition)
        # Map the grammar's operator symbols to Python arithmetic and evaluate.
        math_form = (operation_string.replace(PLUS, '+')
                                     .replace(MUL, '*')
                                     .replace(DIV, '/')
                                     .replace(MIN, '-')
                                     .replace(OPENB, '(')
                                     .replace(CLOSEB, ')'))
        answer = eval(math_form)
        valid = True
        break
    return (valid, math_form, answer)