def convert2_nltk_CFG(G):
    terminals, NTs, P, S = G
    Prod = copy(P)
    # this is here to ensure full coverage of terminals
    # when parsing the grammar for testing
    Prod["DUMMY"] = [list(map(lambda x: (x,), terminals))]
    assert len(S) > 0  # need a start symbol
    if len(S) > 1:
        # add a fresh start symbol NT0 that expands to each original start
        if "NT0" not in Prod:
            Prod["NT0"] = []
        for Si in S:
            Prod["NT0"].append([(Si,)])
    else:
        assert "NT0" in S  # a single start symbol must already be NT0
    start = nltk.Nonterminal("NT0")
    nltk_nts = nltk.nonterminals(" ".join(list(NTs)))
    productions = []
    # only look at nonterminals with productions
    for NT in Prod:
        for rule in Prod[NT]:
            rhs = rule_to_tuple(rule, NTs)
            productions.append(nltk.Production(nltk.Nonterminal(NT), rhs))
    return nltk.grammar.CFG(start, productions)
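# A minimal usage sketch for convert2_nltk_CFG (hypothetical data, inferred
# from rule_to_tuple below): G is (terminals, nonterminals, productions,
# start symbols), and every rule is a list of 1-tuples of symbol names.
toy_G = (
    {"a", "b"},                       # terminals
    {"NT0", "NT1"},                   # nonterminals
    {"NT0": [[("NT1",), ("b",)]],     # NT0 -> NT1 'b'
     "NT1": [[("a",)]]},              # NT1 -> 'a'
    {"NT0"},                          # start symbols (a single one must be NT0)
)
toy_cfg = convert2_nltk_CFG(toy_G)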
def __init__(self, pcfg_filename):
    start = None
    with open(pcfg_filename, "r") as f:
        lines = f.readlines()
    productions = []
    for line in lines:
        matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]?", line)
        groups = matches.groups()
        assert len(groups) == 4
        # Read the production rule:
        lhs = nltk.Nonterminal(groups[0].strip())
        if groups[2] is None:
            # unary rule with a quoted terminal on the right-hand side
            production = nltk.Production(lhs, [groups[1].strip("'")])
        else:
            # binary rule with two nonterminals on the right-hand side
            production = nltk.Production(lhs, [
                nltk.Nonterminal(groups[1].strip()),
                nltk.Nonterminal(groups[2].strip()),
            ])
        probability = float(groups[3].strip())
        # The first left-hand side seen becomes the start symbol.
        if start is None:
            start = lhs
        productions.append(production)
        self.probability_of_production[production] = probability
    self.grammar = nltk.grammar.CFG(start, productions, False)
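# For reference, the regex above matches one rule per line in the bracketed
# probability format nltk itself prints, e.g. (hypothetical file contents):
#
#   S -> NP VP [0.9]
#   NP -> 'dog' [0.4]
#
# A rule with one right-hand-side symbol is treated as a lexical rule (the
# quotes are stripped); a rule with two becomes a pair of Nonterminals.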
def __count_productions_recursively(self, node: nltk.Tree) -> nltk.Production:
    """Recursively parses a tree representation of a sentence."""
    label = node.label()
    # Traverse the tree:
    if len(node) == 2:
        # Handle non-leaf nodes:
        left = self.__count_productions_recursively(node[0])
        right = self.__count_productions_recursively(node[1])
        # left.lhs() and right.lhs() are already Nonterminals.
        production = nltk.Production(nltk.Nonterminal(label),
                                     [left.lhs(), right.lhs()])
    else:
        # Handle leaf node.
        token = node[0]
        self.token_count += 1
        if token not in self.count_per_token:
            self.count_per_token[token] = 1
        else:
            self.count_per_token[token] += 1
        production = nltk.Production(nltk.Nonterminal(label), [token])
    # Update our count of this particular production.
    if production not in self.count_per_production:
        self.count_per_production[production] = 1
    else:
        self.count_per_production[production] += 1
    # Update our count of all productions with a particular LHS.
    lhs = production.lhs()
    if lhs not in self.lhs_count:
        self.lhs_count[lhs] = 1
    else:
        self.lhs_count[lhs] += 1
    return production
def __init__(self, pcfg_filename):
    start = None
    # Read the file:
    with open(pcfg_filename, "r") as f:
        lines = f.readlines()
    # Parse the file's contents:
    productions = []
    for line in lines:
        matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]?", line)
        groups = matches.groups()
        assert len(groups) == 4
        # Read the production rule:
        lhs = nltk.Nonterminal(groups[0].strip())
        if groups[2] is None:
            production = nltk.Production(lhs, [groups[1].strip("'")])
        else:
            production = nltk.Production(lhs, [
                nltk.Nonterminal(groups[1].strip()),
                nltk.Nonterminal(groups[2].strip()),
            ])
        log_probability = math.log(float(groups[3].strip()))
        if start is None:
            start = lhs
        productions.append(production)
        self.log_probability_of_production[production] = log_probability
        # Track the log probability of largest magnitude, i.e. the least
        # probable rule seen so far.
        if (self.min_log_probability is None
                or math.fabs(log_probability) > math.fabs(self.min_log_probability)):
            self.min_log_probability = log_probability
    self.grammar = nltk.grammar.CFG(start, productions, False)
    # Make it much less probable than the actual minimum but still non-zero:
    # doubling a negative log probability squares (shrinks) the underlying
    # probability.
    self.min_log_probability = self.min_log_probability * 2
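# Worked example of the fallback value, assuming the least probable rule in
# the file has probability 0.01: log(0.01) ~= -4.6, and doubling gives ~= -9.2,
# i.e. a probability of about 1e-4 -- far below every real rule, but non-zero.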
def induce_structure(self, sentences):
    sentences = [[c for c in s] for s in sentences]
    start_symbols = set()
    productions = []
    prod_table = {}
    # group all digits together
    digit_terminals = set(str(i) for i in range(10))
    # unary rules
    terminals = set()
    for s in sentences:
        terminals.update(s)
    for t in terminals:
        if t in digit_terminals:
            nt = nltk.Nonterminal("Digit")
        else:
            nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
        p = Production(nt, [t])
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
    sentences = self.apply_unary_prod(sentences, prod_table)
    while len(sentences) > 0:
        if self.has_recursion(sentences):
            p = self.generate_recursive_prod(sentences)
        else:
            p = self.generate_most_frequent_prod(sentences)
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
        sentences = self.update_with_prod(sentences, prod_table)
        # fully reduced sentences yield start symbols
        new_sentences = []
        for s in sentences:
            if len(s) == 1:
                start_symbols.add(s[0])
            else:
                new_sentences.append(s)
        sentences = new_sentences
    # generate the start productions (iterate over a snapshot so the
    # appends below don't extend the loop)
    for symbol in start_symbols:
        for p in list(productions):
            if p.lhs() == symbol:
                productions.append(Production(self.start, p.rhs()))
    self.grammar = nltk.induce_pcfg(self.start, productions)
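# Hypothetical usage: sentences are plain strings that induce_structure
# splits into characters itself, so the digit grouping applies per character.
# inducer.induce_structure(["12+34", "56+7"])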
def pcfg_reverse(word):
    s = build_tree(word, 0)
    tree = nltk.Tree.fromstring(s)
    productions = tree.productions()
    ##################################################
    # !!! THIS IS WHERE THE MAGIC HAPPENS !!!        #
    # for p in productions:                          #
    #     if len(p._rhs) > 1:                        #
    #         p._rhs = (p._rhs[1], p._rhs[0])        #
    ##################################################
    grammar = nltk.induce_pcfg(nltk.Nonterminal("N0"), productions)
    # print(grammar)  # UNCOMMENT FOR A FUN TIME!
    parser = nltk.pchart.InsideChartParser(grammar)
    # Shuffle to generate up to 1000 candidate words; only the correct
    # solution will be parseable with our grammar!
    for i in range(1000):
        cand = random.sample(word, len(word))
        # print(cand)  # UNCOMMENT FOR A FUN TIME!
        for parse in parser.parse(cand):
            if parse.prob() > 0:
                # print("number of tries: {}".format(i))  # UNCOMMENT!
                return "".join(cand)
    return "no reverse found, try again"
def expand_all(grammar, nonterm, state):
    result = ""
    # do this iteratively; recursing blows past python's recursion limit
    queue = Queue.LifoQueue()
    queue.put_nowait(nonterm)
    in_list = None
    len_at_start_of_list = 0
    while not queue.empty():
        head = queue.get_nowait()
        # Keep track of being in a list until all the bits for the list
        # have been used up
        if head in state.list_bits:
            in_list = head
            len_at_start_of_list = queue.qsize()
        # done with the list once we consume the next item in the queue
        if in_list and queue.qsize() < len_at_start_of_list:
            in_list = None
        terms = expand(grammar, head, state, in_list)
        if len(terms) == 0:
            if isinstance(head, basestring):
                result = " ".join([result, head])
            else:
                result = " ".join([result, str(head)])
        else:
            # put them into the lifo queue backwards, so we'll get the
            # first one out
            for nt in reversed(terms):
                if nt in state.common.append_newlines():
                    queue.put_nowait(nltk.Nonterminal("\n"))
                queue.put_nowait(nt)
    return result
def fit_pcfg(self, X):
    if self.fitted_pcfg:
        raise ValueError("PCFG.pcfg already fitted")
    productions = []
    for sentence in X:
        # nltk format
        t = nltk.tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)
        # Chomsky normal form
        self.chomkysation(t)
        # rule extraction
        rules = self.extract_rules(t, lexical=False)
        productions.extend(rules)
    start = nltk.Nonterminal('SENT')
    self.pcfg_ = nltk.induce_pcfg(start, productions)
    self.pcfg_.chomsky_normal_form(flexible=False)
    # get tokens
    for prod in self.pcfg_._productions:
        for token in prod._rhs:
            if token != 'SENT':
                self.non_terminals.append(token)
    self.non_terminals.insert(0, start)
    # get token-to-index mapping
    self.pos2index = {}
    for i, token in enumerate(self.non_terminals):
        self.pos2index[token] = i
    self.fitted_pcfg = True
def convert_hybrid_productions(self, production):
    """
    Convert a hybrid production into valid CNF by creating new
    non-terminals for any terminals in the production
    :param production: a hybrid production
    """
    terminal_list = []
    new_rhs = []
    for node in production.rhs():
        if nltk.grammar.is_nonterminal(node):
            new_rhs.append(node)
        else:
            terminal_list.append(node)
            # replace each terminal with a new non-terminal
            new_rhs.append(nltk.Nonterminal(node.upper()))
    if self.is_long(new_rhs):
        self.convert_long_productions(production.lhs().symbol(), new_rhs)
    else:
        self.add_production(production.lhs().symbol(),
                            self.production_string(new_rhs))
    # add new productions for each new non-terminal created
    for string in terminal_list:
        self.add_production(string.upper(), self.production_string([string]))
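# Illustration (not from the original source) of what the conversion does to
# a hybrid rule, assuming add_production(lhs, rhs_string) records a rule:
#
#   A -> 'b' C        becomes        A -> B C
#                                    B -> 'b'
#
# i.e. each terminal is replaced by an upper-cased fresh non-terminal.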
def build(self, rhs):
    newKey = 'X' + str(self.seed)
    self.seed += 1
    lhs = nltk.Nonterminal(newKey)
    return self.buildNormal(lhs, rhs)
def _train_rules_grammar(self):
    print("training grammar")
    self._grammar = nltk.induce_pcfg(
        nltk.Nonterminal('TOP'),
        reduce(lambda a, b: a + b,
               map(lambda t: t.productions(), self._treebank)))
    if RUN_MODE == PURE_CKY_M and UNKOWN_MODE:
        add_unknowns(self._grammar)
    print("finished grammar training")
def create_rule_with_RHS(rhs):
    global counter
    newKey = 'X' + str(counter)
    counter += 1
    lhs = nltk.Nonterminal(newKey)
    if isinstance(rhs, str):
        rhs = [rhs]
    return create_rule(lhs, rhs)
def reset_to_previous_finished_query(self):
    end = None
    for i, production in reversed(list(enumerate(self.actions))):
        if production.lhs() == nltk.Nonterminal('_Q_'):
            end = i
            break
    productions = self.actions[:end]
    self.reset()
    for production in productions:
        self.apply_production(production)
def rule_to_tuple(rule, nonterminals):
    rhs = []
    for token in rule:
        assert isinstance(token, tuple)
        assert len(token) == 1
        symbol = token[0]
        if symbol in nonterminals:
            rhs.append(nltk.Nonterminal(symbol))
        else:
            rhs.append(symbol)
    return tuple(rhs)
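# Quick sanity check of rule_to_tuple with a hypothetical rule: symbols found
# in `nonterminals` become nltk.Nonterminal objects, the rest stay strings.
assert rule_to_tuple([("NP",), ("runs",)], {"NP"}) == (nltk.Nonterminal("NP"), "runs")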
def choose(nonterm, in_list, prods, state):
    if nonterm in state.last_or_nots:
        # if True, use the last one, otherwise, use anything but.
        # this choice uses no bits
        if state.last_or_nots[nonterm]:
            return prods[-1]
        else:
            return random.choice(prods[:-1])
    elif state.bitstring.index >= len(state.bitstring.bitstring):
        # We're past the end of the message, so just pick randomly
        return random.choice(prods)
    elif len(prods) < 3:
        return prods[0]

    bits = int(math.log(len(prods) - 1, 2))
    prevPow2 = math.pow(2, bits)
    # For lists, only pick the end of the list once we've used all
    # the bits, or we're out of bits. Unless we're the last list
    # left, then keep going until we consume all bits
    end_list = False
    if in_list and in_list in state.list_bits:
        bits_left = state.list_bits[in_list]
        if (bits_left <= 0 and len(state.list_bits) > 1
                and nonterm in state.common.list_recursive_terms()):
            end_list = True
            del state.list_bits[in_list]
        else:
            state.list_bits[in_list] = bits_left - bits
            #print("Consuming %s bits for list %s (%s left)" %
            #      (bits, in_list, state.list_bits[in_list]))
    # otherwise, use the first 'bits' bits to pick the index
    index = int(prevPow2)
    if not end_list:
        strindex = state.bitstring.bitstring[
            state.bitstring.index:bits + state.bitstring.index]
        # now interpret the bit slice as an int
        index = int(strindex, 2)
        #print("(%s) Using bits %s (%s -> %s)" %
        #      (len(state.bitstring.bitstring) - state.bitstring.index,
        #       strindex, nonterm, prods[index]))
        state.bitstring.index += bits
        state.bitstring.index = min(state.bitstring.index,
                                    len(state.bitstring.bitstring))
    prod = prods[index]
    if len(state.list_bits) == 0 and nonterm == nltk.Nonterminal("CFP_BODY"):
        # set the list bits
        state.list_bits.update(
            state.common.calc_list_bits(len(state.input_text) * 8, prod))
    return prod
def createGrammar(self, userMessages, ctx):
    parser = CoreNLPParser(url='http://localhost:9000')
    parse_trees = []
    for message in userMessages:
        tokenized = nltk.sent_tokenize(message)
        for sentence in tokenized:
            parse_trees.append(list(parser.raw_parse(sentence))[0])
    grammar_rules = set()
    for tree in parse_trees:
        for production in tree.productions():
            grammar_rules.add(production)
    start = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(start, grammar_rules)
    return ' '.join(self.generate_sentence(grammar))
def pcfg_train(trees, vocab):
    # Write a function pcfg_train() that takes as its input a collection
    # of nltk.tree.Tree objects. For example, it might be passed some
    # portion of nltk.corpus.treebank.parsed_sents(). This function
    # should return a nltk.PCFG object.
    all_productions = []
    for t in trees:
        for p in t.productions():
            all_productions.append(nltk.Production(p.lhs(), p.rhs()))
    pcfg = nltk.induce_pcfg(nltk.Nonterminal('S'), all_productions)
    return pcfg
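# A minimal sketch of training and using the result, assuming the Penn
# Treebank sample bundled with nltk is available; note that `vocab` is
# unused by this implementation.
import nltk
trees = nltk.corpus.treebank.parsed_sents()[:50]
pcfg = pcfg_train(trees, vocab=None)
parser = nltk.ViterbiParser(pcfg)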
def parse(self, sent):
    lhs = [pair[0] for pair in self.grammar]
    rhs = [pair[1] for pair in self.grammar]
    stack = []
    index = 0
    shift = True
    while index != len(sent):
        reduced = False
        if shift:
            stack.append(sent[index])
            index += 1
        tags = [x.label() if not isinstance(x, tuple)
                else funcs.ConvertPosTag(x[1]) for x in stack]
        for i in range(len(stack) - 2, -1, -1):
            if tags[i:] in rhs:
                shift = False
                reduced = True
                parent = lhs[rhs.index(tags[i:])]
                children = stack[i:]
                tree = nltk.tree.Tree(parent, children)
                stack = stack[:i]
                stack.append(tree)
                break
        if not reduced:
            shift = True
    s = nltk.Nonterminal("S")
    tree = nltk.tree.Tree(s, stack)
    return tree
def train_pcfg():
    print('training grammar')
    productions = []
    # print(len(treebank.fileids()))
    # use fewer fileids (up to 199 fewer) for a shorter grammar and quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Collapse unary branches A->B->C into A->B+C so we can avoid
            # infinite productions
            tree.collapse_unary(collapsePOS=False)
            # Convert A->(B,C,D) into A->B,C+D->D (binarization required by CKY parser).
            # Horizontal and vertical Markovization: remember parents and siblings
            # in the tree. This gives a performance boost, but makes the grammar
            # HUGE. If we used these we would need a tag-forgetting method.
            #tree.chomsky_normal_form(horzMarkov=0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print("grammar trained!")
    return grammar
def buildGrammar(self):
    print('Starting building Grammar...')
    start = time.time()
    # Adapted from https://www.cs.bgu.ac.il/~elhadad/nlp16/NLTK-PCFG.html
    # (cells 20/21), modified so that we only keep terminals which are PoS tags.
    productions = []
    # These will be used in the buildLexicon function
    self.tokens = set()   # Tokens in the vocabulary
    self.posTags = set()  # PoS tags linked to these tokens
    for tree in self.trees:
        for production in tree.productions():
            if not production.is_lexical():
                # Keep only rules whose terminals are PoS tags; rules whose
                # terminals are tokens are excluded from the grammar.
                productions.append(production)
            else:
                # Remember lexical rules (token terminals) for the lexicon.
                self.tokens.add(production.rhs()[0])
                self.posTags.add(production.lhs())
    root = nltk.Nonterminal('SENT')
    grammar = nltk.induce_pcfg(root, productions)
    end = time.time()
    print('... Grammar built. Time: {:.2f}s'.format(end - start))
    return grammar
from collections import defaultdict

import nltk
from nltk import ProbabilisticProduction

UNKNOWN_T = nltk.Nonterminal('UNKOWN')


def add_unknowns(grammar):
    terminal_derivation_non_t = set(
        prod.lhs() for prod in grammar.productions() if prod.is_lexical())
    terminal_derivation_probs = defaultdict(int)
    for term_prod in terminal_derivation_non_t:
        terminal_derivation_probs[term_prod] = sum(
            prod.prob() for prod in grammar.productions(lhs=term_prod)
            if prod.is_lexical())
        if terminal_derivation_probs[term_prod] > 1:
            terminal_derivation_probs[term_prod] = 1
    # per-LHS counts and probability mass for the left and right children
    total_non_t_rules = defaultdict(lambda: {
        'l': defaultdict(lambda: [0, 0.0]),
        'r': defaultdict(lambda: [0, 0.0])
    })
    for prod in grammar.productions():
        rule = prod.lhs()
        if len(prod.rhs()) > 1:
            l_non_t, r_non_t = prod.rhs()
            prob_l_derives_t = terminal_derivation_probs[l_non_t]
            prob_r_derives_t = terminal_derivation_probs[r_non_t]
            total_non_t_rules[rule]['l'][l_non_t][0] += 1
            total_non_t_rules[rule]['l'][l_non_t][1] += prod.prob()
        self._grammar._productions.append(
            ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()
        print(self._grammar)
        return super(PCFGViterbiParser, self).parse(tokens)


f = "/home/esyir/Documents/A-star/NLP/data/GENIA_treebank_v1/10022882.xml"
tree = xmltree.analyze(f)
production = tree[0].productions()
S = nltk.Nonterminal('S')
grammar = nltk.induce_pcfg(S, production)
viterbi_parser = PCFGViterbiParser.train(production, 'S')
tokenized = st.preprocess(
    "all of these results suggested that the slps of both l. acidophilus "
    "strains possessed murein hydrolase activities that were sublethal to "
    "e. coli cells.")
a123 = ("all of these results suggested that the slps of both l. acidophilus "
        "strains possessed murein hydrolase activities that were sublethal to "
        "e. coli cells.")
tokenized2 = tokenized[0]
print(tokenized2)
print("BAM")
def generate_with_word(start_word):
    start_symbol = nltk.Nonterminal("wypowiedzenie:|" + start_word)
    result = generate_symbols(start_symbol)
    print(" ".join(result))
from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

treebank = BracketParseCorpusReader(
    "resources/",
    "skladnica_with_heads.txt",
)

productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        #tree.draw()
        productions += tree.productions()

grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:|'), productions)
print(grammar.start())
#print(grammar.productions())
#print(grammar._lhs_index)
#print(grammar.productions(lhs=grammar.start()))
#print(grammar.productions(lhs=nltk.Nonterminal("wypowiedzenie:|mogę")))
#print(grammar.productions(lhs=nltk.Nonterminal("znakkonca:|.")))

used_symbols = []


def generate_symbols(symbol):
    if nltk.grammar.is_terminal(symbol):
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', metavar='S', type=int,
                        help='the random number generator seed')
    parser.add_argument('--socket', metavar='SOCKET', type=str,
                        help='the local socket to bind to')
    parser.add_argument('--infile', metavar='FILE', type=str,
                        help='read from this file instead of stdin')
    parser.add_argument('--outfile', metavar='FILE', type=str,
                        help='write to this file instead of stdout')
    parser.add_argument('--website', metavar='W', type=str,
                        help='a website link to include, if any '
                             '(must start with "http://")')
    args = parser.parse_args()

    if args.socket:
        ok = scipherd.call_daemon(args.socket, True, args.infile, args.outfile)
        if ok:
            sys.exit(0)
        else:
            sys.exit(-1)

    if args.seed:
        seed = args.seed
    else:
        seed = random.randint(0, 2**32)
    random.seed(seed)
    sys.stderr.write("Random seed: %d\n" % seed)

    input_text = ""
    for line in sys.stdin:
        input_text += line.decode('utf-8')
    if len(input_text) > 2**20:
        print "Input text must be smaller than 1MB."
        sys.exit(-1)

    common = cfp_common.CfpCommon.get_latest_common()
    space_before = re.compile(r'\s([%s])' % common.chars_to_remove_a_space_before())
    space_after = re.compile(r'([%s])\s' % common.chars_to_remove_a_space_after())
    last_or_nots = common.choose_last_or_nots()
    if args.website:
        if args.website.find("http://") != 0:
            sys.stderr.write("Bad website: %s\n" % args.website)
            sys.exit(-1)
        last_or_nots[nltk.Nonterminal("SUBMIT_CLOSING")] = True

    # load grammars
    header_grammar = nltk.data.load("file:%s" % common.header_cfg_filename(), 'cfg')
    body_grammar = nltk.data.load("file:%s" % common.body_cfg_filename(), 'cfg')

    state = EncodeState(input_text, Bitstring(), common, header_grammar,
                        body_grammar, {}, space_before, space_after,
                        last_or_nots, LastTime())
    (header, body) = do_encode(state, args.website)
    print header
    print ""
    print body
def main():
    options = parse_args()

    # parameter set 1
    #assert(options.corpus_name != None)
    assert options.input_directory is not None
    assert options.output_directory is not None

    input_directory = options.input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    assert options.grammar_file is not None
    grammar_file = options.grammar_file
    assert os.path.exists(grammar_file)

    # Documents
    train_docs = []
    input_stream = open(os.path.join(input_directory, 'train.dat'), 'r')
    for line in input_stream:
        train_docs.append(line.strip())
    input_stream.close()
    print("successfully loaded all training documents...")

    # parameter set 2
    if options.number_of_documents > 0:
        number_of_documents = options.number_of_documents
    else:
        number_of_documents = len(train_docs)
    if options.batch_size > 0:
        batch_size = options.batch_size
    else:
        batch_size = number_of_documents
    #assert(number_of_documents % batch_size == 0)
    training_iterations = number_of_documents / batch_size
    if options.training_iterations > 0:
        training_iterations = options.training_iterations
    #training_iterations = int(math.ceil(1.0 * number_of_documents / batch_size))

    #multiprocesses = options.multiprocesses
    assert options.number_of_processes >= 0
    number_of_processes = options.number_of_processes

    # parameter set 3
    assert options.grammaton_prune_interval > 0
    grammaton_prune_interval = options.grammaton_prune_interval
    snapshot_interval = grammaton_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval
    assert options.tau >= 0
    tau = options.tau
    #assert(options.kappa >= 0.5 and options.kappa <= 1)
    assert options.kappa >= 0 and options.kappa <= 1
    kappa = options.kappa
    if batch_size <= 0:
        print("warning: running in batch mode...")
        kappa = 0

    # read in adaptor grammars
    desired_truncation_level = {}
    alpha_pi = {}
    beta_pi = {}
    grammar_rules = []
    adapted_non_terminals = set()
    #for line in codecs.open(grammar_file, 'r', encoding='utf-8'):
    for line in open(grammar_file, 'r'):
        line = line.strip()
        if line.startswith("%"):
            continue
        if line.startswith("@"):
            tokens = line.split()
            assert len(tokens) == 5
            adapted_non_terminal = nltk.Nonterminal(tokens[1])
            adapted_non_terminals.add(adapted_non_terminal)
            desired_truncation_level[adapted_non_terminal] = int(tokens[2])
            alpha_pi[adapted_non_terminal] = float(tokens[3])
            beta_pi[adapted_non_terminal] = float(tokens[4])
            continue
        grammar_rules.append(line)
    grammar_rules = "\n".join(grammar_rules)

    # Warning: if you are using nltk 2.x, please use parse_grammar()
    #from nltk.grammar import parse_grammar, standard_nonterm_parser
    #start, productions = parse_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    from nltk.grammar import read_grammar, standard_nonterm_parser
    start, productions = read_grammar(grammar_rules, standard_nonterm_parser,
                                      probabilistic=False)
    print("start, productions: ", start, productions)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S")
    #desired_truncation_level_string = "".join(["%s%d" % (symbol, desired_truncation_level[symbol]) for symbol in desired_truncation_level])
    #alpha_pi_string = "".join(["%s%d" % (symbol, alpha_pi[symbol]) for symbol in alpha_pi])
    #beta_pi_string = "".join(["%s%d" % (symbol, beta_pi[symbol]) for symbol in beta_pi])
    #output_directory += "-" + str(now.microsecond) + "/"
    suffix += "-D%d-P%d-S%d-B%d-O%d-t%d-k%g-G%s/" % (
        number_of_documents,
        #number_of_topics,
        grammaton_prune_interval,
        snapshot_interval,
        batch_size,
        training_iterations,
        tau,
        kappa,
        #alpha_theta,
        #alpha_pi_string,
        #beta_pi_string,
        #desired_truncation_level_string,
        os.path.basename(grammar_file))
    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("grammar_file=" + str(grammar_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_processes=" + str(number_of_processes) + "\n")
    #options_output_file.write("multiprocesses=" + str(multiprocesses) + "\n")
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("training_iterations=" + str(training_iterations) + "\n")
    # parameter set 3
    options_output_file.write("grammaton_prune_interval=" + str(grammaton_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    # parameter set 4
    #options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n")
    options_output_file.write("alpha_pi=%s\n" % alpha_pi)
    options_output_file.write("beta_pi=%s\n" % beta_pi)
    options_output_file.write("desired_truncation_level=%s\n" % desired_truncation_level)
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n")
    options_output_file.close()

    print("========== ========== ========== ========== ==========")
    # parameter set 1
    print("output_directory=" + output_directory)
    print("input_directory=" + input_directory)
    print("corpus_name=" + corpus_name)
    print("grammar_file=" + str(grammar_file))
    # parameter set 2
    print("number_of_documents=" + str(number_of_documents))
    print("batch_size=" + str(batch_size))
    print("training_iterations=" + str(training_iterations))
    print("number_of_processes=" + str(number_of_processes))
    #print("multiprocesses=" + str(multiprocesses))
    # parameter set 3
    print("grammaton_prune_interval=" + str(grammaton_prune_interval))
    print("snapshot_interval=" + str(snapshot_interval))
    print("tau=" + str(tau))
    print("kappa=" + str(kappa))
    # parameter set 4
    #print("alpha_theta=" + str(alpha_theta))
    print("alpha_pi=%s" % alpha_pi)
    print("beta_pi=%s" % beta_pi)
    print("desired_truncation_level=%s" % desired_truncation_level)
    # parameter set 5
    #print("heldout_data=" + str(heldout_data))
    print("========== ========== ========== ========== ==========")

    import hybrid
    print("passing productions: ", productions)
    adagram_inferencer = hybrid.Hybrid(start, productions, adapted_non_terminals)
    adagram_inferencer._initialize(number_of_documents, batch_size, tau, kappa,
                                   alpha_pi, beta_pi, None,
                                   desired_truncation_level,
                                   grammaton_prune_interval)

    '''
    clock_iteration = time.time()
    clock_e_step, clock_m_step = adagram_inferencer.seed(train_docs)
    clock_iteration = time.time() - clock_iteration
    print('E-step, M-step and Seed take %g, %g and %g seconds respectively...'
          % (clock_e_step, clock_m_step, clock_iteration))
    '''

    #adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-0"))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-0"))

    random.shuffle(train_docs)
    training_clock = time.time()
    snapshot_clock = time.time()
    for iteration in range(int(training_iterations)):
        start_index = batch_size * iteration
        end_index = batch_size * (iteration + 1)
        # integer division: detect when this mini-batch wraps past the end
        # of the corpus
        if start_index // number_of_documents < end_index // number_of_documents:
            train_doc_set = train_docs[(batch_size * iteration) % number_of_documents:]
            random.shuffle(train_docs)
            train_doc_set += train_docs[:(batch_size * (iteration + 1)) % number_of_documents]
        else:
            train_doc_set = train_docs[
                (batch_size * iteration) % number_of_documents:
                (batch_size * (iteration + 1)) % number_of_documents]

        clock_iteration = time.time()
        #print("processing document:", train_doc_set)
        clock_e_step, clock_m_step = adagram_inferencer.learning(
            train_doc_set, number_of_processes)

        if (iteration + 1) % snapshot_interval == 0:
            #pickle_file = open(os.path.join(output_directory, "model-%d" % (adagram_inferencer._counter + 1)), 'wb')
            #pickle.dump(adagram_inferencer, pickle_file)
            #pickle_file.close()
            adagram_inferencer.export_adaptor_grammar(
                os.path.join(output_directory, "infag-" + str(iteration + 1)))
            #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str(iteration + 1)))

        if (iteration + 1) % 1000 == 0:
            snapshot_clock = time.time() - snapshot_clock
            print('Processing 1000 mini-batches takes %g seconds...' % snapshot_clock)
            snapshot_clock = time.time()

        clock_iteration = time.time() - clock_iteration
        print('E-step, M-step and iteration %d take %g, %g and %g seconds respectively...'
              % (adagram_inferencer._counter, clock_e_step, clock_m_step,
                 clock_iteration))

    adagram_inferencer.export_adaptor_grammar(
        os.path.join(output_directory,
                     "infag-" + str(adagram_inferencer._counter + 1)))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str(iteration + 1)))

    pickle_file = open(os.path.join(output_directory, "model-%d" % (iteration + 1)), 'wb')
    pickle.dump(adagram_inferencer, pickle_file)
    pickle_file.close()

    training_clock = time.time() - training_clock
    print('Training finished in %g seconds...' % training_clock)
treeData = getTreeData(data)
print("Done!")
print(" ")

########################
## Q u e s t i o n 6a ##
########################

# list to store all the extracted rules
treeProductions = []

# productions() is used to extract the grammar rules for each sentence
for tr in treeData:
    treeProductions += tr.productions()

S = nltk.Nonterminal("S")
grammar = nltk.induce_pcfg(S, treeProductions)

### Extracting PCFG to a text file
#grammar_PCFG = str(grammar)
#file = open('/Users/mayapetranova/Documents/QMUL/NLP/assignment_2/6/PCFG.txt', 'w')
#file.write(grammar_PCFG)
#file.close()

########################
## Q u e s t i o n 6b ##
########################

sentence = "show me the meals on the flight from Phoenix".split()
parser = pchart.InsideChartParser(grammar)
for tp in parser.parse(sentence):
    print(tp)
def get_number(tree, grammar, state, in_list_arg=None):
    if type(tree) != nltk.Tree:
        return ((tree,), [])
    nt_label = nltk.Nonterminal(tree.label())
    if state.done.done:
        return ((nt_label,), [])
    prods = grammar.productions(nt_label)
    rhs = ()
    num = []

    # come up with a format with the right number of leading zeroes
    bits = 0
    if len(prods) >= 3:
        bits = int(math.log(len(prods) - 1, 2))
    formatstr = "{0:0%db}" % bits
    prevPow2 = math.pow(2, bits)

    in_list = in_list_arg
    if not in_list and nt_label in state.list_bits:
        in_list = nt_label

    use_bits = True
    if nt_label in state.common.choose_last_or_nots():
        use_bits = False

    # Consume list bits before we start recursing, since that's the order
    # ./encode.py does it.
    is_list = False
    if bits > 0 and in_list in state.list_bits:
        is_list = True
        if use_bits:
            # if we know that this is going to be the end of a list such
            # that the power of 2 was chosen, then don't bother subtracting
            # the bits from the main done.
            if (nt_label in state.common.list_recursive_terms()
                    and state.list_bits[in_list] <= 0
                    and len(state.list_bits) > 1):
                use_bits = False
            state.list_bits[in_list] -= bits
            #print ("Consumed %d bits for list %s, %s left" %
            #       (bits, in_list, state.list_bits[in_list]))

    prev_bits_left = state.done.bits_left
    if use_bits:
        #print "(%s) Consumed %s bits for %s" % (prev_bits_left, bits, nt_label)
        state.done.bits_left -= bits

    # set up list_bits if needed, before recursing:
    subtrees = tree.subtrees().next()
    if len(state.list_bits) == 0 and nt_label == nltk.Nonterminal("CFP_BODY"):
        body_rhs = tuple(nltk.Nonterminal(t.label()) for t in subtrees)
        for p in prods:
            if p.rhs() == body_rhs:
                state.list_bits.update(
                    state.common.calc_list_bits(state.done.total_len, p))
                break

    child_bits = 0
    for t in subtrees:
        (t_rhs, t_num) = get_number(t, grammar, state, in_list)
        rhs += t_rhs
        num.extend(t_num)

    end_list = False
    if bits == 0:
        # If this had fewer than 3 rules, the first one was always
        # used, so don't produce any bits
        return ((nt_label,), num)
    elif is_list:
        if (in_list not in state.list_bits
                or (state.list_bits[in_list] <= 0 and len(state.list_bits) > 1)):
            end_list = True

    for i in range(len(prods)):
        if prods[i].rhs() == rhs:
            if (len(state.list_bits) == 0
                    and nt_label == nltk.Nonterminal("CFP_BODY")):
                state.list_bits.update(state.common.calc_list_bits(
                    state.done.total_len, prods[i]))
            if prev_bits_left <= 0:
                state.done.done = True
                return ((nt_label,), [])
            elif is_list and i == prevPow2:
                if use_bits:
                    state.done.bits_left += bits  # encode didn't count these
                if in_list in state.list_bits:
                    bits_left = state.list_bits[in_list]
                    if bits_left <= 0 and len(state.list_bits) > 1:
                        del state.list_bits[in_list]
                # end of the list -- still count the choices below us
                return ((nt_label,), num)
            else:
                istring = formatstr.format(i)
                #print ("(%s) Using %s bits %s (%s -> %s)" %
                #       (prev_bits_left, bits, istring, nt_label, rhs))
                return ((nt_label,), [istring] + num)
    print "Couldn't find rhs for label %s, rhs %s" % (rhs, tree.label())
"""
@author: SHILPASHREE RAO
"""
import collections
import sys

import nltk
from nltk import Tree

#with open(sys.argv[1], 'r') as f:
with open('train.trees.pre.unk', 'r') as f:
    data = f.readlines()

productions = []
TOP = nltk.Nonterminal('TOP')
for i in data:
    t = Tree.fromstring(i)
    productions += t.productions()
grammar = nltk.induce_pcfg(TOP, productions)

lefRule = [p.lhs() for p in productions]
rightRule = [p.rhs() for p in productions]
count = collections.Counter(zip(lefRule, rightRule))
rule, freq = count.most_common(1)[0]
print("Most frequently occurring rule and its frequency:", (rule, freq))
num = len(grammar.productions())
import nltk
import ancora

path = 'ancora/ancora-3.0.1es/'
corpus = ancora.AncoraCorpusReader(path)

t = corpus.parsed_sents()[0]
t.draw()
t.productions()

prods = []
for t in corpus.parsed_sents():
    prods += t.productions()
#print(prods)

S = nltk.Nonterminal('sentence')
grammar = nltk.induce_pcfg(S, prods)
#prods2 = grammar.productions(lhs=nltk.Nonterminal('ncms000'))
#print(prods2)

print("===============================================================")
print("===============================================================")

parser = nltk.ViterbiParser(grammar)
for tree in parser.parse("El gato come pescado crudo .".split()):
    print(tree)
    tree.draw()
    print(tree.prob())