def length_kld(self, mypcfg):
    """
    Compute the KL divergence between the length distribution given by
    self.weights and the length distribution that mypcfg actually defines.
    """
    insider = inside.UnaryInside(mypcfg)
    chart = insider.compute_inside_smart(len(self.weights))
    mass = sum(self.weights)
    divergence = 0.0
    for n, weight in enumerate(self.weights):
        # Zero-weight lengths contribute nothing to the KLD (0 * log 0 = 0).
        if weight <= 0:
            continue
        p = weight / mass
        q = chart[n, insider.start]
        divergence += p * math.log(p / q)
    return divergence
def sample(self, ignore_errors=False, viterbi=False):
    """
    Return a PCFG that is well behaved. This may not generate strings of
    all lengths.

    Strategy: sample a trimmed CFG, collapse every lexical rule onto a
    single dummy terminal, fit the length distribution with EM on that
    unary grammar, then re-expand the dummy terminal into real lexical
    productions with freshly sampled probabilities.

    ``ignore_errors`` and ``viterbi`` are currently unused; they are kept
    for interface compatibility with callers.
    """
    cfg = self.cfgfactory.sample_trim()

    # Collapse the lexicon: every lexical production (nt, word) becomes the
    # single production (nt, _unary_), so EM only has to model lengths.
    unary = "_unary_"
    unary_pcfg = pcfg.PCFG()
    unary_pcfg.start = cfg.start
    unary_pcfg.terminals.add(unary)
    unary_pcfg.nonterminals = set(cfg.nonterminals)
    productions = set()
    lexical_index = defaultdict(list)  # nt -> words it originally rewrote to
    # Sort for determinism: raw set iteration order would be nondeterministic.
    for prod in sorted(cfg.productions):
        if len(prod) == 3:
            productions.add(prod)
        else:
            productions.add((prod[0], unary))
            lexical_index[prod[0]].append(prod[1])
    unary_pcfg.productions = list(productions)
    unary_pcfg.parameters = {prod: 1.0 for prod in productions}
    unary_pcfg.normalise()
    self.current = unary_pcfg

    # Compute the best achievable log-probability as the EM target.
    logging.info("Training with LENGTH_EM_MAX_LENGTH %d ", LENGTH_EM_MAX_LENGTH)
    targetlp = self.length_distribution.ml_lp(LENGTH_EM_MAX_LENGTH)
    valid_target = self.length_distribution.ml_lp_general(LENGTH_EM_MAX_LENGTH)
    logging.info("Target LP = %f, %f", targetlp, valid_target)

    ui = inside.UnaryInside(unary_pcfg)
    for i in range(LENGTH_EM_ITERATIONS):
        logging.info("Starting EM iteration %d, target = %f", i, targetlp)
        _lp, kld = ui.train_once_smart(self.length_distribution.weights,
                                       LENGTH_EM_MAX_LENGTH)
        logging.info("KLD from target %f", kld)
        if kld < TERMINATION_KLD:
            logging.info("Converged enough. %f < %f ", kld, TERMINATION_KLD)
            break
    else:
        # for/else: only runs if the loop exhausted without break.
        logging.warning(
            "Reached maximum number of iterations without reaching "
            "convergence threshold. Final KLD is %f.", kld)
    unary_pcfg.parameters = ui.get_params()
    unary_pcfg.trim_zeros()

    # Re-expand the dummy terminal: split each (nt, _unary_) probability
    # mass across freshly sampled lexical probabilities.
    final_pcfg = pcfg.PCFG()
    # Sort to avoid nondeterminism here.
    sorted_nonterminals = sorted(cfg.nonterminals)
    for nt in sorted_nonterminals:
        if (nt, unary) in unary_pcfg.parameters:
            totalp = unary_pcfg.parameters[(nt, unary)]
            k = len(lexical_index[nt])
            probs = self.lexical_distribution.sample(k)
            assert len(probs) == k
            for a, p in zip(lexical_index[nt], probs):
                prod = (nt, a)
                final_pcfg.productions.append(prod)
                final_pcfg.parameters[prod] = p * totalp
    for prod in unary_pcfg.productions:
        if len(prod) == 3:
            final_pcfg.productions.append(prod)
            final_pcfg.parameters[prod] = unary_pcfg.parameters[prod]
    final_pcfg.start = unary_pcfg.start
    final_pcfg.nonterminals = unary_pcfg.nonterminals
    final_pcfg.terminals = cfg.terminals
    final_pcfg.normalise()
    return final_pcfg
def sample_smart(self):
    """
    First construct a unary grammar sampling from a Dirichlet.
    Then train it against the target length distribution.
    Then resample the lexicon.
    """
    logging.info("Sampling cfg")
    self.cfgfactory.number_terminals = 1
    mycfg = self.cfgfactory.sample_full()
    logging.info("Sampled unary cfg")
    # Number of binary productions per nonterminal; sample_full gives the
    # full (nonterminals - 1)^2 grid — presumably excluding the start
    # symbol on right-hand sides. TODO confirm against cfgfactory.
    nb = (self.nonterminals - 1) ** 2
    parameters = {}
    terminals = list(mycfg.terminals)
    unary_terminal = terminals[0]
    # Sort both for determinism.
    snt = sorted(mycfg.nonterminals)
    prods = sorted(mycfg.productions)
    for a in snt:
        # Split mass between the single unary rule and the binary rules.
        unary_prob = numpy.random.random()
        logging.info("Unary probs %s %f" % (a, unary_prob))
        parameters[(a, unary_terminal)] = unary_prob
        probs = self.binary_distribution.sample(nb)
        for i, prod in enumerate(
                [prod for prod in prods if len(prod) == 3 and prod[0] == a]):
            parameters[prod] = (1 - unary_prob) * probs[i]
    unary_pcfg = pcfg.PCFG(cfg=mycfg)
    unary_pcfg.parameters = parameters
    # We may get an error if we don't trim zeros (very small concentration
    # parameters can sample zero probabilities).
    unary_pcfg.trim_zeros()
    unary_pcfg.set_log_parameters()
    logging.info("1,1,1 param %e" %
                 unary_pcfg.parameters[("NT1", "NT1", "NT1")])

    # Now we have a unary grammar; train it.
    logging.info("Training with LENGTH_EM_MAX_LENGTH %d ", LENGTH_EM_MAX_LENGTH)
    targetlp = self.length_distribution.ml_lp(LENGTH_EM_MAX_LENGTH)
    valid_target = self.length_distribution.ml_lp_general(LENGTH_EM_MAX_LENGTH)
    logging.info("Target LP = %f, %f", targetlp, valid_target)
    ui = inside.UnaryInside(unary_pcfg)
    logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
    for i in range(LENGTH_EM_ITERATIONS):
        logging.info("Starting EM iteration %d, target = %f ", i, targetlp)
        _lp, kld = ui.train_once_smart(self.length_distribution.weights,
                                       LENGTH_EM_MAX_LENGTH)
        logging.info("KLD from target %f", kld)
        if kld < TERMINATION_KLD:
            logging.info("Converged enough. %f < %f ", kld, TERMINATION_KLD)
            break
    else:
        # for/else: only runs if the loop exhausted without break.
        logging.warning(
            "Reached maximum number of iterations without reaching "
            "convergence threshold. Final KLD is %f.", kld)
    unary_pcfg.parameters = ui.get_params()
    unary_pcfg.trim_zeros()

    # Now we have a unary grammar with the right parameters; rebuild the
    # full grammar with a real lexicon.
    logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
    final_pcfg = pcfg.PCFG()
    final_pcfg.nonterminals = unary_pcfg.nonterminals
    final_pcfg.start = unary_pcfg.start
    final_pcfg.terminals = sorted(utility.generate_lexicon(self.terminals))
    logging.info("Terminals from %s to %s" %
                 (final_pcfg.terminals[0], final_pcfg.terminals[-1]))
    for prod, alpha in unary_pcfg.parameters.items():
        if len(prod) == 3:
            if alpha > 0:
                final_pcfg.productions.append(prod)
                final_pcfg.parameters[prod] = alpha
        else:
            # Unary rule: sample conditional probabilities over the whole
            # lexicon and split this rule's mass across them.
            lprobs = self.lexical_distribution.sample(self.terminals)
            for i, a in enumerate(final_pcfg.terminals):
                newprod = (prod[0], a)
                newalpha = alpha * lprobs[i]
                if newalpha > 0:
                    final_pcfg.productions.append(newprod)
                    final_pcfg.parameters[newprod] = newalpha
    final_pcfg.set_log_parameters()
    return final_pcfg
def sample_uniform(self):
    """
    Sample a CFG uniformly. Then train against the target length
    distribution. Then resample the lexicon.
    """
    logging.info("Sampling uniform cfg")
    self.cfgfactory.number_terminals = 1
    # Probability used when sampling the single-terminal grammar; chosen so
    # it matches having self.terminals independent fair coins lexically.
    lexical_prob = 1 - (0.5 ** self.terminals)
    logging.info("Lexical pron for sampling is %e" % lexical_prob)
    mycfg = self.cfgfactory.sample_uniform(lp=lexical_prob, bp=0.5)
    logging.info("Sampled unary cfg")
    parameters = {}
    terminals = list(mycfg.terminals)
    unary_terminal = terminals[0]
    # Sort both for determinism.
    snt = sorted(mycfg.nonterminals)
    prods = sorted(mycfg.productions)
    # Count binary productions per nonterminal: that is how many Dirichlet
    # components we need to sample for each left-hand side.
    nbs = Counter()
    for p in prods:
        if len(p) == 3:
            nbs[p[0]] += 1
    for a in snt:
        # Split mass between the single unary rule and the binary rules.
        unary_prob = numpy.random.random()
        logging.info("Unary probs %s %f" % (a, unary_prob))
        parameters[(a, unary_terminal)] = unary_prob
        probs = self.binary_distribution.sample(nbs[a])
        for i, prod in enumerate(
                [prod for prod in prods if len(prod) == 3 and prod[0] == a]):
            parameters[prod] = (1 - unary_prob) * probs[i]
    unary_pcfg = pcfg.PCFG(cfg=mycfg)
    unary_pcfg.parameters = parameters
    # We may get an error if we don't trim zeros (very small concentration
    # parameters can sample zero probabilities).
    unary_pcfg.trim_zeros()
    unary_pcfg.set_log_parameters()

    # Now we have a unary grammar; train it.
    logging.info("Training with LENGTH_EM_MAX_LENGTH %d ", LENGTH_EM_MAX_LENGTH)
    targetlp = self.length_distribution.ml_lp(LENGTH_EM_MAX_LENGTH)
    valid_target = self.length_distribution.ml_lp_general(LENGTH_EM_MAX_LENGTH)
    logging.info("Target LP = %f, %f", targetlp, valid_target)
    ui = inside.UnaryInside(unary_pcfg)
    logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
    for i in range(LENGTH_EM_ITERATIONS):
        logging.info("Starting EM iteration %d, target = %f ", i, targetlp)
        _lp, kld = ui.train_once_smart(self.length_distribution.weights,
                                       LENGTH_EM_MAX_LENGTH)
        logging.info("KLD from target %f", kld)
        if kld < TERMINATION_KLD:
            logging.info("Converged enough. %f < %f ", kld, TERMINATION_KLD)
            break
    else:
        # for/else: only runs if the loop exhausted without break.
        logging.warning(
            "Reached maximum number of iterations without reaching "
            "convergence threshold. Final KLD is %f.", kld)
    unary_pcfg.parameters = ui.get_params()
    unary_pcfg.trim_zeros()

    # Now we have a unary grammar with the right parameters; rebuild the
    # full grammar with a real lexicon.
    logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
    final_pcfg = pcfg.PCFG()
    final_pcfg.nonterminals = unary_pcfg.nonterminals
    final_pcfg.start = unary_pcfg.start
    final_pcfg.terminals = sorted(utility.generate_lexicon(self.terminals))
    logging.info("Terminals from %s to %s" %
                 (final_pcfg.terminals[0], final_pcfg.terminals[-1]))
    for prod, alpha in unary_pcfg.parameters.items():
        if len(prod) == 3:
            final_pcfg.productions.append(prod)
            final_pcfg.parameters[prod] = alpha
        else:
            # Unary rule: pick one terminal guaranteed to be included, then
            # flip a fair coin for every terminal independently.
            # NOTE(review): numpy.random.choice(self.terminals) draws an
            # INDEX from range(self.terminals) and is compared with the
            # enumeration index i — assumes final_pcfg.terminals is indexed
            # consistently with that range; confirm against generate_lexicon.
            initial = numpy.random.choice(self.terminals)
            chosen = []
            for i, a in enumerate(final_pcfg.terminals):
                if numpy.random.random() < 0.5 or i == initial:
                    newprod = (prod[0], a)
                    final_pcfg.productions.append(newprod)
                    chosen.append(newprod)
            # Sample conditional probabilities only for the chosen words.
            lprobs = self.lexical_distribution.sample(len(chosen))
            for i, newprod in enumerate(chosen):
                final_pcfg.parameters[newprod] = alpha * lprobs[i]
    final_pcfg.set_log_parameters()
    return final_pcfg
default="length.pdf") parser.add_argument("--logscale", help="Y axis in log scale", action="store_true") ## Other options: control output format, what probs are calculated. args = parser.parse_args() l = args.maxlength + 1 x = np.arange(1, l) ys = [] for f in args.inputfilenames: mypcfg = pcfg.load_pcfg_from_file(f) print(f) upcfg = mypcfg.make_unary() insider = inside.UnaryInside(upcfg) table = insider.compute_inside_smart(l) start = insider.start y = np.zeros(args.maxlength) for length in range(1, l): p = table[length, start] print(length, p) y[length - 1] = p ys.append(y) alpha = 1.0 / math.sqrt(len(ys)) for y in ys: plt.plot(x, y, 'b', alpha=alpha)