def run(options, ndata):
    """
    Run the incremental search with ndata effective data points and return the top hypotheses found.
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # renormalize the counts to the effective amount of data
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    # the best attainable log-likelihood per datum: the negative entropy of the data distribution
    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([(p / z) * log(p / z) for p in data[0].output.values()])
    else:
        best_ll = 0.0

    # Now add the language-specific terminal rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar, alphabet_size=len(language.terminals()))
    h0.set_word(0, h0.make_hypothesis(grammar=grammar))  # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many words do we add?
        if LOTlib.SIG_INTERRUPTED:
            return 0, set()

        # re-compute the posterior, since h0 was built/modified outside the sampler
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll  # just store this for later analysis
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()
                sortedv = sorted(v.items(), key=operator.itemgetter(1), reverse=True)
                print "{" + ', '.join(["'%s':%s" % i for i in sortedv]) + "}"

        # and start from where we ended, deepening to introduce the next word
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
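# ---------------------------------------------------------------------------
# NOTE (sketch): the run() variants in this file rely on module-level imports
# and globals defined elsewhere in the project; in the real file they would
# sit at the top. The header below is a minimal sketch of what they need.
# The LOTlib import paths and the value of LARGE_SAMPLE are assumptions
# (adjust to your LOTlib version); base_grammar, IncrementalLexiconHypothesis,
# and the language classes named by options.LANG are assumed to be defined or
# imported elsewhere in the project and are not sketched here.
# ---------------------------------------------------------------------------
import operator
from copy import copy, deepcopy
from math import log

import LOTlib
from LOTlib import break_ctrlc
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler
from LOTlib.Miscellaneous import q

LARGE_SAMPLE = 10000  # assumed size of the large sample the counts are renormalized from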
def run(options, ndata): """ This out on the DATA_RANGE amounts of data and returns all hypotheses in top count """ if LOTlib.SIG_INTERRUPTED: return 0, set() language = eval(options.LANG+"()") data = language.sample_data(LARGE_SAMPLE) assert len(data) == 1 # renormalize the counts for k in data[0].output.keys(): data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE #print data # Now add the rules to the grammar grammar = deepcopy(base_grammar) for t in language.terminals(): # add in the specifics grammar.add_rule('ATOM', q(t), None, 2) h0 = IncrementalLexiconHypothesis(grammar=grammar) tn = TopN(N=options.TOP_COUNT) for outer in xrange(options.N): # how many do we add? # add to the grammar grammar.add_rule('SELFF', '%s' % (outer), None, 1.0) # Add one more to the number of words here h0.set_word(outer, h0.make_hypothesis(grammar=grammar)) h0.N = outer+1 assert len(h0.value.keys())==h0.N==outer+1 # now run mcmc for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)): tn.add(h) # print h.posterior_score, h # print getattr(h, 'll_counts', None) # and start from where we ended h0 = deepcopy(h) # must deepcopy return ndata, tn
def run(options, ndata): """ This out on the DATA_RANGE amounts of data and returns all hypotheses in top count """ if LOTlib.SIG_INTERRUPTED: return 0, set() language = eval(options.LANG+"()") data = language.sample_data(LARGE_SAMPLE) assert len(data) == 1 # renormalize the counts for k in data[0].output.keys(): data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE #print data # Now add the rules to the grammar grammar = deepcopy(base_grammar) for t in language.terminals(): # add in the specifics grammar.add_rule('ATOM', q(t), None, 2) h0 = IncrementalLexiconHypothesis(grammar=grammar) tn = TopN(N=options.TOP_COUNT) for outer in xrange(options.N): # how many do we add? # add to the grammar grammar.add_rule('SELFF', '%s' % (outer), None, 1.0) # Add one more to the number of words here h0.set_word(outer, h0.make_hypothesis(grammar=grammar)) h0.N = outer+1 assert len(h0.value.keys())==h0.N==outer+1 # now run mcmc for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)): tn.add(h) print h.posterior_score, h print getattr(h, 'll_counts', None) # and start from where we ended h0 = deepcopy(h) # must deepcopy return ndata, tn