Example #1
# These examples assume the surrounding LOTlib script defines LARGE_SAMPLE and
# base_grammar at module level, and that IncrementalLexiconHypothesis comes from
# the script's own Model module. The imports below are a best guess at what the
# original pulls in (paths as in recent LOTlib versions; adjust to your install):
import operator
from copy import copy, deepcopy
from math import log

import LOTlib
from LOTlib import break_ctrlc
from LOTlib.Miscellaneous import q
from LOTlib.TopN import TopN
from LOTlib.Inference.Samplers.MetropolisHastings import MHSampler


def run(options, ndata):
    """Sample hypotheses on ndata amounts of data; return (ndata, top hypotheses)."""
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    # instantiate the language class named on the command line (e.g. "AnBn")
    language = eval(options.LANG + "()")
    data = language.sample_data(LARGE_SAMPLE)

    assert len(data) == 1
    # rescale the counts from LARGE_SAMPLE draws so they sum to about ndata
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE

    # best_ll is the negative entropy of the renormalized data distribution:
    # the best expected log-likelihood per data point any model could achieve
    z = sum(data[0].output.values())
    if z > 0:
        best_ll = sum([(p / z) * log(p / z) for p in data[0].output.values()])
    else:
        best_ll = 0.0

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', "'%s'" % t, None, 1.0)

    # set up the hypothesis
    h0 = IncrementalLexiconHypothesis(grammar=grammar,
                                      alphabet_size=len(language.terminals()))
    h0.set_word(
        0,
        h0.make_hypothesis(grammar=grammar))  # make the first word at random
    h0.N = 1

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N):  # how many words do we add?
        if LOTlib.SIG_INTERRUPTED: return 0, set()

        # recompute the posterior so each outer iteration starts from a consistent score
        h0.compute_posterior(data)

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            h.best_ll = best_ll  # just store this
            tn.add(copy(h))

            if options.TRACE:
                print h.posterior_score, h.prior, h.likelihood, h.likelihood / ndata, h
                v = h()  # calling the hypothesis yields its distribution over strings
                sortedv = sorted(v.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
                print "{" + ', '.join(["'%s':%s" % i for i in sortedv]) + "}"

        # and start from where we ended
        h0 = copy(h)
        h0.deepen()

    return ndata, tn
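
For context, here is a minimal sketch of a driver for run() above. The option names match the fields the example reads (LANG, STEPS, TOP_COUNT, N, TRACE), but DATA_RANGE, the defaults, and the TopN.get_all() accessor are assumptions rather than part of the original script; the class named by --language must be importable so eval() can find it.

if __name__ == "__main__":
    from optparse import OptionParser

    # hypothetical command-line options; all defaults are made up
    parser = OptionParser()
    parser.add_option("--language", dest="LANG", default="AnBn")
    parser.add_option("--steps", dest="STEPS", type="int", default=10000)
    parser.add_option("--top", dest="TOP_COUNT", type="int", default=100)
    parser.add_option("--nwords", dest="N", type="int", default=3)
    parser.add_option("--trace", dest="TRACE", action="store_true", default=False)
    options, _ = parser.parse_args()

    DATA_RANGE = [10, 100, 1000]  # assumed amounts of data to sweep over

    all_top = set()
    for nd in DATA_RANGE:
        nd, tn = run(options, nd)
        if isinstance(tn, set):  # run() returns (0, set()) when interrupted
            all_top.update(tn)
        else:
            all_top.update(tn.get_all())  # get_all() assumed per LOTlib's TopN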
Example #2
def run(options, ndata):
    """
    This out on the DATA_RANGE amounts of data and returns all hypotheses in top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # rescale the counts from LARGE_SAMPLE draws so they sum to about ndata
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    #print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N): # how many words do we add?
        # add a SELFF rule so hypotheses can reference word number `outer`
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer+1
        assert len(h0.value.keys())==h0.N==outer+1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)

            # print h.posterior_score, h
            # print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h) # must deepcopy

    return ndata, tn
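
To make the renormalization step above concrete, here is a standalone sketch with made-up numbers; renormalize() and its inputs are hypothetical, but the arithmetic is exactly the loop in the example.

def renormalize(output, ndata, large_sample):
    # rescale raw counts from large_sample draws so they sum to about ndata
    for k in output.keys():
        output[k] = float(output[k] * ndata) / large_sample
    return output

# With 10000 samples split 7500/2500 and ndata=100, the rescaled pseudo-counts
# keep the same proportions but now sum to 100:
print renormalize({'ab': 7500, 'aabb': 2500}, 100, 10000)
# -> {'ab': 75.0, 'aabb': 25.0}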
Example #3
def run(options, ndata):
    """
    This out on the DATA_RANGE amounts of data and returns all hypotheses in top count
    """
    if LOTlib.SIG_INTERRUPTED:
        return 0, set()

    language = eval(options.LANG+"()")
    data = language.sample_data(LARGE_SAMPLE)
    assert len(data) == 1

    # rescale the counts from LARGE_SAMPLE draws so they sum to about ndata
    for k in data[0].output.keys():
        data[0].output[k] = float(data[0].output[k] * ndata) / LARGE_SAMPLE
    #print data

    # Now add the rules to the grammar
    grammar = deepcopy(base_grammar)
    for t in language.terminals():  # add in the specifics
        grammar.add_rule('ATOM', q(t), None, 2)

    h0 = IncrementalLexiconHypothesis(grammar=grammar)

    tn = TopN(N=options.TOP_COUNT)

    for outer in xrange(options.N): # how many words do we add?
        # add a SELFF rule so hypotheses can reference word number `outer`
        grammar.add_rule('SELFF', '%s' % (outer), None, 1.0)

        # Add one more to the number of words here
        h0.set_word(outer, h0.make_hypothesis(grammar=grammar))
        h0.N = outer+1
        assert len(h0.value.keys())==h0.N==outer+1

        # now run mcmc
        for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
            tn.add(h)

            # trace each sample: its score and the output counts behind its likelihood
            print h.posterior_score, h
            print getattr(h, 'll_counts', None)

        # and start from where we ended
        h0 = deepcopy(h) # must deepcopy

    return ndata, tn
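
Once run() returns, the TopN object holds the best hypotheses seen during sampling. A minimal sketch of inspecting it follows; get_all() is assumed to be LOTlib's TopN accessor, and the attributes printed are the ones the examples themselves use.

ndata, tn = run(options, 100)
for h in sorted(tn.get_all(), key=lambda h: h.posterior_score, reverse=True):
    print h.posterior_score, h.prior, h.likelihood, h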