예제 #1
0
def hill_climb(text, segs, iterations):
    '''Greedy optimization using single-flip moves only
    
    Note that without flip_n, which modified segs in-place, we need not worry about juggling segs and best!
    '''
    best = evaluate(text, segs)                                     # 
    for i in range(iterations):
        print i,
        pos = -1                                                    
        #pos, best = 0, evaluate(text, segs)                        # pos should really be out of range - what if the first word is a single letter??
        for i in range(len(segs)):
            score = evaluate(text, flip(segs, i))
            if score < best:
                pos, best = i, score                                # is it just the C programmer in me, or is this LESS legible?
        if pos >= 0:                                                # found a greed-satisfying move (changed from !=)
            segs = flip(segs, pos)
            print evaluate(text, segs), segment(text, segs),
        print
    return segs
예제 #2
0
def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))  # initialize T to something big
    best_segs, best = segs, evaluate(
        text, segs)  # initialization really belongs WAY out here
    while temperature > 0.5:
        #best_segs, best = segs, evaluate(text, segs)           # <-- book has initialization here, unnecessarily
        for i in range(iterations):  # instead of the Metropolis criterion:
            guess = flip_n(segs, int(
                round(temperature)))  # - make "iterations" random moves
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess  # - keep your best score from all those moves

        #score, segs = best, best_segs                          # only segs needs to be updated for next T
        segs = best_segs

        temperature = temperature / cooling_rate  # temperature, and thus move "length", gradually decreases

        #print evaluate(text, segs), segment(text, segs)        # my version shows intent more cleanly - running high score
        print evaluate(text, best_segs), segment(text, best_segs)

    print
    return segs
예제 #3
0
def evaluate(text, segs):
    '''Score a segmentation: word count plus lexicon size.

    Figure 3.8: the objective function from Brent 1995. Lower is better.
    '''
    words = segment(text, segs)
    lexicon = set(words)  # repeated words are charged only once
    # lexicon cost = characters needed to spell each distinct word once,
    # separated by single spaces
    return len(words) + len(' '.join(lexicon))
예제 #4
0
# NOTE(review): the triple-quoted string below is a dead, never-referenced
# draft (old copies of evaluate plus a __main__ demo) effectively serving
# as commented-out code; it is not executed or assigned anywhere visible.
# Consider deleting it outright.
'''

from code_segment import segment

# Natural Language Toolkit: code_evaluate


def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    #print 'sum(seg) =', sum(map(int, list(segs)))
    return text_size + lexicon_size  # Figure 3.8: the objective function from Brent 1995
    # lexicon_size will be off by 1 relative to the Figure (which added a boundary marker to EVERY word)


if __name__ == "__main__":
    print __doc__
    text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
    seg1 = "0000000000000001000000000010000000000000000100000000000"
    seg2 = "0100100100100001001001000010100100010010000100010010000"
    seg3 = "0000100100000011001000000110000100010000001100010000001"
    print segment(text, seg3)
    print evaluate(text, seg3)  # 46: 14 segments, lexicon_size = 32
    print evaluate(text, seg2)  # 47: 15 segments, lexicon_size = 32
    print evaluate(text, seg1)  # 63: 3 segments, lexicon_size = 60

# brief intuition
# - more segments means smaller words
# - smaller words means greater likelihood for repeat usage
# - repeat usage means lower lexicon score (repeated usage of same word is counted only once)
def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    #print 'sum(seg) =', sum(map(int, list(segs)))
    return text_size + lexicon_size                 # Figure 3.8: the objective function from Brent 1995
'''



from code_segment import segment

# Natural Language Toolkit: code_evaluate

def evaluate(text, segs):
    '''Objective function from Brent 1995 (Figure 3.8): lower is better.

    Score = number of words in the segmented text, plus the number of
    characters needed to list each distinct word once (space-separated).
    NB: lexicon_size will be off by 1 relative to the Figure, which added
    a boundary marker to EVERY word.
    '''
    words = segment(text, segs)
    text_size = len(words)
    distinct = set(words)  # repeat usage of the same word counted only once
    lexicon_size = len(' '.join(distinct))
    return text_size + lexicon_size

if __name__ == "__main__":
    print __doc__
    text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
    seg1 = "0000000000000001000000000010000000000000000100000000000"
    seg2 = "0100100100100001001001000010100100010010000100010010000"
    seg3 = "0000100100000011001000000110000100010000001100010000001"
    print segment(text, seg3)
    print evaluate(text, seg3)                      # 46: 14 segments, lexicon_size = 32
    print evaluate(text, seg2)                      # 47: 15 segments, lexicon_size = 32
    print evaluate(text, seg1)                      # 63: 3 segments, lexicon_size = 60
    
# brief intuition
# - more segments means smaller words
# - smaller words means greater likelihood for repeat usage
# - repeat usage means lower lexicon score (repeated usage of same word is counted only once)
예제 #7
0
    Note that without flip_n, which modified segs in-place, we need not worry about juggling segs and best!
    '''
    best = evaluate(text, segs)                                     # 
    for i in range(iterations):
        print i,
        pos = -1                                                    
        #pos, best = 0, evaluate(text, segs)                        # pos should really be out of range - what if the first word is a single letter??
        for i in range(len(segs)):
            score = evaluate(text, flip(segs, i))
            if score < best:
                pos, best = i, score                                # is it just the C programmer in me, or is this LESS legible?
        if pos >= 0:                                                # found a greed-satisfying move (changed from !=)
            segs = flip(segs, pos)
            print evaluate(text, segs), segment(text, segs),
        print
    return segs

if __name__ == "__main__":

    # data from code_anneal.py??
    text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
    seg1 = "0000000000000001000000000010000000000000000100000000000"
    
    print evaluate(text, seg1), segment(text, seg1)
    # 63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
    
    hill_climb(text, seg1, 20)
    # 61 ['doyouseethekittyseethedoggy', 'doyoulikethekitty', 'likethedoggy']
    # 59 ['doyouseethekittyseethedoggydoyoulikethekitty', 'likethedoggy']
    # 57 ['doyouseethekittyseethedoggydoyoulikethekittylikethedoggy'] 
    # hahahahahaha greed is a sin