def hill_climb(text, segs, iterations):
    '''Greedy optimization using single-flip moves only

    Each pass scores every single-boundary flip of `segs` and commits the
    single best strictly-improving move, repeating for at most `iterations`
    passes.  Progress is printed only when a move is actually taken.

    Note that without flip_n, which modified segs in-place, we need not worry
    about juggling segs and best!
    '''
    # Running best score; kept in sync with the current `segs`.
    best = evaluate(text, segs)
    for i in range(iterations):
        print i,
        # Sentinel: -1 means "no strictly-improving flip found this pass".
        pos = -1
        #pos, best = 0, evaluate(text, segs) # pos should really be out of range - what if the first word is a single letter??
        # Exhaustively score every single-flip neighbour of segs.
        # NOTE(review): the inner `i` shadows the outer pass counter; harmless
        # here (the outer for rebinds i each pass) but worth renaming.
        for i in range(len(segs)):
            score = evaluate(text, flip(segs, i))
            if score < best: pos, best = i, score # is it just the C programmer in me, or is this LESS legible?
        if pos >= 0: # found a greed-satisfying move (changed from !=)
            segs = flip(segs, pos)
            # Report the new score and segmentation after each accepted move.
            print evaluate(text, segs), segment(text, segs),
            print
    # NOTE(review): once no improving flip exists, remaining passes are no-ops;
    # an early break on pos < 0 would be equivalent and faster — confirm intent.
    return segs
def anneal(text, segs, iterations, cooling_rate): temperature = float(len(segs)) # initialize T to something big best_segs, best = segs, evaluate( text, segs) # initialization really belongs WAY out here while temperature > 0.5: #best_segs, best = segs, evaluate(text, segs) # <-- book has initialization here, unnecessarily for i in range(iterations): # instead of the Metropolis criterion: guess = flip_n(segs, int( round(temperature))) # - make "iterations" random moves score = evaluate(text, guess) if score < best: best, best_segs = score, guess # - keep your best score from all those moves #score, segs = best, best_segs # only segs needs to be updated for next T segs = best_segs temperature = temperature / cooling_rate # temperature, and thus move "length", gradually decreases #print evaluate(text, segs), segment(text, segs) # my version shows intent more cleanly - running high score print evaluate(text, best_segs), segment(text, best_segs) print return segs
def evaluate(text, segs):
    '''Score a candidate segmentation; lower is better.

    Objective function from Brent 1995 (NLTK book, Figure 3.8): the number
    of words in the segmented text plus the size of the lexicon, measured
    as the distinct words joined by single spaces.
    '''
    words = segment(text, segs)
    # Lexicon cost: each distinct word stored once, space-separated.
    lexicon = ' '.join(set(words))
    return len(words) + len(lexicon)
''' from code_segment import segment # Natural Language Toolkit: code_evaluate def evaluate(text, segs): words = segment(text, segs) text_size = len(words) lexicon_size = len(' '.join(list(set(words)))) #print 'sum(seg) =', sum(map(int, list(segs))) return text_size + lexicon_size # Figure 3.8: the objective function from Brent 1995 # lexicon_size will be off by 1 relative to the Figure (which added a boundary marker to EVERY word) if __name__ == "__main__": print __doc__ text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy" seg1 = "0000000000000001000000000010000000000000000100000000000" seg2 = "0100100100100001001001000010100100010010000100010010000" seg3 = "0000100100000011001000000110000100010000001100010000001" print segment(text, seg3) print evaluate(text, seg3) # 46: 14 segments, lexicon_size = 32 print evaluate(text, seg2) # 47: 15 segments, lexicon_size = 32 print evaluate(text, seg1) # 63: 3 segments, lexicon_size = 60 # brief intuition # - more segments means smaller words # - smaller words means greater likelihood for repeat usage # - repeat usage means lower lexicon score (repeated usage of same word is counted only once)
Note that without flip_n, which modified segs in-place, we need not worry about juggling segs and best! ''' best = evaluate(text, segs) # for i in range(iterations): print i, pos = -1 #pos, best = 0, evaluate(text, segs) # pos should really be out of range - what if the first word is a single letter?? for i in range(len(segs)): score = evaluate(text, flip(segs, i)) if score < best: pos, best = i, score # is it just the C programmer in me, or is this LESS legible? if pos >= 0: # found a greed-satisfying move (changed from !=) segs = flip(segs, pos) print evaluate(text, segs), segment(text, segs), print return segs if __name__ == "__main__": # data from code_anneal.py?? text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy" seg1 = "0000000000000001000000000010000000000000000100000000000" print evaluate(text, seg1), segment(text, seg1) # 63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy'] hill_climb(text, seg1, 20) # 61 ['doyouseethekittyseethedoggy', 'doyoulikethekitty', 'likethedoggy'] # 59 ['doyouseethekittyseethedoggydoyoulikethekitty', 'likethedoggy'] # 57 ['doyouseethekittyseethedoggydoyoulikethekittylikethedoggy'] # hahahahahaha greed is a sin