def test(codebook, model=None, alphabet=None, txt=None, balanced_acc=0.75,
         default_context='. ', threshold=1.0, min_epochs=1, max_epochs=72,
         minprob=None, exponent=1.0, w=80):
    """Simulate spelling a test text with a codebook and a language-model decoder.

    For every character of the (cleaned) test text a fresh
    ``TextPrediction.Decoder`` is created.  Codebook columns are presented
    cyclically; for each column a noisy classifier output is simulated for
    the correct symbol's codebook bit and handed to the decoder, until the
    decoder returns a result.  Per-letter correctness and the number of
    epochs used are recorded.  Ctrl-C stops the simulation early and the
    statistics gathered so far are still returned.

    NOTE(review): this name is defined more than once in the module; the
    last definition is the one bound at import time.

    Parameters
    ----------
    codebook : one of
        * a 2-D array-like with one row per alphabet symbol, whose entries
          must be 0 or 1 once converted with ``int``;
        * a dict holding such a matrix (or string) under the key 'Matrix';
        * a multi-line string of whitespace-separated integers, one row per
          line, in which every '.' and '-' character stands for 0.
    model : language model, or None to build
        ``TextPrediction.LanguageModel(alphabet=alphabet)``.
    alphabet : sequence of symbols, or None for
        ``TextPrediction.default_alphabet36``.  It is passed through
        ``model.match_case`` before use.
    txt : the test text, or None for ``TextPrediction.default_test_text``.
        If it is shorter than 1000 characters and
        ``TextPrediction.FindFile(txt)`` is an existing file, that file is
        read and used instead.
    balanced_acc : desired per-epoch accuracy; it is converted by
        ``SigTools.invcg`` into the offset ``z`` of the simulated signal.
    default_context : passed to ``model.prepend_context`` with the text
        preceding each character.
    threshold, min_epochs, max_epochs, minprob, exponent :
        forwarded unchanged to ``TextPrediction.Decoder``.
    w : progress-line width.  When truthy, each decoded result is written
        to stdout and a percent-complete figure plus newline is emitted
        after every ``w`` characters.  A falsy value silences output.

    Returns
    -------
    An ``sstruct`` with fields ``alphabet``, ``codebook``, ``model`` (the
    ``model`` argument exactly as passed in, possibly None), ``input``,
    ``output``, ``conditions``, ``epoch_acc``, ``nepochs`` and
    ``letter_acc``.

    Raises
    ------
    ValueError
        If the alphabet contains characters missing from the model, or if
        the number of codebook rows differs from the alphabet size.
    """
    modelin = model  # remember what the caller actually passed
    if model is None:
        model = TextPrediction.LanguageModel(alphabet=alphabet)
    if alphabet is None:
        alphabet = TextPrediction.default_alphabet36
    if txt is None:
        txt = TextPrediction.default_test_text

    alphabet = model.match_case(alphabet)
    if set(alphabet) - set(model.alphabet):
        raise ValueError("some characters in the alphabet are not in the model")

    # A short `txt` may actually be the name of a file holding the text.
    if len(txt) < 1000 and os.path.isfile(TextPrediction.FindFile(txt)):
        txt = TextPrediction.ReadUTF8(TextPrediction.FindFile(txt))

    # HACK: things will fail if some characters in the test text are not in
    # the alphabet, so temporarily narrow the model's alphabet while cleaning.
    # The finally-clause guarantees the caller's model is restored even if
    # clean() raises.
    saved_alphabet = model.alphabet
    model.alphabet = set(saved_alphabet).intersection(alphabet)
    try:
        txt = model.clean(txt)
    finally:
        model.alphabet = saved_alphabet

    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(codebook, dict) and 'Matrix' in codebook:
        codebook = codebook['Matrix']
    if isinstance(codebook, string_types):
        rows = codebook.strip().replace('.', '0').replace('-', '0').split('\n')
        codebook = [[int(entry) for entry in row.split()] for row in rows]
    codebook = numpy.asarray(codebook)

    N, L = codebook.shape
    if N != len(alphabet):
        raise ValueError("number of rows in the codebook should be equal to the number of symbols in the alphabet")

    out = txt[:0]  # empty string of the same type as txt
    nchars = len(txt)
    # Builtin `bool` rather than `numpy.bool`: that alias was removed in
    # NumPy 1.24.
    correct = numpy.zeros((nchars,), dtype=bool)
    nepochs = numpy.zeros((nchars,), dtype=numpy.int16)
    z = SigTools.invcg(balanced_acc)
    confusion = numpy.zeros((2, 2), dtype=numpy.float64)  # [codebook bit, rounded output]
    correct_running = SigTools.running_mean()
    nepochs_running = SigTools.running_mean()
    sign = {0: -1, 1: 1}  # a KeyError here flags a non-binary codebook entry

    try:
        for i in range(nchars):
            correct_symbol = txt[i]
            # will throw an error if not found, and rightly so
            correct_index = alphabet.index(correct_symbol)
            context = model.prepend_context(txt[:i], default_context)
            d = TextPrediction.Decoder(
                choices=alphabet, context=context, model=model, verbose=False,
                threshold=threshold, min_epochs=min_epochs,
                max_epochs=max_epochs, minprob=minprob, exponent=exponent)

            result = None
            while result is None:
                col = codebook[:, d.L % L]  # cycle through the columns
                Cij = int(col[correct_index])
                d.new_column(col)
                x = numpy.random.randn() + z * sign[Cij]
                p = SigTools.logistic(2.0 * z * x)
                confusion[Cij, int(round(p))] += 1
                result = d.new_transmission(p)

            hit = (result == correct_symbol)
            nepochs[i] = d.L
            correct[i] = hit
            out = out + result
            nepochs_running += d.L
            correct_running += hit

            if w:
                sys.stdout.write(result)
                sys.stdout.flush()  # show progress immediately
                if (i + 1) % w == 0:
                    sys.stdout.write(' %3d\n' % round(100.0 * float(i + 1) / nchars))
                elif (i + 1) == nchars:
                    sys.stdout.write('\n')
    except KeyboardInterrupt:
        # Deliberate: an interrupt ends the run early and partial results
        # are reported below.
        if w:
            sys.stdout.write('\n')

    ndone = len(out)

    s = sstruct()
    s.alphabet = alphabet
    s.codebook = codebook
    s.model = modelin
    s.input = txt[:ndone]
    s.output = out

    s.conditions = sstruct()
    s.conditions.threshold = threshold
    s.conditions.min_epochs = min_epochs
    s.conditions.max_epochs = max_epochs
    s.conditions.minprob = minprob
    s.conditions.exponent = exponent

    s.epoch_acc = sstruct()
    s.epoch_acc.desired = balanced_acc
    # Diagonal over row sums: fraction of rounded outputs matching the bit.
    s.epoch_acc.empirical_nontarget, s.epoch_acc.empirical_target = (
        confusion.diagonal() / confusion.sum(axis=1)).flat
    s.epoch_acc.confusion = confusion

    s.nepochs = sstruct()
    s.nepochs.each = nepochs[:ndone]
    s.nepochs.mean = nepochs_running.m
    s.nepochs.std = nepochs_running.v_unbiased ** 0.5
    s.nepochs.ste = s.nepochs.std / nepochs_running.n ** 0.5

    s.letter_acc = sstruct()
    s.letter_acc.each = correct[:ndone]
    s.letter_acc.mean = correct_running.m
    s.letter_acc.std = correct_running.v_unbiased ** 0.5
    s.letter_acc.ste = s.letter_acc.std / correct_running.n ** 0.5
    return s
def test(codebook, model=None, alphabet=None, txt=None, balanced_acc=0.75,
         default_context='. ', threshold=1.0, min_epochs=1, max_epochs=72,
         minprob=None, exponent=1.0, w=80):
    """Run a simulated spelling session and report accuracy and speed.

    The test text is cleaned so that it only contains alphabet symbols,
    then decoded one character at a time.  For each character a new
    ``TextPrediction.Decoder`` receives codebook columns (cycling through
    them by ``d.L % L``) together with a simulated classifier output for
    the correct symbol's bit, until ``new_transmission`` returns something
    other than None.  A KeyboardInterrupt ends the loop early; whatever was
    decoded up to that point is summarised and returned.

    NOTE(review): the module contains an earlier definition with this same
    name, which this definition replaces at import time.

    Parameters
    ----------
    codebook : 2-D array-like with one row per alphabet symbol and entries
        that are 0 or 1 after ``int`` conversion; or a dict carrying that
        under the key 'Matrix'; or a newline-separated string of
        whitespace-separated integers where each '.' or '-' character is
        read as 0.
    model : language model; when None,
        ``TextPrediction.LanguageModel(alphabet=alphabet)`` is constructed.
    alphabet : symbol sequence; when None,
        ``TextPrediction.default_alphabet36``.  Normalised with
        ``model.match_case``.
    txt : test text; when None, ``TextPrediction.default_test_text``.  A
        value shorter than 1000 characters for which
        ``TextPrediction.FindFile(txt)`` is an existing file is replaced by
        that file's contents.
    balanced_acc : target per-epoch accuracy, mapped through
        ``SigTools.invcg`` to the simulated signal offset.
    default_context : handed to ``model.prepend_context``.
    threshold, min_epochs, max_epochs, minprob, exponent :
        passed straight to ``TextPrediction.Decoder``.
    w : if truthy, decoded output is echoed to stdout with a percentage
        and line break every ``w`` characters; if falsy, nothing is printed.

    Returns
    -------
    ``sstruct`` with ``alphabet``, ``codebook``, ``model`` (the argument as
    originally supplied, so None if defaulted), ``input``, ``output``,
    ``conditions``, ``epoch_acc``, ``nepochs`` and ``letter_acc``.

    Raises
    ------
    ValueError
        When some alphabet characters are not in the model, or when the
        codebook's row count does not equal the alphabet length.
    """
    modelin = model  # the returned struct reports the caller's argument
    if model is None:
        model = TextPrediction.LanguageModel(alphabet=alphabet)
    if alphabet is None:
        alphabet = TextPrediction.default_alphabet36
    if txt is None:
        txt = TextPrediction.default_test_text

    alphabet = model.match_case(alphabet)
    if set(alphabet) - set(model.alphabet):
        raise ValueError("some characters in the alphabet are not in the model")

    # `txt` may be a filename rather than the text itself.
    if len(txt) < 1000 and os.path.isfile(TextPrediction.FindFile(txt)):
        txt = TextPrediction.ReadUTF8(TextPrediction.FindFile(txt))

    # HACK: things will fail if some characters in the test text are not in
    # the alphabet.  Restrict the model's alphabet just for the clean() call
    # and always put the original back, even on error, so the caller's model
    # is never left modified.
    oldalphabet = model.alphabet
    model.alphabet = set(oldalphabet).intersection(alphabet)
    try:
        txt = model.clean(txt)
    finally:
        model.alphabet = oldalphabet

    # Python 2 has `basestring`; Python 3 does not, so fall back to `str`.
    try:
        text_types = basestring
    except NameError:
        text_types = str

    if isinstance(codebook, dict) and 'Matrix' in codebook:
        codebook = codebook['Matrix']
    if isinstance(codebook, text_types):
        normalised = codebook.strip().replace('.', '0').replace('-', '0')
        codebook = [[int(tok) for tok in line.split()]
                    for line in normalised.split('\n')]
    codebook = numpy.asarray(codebook)

    N, L = codebook.shape
    if N != len(alphabet):
        raise ValueError("number of rows in the codebook should be equal to the number of symbols in the alphabet")

    out = txt[:0]  # empty, but same string type as txt
    nchars = len(txt)
    # dtype=bool instead of numpy.bool, which NumPy 1.24 removed.
    correct = numpy.zeros((nchars,), dtype=bool)
    nepochs = numpy.zeros((nchars,), dtype=numpy.int16)
    z = SigTools.invcg(balanced_acc)
    confusion = numpy.zeros((2, 2), dtype=numpy.float64)  # rows: codebook bit; cols: rounded p
    correct_running = SigTools.running_mean()
    nepochs_running = SigTools.running_mean()
    polarity = {0: -1, 1: 1}  # lookup fails loudly on non-binary entries

    try:
        for i in range(nchars):
            correct_symbol = txt[i]
            # will throw an error if not found, and rightly so
            correct_index = alphabet.index(correct_symbol)
            context = model.prepend_context(txt[:i], default_context)
            d = TextPrediction.Decoder(
                choices=alphabet, context=context, model=model, verbose=False,
                threshold=threshold, min_epochs=min_epochs,
                max_epochs=max_epochs, minprob=minprob, exponent=exponent)

            result = None
            while result is None:
                col = codebook[:, d.L % L]
                Cij = int(col[correct_index])
                d.new_column(col)
                mean = z * polarity[Cij]
                x = numpy.random.randn() + mean
                p = SigTools.logistic(2.0 * z * x)
                confusion[Cij, int(round(p))] += 1
                result = d.new_transmission(p)

            is_correct = (result == correct_symbol)
            nepochs[i] = d.L
            correct[i] = is_correct
            out = out + result
            nepochs_running += d.L
            correct_running += is_correct

            if w:
                sys.stdout.write(result)
                sys.stdout.flush()
                if (i + 1) % w == 0:
                    sys.stdout.write(' %3d\n' % round(100.0 * float(i + 1) / nchars))
                elif (i + 1) == nchars:
                    sys.stdout.write('\n')
    except KeyboardInterrupt:
        # Intentional best-effort: stop here and summarise what was done.
        if w:
            sys.stdout.write('\n')

    ndone = len(out)

    s = sstruct()
    s.alphabet = alphabet
    s.codebook = codebook
    s.model = modelin
    s.input = txt[:ndone]
    s.output = out

    s.conditions = sstruct()
    s.conditions.threshold = threshold
    s.conditions.min_epochs = min_epochs
    s.conditions.max_epochs = max_epochs
    s.conditions.minprob = minprob
    s.conditions.exponent = exponent

    s.epoch_acc = sstruct()
    s.epoch_acc.desired = balanced_acc
    # Per-row hit rate of the confusion matrix (row 0: bit 0, row 1: bit 1).
    s.epoch_acc.empirical_nontarget, s.epoch_acc.empirical_target = (
        confusion.diagonal() / confusion.sum(axis=1)).flat
    s.epoch_acc.confusion = confusion

    s.nepochs = sstruct()
    s.nepochs.each = nepochs[:ndone]
    s.nepochs.mean = nepochs_running.m
    s.nepochs.std = nepochs_running.v_unbiased ** 0.5
    s.nepochs.ste = s.nepochs.std / nepochs_running.n ** 0.5

    s.letter_acc = sstruct()
    s.letter_acc.each = correct[:ndone]
    s.letter_acc.mean = correct_running.m
    s.letter_acc.std = correct_running.v_unbiased ** 0.5
    s.letter_acc.ste = s.letter_acc.std / correct_running.n ** 0.5
    return s