def evaluateJamspell(modelFile, testText, alphabetFile, maxWords=50000):
    utils.loadAlphabet(alphabetFile)
    corrector = JamspellCorrector(modelFile)
    random.seed(42)
    originalText = loadText(testText)
    erroredText = generateTypos(originalText)
    assert len(originalText) == len(erroredText)
    originalSentences = generateSentences(originalText)
    erroredSentences = generateSentences(erroredText)
    errorsRate, fixRate, broken, topNerr, topNfix, execTime = \
        evaluateCorrector('jamspell', corrector, originalSentences,
                          erroredSentences, maxWords)
    return errorsRate, fixRate, broken, topNerr, topNfix
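# Hypothetical usage sketch for evaluateJamspell. The file names below are
# placeholders (not files from this repo); they only illustrate how the
# function is meant to be called and what it returns.
if __name__ == '__main__':
    errRate, fixRate, broken, topNerr, topNfix = evaluateJamspell(
        'model_en.bin',      # assumed: path to a trained jamspell model
        'test_text.txt',     # assumed: plain-text corpus to corrupt and correct
        'alphabet_en.txt',   # assumed: alphabet file matching the model language
        maxWords=50000)
    print('[info] errRate=%.2f%% fixRate=%.2f%%' % (100.0 * errRate, 100.0 * fixRate))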
def train(self, trainFile):
    print '[info] loading text'
    text = loadText(trainFile)
    sentences = generateSentences(text)
    sentences = self.convertToIDs(sentences)
    print '[info] generating N-grams', len(sentences)
    total = len(sentences)
    lastTime = time.time()
    for i in xrange(0, total):
        sentence = sentences[i]
        # accumulate unigram, bigram and trigram counts over word IDs
        for w in sentence:
            self.gram1[w] += 1
            self.totalWords += 1
        for j in xrange(len(sentence) - 1):
            self.gram2[(sentence[j], sentence[j + 1])] += 1
        for j in xrange(len(sentence) - 2):
            self.gram3[(sentence[j], sentence[j + 1], sentence[j + 2])] += 1
        # report progress at most once every 4 seconds
        if time.time() - lastTime >= 4.0:
            lastTime = time.time()
            print '[info] processed %.2f%%' % (100.0 * i / total)
    print '[info] finished training'
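# A minimal scoring sketch for the counts accumulated by train() above: estimate
# P(w3 | w1, w2) from the trigram/bigram tables with add-one smoothing. The
# method name and the smoothing scheme are assumptions for illustration; they
# are not part of the original class. Arguments are word IDs, as produced by
# convertToIDs().
def scoreTrigram(self, w1, w2, w3):
    trigramCount = self.gram3.get((w1, w2, w3), 0)
    bigramCount = self.gram2.get((w1, w2), 0)
    vocabSize = max(len(self.gram1), 1)
    # add-one smoothing keeps unseen trigrams from getting zero probability
    return (trigramCount + 1.0) / (bigramCount + vocabSize)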
def main():
    parser = argparse.ArgumentParser(
        description='spelling correctors evaluation')
    parser.add_argument('file', type=str, help='text file to use for evaluation')
    parser.add_argument('-hs', '--hunspell', type=str,
                        help='path to hunspell model')
    parser.add_argument('-ns', '--norvig', type=str,
                        help='path to train file for Norvig spell corrector')
    parser.add_argument('-cs', '--context', type=str,
                        help='path to context spell model')
    parser.add_argument('-csp', '--context_prototype', type=str,
                        help='path to context spell prototype model')
    parser.add_argument('-jsp', '--jamspell', type=str,
                        help='path to jamspell model file')
    parser.add_argument('-t', '--test', action="store_true")
    parser.add_argument('-mx', '--max_words', type=int,
                        help='max words to evaluate')
    parser.add_argument('-a', '--alphabet', type=str, help='alphabet file')
    args = parser.parse_args()

    if args.alphabet:
        utils.loadAlphabet(args.alphabet)

    correctors = {
        'dummy': DummyCorrector(),
    }
    # corrector = correctors['dummy']
    maxWords = args.max_words

    print('[info] loading models')

    if args.hunspell:
        corrector = correctors['hunspell'] = HunspellCorrector(args.hunspell)

    if args.norvig:
        corrector = correctors['norvig'] = NorvigCorrector(args.norvig)

    if args.context:
        corrector = correctors['context'] = ContextCorrector(args.context)

    if args.context_prototype:
        corrector = correctors['prototype'] = ContextPrototypeCorrector(
            args.context_prototype)

    if args.jamspell:
        corrector = correctors['jamspell'] = JamspellCorrector(args.jamspell)

    if args.test:
        return testMode(corrector)

    random.seed(42)
    print('[info] loading text')
    originalText = loadText(args.file)
    originalTextLen = len(list(originalText))

    print('[info] generating typos')
    # randomly corrupt some of the original words; the result is returned as a
    # list of individual words
    erroredText = generateTypos(originalText)
    erroredTextLen = len(list(erroredText))
    assert originalTextLen == erroredTextLen

    # split the texts into sentences, dropping invalid symbols; the trailing
    # periods are not kept in the sentences
    originalSentences = generateSentences(originalText)
    erroredSentences = generateSentences(erroredText)
    assert len(originalSentences) == len(erroredSentences)

    # for s in originalSentences[:50]:
    #     print ' '.join(s) + '.'

    print('[info] total words: %d' % len(originalText))
    print('[info] evaluating')

    results = {}
    for correctorName, corrector in correctors.items():
        errorsRate, fixRate, broken, topNerr, topNfix, execTime = \
            evaluateCorrector(correctorName, corrector, originalSentences,
                              erroredSentences, maxWords)
        results[correctorName] = errorsRate, fixRate, broken, topNerr, topNfix, execTime

    print('')
    print('[info] %12s %8s %8s %8s %8s %8s %8s' %
          ('', 'errRate', 'fixRate', 'broken', 'topNerr', 'topNfix', 'time'))

    # print one summary row per corrector; the lambda sorts each (name, result
    # tuple) item by its result tuple, so the corrector with the lowest
    # post-correction error rate comes first
    for k, _ in sorted(results.items(), key=lambda x: x[1]):
        print('[info] %10s %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2fs' %
              (k, 100.0 * results[k][0], 100.0 * results[k][1],
               100.0 * results[k][2], 100.0 * results[k][3],
               100.0 * results[k][4], results[k][5]))
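# Example invocation (the script and file names are assumptions, not from the repo):
#
#   python evaluate.py corpus.txt -jsp model_en.bin -a alphabet_en.txt -mx 50000
#
# This seeds the RNG with 42, injects random typos into corpus.txt, evaluates
# every loaded corrector on the corrupted sentences, and prints one row of
# errRate/fixRate/broken/topNerr/topNfix/time per corrector.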
def main(argv):
    # default values
    #path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    path = ''
    outputfile = ''
    startfrom = 1
    numsyms = 100
    generateonly = False

    try:
        opts, args = getopt.getopt(argv, "hi:o:s:g:x", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('program.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('-' * 50)
            print('\n')
            print('test.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>\n')
            print('-i <inputfile>\t text file to train the network')
            print('-g <numchars>\t number of characters to generate in generate-only mode')
            print('-s <startfrom>\t training epoch to start from (file must exist)')
            print('\n\n')
            print('-' * 50)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            path = arg
        elif opt in ("-g", "--generate"):
            generateonly = True
            numsyms = int(arg)
        #elif opt in ("-x", "--generateonly"):
        #    generateonly = True
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-s", "--startepo"):
            startfrom = int(arg)

    print('Input file is %s' % path)
    print('Output file is %s' % outputfile)
    print('Starting from epoch %s' % startfrom)
    print('Generative model only %s ' % generateonly)
    print('Symbols to generate %d ' % numsyms)

    maxlen = 5

    # load data and build vocab
    voc = ut.loadText(path, bychar=False, lower=True)
    syms = set(voc[0])  # voc = (text, sym_indices, indices_sym)

    X, y = ut.buildSkipgram(voc, maxlen=20, step=5)
    #X, y = ut.buildTrainingSequences(voc, maxlen=maxlen, step=1)
    print('input shape (%s, %s)' % (X.shape, y.shape))

    vocsize = len(syms)
    emb_dims = 128

    # build the model: 2 stacked LSTM
    print('Build model...')
    model = Sequential()
    layers = [maxlen, 100, 512, 1]  # (X,y) from buildSkipgram
    #layers = [maxlen, 100, 512, vocsize]  # (X,y) from buildSequences
    model.add(Embedding(vocsize, emb_dims))
    #model.add(LSTM(input_dim=layers[0], output_dim=layers[1], return_sequences=True))  # with next lstm layer
    model.add(LSTM(input_dim=layers[0], output_dim=layers[1],
                   return_sequences=False))  # with no more layers
    model.add(Dropout(0.5))
    #model.add(LSTM(layers[2], return_sequences=False))
    #model.add(Dropout(0.2))
    model.add(Dense(output_dim=layers[3]))  # for skipgram
    model.add(Activation('sigmoid'))  # for skipgram
    #model.add(Activation('softmax'))  # for sequences

    #model.compile(loss='categorical_crossentropy', optimizer='rmsprop')  # buildSequences
    model.compile(loss='mse', optimizer='rmsprop')  # buildSkipgram

    if generateonly:
        print('Loading model from epoch %d' % (startfrom))
        model.load_weights('results/lstm_word_based_epo_%d' % (startfrom))
        yn = 'y'  # default answer
        while (yn == 'y'):
            #ut.generate(model, voc, numchars=numsyms)
            ut.generateByWord(model, voc, numwords=numsyms)
            yn = ''
            while (yn not in ['y', 'n']):
                print("Generate more? [Y/n]: ")
                yn = str(raw_input()).lower()
    else:
        # not generate-only mode
        # train the model, output generated text after each iteration
        for iteration in range(startfrom, 400, 10):
            print()
            print('-' * 50)
            print('Starting from epoch %d' % iteration)
            if iteration >= 2:
                print('-' * 50)
                print('Loading model from epoch %d' % (iteration - 1))
                model.load_weights('results/lstm_emb_word_based_epo_%d' % (iteration - 1))
            model.fit(X, y, batch_size=128, nb_epoch=10)
            model.save_weights('results/lstm_emb_word_based_epo_%d' % (10 + iteration - 1),
                               overwrite=True)

            print('Extracting embeddings')
            emb = model.layers[0]
            embeddings = emb.W.get_value()
            print('embeddings shape', embeddings.shape)

            print('Saving embeddings and vocabulary for t-SNE')
            np.save('results/lstm_embeddings', embeddings)
            #np.save('results/vocab_embeddings', voc[1])
            ut.saveStuff(voc[1], 'results/vocab_embeddings')
from keras.datasets.data_utils import get_file
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
import numpy as np
import random
import sys
import getopt
import os.path

import utils as ut

path = 'data/alice.txt'
maxlen = 20

# load data and build vocab
voc = ut.loadText(path, bychar=True)
syms = set(voc[0])
X, y = ut.buildTrainingSet(voc, maxlen=maxlen, step=3)

# build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
#model.add(Embedding(len(syms), 128))
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(syms))))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(len(syms)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
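# For reference, a minimal sketch of the kind of (X, y) encoding the model above
# expects from ut.buildTrainingSet: sliding windows of `maxlen` characters,
# one-hot encoded over the vocabulary, with the following character as the
# target. This illustrates the expected shapes only; it is not the actual
# implementation in utils.
def onehotWindows(text, sym_indices, maxlen=20, step=3):
    nsyms = len(sym_indices)
    starts = range(0, len(text) - maxlen, step)
    X = np.zeros((len(starts), maxlen, nsyms), dtype=np.bool_)
    y = np.zeros((len(starts), nsyms), dtype=np.bool_)
    for i, start in enumerate(starts):
        for t, ch in enumerate(text[start:start + maxlen]):
            X[i, t, sym_indices[ch]] = 1
        # the character right after the window is the prediction target
        y[i, sym_indices[text[start + maxlen]]] = 1
    return X, y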
def main(argv):
    # default values
    #path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
    path = ''
    outputfile = ''
    startfrom = 1
    numsyms = 100
    generateonly = False

    try:
        opts, args = getopt.getopt(argv, "hi:o:s:g:x", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('program.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('-' * 50)
            print('\n')
            print('test.py -i <inputfile> -o <outputfile> -g <numchars> -s <startfrom>\n')
            print('-i <inputfile>\t text file to train the network')
            print('-g <numchars>\t number of characters to generate in generate-only mode')
            print('-s <startfrom>\t training epoch to start from (file must exist)')
            print('\n\n')
            print('-' * 50)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            path = arg
        elif opt in ("-g", "--generate"):
            generateonly = True
            numsyms = int(arg)
        #elif opt in ("-x", "--generateonly"):
        #    generateonly = True
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-s", "--startepo"):
            startfrom = int(arg)

    print('Input file is %s' % path)
    print('Output file is %s' % outputfile)
    print('Starting from epoch %s' % startfrom)
    print('Generative model only %s ' % generateonly)
    print('Symbols to generate %d ' % numsyms)

    maxlen = 20  # alice

    # load data and build vocab
    voc = ut.loadText(path, bychar=True, lower=True)
    syms = set(voc[0])  # voc = (text, sym_indices, indices_sym)
    X, y = ut.buildTrainingSet(voc, maxlen=maxlen, step=3)

    # build the model: 2 stacked LSTM
    print('Build model...')
    model = Sequential()
    #model.add(Embedding(len(syms), 128))
    model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(syms))))
    model.add(Dropout(0.2))
    model.add(LSTM(512, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(len(syms)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    if generateonly:
        print('Loading model from epoch %d' % (startfrom))
        #model.load_weights('results/lstm_word_based_epo_%d' % (startfrom))
        model.load_weights('results/lstm_char_based/tweet_lstm_char_based_epo_%d' % (startfrom))
        yn = 'y'  # default answer
        while (yn == 'y'):
            ut.generate(model, voc, numchars=numsyms)
            #ut.generateByWord(model, voc, numwords=numsyms)
            yn = ''
            while (yn not in ['y', 'n']):
                print("Generate more? [Y/n]: ")
                yn = str(raw_input()).lower()
    else:
        # not generate-only mode
        # train the model, output generated text after each iteration
        for iteration in range(startfrom, 400):
            print()
            print('-' * 50)
            print('Starting from epoch %d' % iteration)
            if iteration >= 2:
                print('-' * 50)
                print('Loading model from epoch %d' % (iteration - 1))
                model.load_weights('results/lstm_char_based/tweet_lstm_char_based_epo_%d' % (iteration - 1))
            model.fit(X, y, batch_size=128, nb_epoch=1)
            model.save_weights('results/lstm_char_based/tweet_lstm_char_based_epo_%d' % iteration,
                               overwrite=True)
            # are we learning well? let's print some
            ut.generate(model, voc, numchars=42)
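# A hedged sketch of the sampling step a helper like ut.generate typically
# performs after model.predict(): rescale the softmax output by a temperature
# and draw the next character index. Illustration only, not the actual utils
# code; numpy is imported locally so the sketch stays self-contained.
import numpy as np

def sampleIndex(preds, temperature=1.0):
    preds = np.asarray(preds, dtype='float64')
    preds = np.log(preds + 1e-8) / temperature  # temperature-scaled log-probs
    expPreds = np.exp(preds)
    probs = expPreds / np.sum(expPreds)  # renormalise to a proper distribution
    # draw one sample from the multinomial and return the chosen symbol index
    return int(np.argmax(np.random.multinomial(1, probs)))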