def save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr):
    global _lastfilename
    filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr))
    logging.info("Writing translation_model to %s..." % filename)
    logging.info(stats())
    cPickle.dump(translation_model, myopen(filename, "wb"), protocol=-1)
    logging.info("...done writing translation_model to %s" % filename)
    logging.info(stats())
#    if _lastfilename is not None:
#        logging.info("Removing old translation_model %s..." % _lastfilename)
#        try:
#            os.remove(_lastfilename)
#            logging.info("...removed %s" % _lastfilename)
#        except:
#            logging.info("Could NOT remove %s" % _lastfilename)
    _lastfilename = filename
    common.json.dumpfile((cnt, lastcnt, epoch, filename), os.path.join(rundir, "trainstate.json"))
    filename = os.path.join(rundir, "newkeystr.txt")
    myopen(filename, "wt").write(newkeystr)
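Every snippet in this section routes its file I/O through myopen from common.file, whose definition is not shown here. Below is a minimal sketch of what such a helper plausibly does, assuming it simply dispatches on the filename extension so that gzip-compressed files (the .pkl.gz and .txt.gz paths used throughout) open transparently; the real common.file.myopen may behave differently.

# Hypothetical stand-in for common.file.myopen, consistent with how it is
# called in these snippets (myopen(filename) and myopen(filename, mode)).
# This is an assumption, not the actual implementation.
import gzip
import bz2

def myopen_sketch(filename, mode="r"):
    # Dispatch on extension; anything else falls through to the builtin open().
    if filename.endswith(".gz"):
        return gzip.open(filename, mode)
    if filename.endswith(".bz2"):
        return bz2.BZ2File(filename, mode)
    return open(filename, mode)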
def createlibsvmfile(model, depth, datafiles, dataout):
    print >> sys.stderr, 'Creating libsvm file %s (model=%s, depth=%d, datafiles=%s)...' % (repr(dataout), repr(model), depth, datafiles)
    print >> sys.stderr, stats()
    outputs = [model.layers[depth].out]
    func = theano.function([model.inp], outputs)
    f = myopen(datafiles[0], 'r')
    instances = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
    f.close()
    f = myopen(datafiles[1], 'r')
    labels = numpy.asarray(cPickle.load(f), dtype='int64')
    f.close()
    f = open(dataout, 'w')
    for i in range(globalstate.NB_MAX_TRAINING_EXAMPLES_SVM / globalstate.BATCH_CREATION_LIBSVM):
        textr = ''
        rep = func(instances[globalstate.BATCH_CREATION_LIBSVM * i:globalstate.BATCH_CREATION_LIBSVM * (i + 1), :])[0]
        for l in range(rep.shape[0]):
            textr += '%s ' % labels[globalstate.BATCH_CREATION_LIBSVM * i + l]
            idx = rep[l, :].nonzero()[0]
            for j, v in zip(idx, rep[l, idx]):
                textr += '%s:%s ' % (j, v)
            textr += '\n'
        f.write(textr)
    del instances, labels
    f.close()
    print >> sys.stderr, "...done creating libsvm files"
    print >> sys.stderr, stats()
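For reference, each line that createlibsvmfile writes follows the sparse libsvm convention: the example's label, then index:value pairs for the nonzero units of the chosen layer's output. The values below are fabricated purely for illustration.

# One output line per example, e.g. (made-up values):
#     3 0:0.127 7:1.952 42:0.004
# i.e. "<label> <unit_index>:<activation> ...", one pair per nonzero unit.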
def all_training_examples_cached():
    global _all_examples
    if _all_examples is None:
        try:
            _all_examples, cnt = cPickle.load(myopen(training_examples_cache_filename()))
            assert len(_all_examples) == cnt
            logging.info("Successfully read %d training examples from %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
        except:
            logging.info("(Couldn't read training examples from %s, sorry)" % (training_examples_cache_filename()))
            logging.info("Caching all training examples...")
            logging.info(stats())
            _all_examples = []
            for l1, l2, f1, f2, falign in bicorpora_filenames():
                for e in get_training_biexample(l1, l2, f1, f2, falign):
                    _all_examples.append(e)
                    if len(_all_examples) % 10000 == 0:
                        logging.info("\tcurrently have read %d training examples" % len(_all_examples))
                        logging.info(stats())
            random.shuffle(_all_examples)
            logging.info("...done caching all %d training examples" % len(_all_examples))
            logging.info(stats())
            cnt = len(_all_examples)
            cPickle.dump((_all_examples, cnt), myopen(training_examples_cache_filename(), "wb"), protocol=-1)
            assert len(_all_examples) == cnt
            logging.info("Wrote %d training examples to %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
    assert _all_examples is not None
    return _all_examples
def createlibsvmfile(model, datafiles, dataout):
    print >> sys.stderr, 'Creating libsvm file %s (model=%s, datafiles=%s)...' % (repr(dataout), repr(model), datafiles)
    print >> sys.stderr, stats()
    x = T.dmatrix()
    params = [T.dmatrix(), T.dmatrix(), T.dvector(), T.dvector()]
    model.x = x
    model.W, model.W_prime, model.b, model.b_prime = params
    model.params = [model.W, model.W_prime, model.b, model.b_prime]
    outputs = [model.get_hidden_values(model.x)]
    func = theano.function([model.x] + params, outputs)
#    print >> sys.stderr, 'REMOVEME: about to read'
#    print >> sys.stderr, stats()
    f = myopen(datafiles[0], 'r')
    instances = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
    f.close()
    f = myopen(datafiles[1], 'r')
    labels = numpy.asarray(cPickle.load(f), dtype='int64')
    f.close()
    f = myopen(dataout, 'w')
#    print >> sys.stderr, 'REMOVEME: about to iterate'
#    print >> sys.stderr, stats()
#    params = [model.Wvalue, model.W_primevalue, model.bvalue, model.b_primevalue]
    for i in range(globalstate.NB_MAX_TRAINING_EXAMPLES_SVM / globalstate.BATCH_CREATION_LIBSVM):
#        print >> sys.stderr, 'REMOVEME: about to do %d' % i
#        print >> sys.stderr, stats()
        textr = ''
        assert globalstate.BATCH_CREATION_LIBSVM == 1   # Don't want to select indices from more than one example
        x = instances[globalstate.BATCH_CREATION_LIBSVM * i:globalstate.BATCH_CREATION_LIBSVM * (i + 1), :]
        nonzeros = frozenset(x.nonzero()[1])
#        print >> sys.stderr, nonzeros
#        print >> sys.stderr, len(nonzeros)
        indices = list(nonzeros)
#        # TODO: Don't duplicate this code, which also appears about one hundred lines down.
#        x = x[:,indices]
#        params = [model.Wvalue[indices], model.W_primevalue[:,indices], model.bvalue, model.b_primevalue[indices]]
#        rep = func(x, *params)[0]
        rep = func(x[:, indices], model.Wvalue[indices], model.W_primevalue[:, indices], model.bvalue, model.b_primevalue[indices])[0]
        for l in range(rep.shape[0]):
            textr += '%s ' % labels[globalstate.BATCH_CREATION_LIBSVM * i + l]
            idx = rep[l, :].nonzero()[0]
            for j, v in zip(idx, rep[l, idx]):
                textr += '%s:%s ' % (j, v)
            textr += '\n'
        f.write(textr)
    del instances, labels
    f.close()
    print >> sys.stderr, "...done creating libsvm files"
    print >> sys.stderr, stats()
def save(params, rundir, best_validation_accuracy, best_validation_at):
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("attardi07_english_ptb")
    HLAYERS = HYPERPARAMETERS["hidden layers"]
    if HLAYERS == 2:
        (w1, b1, wh, bh, w2, b2) = params
        cPickle.dump((w1, b1, wh, bh, w2, b2), myopen(_filename(rundir), "w"), protocol=-1)
    else:
        (w1, b1, w2, b2) = params
        cPickle.dump((w1, b1, w2, b2), myopen(_filename(rundir), "w"), protocol=-1)
    myopen(join(rundir, "best-model-validation.txt"), "w").write("Accuracy %.2f%% after %d updates" % (best_validation_accuracy * 100, best_validation_at))
def state_save():
    if HLAYERS == 2:
        cPickle.dump((w1, b1, wh, bh, w2, b2), myopen(join(rundir, "best-model.pkl"), "w"), protocol=-1)
    else:
        cPickle.dump((w1, b1, w2, b2), myopen(join(rundir, "best-model.pkl"), "w"), protocol=-1)
    myopen(join(rundir, "best-model-validation.txt"), "w").write("Accuracy %.2f%% after %d updates" % (best_validation_accuracy * 100, best_validation_at))
def indexed_weights():
    global _indexed_weights
    if _indexed_weights is not None:
        return _indexed_weights
    print >> sys.stderr, len(wordmap.map), "=?=", HYPERPARAMETERS["VOCABULARY_SIZE"]
    assert len(wordmap.map) == HYPERPARAMETERS["VOCABULARY_SIZE"]
    if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0:
        _indexed_weights = [1 for id in range(wordmap.len)]
    elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1:
        from common.json import load
        from common.file import myopen
        ngrams_file = HYPERPARAMETERS["NGRAMS"][(HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"], HYPERPARAMETERS["VOCABULARY_SIZE"])]
        print >> sys.stderr, "Reading ngrams from", ngrams_file, "..."
        from collections import defaultdict
        ngramcnt = defaultdict(int)
        for (ngram, cnt) in load(myopen(ngrams_file)):
            assert len(ngram) == 1
            ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS["TRAINING_NOISE_SMOOTHING_ADDITION"]
        _indexed_weights = [ngramcnt[wordmap.str(id)] for id in range(len(wordmap.map))]
        _indexed_weights = build(_indexed_weights)
    else:
        assert 0
    return _indexed_weights
def load(rundir):
    print >> sys.stderr, "Loading state from %s..." % _filename(rundir)
    print >> sys.stderr, stats()
    m = cPickle.load(myopen(_filename(rundir), "r"))
    print >> sys.stderr, "...done loading state from %s" % _filename(rundir)
    print >> sys.stderr, stats()
    return m
def svm_validation(err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST):
    """
    Perform full SVM validation.
    """
    print >> sys.stderr, "Validating (err=%s,epoch=%s,model=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)..." % (err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()

    createlibsvmfile(model, datatrain, datatrainsave)
    createlibsvmfile(model, datatest, datatestsave)

    for trainsize in VALIDATION_TRAININGSIZE:
        print trainsize
        print VALIDATION_RUNS_FOR_EACH_TRAININGSIZE
        C, testerr, testerrdev, trainerr, trainerrdev = svm_validation_for_one_trainsize(trainsize, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE[`trainsize`], datatrainsave, datatestsave, PATH_SAVE)
        err[trainsize].update({epoch: (C, testerr, testerrdev, trainerr, trainerrdev)})

    for trainsize in VALIDATION_TRAININGSIZE:
        print >> sys.stderr, 'VALIDATION: epoch %d / trainsize %d / svm error' % (epoch, trainsize), err[trainsize][epoch]
    print >> sys.stderr, stats()

    if epoch != 0:
        f = myopen('err.pkl', 'w')
        for trainsize in VALIDATION_TRAININGSIZE:
            cPickle.dump(err[trainsize], f, -1)
        f.close()
    print >> sys.stderr, "...done validating (err=%s,epoch=%s,model=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)" % (err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()
def write(_wordmap_new):
    """
    Write the word ID map, passed as a parameter.
    """
    global _wordmap
    assert _wordmap is None
    _wordmap = _wordmap_new
    print >> sys.stderr, "Writing word map with %d words to %s..." % (_wordmap.len, _wordmap_filename())
    cPickle.dump(_wordmap, myopen(_wordmap_filename(), "w"))
def write(_targetmap_new, name=""):
    """
    Write the target map, passed as a parameter.
    """
    global _targetmap
    assert name not in _targetmap
    _targetmap[name] = _targetmap_new
    f = _targetmap_filename(name=name)
    print >> sys.stderr, "Writing target map to %s..." % f
    cPickle.dump(_targetmap[name], myopen(f, "w"))
def get_example(f):
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    for l in myopen(f):
        prevwords = []
        for w in string.split(l):
            w = string.strip(w)
            prevwords.append(w)
            if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
def targetmap(name=""):
    global _targetmap
    if name not in _targetmap:
        f = _targetmap_filename(name=name)
        print >> sys.stderr, "Reading target map from %s..." % f
        print >> sys.stderr, stats()
        _targetmap[name] = cPickle.load(myopen(f))
        print >> sys.stderr, "...done reading target map from %s" % f
        print >> sys.stderr, stats()
    return _targetmap[name]
def svm_validation(err, reconstruction_error, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST, RULE):
    """
    Perform full SVM validation.
    """
    global TRAINFUNC
    print >> sys.stderr, "Validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)..." % (err, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()

    # Call with noiselevel = None before running the SVM.
    # No noise because we want the exact representation for each instance.
    rebuildunsup(model, depth, ACT, LR, None, BATCHSIZE, train, RULE)

    createlibsvmfile(model, depth, datatrain, datatrainsave)
    createlibsvmfile(model, depth, datatest, datatestsave)

    for trainsize in VALIDATION_TRAININGSIZE:
        print trainsize
        print VALIDATION_RUNS_FOR_EACH_TRAININGSIZE
        C, testerr, testerrdev, trainerr, trainerrdev, testerrnew, testerrnewdev, trainerrnew, trainerrnewdev = \
            svm_validation_for_one_trainsize(trainsize, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE[`trainsize`], datatrainsave, datatestsave, PATH_SAVE)
        err[trainsize].update({epoch: (C, testerr, testerrdev, trainerr, trainerrdev, testerrnew, testerrnewdev, trainerrnew, trainerrnewdev)})

    if epoch != 0:
        f = myopen(PATH_DATA + NAME_DATATEST + '_1.pkl.gz', 'r')
        train.container.value[:] = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
        f.close()

    # Now, restore TRAINFUNC with the original NOISE_LVL
    rebuildunsup(model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, RULE)

    reconstruction_error.update({epoch: TESTFUNC()})
    print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / reconstruction error (is this on test or train?): ' % (depth + 1, epoch), reconstruction_error[epoch]
    for trainsize in VALIDATION_TRAININGSIZE:
        print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / trainsize %d / svm error' % (depth + 1, epoch, trainsize), err[trainsize][epoch]
    print >> sys.stderr, stats()

    if epoch != 0:
        f = open('depth%serr.pkl' % depth, 'w')
        cPickle.dump(reconstruction_error, f, -1)
        for trainsize in VALIDATION_TRAININGSIZE:
            cPickle.dump(err[trainsize], f, -1)
        f.close()

        modeldir = os.path.join(PATH_SAVE, 'depth%spre%s' % (depth + 1, epoch))
        if not os.path.isdir(modeldir):
            os.mkdir(modeldir)
        model.save(modeldir)
        if RULE == 5:
            f = open(modeldir + '/auxsigma.pkl', 'w')
            cPickle.dump(model.auxsigma.value, f, -1)
            f.close()
    print >> sys.stderr, "...done validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)" % (err, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()
def load(rundir, newkeystr):
    """
    Read the directory and load the translation_model, the training count,
    the training epoch, and the training state.
    """
    global _lastfilename
    filename = os.path.join(rundir, "newkeystr.txt")
    assert newkeystr == myopen(filename).read()
    (cnt, lastcnt, epoch, filename) = common.json.loadfile(os.path.join(rundir, "trainstate.json"))
#    filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr))
    print >> sys.stderr, ("Reading translation_model from %s..." % filename)
    print >> sys.stderr, (stats())
    translation_model = cPickle.load(myopen(filename))
    print >> sys.stderr, ("...done reading translation_model from %s" % filename)
    print >> sys.stderr, (stats())
    _lastfilename = filename
    return (translation_model, cnt, lastcnt, epoch)
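Together with the save above, this load implements a simple checkpoint scheme: the model lives in a pickle named by the update count, and trainstate.json records (cnt, lastcnt, epoch, filename) so training can resume. The following is a self-contained toy version of that round trip, with the stdlib json module standing in for common.json (an assumption) and placeholder values throughout.

# Toy illustration of the checkpoint pattern above; the dict "model" and the
# counters are placeholders, and stdlib json stands in for common.json.
import os
import json
import cPickle
import tempfile

rundir = tempfile.mkdtemp()
model = {"W": [0.1, 0.2, 0.3]}
cnt, lastcnt, epoch = 1000, 900, 2

filename = os.path.join(rundir, "translation_model-%d.pkl" % cnt)
cPickle.dump(model, open(filename, "wb"), protocol=-1)
f = open(os.path.join(rundir, "trainstate.json"), "w")
json.dump((cnt, lastcnt, epoch, filename), f)
f.close()

# Resuming reverses the two steps (json round-trips the tuple as a list):
cnt, lastcnt, epoch, filename = json.load(open(os.path.join(rundir, "trainstate.json")))
model = cPickle.load(open(filename, "rb"))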
def load(rundir, newkeystr):
    """
    Read the directory and load the model, the training count,
    the training epoch, and the training state.
    """
    global _lastfilename
    filename = os.path.join(rundir, "newkeystr.txt")
    assert newkeystr == myopen(filename).read()
    filename = os.path.join(rundir, "trainstate.pkl")
    (trainstate, cnt, epoch) = cPickle.load(myopen(filename))
    filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr))
    print >> sys.stderr, ("Reading model from %s..." % filename)
    print >> sys.stderr, (stats())
    model = cPickle.load(myopen(filename))
    print >> sys.stderr, ("...done reading model from %s" % filename)
    print >> sys.stderr, (stats())
    _lastfilename = filename
    return (model, cnt, epoch, trainstate)
def get_validation_example():
    from vocabulary import wordmap
    for l in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        prevwords = []
        for w in string.split(l):
            w = string.strip(w)
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                prevwords = []
def save(model, cnt, epoch, trainstate, rundir, newkeystr):
    global _lastfilename
    filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr))
    logging.info("Writing model to %s..." % filename)
    logging.info(stats())
    cPickle.dump(model, myopen(filename, "wb"), protocol=-1)
    logging.info("...done writing model to %s" % filename)
    logging.info(stats())
    if _lastfilename is not None:
        logging.info("Removing old model %s..." % _lastfilename)
        try:
            os.remove(_lastfilename)
            logging.info("...removed %s" % _lastfilename)
        except:
            logging.info("Could NOT remove %s" % _lastfilename)
    _lastfilename = filename
    filename = os.path.join(rundir, "trainstate.pkl")
    cPickle.dump((trainstate, cnt, epoch), myopen(filename, "wb"), protocol=-1)
    filename = os.path.join(rundir, "newkeystr.txt")
    myopen(filename, "wt").write(newkeystr)
def __iter__(self):
    from vocabulary import wordmap
    self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    self.count = 0
    for l in myopen(self.filename):
        prevwords = []
        for w in string.split(l):
            w = string.strip(w)
            id = None
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    self.count += 1
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                prevwords = []
def svm_validation_for_one_trainsize_and_one_C(C, nbinputs, numruns, datatrainsave, datatestsave, PATH_SAVE):
    """
    Train an SVM using some C on nbinputs training examples, for numruns runs.
    Return: testerr, testerrdev, trainerr, trainerrdev
    """
    print >> sys.stderr, "\t\tTraining SVM with C=%f, nbinputs=%d, numruns=%d" % (C, nbinputs, numruns)
    os.system('%s -s 4 -c %s -l %s -r %s -q %s %s %s > /dev/null 2> /dev/null' % (globalstate.SVMRUNALL_PATH, C, nbinputs, numruns, datatrainsave, datatestsave, PATH_SAVE + '/currentsvm.txt'))
    results = myopen(PATH_SAVE + '/currentsvm.txt', 'r').readline()[:-1].split(' ')
    os.remove(PATH_SAVE + '/currentsvm.txt')
    trainerr = float(results[1])
    trainerrdev = float(results[2])
    testerr = float(results[3])
    testerrdev = float(results[4])
    return testerr, testerrdev, trainerr, trainerrdev
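The parser above expects PATH_SAVE/currentsvm.txt to contain a single space-separated line of at least five fields: field 0 is ignored, and fields 1 through 4 are read as train error, train error deviation, test error, and test error deviation. An illustrative line with made-up numbers:

# <ignored> 0.1473 0.0042 0.1891 0.0077
#           ^trainerr ^trainerrdev ^testerr ^testerrdev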
def get_validation_example():
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    from vocabulary import wordmap
    for l in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]):
        prevwords = []
        for w in string.split(l):
            w = string.strip(w)
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # If we can learn an unknown word token, we should
                # delexicalize the word, not discard the example!
                if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                    assert 0
                prevwords = []
def __iter__(self):
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    from vocabulary import wordmap
    self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    self.count = 0
    for l in myopen(self.filename):
        prevwords = []
        for w in string.split(l):
            w = string.strip(w)
            id = None
            if wordmap.exists(w):
                prevwords.append(wordmap.id(w))
                if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]:
                    self.count += 1
                    yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:]
            else:
                # If we can learn an unknown word token, we should
                # delexicalize the word, not discard the example!
                if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]:
                    assert 0
                prevwords = []
def trainingsentences():
    """
    For each line (sentence) in the training data, transform it into a list of token IDs.
    """
    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    count = 0
    for l in myopen(filename):
        tokens = []
        for w in string.split(l):
            w = string.strip(w)
            # Not exactly clear what to do if the word isn't in the vocab.
            assert wordmap.exists(w)
            tokens.append(wordmap.id(w))
        yield tokens
        count += 1
        if count % 1000 == 0:
            logging.info("Read %d lines from training file %s..." % (count, filename))
            logging.info(stats())
#!/bin/env python

import sys

brownfile = "/u/turian/data/share/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1-v3.txt.gz"
prefixes = [4, 6, 10, 20]

from common.file import myopen
import string

word_to_cluster = {}
for l in myopen(brownfile):
    cluster, word, cnt = string.split(l)
    word_to_cluster[word] = cluster

def output_features(fo, seq):
    for i in range(2, len(seq) - 2):
        fs = []
        fs.append('U00=%s' % seq[i-2][0])
        fs.append('U01=%s' % seq[i-1][0])
        fs.append('U02=%s' % seq[i][0])
        fs.append('U03=%s' % seq[i+1][0])
        fs.append('U04=%s' % seq[i+2][0])
        for name, pos in zip(["U00", "U01", "U02", "U03", "U04"], [i-2, i-1, i, i+1, i+2]):
            if seq[pos][0] not in word_to_cluster:
                continue
            for p in prefixes:
                fs.append("%sbp%d=%s" % (name, p, word_to_cluster[seq[pos][0]][:p]))
        fs.append('U05=%s/%s' % (seq[i-1][0], seq[i][0]))
        fs.append('U06=%s/%s' % (seq[i][0], seq[i+1][0]))
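Each entry appended to fs is a CRF-style attribute string: U00 through U04 are the raw tokens in a five-word window, the bp features add Brown-cluster bit-string prefixes of lengths 4, 6, 10, and 20, and U05/U06 are adjacent-word bigrams. The tokens and cluster path below are fabricated, purely to illustrate the format.

# Example attributes for one position (made-up tokens, cluster path "110100..."):
#     U02=vinken  U02bp4=1101  U02bp6=110100  U05=pierre/vinken  U06=vinken/will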
def dumpfile(object, filename):
    """
    Dump JSON to a filename.
    """
    return dump(object, myopen(filename, "wb"))
print >> sys.stderr, cmd
print >> sys.stderr, stats()
os.system(cmd)
print >> sys.stderr, stats()

prefixes = [int(s) for s in string.split(options.prefixes, sep=",")]

if options.brown is None:
    options.brown = []
word_to_cluster = []
for i, brownfile in enumerate(options.brown):
    print >> sys.stderr, "Reading Brown file: %s" % brownfile
    word_to_cluster.append({})
    assert len(word_to_cluster) == i + 1
    for l in myopen(brownfile):
        cluster, word, cnt = string.split(l)
        word_to_cluster[i][word] = cluster

if options.embedding is None:
    options.embedding = []
word_to_embedding = []
for i, embeddingfile in enumerate(options.embedding):
    print >> sys.stderr, "Reading Embedding file: %s" % embeddingfile
    word_to_embedding.append({})
    assert len(word_to_embedding) == i + 1
    for l in myopen(embeddingfile):
        sp = string.split(l)
        word_to_embedding[i][sp[0]] = [float(v) * options.embeddingscale for v in sp[1:]]
assert len(word_to_embedding) == 0
""" Automatically load the wordmap, if available. """ import cPickle from common.file import myopen import sys def _wordmap_filename(name): import common.hyperparameters, common.options HYPERPARAMETERS = common.hyperparameters.read("language-model") return HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_IDMAP_FILE"] wordmap = None try: wordmap = cPickle.load(myopen(_wordmap_filename())) wordmap.str = wordmap.key except: pass def write(wordmap, name=""): """ Write the word ID map, passed as a parameter. """ print >>sys.stderr, "Writing word map to %s..." % _wordmap_filename(name) cPickle.dump(wordmap, myopen(_wordmap_filename(name), "w"))
HYPERPARAMETERS = common.hyperparameters.read("attardi07_english_ptb")
common.options.reparse(HYPERPARAMETERS)

random.seed(HYPERPARAMETERS["random seed"])

from common.file import myopen
import string

Tf = HYPERPARAMETERS["train examples file"]
Vf = HYPERPARAMETERS["validation examples file"]
T = open(Tf, "wt")
V = open(Vf, "wt")
print "Writing to %s and %s" % (Tf, Vf)

ex = ""
for l in myopen(HYPERPARAMETERS["original examples file"]):
    ex += l
    if string.strip(l) == "":
        if random.random() < HYPERPARAMETERS["validation example likelihood"]:
            V.write(ex)
        else:
            T.write(ex)
        ex = ""
if ex != "":
    if random.random() < HYPERPARAMETERS["validation example likelihood"]:
        V.write(ex)
    else:
        T.write(ex)
WEIGHT_REGULARIZATION_COEFF = oldstate.weight_regularization_coeff[:-1] + WEIGHT_REGULARIZATION_COEFF
NEPOCHS = oldstate.nepochs + NEPOCHS
LR = oldstate.lr + LR
NOISE_LVL = oldstate.noise_lvl + NOISE_LVL
EPOCHSTEST = oldstate.epochstest + EPOCHSTEST
state.bestrec = oldstate.bestrec
state.bestrecepoch = oldstate.bestrecepoch
del oldstate

#if 'rectifier' in ACT:
#    assert ACT.index('rectifier') == DEPTH - 1
#    # Methods to stack rectifier are still in evaluation (5 different techniques)
#    # The best will be implemented in the script soon :).

filename = PATH_DATA + NAME_DATATEST + '_1.pkl.gz'
print filename
f = myopen(filename, 'r')
train = theano.shared(numpy.asarray(cPickle.load(f), dtype=theano.config.floatX))
f.close()
normalshape = train.value.shape

model = SDAE(numpy.random, RandomStreams(), DEPTH, True, act=ACT, n_hid=N_HID, n_out=5,
             sparsity=ACTIVATION_REGULARIZATION_COEFF, regularization=WEIGHT_REGULARIZATION_COEFF,
             wdreg=WEIGHT_REGULARIZATION_TYPE, spreg=ACTIVATION_REGULARIZATION_TYPE,
             n_inp=NINPUTS, noise=NOISE, tie=True)

# RELOAD previous model
for depth in range(depthbegin):
    print >> sys.stderr, 'reload layer', depth + 1
    print >> sys.stderr, stats()
    model.layers[depth].W.value = cPickle.load(open(MODEL_RELOAD + 'Layer%s_W.pkl' % (depth + 1), 'r'))
    model.layers[depth].b.value = cPickle.load(open(MODEL_RELOAD + 'Layer%s_b.pkl' % (depth + 1), 'r'))
    model.layers[depth].mask.value = cPickle.load(open(MODEL_RELOAD + 'Layer%s_mask.pkl' % (depth + 1), 'r'))
def loadhelp():
    f = myopen(self.filename, "rb")
    (self.map, self.reverse_map) = pickle.load(f)
def dump(self):
    """
    Dump the map to disk.
    """
    assert self.synchronize
    f = myopen(self.filename, "wb")
    pickle.dump((self.map, self.reverse_map), f)
def loadfile(filename):
    """
    Load JSON from a filename.
    """
    return load(myopen(filename))
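dumpfile and loadfile are the thin JSON wrappers that the checkpointing code earlier in this section calls through common.json, for example:

# Call sites from the translation_model save/load pair above:
#     common.json.dumpfile((cnt, lastcnt, epoch, filename),
#                          os.path.join(rundir, "trainstate.json"))
#     (cnt, lastcnt, epoch, filename) = common.json.loadfile(
#         os.path.join(rundir, "trainstate.json"))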
RandomStreams(state.seed)
numpy.random.seed(state.seed)

datatrain = (PATH_DATA + NAME_DATA + '_1.pkl.gz', PATH_DATA + NAME_LABEL + '_1.pkl.gz')
datatrainsave = PATH_SAVE + '/train.libsvm'
datatest = (PATH_DATA + NAME_DATATEST + '_1.pkl.gz', PATH_DATA + NAME_LABELTEST + '_1.pkl.gz')
datatestsave = PATH_SAVE + '/test.libsvm'

depthbegin = 0

# monitor best performance for reconstruction and classification
state.besterr = dict([(`trainsize`, []) for trainsize in VALIDATION_TRAININGSIZE])
state.besterrepoch = dict([(`trainsize`, []) for trainsize in VALIDATION_TRAININGSIZE])

filename = PATH_DATA + NAME_DATATEST + '_1.pkl.gz'
print filename
f = myopen(filename, 'r')
train = theano.shared(numpy.asarray(cPickle.load(f), dtype=theano.config.floatX))
f.close()
normalshape = train.value.shape

model = dA(numpy.random, RandomStreams(), input=None, n_visible=NINPUTS, n_hidden=N_HID, act=ACT, noise=NOISE)
# RELOAD previous model

channel.save()

err = dict([(trainsize, {}) for trainsize in VALIDATION_TRAININGSIZE])
rebuildunsup(model, LR, NOISE_LVL, ACTIVATION_REGULARIZATION_COEFF, WEIGHT_REGULARIZATION_COEFF, BATCHSIZE, train)

epoch = 0
if epoch in EPOCHSTEST:
    svm_validation(err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST)
import sys

assert len(sys.argv) == 2
embeddingsscale = float(sys.argv[1])

#embeddingsfile = "/u/turian/data/share/embeddings-20090916-rcv1.case-intact.LEARNING_RATE=0_000000001_--EMBEDDING_LEARNING_RATE=0_0000032.model-720000000.txt.gz"
embeddingsfile = "/u/turian/data/share/hlbl_reps_clean_1.rcv1.clean.tokenized-CoNLL03.case-intact.txt.gz"
brownfile = "/u/turian/data/share/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1-v3.txt.gz"
prefixes = [4, 6, 10, 20]

from common.file import myopen
import string

word_to_embedding = {}
for l in myopen(embeddingsfile):
    sp = string.split(l)
    word_to_embedding[sp[0]] = [float(v) * embeddingsscale for v in sp[1:]]

from common.file import myopen
import string

word_to_cluster = {}
for l in myopen(brownfile):
    cluster, word, cnt = string.split(l)
    word_to_cluster[word] = cluster

def output_features(fo, seq):
    for i in range(2, len(seq) - 2):
        fs = []
def wordmap():
    global _wordmap
    if _wordmap is None:
        _wordmap = cPickle.load(myopen(_wordmap_filename()))
        _wordmap.str = _wordmap.key
    return _wordmap
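Judging from the call sites elsewhere in this section, the unpickled wordmap object exposes at least exists(), id(), str(), map, and len; for example:

# Typical usage seen in the __iter__ / get_validation_example generators and
# in indexed_weights() above:
#     if wordmap.exists(w):
#         prevwords.append(wordmap.id(w))
#     ngramcnt[wordmap.str(id)]    # string lookup by ID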
def write(wordmap, name=""):
    """
    Write the word ID map, passed as a parameter.
    """
    print >> sys.stderr, "Writing word map to %s..." % _wordmap_filename(name)
    cPickle.dump(wordmap, myopen(_wordmap_filename(name), "w"))
""" Automatically load the wordmap, if available. """ import cPickle from common.file import myopen from hyperparameters import * import sys def _wordmap_filename(name): #import common.hyperparameters, common.options #HYPERPARAMETERS = common.hyperparameters.read("language-model") return HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_IDMAP_FILE"] wordmap = None try: wordmap = cPickle.load(myopen(_wordmap_filename(""))) wordmap.str = wordmap.key except: print sys.exc_info()[0], sys.exc_info()[1] def write(wordmap, name=""): """ Write the word ID map, passed as a parameter. """ print >> sys.stderr, "Writing word map to %s..." % _wordmap_filename(name) cPickle.dump(wordmap, myopen(_wordmap_filename(name), "w"))