def svm_validation(err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST):
    """
    Perform full SVM validation.
    """
    print >> sys.stderr, "Validating (err=%s,epoch=%s,model=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)..." % (err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()

    createlibsvmfile(model, datatrain, datatrainsave)
    createlibsvmfile(model, datatest, datatestsave)

    for trainsize in VALIDATION_TRAININGSIZE:
        print trainsize
        print VALIDATION_RUNS_FOR_EACH_TRAININGSIZE
        C, testerr, testerrdev, trainerr, trainerrdev = svm_validation_for_one_trainsize(trainsize, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE[repr(trainsize)], datatrainsave, datatestsave, PATH_SAVE)
        err[trainsize].update({epoch: (C, testerr, testerrdev, trainerr, trainerrdev)})

    for trainsize in VALIDATION_TRAININGSIZE:
        print >> sys.stderr, 'VALIDATION: epoch %d / trainsize %d / svm error' % (epoch, trainsize), err[trainsize][epoch]
    print >> sys.stderr, stats()

    if epoch != 0:
        f = open('err.pkl', 'w')
        for trainsize in VALIDATION_TRAININGSIZE:
            cPickle.dump(err[trainsize], f, -1)
        f.close()

    print >> sys.stderr, "...done validating (err=%s,epoch=%s,model=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)" % (err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()
def all_training_examples_cached():
    global _all_examples
    if _all_examples is None:
        try:
            _all_examples, cnt = cPickle.load(myopen(training_examples_cache_filename()))
            assert len(_all_examples) == cnt
            logging.info("Successfully read %d training examples from %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
        except:
            logging.info("(Couldn't read training examples from %s, sorry)" % (training_examples_cache_filename()))
            logging.info("Caching all training examples...")
            logging.info(stats())
            _all_examples = []
            for l1, l2, f1, f2, falign in bicorpora_filenames():
                for e in get_training_biexample(l1, l2, f1, f2, falign):
                    _all_examples.append(e)
                    if len(_all_examples) % 10000 == 0:
                        logging.info("\tcurrently have read %d training examples" % len(_all_examples))
                        logging.info(stats())
            random.shuffle(_all_examples)
            logging.info("...done caching all %d training examples" % len(_all_examples))
            logging.info(stats())
            cnt = len(_all_examples)
            cPickle.dump((_all_examples, cnt), myopen(training_examples_cache_filename(), "wb"), protocol=-1)
            assert len(_all_examples) == cnt
            logging.info("Wrote %d training examples to %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
    assert _all_examples is not None
    return _all_examples
def compute_representation_std(model, depth, PATH_DATA, NAME_DATA, NB_FILES):
    print >> sys.stderr, "Computing representation std for sigma initialization"
    print >> sys.stderr, stats()
    outputs = [model.layers[depth-1].out]
    func = theano.function([model.inp], outputs)
    # First pass: mean of the absolute representation, per hidden unit.
    sumvector = numpy.zeros((1, model.n_hid[depth-1]))
    count = 0
    for filenb in xrange(1, NB_FILES + 1):
        f = open(PATH_DATA + NAME_DATA + '_%s.pkl' % filenb, 'r')
        instances = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
        f.close()
        for i in range(instances.shape[0] / globalstate.BATCH_CREATION_LIBSVM):
            count += globalstate.BATCH_CREATION_LIBSVM
            rep = numpy.abs(func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1), :])[0])
            sumvector += rep.sum(0)
    meanvector = sumvector / float(count)
    # Second pass: standard deviation around that mean.
    sumvector = numpy.zeros((1, model.n_hid[depth-1]))
    count = 0
    for filenb in xrange(1, NB_FILES + 1):
        f = open(PATH_DATA + NAME_DATA + '_%s.pkl' % filenb, 'r')
        instances = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
        f.close()
        for i in range(instances.shape[0] / globalstate.BATCH_CREATION_LIBSVM):
            count += globalstate.BATCH_CREATION_LIBSVM
            rep = (numpy.abs(func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1), :])[0]) - meanvector) ** 2
            sumvector += rep.sum(0)
    stdvector = numpy.sqrt(sumvector / float(count))
    del instances
    print >> sys.stderr, "...done computing std"
    print >> sys.stderr, stats()
    return stdvector.reshape((model.n_hid[depth-1],))
def load(rundir):
    print >> sys.stderr, "Loading state from %s..." % _filename(rundir)
    print >> sys.stderr, stats()
    m = cPickle.load(myopen(_filename(rundir), "r"))
    print >> sys.stderr, "...done loading state from %s" % _filename(rundir)
    print >> sys.stderr, stats()
    return m
def createlibsvmfile(model, depth, datafiles, dataout):
    print >> sys.stderr, 'Creating libsvm file %s (model=%s, depth=%d, datafiles=%s)...' % (repr(dataout), repr(model), depth, datafiles)
    print >> sys.stderr, stats()
    outputs = [model.layers[depth].out]
    func = theano.function([model.inp], outputs)
    f = myopen(datafiles[0], 'r')
    instances = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
    f.close()
    f = myopen(datafiles[1], 'r')
    labels = numpy.asarray(cPickle.load(f), dtype='int64')
    f.close()
    f = open(dataout, 'w')
    for i in range(globalstate.NB_MAX_TRAINING_EXAMPLES_SVM / globalstate.BATCH_CREATION_LIBSVM):
        textr = ''
        rep = func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1), :])[0]
        for l in range(rep.shape[0]):
            # Sparse libsvm format: "<label> <index>:<value> ...".
            # Note that the feature indices written here are 0-based.
            textr += '%s ' % labels[globalstate.BATCH_CREATION_LIBSVM*i + l]
            idx = rep[l, :].nonzero()[0]
            for j, v in zip(idx, rep[l, idx]):
                textr += '%s:%s ' % (j, v)
            textr += '\n'
        f.write(textr)
    del instances, labels
    f.close()
    print >> sys.stderr, "...done creating libsvm files"
    print >> sys.stderr, stats()
def generate_context_vectors():
    """
    Generate the (random) context vectors.
    """
    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    if HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "gaussian":
        context_vectors = [numpy.random.normal(size=(wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"])) for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"]))]
    elif HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "ternary":
        NONZEROS = int(HYPERPARAMETERS["TERNARY_NON_ZERO_PERCENT"] * HYPERPARAMETERS["REPRESENTATION_SIZE"] + 0.5)
        logging.info("Generating %d nonzeros per %d-length random context vector" % (NONZEROS, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
        # Generate one set of context vectors per list in HYPERPARAMETERS["CONTEXT_TYPES"]
        context_vectors = []
        for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"])):
            logging.info("Generated %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
            logging.info(stats())
            thiscontext = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
            for j in range(wordmap.len):
                idxs = range(HYPERPARAMETERS["REPRESENTATION_SIZE"])
                random.shuffle(idxs)
                for k in idxs[:NONZEROS]:
                    thiscontext[j][k] = random.choice([-1, +1])
#                print thiscontext[j]
            context_vectors.append(thiscontext)
    else:
        assert 0
    logging.info("Done generating all %d context matrixes" % len(HYPERPARAMETERS["CONTEXT_TYPES"]))
    logging.info(stats())
    return context_vectors
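# A minimal, self-contained sketch (not part of the original code) of the
# "ternary" randomization above: each row gets exactly `nonzeros` entries
# drawn uniformly from {-1, +1} at random positions. The sizes below are
# made-up illustration values.
import numpy, random

def ternary_rows(nrows, size, nonzeros):
    rows = numpy.zeros((nrows, size))
    for j in range(nrows):
        idxs = range(size)
        random.shuffle(idxs)
        for k in idxs[:nonzeros]:
            rows[j][k] = random.choice([-1, +1])
    return rows

demo = ternary_rows(5, 100, 10)
assert (demo != 0).sum(1).tolist() == [10] * 5   # exactly 10 nonzeros per row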
def main(invideofilename, facefilename, outvideofilename):
    faces = Faces("")
    faces.__setstate__(common.json.loadfile(facefilename))

    dir = tempfile.mkdtemp()
    try:
        for i, f, totframes in common.video.frames(invideofilename, maxframes=len(faces.frames)):
            outf = os.path.join(dir, "out%05d.jpg" % i)
            print >> sys.stderr, "Processing %s to %s, image %s" % (f, outf, common.str.percent(i+1, totframes))
            print >> sys.stderr, stats()
            draw_faces(faces.frames[i], f, outf)

        # I learned this command from here: http://electron.mit.edu/~gsteele/ffmpeg/
        cmd = "ffmpeg -y -r 30 -b 10000k -i %s %s" % (os.path.join(dir, 'out%05d.jpg'), outvideofilename)
        print >> sys.stderr, "Stitching video together as %s" % outvideofilename
        print >> sys.stderr, cmd
        common.misc.runcmd(cmd)
        print >> sys.stderr, stats()
    finally:
        print >> sys.stderr, "Removing dir %s" % dir
        shutil.rmtree(dir)
def createlibsvmfile(model, datafiles, dataout):
    print >> sys.stderr, 'Creating libsvm file %s (model=%s, datafiles=%s)...' % (repr(dataout), repr(model), datafiles)
    print >> sys.stderr, stats()
    # Build a Theano function that takes the weights as *inputs*, so that we
    # can evaluate it on just the nonzero columns of each sparse instance.
    x = T.dmatrix()
    params = [T.dmatrix(), T.dmatrix(), T.dvector(), T.dvector()]
    model.x = x
    model.W, model.W_prime, model.b, model.b_prime = params
    model.params = [model.W, model.W_prime, model.b, model.b_prime]
    outputs = [model.get_hidden_values(model.x)]
    func = theano.function([model.x] + params, outputs)

    f = myopen(datafiles[0], 'r')
    instances = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
    f.close()
    f = myopen(datafiles[1], 'r')
    labels = numpy.asarray(cPickle.load(f), dtype='int64')
    f.close()

    f = myopen(dataout, 'w')
    for i in range(globalstate.NB_MAX_TRAINING_EXAMPLES_SVM / globalstate.BATCH_CREATION_LIBSVM):
        textr = ''
        assert globalstate.BATCH_CREATION_LIBSVM == 1   # Don't want to select indices from more than one example
        x = instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1), :]
        nonzeros = frozenset(x.nonzero()[1])
        indices = list(nonzeros)
        # TODO: Don't duplicate this code, which also appears about one hundred lines down.
        #    x = x[:, indices]
        #    params = [model.Wvalue[indices], model.W_primevalue[:, indices], model.bvalue, model.b_primevalue[indices]]
        #    rep = func(x, *params)[0]
        # Restrict the computation to the nonzero input dimensions.
        rep = func(x[:, indices], model.Wvalue[indices], model.W_primevalue[:, indices], model.bvalue, model.b_primevalue[indices])[0]
        for l in range(rep.shape[0]):
            textr += '%s ' % labels[globalstate.BATCH_CREATION_LIBSVM*i + l]
            idx = rep[l, :].nonzero()[0]
            for j, v in zip(idx, rep[l, idx]):
                textr += '%s:%s ' % (j, v)
            textr += '\n'
        f.write(textr)
    del instances, labels
    f.close()
    print >> sys.stderr, "...done creating libsvm files"
    print >> sys.stderr, stats()
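# A quick numpy sketch (illustration only, with made-up shapes) of why the
# nonzero-column restriction above is exact for the hidden values: x.W only
# receives contributions from columns where x is nonzero, so
# sigmoid(x[:, nz] . W[nz] + b) == sigmoid(x . W + b).
import numpy

def sigmoid(a):
    return 1. / (1. + numpy.exp(-a))

numpy.random.seed(0)
x = numpy.zeros((1, 1000))
x[0, [3, 17, 512]] = 1.                      # a sparse one-example batch
W = numpy.random.normal(size=(1000, 50))
b = numpy.random.normal(size=(50,))
nz = list(frozenset(x.nonzero()[1]))
full = sigmoid(numpy.dot(x, W) + b)
restricted = sigmoid(numpy.dot(x[:, nz], W[nz]) + b)
assert numpy.allclose(full, restricted)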
def targetmap(name=""):
    global _targetmap
    if name not in _targetmap:
        f = _targetmap_filename(name=name)
        print >> sys.stderr, "Reading target map from %s..." % f
        print >> sys.stderr, stats()
        _targetmap[name] = cPickle.load(myopen(f))
        print >> sys.stderr, "...done reading target map from %s" % f
        print >> sys.stderr, stats()
    return _targetmap[name]
def diagnostics(cnt, embeddings):
    logging.info(stats())
    vocab_size = embeddings.shape[0]
    idxs = range(vocab_size)
    random.shuffle(idxs)
    idxs = idxs[:100]
    embeddings_debug(embeddings[idxs], cnt, "rand 100 words")
    embeddings_debug(embeddings[:100], cnt, "top 100 words")
    embeddings_debug(embeddings[vocab_size/2-50:vocab_size/2+50], cnt, "mid 100 words")
    embeddings_debug(embeddings[-100:], cnt, "last 100 words")
    logging.info(stats())
def diagnostics(cnt, model):
    logging.info(stats())
    idxs = range(model.parameters.vocab_size)
    random.shuffle(idxs)
    idxs = idxs[:100]
    embeddings_debug(model.parameters.embeddings[idxs], cnt, "rand 100 words, model %s" % model.modelname)
    embeddings_debug(model.parameters.embeddings[:100], cnt, "top 100 words, model %s" % model.modelname)
    embeddings_debug(model.parameters.embeddings[model.parameters.vocab_size/2-50:model.parameters.vocab_size/2+50], cnt, "mid 100 words, model %s" % model.modelname)
    embeddings_debug(model.parameters.embeddings[-100:], cnt, "last 100 words, model %s" % model.modelname)
    weights_debug(model.parameters.hidden_weights.value, cnt, "hidden weights, model %s" % model.modelname)
    weights_debug(model.parameters.output_weights.value, cnt, "output weights, model %s" % model.modelname)
    logging.info(stats())
def validate(cnt):
    import math
    logranks = []
    logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt)
    logging.info(stats())
    i = 0
    for (i, ve) in enumerate(examples.get_validation_example()):
#        logging.info([wordmap.str(id) for id in ve])
        logranks.append(math.log(m.validate(ve)))
        if (i+1) % 10 == 0:
            logging.info("Training step %d, validating example %d, mean(logrank) = %.2f, stddev(logrank) = %.2f" % (cnt, i+1, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks))))
            logging.info(stats())
    logging.info("FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1))
    logging.info(stats())
def readwords(filename):
    print >> sys.stderr, "Processing %s" % filename
    i = 0
    for line in open(filename):
        i += 1
        if i % 100000 == 0:
            print >> sys.stderr, "Read line %d of %s..." % (i, filename)
            print >> sys.stderr, stats()
        for w in string.split(line):
            yield w
def batchproject(x, dimensions, seed, randomization_type):
    # Batch (cached, high-memory) random projection
    global randommatrix
    if randommatrix is None:
        print >> sys.stderr, "Creating random matrix of shape %s" % repr((x.shape[1], dimensions))
        print >> sys.stderr, stats()
        numpy.random.seed(seed)
        assert randomization_type == "gaussian"
        randommatrix = numpy.random.normal(size=(x.shape[1], dimensions))
    else:
        assert randommatrix.shape == (x.shape[1], dimensions)   # We assume the projection matrix won't change

    print >> sys.stderr, "Multiplying x by random matrix..."
    print >> sys.stderr, stats()
    newx = numpy.dot(x, randommatrix)
    print >> sys.stderr, "...done multiplying x by random matrix"
    print >> sys.stderr, stats()
    return newx
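# A small standalone sketch (illustration, not project code) of why the
# Gaussian projection above is useful: after scaling by 1/sqrt(dimensions),
# pairwise distances are approximately preserved (Johnson-Lindenstrauss).
import numpy
numpy.random.seed(0)
x = numpy.random.normal(size=(10, 5000))           # 10 points in 5000-d
R = numpy.random.normal(size=(5000, 500))          # projection to 500-d
newx = numpy.dot(x, R) / numpy.sqrt(500)
orig = numpy.linalg.norm(x[0] - x[1])
proj = numpy.linalg.norm(newx[0] - newx[1])
print orig, proj      # the two distances should agree closely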
def load(rundir, newkeystr):
    """
    Read the directory and load the translation_model, the training count,
    the training epoch, and the training state.
    """
    global _lastfilename
    filename = os.path.join(rundir, "newkeystr.txt")
    assert newkeystr == myopen(filename).read()
    (cnt, lastcnt, epoch, filename) = common.json.loadfile(os.path.join(rundir, "trainstate.json"))
#    filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr))
    print >> sys.stderr, "Reading translation_model from %s..." % filename
    print >> sys.stderr, stats()
    translation_model = cPickle.load(myopen(filename))
    print >> sys.stderr, "...done reading translation_model from %s" % filename
    print >> sys.stderr, stats()
    _lastfilename = filename
    return (translation_model, cnt, lastcnt, epoch)
def load(rundir, newkeystr):
    """
    Read the directory and load the model, the training count, the training
    epoch, and the training state.
    """
    global _lastfilename
    filename = os.path.join(rundir, "newkeystr.txt")
    assert newkeystr == myopen(filename).read()
    filename = os.path.join(rundir, "trainstate.pkl")
    (trainstate, cnt, epoch) = cPickle.load(myopen(filename))
    filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr))
    print >> sys.stderr, "Reading model from %s..." % filename
    print >> sys.stderr, stats()
    model = cPickle.load(myopen(filename))
    print >> sys.stderr, "...done reading model from %s" % filename
    print >> sys.stderr, stats()
    _lastfilename = filename
    return (model, cnt, epoch, trainstate)
def __setstate__(self, state):
    """
    @warning: We ignore the filename. If we wanted to be really fastidious,
    we would assume that HYPERPARAMETERS["TRAIN_SENTENCES"] might change.
    The only problem is that if we change filesystems, the filename might
    change just because the base file is in a different path. So we issue a
    warning if the current filename is different from the one stored in the state.
    """
    filename, count = state
    print >> sys.stderr, "__setstate__(%s)..." % repr(state)
    print >> sys.stderr, stats()
    # Fast-forward the iterator until we reach the saved position.
    iter = self.__iter__()
    while count != self.count:
#        print count, self.count
        iter.next()
    if self.filename != filename:
        assert self.filename == HYPERPARAMETERS["TRAIN_SENTENCES"]
        print >> sys.stderr, "self.filename %s != filename given to __setstate__ %s" % (self.filename, filename)
    print >> sys.stderr, "...__setstate__(%s)" % repr(state)
    print >> sys.stderr, stats()
def main(videofilename):
    faces = Faces(videofilename)
    for i, f, totframes in common.video.frames(videofilename):
#    for i, f, totframes in common.video.frames(videofilename, maxframes=1000):
        print >> sys.stderr, "Processing %s, image %s" % (f, common.str.percent(i+1, totframes))
        print >> sys.stderr, stats()
        image = cvLoadImage(f)
        faces.set_dimensions(image.width, image.height)
        faces.add_frame(i, detect_faces(image))
        if i % 100 == 0 and i != 0:
            print >> sys.stderr, common.json.dumps(faces.__getstate__())
    print common.json.dumps(faces.__getstate__())
def save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr):
    global _lastfilename
    filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr))
    logging.info("Writing translation_model to %s..." % filename)
    logging.info(stats())
    cPickle.dump(translation_model, myopen(filename, "wb"), protocol=-1)
    logging.info("...done writing translation_model to %s" % filename)
    logging.info(stats())
#    if _lastfilename is not None:
#        logging.info("Removing old translation_model %s..." % _lastfilename)
#        try:
#            os.remove(_lastfilename)
#            logging.info("...removed %s" % _lastfilename)
#        except:
#            logging.info("Could NOT remove %s" % _lastfilename)
    _lastfilename = filename

    common.json.dumpfile((cnt, lastcnt, epoch, filename), os.path.join(rundir, "trainstate.json"))

    filename = os.path.join(rundir, "newkeystr.txt")
    myopen(filename, "wt").write(newkeystr)
def main(invideofilename, facechainfilename, outvideofilename):
    faces = FaceChains()
    faces.__setstate__(common.json.loadfile(facechainfilename))

    dir = tempfile.mkdtemp()
    try:
        from collections import defaultdict
        frames = defaultdict(list)
        maxframe = 0
        # Invert the chains: collect, for each frame index, the faces to draw,
        # with a color that is stable per chain.
        for chain in faces.chains:
            color = ["red", "yellow", "green", "blue", "purple", "orange"][chain.__hash__() % 6]
            for i, face in chain.data:
                frames[i].append((face, color))
                if i > maxframe:
                    maxframe = i

        for i, f, totframes in common.video.frames(invideofilename, maxframes=maxframe):
            outf = os.path.join(dir, "out%05d.jpg" % i)
            print >> sys.stderr, "Processing %s to %s, image %s" % (f, outf, common.str.percent(i+1, totframes))
            print >> sys.stderr, stats()
            draw_faces(frames[i], f, outf)

        # I learned this command from here: http://electron.mit.edu/~gsteele/ffmpeg/
        cmd = "ffmpeg -y -r 30 -b 10000k -i %s %s" % (os.path.join(dir, 'out%05d.jpg'), outvideofilename)
        print >> sys.stderr, "Stitching video together as %s" % outvideofilename
        print >> sys.stderr, cmd
        common.misc.runcmd(cmd)
        print >> sys.stderr, stats()
    finally:
        print >> sys.stderr, "Removing dir %s" % dir
        shutil.rmtree(dir)
def save(model, cnt, epoch, trainstate, rundir, newkeystr):
    global _lastfilename
    filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr))
    logging.info("Writing model to %s..." % filename)
    logging.info(stats())
    cPickle.dump(model, myopen(filename, "wb"), protocol=-1)
    logging.info("...done writing model to %s" % filename)
    logging.info(stats())

    if _lastfilename is not None:
        logging.info("Removing old model %s..." % _lastfilename)
        try:
            os.remove(_lastfilename)
            logging.info("...removed %s" % _lastfilename)
        except:
            logging.info("Could NOT remove %s" % _lastfilename)
    _lastfilename = filename

    filename = os.path.join(rundir, "trainstate.pkl")
    cPickle.dump((trainstate, cnt, epoch), myopen(filename, "wb"), protocol=-1)

    filename = os.path.join(rundir, "newkeystr.txt")
    myopen(filename, "wt").write(newkeystr)
def get_training_example():
    """
    Get a training example, as an infinite loop.
    """
    HYPERPARAMETERS = common.hyperparameters.read("attardi07_english_ptb")
    epoch = 0
    examples = 0
    while 1:
        epoch += 1
        sys.stderr.write("STARTING EPOCH #%d (%d examples)\n" % (epoch, examples))
        sys.stderr.write(stats() + "\n")
        for l in common.file.myopen(HYPERPARAMETERS["train examples file"]):
            if l == "\n":
                continue
            examples += 1
            yield _example_from_string(l)
def rescal_rectifier_model(model, depth, PATH_DATA, NAME_DATA, NB_FILES, rule):
    print >> sys.stderr, "Rescaling of the rectifier model following the rule: %s" % rule
    print >> sys.stderr, stats()
    outputs = [model.layers[depth-1].out]
    func = theano.function([model.inp], outputs)
    # Track, per hidden unit, the maximum absolute activation over the data.
    max_value = numpy.zeros((1, model.n_hid[depth-1]))
    for filenb in xrange(1, NB_FILES + 1):
        f = open(PATH_DATA + NAME_DATA + '_%s.pkl' % filenb, 'r')
        instances = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
        f.close()
        for i in range(instances.shape[0] / globalstate.BATCH_CREATION_LIBSVM):
            rep = numpy.abs(func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1), :])[0])
            max_value = numpy.asarray([numpy.concatenate([max_value, rep]).max(0)])
        del instances
    if rule == 2:
        # Rule 2: rescale each hidden unit by its own maximum activation.
        model.layers[depth-1].W.container.value[:] = \
            numpy.asarray((model.layers[depth-1].W.value.T / max_value).T, dtype=theano.config.floatX)
        model.layers[depth-1].b.container.value[:] = \
            numpy.asarray((model.layers[depth-1].b.value / max_value[0, :]), dtype=theano.config.floatX)
    if rule == 1:
        # Rule 1: rescale all units by the single global maximum.
        model.layers[depth-1].W.container.value[:] = model.layers[depth-1].W.value / max_value.max()
        model.layers[depth-1].b.container.value[:] = model.layers[depth-1].b.value / max_value.max()
    print >> sys.stderr, "...done rescaling parameters"
    print >> sys.stderr, stats()
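# Why dividing W and b by the same per-unit constant rescales a rectifier
# unit's output exactly (illustration only, made-up shapes): relu is
# positively homogeneous, so relu((x.W)/m + b/m) == relu(x.W + b)/m.
import numpy
numpy.random.seed(1)
x = numpy.random.normal(size=(4, 8))
W = numpy.random.normal(size=(8, 3))
b = numpy.random.normal(size=(3,))
m = numpy.abs(numpy.dot(x, W) + b).max(0)          # per-unit max activation
relu = lambda a: numpy.maximum(a, 0.)
assert numpy.allclose(relu(numpy.dot(x, W / m) + b / m),
                      relu(numpy.dot(x, W) + b) / m)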
def read(f):
    """
    Generator for reading a wikiprep XML file from a file object.
    """
    print >> sys.stderr, "Reading %s..." % f
    print >> sys.stderr, stats()
    doc = {}
    cnt = 0
    for event, elem in cElementTree.iterparse(f):
        if elem.tag == "title":
            doc["title"] = "".join(elem.itertext())
        elif elem.tag == "text":
            doc["text"] = "".join(elem.itertext())
        elif elem.tag == "link":
            # Skip internal links
            if elem.get("url") is None:
                continue
            if "external links" not in doc:
                doc["external links"] = []
            doc["external links"].append([elem.get("url"), "".join(elem.itertext())])
        elif elem.tag == "links":
            doc["links"] = [int(i) for i in string.split("".join(elem.itertext()))]
        elif elem.tag == "categories":
            doc["categories"] = [int(i) for i in string.split("".join(elem.itertext()))]
        elif elem.tag == "page":
            doc["_id"] = int(elem.get("id"))
            cnt += 1
            yield doc
            doc = {}
            # Free the memory of the building tree
            elem.clear()
            if cnt % 1000 == 0:
                print >> sys.stderr, "Read %d articles from %s" % (cnt, f)
                print >> sys.stderr, stats()
    print >> sys.stderr, "...done reading %s" % f
    print >> sys.stderr, stats()
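# A minimal driver for read() (illustration only; the XML below is a made-up
# stand-in for real wikiprep output, and this assumes read() and its imports
# are in scope).
from StringIO import StringIO

sample = StringIO("""<pages>
<page id="1"><title>Foo</title><text>Body of Foo</text></page>
<page id="2"><title>Bar</title><text>Body of Bar</text></page>
</pages>""")
for doc in read(sample):
    print doc["_id"], doc["title"], len(doc["text"])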
def runjob(model, h, datafile, kfold, job):
    X, Y = cPickle.load(open(datafile))
    # TODO: Is it possible to get around doing this?
    # e.g. determine based upon "model"?
    # At the very least, this should be a command-line param.
    from locals import CONVERT_TO_DENSE
    if CONVERT_TO_DENSE:
        X = X.todense()
    print >> sys.stderr, "X = %s, Y = %s" % (X.shape, Y.shape)
    print >> sys.stderr, stats()
    try:
        train(model, h, X, Y, job, kfold)
        assert job.result is not None
        print "JOB", job
        sys.stdout.flush()
    except Exception, e:
        print >> sys.stderr, "Error %s %s on %s" % (type(e), e, (model, h))
def onlineproject(x, dimensions, seed, randomization_type):
    # Online (low-memory) random projection
    newx = numpy.zeros((x.shape[0], dimensions))
    nonzeros = x.nonzero()  # (list of rows, list of cols) of all nonzeros
    # (col, row) of all nonzeros.
    # We reorder like this so that we can group all columns together, and
    # look up the randomrow for each column feature only once.
    nonzero_colrow = [(nonzeros[1][l], nonzeros[0][l]) for l in range(len(nonzeros[0]))]
    nonzero_colrow.sort()
    nonzero_colrow.reverse()

    randrow_key = None
    randrow_values = None
    randrows_computed = 0
    for l, (col, row) in enumerate(nonzero_colrow):
        if randrow_key != col:
            randrow_key = col
            randrow_values = pyrandomprojection.randomrow(
                key=col,
                dimensions=dimensions,
                RANDOMIZATION_TYPE=randomization_type,
                RANDOM_SEED=seed)
            randrows_computed += 1
            if randrows_computed % 500 == 0:
                print >> sys.stderr, "Retrieved %s random rows thus far, done with %s of nonzeroes..." % (
                    percent(randrows_computed, x.shape[1]),
                    percent(l + 1, len(nonzero_colrow)))
                print >> sys.stderr, stats()
        newrow = x[row, col] * randrow_values
        assert newx[row].shape == newrow.shape
        newx[row] += newrow
    return newx
def draw_faces(faces, infilename, outfilename):
    pil_img = Image.open(infilename)
    # Draw red boxes around faces
    if faces:
        draw = ImageDraw.Draw(pil_img)
        for face in faces:
            face.draw(draw)
        del draw
    print >> sys.stderr, "Writing to %s" % outfilename
    print >> sys.stderr, stats()
    pil_img.save(outfilename, "JPEG")
def draw_faces(faces, infilename, outfilename):
    pil_img = Image.open(infilename)
    # Draw a colored box around each face
    draw = ImageDraw.Draw(pil_img)
    for face, color in faces:
        face.draw(draw, color=color)
    del draw
    print >> sys.stderr, "Writing to %s" % outfilename
    print >> sys.stderr, stats()
    pil_img.save(outfilename, "JPEG")
def trainingsentences():
    """
    For each line (sentence) in the training data, transform it into a list
    of token IDs.
    """
    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    count = 0
    for l in myopen(filename):
        tokens = []
        for w in string.split(l):
            w = string.strip(w)
            assert wordmap.exists(w)    # Not exactly clear what to do
                                        # if the word isn't in the vocab.
            tokens.append(wordmap.id(w))
        yield tokens
        count += 1
        if count % 1000 == 0:
            logging.info("Read %d lines from training file %s..." % (count, filename))
            logging.info(stats())
def batch_apply(f, x, batchsize=1024, verbose=True):
    """
    Slice x in batches of size batchsize, run f on each batch, and return the
    list of results.
    @warning: f should *NOT* return any indexes, because f receives index
    numbers that are wrong. (The indexes would have to be adjusted for the
    current batch offset.)
    """
    import sys
    from common.stats import stats
    ret = []
    start = 0
    end = batchsize
    while start < x.shape[0]:
        if end > x.shape[0]:
            end = x.shape[0]
        if verbose:
            print >> sys.stderr, "Running on %d:%d..." % (start, end)
            print >> sys.stderr, stats()
        tmpx = x[start:end]
        ret.append(f(tmpx))
        start += batchsize
        end += batchsize
    return ret
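# Usage sketch for batch_apply (illustration only; assumes the module context
# above, including common.stats, is available): summing a large array in
# batches and combining the per-batch results.
import numpy
x = numpy.arange(10000).reshape(10000, 1)
partial = batch_apply(lambda b: b.sum(), x, batchsize=1024, verbose=False)
assert sum(partial) == x.sum()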
#        o = graph.validatefn(x, N.array([y]), w1, b1, w2, b2)
#        (kl, softmax, argmax, presquashh) = o
#        print "new KL=%.3f, softmax=%s, argmax=%d" % (kl, softmax, argmax)
#        print "new KL=%.3f, argmax=%d" % (kl, argmax)

if cnt % HYPERPARAMETERS["examples per validation"] == 0:
    valacc, valstd = validate()
    sys.stderr.write("After %d training examples, validation accuracy: %.2f%%, stddev: %.2f%% (former best=%.2f%% at %d)\n" % (cnt, valacc * 100, valstd * 100, best_validation_accuracy * 100, best_validation_at))
    if best_validation_accuracy < valacc:
        best_validation_accuracy = valacc
        best_validation_at = cnt
        sys.stderr.write("NEW BEST VALIDATION ACCURACY. Saving state.\n")
        state_save()
    elif cnt > 2 * best_validation_at and cnt >= HYPERPARAMETERS["minimum training updates"]:
        sys.stderr.write("Have not beaten best validation accuracy for a while. Terminating training...\n")
        sys.stderr.write(stats() + "\n")
        break
if cnt % 1000 == 0:
    sys.stderr.write("After %d training examples, training accuracy (moving average): %.2f%%, stddev: %.2f%%\n" % (cnt, 100. * mvgavg_accuracy, 100. * math.sqrt(mvgavg_variance)))
    sys.stderr.write(stats() + "\n")
#    graph.COMPILE_MODE.print_summary()
def svm_validation_for_one_trainsize(nbinputs, numruns, datatrainsave, datatestsave, PATH_SAVE):
    """
    Train an SVM on nbinputs training examples, for numruns runs.
    Choose the value of C using a line search to minimize the testerr.
    Return: C, testerr, testerrdev, trainerr, trainerrdev

    MAXSTEPS is the number of steps performed in the line search.
    STEPFACTOR is the initial step size.
    """
    MAXSTEPS = globalstate.SVM_MAXSTEPS
    STEPFACTOR = globalstate.SVM_STEPFACTOR
    INITIALC = globalstate.SVM_INITIALC
    print >> sys.stderr, 'Starting SVM validation for %s examples (numruns=%d, datatrainsave=%s, datatestsave=%s, PATH_SAVE=%s, MAXSTEPS=%d, STEPFACTOR=%f, INITIALC=%f)...' % (nbinputs, numruns, datatrainsave, datatestsave, PATH_SAVE, MAXSTEPS, STEPFACTOR, INITIALC)
    print >> sys.stderr, stats()

    Ccurrent = INITIALC
    Cstepfactor = STEPFACTOR
    Cnew = Ccurrent * Cstepfactor
    C_to_allstats = {}
    Cbest = None

    while len(C_to_allstats) < MAXSTEPS:
        if Ccurrent not in C_to_allstats:
            # Compute the validation statistics for the current C
            testerr, testerrdev, trainerr, trainerrdev = svm_validation_for_one_trainsize_and_one_C(Ccurrent, nbinputs, numruns, datatrainsave, datatestsave, PATH_SAVE)
            C_to_allstats[Ccurrent] = (testerr, testerrdev, trainerr, trainerrdev)
        if Cnew not in C_to_allstats:
            # Compute the validation statistics for the next C
            testerr, testerrdev, trainerr, trainerrdev = svm_validation_for_one_trainsize_and_one_C(Cnew, nbinputs, numruns, datatrainsave, datatestsave, PATH_SAVE)
            C_to_allstats[Cnew] = (testerr, testerrdev, trainerr, trainerrdev)
        # If Cnew has a lower test err than Ccurrent, then continue stepping in this direction
        if C_to_allstats[Cnew][0] < C_to_allstats[Ccurrent][0]:
            print >> sys.stderr, "\ttesterr[Cnew %f] = %f < testerr[Ccurrent %f] = %f" % (Cnew, C_to_allstats[Cnew][0], Ccurrent, C_to_allstats[Ccurrent][0])
            if Cbest is None or C_to_allstats[Cnew][0] < C_to_allstats[Cbest][0]:
                Cbest = Cnew
                print >> sys.stderr, "\tNEW BEST: Cbest <= %f, testerr[Cbest] = %f" % (Cbest, C_to_allstats[Cbest][0])
            Ccurrent = Cnew
            Cnew *= Cstepfactor
            print >> sys.stderr, "\tPROCEED: Cstepfactor remains %f, Ccurrent is now %f, Cnew is now %f" % (Cstepfactor, Ccurrent, Cnew)
        # Else, reverse the direction and reduce the step size by sqrt.
        else:
            print >> sys.stderr, "\ttesterr[Cnew %f] = %f > testerr[Ccurrent %f] = %f" % (Cnew, C_to_allstats[Cnew][0], Ccurrent, C_to_allstats[Ccurrent][0])
            if Cbest is None or C_to_allstats[Ccurrent][0] < C_to_allstats[Cbest][0]:
                Cbest = Ccurrent
                print >> sys.stderr, "\tCbest <= %f, testerr[Cbest] = %f" % (Cbest, C_to_allstats[Cbest][0])
            Cstepfactor = 1. / math.sqrt(Cstepfactor)
            Cnew = Ccurrent * Cstepfactor
            print >> sys.stderr, "\tREVERSE: Cstepfactor is now %f, Ccurrent remains %f, Cnew is now %f" % (Cstepfactor, Ccurrent, Cnew)

    allC = C_to_allstats.keys()
    allC.sort()
    for C in allC:
        print >> sys.stderr, "\ttesterr[C %f] = %f" % (C, C_to_allstats[C][0]),
        if C == Cbest:
            print >> sys.stderr, " *best* (testerr = %f, testerrdev = %f, trainerr = %f, trainerrdev = %f)" % C_to_allstats[C]
        else:
            print >> sys.stderr, ""
    print >> sys.stderr, '...done with SVM validation for %s examples (numruns=%d, datatrainsave=%s, datatestsave=%s)' % (nbinputs, numruns, datatrainsave, datatestsave)
    print >> sys.stderr, stats()

    return [Cbest] + list(C_to_allstats[Cbest])
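# A toy re-implementation (illustration only, not project code) of the same
# search rule on a synthetic convex objective, to make the C line search
# above concrete: keep multiplying by the step factor while the objective
# improves; on a regression, flip direction and shrink the factor toward 1
# via a square root.
import math

def linesearch(f, x0=1.0, stepfactor=10.0, maxsteps=20):
    cache = {}
    def ff(x):
        if x not in cache:
            cache[x] = f(x)
        return cache[x]
    xcur, xnew, best = x0, x0 * stepfactor, None
    while len(cache) < maxsteps:
        if ff(xnew) < ff(xcur):
            if best is None or ff(xnew) < ff(best):
                best = xnew
            xcur, xnew = xnew, xnew * stepfactor
        else:
            if best is None or ff(xcur) < ff(best):
                best = xcur
            stepfactor = 1. / math.sqrt(stepfactor)
            xnew = xcur * stepfactor
    return best

# The minimum of (log10(x) - 2)**2 is at x = 100.
print linesearch(lambda x: (math.log10(x) - 2) ** 2)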
epoch = 0
if epoch in EPOCHSTEST:
    svm_validation(err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST)
    channel.save()

train_reconstruction_error_mvgavg = MovingAverage()
for epoch in xrange(1, NEPOCHS + 1):
    time1 = time.time()
    state.currentepoch = epoch
    for filenb in xrange(1, NB_FILES + 1):
        print >> sys.stderr, "\t\tAbout to read file %s..." % percent(filenb, NB_FILES)
        print >> sys.stderr, "\t\t", stats()
#        initial_file_time = time.time()
        f = open(PATH_DATA + NAME_DATA + '_%s.pkl' % filenb, 'r')
        object = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
        print >> sys.stderr, "\t\t...read file %s" % percent(filenb, NB_FILES)
        print >> sys.stderr, "\t\t", stats()
        # The last training file is not of the same shape as the other training files.
        # So, to avoid a GPU memory error, we want to make sure it is the same size.
        # In which case, we pad the matrix but keep track of how many n (instances) there actually are.
        # TODO: Also want to pad trainl
        if object.shape == normalshape:
            train.container.value[:] = object
            currentn = normalshape[0]
            del object
        else:
def train(model, h, X, Y, job, kfold):
    # TODO: These should be passed in as command-line parameters
    FOLDS = 5
#    FOLDS = 3
    EVALUATION_MEASURE = sklearn.metrics.f1_score

    if kfold:
        kf = KFold(X.shape[0], FOLDS, indices=True)
#    if kfold: kf = LeaveOneOut(X.shape[0], indices=True)
    else:
        assert 0

    start = time.clock()
    print >> sys.stderr, "trying %s %s" % (model, h)
    errs = []
    if kfold:
        for i, (train, test) in enumerate(kf):
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
            clf = model(**h)
            # TODO: What we should do is have a multiclass command-line parameter,
            # in which case we do the following:
            clf = OneVsRestClassifier(clf)
            clf.fit(X_train, y_train)
            # TODO: Run evals on train, for debugging?
#            for j in range(y_test.shape[0]):
#                probs = []
#                for k, est in enumerate(clf.estimators_):
#                    y_test_predict = est.predict_proba(X_test[j])
#                    probs.append((y_test_predict[0][1], k))
#                print "ACC", y_test[j][sorted(probs)[-1][1]]
#                sys.stdout.flush()
            y_test_predict = clf.predict(X_test)
            errs.append(EVALUATION_MEASURE(y_test, y_test_predict))
            print >> sys.stderr, "INTERMEDIATE kfold=%d/%d" % (i + 1, FOLDS), errs[-1], modelstr(clf)
            print >> sys.stderr, stats()
#            if errs[-1] < TASKMIN and i+1 < FOLDS:
#                if FORCE:
#                    print >> sys.stderr, "FORCE=True, otherwise we'd abort because err %f < taskmin %f" % (errs[-1], TASKMIN)
#                else:
#                    print >> sys.stderr, "ABORTING. err %f < taskmin %f" % (errs[-1], TASKMIN)
#                    job.result = False
#                    return
    else:
        assert 0
    end = time.clock()
    difftime = end - start

    if kfold:
        job.result = {"mean": numpy.mean(errs), "std": numpy.std(errs),
                      "95conf": numpy.mean(errs) - 1.96 * numpy.std(errs),
                      "min": numpy.min(errs), "folds": errs, "time": difftime}
        print >> sys.stderr, "kfold=%d" % FOLDS, "mean", numpy.mean(errs), "std", numpy.std(errs), "95conf", numpy.mean(errs) - 1.96 * numpy.std(errs), "min", numpy.min(errs), modelstr(clf)
        print "kfold=%d" % FOLDS, "mean", numpy.mean(errs), "std", numpy.std(errs), "95conf", numpy.mean(errs) - 1.96 * numpy.std(errs), "min", numpy.min(errs), modelstr(clf)
    else:
        assert 0
#        job.result = {"mean": numpy.mean(errs), "time": difftime}
#        print num, numpy.mean(errs), modelstr(clf)
    sys.stdout.flush()
    print >> sys.stderr, stats()
def svm_validation(err, reconstruction_error, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST, RULE):
    """
    Perform full SVM validation.
    """
    global TRAINFUNC
    print >> sys.stderr, "Validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)..." % (err, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()

    # Call with noiselevel = None before running the SVM.
    # No noise because we want the exact representation for each instance.
    rebuildunsup(model, depth, ACT, LR, None, BATCHSIZE, train, RULE)

    createlibsvmfile(model, depth, datatrain, datatrainsave)
    createlibsvmfile(model, depth, datatest, datatestsave)

    for trainsize in VALIDATION_TRAININGSIZE:
        print trainsize
        print VALIDATION_RUNS_FOR_EACH_TRAININGSIZE
        C, testerr, testerrdev, trainerr, trainerrdev, testerrnew, testerrnewdev, trainerrnew, trainerrnewdev = \
            svm_validation_for_one_trainsize(trainsize, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE[repr(trainsize)], datatrainsave, datatestsave, PATH_SAVE)
        err[trainsize].update({epoch: (C, testerr, testerrdev, trainerr, trainerrdev, testerrnew, testerrnewdev, trainerrnew, trainerrnewdev)})

    if epoch != 0:
        f = myopen(PATH_DATA + NAME_DATATEST + '_1.pkl.gz', 'r')
        train.container.value[:] = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
        f.close()

    # Now, restore TRAINFUNC with the original NOISE_LVL
    rebuildunsup(model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, RULE)

    reconstruction_error.update({epoch: TESTFUNC()})
    print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / reconstruction error (is this on test or train?): ' % (depth + 1, epoch), reconstruction_error[epoch]
    for trainsize in VALIDATION_TRAININGSIZE:
        print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / trainsize %d / svm error' % (depth + 1, epoch, trainsize), err[trainsize][epoch]
    print >> sys.stderr, stats()

    if epoch != 0:
        f = open('depth%serr.pkl' % depth, 'w')
        cPickle.dump(reconstruction_error, f, -1)
        for trainsize in VALIDATION_TRAININGSIZE:
            cPickle.dump(err[trainsize], f, -1)
        f.close()
        modeldir = os.path.join(PATH_SAVE, 'depth%spre%s' % (depth + 1, epoch))
        if not os.path.isdir(modeldir):
            os.mkdir(modeldir)
        model.save(modeldir)
        if RULE == 5:
            f = open(modeldir + '/auxsigma.pkl', 'w')
            cPickle.dump(model.auxsigma.value, f, -1)
            f.close()

    print >> sys.stderr, "...done validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)" % (err, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()
i = 0
print >> sys.stderr, "Reading lines from sys.stdin..."
for l in sys.stdin:
    i += 1
    if string.strip(l) == "":
        continue
    doc = Document()
    doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    if i % 10000 == 0:
        print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (i, writer.numDocs())
        print >> sys.stderr, stats()
#    if i > 100000: break
print >> sys.stderr, "Indexed a total of %d lines from stdin (%d documents in index)" % (i, writer.numDocs())

print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
print >> sys.stderr, stats()
writer.optimize()
print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()

# Capture the document count first: numDocs() cannot be called on a closed writer.
numdocs = writer.numDocs()
print >> sys.stderr, "Closing index of %d documents..." % numdocs
print >> sys.stderr, stats()
writer.close()
print >> sys.stderr, "...done closing index of %d documents" % numdocs
def run(cmd):
    print >> sys.stderr, cmd
    print >> sys.stderr, stats()
    os.system(cmd)
    print >> sys.stderr, stats()