def svm_validation(err, epoch, model, train,datatrain,datatrainsave,datatest,datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST):
    """
    Perform full SVM validation.
    """
    print >> sys.stderr, "Validating (err=%s,epoch=%s,model=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)..." % (err, epoch, model,train,datatrain,datatrainsave,datatest,datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()

    createlibsvmfile(model,datatrain,datatrainsave)
    createlibsvmfile(model,datatest,datatestsave)

    for trainsize in VALIDATION_TRAININGSIZE:
        print trainsize
        print VALIDATION_RUNS_FOR_EACH_TRAININGSIZE
        C,testerr,testerrdev,trainerr,trainerrdev = svm_validation_for_one_trainsize(trainsize,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE[repr(trainsize)],datatrainsave,datatestsave,PATH_SAVE)
        err[trainsize].update({epoch:(C,testerr,testerrdev,trainerr,trainerrdev)})

    for trainsize in VALIDATION_TRAININGSIZE:
        print >> sys.stderr, 'VALIDATION: epoch %d / trainsize %d / svm error' % ( epoch, trainsize) ,err[trainsize][epoch]
    print >> sys.stderr, stats()

    if epoch != 0:
        f = open('err.pkl','wb')
        for trainsize in VALIDATION_TRAININGSIZE:
            cPickle.dump(err[trainsize],f,-1)
        f.close()
    print >> sys.stderr, "...done validating (err=%s,epoch=%s,model=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)" % (err, epoch, model,train,datatrain,datatrainsave,datatest,datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()
def all_training_examples_cached():
    global _all_examples
    if _all_examples is None:
        try:
            _all_examples, cnt = cPickle.load(myopen(training_examples_cache_filename()))
            assert len(_all_examples) == cnt
            logging.info("Successfully read %d training examples from %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
        except Exception:
            logging.info("(Couldn't read training examples from %s, sorry)" % (training_examples_cache_filename()))
            logging.info("Caching all training examples...")
            logging.info(stats())
            _all_examples = []
            for l1, l2, f1, f2, falign in bicorpora_filenames():
                for e in get_training_biexample(l1, l2, f1, f2, falign):
                    _all_examples.append(e)
                    if len(_all_examples) % 10000 == 0:
                        logging.info("\tcurrently have read %d training examples" % len(_all_examples))
                        logging.info(stats())
            random.shuffle(_all_examples)
            logging.info("...done caching all %d training examples" % len(_all_examples))
            logging.info(stats())

            cnt = len(_all_examples)
            cPickle.dump((_all_examples, cnt), myopen(training_examples_cache_filename(), "wb"), protocol=-1)
            assert len(_all_examples) == cnt
            logging.info("Wrote %d training examples to %s" % (cnt, training_examples_cache_filename()))
            logging.info(stats())
    assert _all_examples is not None
    return _all_examples
def compute_representation_std(model,depth,PATH_DATA,NAME_DATA,NB_FILES):
    print >> sys.stderr, "Computing representation std for sigma initialization"
    print >> sys.stderr, stats()
    outputs = [model.layers[depth-1].out]
    func = theano.function([model.inp],outputs)
    sumvector = numpy.zeros((1,model.n_hid[depth-1]))
    count = 0
    for filenb in xrange(1,NB_FILES + 1):
        f =open(PATH_DATA + NAME_DATA +'_%s.pkl'%filenb,'r')
        instances = numpy.asarray(cPickle.load(f),dtype=theano.config.floatX)
        f.close()
        for i in range(instances.shape[0]/globalstate.BATCH_CREATION_LIBSVM):
            count += globalstate.BATCH_CREATION_LIBSVM
            rep = numpy.abs(func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1),:])[0])
            sumvector += rep.sum(0)
    meanvector = sumvector / float(count)
    sumvector = numpy.zeros((1,model.n_hid[depth-1]))
    count = 0
    for filenb in xrange(1,NB_FILES + 1):
        f =open(PATH_DATA + NAME_DATA +'_%s.pkl'%filenb,'r')
        instances = numpy.asarray(cPickle.load(f),dtype=theano.config.floatX)
        f.close()
        for i in range(instances.shape[0]/globalstate.BATCH_CREATION_LIBSVM):
            count += globalstate.BATCH_CREATION_LIBSVM
            rep = (numpy.abs(func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1),:])[0]) - meanvector)**2
            sumvector += rep.sum(0)
    stdvector = numpy.sqrt(sumvector / float(count))
    del instances
    print >> sys.stderr, "...done computing std"
    print >> sys.stderr, stats()
    return stdvector.reshape((model.n_hid[depth-1],))
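The two loops above make two streaming passes over the hidden representations: the first accumulates the per-unit mean of the absolute activations, the second accumulates squared deviations from that mean, giving a per-unit standard deviation. A minimal sketch of the same arithmetic on a toy matrix (plain numpy, no Theano or batching; the values are illustrative):

import numpy

rep = numpy.abs(numpy.random.randn(6, 3))   # toy |activations|: 6 instances, 3 hidden units

# pass 1: per-unit mean
meanvector = rep.sum(0) / float(rep.shape[0])
# pass 2: per-unit standard deviation around that mean
stdvector = numpy.sqrt(((rep - meanvector) ** 2).sum(0) / float(rep.shape[0]))

assert numpy.allclose(stdvector, rep.std(0))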
Example #4
def load(rundir):
    print >> sys.stderr, "Loading state from %s..." % _filename(rundir)
    print >> sys.stderr, stats()
    m = cPickle.load(myopen(_filename(rundir), "r"))
    print >> sys.stderr, "...done loading state from %s" % _filename(rundir)
    print >> sys.stderr, stats()
    return m
def createlibsvmfile(model,depth,datafiles,dataout):
    print >> sys.stderr, 'Creating libsvm file %s (model=%s, depth=%d, datafiles=%s)...' % (repr(dataout), repr(model),depth,datafiles)
    print >> sys.stderr, stats()
    outputs = [model.layers[depth].out]
    func = theano.function([model.inp],outputs)
    f = myopen(datafiles[0],'r')
    instances = numpy.asarray(cPickle.load(f),dtype=theano.config.floatX)
    f.close()
    f = myopen(datafiles[1],'r')
    labels = numpy.asarray(cPickle.load(f),dtype = 'int64')
    f.close()
    f = open(dataout,'w')
    for i in range(globalstate.NB_MAX_TRAINING_EXAMPLES_SVM/globalstate.BATCH_CREATION_LIBSVM):
        textr = ''
        rep = func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1),:])[0]
        for l in range(rep.shape[0]):
            textr += '%s '%labels[globalstate.BATCH_CREATION_LIBSVM*i+l]
            idx = rep[l,:].nonzero()[0]
            for j,v in zip(idx,rep[l,idx]):
                textr += '%s:%s '%(j,v)
            textr += '\n'
        f.write(textr)
    del instances,labels
    f.close()
    print >> sys.stderr, "...done creating libsvm files"
    print >> sys.stderr, stats()
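Each line written above is in sparse "label index:value ..." form: the instance's label, followed by one index:value pair per nonzero component of its hidden representation. A hedged sketch of how one such line is assembled for a single instance (the label and representation values are made up):

label = 3
rep_row = [0.0, 0.7, 0.0, 1.2]   # illustrative hidden representation

line = '%s ' % label
for j, v in enumerate(rep_row):
    if v != 0:
        line += '%s:%s ' % (j, v)
line += '\n'
# line is now "3 1:0.7 3:1.2 \n"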
def generate_context_vectors():
    """
    Generate the (random) context vectors.
    """

    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap

    if HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "gaussian":
        context_vectors = [numpy.random.normal(size=(wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"])) for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"]))]
    elif HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "ternary":
        NONZEROS = int(HYPERPARAMETERS["TERNARY_NON_ZERO_PERCENT"] * HYPERPARAMETERS["REPRESENTATION_SIZE"] + 0.5)
    
        logging.info("Generating %d nonzeros per %d-length random context vector" % (NONZEROS, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
    
        # Generate one set of context vectors per list in HYPERPARAMETERS["CONTEXT_TYPES"]
        context_vectors = []
        for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"])):
            logging.info("Generated %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
            logging.info(stats())
            thiscontext = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
            for j in range(wordmap.len):
                idxs = range(HYPERPARAMETERS["REPRESENTATION_SIZE"])
                random.shuffle(idxs)
                for k in idxs[:NONZEROS]:
                    thiscontext[j][k] = random.choice([-1, +1])
    #            print thiscontext[j]
            context_vectors.append(thiscontext)
    else:
        assert 0
    
    logging.info("Done generating %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
    logging.info(stats())
    return context_vectors
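In the "ternary" branch, each word gets a context vector with exactly NONZEROS entries set to -1 or +1 at random positions and zeros everywhere else. A minimal sketch of generating a single such row (standalone numpy/random; the sizes are illustrative, not the actual hyperparameter values):

import random
import numpy

REPRESENTATION_SIZE = 10   # illustrative
NONZEROS = 3               # illustrative

row = numpy.zeros(REPRESENTATION_SIZE)
idxs = list(range(REPRESENTATION_SIZE))
random.shuffle(idxs)
for k in idxs[:NONZEROS]:
    row[k] = random.choice([-1, +1])
# e.g. row -> array([ 0.,  1.,  0.,  0., -1.,  0.,  0.,  0.,  1.,  0.])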
Example #7
def main(invideofilename, facefilename, outvideofilename):
    faces = Faces("")
    faces.__setstate__(common.json.loadfile(facefilename))

    dir = tempfile.mkdtemp()
    try:
        for i, f, totframes in common.video.frames(invideofilename, maxframes=len(faces.frames)):
            outf = os.path.join(dir, "out%05d.jpg" % i)
            print >> sys.stderr, "Processing %s to %s, image %s" % (f, outf, common.str.percent(i+1, totframes))
            print >> sys.stderr, stats()

            draw_faces(faces.frames[i], f, outf)

        # I learned this command from here: http://electron.mit.edu/~gsteele/ffmpeg/
        cmd = "ffmpeg -y -r 30 -b 10000k -i %s %s" % (os.path.join(dir, 'out%05d.jpg'), outvideofilename)
        print >> sys.stderr, "Stitching video together as test1800.mp4"
        print >> sys.stderr, cmd
#        import time
#        time.sleep(30)
        common.misc.runcmd(cmd)
        print >> sys.stderr, stats()

    finally:
        print >> sys.stderr, "Removing dir %s" % dir
        shutil.rmtree(dir)
def createlibsvmfile(model,datafiles,dataout):
    print >> sys.stderr, 'Creating libsvm file %s (model=%s, datafiles=%s)...' % (repr(dataout), repr(model),datafiles)
    print >> sys.stderr, stats()

    x = T.dmatrix()
    params = [T.dmatrix(), T.dmatrix(), T.dvector(), T.dvector()]
    model.x = x
    model.W, model.W_prime, model.b, model.b_prime = params
    model.params = [model.W, model.W_prime, model.b, model.b_prime]

    outputs = [model.get_hidden_values(model.x)]
    func = theano.function([model.x] + params,outputs)

#    print >> sys.stderr, 'REMOVEME: about to read'
#    print >> sys.stderr, stats()

    f = myopen(datafiles[0],'r')
    instances = numpy.asarray(cPickle.load(f),dtype=theano.config.floatX)
    f.close()
    f = myopen(datafiles[1],'r')
    labels = numpy.asarray(cPickle.load(f),dtype = 'int64')
    f.close()
    f = myopen(dataout,'w')

#    print >> sys.stderr, 'REMOVEME: about to iterate'
#    print >> sys.stderr, stats()

#    params = [model.Wvalue, model.W_primevalue, model.bvalue, model.b_primevalue]
    for i in range(globalstate.NB_MAX_TRAINING_EXAMPLES_SVM/globalstate.BATCH_CREATION_LIBSVM):
#        print >> sys.stderr, 'REMOVEME: about to do %d' % i
#        print >> sys.stderr, stats()
        textr = ''

        assert globalstate.BATCH_CREATION_LIBSVM == 1       # Don't want to select indices from more than one example
        x = instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1),:]
        nonzeros = frozenset(x.nonzero()[1])
#        print >> sys.stderr, nonzeros
#        print >> sys.stderr, len(nonzeros)

        indices = list(nonzeros)
#        # TODO: Don't duplicate this code, which also appears about one hundred lines down.
#        x = x[:,indices]
#        params = [model.Wvalue[indices], model.W_primevalue[:,indices], model.bvalue, model.b_primevalue[indices]]
#        rep = func(x, *params)[0]

        rep = func(x[:,indices], model.Wvalue[indices], model.W_primevalue[:,indices], model.bvalue, model.b_primevalue[indices])[0]

        for l in range(rep.shape[0]):
            textr += '%s '%labels[globalstate.BATCH_CREATION_LIBSVM*i+l]
            idx = rep[l,:].nonzero()[0]
            for j,v in zip(idx,rep[l,idx]):
                textr += '%s:%s '%(j,v)
            textr += '\n'
        f.write(textr)
    del instances,labels
    f.close()
    print >> sys.stderr, "...done creating libsvm files"
    print >> sys.stderr, stats()
def targetmap(name=""):
    global _targetmap
    if name not in _targetmap:
        f = _targetmap_filename(name=name)
        print >> sys.stderr, "Reading target map from %s..." % f
        print >> sys.stderr, stats()
        _targetmap[name] = cPickle.load(myopen(f))
        print >> sys.stderr, "...done reading target map from %s" % f
        print >> sys.stderr, stats()
    return _targetmap[name]
Example #10
def svm_validation(err, reconstruction_error, epoch, model, depth, ACT,LR,NOISE_LVL,BATCHSIZE,train,datatrain,datatrainsave,datatest,datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST,RULE):
    """
    Perform full SVM validation.
    """
    global TRAINFUNC

    print >> sys.stderr, "Validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)..." % (err, epoch, model, depth, ACT,LR,NOISE_LVL,BATCHSIZE,train,datatrain,datatrainsave,datatest,datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()

    # Call with noiselevel = None before running the SVM.
    # No noise because we want the exact representation for each instance.
    rebuildunsup(model,depth,ACT,LR,None,BATCHSIZE,train,RULE)

    createlibsvmfile(model,depth,datatrain,datatrainsave)
    createlibsvmfile(model,depth,datatest,datatestsave)

    for trainsize in VALIDATION_TRAININGSIZE:
        print trainsize
        print VALIDATION_RUNS_FOR_EACH_TRAININGSIZE
        C,testerr,testerrdev,trainerr,trainerrdev,testerrnew,testerrnewdev,trainerrnew,trainerrnewdev =\
                                            svm_validation_for_one_trainsize(trainsize,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE[repr(trainsize)],datatrainsave,datatestsave,PATH_SAVE)
        err[trainsize].update({epoch:(C,testerr,testerrdev,trainerr,trainerrdev,testerrnew,testerrnewdev,trainerrnew,trainerrnewdev)})


    if epoch != 0:
        f = myopen(PATH_DATA + NAME_DATATEST +'_1.pkl.gz','r')
        train.container.value[:] = numpy.asarray(cPickle.load(f),dtype=theano.config.floatX)
        f.close()

    # Now, restore TRAINFUNC with the original NOISE_LVL
    rebuildunsup(model,depth,ACT,LR,NOISE_LVL,BATCHSIZE,train,RULE)
    reconstruction_error.update({epoch:TESTFUNC()})

    print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / reconstruction error (is this on test or train?): ' % (depth+1, epoch),reconstruction_error[epoch]
    for trainsize in VALIDATION_TRAININGSIZE:
        print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / trainsize %d / svm error' % (depth+1, epoch, trainsize),err[trainsize][epoch]
    print >> sys.stderr, stats()

    if epoch != 0:
        f = open('depth%serr.pkl'%depth,'wb')
        cPickle.dump(reconstruction_error,f,-1)
        for trainsize in VALIDATION_TRAININGSIZE:
            cPickle.dump(err[trainsize],f,-1)
        f.close()
        modeldir = os.path.join(PATH_SAVE, 'depth%spre%s' % (depth+1,epoch))
        if not os.path.isdir(modeldir):
            os.mkdir(modeldir)
        model.save(modeldir)
        if RULE == 5:
            f = open(modeldir + '/auxsigma.pkl','wb')
            cPickle.dump(model.auxsigma.value,f,-1)
            f.close()

    print >> sys.stderr, "...done validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)" % (err, epoch, model, depth, ACT,LR,NOISE_LVL,BATCHSIZE,train,datatrain,datatrainsave,datatest,datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE)
    print >> sys.stderr, stats()
def diagnostics(cnt, embeddings):
    logging.info(stats())
    vocab_size = embeddings.shape[0]
    idxs = range(vocab_size)
    random.shuffle(idxs)
    idxs = idxs[:100]

    embeddings_debug(embeddings[idxs], cnt, "rand 100 words")
    embeddings_debug(embeddings[:100], cnt, "top  100 words")
    embeddings_debug(embeddings[vocab_size/2-50:vocab_size/2+50], cnt, "mid  100 words")
    embeddings_debug(embeddings[-100:], cnt, "last 100 words")
    logging.info(stats())
def diagnostics(cnt, model):
    logging.info(stats())
    idxs = range(model.parameters.vocab_size)
    random.shuffle(idxs)
    idxs = idxs[:100]

    embeddings_debug(model.parameters.embeddings[idxs], cnt, "rand 100 words, model %s" % model.modelname)
    embeddings_debug(model.parameters.embeddings[:100], cnt, "top  100 words, model %s" % model.modelname)
    embeddings_debug(model.parameters.embeddings[model.parameters.vocab_size/2-50:model.parameters.vocab_size/2+50], cnt, "mid  100 words, model %s" % model.modelname)
    embeddings_debug(model.parameters.embeddings[-100:], cnt, "last 100 words, model %s" % model.modelname)
    weights_debug(model.parameters.hidden_weights.value, cnt, "hidden weights, model %s" % model.modelname)
    weights_debug(model.parameters.output_weights.value, cnt, "output weights, model %s" % model.modelname)
    logging.info(stats())
Example #13
def validate(cnt):
    import math
    logranks = []
    logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt)
    logging.info(stats())
    i = 0
    for (i, ve) in enumerate(examples.get_validation_example()):
#        logging.info([wordmap.str(id) for id in ve])
        logranks.append(math.log(m.validate(ve)))
        if (i+1) % 10 == 0:
            logging.info("Training step %d, validating example %d, mean(logrank) = %.2f, stddev(logrank) = %.2f" % (cnt, i+1, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks))))
            logging.info(stats())
    logging.info("FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1))
    logging.info(stats())
Example #14
def onlineproject(x, dimensions, seed, randomization_type):
    # Online (low-memory) random projection
    
    newx = numpy.zeros((x.shape[0], dimensions))

    nonzeros = x.nonzero()      # (list of rows, list of cols) of all nonzeros
    
    # (col, row) of all nonzeros
    # We reorder like this so that we can group all columns together, and look up the randomrow for each column feature only once.
    nonzero_colrow = [(nonzeros[1][l], nonzeros[0][l]) for l in range(len(nonzeros[0]))]
    nonzero_colrow.sort()
    nonzero_colrow.reverse()
    
    randrow_key = None
    randrow_values = None
    randrows_computed = 0
    for l, (col, row) in enumerate(nonzero_colrow):
        if randrow_key != col:
            randrow_key = col
            randrow_values = pyrandomprojection.randomrow(key=col, dimensions=dimensions, RANDOMIZATION_TYPE=randomization_type, RANDOM_SEED=seed)

            randrows_computed += 1
            if randrows_computed % 500 == 0:
                print >> sys.stderr, "Retrieved %s random rows thus far, done with %s of nonzeroes on %s..." % (percent(randrows_computed, x.shape[1]), percent(l+1, len(nonzero_colrow)), f)
                print >> sys.stderr, stats()
        newrow = x[row,col] * randrow_values
        assert newx[row].shape == newrow.shape
        newx[row] += newrow
#        if (l+1) % 10000 == 0:
#            print >> sys.stderr, "Done with %s of nonzeroes on %s..." % (percent(l+1, len(nonzero_colrow)), f)
#            print >> sys.stderr, stats()
    return newx
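The loop above computes the projection x . R incrementally, generating one random row per distinct nonzero column of x, so only rows for columns that actually occur are ever materialized. A hedged sketch of the equivalence, with pyrandomprojection.randomrow replaced by the rows of an explicit seeded matrix (this illustrates the idea, not the project's API):

import numpy

x = numpy.array([[0.0, 2.0, 0.0],
                 [1.0, 0.0, 3.0]])
dimensions = 4
numpy.random.seed(0)
R = numpy.random.normal(size=(x.shape[1], dimensions))

newx = numpy.zeros((x.shape[0], dimensions))
rows, cols = x.nonzero()
for row, col in zip(rows, cols):
    # one random row per nonzero column, weighted by the feature value
    newx[row] += x[row, col] * R[col]

assert numpy.allclose(newx, numpy.dot(x, R))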
def readwords(filename):
    print >> sys.stderr, "Processing %s" % filename
    i = 0
    for line in open(filename):
        i += 1
        if i % 100000 == 0:
            print >> sys.stderr, "Read line %d of %s..." % (i, filename)
            print >> sys.stderr, stats()
        for w in string.split(line):
            yield w
Example #16
def batchproject(x, dimensions, seed, randomization_type):
    # Batch (cached, high-memory) random projection
    global randommatrix

    if randommatrix is None:
        print >> sys.stderr, "Creating random matrix of shape %s" % `(x.shape[1], dimensions)`
        print >> sys.stderr, stats()
        numpy.random.seed(seed)
        assert randomization_type == "gaussian"
        randommatrix = numpy.random.normal(size=(x.shape[1], dimensions))
    else:
        assert randommatrix.shape == (x.shape[1], dimensions)       # We assume the projection matrix won't change

    print >> sys.stderr, "Multiplying x by random matrix..."
    print >> sys.stderr, stats()
    newx = numpy.dot(x, randommatrix)
    print >> sys.stderr, "...done multiplying x by random matrix"
    print >> sys.stderr, stats()
    
    return newx
def load(rundir, newkeystr):
    """
    Read the directory and load the translation_model, the training count, the training epoch, and the training state.
    """
    global _lastfilename

    filename = os.path.join(rundir, "newkeystr.txt")
    assert newkeystr == myopen(filename).read()

    (cnt, lastcnt, epoch, filename) = common.json.loadfile(os.path.join(rundir, "trainstate.json"))

#    filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr))
    print >> sys.stderr, ("Reading translation_model from %s..." % filename)
    print >> sys.stderr, (stats())
    translation_model = cPickle.load(myopen(filename))
    print >> sys.stderr, ("...done reading translation_model from %s" % filename)
    print >> sys.stderr, (stats())
    _lastfilename = filename

    return (translation_model, cnt, lastcnt, epoch)
def load(rundir, newkeystr):
    """
    Read the directory and load the model, the training count, the training epoch, and the training state.
    """
    global _lastfilename

    filename = os.path.join(rundir, "newkeystr.txt")
    assert newkeystr == myopen(filename).read()

    filename = os.path.join(rundir, "trainstate.pkl")
    (trainstate, cnt, epoch) = cPickle.load(myopen(filename))

    filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr))
    print >> sys.stderr, ("Reading model from %s..." % filename)
    print >> sys.stderr, (stats())
    model = cPickle.load(myopen(filename))
    print >> sys.stderr, ("...done reading model from %s" % filename)
    print >> sys.stderr, (stats())
    _lastfilename = filename

    return (model, cnt, epoch, trainstate)
Example #19
def __setstate__(self, state):
    """
    @warning: We ignore the filename.  If we wanted
    to be really fastidious, we would assume that
    HYPERPARAMETERS["TRAIN_SENTENCES"] might change.  The only
    problem is that if we change filesystems, the filename
    might change just because the base file is in a different
    path. So we issue a warning if the filename is different from
    the one given to __setstate__.
    """
    filename, count = state
    print >> sys.stderr, ("__setstate__(%s)..." % repr(state))
    print >> sys.stderr, (stats())
    iter = self.__iter__()
    while count != self.count:
        #            print count, self.count
        iter.next()
    if self.filename != filename:
        assert self.filename == HYPERPARAMETERS["TRAIN_SENTENCES"]
        print >> sys.stderr, ("self.filename %s != filename given to __setstate__ %s" % (self.filename, filename))
    print >> sys.stderr, ("...__setstate__(%s)" % repr(state))
    print >> sys.stderr, (stats())
Example #20
def main(videofilename):
    faces = Faces(videofilename)
    for i, f, totframes in common.video.frames(videofilename):
#    for i, f, totframes in common.video.frames(videofilename, maxframes=1000):
        print >> sys.stderr, "Processing %s, image %s" % (f, common.str.percent(i+1, totframes))
        print >> sys.stderr, stats()
        image = cvLoadImage(f)
        faces.set_dimensions(image.width, image.height)
        faces.add_frame(i, detect_faces(image))

        if i % 100 == 0 and i != 0:
            print >> sys.stderr, common.json.dumps(faces.__getstate__())
    print common.json.dumps(faces.__getstate__())
def save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr):
    global _lastfilename

    filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr))
    logging.info("Writing translation_model to %s..." % filename)
    logging.info(stats())
    cPickle.dump(translation_model, myopen(filename, "wb"), protocol=-1)
    logging.info("...done writing translation_model to %s" % filename)
    logging.info(stats())

#    if _lastfilename is not None:
#        logging.info("Removing old translation_model %s..." % _lastfilename)
#        try:
#            os.remove(_lastfilename)
#            logging.info("...removed %s" % _lastfilename)
#        except:
#            logging.info("Could NOT remove %s" % _lastfilename)
    _lastfilename = filename

    common.json.dumpfile((cnt, lastcnt, epoch, filename), os.path.join(rundir, "trainstate.json"))

    filename = os.path.join(rundir, "newkeystr.txt")
    myopen(filename, "wt").write(newkeystr)
Example #22
def main(invideofilename, facechainfilename, outvideofilename):
    faces = FaceChains()
    faces.__setstate__(common.json.loadfile(facechainfilename))

    dir = tempfile.mkdtemp()
    try:
        from collections import defaultdict
        frames = defaultdict(list)
        maxframe = 0
        for chain in faces.chains:
#            print chain
            color = ["red", "yellow", "green", "blue", "purple", "orange"][chain.__hash__() % 6]
            for i, face in chain.data:
                frames[i].append((face, color))
                if i > maxframe: maxframe = i
#        print >> sys.stderr, frames


        for i, f, totframes in common.video.frames(invideofilename, maxframes=maxframe):
            outf = os.path.join(dir, "out%05d.jpg" % i)
            print >> sys.stderr, "Processing %s to %s, image %s" % (f, outf, common.str.percent(i+1, totframes))
            print >> sys.stderr, stats()

            draw_faces(frames[i], f, outf)

        # I learned this command from here: http://electron.mit.edu/~gsteele/ffmpeg/
        cmd = "ffmpeg -y -r 30 -b 10000k -i %s %s" % (os.path.join(dir, 'out%05d.jpg'), outvideofilename)
        print >> sys.stderr, "Stitching video together as test1800.mp4"
        print >> sys.stderr, cmd
#        import time
#        time.sleep(30)
        common.misc.runcmd(cmd)
        print >> sys.stderr, stats()

    finally:
        print >> sys.stderr, "Removing dir %s" % dir
        shutil.rmtree(dir)
def save(model, cnt, epoch, trainstate, rundir, newkeystr):
    global _lastfilename

    filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr))
    logging.info("Writing model to %s..." % filename)
    logging.info(stats())
    cPickle.dump(model, myopen(filename, "wb"), protocol=-1)
    logging.info("...done writing model to %s" % filename)
    logging.info(stats())

    if _lastfilename is not None:
        logging.info("Removing old model %s..." % _lastfilename)
        try:
            os.remove(_lastfilename)
            logging.info("...removed %s" % _lastfilename)
        except:
            logging.info("Could NOT remove %s" % _lastfilename)
    _lastfilename = filename

    filename = os.path.join(rundir, "trainstate.pkl")
    cPickle.dump((trainstate, cnt, epoch), myopen(filename, "wb"), protocol=-1)

    filename = os.path.join(rundir, "newkeystr.txt")
    myopen(filename, "wt").write(newkeystr)
Example #24
def get_training_example():
    """
    Yield training examples indefinitely, looping over the training file epoch after epoch.
    """
    HYPERPARAMETERS = common.hyperparameters.read("attardi07_english_ptb")
    epoch = 0
    examples = 0
    while 1:
        epoch += 1
        sys.stderr.write("STARTING EPOCH #%d (%d examples)\n" % (epoch, examples))
        sys.stderr.write(stats() + "\n")
        for l in common.file.myopen(HYPERPARAMETERS["train examples file"]):
            if l == "\n": continue
            examples += 1
            yield _example_from_string(l)
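Because the generator above never terminates, a caller pulls examples from it rather than iterating to exhaustion. A minimal usage sketch (the training budget and the update step are illustrative placeholders):

examples_iter = get_training_example()
for cnt in xrange(1, 100000 + 1):      # illustrative number of updates
    example = examples_iter.next()     # Python 2 generator protocol
    # ... perform one training update on example ...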
Example #25
def rescal_rectifier_model(model,depth,PATH_DATA,NAME_DATA,NB_FILES,rule):
    print >> sys.stderr, "Rescaling of the rectifier model following the rule: %s"%rule
    print >> sys.stderr, stats()
    outputs = [model.layers[depth-1].out]
    func = theano.function([model.inp],outputs)
    max_value = numpy.zeros((1,model.n_hid[depth-1]))
    for filenb in xrange(1,NB_FILES + 1):
        f =open(PATH_DATA + NAME_DATA +'_%s.pkl'%filenb,'r')
        instances = numpy.asarray(cPickle.load(f),dtype=theano.config.floatX)
        f.close()
        for i in range(instances.shape[0]/globalstate.BATCH_CREATION_LIBSVM):
            rep = numpy.abs(func(instances[globalstate.BATCH_CREATION_LIBSVM*i:globalstate.BATCH_CREATION_LIBSVM*(i+1),:])[0])
            max_value = numpy.asarray([numpy.concatenate([max_value,rep]).max(0)])
    del instances
    if rule == 2:
        model.layers[depth-1].W.container.value[:] =  \
		numpy.asarray((model.layers[depth-1].W.value.T / max_value).T,dtype=theano.config.floatX)
        model.layers[depth-1].b.container.value[:] =  \
		numpy.asarray((model.layers[depth-1].b.value / max_value[0,:]),dtype=theano.config.floatX)
    if rule == 1:
        model.layers[depth-1].W.container.value[:] =  model.layers[depth-1].W.value / max_value.max()
        model.layers[depth-1].b.container.value[:] =  model.layers[depth-1].b.value / max_value.max()
    print >> sys.stderr, "...done rescaling parameters"
    print >> sys.stderr, stats()
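Rule 2 rescales each hidden unit separately by its maximum absolute activation over the data, dividing both that unit's incoming weights and its bias; rule 1 divides all weights and biases by the single global maximum. A small numeric sketch of the two rules (made-up numbers; the model's actual weight layout may differ, this only shows the arithmetic):

import numpy

W = numpy.array([[2.0,  4.0],          # illustrative weights: 3 inputs x 2 hidden units
                 [6.0,  8.0],
                 [1.0, 10.0]])
b = numpy.array([1.0, 2.0])
max_value = numpy.array([2.0, 4.0])    # per-unit max |activation| observed on the data

# rule 2: per-unit rescaling (each column j divided by max_value[j])
W_rule2 = W / max_value
b_rule2 = b / max_value

# rule 1: global rescaling
W_rule1 = W / max_value.max()
b_rule1 = b / max_value.max()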
def read(f):
    """
    Generator for reading a wikiprep XML file from a file object.
    """
    print >> sys.stderr, "Reading %s..." % f
    print >> sys.stderr, stats()
    doc = {}
    cnt = 0
    for event, elem in cElementTree.iterparse(f):
        if elem.tag == "title":
            doc["title"] = ("".join(elem.itertext()))
        elif elem.tag == "text":
            doc["text"] = ("".join(elem.itertext()))
        elif elem.tag == "link":
            # Skip internal links
            if elem.get("url") is None: continue

            if "external links" not in doc: doc["external links"] = []
            doc["external links"].append([elem.get("url"), ("".join(elem.itertext()))])
        elif elem.tag == "links":
            doc["links"] = [int(i) for i in string.split("".join(elem.itertext()))]
        elif elem.tag == "categories":
            doc["categories"] = [int(i) for i in string.split("".join(elem.itertext()))]
        elif elem.tag == "page":
            doc["_id"] = int(elem.get("id"))
            cnt += 1
            yield doc
            doc = {}

            # Free the memory of the building tree
            elem.clear()
            if cnt % 1000 == 0:
                print >> sys.stderr, "Read %d articles from %s" % (cnt, f)
                print >> sys.stderr, stats()
    print >> sys.stderr, "...done reading %s" % f
    print >> sys.stderr, stats()
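Since read() is a generator, a caller streams articles one dict at a time instead of loading the whole dump into memory. A minimal usage sketch (the filename is an illustrative placeholder):

f = open("enwiki.hgw.xml")             # illustrative wikiprep dump filename
for doc in read(f):
    # each doc is a dict with keys such as "_id", "title", and "text"
    print doc.get("title")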
Example #27
def runjob(model, h, datafile, kfold, job):
    X, Y = cPickle.load(open(datafile))

    # TODO: Is it possible to get around doing this?
    # e.g. determine based upon "model" ?
    # At the very least, this should be a command-line param
    from locals import CONVERT_TO_DENSE
    if CONVERT_TO_DENSE:
        X = X.todense()

    print >> sys.stderr, "X = %s, Y = %s" % (X.shape, Y.shape)
    print >> sys.stderr, stats()

    try:
        train(model, h, X, Y, job, kfold)
        assert job.result is not None
        print "JOB", job
        sys.stdout.flush()
    except Exception, e:
        print >> sys.stderr, "Error %s %s on %s" % (type(e), e, (model, h))
def onlineproject(x, dimensions, seed, randomization_type):
    # Online (low-memory) random projection

    newx = numpy.zeros((x.shape[0], dimensions))

    nonzeros = x.nonzero()  # (list of rows, list of cols) of all nonzeros

    # (col, row) of all nonzeros
    # We reorder like this so that we can group all columns together, and look up the randomrow for each column feature only once.
    nonzero_colrow = [(nonzeros[1][l], nonzeros[0][l])
                      for l in range(len(nonzeros[0]))]
    nonzero_colrow.sort()
    nonzero_colrow.reverse()

    randrow_key = None
    randrow_values = None
    randrows_computed = 0
    for l, (col, row) in enumerate(nonzero_colrow):
        if randrow_key != col:
            randrow_key = col
            randrow_values = pyrandomprojection.randomrow(
                key=col,
                dimensions=dimensions,
                RANDOMIZATION_TYPE=randomization_type,
                RANDOM_SEED=seed)

            randrows_computed += 1
            if randrows_computed % 500 == 0:
                print >> sys.stderr, "Retrieved %s random rows thus far, done with %s of nonzeroes on %s..." % (
                    percent(randrows_computed, x.shape[1]),
                    percent(l + 1, len(nonzero_colrow)), f)
                print >> sys.stderr, stats()
        newrow = x[row, col] * randrow_values
        assert newx[row].shape == newrow.shape
        newx[row] += newrow


#        if (l+1) % 10000 == 0:
#            print >> sys.stderr, "Done with %s of nonzeroes on %s..." % (percent(l+1, len(nonzero_colrow)), f)
#            print >> sys.stderr, stats()
    return newx
Example #30
def draw_faces(faces, infilename, outfilename):
    pil_img = Image.open(infilename)

    # Draw red boxes around faces
    if faces:
        draw = ImageDraw.Draw(pil_img)
        for face in faces:
            face.draw(draw)
        del draw

#    # REMOVEME: Scale image to height of 320
#    newwidth = 320
#    newheight = newwidth * pil_img.size[1] / pil_img.size[0]
##    print pil_img.size
##    print newwidth, newheight
#    pil_img= pil_img.resize((newwidth, newheight), Image.ANTIALIAS) 

    # Save the annotated image to outfilename as JPEG
    print >> sys.stderr, "Writing to %s" % outfilename
    print >> sys.stderr, stats()
    pil_img.save(outfilename, "JPEG")
Example #31
def draw_faces(faces, infilename, outfilename):
    pil_img = Image.open(infilename)

    # Draw red boxes around faces
    draw = ImageDraw.Draw(pil_img)
    for face, color in faces:
        #        print face, color
        face.draw(draw, color=color)
    del draw

    #    # REMOVEME: Scale image to height of 320
    #    newwidth = 320
    #    newheight = newwidth * pil_img.size[1] / pil_img.size[0]
    ##    print pil_img.size
    ##    print newwidth, newheight
    #    pil_img= pil_img.resize((newwidth, newheight), Image.ANTIALIAS)

    # Save the annotated image to outfilename as JPEG
    print >> sys.stderr, "Writing to %s" % outfilename
    print >> sys.stderr, stats()
    pil_img.save(outfilename, "JPEG")
def trainingsentences():
    """
    For each line (sentence) in the training data, transform it into a list of token IDs.
    """

    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    count = 0
    for l in myopen(filename):
        tokens = []
        for w in string.split(l):
            w = string.strip(w)
            assert wordmap.exists(w)     # Not exactly clear what to do
                                         # if the word isn't in the vocab.
            tokens.append(wordmap.id(w))
        yield tokens
        count += 1
        if count % 1000 == 0:
            logging.info("Read %d lines from training file %s..." % (count, filename))
            logging.info(stats())
Example #33
def trainingsentences():
    """
    For each line (sentence) in the training data, transform it into a list of token IDs.
    """

    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap
    filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
    count = 0
    for l in myopen(filename):
        tokens = []
        for w in string.split(l):
            w = string.strip(w)
            assert wordmap.exists(w)  # Not exactly clear what to do
            # if the word isn't in the vocab.
            tokens.append(wordmap.id(w))
        yield tokens
        count += 1
        if count % 1000 == 0:
            logging.info("Read %d lines from training file %s..." %
                         (count, filename))
            logging.info(stats())
Example #34
def batch_apply(f, x, batchsize=1024, verbose=True):
    """
    Slice x into batches of size batchsize, run f on each batch, and return the list of per-batch results.
    @warning: f should *NOT* return any indexes, because it only sees batch-local
    row numbers. (Any returned indexes would need to be offset by the current min.)
    """
    import sys
    from common.stats import stats

    ret = []
    min = 0
    max = batchsize
    while min < x.shape[0]:
        if max > x.shape[0]: max = x.shape[0]
        if verbose:
            print >> sys.stderr, "Running on %d:%d..." % (min, max)
            print >> sys.stderr, stats()

        tmpx = x[min:max]
        ret.append(f(tmpx))
        min += batchsize
        max += batchsize

    return ret
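A minimal usage sketch: applying a per-batch function and concatenating the per-batch results back together (numpy only; the array and batch size are illustrative):

import numpy

x = numpy.arange(10000, dtype="float64").reshape(2500, 4)
chunks = batch_apply(lambda batch: batch.sum(axis=1), x, batchsize=1024, verbose=False)
row_sums = numpy.concatenate(chunks)
assert row_sums.shape == (2500,)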
Example #36
    #    o = graph.validatefn(x, N.array([y]), w1, b1, w2, b2)
    #    (kl, softmax, argmax, presquashh) = o
    ##    print "new KL=%.3f, softmax=%s, argmax=%d" % (kl, softmax, argmax)
    #    print "new KL=%.3f, argmax=%d" % (kl, argmax)

    if cnt % HYPERPARAMETERS["examples per validation"] == 0:
        valacc, valstd = validate()
        sys.stderr.write(
            "After %d training examples, validation accuracy: %.2f%%, stddev: %.2f%% (former best=%.2f%% at %d)\n"
            % (cnt, valacc * 100, valstd * 100, best_validation_accuracy * 100,
               best_validation_at))
        if best_validation_accuracy < valacc:
            best_validation_accuracy = valacc
            best_validation_at = cnt
            sys.stderr.write("NEW BEST VALIDATION ACCURACY. Saving state.\n")
            state_save()
        elif cnt > 2 * best_validation_at and cnt >= HYPERPARAMETERS[
                "minimum training updates"]:
            sys.stderr.write(
                "Have not beaten best validation accuracy for a while. Terminating training...\n"
            )
            sys.stderr.write(stats() + "\n")
            break
    if cnt % 1000 == 0:
        sys.stderr.write(
            "After %d training examples, training accuracy (moving average): %.2f%%, stddev: %.2f%%\n"
            % (cnt, 100. * mvgavg_accuracy, 100. * math.sqrt(mvgavg_variance)))
        sys.stderr.write(stats() + "\n")

#graph.COMPILE_MODE.print_summary()
def svm_validation_for_one_trainsize(nbinputs, numruns, datatrainsave,
                                     datatestsave, PATH_SAVE):
    """
    Train an SVM on nbinputs training examples, for numruns runs.
    Choose the value of C using a linesearch to minimize the testerr.
    Return:
        C,testerr,testerrdev,trainerr,trainerrdev

    MAXSTEPS is the number of steps performed in the line search.
    STEPFACTOR is the initial step size.
    """
    MAXSTEPS = globalstate.SVM_MAXSTEPS
    STEPFACTOR = globalstate.SVM_STEPFACTOR
    INITIALC = globalstate.SVM_INITIALC

    print >> sys.stderr, 'Starting SVM validation for %s examples (numruns=%d, datatrainsave=%s, datatestsave=%s, PATH_SAVE=%s, MAXSTEPS=%d, STEPFACTOR=%f, INITIALC=%f)...' % (
        nbinputs, numruns, datatrainsave, datatestsave, PATH_SAVE, MAXSTEPS,
        STEPFACTOR, INITIALC)
    print >> sys.stderr, stats()

    Ccurrent = INITIALC
    Cstepfactor = STEPFACTOR
    Cnew = Ccurrent * Cstepfactor

    C_to_allstats = {}
    Cbest = None

    while len(C_to_allstats) < MAXSTEPS:
        if Ccurrent not in C_to_allstats:
            # Compute the validation statistics for the current C
            testerr, testerrdev, trainerr, trainerrdev = svm_validation_for_one_trainsize_and_one_C(
                Ccurrent, nbinputs, numruns, datatrainsave, datatestsave,
                PATH_SAVE)
            C_to_allstats[Ccurrent] = (testerr, testerrdev, trainerr,
                                       trainerrdev)
        if Cnew not in C_to_allstats:
            # Compute the validation statistics for the next C
            testerr, testerrdev, trainerr, trainerrdev = svm_validation_for_one_trainsize_and_one_C(
                Cnew, nbinputs, numruns, datatrainsave, datatestsave,
                PATH_SAVE)
            C_to_allstats[Cnew] = (testerr, testerrdev, trainerr, trainerrdev)
        # If Cnew has a lower test err than Ccurrent, then continue stepping in this direction
        if C_to_allstats[Cnew][0] < C_to_allstats[Ccurrent][0]:
            print >> sys.stderr, "\ttesterr[Cnew %f] = %f < testerr[Ccurrent %f] = %f" % (
                Cnew, C_to_allstats[Cnew][0], Ccurrent,
                C_to_allstats[Ccurrent][0])
            if Cbest is None or C_to_allstats[Cnew][0] < C_to_allstats[Cbest][
                    0]:
                Cbest = Cnew
                print >> sys.stderr, "\tNEW BEST: Cbest <= %f, testerr[Cbest] = %f" % (
                    Cbest, C_to_allstats[Cbest][0])
            Ccurrent = Cnew
            Cnew *= Cstepfactor
            print >> sys.stderr, "\tPROCEED: Cstepfactor remains %f, Ccurrent is now %f, Cnew is now %f" % (
                Cstepfactor, Ccurrent, Cnew)
        # Else, reverse the direction and reduce the step size by sqrt.
        else:
            print >> sys.stderr, "\ttesterr[Cnew %f] = %f > testerr[Ccurrent %f] = %f" % (
                Cnew, C_to_allstats[Cnew][0], Ccurrent,
                C_to_allstats[Ccurrent][0])
            if Cbest is None or C_to_allstats[Ccurrent][0] < C_to_allstats[
                    Cbest][0]:
                Cbest = Ccurrent
                print >> sys.stderr, "\tCbest <= %f, testerr[Cbest] = %f" % (
                    Cbest, C_to_allstats[Cbest][0])
            Cstepfactor = 1. / math.sqrt(Cstepfactor)
            Cnew = Ccurrent * Cstepfactor
            print >> sys.stderr, "\tREVERSE: Cstepfactor is now %f, Ccurrent remains %f, Cnew is now %f" % (
                Cstepfactor, Ccurrent, Cnew)

    allC = C_to_allstats.keys()
    allC.sort()
    for C in allC:
        print >> sys.stderr, "\ttesterr[C %f] = %f" % (C, C_to_allstats[C][0]),
        if C == Cbest:
            print >> sys.stderr, " *best* (testerr = %f, testerrdev = %f, trainerr = %f, trainerrdev = %f)" % C_to_allstats[
                C]
        else:
            print >> sys.stderr, ""
    print >> sys.stderr, '...done with SVM validation for %s examples (numruns=%d, datatrainsave=%s, datatestsave=%s)' % (
        nbinputs, numruns, datatrainsave, datatestsave)
    print >> sys.stderr, stats()

    return [Cbest] + list(C_to_allstats[Cbest])
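The while loop above is a coarse geometric line search over C: as long as stepping to Cnew lowers the test error it keeps multiplying by Cstepfactor; when a step makes things worse it reverses direction and shrinks the step via 1/sqrt(Cstepfactor). A hedged sketch of just that update rule on a toy one-dimensional objective (everything here is illustrative, not the SVM code):

import math

def toy_testerr(C):
    # illustrative convex objective with its minimum near C = 8
    return (math.log(C) - math.log(8.0)) ** 2

Ccurrent, Cstepfactor = 1.0, 10.0
Cnew = Ccurrent * Cstepfactor
seen = {}
for _ in range(10):                        # stand-in for MAXSTEPS
    seen.setdefault(Ccurrent, toy_testerr(Ccurrent))
    seen.setdefault(Cnew, toy_testerr(Cnew))
    if seen[Cnew] < seen[Ccurrent]:        # improvement: keep stepping this way
        Ccurrent, Cnew = Cnew, Cnew * Cstepfactor
    else:                                  # got worse: reverse and shrink the step
        Cstepfactor = 1. / math.sqrt(Cstepfactor)
        Cnew = Ccurrent * Cstepfactor
Cbest = min(seen, key=seen.get)            # C with the lowest toy test error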
    epoch = 0
    if epoch in EPOCHSTEST:
        svm_validation(err, epoch, model, train, datatrain, datatrainsave,
                       datatest, datatestsave, VALIDATION_TRAININGSIZE,
                       VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE,
                       PATH_DATA, NAME_DATATEST)
        channel.save()

    train_reconstruction_error_mvgavg = MovingAverage()
    for epoch in xrange(1, NEPOCHS + 1):
        time1 = time.time()
        state.currentepoch = epoch
        for filenb in xrange(1, NB_FILES + 1):
            print >> sys.stderr, "\t\tAbout to read file %s..." % percent(
                filenb, NB_FILES)
            print >> sys.stderr, "\t\t", stats()
            #                initial_file_time = time.time()
            f = open(PATH_DATA + NAME_DATA + '_%s.pkl' % filenb, 'r')
            object = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
            print >> sys.stderr, "\t\t...read file %s" % percent(
                filenb, NB_FILES)
            print >> sys.stderr, "\t\t", stats()
            # The last training file is not of the same shape as the other training files.
            # So, to avoid a GPU memory error, we want to make sure it is the same size.
            # In which case, we pad the matrix but keep track of how many n (instances) there actually are.
            # TODO: Also want to pad trainl
            if object.shape == normalshape:
                train.container.value[:] = object
                currentn = normalshape[0]
                del object
            else:
Example #39
def train(model, h, X, Y, job, kfold):
    # TODO: These should be passed in as command-line parameters
    FOLDS = 5
    #FOLDS = 3
    EVALUATION_MEASURE = sklearn.metrics.f1_score

    if kfold:
        kf = KFold(X.shape[0], FOLDS, indices=True)
        #if kfold: kf = LeaveOneOut(X.shape[0], indices=True)
    else:
        assert 0

    start = time.clock()
    print >> sys.stderr, "trying %s %s" % (model, h)
    errs = []
    if kfold:
        for i, (train, test) in enumerate(kf):
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[
                test]

            clf = model(**h)
            # TODO: What we should do is have a multiclass command-line parameter,
            # in which case we do the following:
            clf = OneVsRestClassifier(clf)

            clf.fit(X_train, y_train)

            # TODO: Run evals on train, for debugging?

            #            for j in range(y_test.shape[0]):
            #                probs = []
            #                for k, est in enumerate(clf.estimators_):
            #                    y_test_predict = est.predict_proba(X_test[j])
            #                    probs.append((y_test_predict[0][1], k))
            #                print "ACC", y_test[j][sorted(probs)[-1][1]]
            #                sys.stdout.flush()

            y_test_predict = clf.predict(X_test)
            errs.append(EVALUATION_MEASURE(y_test, y_test_predict))
            print >> sys.stderr, "INTERMEDIATE kfold=%d/%d" % (
                i + 1, FOLDS), errs[-1], modelstr(clf)
            print >> sys.stderr, stats()

#            if errs[-1] < TASKMIN and i+1 < FOLDS:
#                if FORCE:
#                    print >> sys.stderr, "FORCE=True, otherwise we'd abort becase err %f < %d taskmin %f" % (errs[-1], TASKMIN)
#                else:
#                    print >> sys.stderr, "ABORTING. err %f < %d taskmin %f" % (errs[-1], TASKMIN)
#                    job.result = False
#                    return
    else:
        assert 0

    end = time.clock()
    difftime = end - start
    if kfold:
        job.result = {
            "mean": numpy.mean(errs),
            "std": numpy.std(errs),
            "95conf": numpy.mean(errs) - 1.96 * numpy.std(errs),
            "min": numpy.min(errs),
            "folds": errs,
            "time": difftime
        }
        print >> sys.stderr, "kfold=%d" % FOLDS, "mean", numpy.mean(
            errs), "std", numpy.std(errs), "95conf", numpy.mean(
                errs) - 1.96 * numpy.std(errs), "min", numpy.min(
                    errs), modelstr(clf)
        print "kfold=%d" % FOLDS, "mean", numpy.mean(errs), "std", numpy.std(
            errs), "95conf", numpy.mean(errs) - 1.96 * numpy.std(
                errs), "min", numpy.min(errs), modelstr(clf)
    else:
        assert 0


#        job.result = {"mean": numpy.mean(errs), "title": difftime}
#        print num, numpy.mean(errs), modelstr(clf)
    sys.stdout.flush()
    print >> sys.stderr, stats()
Example #40
def svm_validation(err, reconstruction_error, epoch, model, depth, ACT, LR,
                   NOISE_LVL, BATCHSIZE, train, datatrain, datatrainsave,
                   datatest, datatestsave, VALIDATION_TRAININGSIZE,
                   VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA,
                   NAME_DATATEST, RULE):
    """
    Perform full SVM validation.
    """
    global TRAINFUNC

    print >> sys.stderr, "Validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)..." % (
        err, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train,
        datatrain, datatrainsave, datatest, datatestsave,
        VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE,
        PATH_SAVE)
    print >> sys.stderr, stats()

    # Call with noiselevel = None before running the SVM.
    # No noise because we want the exact representation for each instance.
    rebuildunsup(model, depth, ACT, LR, None, BATCHSIZE, train, RULE)

    createlibsvmfile(model, depth, datatrain, datatrainsave)
    createlibsvmfile(model, depth, datatest, datatestsave)

    for trainsize in VALIDATION_TRAININGSIZE:
        print trainsize
        print VALIDATION_RUNS_FOR_EACH_TRAININGSIZE
        C,testerr,testerrdev,trainerr,trainerrdev,testerrnew,testerrnewdev,trainerrnew,trainerrnewdev =\
                                            svm_validation_for_one_trainsize(trainsize,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE[repr(trainsize)],datatrainsave,datatestsave,PATH_SAVE)
        err[trainsize].update({
            epoch: (C, testerr, testerrdev, trainerr, trainerrdev, testerrnew,
                    testerrnewdev, trainerrnew, trainerrnewdev)
        })

    if epoch != 0:
        f = myopen(PATH_DATA + NAME_DATATEST + '_1.pkl.gz', 'r')
        train.container.value[:] = numpy.asarray(cPickle.load(f),
                                                 dtype=theano.config.floatX)
        f.close()

    # Now, restore TRAINFUNC with the original NOISE_LVL
    rebuildunsup(model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train, RULE)
    reconstruction_error.update({epoch: TESTFUNC()})

    print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / reconstruction error (is this on test or train?): ' % (
        depth + 1, epoch), reconstruction_error[epoch]
    for trainsize in VALIDATION_TRAININGSIZE:
        print >> sys.stderr, 'VALIDATION: depth %d / epoch %d / trainsize %d / svm error' % (
            depth + 1, epoch, trainsize), err[trainsize][epoch]
    print >> sys.stderr, stats()

    if epoch != 0:
        f = open('depth%serr.pkl' % depth, 'wb')
        cPickle.dump(reconstruction_error, f, -1)
        for trainsize in VALIDATION_TRAININGSIZE:
            cPickle.dump(err[trainsize], f, -1)
        f.close()
        modeldir = os.path.join(PATH_SAVE, 'depth%spre%s' % (depth + 1, epoch))
        if not os.path.isdir(modeldir):
            os.mkdir(modeldir)
        model.save(modeldir)
        if RULE == 5:
            f = open(modeldir + '/auxsigma.pkl', 'wb')
            cPickle.dump(model.auxsigma.value, f, -1)
            f.close()

    print >> sys.stderr, "...done validating (err=%s,epoch=%s,model=%s,depth=%s,ACT=%s,LR=%s,NOISE_LVL=%s,BATCHSIZE=%s,train=%s,datatrain=%s,datatrainsave=%s,datatest=%s,datatestsave=%s,VALIDATION_TRAININGSIZE=%s,VALIDATION_RUNS_FOR_EACH_TRAININGSIZE=%s,PATH_SAVE=%s)" % (
        err, epoch, model, depth, ACT, LR, NOISE_LVL, BATCHSIZE, train,
        datatrain, datatrainsave, datatest, datatestsave,
        VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE,
        PATH_SAVE)
    print >> sys.stderr, stats()
    i = 0
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for l in sys.stdin:
        i += 1

        if string.strip(l) == "": continue

        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        if i % 10000 == 0:
            print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (
                i, writer.numDocs())
            print >> sys.stderr, stats()
#        if i > 100000: break

    print >> sys.stderr, "Indexed a total of %d lines from stdin (%d documents in index)" % (
        i, writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs(
    )
    print >> sys.stderr, stats()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs(
    )
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    print >> sys.stderr, stats()
    writer.close()
    print >> sys.stderr, "...done closing index of %d documents" % writer.numDocs(
    )
Example #42
def run(cmd):
    print >> sys.stderr, cmd
    print >> sys.stderr, stats()
    os.system(cmd)
    print >> sys.stderr, stats()