# NOTE(review): this excerpt starts mid-file -- the statements below reference a
# loop index `i` and the lists `docs`/`rows`, all defined before this chunk.

# Build one montage row: the i'th slice of documents, with zero-valued spacer
# strips inserted between adjacent documents.
row = docs[i * docGridWidth:(i + 1) * docGridWidth]
rowExt = []
for r in row:
    rowExt.append(r)
    # Spacer strip after each document; presumably gridScale*5 matches the
    # document image width -- TODO confirm against the document generator.
    rowExt.append(
        numpy.zeros((gridSpace, gridScale * 5), dtype=numpy.float_))
rowExt = rowExt[:-1]  # drop the trailing spacer (no gap after the last doc)
rows.append(numpy.vstack(rowExt))

# Join all rows into one image, again with spacer strips between them.
stack = []
for r in rows:
    stack.append(r)
    stack.append(numpy.zeros((r.shape[0], gridSpace), dtype=numpy.float_))
stack = stack[:-1]  # drop the trailing spacer
docImage = numpy.hstack(stack).T * 255.0  # scale up for saving as an image

# Save the document montage for visual confirmation.
img = cvarray.array2cv(docImage)
cv.SaveImage('test_lines/docs.png', img)

# Train...
params = dhdp.Params()
params.runs = 1
params.samples = 1
#params.burnIn = 10000
#c.setOneCluster(True)
#c.setCalcBeta(True)
print 'Fitting model...'
p = ProgBar()
model = c.sampleModel(params, p.callback)
del p  # release the progress bar once sampling completes
#print model.z.sum(axis=0)

# Now plot the estimated distribution against the actual distribution...
img = numpy.ones((height, width, 3))
draw = model.sampleMixture()  # one concrete mixture drawn from the model
for px in xrange(width):
    # Map the pixel column to a position on the x axis.
    x = float(px) / float(width) * (high - low) + low
    # Ground-truth density: weighted sum over the true mixture components.
    y_gt = 0.0
    for ii in xrange(len(gt)):
        y_gt += gt_weight[ii] * gt[ii].prob([x])
    # Model's density estimate at x.
    y_gu = model.prob([x])
    # Density of the drawn mixture at x.
    y_gd = 0.0
    for ind, gauss in enumerate(draw[1]):
        y_gd += draw[0][ind] * gauss.prob([x])
    # Convert densities to pixel rows. Clip all three: previously only py_gd
    # was clipped, so a density above `scale` gave a negative row index that
    # silently wrapped, and a density below 0 could raise IndexError.
    py_gt = numpy.clip(int((1.0 - y_gt / scale) * height), 0, height - 1)
    py_gu = numpy.clip(int((1.0 - y_gu / scale) * height), 0, height - 1)
    py_gd = numpy.clip(int((1.0 - y_gd / scale) * height), 0, height - 1)
    img[py_gt, px, :] = [0.0, 1.0, 0.0]  # green = ground truth
    img[py_gu, px, :] = [1.0, 0.0, 0.0]  # red = model estimate
    img[py_gd, px, :] = [0.0, 0.0, 1.0]  # blue = sampled mixture

# Save plot out...
# (`i` comes from an enclosing loop defined before this excerpt.)
img = cvarray.array2cv(img * 255.0)
cv.SaveImage('%s/plot_%i.png' % (out_dir, i + 1), img)
print
# NOTE(review): this excerpt starts mid-loop -- `image` is presumably the
# per-document word-count array from an enclosing loop; confirm in the full file.
image *= 255.0 / image.max()  # normalise so the brightest cell maps to 255
image = numpy.reshape(image, (5, 5))  # 25 values arranged as a 5x5 grid
# Blow each cell up to 5x5 pixels so the grid is visible when saved.
image = numpy.repeat(numpy.repeat(image, 5, axis=0), 5, axis=1)
# Append a zero (black) separator strip along the bottom and right edges.
image = numpy.append(image, numpy.atleast_2d(numpy.zeros(image.shape[1])), axis=0)
image = numpy.append(image, numpy.atleast_2d(numpy.zeros(image.shape[0])).T, axis=1)
docImageSet.append(image)

# Arrange the document images into a 50x20 montage and save it out.
docVertSet = []
for i in xrange(50):
    docVertSet.append(numpy.vstack(docImageSet[i * 20:(i + 1) * 20]))
docSet = numpy.hstack(docVertSet)
img = cvarray.array2cv(docSet)
cv.SaveImage('test_grid_docs.png', img)

# Train...
print 'Trainning...'
#p = ProgBar()
#passes = vlda.solve()
#del p
passes = vlda.solveHuman()
print 'Took %i passes' % passes

# Generate an image of the final distributions associated with the learned documents...
# Get pixel values...
tImages = []
for topic in xrange(vlda.numTopics()):
    # Get distribution...
    # NOTE(review): the loop body continues beyond this excerpt.
for px in xrange(width): x = float(px)/float(width) * (high-low) + low y_gt = 0.0 for ii in xrange(len(gt)): y_gt += gt_weight[ii] * gt[ii].prob([x]) y_gu = model.prob([x]) y_gd = 0.0 for ind,gauss in enumerate(draw[1]): y_gd += draw[0][ind] * gauss.prob([x]) py_gt = int((1.0 - y_gt/scale) * height) py_gu = int((1.0 - y_gu/scale) * height) py_gd = numpy.clip(int((1.0 - y_gd/scale) * height),0,height-1) img[py_gt,px,:] = [0.0,1.0,0.0] img[py_gu,px,:] = [1.0,0.0,0.0] img[py_gd,px,:] = [0.0,0.0,1.0] # Save plot out... img = cvarray.array2cv(img*255.0) cv.SaveImage('%s/plot_%i.png'%(out_dir,model.getStickCap()),img) print # Either finish or incriment the number of sticks for the next run... value = model.nllData() if prev==None or value<prev: prev = value model.incStickCap() else: break
# Save out the input documents for confirmation (50x20 grid)... docImageSet = [] for words in inputImageSet: image = numpy.asfarray(words) image *= 255.0/image.max() image = numpy.reshape(image,(5,5)) image = numpy.repeat(numpy.repeat(image,5,axis=0),5,axis=1) image = numpy.append(image,numpy.atleast_2d(numpy.zeros(image.shape[1])),axis=0) image = numpy.append(image,numpy.atleast_2d(numpy.zeros(image.shape[0])).T,axis=1) docImageSet.append(image) docVertSet = [] for i in xrange(50): docVertSet.append(numpy.vstack(docImageSet[i*20:(i+1)*20])) docSet = numpy.hstack(docVertSet) img = cvarray.array2cv(docSet) cv.SaveImage('test_grid_docs.png',img) # Train... print 'Trainning...' #p = ProgBar() #passes = vlda.solve() #del p passes = vlda.solveHuman() print 'Took %i passes'%passes # Generate an image of the final distributions associated with the learned documents...
for i in xrange(documentsToTrain//docGridWidth): row = docs[i*docGridWidth:(i+1)*docGridWidth] rowExt = [] for r in row: rowExt.append(r) rowExt.append(numpy.zeros((gridSpace,gridScale*5), dtype=numpy.float_)) rowExt = rowExt[:-1] rows.append(numpy.vstack(rowExt)) stack = [] for r in rows: stack.append(r) stack.append(numpy.zeros((r.shape[0],gridSpace), dtype=numpy.float_)) stack = stack[:-1] docImage = numpy.hstack(stack).T * 255.0 img = cvarray.array2cv(docImage) cv.SaveImage('test_lines/docs.png',img) # Train... params = dhdp.Params() params.runs = 1 params.samples = 1 #params.burnIn = 10000 #c.setOneCluster(True) #c.setCalcBeta(True) print 'Fitting model...' p = ProgBar()