# (fragment) the statements below presumably run once per line of the raw tag file,
# accumulating id_tags before the tag matrix is built
line = line[:-2]
data = line.split("\t")
assert(len(data) == 3)
id_image = data[0]
tags = [x.lower() for x in data[2].split(" ")]
final_tags = [t for t in tags if t in vocab]
id_tags[id_image] = final_tags

N_images = len(id_tags)
print "N images: ", N_images

# build tag matrix
tagmatrix = np.zeros((N_images, N_tags), dtype=np.int8)
for i, id_im in enumerate(id_images):
    tags = id_tags[id_im]
    if len(tags) > 0:
        idx = [bisect_index(vocab, t) for t in tags]
        tagmatrix[i, idx] = True

# save output
fout = h5py.File(resultfile, 'w')
fout['tagmatrix'] = tagmatrix
fout['vocab'] = vocab
fout['id_images'] = id_images
fout.close()
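# Hypothetical helper, not part of the original listing: the scripts here assume a
# bisect_index(sorted_list, item) that returns the position of item in a sorted list
# and raises ValueError when the item is absent (the neighbour filtering below relies
# on that exception). A minimal sketch:
from bisect import bisect_left

def bisect_index(sorted_list, item):
    # leftmost position where item could be inserted while keeping the list sorted
    pos = bisect_left(sorted_list, item)
    if pos != len(sorted_list) and sorted_list[pos] == item:
        return pos
    raise ValueError('%s is not in list' % item)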
def process(options, testCollection, trainCollection, annotationName, feature):
    """Precompute train->train and test->train k-NN index/distance matrices for TagProp and store them in HDF5."""
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest

    nnName = distance + "knn"
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection,
                                    '%s,%s,%d' % (feature, nnName, k), 'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection,
                                   annotationName, '%s,%s,%d' % (feature, nnName, k), 'nn_test.h5')

    if (not onlytest and checkToSkip(resultfile_train, overwrite)) or checkToSkip(resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    #train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    #train_feat_file = BigFile(train_feat_dir)

    tagger = NAME_TO_TAGGER["preknn"](trainCollection, annotationName, feature, distance,
                                      rootpath=rootpath, k=1001)

    printStatus(INFO, '%d test images, %d train images' % (len(testSet), len(trainSet)))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')
        NN = np.zeros((len(trainSet), k + 1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k + 1))

        printStatus(INFO, 'Filling NN, NND matrices')
        for i, id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (trainCollection, id_img))
            if len(neighbors) < k + 1:
                printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k + 1))
                sys.exit(1)

            NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            NNDrow = np.array([x[1] for x in neighbors])
            NN[i, :] = NNrow[0:k + 1]
            NND[i, :] = NNDrow[0:k + 1]

            if i % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (i, len(trainSet)))

        printStatus(INFO, 'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()
        del NN
        del NND

    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')
    for i, id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k))
            sys.exit(1)

        NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        NNDrow = np.array([x[1] for x in neighbors])
        NNT[i, :] = NNrow[0:k]
        NNDT[i, :] = NNDrow[0:k]

        if i % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (i, len(testSet)))

    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts
    fout.close()
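# A quick sanity check of the matrices written above, assuming the nn_test.h5 layout
# produced by this script; the path is illustrative.
import h5py

with h5py.File('nn_test.h5', 'r') as f:
    NNT = f['NNT'][:]    # (n_test, k) indices into the sorted trainSet
    NNDT = f['NNDT'][:]  # (n_test, k) corresponding distances
    print NNT.shape, NNDT.shape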
def process(options, workingCollection, feature):
    """Build a sparse image-similarity Laplacian from k-NN distances and save it to laplacianI.mat."""
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection,
                              '%s,%s,%f' % (feature, nnName, k_ratio), 'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath,
                                       K_neighs * 2, feature, distance)

            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(INFO, 'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))

        for k in xrange(K_neighs):
            if i != int(NNrow[k]):  # -1 zero on the diagonal for a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k])  # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k])  # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    ind = np.lexsort((V, J, I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    _, idx = np.unique(b, return_index=True)
    del b
    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V) ** 2.
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i, v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()
    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity': matrix, 'sigma': sigma})
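# Toy illustration (made-up values) of the duplicate-removal idiom used above: viewing
# each (i, j) row through a void dtype lets np.unique deduplicate whole rows at once.
import numpy as np

I = np.array([0., 1., 0.])
J = np.array([1., 0., 1.])
V = np.array([0.5, 0.5, 0.5])

ind = np.lexsort((V, J, I))      # sort entries by (I, J, V)
I, J, V = I[ind], J[ind], V[ind]
a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
_, idx = np.unique(b, return_index=True)
print I[idx], J[idx], V[idx]     # the duplicated (0, 1) entry is dropped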
def process(options, workingCollection, annotationName, feature, outputpkl):
    """Run the RobustPCA tag-refinement step in MATLAB and dump the predicted tag scores to a pickle file."""
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" %
                (workingCollection, annotationName, feature, nnName, k_ratio, lambda1, lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = os.path.join(outputpkl)
    resultfile_robustpca = os.path.join(rootpath, workingCollection, 'RobustPCA-Prediction',
                                        '%s,%s,%f,%f,%f,%d' % (feature, nnName, lambda1, lambda2, k_ratio, rawtagmatrix),
                                        'prediction.mat')

    if checkToSkip(resultfile_robustpca, overwrite):
        only_dump = True
    else:
        only_dump = False

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'RobustPCA',
                                      '%s,%s,%f' % (feature, nnName, DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?" % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, 'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection,
                                   '%s,%s,%f' % (feature, nnName, k_ratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, "LaplacianI file not found at %s Did you run laplacian_images.py?" % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f' % (ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(INFO, "LaplacianT file not found at %s Did you run laplacian_tags.py?" % (laplacianT_file))
        sys.exit(1)

    # begin learning
    script = """
rpca_path = 'transduction_based/robustpca/';
addpath(rpca_path);
addpath([rpca_path, 'fast_svd/']);
tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
load('%s');
load('%s');
lambda1 = %f;
lambda2 = %f;
maxIters = 50;
precision = 1e-4;
mu_start = 1.;
parpool('local', 4);
[P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
""" % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)

    script += """
delete(gcp);
save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
exit;
""" % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(workingCollection.split('+')[1], workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)

    predicted_tagmatrix = robustpca_output['P'][:, mapping]

    if outputonlytest:
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert(final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts': concepts, 'id_images': id_images, 'scores': final_tagmatrix},
                    f, pickle.HIGHEST_PROTOCOL)
def process(options, workingCollection, feature):
    """Propagate the binary tag matrix over each image's visual neighbours to produce a soft tag matrix for RobustPCA."""
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite
    laplaciankratio = options.laplaciankratio

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'RobustPCA',
                              '%s,%s,%f' % (feature, nnName, k_ratio), 'tagmatrix.h5')

    if checkToSkip(resultfile, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, 'Tagmatrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
        sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection,
                                   '%s,%s,%f' % (feature, nnName, laplaciankratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, 'LaplacianI file not found in %s Did you run laplacian_images.py?' % (laplacianI_file))
        sys.exit(1)

    tagmatrix_data = h5py.File(tagmatrix_file, 'r')
    tagmatrix = tagmatrix_data['tagmatrix'][:]
    printStatus(INFO, 'tagmatrix.shape = %s' % (str(tagmatrix.shape)))

    laplacian_data = scipy.io.loadmat(laplacianI_file)
    sigma = laplacian_data['sigma']
    printStatus(INFO, 'Sigma^2 = %f' % (sigma))

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    #print map(int, workingSet)[0:10], map(int, list(tagmatrix_data['id_images'][:])[0:10])
    #assert(np.all(map(int, workingSet) == list(tagmatrix_data['id_images'][:])))
    assert(np.all(workingSet == list(tagmatrix_data['id_images'][:])))

    tot_images = len(workingSet)
    printStatus(INFO, '%d images in %s' % (tot_images, workingCollection))
    printStatus(INFO, 'Mean images per tag = %f' % (np.mean(tagmatrix.sum(axis=0))))

    K_neighs = int(math.floor(np.mean(tagmatrix.sum(axis=0)) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Starting the propagation pre-processing')
    tagmatrix_new = np.zeros(tagmatrix.shape)
    for i in xrange(tot_images):
        neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath,
                                   K_neighs * 2, feature, distance)

        # remove images with features but not in the working set
        NNrow = []
        NNDrow = []
        new_neighs = []
        for x in neighbors:
            try:
                NNrow.append(bisect_index(workingSet, x[0]))
                NNDrow.append(x[1])
                new_neighs.append(x)
            except ValueError:
                pass
        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])
        NNrow = np.array(NNrow)
        NNDrow = np.array(NNDrow)
        neighbors = new_neighs[0:K_neighs]

        C = np.sum(np.exp(-(NNDrow) / sigma))
        tagmatrix_new[i, :] = np.sum((np.exp(-(NNDrow) / sigma).T * tagmatrix[NNrow]) / C, axis=0)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))

    # save output
    printStatus(INFO, 'Saving propagated tagmatrix to %s' % resultfile)
    makedirsforfile(resultfile)
    fout = h5py.File(resultfile, 'w')
    fout['tagmatrix'] = tagmatrix_new
    fout['vocab'] = tagmatrix_data['vocab'][:]
    fout['id_images'] = workingSet
    fout.close()
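# Toy illustration of the soft propagation step above, with made-up numbers. Note that
# the original's `.T` broadcast appears to rely on `sigma` coming out of scipy.io.loadmat
# as a 1x1 array; here the weighting is written out explicitly instead.
import numpy as np

sigma = 2.0
NNDrow = np.array([0.1, 0.4, 0.9])          # distances to three retained neighbours
neighbour_tags = np.array([[1, 0],          # binary tag rows of those neighbours
                           [1, 1],
                           [0, 1]], dtype=float)

w = np.exp(-NNDrow / sigma)                 # similarity weights
C = np.sum(w)                               # normalisation constant
propagated = np.sum((w[:, np.newaxis] * neighbour_tags) / C, axis=0)
print propagated                            # soft scores per tag, each in [0, 1]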
pkl_file = open(sys.argv[2], 'w')
workingCollection = sys.argv[3]
annotationName = sys.argv[4]
rootpath = ROOT_PATH

id_images = tagmatrix_file['id_images']
concepts = readConcepts(workingCollection, annotationName, rootpath)

testset_id_images = readImageSet(workingCollection.split('+')[1], workingCollection.split('+')[1], rootpath)
testset_id_images.sort()

if not type(id_images[0]) is str:
    id_images = map(str, id_images)
if not type(testset_id_images[0]) is str:
    testset_id_images = map(str, testset_id_images)

mapping = getVocabMap(list(tagmatrix_file['vocab'][:]), concepts)
predicted_tagmatrix = tagmatrix_file['tagmatrix'][:, mapping]

print "predicted_tagmatrix.shape = ", predicted_tagmatrix.shape
print "len(id_images) = ", len(id_images)
print "len(testset_id_images) = ", len(testset_id_images)

idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
final_tagmatrix = predicted_tagmatrix[idx, :]
id_images = testset_id_images

print "dumping %d elements..." % len(id_images)
pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': final_tagmatrix},
            pkl_file, pickle.HIGHEST_PROTOCOL)
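# Quick check of the dumped scores, assuming the dictionary layout written above;
# the filename is illustrative.
import pickle

with open('scores.pkl', 'rb') as f:
    data = pickle.load(f)
print len(data['concepts']), len(data['id_images']), data['scores'].shape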