Example #1
        line = line[:-2]
        data = line.split("\t")

        assert (len(data) == 3)
        id_image = data[0]
        tags = [x.lower() for x in data[2].split(" ")]

        final_tags = [t for t in tags if t in vocab]

        id_tags[id_image] = final_tags

N_images = len(id_tags)

print "N images: ", N_images

# build tag matrix
tagmatrix = np.zeros((N_images, N_tags), dtype=np.int8)

for i, id_im in enumerate(id_images):
    tags = id_tags[id_im]
    if len(tags) > 0:
        idx = [bisect_index(vocab, t) for t in tags]
        tagmatrix[i, idx] = True

# save output
fout = h5py.File(resultfile, 'w')
fout['tagmatrix'] = tagmatrix
fout['vocab'] = vocab
fout['id_images'] = id_images
fout.close()
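
Note: every example on this page indexes sorted image-id lists through bisect_index, which is not part of the Python standard library. A minimal sketch of what the helper presumably does, assuming it wraps bisect.bisect_left and raises ValueError for ids that are absent (behaviour the neighbor-filtering loops in the later examples rely on):

from bisect import bisect_left

def bisect_index(sorted_list, value):
    # O(log n) lookup in a sorted list; a ValueError signals "not in the set",
    # which callers below catch in order to drop neighbors outside the working set
    i = bisect_left(sorted_list, value)
    if i != len(sorted_list) and sorted_list[i] == value:
        return i
    raise ValueError('%r is not in list' % (value,))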
Example #2
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest
    
    nnName = distance + "knn"
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data', trainCollection, '%s,%s,%d'%(feature,nnName,k), 'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data', testset, trainCollection, annotationName, '%s,%s,%d'%(feature,nnName,k), 'nn_test.h5')
    
    if (not onlytest and checkToSkip(resultfile_train, overwrite)) or checkToSkip(resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    #train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    #train_feat_file = BigFile(train_feat_dir)

    tagger = NAME_TO_TAGGER["preknn"](trainCollection, annotationName, feature, distance, rootpath=rootpath, k=1001)

    printStatus(INFO, '%d test images, %d train images' % (len(testSet), len(trainSet)))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')    
        NN = np.zeros((len(trainSet), k+1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k+1))

        printStatus(INFO, 'Filling NN, NND matrices')    
        for i,id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (trainCollection, id_img))
            if len(neighbors) < k+1:
                printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k+1))    
                sys.exit(1)

            NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            NNDrow = np.array([x[1] for x in neighbors])

            NN[i,:] = NNrow[0:k+1]
            NND[i,:] = NNDrow[0:k+1]

            if i % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (i, len(trainSet)))    

        printStatus(INFO, 'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()

        del NN
        del NND
   
    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')        
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')    
    for i,id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None, context='%s,%s' % (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (id_img, len(neighbors), k))    
            sys.exit(1)

        NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        NNDrow = np.array([x[1] for x in neighbors])

        NNT[i,:] = NNrow[0:k]
        NNDT[i,:] = NNDrow[0:k]

        if i % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (i, len(testSet)))    
   
    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts   
    fout.close()
Example #3
def process(options, testCollection, trainCollection, annotationName, feature):
    rootpath = options.rootpath
    k = options.k
    distance = options.distance
    overwrite = options.overwrite
    testset = testCollection
    onlytest = options.onlytest

    nnName = distance + "knn"
    resultfile_train = os.path.join(rootpath, trainCollection, 'TagProp-data',
                                    trainCollection,
                                    '%s,%s,%d' % (feature, nnName, k),
                                    'nn_train.h5')
    resultfile_test = os.path.join(rootpath, testCollection, 'TagProp-data',
                                   testset, trainCollection, annotationName,
                                   '%s,%s,%d' % (feature, nnName, k),
                                   'nn_test.h5')

    if (not onlytest
            and checkToSkip(resultfile_train, overwrite)) or checkToSkip(
                resultfile_test, overwrite):
        return 0

    testSet = readImageSet(testCollection, testset, rootpath)
    trainSet = readImageSet(trainCollection, trainCollection, rootpath)
    testSet.sort()
    trainSet.sort()

    #train_feat_dir = os.path.join(rootpath, trainCollection, 'FeatureData', feature)
    #train_feat_file = BigFile(train_feat_dir)

    tagger = NAME_TO_TAGGER["preknn"](trainCollection,
                                      annotationName,
                                      feature,
                                      distance,
                                      rootpath=rootpath,
                                      k=1001)

    printStatus(
        INFO,
        '%d test images, %d train images' % (len(testSet), len(trainSet)))

    # allocate train -> train nearest neighbors
    if not onlytest:
        printStatus(INFO, 'Allocating NN, NND matrices')
        NN = np.zeros((len(trainSet), k + 1), dtype=np.int32)
        NND = np.zeros((len(trainSet), k + 1))

        printStatus(INFO, 'Filling NN, NND matrices')
        for i, id_img in enumerate(trainSet):
            neighbors = tagger._get_neighbors(content=None,
                                              context='%s,%s' %
                                              (trainCollection, id_img))
            if len(neighbors) < k + 1:
                printStatus(
                    INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                    (id_img, len(neighbors), k + 1))
                sys.exit(1)

            NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
            NNDrow = np.array([x[1] for x in neighbors])

            NN[i, :] = NNrow[0:k + 1]
            NND[i, :] = NNDrow[0:k + 1]

            if i % 1000 == 0:
                printStatus(INFO, '%d / %d images' % (i, len(trainSet)))

        printStatus(INFO,
                    'Saving train matrices to file %s' % (resultfile_train))
        makedirsforfile(resultfile_train)
        fout = h5py.File(resultfile_train, 'w')
        fout['NN'] = NN
        fout['NND'] = NND
        fout['trainSet'] = trainSet
        fout['concepts'] = tagger.concepts
        fout.close()

        del NN
        del NND

    # allocate test -> train nearest neighbors
    printStatus(INFO, 'Allocating NNT, NNDT matrices')
    NNT = np.zeros((len(testSet), k), dtype=np.int32)
    NNDT = np.zeros((len(testSet), k))

    printStatus(INFO, 'Filling NNT, NNDT matrices')
    for i, id_img in enumerate(testSet):
        neighbors = tagger._get_neighbors(content=None,
                                          context='%s,%s' %
                                          (testCollection, id_img))
        if len(neighbors) < k:
            printStatus(
                INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                (id_img, len(neighbors), k))
            sys.exit(1)

        NNrow = np.array([bisect_index(trainSet, x[0]) for x in neighbors])
        NNDrow = np.array([x[1] for x in neighbors])

        NNT[i, :] = NNrow[0:k]
        NNDT[i, :] = NNDrow[0:k]

        if i % 1000 == 0:
            printStatus(INFO, '%d / %d images' % (i, len(testSet)))

    printStatus(INFO, 'Saving test matrices to file %s' % (resultfile_test))
    makedirsforfile(resultfile_test)
    fout = h5py.File(resultfile_test, 'w')
    fout['NNT'] = NNT
    fout['NNDT'] = NNDT
    fout['trainSet'] = trainSet
    fout['testSet'] = testSet
    fout['concepts'] = tagger.concepts
    fout.close()
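
Note the k+1 / k asymmetry in the function above: in the train -> train pass each image presumably comes back as its own nearest neighbor at distance 0, so one extra column is stored and a consumer of nn_train.h5 would drop it. A hypothetical post-processing step:

NN_wo_self = NN[:, 1:]    # (len(trainSet), k) neighbor indices, self-match dropped
NND_wo_self = NND[:, 1:]  # the matching distances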
Example #4
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath, K_neighs*2, feature, distance)
            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(INFO, 'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(INFO, 'ERROR: id_img %s has %d < %d neighbors!' % (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i+1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, tot_images))
        for k in xrange(K_neighs):
            if i != int(NNrow[k]): # skip self-matches; the diagonal is filled in a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k]) # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k]) # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    ind = np.lexsort((V,J,I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
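    # view each (I, J) row as a single opaque value so np.unique can
    # deduplicate the repeated edge pairs in one pass (a standard NumPy
    # unique-rows idiom for versions of np.unique without an axis argument)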
    b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    _, idx = np.unique(b, return_index=True)
    del b

    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V) ** 2.
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i,v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity' : matrix, 'sigma' : sigma})
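
What this function builds is the unnormalized graph Laplacian of the image k-NN graph. Reading the code back into formulas, with d_ij the retained neighbor distances:

    W_{ij} = \exp(-d_{ij} / \sigma), \qquad \sigma = \operatorname{median}(d)^2

    L = D - W, \qquad D_{ii} = \textstyle\sum_j W_{ij}

Note that the result is saved under the key 'im_similarity', although the stored matrix is the Laplacian L rather than the similarity W itself.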
Example #5
def process(options, workingCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" % (workingCollection, annotationName, feature, nnName, k_ratio, lambda1, lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = outputpkl
    resultfile_robustpca = os.path.join(rootpath, workingCollection, 'RobustPCA-Prediction', '%s,%s,%f,%f,%f,%d'%(feature,nnName,lambda1,lambda2,k_ratio,rawtagmatrix), 'prediction.mat')

    if checkToSkip(resultfile_robustpca, overwrite):
        only_dump = True
    else:
        only_dump = False

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'RobustPCA', '%s,%s,%f'%(feature,nnName,DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?" % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(INFO, 'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,k_ratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, "LaplacianI file not found at %s Did you run laplacian_images.py?" % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT', '%f'%(ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(INFO, "LaplacianT file not found at %s Did you run laplacian_tags.py?" % (laplacianT_file))
        sys.exit(1)

    # begin learning
    script = """
        rpca_path = 'transduction_based/robustpca/';
        addpath(rpca_path);
        addpath([rpca_path, 'fast_svd/']);
        tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
        load('%s');
        load('%s');

        lambda1 = %f;
        lambda2 = %f;
        maxIters = 50;
        precision = 1e-4;
        mu_start = 1.;

        parpool('local', 4);
        [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
        """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1, lambda2)

    script += """
        delete(gcp);
        save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
        exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(workingCollection.split('+')[1], workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]),concepts)

    predicted_tagmatrix = robustpca_output['P'][:,mapping]

    if outputonlytest:
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert(final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as f:  # binary mode: pickle.HIGHEST_PROTOCOL is a binary format
        pickle.dump({'concepts': concepts, 'id_images': id_images, 'scores': final_tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
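
The dumped pickle can then be read back with the matching keys. A minimal sketch, with a hypothetical file name:

import pickle

with open('prediction.pkl', 'rb') as f:  # hypothetical path
    results = pickle.load(f)

# 'scores' is an (n_images, n_concepts) matrix aligned with the two id lists
print results['scores'].shape, len(results['id_images']), len(results['concepts'])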
Example #6
def process(options, workingCollection, annotationName, feature, outputpkl):
    rootpath = options.rootpath
    distance = options.distance
    overwrite = options.overwrite
    k_ratio = options.kratio
    ratio_cs = options.ratiocs
    lambda1 = options.lambda1
    lambda2 = options.lambda2
    outputonlytest = options.outputonlytest
    rawtagmatrix = options.rawtagmatrix
    modelName = "robustpca"
    nnName = distance + "knn"

    printStatus(
        INFO, "Starting RobustPCA %s,%s,%s,%s,%f,%f,%f" %
        (workingCollection, annotationName, feature, nnName, k_ratio, lambda1,
         lambda2))

    if rawtagmatrix:
        printStatus(INFO, "Using raw tag matrix.")
    else:
        printStatus(INFO, "Using preprocessed tag matrix.")

    resultfile = outputpkl
    resultfile_robustpca = os.path.join(
        rootpath, workingCollection, 'RobustPCA-Prediction',
        '%s,%s,%f,%f,%f,%d' %
        (feature, nnName, lambda1, lambda2, k_ratio, rawtagmatrix),
        'prediction.mat')

    if checkToSkip(resultfile_robustpca, overwrite):
        only_dump = True
    else:
        only_dump = False

    if not rawtagmatrix:
        tagmatrix_file = os.path.join(
            rootpath, workingCollection, 'RobustPCA',
            '%s,%s,%f' % (feature, nnName, DEFAULT_K_PROP), 'tagmatrix.h5')
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                "Tag matrix file not found at %s Did you run robustpca_preprocessing.py?"
                % (tagmatrix_file))
            sys.exit(1)
    else:
        tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData',
                                      "lemm_wordnet_freq_tags.h5")
        if not os.path.exists(tagmatrix_file):
            printStatus(
                INFO,
                'Tag matrix file not found in %s Did you run wordnet_frequency_tags.py?'
                % (tagmatrix_file))
            sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI',
                                   workingCollection,
                                   '%s,%s,%f' % (feature, nnName, k_ratio),
                                   'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(
            INFO,
            "LaplacianI file not found at %s Did you run laplacian_images.py?"
            % (laplacianI_file))
        sys.exit(1)

    laplacianT_file = os.path.join(rootpath, workingCollection, 'LaplacianT',
                                   '%f' % (ratio_cs), 'laplacianT.mat')
    if not os.path.exists(laplacianT_file):
        printStatus(
            INFO,
            "LaplacianT file not found at %s Did you run laplacian_tags.py?" %
            (laplacianT_file))
        sys.exit(1)

    # begin learning
    script = """
        rpca_path = 'transduction_based/robustpca/';
        addpath(rpca_path);
        addpath([rpca_path, 'fast_svd/']);
        tagmatrix = sparse(double(h5read('%s', '/tagmatrix')));
        load('%s');
        load('%s');

        lambda1 = %f;
        lambda2 = %f;
        maxIters = 50;
        precision = 1e-4;
        mu_start = 1.;

        parpool('local', 4);
        [P,E]=robustpca(tagmatrix, lambda1, lambda2, tag_similarity, im_similarity, maxIters, precision, mu_start);
        """ % (tagmatrix_file, laplacianI_file, laplacianT_file, lambda1,
               lambda2)

    script += """
        delete(gcp);
        save('%s', 'P', 'E', 'lambda1', 'lambda2', '-v7.3');
        exit;
    """ % resultfile_robustpca

    if not only_dump:
        printStatus(INFO, "starting learning")
        makedirsforfile(resultfile_robustpca)
        call_matlab(script)

    if checkToSkip(resultfile, overwrite):
        return 0

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    concepts = readConcepts(workingCollection, annotationName, rootpath)
    if outputonlytest:
        testset_id_images = readImageSet(
            workingCollection.split('+')[1],
            workingCollection.split('+')[1], rootpath)
        testset_id_images.sort()

    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    id_images.sort()

    # concepts mapping
    robustpca_output = h5py.File(resultfile_robustpca, 'r')
    tagprop_input = h5py.File(tagmatrix_file, 'r')
    mapping = getVocabMap(list(tagprop_input['vocab'][:]), concepts)

    predicted_tagmatrix = robustpca_output['P'][:, mapping]

    if outputonlytest:
        idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
        final_tagmatrix = predicted_tagmatrix[idx, :]
        assert (final_tagmatrix.shape[0] == idx.shape[0])
        id_images = testset_id_images
    else:
        final_tagmatrix = predicted_tagmatrix

    makedirsforfile(resultfile)
    with open(resultfile, 'wb') as f:  # binary mode: pickle.HIGHEST_PROTOCOL is a binary format
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': id_images,
                'scores': final_tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example #7
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite
    laplaciankratio = options.laplaciankratio

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'RobustPCA', '%s,%s,%f'%(feature,nnName,k_ratio), 'tagmatrix.h5')

    if checkToSkip(resultfile, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData', "lemm_wordnet_freq_tags.h5")
    if not os.path.exists(tagmatrix_file):
        printStatus(INFO, 'Tagmatrix file not found in %s Did you run wordnet_frequency_tags.py?' % (tagmatrix_file))
        sys.exit(1)

    laplacianI_file = os.path.join(rootpath, workingCollection, 'LaplacianI', workingCollection, '%s,%s,%f'%(feature,nnName,laplaciankratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(INFO, 'LaplacianI file not found in %s Did you run laplacian_images.py?' % (laplacianI_file))
        sys.exit(1)

    tagmatrix_data = h5py.File(tagmatrix_file, 'r')
    tagmatrix = tagmatrix_data['tagmatrix'][:]
    printStatus(INFO, 'tagmatrix.shape = %s' % (str(tagmatrix.shape)))

    laplacian_data = scipy.io.loadmat(laplacianI_file)
    sigma = laplacian_data['sigma']
    printStatus(INFO, 'Sigma^2 = %f' % (sigma))

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    #print map(int, workingSet)[0:10], map(int, list(tagmatrix_data['id_images'][:])[0:10])
    #assert(np.all(map(int, workingSet) == list(tagmatrix_data['id_images'][:])))
    assert(np.all(workingSet == list(tagmatrix_data['id_images'][:])))

    tot_images = len(workingSet)
    printStatus(INFO, '%d images in %s' % (tot_images, workingCollection))

    printStatus(INFO, 'Mean images per tag = %f' % (np.mean(tagmatrix.sum(axis=0))))
    K_neighs = int(math.floor(np.mean(tagmatrix.sum(axis=0)) * k_ratio))
    printStatus(INFO, '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Starting the propagation pre-processing')
    tagmatrix_new = np.zeros(tagmatrix.shape)
    for i in xrange(tot_images):
        neighbors = _get_neighbors('%s,%s' % (workingCollection, workingSet[i]), rootpath, K_neighs * 2, feature, distance)

        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])

        # remove images with features but not in the working set
        NNrow = []
        NNDrow = []
        new_neighs = []
        for x in neighbors:
            try:
                NNrow.append(bisect_index(workingSet, x[0]))
                NNDrow.append(x[1])
                new_neighs.append(x)
            except ValueError:
                pass
        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])
        NNrow = np.array(NNrow)
        NNDrow = np.array(NNDrow)
        neighbors = new_neighs[0:K_neighs]
        
        C = np.sum(np.exp(-NNDrow / sigma))
        tagmatrix_new[i, :] = np.sum((np.exp(-NNDrow / sigma).T * tagmatrix[NNrow]) / C, axis=0)

        if (i+1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i+1, tot_images))

    # save output
    printStatus(INFO, 'Saving propagated tagmatrix to %s' % resultfile)
    makedirsforfile(resultfile)
    fout = h5py.File(resultfile, 'w')
    fout['tagmatrix'] = tagmatrix_new
    fout['vocab'] = tagmatrix_data['vocab'][:]
    fout['id_images'] = workingSet
    fout.close()
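
The propagation loop above is a kernel-weighted average of the neighbors' tag vectors. For image i with retained neighbors N(i) at distances d_ij, and sigma the squared median distance loaded from the Laplacian file:

    \hat{T}_i = \frac{\sum_{j \in N(i)} e^{-d_{ij}/\sigma} \, T_j}{\sum_{j \in N(i)} e^{-d_{ij}/\sigma}}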
Example #8
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite
    laplaciankratio = options.laplaciankratio

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'RobustPCA',
                              '%s,%s,%f' % (feature, nnName, k_ratio),
                              'tagmatrix.h5')

    if checkToSkip(resultfile, overwrite):
        return 0

    tagmatrix_file = os.path.join(rootpath, workingCollection, 'TextData',
                                  "lemm_wordnet_freq_tags.h5")
    if not os.path.exists(tagmatrix_file):
        printStatus(
            INFO,
            'Tagmatrix file not found in %s Did you run wordnet_frequency_tags.py?'
            % (tagmatrix_file))
        sys.exit(1)

    laplacianI_file = os.path.join(
        rootpath, workingCollection, 'LaplacianI', workingCollection,
        '%s,%s,%f' % (feature, nnName, laplaciankratio), 'laplacianI.mat')
    if not os.path.exists(laplacianI_file):
        printStatus(
            INFO,
            'LaplacianI file not found in %s Did you run laplacian_images.py?'
            % (laplacianI_file))
        sys.exit(1)

    tagmatrix_data = h5py.File(tagmatrix_file, 'r')
    tagmatrix = tagmatrix_data['tagmatrix'][:]
    printStatus(INFO, 'tagmatrix.shape = %s' % (str(tagmatrix.shape)))

    laplacian_data = scipy.io.loadmat(laplacianI_file)
    sigma = laplacian_data['sigma']
    printStatus(INFO, 'Sigma^2 = %f' % (sigma))

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()
    #print map(int, workingSet)[0:10], map(int, list(tagmatrix_data['id_images'][:])[0:10])
    #assert(np.all(map(int, workingSet) == list(tagmatrix_data['id_images'][:])))
    assert (np.all(workingSet == list(tagmatrix_data['id_images'][:])))

    tot_images = len(workingSet)
    printStatus(INFO, '%d images in %s' % (tot_images, workingCollection))

    printStatus(INFO,
                'Mean images per tag = %f' % (np.mean(tagmatrix.sum(axis=0))))
    K_neighs = int(math.floor(np.mean(tagmatrix.sum(axis=0)) * k_ratio))
    printStatus(
        INFO,
        '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Starting the propagation pre-processing')
    tagmatrix_new = np.zeros(tagmatrix.shape)
    for i in xrange(tot_images):
        neighbors = _get_neighbors(
            '%s,%s' % (workingCollection, workingSet[i]), rootpath,
            K_neighs * 2, feature, distance)

        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])

        # remove images with features but not in the working set
        NNrow = []
        NNDrow = []
        new_neighs = []
        for x in neighbors:
            try:
                NNrow.append(bisect_index(workingSet, x[0]))
                NNDrow.append(x[1])
                new_neighs.append(x)
            except ValueError:
                pass
        #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
        #NNDrow = np.array([x[1] for x in neighbors])
        NNrow = np.array(NNrow)
        NNDrow = np.array(NNDrow)
        neighbors = new_neighs[0:K_neighs]

        C = np.sum(np.exp(-(NNDrow) / sigma))
        tagmatrix_new[i, :] = np.sum(
            (np.exp(-(NNDrow) / sigma).T * tagmatrix[NNrow]) / C, axis=0)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))

    # save output
    printStatus(INFO, 'Saving propagated tagmatrix to %s' % resultfile)
    makedirsforfile(resultfile)
    fout = h5py.File(resultfile, 'w')
    fout['tagmatrix'] = tagmatrix_new
    fout['vocab'] = tagmatrix_data['vocab'][:]
    fout['id_images'] = workingSet
    fout.close()
Example #9
        line = line[:-2]
        data = line.split("\t")

        assert len(data) == 3
        id_image = data[0]
        tags = [x.lower() for x in data[2].split(" ")]

        final_tags = [t for t in tags if t in vocab]

        id_tags[id_image] = final_tags

N_images = len(id_tags)

print "N images: ", N_images

# build tag matrix
tagmatrix = np.zeros((N_images, N_tags), dtype=np.int8)

for i, id_im in enumerate(id_images):
    tags = id_tags[id_im]
    if len(tags) > 0:
        idx = [bisect_index(vocab, t) for t in tags]
        tagmatrix[i, idx] = True

# save output
fout = h5py.File(resultfile, "w")
fout["tagmatrix"] = tagmatrix
fout["vocab"] = vocab
fout["id_images"] = id_images
fout.close()
Example #10
pkl_file = open(sys.argv[2], 'wb')  # binary mode: pickle.HIGHEST_PROTOCOL is a binary format
workingCollection = sys.argv[3]
annotationName = sys.argv[4]
rootpath = ROOT_PATH

# tagmatrix_file is assumed to be an h5py.File opened earlier in the script
# (presumably from sys.argv[1], which is not shown in this snippet)
id_images = tagmatrix_file['id_images']
concepts = readConcepts(workingCollection, annotationName, rootpath)
testset_id_images = readImageSet(workingCollection.split('+')[1], workingCollection.split('+')[1], rootpath)
testset_id_images.sort()

if not isinstance(id_images[0], str):
    id_images = map(str, id_images)

if not isinstance(testset_id_images[0], str):
    testset_id_images = map(str, testset_id_images)

mapping = getVocabMap(list(tagmatrix_file['vocab'][:]), concepts)
predicted_tagmatrix = tagmatrix_file['tagmatrix'][:, mapping]

print "predicted_tagmatrix.shape = ", predicted_tagmatrix.shape
print "len(id_images) = ", len(id_images)
print "len(testset_id_images) = ", len(testset_id_images)

idx = np.array([bisect_index(id_images, x) for x in testset_id_images])
final_tagmatrix = predicted_tagmatrix[idx, :]
id_images = testset_id_images

print "dumping %d elements..." % len(id_images)

pickle.dump({'concepts': concepts, 'id_images': map(int, id_images), 'scores': final_tagmatrix}, pkl_file, pickle.HIGHEST_PROTOCOL)
pkl_file.close()
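
Judging by the sys.argv indices, this snippet runs as a standalone script; sys.argv[1] is presumably the tag-matrix h5 file opened earlier as tagmatrix_file. A hypothetical invocation (script and file names are illustrative only):

python dump_testset_predictions.py tagmatrix.h5 output.pkl train+test annotationName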
Example #11
def process(options, workingCollection, feature):
    rootpath = options.rootpath
    k_ratio = options.kratio
    distance = options.distance
    overwrite = options.overwrite

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, workingCollection, 'LaplacianI',
                              workingCollection,
                              '%s,%s,%f' % (feature, nnName, k_ratio),
                              'laplacianI.mat')

    if checkToSkip(resultfile, overwrite):
        return 0

    workingSet = readImageSet(workingCollection, workingCollection, rootpath)
    workingSet.sort()

    tot_images = len(workingSet)
    printStatus(INFO, '%d images' % (tot_images))

    K_neighs = int(math.floor(len(workingSet) * k_ratio))
    printStatus(
        INFO,
        '%d nearest neighbor per image (ratio = %f)' % (K_neighs, k_ratio))

    printStatus(INFO, 'Allocating I,J,V arrays')
    I = np.zeros((K_neighs * tot_images * 2))
    J = np.zeros((K_neighs * tot_images * 2))
    V = np.zeros((K_neighs * tot_images * 2))
    n_entries = 0

    # distances
    printStatus(INFO, 'Starting to fill I,J,V arrays')
    for i in xrange(tot_images):
        try:
            neighbors = _get_neighbors(
                '%s,%s' % (workingCollection, workingSet[i]), rootpath,
                K_neighs * 2, feature, distance)
            # remove images with features but not in the working set
            NNrow = []
            NNDrow = []
            new_neighs = []
            for x in neighbors:
                try:
                    NNrow.append(bisect_index(workingSet, x[0]))
                    NNDrow.append(x[1])
                    new_neighs.append(x)
                except ValueError:
                    pass
            #NNrow = np.array([bisect_index(workingSet, x[0]) for x in neighbors])
            #NNDrow = np.array([x[1] for x in neighbors])
            NNrow = np.array(NNrow)
            NNDrow = np.array(NNDrow)
            neighbors = new_neighs[0:K_neighs]
        except ValueError:
            printStatus(
                INFO,
                'ERROR: id_img %s has non-standard format!' % (workingSet[i]))
            sys.exit(1)

        if len(neighbors) < K_neighs:
            printStatus(
                INFO, 'ERROR: id_img %s has %d < %d neighbors!' %
                (workingSet[i], len(neighbors), K_neighs))
            sys.exit(1)

        if (i + 1) % 1000 == 0:
            printStatus(INFO, '%d / %d done' % (i + 1, tot_images))
        for k in xrange(K_neighs):
            if i != int(NNrow[k]):  # skip self-matches; the diagonal is filled in a later step
                I[n_entries] = i
                J[n_entries] = int(NNrow[k])  # -1
                V[n_entries] = NNDrow[k]
                n_entries += 1
                I[n_entries] = int(NNrow[k])  # -1
                J[n_entries] = i
                V[n_entries] = NNDrow[k]
                n_entries += 1

    I = I[0:n_entries]
    J = J[0:n_entries]
    V = V[0:n_entries]

    printStatus(INFO, 'Removing duplicates')
    ind = np.lexsort((V, J, I))
    I = I[ind]
    J = J[ind]
    V = V[ind]
    a = np.concatenate([I.reshape(1, len(I)), J.reshape(1, len(J))], axis=0).T
    b = np.ascontiguousarray(a).view(
        np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    del a
    _, idx = np.unique(b, return_index=True)
    del b

    I = I[idx]
    J = J[idx]
    V = V[idx]

    printStatus(INFO, 'Computing the final laplacian matrix')
    sigma = np.median(V)**2.
    printStatus(INFO, 'Estimated sigma^2 = %f' % sigma)
    V = np.exp(-V / sigma)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tocsr()
    new_diag = matrix.sum(axis=0).T
    V = -V

    I_add = np.zeros((tot_images))
    J_add = np.zeros((tot_images))
    V_add = np.zeros((tot_images))
    for i, v in enumerate(new_diag):
        I_add[i] = i
        J_add[i] = i
        V_add[i] = v

    I = np.append(I, I_add)
    J = np.append(J, J_add)
    V = np.append(V, V_add)

    matrix = coo_matrix((V, (I, J)), shape=(tot_images, tot_images)).tolil()

    printStatus(INFO, 'Saving laplacian matrix to %s' % resultfile)
    makedirsforfile(resultfile)
    scipy.io.savemat(resultfile, {'im_similarity': matrix, 'sigma': sigma})