Example 1
def process(options, collection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    threshold = options.threshold
    encoding = options.encoding
    language = options.language

    vocab_file = os.path.join(rootpath, collection, 'TextData', 'vocab',
                              '%s_%d.pkl' % (encoding, threshold))
    count_file = os.path.join(os.path.dirname(vocab_file),
                              '%s_%d.txt' % (encoding, threshold))

    if checkToSkip(vocab_file, overwrite):
        return 0

    cap_file = os.path.join(rootpath, collection, 'TextData',
                            '%s.caption.txt' % collection)
    vocab, word_counts = build_vocab(cap_file,
                                     encoding,
                                     threshold=threshold,
                                     lang=language)

    makedirsforfile(vocab_file)
    with open(vocab_file, 'wb') as fw:
        pickle.dump(vocab, fw, pickle.HIGHEST_PROTOCOL)
    logger.info("Saved vocabulary of %d words to %s", len(vocab), vocab_file)

    with open(count_file, 'w') as fw:
        fw.write('\n'.join(['%s %d' % x for x in word_counts]))

    logger.info("Saved word-counts to %s", count_file)
Example 2
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    topk = options.topk

    label_file = os.path.join(rootpath, collection, 'TextData',
                              '%s.imglabel.txt' % collection)
    vocab_file = os.path.join(rootpath, collection, 'Annotations',
                              'concepts%s%d.txt' % (collection, topk))

    if checkToSkip(vocab_file, overwrite):
        return 0

    tag2count = {}

    for line in open(label_file):
        elems = line.strip().split()
        del elems[0]  # the first element is the image id
        for x in elems:
            tag2count[x] = tag2count.get(x, 0) + 1

    taglist = sorted(tag2count.items(),
                     key=lambda v: (v[1], v[0]),
                     reverse=True)
    assert len(taglist) >= topk
    taglist = taglist[:topk]
    makedirsforfile(vocab_file)
    with open(vocab_file, 'w') as fw:
        fw.write('\n'.join([x[0] for x in taglist]))
    logger.info('A vocabulary of %d labels saved to %s', len(taglist), vocab_file)
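The hand-rolled counting dict above can also be written with collections.Counter; note that most_common(topk) breaks ties arbitrarily rather than by tag name as the sorted() call does. A sketch:

from collections import Counter

tag2count = Counter()
for line in open(label_file):
    tag2count.update(line.strip().split()[1:])  # drop the leading image id
taglist = tag2count.most_common(topk)           # [(tag, count), ...], count-descending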
Example 3
def process(options, collection, src_file, res_file):
    rootpath = options.rootpath
    overwrite = options.overwrite

    imset_file = os.path.join(rootpath, collection, 'ImageSets',
                              '%s.txt' % collection)
    imset = set(map(str.strip, open(imset_file).readlines()))

    if checkToSkip(res_file, overwrite):
        return 0

    cached = set()
    obtained = 0

    makedirsforfile(res_file)
    with open(res_file, 'w') as fw:
        for line in open(src_file):
            imgid = line.split()[0]
            if imgid in cached:
                continue
            cached.add(imgid)
            if imgid in imset:
                fw.write(line)
                obtained += 1
    logger.info('%d wanted, %d obtained', len(imset), obtained)
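Each process function in this collection expects an options object exposing attributes; for ad-hoc testing one can fake it with argparse.Namespace. The paths below are hypothetical, for illustration only:

from argparse import Namespace

options = Namespace(rootpath='/data', overwrite=0)
# hypothetical source and result files
process(options, 'mycollection',
        '/data/mycollection/all.caption.txt',
        '/data/mycollection/filtered.caption.txt')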
Example 4
def process(options, collection):
    rootpath = options.rootpath
    lang = options.lang
    overwrite = options.overwrite

    res_file = os.path.join(rootpath, collection, 'TextData', '%s.imglabel.txt' % collection)
    if checkToSkip(res_file, overwrite):
        return 0
   
    pos_method = 'stanford' if 'en' == lang else 'boson'
    pos_sent_file = os.path.join(rootpath, collection, 'TextData',
                                 '%s.%spos.txt' % (collection, pos_method))

    dict_img = {}

    for line in open(pos_sent_file):
        elems = line.strip().split()
        imgid = elems[0].split('#')[0]

        if imgid not in dict_img:
            dict_img[imgid] = {}

        for x in elems[1:]:
            if len(x.split(':')) != 2:
                continue  # skip tokens that are not in word:pos format
            word, pos = x.split(':')
            if pos[0] in POS_SET[lang] and word not in STOPWORDS:
                if 'en' == lang:
                    if pos[0] == 'N':
                        word = lemmatizer.lemmatize(word)
                    elif pos[0] == 'V':
                        word = lemmatizer.lemmatize(word, pos='v')
                    else:
                        continue
                else:
                    # keep short non-noun words out of the label set
                    if pos[0] != 'n' and len(word) <= 3:
                        continue
                dict_img[imgid][word] = dict_img[imgid].get(word, 0) + 1

    # a word becomes a label if it occurs at least twice in the image's captions
    label_stat = []
    zero = 0
    with open(res_file, 'w') as fw:
        for imgid, word2count in dict_img.items():
            labels = [word for word, c in word2count.items() if c >= 2]
            fw.write(' '.join([imgid] + labels) + '\n')
            label_stat.append(len(labels))
            if len(labels) == 0:
                logger.info('image %s has no label', imgid)
                zero += 1
    logger.info('number of images with zero label: %d', zero)
    logger.info('number of labels per image: min (%d), max (%d), mean (%.1f)',
                min(label_stat), max(label_stat),
                sum(label_stat) / float(len(label_stat)))
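Example 4 assumes a module-level lemmatizer plus POS_SET and STOPWORDS constants; a plausible setup, assuming NLTK's WordNetLemmatizer (the original constants are not shown, so these values are illustrative):

# requires: nltk.download('wordnet'); nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
POS_SET = {'en': set('NV')}   # illustrative: keep nouns and verbs for English

print(lemmatizer.lemmatize('dogs'))              # -> 'dog' (noun by default)
print(lemmatizer.lemmatize('running', pos='v'))  # -> 'run'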
Example 5
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(rootpath, collection, "id.imagepath.txt")

    if checkToSkip(resultfile, overwrite):
        return 0

    imageFolders = [os.path.join(rootpath, collection, 'ImageData')]
    filenames = []
    imageSet = set()

    for imageFolder in imageFolders:
        for r, d, f in os.walk(imageFolder):
            for filename in f:
                name, ext = os.path.splitext(filename)
                if ext not in FILTER_SET:
                    continue

                if name in imageSet:
                    print("id %s exists, ignore %s" %
                          (name, os.path.join(r, filename)))
                    continue

                imageSet.add(name)
                filenames.append("%s %s" % (name, os.path.join(r, filename)))
    try:
        os.makedirs(os.path.split(resultfile)[0])
    except OSError:
        pass  # directory already exists

    with open(resultfile, "w") as fout:
        fout.write("\n".join(filenames) + "\n")

    idfile = os.path.join(rootpath, collection, "ImageSets",
                          '%s.txt' % collection)
    try:
        os.makedirs(os.path.split(idfile)[0])
    except OSError:
        pass
    with open(idfile, "w") as fout:
        fout.write("\n".join(sorted(imageSet)) + "\n")
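FILTER_SET is not defined in the snippet; judging by the os.path.splitext check, it is a set of accepted image file extensions. An assumed definition, not the original values:

FILTER_SET = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}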
Example 6
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite
    newsize = options.newsize

    imagepathFilename = os.path.join(rootpath, collection, "id.imagepath.txt")
    if not os.path.exists(imagepathFilename):
        print('%s does not exist' % imagepathFilename)
        return 0

    done = 0

    for line in open(imagepathFilename):
        if not line.strip():
            continue
        name, fullpath = line.strip().split()
        resultfile = fullpath.replace('ImageData',
                                      'ImageData%dx%d' % (newsize, newsize))
        if checkToSkip(resultfile, overwrite):
            continue
        try:
            os.makedirs(os.path.split(resultfile)[0])
        except OSError:
            pass

        # The '>' flag is to only apply the resize to images 'greater than' the size given
        cmd = "convert %s -resize '%d>x%d>' %s" % (fullpath, newsize, newsize,
                                                   resultfile)
        os.system(cmd)
        done += 1
        if done % 100 == 0:
            printMessage("info", "image_rescale", "%d done" % done)
    # done
    printMessage("info", "image_rescale", "%d done" % done)
    return done
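The os.system call builds a shell command from raw paths; a slightly safer variant passes an argument list to subprocess and uses ImageMagick's conventional 'WxH>' geometry ('>' restricts the resize to images larger than the box). A sketch, not the original code:

import subprocess

def rescale_image(src, dst, newsize):
    # resize only images larger than newsize x newsize, keeping aspect ratio
    subprocess.call(['convert', src, '-resize',
                     '%dx%d>' % (newsize, newsize), dst])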
Example 7
def process(options, collection):
    rootpath = options.rootpath
    overwrite = options.overwrite

    imagepathFilename = os.path.join(rootpath, collection, "id.imagepath.txt")
    if not os.path.exists(imagepathFilename):
        print ('%s does not exist' % imagepathFilename)
        return 0

    resultFilename = os.path.join(rootpath, collection, "FeatureData", "color64", "id.feature.txt")
    if checkToSkip(resultFilename, overwrite):
        return 0

    try:
        os.makedirs(os.path.split(resultFilename)[0])
    except OSError:
        pass

    done = 0
    fout = open(resultFilename, "w")
    for line in open(imagepathFilename):
        if not line.strip():
            continue
        name, fullpath = line.strip().split()
        feature = extractColor64(fullpath)
        
        if feature:
            output = name + " " + " ".join([str(x) for x in feature])
            fout.write(output + "\n")
        done += 1
        if done % 100 == 0:
            printMessage("info", "docolor64", "%d done" % done)
    # done
    fout.close()
    printMessage("info", "docolor64", "%d done" % done)
    return done
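extractColor64 comes from the same toolkit; assuming the feature is a 64-bin RGB color histogram (4x4x4 quantization), a stand-in could look like this, using Pillow. This is an assumption, not the project's implementation:

from PIL import Image

def extractColor64(impath):
    # 4x4x4 RGB histogram, L1-normalized; returns None on unreadable images
    try:
        im = Image.open(impath).convert('RGB')
    except IOError:
        return None
    hist = [0] * 64
    for r, g, b in im.getdata():
        hist[(r // 64) * 16 + (g // 64) * 4 + (b // 64)] += 1
    total = float(sum(hist))
    return [x / total for x in hist]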
Example 8
def main():
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    testCollection = opt.testCollection

    resume_file = os.path.join(opt.model_path)
    if not os.path.exists(resume_file):
        logger.info('%s does not exist.', resume_file)
        sys.exit(0)

    # Load checkpoint
    logger.info('loading model...')
    checkpoint = torch.load(resume_file)
    epoch = checkpoint['epoch']
    best_perf = checkpoint['best_perf']
    config = checkpoint['config']
    if hasattr(config, 't2v_w2v'):
        w2v_feature_file = os.path.join(rootpath, 'word2vec', 'flickr',
                                        'vec500flickr30m', 'feature.bin')
        config.t2v_w2v.w2v.binary_file = w2v_feature_file

    # Construct the model
    model = get_model('w2vvpp')(config)
    print(model.vis_net)
    print(model.txt_net)

    model.load_state_dict(checkpoint['model'])
    print("=> loaded checkpoint '{}' (epoch {}, best_perf {})".format(
        resume_file, epoch, best_perf))

    vis_feat_file = BigFile(
        os.path.join(rootpath, testCollection, 'FeatureData', config.vid_feat))
    # materialize into a list: the ids are indexed and passed to np.array below,
    # which a lazy map object (Python 3) would break
    vis_ids = list(map(
        str.strip,
        open(
            os.path.join(rootpath, testCollection, 'VideoSets',
                         testCollection + '.txt'))))
    vis_loader = data.vis_provider({
        'vis_feat': vis_feat_file,
        'vis_ids': vis_ids,
        'pin_memory': True,
        'batch_size': opt.batch_size,
        'num_workers': opt.num_workers
    })

    vis_embs = None

    for query_set in opt.query_sets.split(','):
        output_dir = os.path.join(rootpath, testCollection, 'SimilarityIndex',
                                  query_set, opt.sim_name)
        pred_result_file = os.path.join(output_dir, 'id.sent.score.txt')

        if util.checkToSkip(pred_result_file, opt.overwrite):
            continue
        util.makedirs(output_dir)

        if vis_embs is None:
            logger.info('Encoding videos')
            vis_embs, vis_ids = evaluation.encode_vis(model, vis_loader)

        capfile = os.path.join(rootpath, testCollection, 'TextData', query_set)
        # load text data
        txt_loader = data.txt_provider({
            'capfile': capfile,
            'pin_memory': True,
            'batch_size': opt.batch_size,
            'num_workers': opt.num_workers
        })

        logger.info('Encoding %s captions', query_set)
        txt_embs, txt_ids = evaluation.encode_txt(model, txt_loader)

        t2i_matrix = evaluation.compute_sim(txt_embs,
                                            vis_embs,
                                            measure=config.measure)
        inds = np.argsort(t2i_matrix, axis=1)

        if testCollection == 'msrvtt10ktest':
            label_matrix = np.zeros(inds.shape)
            for index in range(inds.shape[0]):
                ind = inds[index][::-1]
                label_matrix[index][np.where(
                    np.array(vis_ids)[ind] == txt_ids[index].split('#')[0])
                                    [0]] = 1

            (r1, r5, r10, medr, meanr, mir,
             mAP) = evaluation.eval(label_matrix)
            sum_recall = r1 + r5 + r10
            tempStr = " * Text to video:\n"
            tempStr += " * r_1_5_10: {}\n".format(
                [round(r1, 3), round(r5, 3),
                 round(r10, 3)])
            tempStr += " * medr, meanr, mir: {}\n".format(
                [round(medr, 3),
                 round(meanr, 3),
                 round(mir, 3)])
            tempStr += " * mAP: {}\n".format(round(mAP, 3))
            tempStr += " * " + '-' * 10
            print(tempStr)
            with open(os.path.join(output_dir, 'perf.txt'), 'w') as fperf:
                fperf.write(tempStr)

        start = time.time()
        with open(pred_result_file, 'w') as fout:
            for index in range(inds.shape[0]):
                ind = inds[index][::-1]

                fout.write(txt_ids[index] + ' ' + ' '.join(
                    [vis_ids[i] + ' %s' % t2i_matrix[index][i]
                     for i in ind]) + '\n')
        print('writing result into file time: %.3f seconds\n' %
              (time.time() - start))
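evaluation.compute_sim belongs to the w2vvpp codebase; for measure='cosine' it plausibly reduces to a dot product of L2-normalized embeddings. A numpy sketch under that assumption:

import numpy as np

def cosine_sim(txt_embs, vis_embs):
    # rows are embeddings; result is a (num_queries, num_videos) matrix
    txt = txt_embs / np.linalg.norm(txt_embs, axis=1, keepdims=True)
    vis = vis_embs / np.linalg.norm(vis_embs, axis=1, keepdims=True)
    return txt.dot(vis.T)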
Example 9
def main():
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    trainCollection = opt.trainCollection
    valCollection = opt.valCollection
    val_set = opt.val_set

    config = load_config('configs.%s' % opt.config_name)

    model_path = os.path.join(rootpath, trainCollection, 'w2vvpp_train', valCollection, val_set, opt.config_name, opt.model_prefix)
    if util.checkToSkip(os.path.join(model_path, 'model_best.pth.tar'), opt.overwrite):
        sys.exit(0)
    util.makedirs(model_path)

    global writer
    writer = SummaryWriter(log_dir=model_path, flush_secs=5)

    collections = {'train': trainCollection, 'val': valCollection}

    capfiles = {'train': '%s.caption.txt', 'val': os.path.join(val_set, '%s.caption.txt')}
    cap_file_paths = {x: os.path.join(rootpath, collections[x], 'TextData', capfiles[x]%collections[x]) for x in collections}

    vis_feat_files = {x: BigFile(os.path.join(rootpath, collections[x], 'FeatureData', config.vid_feat)) for x in collections}
    # list(), not a lazy map, so the input dimension can be patched in place
    config.vis_fc_layers = list(map(int, config.vis_fc_layers.split('-')))
    config.vis_fc_layers[0] = vis_feat_files['train'].ndims

    bow_encoding, w2v_encoding, rnn_encoding = config.text_encoding.split('@')
    rnn_encoding, config.pooling = rnn_encoding.split('_', 1)

    bow_vocab_file = os.path.join(rootpath, trainCollection, 'TextData', 'vocab', '%s_%d.pkl'%(bow_encoding, config.threshold))
    config.t2v_bow = get_txt2vec(bow_encoding)(bow_vocab_file, norm=config.bow_norm)

    w2v_data_path = os.path.join(rootpath, 'word2vec', 'flickr', 'vec500flickr30m')
    config.t2v_w2v = get_txt2vec(w2v_encoding)(w2v_data_path)

    rnn_vocab_file = os.path.join(rootpath, trainCollection, 'TextData', 'vocab', '%s_%d.pkl'%(rnn_encoding, config.threshold))
    config.t2v_idx = get_txt2vec('idxvec')(rnn_vocab_file)
    if config.we_dim == 500:
        config.we = get_we(config.t2v_idx.vocab, w2v_data_path)

    config.txt_fc_layers = list(map(int, config.txt_fc_layers.split('-')))
    if config.pooling == 'mean_last':
        config.txt_fc_layers[0] = config.rnn_size*2 + config.t2v_w2v.ndims + config.t2v_bow.ndims
    else:
        config.txt_fc_layers[0] = config.rnn_size + config.t2v_w2v.ndims + config.t2v_bow.ndims

    # Construct the model
    model = get_model('w2vvpp')(config)
    print(model.vis_net)
    print(model.txt_net)

    data_loaders = {x: data.pair_provider({'vis_feat':vis_feat_files[x], 'capfile':cap_file_paths[x], 'pin_memory': True,
                                           'batch_size':opt.batch_size, 'num_workers':opt.workers,'shuffle':(x=='train')})
                    for x in collections}

    # Train the Model
    best_perf = 0
    no_impr_counter = 0
    val_perf_hist_fout = open(os.path.join(model_path, 'val_perf_hist.txt'), 'w')
    for epoch in range(opt.num_epochs):

        print('Epoch[{0} / {1}] LR: {2}'.format(epoch, opt.num_epochs, model.learning_rate))
        print('-'*10)
        writer.add_scalar('train/learning_rate', model.learning_rate[0], epoch)
        # train for one epoch
        train(model, data_loaders['train'], epoch)

        # evaluate on validation set
        cur_perf = validate(model, data_loaders['val'], epoch, measure=config.measure, metric=opt.metric)
        model.lr_step(val_value=cur_perf)

        print(' * Current perf: {}\n * Best perf: {}\n'.format(cur_perf, best_perf))
        val_perf_hist_fout.write('epoch_%d:\nText2Video(%s): %f\n' % (epoch, opt.metric, cur_perf))
        val_perf_hist_fout.flush()

        # remember best performance and save checkpoint
        is_best = cur_perf > best_perf
        best_perf = max(cur_perf, best_perf)
        save_checkpoint({'epoch': epoch+1, 'model': model.state_dict(), 'best_perf': best_perf,
                         'config': config, 'opt': opt}, is_best, logdir=model_path, only_best=True,
                         filename='checkpoint_epoch_%s.pth.tar'%epoch)
        if is_best:
            no_impr_counter = 0
        else:
            no_impr_counter += 1
            if no_impr_counter > 10:
                print('Early stopping happened.\n')
                break

    val_perf_hist_fout.close()
    message = 'best performance on validation:\n Text to video({}): {}'.format(opt.metric, best_perf)
    print(message)
    with open(os.path.join(model_path, 'val_perf.txt'), 'w') as fout:
        fout.write(message)
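save_checkpoint is not shown; its signature suggests the common PyTorch pattern of serializing a state dict and copying the best snapshot aside. A sketch under that assumption, not the project's exact code:

import os
import shutil
import torch

def save_checkpoint(state, is_best, logdir='.', only_best=False,
                    filename='checkpoint.pth.tar'):
    path = os.path.join(logdir, filename)
    torch.save(state, path)  # state bundles epoch, model, best_perf, config, opt
    if is_best:
        shutil.copyfile(path, os.path.join(logdir, 'model_best.pth.tar'))
    if only_best:
        os.remove(path)  # keep only model_best.pth.tar on disk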
Example 10
def process(options, trainCollection, feature, testCollection):
    rootpath = options.rootpath
    tpp = options.tpp
    distance = options.distance
    k = options.k
    r = options.r
    donefile = options.donefile
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job
    blocksize = options.blocksize

    testset = options.testset
    if testset is None:
        testset = testCollection

    test_tag_file = os.path.join(rootpath, testCollection, "TextData",
                                 "id.userid.%stags.txt" % tpp)
    try:
        testStore = RecordStore(test_tag_file)
        resultName = "tagrel"
    except Exception:
        testStore = None
        printStatus(
            INFO,
            "Failed to load %s, will do image auto-tagging" % test_tag_file)
        resultName = "autotagging"

    nnName = distance + "knn"
    resultfile = os.path.join(rootpath, testCollection, resultName, testset,
                              trainCollection,
                              "%s,%s,%d,%s" % (feature, nnName, k, tpp),
                              "id.tagvotes.txt")

    if numjobs > 1:
        resultfile += ".%d.%d" % (numjobs, job)

    if checkToSkip(resultfile, overwrite):
        return 0

    if donefile:
        doneset = set([x.split()[0] for x in open(donefile).readlines()[:-1]])
    else:
        doneset = set()
    printStatus(
        INFO, "%d images have been done already, and they will be ignored" %
        len(doneset))

    test_imset = readImageSet(testCollection, testset, rootpath)
    test_imset = [x for x in test_imset if x not in doneset]
    test_imset = [
        test_imset[i] for i in range(len(test_imset))
        if (i % numjobs + 1) == job
    ]
    test_feat_dir = os.path.join(rootpath, testCollection, 'FeatureData',
                                 feature)
    test_feat_file = BigFile(test_feat_dir)

    learner = TagrelLearner(trainCollection,
                            feature,
                            distance,
                            tpp=tpp,
                            rootpath=rootpath)
    learner.set_nr_neighbors(k)
    learner.set_nr_autotags(r)

    printStatus(
        INFO, "working on %d-%d, %d test images -> %s" %
        (numjobs, job, len(test_imset), resultfile))

    done = 0
    makedirsforfile(resultfile)

    fw = open(resultfile, "w")

    read_time = 0
    test_time = 0
    start = 0

    while start < len(test_imset):
        end = min(len(test_imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end - 1))

        s_time = time.time()
        renamed, vectors = test_feat_file.read(test_imset[start:end])
        read_time += time.time() - s_time
        nr_images = len(renamed)
        #assert(len(test_imset[start:end]) == nr_images) # some images may have no visual features available

        s_time = time.time()
        output = [None] * nr_images
        for i in range(nr_images):
            if testStore:
                (qry_userid, qry_tags) = testStore.lookup(renamed[i])
            else:
                qry_userid = None
                qry_tags = None

            tagvotes = learner.estimate(vectors[i], qry_tags, qry_userid)
            output[i] = '%s %s\n' % (renamed[i], " ".join([
                "%s %s" % (tag, niceNumber(vote, 8))
                for (tag, vote) in tagvotes
            ]))
        test_time += time.time() - s_time
        start = end
        fw.write(''.join(output))
        fw.flush()

        done += len(output)

    # done
    printStatus(
        INFO, "%d done. read time %g seconds, test_time %g seconds" %
        (done, read_time, test_time))
    fw.close()
    return 1
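niceNumber is another toolkit helper; from its use here it formats a tag vote as text with a given number of significant digits. A plausible stand-in:

def niceNumber(v, precision=8):
    # format v with at most `precision` significant digits
    return '%.*g' % (precision, v)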