Example #1
 def precompute_annotator(self, concept):
     NegativeEngine.precompute_annotator(self, concept)
     for subconcept in concept.split('-'):
         expandedTagSet = set([subconcept] + wn_expand(subconcept))
         self.annotator = self.annotator.union(expandedTagSet)
     INFO = 'dataengine.%s' % self.__class__.__name__
     printStatus(INFO, 'precomputing the virtual annotator for %s: %d tags' % (concept, len(self.annotator)))
Example #2
def process(options, source_dir, feat_dim, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    featurefile = BigFile(source_dir, feat_dim)
    
    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
  
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        renamed, vectors = featurefile.read(imset[start:end])
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))
    resultfile = os.path.join(result_dir, 'id.txt')
    fw = open(resultfile, 'w')
    fw.write(' '.join(done))
    fw.close()

    print '%d requested, %d obtained' % (len(imset), len(done))
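A note on reading the result back: feature.bin holds the raw float32 vectors back to back, in the order given by id.txt. A minimal read-back sketch, assuming the feature dimension is known to the caller (load_feature_bin is a hypothetical helper, not part of the toolkit):

import os
import numpy as np

def load_feature_bin(result_dir, feat_dim):
    # id.txt stores all image ids space-separated on a single line
    with open(os.path.join(result_dir, 'id.txt')) as f:
        ids = f.read().split()
    # feature.bin stores one float32 vector per id, in the same order
    mat = np.fromfile(os.path.join(result_dir, 'feature.bin'), dtype=np.float32)
    return ids, mat.reshape(len(ids), feat_dim)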
Example #3
def process(options, collection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite
    neg_filter = options.neg_filter
    
    concepts = readConcepts(collection, annotationName, rootpath)
    newAnnotationName = annotationName[:-4] + 'social.txt'
    ne = STRING_TO_NEGATIVE_ENGINE[neg_filter](collection, rootpath)

    newConcepts = []
    for concept in concepts:
        resultfile = os.path.join(rootpath, collection, 'Annotations', 'Image', newAnnotationName, '%s.txt'%concept)
        if checkToSkip(resultfile, overwrite):
            newConcepts.append(concept)
            continue

        try:
            pos_set = readLabeledImageSet(collection, concept, tpp='lemm', rootpath=rootpath)
        except:
            pos_set = None 
        if not pos_set:
            printStatus(INFO, '*** %s has no labeled examples, will be ignored ***' % concept)
            continue
        neg_set = ne.sample(concept, int(1e8))
        assert(len(set(pos_set).intersection(set(neg_set))) == 0)
        newlabels = [1] * len(pos_set) + [-1] * len(neg_set)
        newnames = pos_set + neg_set
        printStatus(INFO, "anno(%s) %d pos %d neg -> %s" % (concept,len(pos_set),len(neg_set),resultfile))
        writeAnnotations(newnames, newlabels, resultfile)
        newConcepts.append(concept)

    writeConceptsTo(newConcepts, collection, newAnnotationName, rootpath)
Example #4
    def __init__(self,
                 datafile,
                 k=5,
                 ndims=0,
                 language='en',
                 L1_normalize=0,
                 L2_normalize=0):
        Text2Vec.__init__(self, datafile, k, ndims, language, L1_normalize,
                          L2_normalize)
        self.words_simi = {}
        word_vob = map(str.strip, open(datafile).readlines())
        self.word2index = dict(zip(word_vob, range(len(word_vob))))
        self.soft_file = os.path.join(
            datafile.rsplit('/', 1)[0], 'word_vocab_soft_5.txt')
        for line in open(self.soft_file).readlines():
            word, s_sim = line.strip().split(' ', 1)
            w_s = s_sim.split(' ')
            assert len(w_s) % 2 == 0
            self.words_simi[word] = [(w_s[i], float(w_s[i + 1]))
                                     for i in range(0, len(w_s), 2)]

        if ndims != 0:
            assert len(word_vob) == self.ndims, \
                "feat dimension does not match: %d != %d" % (len(word_vob), self.ndims)
        else:
            self.ndims = len(word_vob)
        printStatus(INFO + '.' + self.__class__.__name__,
                    "%d words" % self.ndims)
Example #5
    def precompute(self, concept):
        self.precompute_annotator(concept)
        self.candidateset = []

        for i, line in enumerate(self.data):
            elems = str.split(line)
            imageid = elems[0]
            if imageid in self.tabooImset:
                continue

            negative = 1

            for tag in elems[1:]:
                if tag in self.annotator:
                    negative = 0
                    break

            if negative:
                self.candidateset.append(imageid)

        self.candidateset = [x for x in self.candidateset if x in self.imset]
        self.target = concept
        INFO = 'dataengine.%s' % self.__class__.__name__
        printStatus(INFO,
                    "%d candidates for %s" % (self.getCount(concept), concept))
Example #6
 def __init__(self, model_path, weight_path):
     self.model = model_from_json(open(model_path).read())
     self.model.load_weights(weight_path)
     # any loss and optimizer are ok
     self.model.compile(loss='mse', optimizer='sgd')
     printStatus(INFO + '.' + self.__class__.__name__,
                 'loaded a trained Word2VisualVec model successfully')
Example #7
    def __init__(self,
                 trainCollection,
                 tpp="lemm",
                 feature="color64+dsift",
                 k=1000,
                 rootpath=ROOT_PATH):
        self.trainCollection = trainCollection
        self.k = k
        self.name = "%s(%s,%s,%s,%d)" % (self.__class__.__name__,
                                         self.trainCollection, tpp, feature, k)

        vobfile = os.path.join(rootpath, trainCollection, "TextData",
                               "wn.%s.txt" % trainCollection)
        self.vob = set(map(str.strip, open(vobfile).readlines()))
        printStatus(
            INFO, 'the vocabulary of %s contains %d tags' %
            (trainCollection, len(self.vob)))

        self.gamma = (1.0 / MEDIAN_DISTANCE[feature])**2
        self.feat_dir = os.path.join(rootpath, trainCollection, 'FeatureIndex',
                                     feature)
        self.dim = FEATURE_TO_DIM[feature]
        self.fcs = FlickrContextSim(trainCollection, rootpath)

        printStatus(INFO, self.name + ' okay')
Example #8
def classify_large_data(model, imset, feat_file, prob_output=False, blocksize=DEFAULT_BLOCK_SIZE):
    start = 0
    results = []

    read_time = 0.0
    test_time = 0.0

    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'classifying images from %d to %d' % (start, end-1))

        s_time = time.time()
        renamed,vectors = feat_file.read(imset[start:end])
        read_time += time.time() - s_time

        s_time = time.time()
        if prob_output:
            scores = [model.predict_probability(vectors[i]) for i in range(len(renamed))]
        else:
            scores = [model.predict(vectors[i]) for i in range(len(renamed))]
        test_time += time.time() - s_time

        results += zip(renamed, scores)
        start = end

    #printStatus('%.sclassifyLargeData'%INFO, 'read time %g seconds, test time %g seconds' % (read_time, test_time))
    results.sort(key=lambda v: (v[1], v[0]), reverse=True)
    return results
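A hedged usage sketch of classify_large_data, with illustrative stand-ins for the model and feature-reader interfaces it expects (ToyModel and ToyFeatFile are not the project's real classes):

class ToyModel:
    def predict(self, vec):
        return sum(vec)                    # any scalar score works for the sketch
    def predict_probability(self, vec):
        return sum(vec)

class ToyFeatFile:
    def __init__(self, id2vec):
        self.id2vec = id2vec               # dict: image id -> feature vector
    def read(self, ids):
        found = [x for x in ids if x in self.id2vec]
        return found, [self.id2vec[x] for x in found]

feats = ToyFeatFile({'im1': [0.1, 0.2], 'im2': [0.5, 0.4]})
ranked = classify_large_data(ToyModel(), ['im1', 'im2'], feats, blocksize=1)
# ranked is a list of (image_id, score) pairs, highest score first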
Example #9
    def __init__(self,
                 collection,
                 annotationName,
                 feature,
                 distance,
                 tpp=DEFAULT_TPP,
                 rootpath=ROOT_PATH,
                 k=DEFAULT_K):
        self.rootpath = rootpath
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(
            zip(self.concepts, range(self.nr_of_concepts)))

        self.imset = readImageSet(collection, collection, rootpath)
        self.nr_of_images = len(self.imset)
        self.knndir = os.path.join(collection,
                                   '%s,%sknn,1500' % (feature, distance))

        self.k = k
        self.noise = 0

        self._load_tag_data(collection, tpp, rootpath)

        printStatus(
            INFO,
            "%s, %d images, %d unique tags, %s %d neighbours for voting" %
            (self.__class__.__name__, self.nr_of_images, len(
                self.tag2freq), distance, self.k))
Example #10
    def __init__(self,
                 collection,
                 annotationName,
                 feature,
                 distance,
                 tpp=DEFAULT_TPP,
                 rootpath=ROOT_PATH):
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(
            zip(self.concepts, range(self.nr_of_concepts)))

        feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
        id_file = os.path.join(feat_dir, 'id.txt')
        shape_file = os.path.join(feat_dir, 'shape.txt')
        self.nr_of_images, feat_dim = map(int,
                                          open(shape_file).readline().split())

        self.searcher = simpleknn.load_model(
            os.path.join(feat_dir, 'feature.bin'), feat_dim, self.nr_of_images,
            id_file)
        self.searcher.set_distance(distance)
        self.k = DEFAULT_K

        self._load_tag_data(collection, tpp, rootpath)

        printStatus(
            INFO,
            "%s, %d images, %d unique tags, %s %d neighbours for voting" %
            (self.__class__.__name__, self.nr_of_images, len(
                self.tag2freq), distance, self.k))
Example #11
    def __init__(self,
                 collection,
                 annotationName,
                 feature,
                 distance,
                 tpp=DEFAULT_TPP,
                 rootpath=ROOT_PATH):
        self.rootpath = rootpath
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(
            zip(self.concepts, range(self.nr_of_concepts)))

        featuredir = os.path.join(rootpath, collection, 'FeatureData', feature)
        id_file = os.path.join(featuredir, "id.txt")
        shape_file = os.path.join(featuredir, 'shape.txt')
        self.nr_of_images, feat_dim = map(int,
                                          open(shape_file).readline().split())

        self.searcher = load_model(featuredir,
                                   self.nr_of_images,
                                   feat_dim,
                                   nr_of_segments=512,
                                   segmentk=256,
                                   coarsek=4096)
        self.k = DEFAULT_K
        self._load_tag_data(collection, tpp, rootpath)
        printStatus(
            INFO,
            "%s, %d images, %d unique tags, %s %d neighbours for voting" %
            (self.__class__.__name__, self.nr_of_images, len(
                self.tag2freq), distance, self.k))
Example #12
def cal_perf_t2i(prediction_file, verbose=1):
    scorers = [RecallScorer(k) for k in [1, 5, 10]]

    res = [0] * len(scorers)
    nr_of_sents = 0
    nr_of_images = 0

    first_matched_idexs = []
    for line in open(prediction_file):
        elems = line.strip().split()
        sentid = elems[0]
        del elems[0]

        assert (len(elems) % 2 == 0)
        imageids = elems[::2]
        nr_of_images = len(imageids)
        hit_list = []
        flag = 1
        for i in range(len(imageids)):
            if sentid.find(imageids[i]) == 0:
                hit_list.append(1)
                if flag == 1:
                    first_matched_idexs.append(i + 1)
                    flag = 0
            else:
                hit_list.append(0)
            if len(hit_list) > 20 and flag == 0:
                break

        hit_list = hit_list[:20]  # consider at most the first 20 predictions

        perf = [scorer.score(hit_list) for scorer in scorers]
        res = [res[i] + perf[i] for i in range(len(scorers))]
        nr_of_sents += 1

    printStatus(INFO, 'nr of sentences: %d' % nr_of_sents)
    printStatus(INFO, 'nr of images: %d' % nr_of_images)
    res = [x / nr_of_images for x in res]

    recall_name = ' '.join([x.name() for x in scorers])
    recall_score = ' '.join(['%.3f' % x for x in res])

    assert len(first_matched_idexs) == nr_of_images
    med_r = sorted(first_matched_idexs)[nr_of_images / 2 - 1]
    mean_r = np.mean(first_matched_idexs)

    mean_invert_r = []
    for i in first_matched_idexs:
        mean_invert_r.append(1.0 / i)
    mean_invert_r = np.mean(mean_invert_r)

    if verbose == 1:
        print recall_name
        print recall_score
        print 'Med r: ', med_r
        print 'Mean r: ', mean_r
        print 'mean inverted r: ', round(mean_invert_r, 3)

    return (recall_name, recall_score, med_r, mean_r, mean_invert_r)
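For reference, each line of the prediction file parsed by cal_perf_t2i carries a sentence id followed by alternating image ids and scores, and the sentence id is assumed to start with the id of its matching image (that is what sentid.find(imageids[i]) == 0 tests). An illustrative line and its parsing (ids and scores are hypothetical):

# <sentid> <imageid_1> <score_1> <imageid_2> <score_2> ...
line = '1000#0 1000 0.91 2000 0.55 3000 0.12'
elems = line.split()
sentid, imageids, scores = elems[0], elems[1::2], elems[2::2]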
Example #13
 def sampling(predictions, strategy, n):
     printStatus(INFO, '%s sampling: %d out of %d instances' % (strategy, n, len(predictions)))
     if 'toprand' == strategy:
         temp = [(x[0], x[1]*random.uniform(0.9,1)) for x in predictions]   
         temp.sort(key=lambda v:v[1], reverse=True)
         return [x[0] for x in temp[:n]]
     else:
         return [x[0] for x in predictions[:n]]
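A small usage sketch, assuming sampling is callable as a plain function and predictions is a score-sorted list of (id, score) pairs (ids and scores here are illustrative):

predictions = [('im1', 0.9), ('im2', 0.7), ('im3', 0.4), ('im4', 0.1)]
top2 = sampling(predictions, 'top', 2)       # any strategy other than 'toprand' keeps the top n as-is
rand2 = sampling(predictions, 'toprand', 2)  # jitters scores by U(0.9, 1) before re-ranking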
Example #14
def process(options, collection, annotationName, pos_num):
    assert(annotationName.endswith('.txt'))
    rootpath = options.rootpath
    pos_bag_num = options.pos_bag_num
    neg_bag_num = options.neg_bag_num
    neg_pos_ratio = options.neg_pos_ratio

    annotationNameStr = annotationName[:-4] + ('.random%d' % pos_num) + '.%d' + ('.npr%d' % neg_pos_ratio) + '.%d.txt'

    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    
    skip = 0
    newAnnotationNames = [None] * (pos_bag_num * neg_bag_num)

    for idxp in range(pos_bag_num):
        for idxn in range(neg_bag_num):
            anno_idx = idxp * neg_bag_num + idxn
            newAnnotationNames[anno_idx] = annotationNameStr % (idxp, idxn)
            resultfile = os.path.join(rootpath,collection,'Annotations',newAnnotationNames[anno_idx])
            if checkToSkip(resultfile, options.overwrite):
                skip += 1
                continue
            writeConcepts(concepts,resultfile)

    first,second,last = annotationNameStr.split('%d')
    scriptfile = os.path.join(rootpath,collection,'annotationfiles',first + '0-%d'%(pos_bag_num-1) + second + '0-%d'%(neg_bag_num-1) + last)

    makedirsforfile(scriptfile)
    fout = open(scriptfile,'w')
    fout.write('\n'.join(newAnnotationNames) + '\n')
    fout.close()

    if len(newAnnotationNames) == skip:
        return 0
        
    for concept in concepts:
        names,labels = readAnnotationsFrom(collection, annotationName, concept, skip_0=True, rootpath=rootpath)
        positivePool = [x[0] for x in zip(names,labels) if x[1]>0]
        negativePool = [x[0] for x in zip(names,labels) if x[1]<0]
        
        for idxp in range(pos_bag_num):
            if len(positivePool) > pos_num:
                positiveBag = random.sample(positivePool, pos_num)
            else:
                positiveBag = positivePool
            for idxn in range(neg_bag_num):
                anno_idx = idxp * neg_bag_num + idxn
                newAnnotationName = newAnnotationNames[anno_idx]
                resultfile = os.path.join(rootpath,collection,'Annotations','Image',newAnnotationName,'%s.txt'%concept)
                if checkToSkip(resultfile, options.overwrite):
                    continue
                real_neg_num = max(len(positiveBag) * neg_pos_ratio, 1000)
                real_neg_num = min(len(negativePool), real_neg_num)
                negativeBag = random.sample(negativePool, real_neg_num)

                assert(len(set(positiveBag).intersection(set(negativeBag))) == 0)
                printStatus(INFO, "anno(%s,%d) %d pos %d neg -> %s" % (concept,anno_idx,len(positiveBag),len(negativeBag),resultfile))
                writeAnnotations(positiveBag + negativeBag, [1]*len(positiveBag) + [-1]*len(negativeBag), resultfile)
Example #15
 def __init__(self,
              corpus,
              modelName,
              wnid2words_file='data/wnid2words.pkl',
              rootpath=ROOT_PATH):
     printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
     word2vec_dir = os.path.join(rootpath, corpus, 'word2vec', modelName)
     self.wnid2words = pickle.load(open(wnid2words_file, 'rb'))
     self.word2vec = BigFile(word2vec_dir)
Example #16
 def precompute_annotator(self, concept):
     NegativeEngine.precompute_annotator(self, concept)
     for subconcept in concept.split('-'):
         expandedTagSet = set([subconcept] + wn_expand(subconcept))
         self.annotator = self.annotator.union(expandedTagSet)
     INFO = 'dataengine.%s' % self.__class__.__name__
     printStatus(
         INFO, 'precomputing the virtual annotator for %s: %d tags' %
         (concept, len(self.annotator)))
Example #17
    def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
        printStatus(INFO + '.' + self.__class__.__name__, 'initializing ...')
        self.datafile = datafile
        self.ndims = ndims
        self.L1_normalize = L1_normalize
        self.L2_normalize = L2_normalize

        assert type(L1_normalize) == int
        assert type(L2_normalize) == int
        assert (L1_normalize + L2_normalize) <= 1
Example #18
 def sampling(predictions, strategy, n):
     printStatus(
         INFO, '%s sampling: %d out of %d instances' %
         (strategy, n, len(predictions)))
     if 'toprand' == strategy:
         temp = [(x[0], x[1] * random.uniform(0.9, 1)) for x in predictions]
         temp.sort(key=lambda v: v[1], reverse=True)
         return [x[0] for x in temp[:n]]
     else:
         return [x[0] for x in predictions[:n]]
Example #19
def process(options, testCollection):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding_model = options.embedding
    Y0 = options.Y0
    Y1 = options.Y1
    pY0 = options.pY0
    r = options.r
    blocksize = 2000

    embedding_name = '%s,%s,%s' % (corpus, word2vec_model, embedding_model)
    for synset_name in [Y0, Y1]:
        assert(os.path.exists(os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)))

    resfile = os.path.join(rootpath, testCollection, 'autotagging', testCollection, embedding_name, pY0, 'id.tagvotes.txt')
    if checkToSkip(resfile, overwrite):
        return 0

    label_file = 'data/ilsvrc12/synsets.txt'
    label2vec_dir = os.path.join(rootpath, 'synset2vec', Y0, embedding_name)
    i2v = Image2Vec(label_file, label2vec_dir)

    tagger = ZeroshotTagger(synset_name=Y1, embedding_name=embedding_name, rootpath=rootpath)

    imset = readImageSet(testCollection, testCollection, rootpath)
    feat_dir = os.path.join(rootpath, testCollection, 'FeatureData', pY0)
    feat_file = BigFile(feat_dir)
    

    printStatus(INFO, 'tagging %d images' % len(imset))
    makedirsforfile(resfile)
    fw = open(resfile, 'w')

    start = 0
    while start < len(imset):
        end = min(len(imset), start + blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end))
        todo = imset[start:end]
        if not todo:
            break

        renamed, vectors = feat_file.read(todo)
        output = []
        for _id,_vec in zip(renamed, vectors):
            im_vec = i2v.embedding(_vec)
            pred = tagger.predict(im_vec, topk=options.r)
            output.append('%s %s\n' % (_id, ' '.join(['%s %s'%(x[0],x[1]) for x in pred])))
        start = end
        fw.write(''.join(output))

    fw.close()
Example #20
 def __init__(self, datafile, ndims=0, L1_normalize=0, L2_normalize=0):
     Text2Vec.__init__(self, datafile, ndims, L1_normalize, L2_normalize)
     word_vob = map(str.strip, open(datafile).readlines())
     self.word2index = dict(zip(word_vob, range(len(word_vob))))
     if ndims != 0:
         assert len(word_vob) == self.ndims, \
             "feat dimension does not match: %d != %d" % (len(word_vob), self.ndims)
     else:
         self.ndims = len(word_vob)
     printStatus(INFO + '.' + self.__class__.__name__,
                 "%d words" % self.ndims)
Example #21
    def __init__(self, collection, tpp="lemm", rootpath=ROOT_PATH):
        self.name = "%s(%s,%s)" % (self.__class__.__name__, collection, tpp)
        self.photoid2tags = {}
        datafile = os.path.join(rootpath, collection, "TextData", "id.userid.%stags.txt" % tpp)
        self.vob = []

        with open(datafile) as fin:
            for line in fin:
                [photoid, userid, tags] = line.split("\t")
                self.photoid2tags[photoid] = tags
                self.vob += str.split(tags)
        self.vob = set(self.vob)
        printStatus(self.name, "%d images, %d unique tags" % (len(self.photoid2tags), len(self.vob)))
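The id.userid tags file parsed here is assumed to hold one photo per line, tab-separated into photo id, user id and a space-separated tag string; an illustrative line (ids are hypothetical):

line = '123456\t7890@N00\tdog beach sunset'
photoid, userid, tags = line.split('\t')     # tags == 'dog beach sunset'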
Example #22
def submit(searchers, collection,annotationName, rootpath=ROOT_PATH, overwrite=0):
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_runs = len(searchers)

    for concept in concepts:
        for j in range(nr_of_runs):
            resultfile = os.path.join(searchers[j].getOutputdir(), concept + ".txt")
            if checkToSkip(resultfile, overwrite):
                continue
            searchresults = searchers[j].scoreCollection(concept)
            print ("%s: %s %d -> %s" % (searchers[j].name, concept, len(searchresults), resultfile))
            writeRankingResults(searchresults, resultfile)

    printStatus('%s.submit'%os.path.basename(__file__), "done")
Example #23
    def __init__(self, collection, rootpath=ROOT_PATH):
        self.name = '%s.%s' % (self.__class__.__name__, collection)
        imsetfile = os.path.join(rootpath, collection, "ImageSets", "%s.txt" % collection) 
        self.imset = set(map(str.strip, open(imsetfile).readlines()))

        holdoutfile = os.path.join(rootpath, collection, "ImageSets", "holdout.txt") 
        holdoutSet = set(map(str.strip, open(holdoutfile).readlines()))
        printStatus(self.name, '%d examples, %d holdout' % (len(self.imset), len(holdoutSet)))

        self.collection = collection
        self.target = None
        self.imset = set([x for x in self.imset if x not in holdoutSet])
        self.candidateset = sorted(list(self.imset))
        self.datadir = os.path.join(rootpath, collection)
Example #24
    def __init__(self, trainCollection, tpp="lemm", feature="color64+dsift",  k=1000, rootpath=ROOT_PATH):
        self.trainCollection = trainCollection
        self.k = k
        self.name = "%s(%s,%s,%s,%d)" % (self.__class__.__name__, self.trainCollection, tpp, feature, k)

        vobfile = os.path.join(rootpath, trainCollection, "TextData", "wn.%s.txt"%trainCollection)
        self.vob = set(map(str.strip, open(vobfile).readlines()))
        printStatus(INFO, 'the vocabulary of %s contains %d tags' % (trainCollection, len(self.vob)))

        self.gamma = (1.0/MEDIAN_DISTANCE[feature])**2
        self.feat_dir = os.path.join(rootpath, trainCollection, 'FeatureIndex', feature)
        self.dim = FEATURE_TO_DIM[feature]
        self.fcs = FlickrContextSim(trainCollection,rootpath)  
        
        printStatus(INFO, self.name + ' okay')
Example #25
    def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
        self.rootpath = rootpath
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))

        featuredir = os.path.join(rootpath,collection,'FeatureData',feature)
        id_file = os.path.join(featuredir, "id.txt")
        shape_file = os.path.join(featuredir, 'shape.txt')
        self.nr_of_images, feat_dim = map(int, open(shape_file).readline().split())

        self.searcher = load_model(featuredir, self.nr_of_images, feat_dim,nr_of_segments=512,segmentk=256,coarsek=4096)
        self.k = DEFAULT_K
        self._load_tag_data(collection, tpp, rootpath)
        printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance,  self.k))
Example #26
    def __init__(self, label_file, label2vec_dir):
        self.labels = map(str.strip, open(label_file).readlines())
        self.nr_of_labels = len(self.labels)
        feat_file = BigFile(label2vec_dir)
        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        self.feat_dim = feat_file.ndims

        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])    
        printStatus(INFO, '#active_labels=%d, embedding_size=%d' % (self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
Example #27
 def precompute_annotator(self, concept):
     INFO = 'dataengine.%s.precompute_annotator'%self.__class__.__name__
     topn = 100
     NegativeEngine.precompute_annotator(self, concept)
     
     for subconcept in concept.split('-'):
         expandedTagSet = set([subconcept] + wn_expand(subconcept))
         try:
             datafile = os.path.join(ROOT_PATH, self.collection, 'SimilarityIndex', 'ngd', '%s.txt' % subconcept)
             rankedtags = readRankingResults(datafile)
             expandedTagSet = expandedTagSet.union(set([x[0] for x in rankedtags[:topn]]))
         except:
             printError(INFO, 'failed to load ranktag file for %s' % subconcept)
         self.annotator = self.annotator.union(expandedTagSet)
     printStatus(INFO, 'precomputing the virtual annotator for %s: %d tags' % (concept, len(self.annotator)))
Example #28
    def __init__(self, label_file, label2vec_dir):
        self.labels = map(str.strip, open(label_file).readlines())
        self.nr_of_labels = len(self.labels)
        feat_file = BigFile(label2vec_dir)
        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        self.feat_dim = feat_file.ndims

        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])    
        printStatus(INFO, '#active_labels=%d, embedding_size=%d' % (self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
Example #29
def process(options, testCollection, trainCollection, tagsimMethod):
    rootpath = options.rootpath
    overwrite = options.overwrite
    testsetName = options.testset if options.testset else testCollection 
    tpp = options.tpp
    numjobs = options.numjobs
    job = options.job
    useWnVob = 1

    outputName = tagsimMethod + '-wn' if useWnVob else tagsimMethod

    if tagsimMethod == 'wns':
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, outputName,'id.tagvotes.txt')
    else:    
        resultfile = os.path.join(rootpath, testCollection, "tagrel", testsetName, trainCollection, outputName,'id.tagvotes.txt')
    if numjobs>1:
        resultfile = resultfile.replace("id.tagvotes.txt", "id.tagvotes.%d.%d.txt" % (numjobs,job))

    if checkToSkip(resultfile, overwrite):
        sys.exit(0)

    makedirsforfile(resultfile)

    try:
        doneset = set([str.split(x)[0] for x in open(options.donefile).readlines()[:-1]])
    except:
        doneset = set()
        
    printStatus(INFO, "done set: %d" % len(doneset))

 
    testImageSet = readImageSet(testCollection, testCollection, rootpath)
    testImageSet = [x for x in testImageSet if x not in doneset]
    testImageSet = [testImageSet[i] for i in range(len(testImageSet)) if (i%numjobs+1) == job]
    printStatus(INFO, 'working on %d-%d, %d test images -> %s' % (numjobs,job,len(testImageSet),resultfile) )
    
    testreader = TagReader(testCollection, rootpath=rootpath)    

    if tagsimMethod == "wns":
        tagrel = SIM_TO_TAGREL["wns"](trainCollection, useWnVob, "wup", rootpath)
    else:
        tagrel = SIM_TO_TAGREL[tagsimMethod](trainCollection, useWnVob, rootpath)

 
    done = 0
    fw = open(resultfile, "w")
    
    for qry_id in testImageSet:
        qry_tags = testreader.get(qry_id)    
        tagvotes = tagrel.estimate(qry_tags)
        newline = qry_id + " " + " ".join(["%s %s" % (tag, niceNumber(vote,8)) for (tag,vote) in tagvotes])
        fw.write(newline+"\n")
        done += 1
        if done%1000 == 0:
            printStatus(INFO, "%d done" % done)
    # done    
    fw.close()
    printStatus(INFO, "%d done" % done)
Example #30
def buildHitLists(collection, tpp='lemm', rootpath=ROOT_PATH):
    vobfile = os.path.join(rootpath, collection, 'TextData', 'wn.%s.txt' % collection)
    vob = set(map(str.strip, open(vobfile).readlines()))
    
    printStatus(INFO, '%s, %d unique tags' % (collection, len(vob)))
    
    tagfile = os.path.join(rootpath, collection, 'TextData', 'id.userid.%stags.txt'%tpp) 
    hitlists = {}
    for line in open(tagfile).readlines():
        elems = line.strip().split()
        name = elems[0]
        tagset = set(elems[2:]).intersection(vob)
        for tag in tagset:
            hitlists.setdefault(tag,[]).append(name)
    assert(len(hitlists)<=len(vob))
    return hitlists        
Example #31
 def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
     self.rootpath = rootpath
     self.concepts = readConcepts(collection, annotationName, rootpath)
     self.nr_of_concepts = len(self.concepts)
     self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))
     
     self.imset = readImageSet(collection, collection, rootpath)
     self.nr_of_images = len(self.imset)
     self.knndir = os.path.join(collection, '%s,%sknn,uu,1500' % (feature, distance))
     
     self.k = DEFAULT_K
     self.noise = 0
     
     self._load_tag_data(collection, tpp, rootpath)
     
     printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
Example #32
    def __init__(self, collection, tpp='lemm', rootpath=ROOT_PATH):
        self.name = '%s(%s,%s)' % (self.__class__.__name__, collection, tpp)
        self.photoid2tags = {}
        datafile = os.path.join(rootpath, collection, "TextData",
                                "id.userid.%stags.txt" % tpp)
        self.vob = []

        with open(datafile) as fin:
            for line in fin:
                [photoid, userid, tags] = line.split("\t")
                self.photoid2tags[photoid] = tags
                self.vob += str.split(tags)
        self.vob = set(self.vob)
        printStatus(
            self.name, "%d images, %d unique tags" %
            (len(self.photoid2tags), len(self.vob)))
Example #33
def buildHitLists(collection, tpp="lemm", rootpath=ROOT_PATH):
    vobfile = os.path.join(rootpath, collection, "TextData", "wn.%s.txt" % collection)
    vob = set(map(str.strip, open(vobfile).readlines()))

    printStatus(INFO, "%s, %d unique tags" % (collection, len(vob)))

    tagfile = os.path.join(rootpath, collection, "TextData", "id.userid.%stags.txt" % tpp)
    hitlists = {}
    for line in open(tagfile).readlines():
        elems = line.strip().split()
        name = elems[0]
        tagset = set(elems[2:]).intersection(vob)
        for tag in tagset:
            hitlists.setdefault(tag, []).append(name)
    assert len(hitlists) <= len(vob)
    return hitlists
Example #34
def process(options, pklfile, hdf5file):
    if checkToSkip(hdf5file, options.overwrite):
        return 0

    printStatus(INFO, 'Loading pkl file %s' % pklfile)
    with open(pklfile, 'r') as f:
        data = pkl.load(f)
    printStatus(INFO, 'Found %d elements.' % len(data))

    printStatus(INFO, 'Saving hdf5 file %s' % hdf5file)
    with h5py.File(hdf5file, 'w') as f:
        for k, v in data.items():
            printStatus(INFO, 'Dumping %s' % k)
            f[k] = v

    printStatus(INFO, 'Done.')
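A minimal sketch for reading such a file back with h5py, assuming each value was stored as an array-like dataset (hdf5file refers to the file written above):

import h5py

with h5py.File(hdf5file, 'r') as f:
    for k in f.keys():
        arr = f[k][...]          # materialize the dataset stored under key k
        print k, arr.shape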
Example #35
def process(options, pklfile, hdf5file):
    if checkToSkip(hdf5file, options.overwrite):
        return 0

    printStatus(INFO, 'Loading pkl file %s' % pklfile)
    with open(pklfile, 'r') as f:
        data = pkl.load(f)
    printStatus(INFO, 'Found %d elements.' % len(data))

    printStatus(INFO, 'Saving hdf5 file %s' % hdf5file)
    with h5py.File(hdf5file,'w') as f:
        for k,v in data.items():
            printStatus(INFO, 'Dumping %s' % k)
            f[k] = v

    printStatus(INFO, 'Done.')
Example #36
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite
    random = options.random

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.zeros((len(id_images), len(concepts)))

    id_images = []
    tag2idx = dict(zip(concepts, xrange(len(concepts))))
    with open(
            os.path.join(rootpath, workingCollection, 'TextData',
                         'id.userid.lemmtags.txt')) as f:
        cnt = 0
        for line in f:
            id_img, _, tags = line.split('\t')
            tags = tags.split()
            if len(tags) > 0:
                tags = [(tag2idx.get(x, -1), y)
                        for x, y in zip(tags, xrange(len(tags)))]
                idx = np.array([x[0] for x in tags])
                vals = 1. / (1. + np.array([x[1] for x in tags]))
                tagmatrix[cnt, idx] = vals

            id_images.append(id_img)
            cnt += 1

    # random rank for untagged images
    if random:
        tagmatrix += np.min(tagmatrix[tagmatrix > 0]) * np.random.rand(
            tagmatrix.shape[0], tagmatrix.shape[1])

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump(
            {
                'concepts': concepts,
                'id_images': map(int, id_images),
                'scores': tagmatrix
            }, f, pickle.HIGHEST_PROTOCOL)
Example #37
    def __init__(self, collection, annotationName, feature, distance, tpp=DEFAULT_TPP, rootpath=ROOT_PATH):
        self.concepts = readConcepts(collection, annotationName, rootpath)
        self.nr_of_concepts = len(self.concepts)
        self.concept2index = dict(zip(self.concepts, range(self.nr_of_concepts)))
        
        feat_dir = os.path.join(rootpath, collection, "FeatureData", feature)
        id_file = os.path.join(feat_dir, 'id.txt')
        shape_file = os.path.join(feat_dir, 'shape.txt')
        self.nr_of_images, feat_dim = map(int, open(shape_file).readline().split())

        self.searcher = simpleknn.load_model(os.path.join(feat_dir, 'feature.bin'), feat_dim, self.nr_of_images, id_file)
        self.searcher.set_distance(distance)
        self.k = DEFAULT_K
        
        self._load_tag_data(collection, tpp, rootpath)
        
        printStatus(INFO, "%s, %d images, %d unique tags, %s %d neighbours for voting" % (self.__class__.__name__, self.nr_of_images, len(self.tag2freq), distance, self.k))
Example #38
def process(options, workingCollection, annotationName, outputpkl):
    rootpath = options.rootpath
    overwrite = options.overwrite

    resultfile = os.path.join(outputpkl)
    if checkToSkip(resultfile, overwrite):
        return 0

    concepts = readConcepts(workingCollection, annotationName, rootpath)
    id_images = readImageSet(workingCollection, workingCollection, rootpath)
    tagmatrix = np.random.rand(len(id_images), len(concepts))

    # save results in pkl format
    printStatus(INFO, "Dump results in pkl format at %s" % resultfile)    
    makedirsforfile(resultfile)
    with open(resultfile, 'w') as f:
        pickle.dump({'concepts':concepts, 'id_images':map(int, id_images), 'scores':tagmatrix}, f, pickle.HIGHEST_PROTOCOL)
Example #39
def process(options, model_name, concept_file, weight_dir, result_dir):
    rootpath = options.rootpath
    overwrite = options.overwrite

    if 'fastlinear' == model_name:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model

    concepts = [
        x.strip() for x in open(concept_file).readlines()
        if x.strip() and not x.strip().startswith('#')
    ]
    todo = [
        x for x in concepts if overwrite
        or not os.path.exists(os.path.join(result_dir, '%s.model' % x))
    ]
    printStatus(INFO, '%d concepts to do' % len(todo))

    for concept in todo:
        weight_file = os.path.join(weight_dir, '%s.txt' % concept)
        weight_data = map(str.strip, open(weight_file).readlines())
        nr_of_models = len(weight_data)
        assert (nr_of_models >= 2)
        weights = [0] * nr_of_models
        models = [None] * nr_of_models

        for i, line in enumerate(weight_data):
            w, model_dir = line.split()
            weights[i] = float(w)
            model_dir = model_dir if model_dir.startswith(
                rootpath) else os.path.join(rootpath, model_dir)
            assert (model_dir.find(model_name) > 0)
            model_file_name = os.path.join(model_dir, '%s.model' % concept)
            models[i] = load_model(model_file_name)

        new_model = models[0]
        new_model.add_fastsvm(models[1], weights[0], weights[1])
        for i in range(2, len(models)):
            new_model.add_fastsvm(models[i], 1, weights[i])

        new_model_file = os.path.join(result_dir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        save_model(new_model_file, new_model)
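Each per-concept weight file read above is expected to contain one model per line: a float weight followed by a model directory (absolute, or relative to rootpath, and containing the model name). A hedged sketch that writes such a file (the concept name and paths are hypothetical):

import os

with open(os.path.join(weight_dir, 'dog.txt'), 'w') as fw:
    fw.write('0.6 train10k/Models/anno.txt/color64+dsift/fastlinear\n')
    fw.write('0.4 train100k/Models/anno.txt/color64+dsift/fastlinear\n')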
Example #40
    def __init__(self, synset_name='imagenet1k2hop', embedding_name='flickr4m,tagvec500,hierse2', rootpath=ROOT_PATH):
        feat_dir = os.path.join(rootpath, 'synset2vec', synset_name, embedding_name)
        feat_file = BigFile(feat_dir)
        self.labels = feat_file.names
        self.nr_of_labels = len(self.labels)
        self.feat_dim = feat_file.ndims

        renamed, vectors = feat_file.read(self.labels)
        name2index = dict(zip(renamed, range(len(renamed))))
        self.label_vectors = [None] * self.nr_of_labels
        
        for i in xrange(self.nr_of_labels):
            idx = name2index.get(self.labels[i], -1)
            self.label_vectors[i] = np.array(vectors[idx]) if idx >= 0 else None

        nr_of_inactive_labels = len([x for x in self.label_vectors if x is None])    
        printStatus(INFO + '.' + self.__class__.__name__, '#active_labels=%d, embedding_size=%d' % (self.nr_of_labels - nr_of_inactive_labels, self.feat_dim))
Example #41
    def __init__(self, collection, rootpath=ROOT_PATH):
        self.name = '%s.%s' % (self.__class__.__name__, collection)
        imsetfile = os.path.join(rootpath, collection, "ImageSets",
                                 "%s.txt" % collection)
        self.imset = set(map(str.strip, open(imsetfile).readlines()))

        holdoutfile = os.path.join(rootpath, collection, "ImageSets",
                                   "holdout.txt")
        holdoutSet = set(map(str.strip, open(holdoutfile).readlines()))
        printStatus(
            self.name,
            '%d examples, %d holdout' % (len(self.imset), len(holdoutSet)))

        self.collection = collection
        self.target = None
        self.imset = set([x for x in self.imset if x not in holdoutSet])
        self.candidateset = sorted(list(self.imset))
        self.datadir = os.path.join(rootpath, collection)
Example #42
 def __init__(self, trainCollection, trainAnnotationName, feature, modelName, rootpath=ROOT_PATH):
     assert(modelName.startswith('fastlinear')), modelName
     self.concepts = readConcepts(trainCollection, trainAnnotationName, rootpath=rootpath)
     self.nr_of_concepts = len(self.concepts)
     modeldir = os.path.join(rootpath, trainCollection, 'Models', trainAnnotationName, feature, modelName)
     model = load_model(os.path.join(modeldir, self.concepts[0]+'.model'))
     self.feat_dim = model.get_feat_dim()
      
     self.W = np.zeros((self.feat_dim, self.nr_of_concepts))
     self.AB = np.zeros((2, self.nr_of_concepts))
     for i in range(self.nr_of_concepts):
         model_file_name = os.path.join(modeldir, "%s.model" % self.concepts[i])
         model = load_model(model_file_name)
         self.W[:,i] = model.get_w()
         [A,B] = model.get_probAB()
         self.AB[:,i] = [A,B] if abs(A)>1e-8 else [-1,0]
         printStatus(INFO, '%s, A=%g, B=%g' % (self.concepts[i], A, B))
     printStatus(INFO, '%s-%s-%s -> %dx%d ModelArray' % (trainCollection,trainAnnotationName,feature,self.feat_dim,self.nr_of_concepts))
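A hedged sketch of how such a stacked weight matrix and Platt parameters are typically applied (an illustrative helper, not necessarily this class's actual predict method; X stands for an (n, feat_dim) feature matrix):

import numpy as np

def predict_prob(X, W, AB):
    # X: (n, feat_dim); W: (feat_dim, nr_of_concepts); AB: (2, nr_of_concepts)
    dec = np.dot(X, W)                                       # raw decision values
    return 1.0 / (1.0 + np.exp(dec * AB[0, :] + AB[1, :]))   # Platt-style probabilities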
Example #43
def process(options, trainCollection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite
    
    resultfile = os.path.join(rootpath, trainCollection, 'TextData', 'tag.concept-rank.%s.pkl' % annotationName)
    if checkToSkip(resultfile, overwrite):
        return 0
        
    concepts = readConcepts(trainCollection, annotationName, rootpath)
    concept_num = len(concepts)
    concept2index = dict(zip(concepts, range(concept_num)))
    tcb = TagCooccurBase(trainCollection, rootpath=rootpath)
    tag_num = tcb.tag_num()
    DEFAULT_RANK = tag_num
    rank_matrix = np.zeros((tag_num, concept_num), dtype=np.int) + DEFAULT_RANK
    tag_list = []
    
    for i,u in enumerate(tcb.vob):
        ranklist = tcb.top_cooccur(u,-1)
        concept2rank = {}
        rank = [DEFAULT_RANK] * concept_num
        
        hit = 0
        for j,x in enumerate(ranklist):
            idx = concept2index.get(x[0], -1)
            if idx>=0:
                rank_matrix[i,idx] = j+1
                hit += 1
                if hit == concept_num:
                    break
        tag_list.append(u)
        
        if (i+1) % 1e4 == 0:
            printStatus(INFO, '%d done' % (i+1) )
    
    assert(len(tag_list) == tag_num)
    
    import cPickle as pickle
    makedirsforfile(resultfile)
    output = open(resultfile, 'wb')
    pickle.dump({'tags':tag_list, 'concepts':concepts, 'rank_matrix':rank_matrix}, output, -1)
    output.close()
    printStatus(INFO, '%dx%d dumped to %s' % (tag_num, concept_num, resultfile))
Example #44
def process(options, collection, conceptfile):
    rootpath = options.rootpath
    tpp = options.tpp
    overwrite = options.overwrite

    concepts = [x.strip() for x in open(conceptfile).readlines() if x.strip() and not x.strip().startswith('#')]
    resultdir = os.path.join(rootpath, collection, 'tagged,%s'%tpp)

    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, '%s.txt'%concept)
        if checkToSkip(resultfile, overwrite):
            continue
        todo.append(concept)

    if not todo:
        printStatus(INFO, 'nothing to do')
        return 0

    try:
        holdoutfile = os.path.join(rootpath,collection,'ImageSets','holdout.txt')
        holdoutSet = set(map(str.strip,open(holdoutfile).readlines()))
    except:
        holdoutSet = set()

    hitlists = buildHitlists(collection, todo, tpp, rootpath)
    min_hit = 1e6
    max_hit = 0

    for concept in todo:
        resultfile = os.path.join(resultdir, '%s.txt' % concept)
        if checkToSkip(resultfile,overwrite):
            continue
        subconcepts = concept.split('-')
        labeledSet = set(hitlists[subconcepts[0]])
        for i in range(1,len(subconcepts)):
            labeledSet = labeledSet.intersection(hitlists[subconcepts[i]])
        labeledSet = labeledSet.difference(holdoutSet)
        if len(labeledSet) == 0:
            printStatus(INFO, '%s has ZERO hit' % concept)
        else:
            printStatus(INFO, '%s, %d hits -> %s' %(concept, len(labeledSet), resultfile))
            makedirsforfile(resultfile)
            fw = open(resultfile, 'w')
            fw.write('\n'.join(labeledSet) + '\n')
            fw.close()
        if len(labeledSet) > max_hit:
            max_hit = len(labeledSet)
        if len(labeledSet) < min_hit:
            min_hit = len(labeledSet)
            
    printStatus(INFO, 'max hits: %d, min hits: %d' % (max_hit, min_hit))
Example #45
def process(options, feat_dir):
    resultfile = os.path.join(feat_dir, "minmax.txt")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    nr_of_images, feat_dim = map(int, open(os.path.join(feat_dir, "shape.txt")).readline().split())
    min_vals = [1e6] * feat_dim
    max_vals = [-1e6] * feat_dim

    offset = np.float32(1).nbytes * feat_dim
    res = array.array("f")

    feat_file = os.path.join(feat_dir, "feature.bin")
    id_file = os.path.join(feat_dir, "id.txt")
    nr_of_images = len(open(id_file).readline().strip().split())
    printStatus(INFO, "parsing %s" % feat_file)
    fr = open(feat_file, "rb")

    s_time = time.time()

    for i in xrange(nr_of_images):
        res.fromfile(fr, feat_dim)
        vec = res
        for d in xrange(feat_dim):
            if vec[d] > max_vals[d]:
                max_vals[d] = vec[d]
            if vec[d] < min_vals[d]:
                min_vals[d] = vec[d]
        del res[:]
    fr.close()

    timecost = time.time() - s_time
    printStatus(
        INFO,
        "%g seconds to find min [%g,%g] and max [%g,%g]"
        % (timecost, min(min_vals), max(min_vals), min(max_vals), max(max_vals)),
    )

    with open(resultfile, "w") as f:
        f.write("%s\n" % " ".join(map(str, min_vals)))
        f.write("%s\n" % " ".join(map(str, max_vals)))
        f.close()
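A sketch of how the resulting minmax.txt might be used for per-dimension min-max scaling (load_minmax is a hypothetical helper; raw_vec stands for a numpy vector of the same dimension):

import os
import numpy as np

def load_minmax(feat_dir):
    # minmax.txt: first line holds per-dimension minima, second line per-dimension maxima
    with open(os.path.join(feat_dir, 'minmax.txt')) as f:
        min_vals = np.array(map(float, f.readline().split()))
        max_vals = np.array(map(float, f.readline().split()))
    return min_vals, max_vals

min_vals, max_vals = load_minmax(feat_dir)
scaled = (raw_vec - min_vals) / np.maximum(max_vals - min_vals, 1e-12)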
Example #46
def submit(searchers,
           collection,
           annotationName,
           rootpath=ROOT_PATH,
           overwrite=0):
    concepts = readConcepts(collection, annotationName, rootpath=rootpath)
    nr_of_runs = len(searchers)

    for concept in concepts:
        for j in range(nr_of_runs):
            resultfile = os.path.join(searchers[j].getOutputdir(),
                                      concept + ".txt")
            if checkToSkip(resultfile, overwrite):
                continue
            searchresults = searchers[j].scoreCollection(concept)
            print("%s: %s %d -> %s" %
                  (searchers[j].name, concept, len(searchresults), resultfile))
            writeRankingResults(searchresults, resultfile)

    printStatus('%s.submit' % os.path.basename(__file__), "done")
Example #47
def process(options, synset_file, synset_name):
    overwrite = options.overwrite
    rootpath = options.rootpath
    corpus = options.corpus
    word2vec_model = options.word2vec
    embedding = options.embedding

    resdir = os.path.join(rootpath, 'synset2vec', synset_name,
                          '%s,%s,%s' % (corpus, word2vec_model, embedding))
    resfile = os.path.join(resdir, 'feature.bin')
    if checkToSkip(resfile, overwrite):
        return 0

    synsets = map(str.strip, open(synset_file).readlines())
    s2v = get_synset_encoder(embedding)(corpus,
                                        word2vec_model,
                                        rootpath=rootpath)
    makedirsforfile(resfile)

    good = []
    with open(resfile, 'wb') as fw:
        for i, wnid in enumerate(synsets):
            #if i % 1e3 == 0:
            #    printStatus(INFO, '%d done' % i)
            vec = s2v.embedding(wnid)

            if vec is not None:
                vec = np.array(vec, dtype=np.float32)
                vec.tofile(fw)
                good.append(wnid)

        fw.close()
        printStatus(INFO, '%d done, %d okay' % ((i + 1), len(good)))

    with open(os.path.join(resdir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(good))
        fw.close()

    with open(os.path.join(resdir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(good), s2v.get_feat_dim()))
        fw.close()
Example #48
    def precompute_annotator(self, concept):
        INFO = 'dataengine.%s.precompute_annotator' % self.__class__.__name__
        topn = 100
        NegativeEngine.precompute_annotator(self, concept)

        for subconcept in concept.split('-'):
            expandedTagSet = set([subconcept] + wn_expand(subconcept))
            try:
                datafile = os.path.join(ROOT_PATH, self.collection,
                                        'SimilarityIndex', 'ngd',
                                        '%s.txt' % subconcept)
                rankedtags = readRankingResults(datafile)
                expandedTagSet = expandedTagSet.union(
                    set([x[0] for x in rankedtags[:topn]]))
            except:
                printError(INFO,
                           'failed to load ranktag file for %s' % subconcept)
            self.annotator = self.annotator.union(expandedTagSet)
        printStatus(
            INFO, 'precomputing the virtual annotator for %s: %d tags' %
            (concept, len(self.annotator)))
Example #49
def process(options, collection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(collection, annotationName, rootpath)
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")

    todo = [
        x for x in concepts
        if not os.path.exists(os.path.join(resultdir, x + '.txt')) or overwrite
    ]
    if not todo:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=rootpath)
    vob = fcs.vob
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")
    printStatus(
        INFO, 'expanding tags for %s-%s -> %s' %
        (collection, annotationName, resultdir))

    for concept in todo:
        resultfile = os.path.join(resultdir, concept + '.txt')

        vals = []
        for tag in vob:
            dist = fcs.computeNGD(concept, tag, img=1)
            if dist < 10:
                vals.append((tag, dist))
        vals.sort(key=lambda v: v[1])
        printStatus(INFO,
                    '%s -> %s' % (concept, ' '.join([x[0] for x in vals[:3]])))
        writeRankingResults(vals, resultfile)
Example #50
def process(options, model_name, concept_file, weight_dir, result_dir):
    rootpath = options.rootpath
    overwrite = options.overwrite

    if 'fastlinear' == model_name:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model


    concepts = [x.strip() for x in open(concept_file).readlines() if x.strip() and not x.strip().startswith('#')]
    todo = [x for x in concepts if overwrite or not os.path.exists(os.path.join(result_dir, '%s.model'%x))]
    printStatus(INFO, '%d concepts to do' % len(todo))

    for concept in todo:
        weight_file = os.path.join(weight_dir, '%s.txt' % concept)
        weight_data = map(str.strip, open(weight_file).readlines())
        nr_of_models = len(weight_data)
        assert(nr_of_models >= 2)
        weights = [0] * nr_of_models
        models = [None] * nr_of_models

        for i,line in enumerate(weight_data):
            w, model_dir = line.split()
            weights[i] = float(w)
            model_dir =  model_dir if model_dir.startswith(rootpath) else os.path.join(rootpath, model_dir)
            assert (model_dir.find(model_name)>0)
            model_file_name = os.path.join(model_dir, '%s.model' % concept)
            models[i] = load_model(model_file_name)

        new_model = models[0]
        new_model.add_fastsvm(models[1], weights[0], weights[1])
        for i in range(2, len(models)):
            new_model.add_fastsvm(models[i], 1, weights[i])    

        new_model_file = os.path.join(result_dir, '%s.model'%concept)
        makedirsforfile(new_model_file)
        save_model(new_model_file, new_model)
Example #51
def process(options, trainCollection, modelAnnotationName, trainAnnotationName, feature):
    rootpath = options.rootpath
    modelName = options.model

    if 'fastlinear' == modelName:
        from fastlinear.fastlinear import fastlinear_load_model as load_model
        from fastlinear.fastlinear import fastlinear_save_model as save_model
    else:
        from fiksvm.fiksvm import fiksvm_load_model as load_model
        from fiksvm.fiksvm import fiksvm_save_model as save_model


    concepts = readConcepts(trainCollection, trainAnnotationName, rootpath)
    concepts = [concepts[i] for i in range(len(concepts)) if (i%options.numjobs + 1) == options.job]

    feat_file = BigFile(os.path.join(rootpath, trainCollection, "FeatureData", feature))

    for concept in concepts:
        modelfile = os.path.join(rootpath, trainCollection, 'Models', modelAnnotationName, feature, modelName, '%s.model' % concept)
        model = load_model(modelfile)
        (A0, B0) = model.get_probAB()
        if abs(A0) > 1e-8 and not options.overwrite:
            printStatus(INFO, "old parameters exist as A=%g, B=%g, skip" % (A0, B0))
            continue
        names,labels = readAnnotationsFrom(trainCollection, trainAnnotationName, concept, skip_0=True, rootpath=rootpath)
        name2label = dict(zip(names, labels))
        results = classify_large_data(model, names, feat_file, prob_output=False)
        labels = [name2label[x[0]] for x in results]
        dec_values = [x[1] for x in results]
        printStatus(INFO, "%s +%d -%d" % (concept, len([x for x in labels if x==1]), len([x for x in labels if x==-1])))
        [A,B] = sigmoid_train(dec_values, labels)
        model.set_probAB(A, B)
        save_model(modelfile, model)
        (A1, B1) = model.get_probAB()
        printStatus(INFO, "A: %g -> %g, B: %g -> %g" % (A0, A1, B0, B1))
Example #52
def process(options, collection, annotationName):
    rootpath = options.rootpath
    overwrite = options.overwrite

    concepts = readConcepts(collection,annotationName,rootpath)
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")

    todo = [x for x in concepts if not os.path.exists(os.path.join(resultdir,x+'.txt')) or overwrite]
    if not todo:
        printStatus(INFO, 'nothing to do')
        return

    fcs = FlickrContextSim(collection, rootpath=rootpath)
    vob = fcs.vob
    resultdir = os.path.join(rootpath, collection, "SimilarityIndex", "ngd")
    printStatus(INFO, 'expanding tags for %s-%s -> %s' % (collection, annotationName, resultdir))
    
    for concept in todo:
        resultfile = os.path.join(resultdir, concept + '.txt')
            
        vals = []
        for tag in vob:
            dist = fcs.computeNGD(concept, tag, img=1)
            if dist < 10:
                vals.append((tag,dist))
        vals.sort(key=lambda v:v[1])
        printStatus(INFO, '%s -> %s' % (concept, ' '.join([x[0] for x in vals[:3]])))
        writeRankingResults(vals, resultfile)
Example #53
def process(options, testCollection, method):
    rootpath = options.rootpath

    scorers = [HitScorer(k) for k in [1, 2, 5, 10]]
    im2truth = load_ground_truth(testCollection, imset=None, rootpath=rootpath)
    printStatus(INFO, 'nr of ground-truthed images: %d' % len(im2truth))

    tag_prediction_file = os.path.join(rootpath, testCollection,'autotagging', testCollection, method, 'id.tagvotes.txt')
    printStatus(INFO, 'evaluating %s' % tag_prediction_file)
    res = [0] * len(scorers)
    nr_of_images = 0

    for line in open(tag_prediction_file):
        elems = line.strip().split()
        imageid = elems[0]
        del elems[0]
        assert(len(elems)%2 == 0)
        pred_labels = [elems[i] for i in range(0, len(elems), 2)]
        pred_labels = pred_labels[:10] # consider at most the first 10 predicted tags
        truth = im2truth.get(imageid, None)
        if not truth:
            continue
        sorted_labels = [int(x in truth) for x in pred_labels]
        perf = [scorer.score(sorted_labels) for scorer in scorers]
        res = [res[i] + perf[i] for i in range(len(scorers))]
        nr_of_images += 1

    printStatus(INFO, 'nr of images: %d' % nr_of_images)
    res = [x/nr_of_images for x in res]

    print ' '.join([x.name() for x in scorers])
    print ' '.join(['%.3f' % x for x in res])
Example #54
def process(options, feat_dir, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, "feature.bin")
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)

    makedirsforfile(resultfile)
    fw = open(resultfile, "wb")

    done = []
    start = 0

    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, "processing images from %d to %d" % (start, end - 1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert len(done) == len(set(done))
    with open(os.path.join(result_dir, "id.txt"), "w") as fw:
        fw.write(" ".join(done))
        fw.close()

    with open(os.path.join(result_dir, "shape.txt"), "w") as fw:
        fw.write("%d %d" % (len(done), feat_file.ndims))
        fw.close()
    print "%d requested, %d obtained" % (len(imset), len(done))