Example #1
def merge_dbs(input_dbs, output_db, shuffle=True):
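    """Merge the LMDB databases listed in input_dbs into a single LMDB at
    output_db, optionally shuffling the order in which samples are written."""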

    map_size = 2**40

    total_size = sum([db.size(input_db) for input_db in input_dbs])
    ids = range(total_size)
    if shuffle:
        random.shuffle(ids)

    env_out = lmdb.open(output_db, readonly=False, map_size=map_size)
    idx = 0

    for input_db in input_dbs:
        env_in = lmdb.open(input_db, readonly=True, map_size=map_size)

        with env_out.begin(write=True) as txn_out:
            with env_in.begin() as txn_in:
                for key, data in txn_in.cursor():
                    txn_out.put(db.str_id(ids[idx]), data)
                    idx += 1
                    if idx % db.LOG_EVERY == 0:
                        print 'Processed {0} samples'.format(idx)

        env_in.close()
    env_out.close()

    if idx % db.LOG_EVERY != 0:
        print 'Processed {0} samples'.format(idx)
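
For orientation, a minimal usage sketch follows; the directory names are hypothetical, and the imports assume the project-local db helper (providing size(), str_id() and LOG_EVERY) is importable alongside lmdb and random.

import random
import lmdb
import db  # project-local helper assumed to expose size(), str_id(), LOG_EVERY

# hypothetical LMDB directories
input_dbs = ['data/train_part0_lmdb', 'data/train_part1_lmdb']
output_db = 'data/train_merged_lmdb'

# write every sample from both inputs into output_db in shuffled order
merge_dbs(input_dbs, output_db, shuffle=True)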
Example #2
    def init(self, db_path=None, batch_size=None):
        if self.is_opened():
            self.close()

        if db_path is not None:
            self.db_path = db_path
            self.size = db.size(self.db_path)
            self.entry_shape = db.entry_shape(self.db_path)
            self.shape = tuple([self.size] + list(self.entry_shape))

        if batch_size is not None:
            self.batch_size = batch_size
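
A short usage sketch, assuming the method above belongs to some dataset-reader class (called DbReader here purely for illustration):

reader = DbReader()                                   # hypothetical class name
reader.init(db_path='data/val_lmdb', batch_size=32)   # hypothetical path
print reader.shape        # (db.size(db_path),) followed by the entry shape
print reader.batch_size   # 32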
Example #3
    print usage
    sys.exit(0)
if ('--verbose' in sys.argv):
    verbose = 1
dirArgIndex = sys.argv.index('--dir')
try:
    dir = sys.argv[dirArgIndex + 1]
except IndexError:
    print 'Invalid directory argument'
    sys.exit(2)

if (not os.path.exists(dir)) or (not os.path.isdir(dir)):
    print 'Invalid directory argument'
    sys.exit(2)

images = listFiles(
    dir,
    '*.JPG;*.jpg;*.png;*.jpeg;*.bmp;*.dcx;*.gif;*.pcx;*.ppm;*.psd;*.tga;*.tif;*.tiff;*.xpm'
)

index = db.size()
for image in images:
    imat = preprocess.ProcessImage(image)
    if (numpy.count_nonzero(imat) > 0):
        iid = str(index).zfill(6)
        db.addimg(iid, image)
        db.addsig(iid, imat[0], 0)
        db.addsig(iid, imat[1], 1)
        db.addsig(iid, imat[2], 2)
        if (verbose): print 'added ', iid, ':', image
        index += 1
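
listFiles is project-specific; a plausible stand-in, assuming it takes a directory and a semicolon-separated list of glob patterns and returns the matching file paths, could look like this:

import fnmatch
import os

def listFiles(directory, patterns):
    """Return paths under directory whose names match any ';'-separated pattern."""
    matches = []
    for root, _, filenames in os.walk(directory):
        for name in filenames:
            if any(fnmatch.fnmatch(name, p) for p in patterns.split(';')):
                matches.append(os.path.join(root, name))
    return matches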
Example #4
    """
    # Logger
    logger = log.file_console_log()

    # Connect to the MongoDB database
    stored_tweets = db.twitter_collection()

    # Retrieve tweets from user timeline and store new ones in database
    # If any error occurs, log it in log file
    try:
        timeline_tweets = twitter.home_timeline()
    except Exception, error:
        logger.error(traceback.format_exc()[:-1])  # log error
        raise error
    else:
        before = db.size(stored_tweets)  # initial collection size
        # Insert each tweet in database
        # If any error occurs, log it in log file
        for tweet in timeline_tweets:
            try:
                db.insert(tweet, stored_tweets)  # insert in mongoDB collection
                # Note: if tweet already in DB, the insertion will fail silently
            except db.DBError, error:
                logger.error(traceback.format_exc()[:-1])  # log error
                raise error

        after = db.size(stored_tweets)  # new collection size

        # log insertion information
        message = "[%s] +%d new, %d stored" % (db.name(stored_tweets),
                                                after - before,
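
The "fail silently on duplicates" behaviour mentioned above is project-specific; a hypothetical sketch of how db.insert could achieve it with pymongo (an assumption, not the project's actual code), using the tweet id as the MongoDB _id:

from pymongo import errors

def insert(tweet, collection):
    """Insert a tweet, silently ignoring duplicates (hypothetical sketch)."""
    tweet.setdefault('_id', tweet['id'])  # assumes the tweet dict carries an 'id' field
    try:
        collection.insert_one(tweet)
    except errors.DuplicateKeyError:
        pass                              # already stored; fail silently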
Example #5
def ScoreQuery(query):
    """ generate scores for query image using m wavelet coefficients

    query is a path to uery image file
    returns a list of image scores
    number of wavelet coefficients is redefined in the preprocess model
    """
    qmat = preprocess.ProcessImage(query)

    print(qmat.shape)

    if numpy.count_nonzero(qmat) == 0:
        raise AssertionError, 'Cannot preprocess query image'

    scores = numpy.zeros(db.size(), float)  #initialize scores for each image

    print(" score size is %s ", scores.size)

    #open databases for reading
    try:
        idb = shelve.open(db.idbpath, flag='r')
    except:
        raise AssertionError, 'Cannot open image database'

    print(" w shape is %s ", w.shape)

    for c in range(0, 3):  #for each color channel
        #open searcharrays for this color channel
        print 'color channel: ', c
        try:
            sa_plus = shelve.open(db.sadbpaths[c][0], flag='r')
            sa_minus = shelve.open(db.sadbpaths[c][1], flag='r')
        except:
            raise IOError, 'Cannot open search array(s) for colorplane ' + str(c)

        for iid in idb.keys():  #for each image
            print(" iid id %s int(iid) is %s  c is %s ", iid, int(iid), c)
            print(len(idb[iid]))
            if (len(idb[iid]) == 4):  # otherwise corrupt data in db
                scores[int(iid)] += w[0, c] * math.fabs(
                    qmat[c, 0, 0] - idb[iid][c + 1])  #idb[iid][c+1]
            print(" done ")

        #set this color channel's avg color val to 0 so that it does not take
        #part in further scoring of images
        qmat[c, 0, 0] = 0

        print(qmat[c, 0, 0])

        print(" we  are here %s ", w.size)

        for row in range(0, len(qmat[c])):
            indices = numpy.nonzero(qmat[c, row])
            indices = indices[0]
            # bug-fixed
            for col in indices:  #for each non-zero coefficient
                rowcolid = str(row).zfill(3) + str(col).zfill(3)
                #print rowcolid,
                imglist = []
                print(qmat[c, row, col])
                if (qmat[c, row, col] > 0):  #positive search array
                    try:
                        iList = sa_plus[rowcolid]
                        imglist.extend(iList)
                    except KeyError:
                        continue
                else:  #negative search array
                    try:
                        iList = sa_minus[rowcolid]
                        imglist.extend(iList)
                    except KeyError:
                        continue

                #print imglist
                for iid in imglist:  #for each image in the matched im list
                    scores[int(iid)] -= w[bin(row, col), c]  #update the scores

        sa_plus.close()
        sa_minus.close()

    idb.close()
    return scores
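
A short usage sketch for the scorer above; since matching coefficients subtract from a score that starts at the weighted colour difference, lower scores are taken to mean better matches (the query path is hypothetical):

scores = ScoreQuery('queries/sample.jpg')   # hypothetical query image
best = numpy.argsort(scores)[:10]           # the 10 lowest-scoring image ids
for rank, iid in enumerate(best):
    print 'rank {0}: image {1} (score {2:.3f})'.format(rank, str(iid).zfill(6), scores[iid])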
Example #6
    def test_insertion_mongo(self):
        """ Tests that fetched tweets are correctly inserted in mongo. """
        timeline = twitter.home_timeline()
        for tweet in timeline:
            db.insert(tweet, self.tweets_collection)
        self.assertEqual(db.size(self.tweets_collection), len(timeline))
Example #8
def learn(solver_path, snapshot_path, iters_to_init, max_samples_to_use):
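    """Active-learning driver: repeatedly score unused training samples with a
    dropout net, copy the chosen ones into a temporary LMDB and continue
    training the Caffe model from its latest snapshot."""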
    net_path = proto.get_net_from_solver(solver_path)
    train_db_path = proto.get_db_from_net(net_path)
    train_db_len = db.size(train_db_path)

    # prepare paths to the temporary model files
    active_solver_path = utils.create_temp_path(solver_path + config.POSTFIX)
    active_net_path = utils.create_temp_path(net_path + config.POSTFIX)
    active_db_path = utils.create_temp_path(train_db_path + config.POSTFIX)

    # prepare temporary model files
    proto.prepare_net(net_path, active_net_path, active_db_path)
    snapshot_prefix, snapshot_iter = proto.prepare_solver(
        solver_path, active_solver_path, active_net_path, snapshot_path, iters_to_init
    )

    print snapshot_prefix

    # recover the snapshot folder
    snapshot_path = '/'.join(snapshot_prefix.split('/')[:-1])
    epoch_file = os.path.join(snapshot_path, config.EPOCH_FILE)

    # deploy net
    # deploy_net = net.Net(active_net_path, output_layers=config.OUTPUT_LAYERS)
    deploy_net = net.DropoutNet(active_net_path, config.DROPOUT_ITERS, aggregate='mean',
                                output_layers=config.OUTPUT_LAYERS, output_processor=utils.softmax)

    epoch_used_samples = set()
    dataset = samples.Dataset(train_db_path, deploy_net.batch_size, epoch_used_samples)

    # initialize net
    #     solverstate_path = proto.solverstate_path(snapshot_prefix, iters_to_init)
    #     if not os.path.exists(solverstate_path):
    if os.path.exists(active_db_path):
        shutil.rmtree(active_db_path)
        # shutil.copytree(train_db_path, active_db_path)

    used_samples = db.extract_samples(train_db_path, active_db_path, iters_to_init * deploy_net.batch_size)
    init_network(active_solver_path)

    # do the real learning
    print 'train samples:', train_db_len
    for epoch in xrange(config.MAX_EPOCHS):
        print 'Epoch #{0}'.format(epoch)
        epoch_used_samples.clear()
        epoch_used_samples.update(used_samples)

        while len(epoch_used_samples) < train_db_len:
            if snapshot_iter > config.MAX_ITER:
                break

            solverstate_path = proto.solverstate_path(snapshot_prefix, snapshot_iter)
            caffemodel_path = proto.caffemodel_path(snapshot_prefix, snapshot_iter)

            print 'Using snapshot iter #{0}'.format(snapshot_iter)
            deploy_net.load_model(caffemodel_path)
            # active_samples = samples.choose_active(deploy_net, dataset, config.BATCHES_PER_RUN)
            # epoch_used_samples.update(active_samples)
            # assert len(active_samples) <= int(max(active_samples)), \
            #     'Index of the highest sample is lower than the number of used samples'
            # check if it makes sense to continue
            # iters_to_do = len(active_samples) / deploy_net.batch_size
            # if iters_to_do == 0:
            #     break

            num_samples_to_choose = min(max_samples_to_use - len(epoch_used_samples), config.NEW_SAMPLES_PER_ITER)
            batches_to_choose = num_samples_to_choose / deploy_net.batch_size

            chosen_samples = samples.choose_active(deploy_net, dataset, batches_to_choose)
            active_samples = chosen_samples + list(epoch_used_samples)
            epoch_used_samples.update(chosen_samples)

            print 'Used {} samples'.format(len(epoch_used_samples))
            # check if it makes sense to continue
            iters_to_do = len(active_samples) / deploy_net.batch_size
            if iters_to_do == 0:
                break

            db.extract_samples(train_db_path, active_db_path, active_samples)
            proto.increase_max_iters(active_solver_path, iters_to_do)
            train_network(active_solver_path, solverstate_path)

            snapshot_iter += iters_to_do
            utils.append_to_file(epoch_file, '{}:{}'.format(snapshot_iter, len(epoch_used_samples)))
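
Finally, a hedged sketch of how learn might be driven from a small wrapper script; the solver and snapshot paths and the numeric arguments are purely illustrative:

if __name__ == '__main__':
    # hypothetical paths and illustrative hyper-parameters
    learn(solver_path='models/mnist/solver.prototxt',
          snapshot_path='snapshots/mnist',
          iters_to_init=100,
          max_samples_to_use=10000)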