Example #1
def _generate_leveldb(file_path, image_paths, targets, width, height):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes face images out to this format efficiently.
    """
    print "\t\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 10000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(image_paths)):
        # Each image is a top level key with a keyname like 00000000011, in increasing
        # order starting from 00000000000.
        key = utils.get_key(idx)

        # Do common normalization that might happen across both testing and validation.
        try:
            image = _preprocess_data(
                _load_numpy_image(image_paths[idx], width, height))
        except Exception:
            print "\t\t\tWarning: Unable to process leveldb image %s" % image_paths[
                idx]
            continue

        # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
        datum = Datum()
        datum.channels = 3  # RGB
        datum.height = height
        datum.width = width
        datum.data = image.tostring()
        datum.label = targets[idx]
        value = datum.SerializeToString()
        wb.put(key, value)

        if (idx + 1) % commit_every == 0:
            wb.write()
            del wb
            wb = db.write_batch()
            end_time = int(round(time.time() * 1000))
            total_time = end_time - start_time
            print "\t\t\tWrote batch, key: %s, time for batch: %d ms" % (
                key, total_time)
            start_time = int(round(time.time() * 1000))

    end_time = int(round(time.time() * 1000))
    total_time = end_time - start_time
    print "\t\t\tWriting final batch, time for batch: %d ms" % total_time
    wb.write()
    db.close()
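
The snippet above leans on helpers that are not shown (utils.get_key, _load_numpy_image, _preprocess_data). The following is a minimal sketch of what they might look like, assuming zero-padded 11-character keys, Pillow for loading, and a pass-through normalization; the real project's implementations may differ.

import numpy as np
from PIL import Image

def get_key(idx):
    # Zero-padded, fixed-width keys sort lexicographically in LevelDB,
    # matching the 00000000000, 00000000001, ... scheme described above.
    return "%011d" % idx

def _load_numpy_image(image_path, width, height):
    # Load, resize, and reorder to the channels-first (C x H x W) layout
    # expected when the raw bytes go into Datum.data.
    img = Image.open(image_path).convert("RGB").resize((width, height))
    arr = np.asarray(img, dtype=np.uint8)  # H x W x C
    return arr.transpose((2, 0, 1))        # C x H x W

def _preprocess_data(image):
    # Stand-in for whatever normalization is shared by training and
    # validation; here it passes the 8-bit array through unchanged.
    return image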
Example #2
    def _generate_leveldb(self, file_path, pairs, target, single_data):
        """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired faces out to this format efficiently.
    """
        print "\tGenerating LevelDB file at %s..." % file_path
        shutil.rmtree(file_path, ignore_errors=True)
        db = plyvel.DB(file_path, create_if_missing=True)
        wb = db.write_batch()
        commit_every = 250000
        start_time = int(round(time.time() * 1000))
        for idx in range(len(pairs)):
            # Each image pair is a top level key with a keyname like 00000000011, in increasing
            # order starting from 00000000000.
            key = siamese_utils.get_key(idx)

            # Actually expand our images now, taking the index reference and turning it into real
            # image pairs; we delay doing this until now for efficiency reasons, as we will probably
            # have more pairs of images than actual computer memory.
            image_1 = single_data[pairs[idx][0]]
            image_2 = single_data[pairs[idx][1]]
            paired_image = np.concatenate([image_1, image_2])

            # Do things like mean normalize, etc. that happen across both testing and validation.
            paired_image = self._preprocess_data(paired_image)

            # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
            datum = Datum()
            datum.channels = 2  # One channel for each image in the pair.
            datum.height = constants.HEIGHT
            datum.width = constants.WIDTH
            datum.data = paired_image.tostring()
            datum.label = target[idx]
            value = datum.SerializeToString()
            wb.put(key, value)

            if (idx + 1) % commit_every == 0:
                wb.write()
                del wb
                wb = db.write_batch()
                end_time = int(round(time.time() * 1000))
                total_time = end_time - start_time
                print "Wrote batch, key: %s, time for batch: %d ms" % (
                    key, total_time)
                start_time = int(round(time.time() * 1000))

        wb.write()
        db.close()
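
To sanity-check what was written, an entry can be read back and split into its two faces. A minimal sketch, assuming the zero-padded key scheme above and that Caffe's generated protobuf module is importable as caffe.proto.caffe_pb2; the database path is hypothetical.

import numpy as np
import plyvel
from caffe.proto.caffe_pb2 import Datum

db = plyvel.DB("data/train_pairs_leveldb")    # hypothetical path
datum = Datum()
datum.ParseFromString(db.get("00000000000"))  # first pair

# The two faces are stacked along the channel axis, so the flat byte
# buffer reshapes to (2, HEIGHT, WIDTH) and splits cleanly.
pair = np.frombuffer(datum.data, dtype=np.uint8).reshape(
    datum.channels, datum.height, datum.width)
image_1, image_2 = pair[0], pair[1]
db.close()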
Example #3
  def _generate_leveldb(self, file_path, pairs, target, single_data):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired faces out to this format efficiently.
    """
    print "\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 250000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(pairs)):
      # Each image pair is a top level key with a keyname like 00000000011, in increasing
      # order starting from 00000000000.
      key = siamese_utils.get_key(idx)

      # Actually expand our images now, taking the index reference and turning it into real
      # image pairs; we delay doing this until now for efficiency reasons, as we will probably
      # have more pairs of images than actual computer memory.
      image_1 = single_data[pairs[idx][0]]
      image_2 = single_data[pairs[idx][1]]
      paired_image = np.concatenate([image_1, image_2])

      # Do things like mean normalize, etc. that happen across both testing and validation.
      paired_image = self._preprocess_data(paired_image)

      # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
      datum = Datum()
      datum.channels = 2  # One channel for each image in the pair.
      datum.height = constants.HEIGHT
      datum.width = constants.WIDTH
      datum.data = paired_image.tostring()
      datum.label = target[idx]
      value = datum.SerializeToString()
      wb.put(key, value)

      if (idx + 1) % commit_every == 0:
        wb.write()
        del wb
        wb = db.write_batch()
        end_time = int(round(time.time() * 1000))
        total_time = end_time - start_time
        print "Wrote batch, key: %s, time for batch: %d ms" % (key, total_time)
        start_time = int(round(time.time() * 1000))

    wb.write()
    db.close()
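
The pairs and target arguments are only indexed here, never built. A hypothetical construction, pairing every two entries of single_data and labelling a pair 1 when the identities match and 0 otherwise (the actual project may sample pairs differently):

import itertools

def build_pairs(labels):
    # labels[i] is the identity of single_data[i]; this is an assumed
    # helper, not part of the original code.
    pairs, target = [], []
    for a, b in itertools.combinations(range(len(labels)), 2):
        pairs.append((a, b))
        target.append(1 if labels[a] == labels[b] else 0)
    return pairs, target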
Example #4
def _generate_leveldb(file_path, image_paths, targets, width, height):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes face images out to this format efficiently.
    """
    print "\t\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 10000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(image_paths)):
      # Each image is a top level key with a keyname like 00000000011, in increasing
      # order starting from 00000000000.
      key = utils.get_key(idx)

      # Do common normalization that might happen across both testing and validation.
      try:
        image = _preprocess_data(_load_numpy_image(image_paths[idx], width, height))
      except Exception:
        print "\t\t\tWarning: Unable to process leveldb image %s" % image_paths[idx]
        continue

      # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
      datum = Datum()
      datum.channels = 3 # RGB
      datum.height = height
      datum.width = width
      datum.data = image.tostring()
      datum.label = targets[idx]
      value = datum.SerializeToString()
      wb.put(key, value)

      if (idx + 1) % commit_every == 0:
        wb.write()
        del wb
        wb = db.write_batch()
        end_time = int(round(time.time() * 1000))
        total_time = end_time - start_time
        print "\t\t\tWrote batch, key: %s, time for batch: %d ms" % (key, total_time)
        start_time = int(round(time.time() * 1000))

    end_time = int(round(time.time() * 1000))
    total_time = end_time - start_time
    print "\t\t\tWriting final batch, time for batch: %d ms" % total_time
    wb.write()
    db.close()
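
A hypothetical call site for the function above, assuming a small list of image paths, integer class labels, and a 227x227 network input size:

image_paths = ["faces/0001.jpg", "faces/0002.jpg"]  # hypothetical files
targets = [0, 1]
_generate_leveldb("data/train_leveldb", image_paths, targets, 227, 227)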
Example #5
def _generate_leveldb(self, file_path, image, target, single_data):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes face images out to this format efficiently.
    """
    print "\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 250000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(image)):
      # Each image is a top level key with a keyname like 00000000011, in increasing
      # order starting from 00000000000.
      key = utils.get_key(idx)

      # Do things like mean normalize, etc. that happen across both testing and validation.
      processed_image = self._preprocess_data(image[idx])

      # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
      datum = Datum()
      # TODO(neuberg): Confirm that this is the correct way to setup RGB images for
      # Caffe for our dataset.
      datum.channels = 3
      datum.height = constants.HEIGHT
      datum.width = constants.WIDTH
      datum.data = processed_image.tostring()
      datum.label = target[idx]
      value = datum.SerializeToString()
      wb.put(key, value)

      if (idx + 1) % commit_every == 0:
        wb.write()
        del wb
        wb = db.write_batch()
        end_time = int(round(time.time() * 1000))
        total_time = end_time - start_time
        print "Wrote batch, key: %s, time for batch: %d ms" % (key, total_time)
        start_time = int(round(time.time() * 1000))

    wb.write()
    db.close()
Example #6
def create_lmdb_file(dataset, phase, w2v_dict):

    print 'Starting %s' % phase
    db_name = './examples/language_model/lm_%s_db' % phase
    subprocess.call(['rm', '-rf', db_name])
    env = lmdb.open(db_name, map_size=2147483648*8)

    print 'Writing %s sentences, %s' % (len(dataset), phase)

    last = None
    index = 0
    with env.begin(write=True) as txn:
        for i in range(len(dataset)):
            sentence = dataset[i]

            # Encode this sentence as a Caffe Datum.
            datum = Datum()
            datum.channels = 1
            datum.width = 300
            datum.height = len(sentence)
            datum.label = int(sentence[-1])

            current = datum.height
            if last and last != current:
                print sys._getframe().f_lineno, "length not equal"
                sys.exit(-1)

            #print sys._getframe().f_lineno, "sentence length:", len(sentence)
            for j in range(0, len(sentence)-1):
                word_idx = sentence[j]
                #print word_idx

                if word_idx in w2v_dict:
                    elem_vector = w2v_dict[word_idx]
                else:
                    elem_vector = w2v_dict[DEF_IDX]
                    
                if len(elem_vector) != W2V_LEN:
                    print sys._getframe().f_lineno, "w2v length not equal 300"
                    sys.exit(-1)

                for elem in elem_vector:
                    datum.float_data.append(elem)
                #datum.float_data += elem_vector


            key = str(i)
            txn.put(key, datum.SerializeToString())

            index += 1

            if index % 100 == 0:
                print "finished num:", index 
                sys.stdout.flush()

            '''
            if index > 2 :
                print '11111111111111111111111111111111111111111111'
                break;
            '''
            last = current

    print 'Writing %s sentences, %s. End' % (len(dataset), phase)
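
Reading an entry back shows the layout: float_data holds one 300-dimensional word2vec vector per word, row by row. A minimal read-back sketch, assuming the 'train' phase database written above and Caffe's generated protobuf module at caffe.proto.caffe_pb2:

import lmdb
import numpy as np
from caffe.proto.caffe_pb2 import Datum

env = lmdb.open('./examples/language_model/lm_train_db', readonly=True)
with env.begin() as txn:
    datum = Datum()
    datum.ParseFromString(txn.get('0'))  # first sentence

# One row per word, 300 floats per row; the class id is in datum.label.
vectors = np.array(datum.float_data, dtype=np.float32).reshape(-1, 300)
label = datum.label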
Example #7
        else:
            datum.height, datum.width = submap.shape
            datum.channels = 1

        datum.float_data.extend(list(submap.flatten()))
        if mean_blob is None:
            mean_blob = BlobProto()
            mean_blob.height = datum.height
            mean_blob.width = datum.width
            mean_blob.channels = datum.channels
            mean_blob.num = 1
            img_mean = submap
        else:
            img_mean += submap

        datum.label = 0
        if not txn.put(key, datum.SerializeToString(), dupdata=False):
            print 'Key {}: failed.'.format(key)

        n += 1
        if n % 1000 == 0:
            txn.commit()
            print "Proccessed {} samples.".format(n)
            txn = db.begin(write=True)

    # commit last batch
    if n % 1000 != 0:
        txn.commit()
        print "Proccessed {} samples.".format(n)
    img_mean /= len(maps)
    print "Totally proccessed {} samples.".format(n)