Example #1
def make_data_layer_2():
    data_set = {}
    
    data_set['train'] = 'data/imagernn/train_indices.npy'
    data_set['val'] = 'data/imagernn/valid_indices.npy'
    
    for phase in ['train', 'val']:
        print 'Starting %s' % phase
        db_name = 'data/imagernn/fc7_%s_lmdb' % phase
        subprocess.call(['rm', '-rf', db_name])
        
     
        env = lmdb.open(db_name, map_size=2147483648*8)
        data = np.load(data_set[phase])
        with env.begin(write=True) as txn:
            for var in xrange(data.shape[0]):
                
                if (var%1000 == 0):
                    print "%08d %08d %02.2f %s"%(var,data.shape[0],float(var)/data.shape[0],time.ctime())
                datum = Datum()
                datum.channels = 4096
                datum.width = 1
                datum.height = 1
                
                for j in xrange(4096):
                    datum.float_data.append(data[var,j])
                
                key = "%010d"%(var)
                txn.put(key, datum.SerializeToString())
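
A minimal sketch of how the resulting LMDB could be read back to sanity-check a few entries; it assumes the database was written as above and that Caffe's generated protobuf module (caffe.proto.caffe_pb2) is importable:

import lmdb
import numpy as np
from caffe.proto.caffe_pb2 import Datum

def check_fc7_lmdb(db_name='data/imagernn/fc7_train_lmdb', limit=5):
    # Open the environment read-only and walk the first few records.
    env = lmdb.open(db_name, readonly=True)
    with env.begin() as txn:
        for i, (key, value) in enumerate(txn.cursor()):
            if i >= limit:
                break
            datum = Datum()
            datum.ParseFromString(value)
            feat = np.array(datum.float_data, dtype=np.float32)
            print('%s: %d floats (channels=%d)' % (key, feat.shape[0], datum.channels))
    env.close()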
Example #2
def make_data(param):
    for phase in ['train', 'valid']:
        print 'Starting %s' % phase
        db_name = 'data/imagernn/lm_%s_db' % phase
        subprocess.call(['rm', '-rf', db_name])
        env = lmdb.open(db_name, map_size=2147483648*8)

        def vocab_transform(target_input):
            def t_foo(x):
                return x if x < param['unknown_symbol'] else param['unknown_symbol']

            target_line = [t_foo(int(x)) for x in target_input.split(' ')[:param['maximum_length']]]
            #add end symbol
            target_line.append(param['end_symbol'])
            
            target_line = target_line[:param['maximum_length']] + \
                          [param['zero_symbol']] * (param['maximum_length'] - len(target_line[:param['maximum_length']]))
            assert len(target_line) == param['maximum_length']
            return target_line

        allX = []
        with open('data/imagernn/%s_indices.txt' % phase, 'r') as f1:
            for en in f1.readlines():
                allX.append(vocab_transform(en))

        print 'Writing %s sentences' % len(allX)
        
        feature_np_in = 'data/imagernn/%s_indices.npy'% phase
        feature_np = np.load(feature_np_in)
        
        print "dim %d"%(2 * param['maximum_length'] + param['image_feature_length'])
        with env.begin(write=True) as txn:
            for i, target_line in enumerate(allX):
                if (i%1000 == 0):
                    print "%08d %08d %02.2f %s"%(i,len(allX),float(i)/len(allX),time.ctime())
                #if (i == 3000):
                #    break
                datum = Datum()
                datum.channels = 2 * param['maximum_length'] + param['image_feature_length']
                datum.width = 1
                datum.height = 1
                for j in range(param['image_feature_length']):
                    datum.float_data.append(feature_np[i,j])
                for j in range(param['maximum_length']):
                    if j == 0:
                        datum.float_data.append(param['start_symbol'])
                    else:
                        datum.float_data.append(target_line[j - 1])
                for j in range(param['maximum_length']):
                    datum.float_data.append(target_line[j])
                
                assert(len(datum.float_data) == datum.channels)
                key = "%08d"%(i)
                txn.put(key, datum.SerializeToString())
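
The param dictionary itself is not part of the snippet; a hypothetical set of values consistent with how the function uses it could look like this (every number below is illustrative, not taken from the original code):

# Illustrative values only; the real IDs come from the vocabulary used to build the index files.
param = {
    'maximum_length': 30,          # sentences are truncated / padded to this many tokens
    'image_feature_length': 4096,  # fc7 feature size, matching Example #1
    'start_symbol': 1,             # prepended as the first decoder input
    'end_symbol': 2,               # appended by vocab_transform
    'zero_symbol': 0,              # padding token
    'unknown_symbol': 9999,        # token ids >= this value are clipped to it
}
make_data(param)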
Example #3
def make_data(param):
    for phase in ['train', 'valid', 'test']:
        print 'Starting %s' % phase
        db_name = './examples/language_model/lm_%s_db' % phase
        #print db_name
        if os.path.exists(db_name):  # the LMDB environment is a directory, not a file
            subprocess.call(['rm', '-rf', db_name])
        env = lmdb.open(db_name, map_size=10485760*64)

        def vocab_transform(target_input):
            def t_foo(x):
                return x if x < param['unknown_symbol'] else param['unknown_symbol']

            target_line = [t_foo(int(x)) for x in target_input.split(' ')[:param['maximum_length']]]

            target_line = target_line[:param['maximum_length']] + \
                          [param['zero_symbol']] * (param['maximum_length'] - len(target_line[:param['maximum_length']]))
            assert len(target_line) == param['maximum_length']
            return target_line

        allX = []
        with open('./data/language_model/%s_indices.txt' % phase, 'r') as f1:
            for en in f1.readlines():
                allX.append(vocab_transform(en))

        print 'Writing %s sentences' % len(allX)

        with env.begin(write=True) as txn:
            for i, target_line in enumerate(allX):
                datum = Datum()
                datum.channels = 2 * param['maximum_length']
                datum.width = 1
                datum.height = 1
                for j in range(param['maximum_length']):
                    if j == 0:
                        datum.float_data.append(param['start_symbol'])
                    else:
                        datum.float_data.append(target_line[j - 1])
                for j in range(param['maximum_length']):
                    datum.float_data.append(target_line[j])
                key = str(i)
                txn.put(key, datum.SerializeToString())
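
For reference, the clipping and padding done by the nested vocab_transform behaves like the standalone sketch below (the parameter values are illustrative):

def vocab_transform_demo(line, maximum_length=5, zero_symbol=0, unknown_symbol=5000):
    # Same logic as vocab_transform above: clip out-of-vocabulary ids, truncate,
    # then right-pad with the zero symbol up to maximum_length.
    ids = [min(int(x), unknown_symbol) for x in line.split(' ')[:maximum_length]]
    ids = ids[:maximum_length] + [zero_symbol] * (maximum_length - len(ids))
    assert len(ids) == maximum_length
    return ids

print(vocab_transform_demo('12 7 80012'))   # -> [12, 7, 5000, 0, 0]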
Example #4
def make_data(param):
    for phase in ["train", "valid", "test"]:
        print "Starting %s" % phase
        db_name = "./examples/language_model/lm_%s_db" % phase
        subprocess.call(["rm", "-rf", db_name])
        env = lmdb.open(db_name, map_size=2147483648 * 8)

        def vocab_transform(target_input):
            def t_foo(x):
                return x if x < param["unknown_symbol"] else param["unknown_symbol"]

            target_line = [t_foo(int(x)) for x in target_input.split(" ")[: param["maximum_length"]]]

            target_line = target_line[: param["maximum_length"]] + [param["zero_symbol"]] * (
                param["maximum_length"] - len(target_line[: param["maximum_length"]])
            )
            assert len(target_line) == param["maximum_length"]
            return target_line

        allX = []
        with open("./data/language_model/%s_indices.txt" % phase, "r") as f1:
            for en in f1.readlines():
                allX.append(vocab_transform(en))

        print "Writing %s sentences" % len(allX)

        with env.begin(write=True) as txn:
            for i, target_line in enumerate(allX):
                datum = Datum()
                datum.channels = 2 * param["maximum_length"]
                datum.width = 1
                datum.height = 1
                for j in range(param["maximum_length"]):
                    if j == 0:
                        datum.float_data.append(param["start_symbol"])
                    else:
                        datum.float_data.append(target_line[j - 1])
                for j in range(param["maximum_length"]):
                    datum.float_data.append(target_line[j])
                key = str(i)
                txn.put(key, datum.SerializeToString())
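
Both variants write every record inside a single write transaction. For very large datasets, one option is to commit in chunks so the transaction stays small; a minimal sketch of that pattern (the chunk size is arbitrary):

def write_in_chunks(env, records, chunk=1000):
    # records: iterable of (key, serialized_value) pairs.
    txn = env.begin(write=True)
    for i, (key, value) in enumerate(records):
        txn.put(key, value)
        if (i + 1) % chunk == 0:
            txn.commit()                  # flush this chunk to disk
            txn = env.begin(write=True)   # start a fresh transaction
    txn.commit()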
Example #5
  def _generate_leveldb(self, file_path, pairs, target, single_data):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired out faces in an efficient way into this format.
    """
    print "\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 250000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(pairs)):
      # Each image pair is a top level key with a keyname like 00000000011, in increasing
      # order starting from 00000000000.
      key = siamese_utils.get_key(idx)

      # Actually expand our images now, taking the index reference and turning it into real
      # image pairs; we delay doing this until now for efficiency reasons, as we will probably
      # have more pairs of images than actual computer memory.
      image_1 = single_data[pairs[idx][0]]
      image_2 = single_data[pairs[idx][1]]
      paired_image = np.concatenate([image_1, image_2])

      # Do things like mean normalize, etc. that happen across both testing and validation.
      paired_image = self._preprocess_data(paired_image)

      # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
      datum = Datum()
      # One channel for each image in the pair.
      datum.channels = 2
      datum.height = constants.HEIGHT
      datum.width = constants.WIDTH
      datum.data = paired_image.tostring()
      datum.label = target[idx]
      value = datum.SerializeToString()
      wb.put(key, value)

      if (idx + 1) % commit_every == 0:
        wb.write()
        del wb
        wb = db.write_batch()
        end_time = int(round(time.time() * 1000))
        total_time = end_time - start_time
        print "Wrote batch, key: %s, time for batch: %d ms" % (key, total_time)
        start_time = int(round(time.time() * 1000))

    wb.write()
    db.close()
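
A minimal sketch of how one of these LevelDB records could be read back and decoded; the dtype depends on what _preprocess_data returns, so it is passed in explicitly, and the height/width arguments stand in for constants.HEIGHT/constants.WIDTH:

import numpy as np
import plyvel
from caffe.proto.caffe_pb2 import Datum

def read_pair(file_path, key, height, width, dtype=np.uint8):
    # Look up a single record and rebuild the stacked image pair.
    db = plyvel.DB(file_path)
    value = db.get(key)
    db.close()
    datum = Datum()
    datum.ParseFromString(value)
    pair = np.frombuffer(datum.data, dtype=dtype).reshape(datum.channels, height, width)
    return pair, datum.label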
Example #6
def _generate_leveldb(file_path, image_paths, targets, width, height):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired out faces in an efficient way into this format.
    """
    print "\t\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 10000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(image_paths)):
      # Each image is a top level key with a keyname like 00000000011, in increasing
      # order starting from 00000000000.
      key = utils.get_key(idx)

      # Do common normalization that might happen across both testing and validation.
      try:
        image = _preprocess_data(_load_numpy_image(image_paths[idx], width, height))
      except Exception:
        print "\t\t\tWarning: Unable to process leveldb image %s" % image_paths[idx]
        continue

      # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
      datum = Datum()
      datum.channels = 3 # RGB
      datum.height = height
      datum.width = width
      datum.data = image.tostring()
      datum.label = targets[idx]
      value = datum.SerializeToString()
      wb.put(key, value)

      if (idx + 1) % commit_every == 0:
        wb.write()
        del wb
        wb = db.write_batch()
        end_time = int(round(time.time() * 1000))
        total_time = end_time - start_time
        print "\t\t\tWrote batch, key: %s, time for batch: %d ms" % (key, total_time)
        start_time = int(round(time.time() * 1000))

    end_time = int(round(time.time() * 1000))
    total_time = end_time - start_time
    print "\t\t\tWriting final batch, time for batch: %d ms" % total_time
    wb.write()
    db.close()
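
_load_numpy_image and _preprocess_data are not shown in the snippet; a hypothetical pair of helpers consistent with the 3-channel, fixed-size Datum written above might look like this (the PIL-based loading and the no-op normalization are assumptions, not the original code):

import numpy as np
from PIL import Image

def _load_numpy_image(image_path, width, height):
    # Load, force RGB, resize, and reorder to Caffe's channels x height x width layout.
    img = Image.open(image_path).convert('RGB').resize((width, height))
    arr = np.asarray(img, dtype=np.uint8)   # height x width x 3
    return arr.transpose(2, 0, 1)           # 3 x height x width

def _preprocess_data(image):
    # Placeholder; the original likely subtracts a dataset mean or scales pixel values.
    return image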
Example #7
def _generate_leveldb(self, file_path, image, target, single_data):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired out faces in an efficient way into this format.
    """
    print "\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 250000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(single_data)):
      # Each image is a top level key with a keyname like 00000000011, in increasing
      # order starting from 00000000000.
      key = utils.get_key(idx)

      # Do things like mean normalize, etc. that happen across both testing and validation.
      image = self._preprocess_data(single_data[idx])

      # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
      datum = Datum()
      # TODO(neuberg): Confirm that this is the correct way to setup RGB images for
      # Caffe for our dataset.
      datum.channels = 3
      datum.height = constants.HEIGHT
      datum.width = constants.WIDTH
      datum.data = image.tostring()
      datum.label = target[idx]
      value = datum.SerializeToString()
      wb.put(key, value)

      if (idx + 1) % commit_every == 0:
        wb.write()
        del wb
        wb = db.write_batch()
        end_time = int(round(time.time() * 1000))
        total_time = end_time - start_time
        print "Wrote batch, key: %s, time for batch: %d ms" % (key, total_time)
        start_time = int(round(time.time() * 1000))

    wb.write()
    db.close()
Example #8
def create_lmdb_file(dataset, phase, w2v_dict):

    print 'Starting %s' % phase
    db_name = './examples/language_model/lm_%s_db' % phase
    subprocess.call(['rm', '-rf', db_name])
    env = lmdb.open(db_name, map_size=2147483648*8)

    print 'Writing %s sentences, %s' % (len(dataset), phase)

    last = None
    index = 0
    with env.begin(write=True) as txn:
        for i in range(len(dataset)):
            sentence = dataset[i]

            ##sentence
            datum = Datum()
            datum.channels = 1
            datum.width = 300
            datum.height = len(sentence) - 1  # the last element of the sentence is the label, not a word
            datum.label = int(sentence[-1])

            current = datum.height
            if last and last != current:
                print sys._getframe().f_lineno, "length not equal"
                sys.exit(-1)

            #print sys._getframe().f_lineno, "sentence length:", len(sentence)
            for j in range(0, len(sentence)-1):
                word_idx = sentence[j]
                #print word_idx

                if word_idx in w2v_dict:
                    elem_vector = w2v_dict[word_idx]
                else:
                    elem_vector = w2v_dict[DEF_IDX]
                    
                if len(elem_vector) != W2V_LEN:
                    print sys._getframe().f_lineno, "w2v length not equal 300"
                    sys.exit(-1)

                for elem in elem_vector:
                    datum.float_data.append(elem)
                #datum.float_data += elem_vector


            key = str(i)
            txn.put(key, datum.SerializeToString())

            index += 1

            if index % 100 == 0:
                print "finished num:", index 
                sys.stdout.flush()

            '''
            if index > 2 :
                print '11111111111111111111111111111111111111111111'
                break;
            '''
            last = current

    print 'Writing %s sentences, %s. End' % (len(dataset), phase)
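
w2v_dict, DEF_IDX, and W2V_LEN come from the surrounding script; a hypothetical setup consistent with the 300-dimensional vectors the function expects (the file name and sample sentences are illustrative):

import numpy as np

W2V_LEN = 300   # vector length checked inside create_lmdb_file
DEF_IDX = 0     # fallback index for out-of-vocabulary words

# One 300-d vector per vocabulary index, e.g. loaded from a pre-trained embedding matrix.
embedding_matrix = np.load('word_vectors.npy')   # assumed shape: (vocab_size, 300)
w2v_dict = {i: embedding_matrix[i] for i in range(embedding_matrix.shape[0])}

# Each "sentence" is a list of word indices whose last element is the class label.
dataset = [[4, 17, 9, 1], [8, 3, 22, 0]]
create_lmdb_file(dataset, 'train', w2v_dict)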
Example #9
    db = lmdb.open(args.save_db, max_dbs=2, map_size=1099511627776)
    txn = db.begin(write=True)
    maps = os.listdir(args.submap_dir)
    maps = sorted(maps)

    mean_blob = None
    n = 0
    for cur_map in maps:
        # print "Proccessing {}".format(cur_map)
        key = os.path.splitext(cur_map)[0]

        # try:
        #     value = txn.get(key)
        # except KeyError:
        # Make data blob
        datum = Datum()
        submap = sio.loadmat(os.path.join(args.submap_dir, cur_map))
        submap = submap[args.variable].astype('float')
        if submap.ndim == 3:
            submap = submap.swapaxes(1,2).swapaxes(0,1).astype('float')
            datum.channels, datum.height, datum.width = submap.shape
        else:
            datum.height, datum.width = submap.shape
            datum.channels = 1

        datum.float_data.extend(list(submap.flatten()))
        if mean_blob is None:
            mean_blob = BlobProto()
            mean_blob.height = datum.height
            mean_blob.width = datum.width
            mean_blob.channels = datum.channels