Example #1
def test_casts(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    with x.create('block', Nfile=1, dtype='f8', size=128) as b:
        assert_raises(BigFileError, b.write, 0, numpy.array('aaaaaa'))
        b.write(0, numpy.array(True, dtype='?'))
Example #2
def test_slicing(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    numpy.random.seed(1234)

    # test creating
    with x.create("data", Nfile=1, dtype=('f8', 32), size=128) as b:
        data = numpy.random.uniform(100000, size=(128, 32))
        junk = numpy.random.uniform(100000, size=(128, 32))

        b.write(0, data)

        with x['data'] as b:
            assert_equal(b[:], data)
            assert_equal(b[0], data[0])

        b[:len(junk)] = junk

        with x['data'] as b:
            assert_equal(b[:], junk)
            assert_equal(b[0], junk[0])

        b[3] = data[3]

        with x['data'] as b:
            assert_equal(b[3], data[3])

    shutil.rmtree(fname)
Example #3
def test_pickle(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # test creating
    column = x.create("abc", dtype='f8', size=128)
    
    import pickle
    str = pickle.dumps(column)
    column1 = pickle.loads(str)

    assert type(column) == type(column1)
    assert column.size == column1.size
    assert column.dtype == column1.dtype
    assert column.comm is column1.comm

    column.close()
    str = pickle.dumps(column)
    column1 = pickle.loads(str)

    str = pickle.dumps(x)
    x1 = pickle.loads(str)

    assert type(x) == type(x1)
    assert x1.basename == x.basename

    x.close()
    str = pickle.dumps(x)
    x1 = pickle.loads(str)
    assert tuple(sorted(x1.blocks)) == tuple(sorted(x.blocks))
    shutil.rmtree(fname)
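Since the test above shows that column handles survive pickling even after close() (size and dtype are preserved), a handle can be serialized and handed to another process. The following is a minimal sketch, not part of the test suite; it assumes the fork start method of multiprocessing so the worker function is visible to the pool, and it only inspects the attributes the test verifies.

import pickle
import shutil
import tempfile
from multiprocessing import Pool

from bigfile import BigFile

def describe(payload):
    # Rebuild the column handle in the worker and report the attributes
    # that the pickle round-trip is known to preserve.
    col = pickle.loads(payload)
    return col.size, str(col.dtype)

if __name__ == '__main__':
    fname = tempfile.mkdtemp()
    bf = BigFile(fname, create=True)
    column = bf.create("abc", dtype='f8', size=128)

    with Pool(2) as pool:
        print(pool.map(describe, [pickle.dumps(column)] * 2))

    shutil.rmtree(fname)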
Example #4
def test_attr(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create('.', dtype=None) as b:
        b.attrs['int'] = 128
        b.attrs['float'] = [128.0, 3, 4]
        b.attrs['string'] = 'abcdefg'
        b.attrs['complex'] = 128 + 128J
        b.attrs['bool'] = True
        b.attrs['arrayustring'] = numpy.array(u'unicode')
        b.attrs['arraysstring'] = numpy.array('str')

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 128)
        assert_equal(b.attrs['float'], [128.0, 3, 4])
        assert_equal(b.attrs['string'],  'abcdefg')
        assert_equal(b.attrs['complex'],  128 + 128J)
        assert_equal(b.attrs['bool'],  True)
        b.attrs['int'] = 30
        b.attrs['float'] = [3, 4]
        b.attrs['string'] = 'defg'
        b.attrs['complex'] = 32 + 32J
        b.attrs['bool'] = False

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 30)
        assert_equal(b.attrs['float'], [3, 4])
        assert_equal(b.attrs['string'],  'defg')
        assert_equal(b.attrs['complex'],  32 + 32J)
        assert_equal(b.attrs['bool'],  False)

    shutil.rmtree(fname)
Example #5
def test_append(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    name = 'f4'
    d = numpy.dtype(('f4', 3))
    numpy.random.seed(1234)

    data = numpy.random.uniform(100000, size=(100, 3)).astype('f4')
    # test creating
    with x.create(name, Nfile=3, dtype=d, size=100) as b:
        b.write(0, data)

        b.append(data, Nfile=2)
        with x.open(name) as bb:
            assert bb.size == 200
        assert b.size == 200

    with x.open(name) as b:
        assert b.Nfile == 5
        assert_equal(b[:100], data)
        assert_equal(b[100:], data)
        assert b.size == 200

    shutil.rmtree(fname)
Example #6
def test_attr(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create('.', dtype=None) as b:
        b.attrs['int'] = 128
        b.attrs['float'] = [128.0, 3, 4]
        b.attrs['string'] = 'abcdefg'
        b.attrs['complex'] = 128 + 128J
        b.attrs['bool'] = True
        b.attrs['arrayustring'] = numpy.array(u'unicode')
        b.attrs['arraysstring'] = numpy.array('str')

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 128)
        assert_equal(b.attrs['float'], [128.0, 3, 4])
        assert_equal(b.attrs['string'],  'abcdefg')
        assert_equal(b.attrs['complex'],  128 + 128J)
        assert_equal(b.attrs['bool'],  True)
        b.attrs['int'] = 30
        b.attrs['float'] = [3, 4]
        b.attrs['string'] = 'defg'
        b.attrs['complex'] = 32 + 32J
        b.attrs['bool'] = False

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 30)
        assert_equal(b.attrs['float'], [3, 4])
        assert_equal(b.attrs['string'],  'defg')
        assert_equal(b.attrs['complex'],  32 + 32J)
        assert_equal(b.attrs['bool'],  False)

    shutil.rmtree(fname)
Example #7
def test_casts(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    with x.create('block', Nfile=1, dtype='f8', size=128) as b:
        assert_raises(BigFileError, b.write, 0, numpy.array('aaaaaa'))
        b.write(0, numpy.array(True, dtype='?'))
Example #8
def test_pickle(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # test creating
    column = x.create("abc", dtype='f8', size=128)
    
    import pickle
    str = pickle.dumps(column)
    column1 = pickle.loads(str)

    assert type(column) == type(column1)
    assert column.size == column1.size
    assert column.dtype == column1.dtype
    assert column.comm is column1.comm

    column.close()
    str = pickle.dumps(column)
    column1 = pickle.loads(str)

    str = pickle.dumps(x)
    x1 = pickle.loads(str)

    assert type(x) == type(x1)
    assert x1.basename == x.basename

    x.close()
    str = pickle.dumps(x)
    x1 = pickle.loads(str)
    assert tuple(sorted(x1.blocks)) == tuple(sorted(x.blocks))
    shutil.rmtree(fname)
Example #9
def test_slicing(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    numpy.random.seed(1234)

    # test creating
    with x.create("data", Nfile=1, dtype=('f8', 32), size=128) as b:
        data = numpy.random.uniform(100000, size=(128, 32))
        junk = numpy.random.uniform(100000, size=(128, 32))

        b.write(0, data)

        with x['data'] as b:
            assert_equal(b[:], data)
            assert_equal(b[0], data[0])

        b[:len(junk)] = junk

        with x['data'] as b:
            assert_equal(b[:], junk)
            assert_equal(b[0], junk[0])

        b[3] = data[3]

        with x['data'] as b:
            assert_equal(b[3], data[3])

    shutil.rmtree(fname)
Example #10
def test_attr(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create('.', dtype=None) as b:
        b.attrs['int'] = 128
        b.attrs['float'] = [128.0, 3, 4]
        b.attrs['string'] = 'abcdefg'
        b.attrs['complex'] = 128 + 128J

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 128)
        assert_equal(b.attrs['float'], [128.0, 3, 4])
        assert_equal(b.attrs['string'], 'abcdefg')
        assert_equal(b.attrs['complex'], 128 + 128J)
        b.attrs['int'] = 30
        b.attrs['float'] = [3, 4]
        b.attrs['string'] = 'defg'
        b.attrs['complex'] = 32 + 32J

    with x.open('.') as b:
        assert_equal(b.attrs['int'], 30)
        assert_equal(b.attrs['float'], [3, 4])
        assert_equal(b.attrs['string'], 'defg')
        assert_equal(b.attrs['complex'], 32 + 32J)

    shutil.rmtree(fname)
Example #11
def test_grow(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    d = numpy.dtype(('f4', 3))
    numpy.random.seed(1234)

    data = numpy.random.uniform(100000, size=(100, 3)).astype('f4')
    # test creating
    with x.create(d.str, Nfile=3, dtype=d, size=100) as b:
        b.write(0, data)

        b.grow(size=100, Nfile=2)
        with x.open(d.str) as bb:
            assert bb.size == 200
        b.write(100, data)
        assert b.size == 200

    with x.open(d.str) as b:
        assert b.Nfile == 5
        assert_equal(b[:100], data)
        assert_equal(b[100:], data)
        assert b.size == 200

    shutil.rmtree(fname)
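test_grow and test_append above exercise two related ways of extending a block: grow only enlarges it (the new tail still has to be written), while append enlarges and writes in one call. A minimal sketch of the difference, assuming the same API as the tests:

import shutil
import tempfile

import numpy
from bigfile import BigFile

fname = tempfile.mkdtemp()
x = BigFile(fname, create=True)
data = numpy.arange(100, dtype='f4')

with x.create('demo', Nfile=1, dtype='f4', size=100) as b:
    b.write(0, data)            # fill the initial 100 rows
    b.grow(size=100, Nfile=1)   # block is now 200 rows; the tail is still unwritten
    b.write(100, data)          # fill the grown region explicitly
    b.append(data, Nfile=1)     # grow by len(data) and write it in one step
    assert b.size == 300

shutil.rmtree(fname)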
Example #12
    def __init__(self,
                 collection,
                 vocab_file,
                 feature,
                 language,
                 flag_shuffle=True,
                 method=None,
                 fluency_threshold=DEFAULT_FLUENCY_U,
                 rootpath=ROOT_PATH):
        self.language = language
        self.anno_file_path = utility.get_sent_file(collection, language,
                                                    rootpath)
        self.fluency_threshold = fluency_threshold
        self.method = method
        if method:
            self.sent_score_file = utility.get_sent_score_file(
                collection, language, rootpath)
            assert method in ['sample', 'filter', 'weighted']
            assert self.sent_score_file != None
            assert fluency_threshold > 0
            if method == 'weighted':
                # Not sampling the data if fluency-guided method is weighted_loss
                self.method = method = None
        else:
            self.sent_score_file = None

        self.textbank = TextBank(vocab_file)
        assert self.textbank.vocab[TOKEN_PAD] == 0
        self.vf_reader = BigFile(
            utility.get_feat_dir(collection, feature, rootpath))
        self.vf_names = set(self.vf_reader.names)
        self.vf_size = self.vf_reader.ndims
        self.flag_shuffle = flag_shuffle
        self._load_data()
Example #13
def SaveSnapshot(comm, filename, P, blocks=None):
    file = BigFile(filename)
    if blocks is None:
        blocks = P.keys()
    for key in blocks:
        # hack, skip scalar mass
        if numpy.isscalar(P[key]): 
            continue
        file.mpi_create_from_data(comm, '1/%s' % key, P[key])    
Example #14
def SaveSnapshot(comm, filename, P, blocks=None):
    file = BigFile(filename)
    if blocks is None:
        blocks = P.keys()
    for key in blocks:
        # hack, skip scalar mass
        if numpy.isscalar(P[key]):
            continue
        file.mpi_create_from_data(comm, '1/%s' % key, P[key])
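A hedged usage sketch for the SaveSnapshot helper above. The particle dictionary, field names, and output path are illustrative assumptions; scalar entries such as the mass are skipped by the helper itself, and the collective write goes through mpi_create_from_data as shown.

import numpy
from mpi4py import MPI

comm = MPI.COMM_WORLD

# Hypothetical particle data; every rank passes its local slice.
P = {
    'Position': numpy.random.uniform(0, 1000.0, size=(100, 3)),
    'Velocity': numpy.zeros((100, 3)),
    'Mass': 1.0,   # scalar, skipped inside SaveSnapshot
}

# Assumes 'snapshot_000' already exists as a bigfile container on disk.
SaveSnapshot(comm, 'snapshot_000', P)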
Example #15
def test_closed(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')
    x.close()
    assert x.blocks == []
    try:
        h = x['.']
    except BigFileClosedError:
        pass
Example #16
def test_passby(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # half floats are pass-through types, no casting is supported
    data = numpy.array([3.0, 5.0], dtype='f2')
    with x.create('block', Nfile=1, dtype='f2', size=128) as b:
        b.write(0, data)
        assert_equal(b[:2], data)
        assert_raises(BigFileError, b.write, 0, numpy.array((30, 20.)))
Example #17
def test_passby(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # half floats are pass-through types, no casting is supported
    data = numpy.array([3.0, 5.0], dtype='f2')
    with x.create('block', Nfile=1, dtype='f2', size=128) as b:
        b.write(0, data)
        assert_equal(b[:2], data)
        assert_raises(BigFileError, b.write, 0, numpy.array((30, 20.)))
Example #18
def test_attr_objects(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create('block', dtype=None) as b:
        def set_obj1():
            b.attrs['objects'] = numpy.array([object()])
        assert_raises(ValueError, set_obj1);
        def set_obj_scalar():
            b.attrs['objects'] = object()
        assert_raises(ValueError, set_obj_scalar);
    shutil.rmtree(fname)
Example #19
def test_dataset(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for name, d in dtypes:
        dt = numpy.dtype(d)
        numpy.random.seed(1234)
        # test creating
        with x.create(name, Nfile=1, dtype=dt, size=128) as b:
            data = numpy.random.uniform(100000, size=128*128).view(dtype=b.dtype.base).reshape([-1] 
                    + list(dt.shape))[:b.size]
            b.write(0, data)

    bd = Dataset(x)
    assert set(bd.dtype.names) == set(x.blocks)
    assert isinstance(bd[:], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert_equal(len(bd['f8'].dtype), 0)
    # tuple of one item is the same as non-tuple
    assert isinstance(bd[('f8',)], BigBlock)
    assert_equal(len(bd[('f8',)].dtype), 0)

    assert isinstance(bd['f8', :10], numpy.ndarray)
    assert_equal(len(bd['f8', :10]), 10)
    assert_equal(len(bd['f8', :10].dtype), 0)
    assert_equal(len(bd[['f8',], :10].dtype), 1)

    # tuple of one item is the same as non-tuple
    assert_equal(len(bd[('f8',), :10].dtype), 0)
    assert isinstance(bd[:10, 'f8'], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert isinstance(bd[['f8', 'f4'],], Dataset)
    assert_equal(len(bd[['f8', 'f4'],].dtype), 2)
    assert isinstance(bd[['f8', 'f4'], :10], numpy.ndarray)

    for name, d in dtypes:
        assert_array_equal(x[name][:], bd[:][name])

    data1 = bd[:10]
    data2 = bd[10:20]

    bd[:10] = data2
    assert_array_equal(bd[:10], data2)

    bd[10:20] = data1
    assert_array_equal(bd[:10], data2)
    assert_array_equal(bd[10:20], data1)

    bd.append(data1)
    assert bd.size == 128 + 10
    assert_array_equal(bd[-10:], data1)

    shutil.rmtree(fname)
Example #20
def test_attr_objects(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    with x.create('block', dtype=None) as b:
        def set_obj1():
            b.attrs['objects'] = numpy.array([object()])
        assert_raises(ValueError, set_obj1);
        def set_obj_scalar():
            b.attrs['objects'] = object()
        assert_raises(ValueError, set_obj_scalar);
    shutil.rmtree(fname)
Example #21
def test_create(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for d in dtypes:
        d = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(d.str, Nfile=1, dtype=d, size=128) as b:
            data = numpy.random.uniform(100000, size=128*128).view(dtype=b.dtype.base).reshape([-1] + list(d.shape))[:b.size]
            b.write(0, data)

        with x[d.str] as b:
            assert_equal(b[:], data.astype(d.base))
            assert_equal(b[:],  b[...])

        # test creating
        data = numpy.random.uniform(100000, size=128*128).view(dtype=d.base).reshape([-1] + list(d.shape))[:128]
        with x.create_from_array(d.str, data) as b:
            pass

        with x[d.str] as b:
            assert_equal(b[:], data)

        # test writing with an offset
        with x[d.str] as b:
            b.write(1, data[0:1])
            assert_equal(b[1:2], data[0:1].astype(d.base))

        # test writing beyond file length
        with x[d.str] as b:
            caught = False
            try:
                b.write(1, data)
            except:
                caught = True
            assert caught
    assert_equal(set(x.blocks), set([numpy.dtype(d).str for d in dtypes]))
    import os
    os.system("ls -r %s" % fname)
    for b in x.blocks:
        assert b in x

    for b in x:
        assert b in x

    bd = BigData(x)
    assert set(bd.dtype.names) == set(x.blocks)
    d = bd[:]

    shutil.rmtree(fname)
Example #22
def get_we(vocab, w2v_dir):
    w2v = BigFile(w2v_dir)
    ndims = w2v.ndims
    nr_words = len(vocab)
    words = [vocab[i] for i in range(nr_words)]
    we = np.random.uniform(low=-1.0, high=1.0, size=(nr_words, ndims))

    renamed, vecs = w2v.read(words)
    for i, word in enumerate(renamed):
        idx = vocab.find(word)
        we[idx] = vecs[i]

    return torch.Tensor(we)
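A short, hypothetical follow-up to get_we above: the returned matrix can seed a trainable embedding layer. The vocab object and word2vec directory are placeholders carried over from the snippet.

import torch.nn as nn

# vocab and w2v_dir are placeholders; get_we is defined above.
we = get_we(vocab, w2v_dir)                        # shape: (nr_words, ndims)
embedding = nn.Embedding.from_pretrained(we, freeze=False)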
Example #23
class Text2W2VEncoder:
    def __init__(self, data_path):
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        print("Text2W2VEncoder", "vocab_size", vocab_size, "dim", self.ndims)

    def encode(self, words):
        renamed, vectors = self.w2v.read(words)

        if len(vectors) > 0:
            vec = np.array(vectors).mean(axis=0)
        else:
            vec = np.zeros([self.ndims])
        return torch.Tensor(vec)
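A hypothetical usage of the encoder above: average the word2vec vectors of a tokenised sentence. The feature directory path is a placeholder.

encoder = Text2W2VEncoder('word2vec/flickr')       # placeholder bigfile directory
sentence_vec = encoder.encode('a dog runs on the beach'.split())
print(sentence_vec.shape)                          # torch.Size([ndims])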
Example #24
class W2Vec(Txt2Vec):
    def __init__(self, data_path, norm=0, clean=True):
        super(W2Vec, self).__init__(data_path, norm, clean)
        self.w2v = BigFile(data_path)
        vocab_size, self.ndims = self.w2v.shape()
        logger.info('vob size: %d, vec dim: %d' % (vocab_size, self.ndims))

    def _encoding(self, words):
        renamed, vectors = self.w2v.read(words)

        if len(vectors) > 0:
            vec = np.array(vectors).mean(axis=0)
        else:
            vec = np.zeros(self.ndims, )
        return vec
Example #25
class VisionDataset(torch.utils.data.Dataset):
    def __init__(self, filename):
        self.vis_feat_file = BigFile(filename)
        self.vis_ids = self.vis_feat_file.names

    def __getitem__(self, index):
        vis_tensor = self.vis_feat_file.read_one(self.vis_ids[index])
        return self.vis_ids[index], torch.Tensor(vis_tensor)

    def get_by_name(self, name):
        vis_tensor = self.vis_feat_file.read_one(name)
        return torch.Tensor(vis_tensor)

    def __len__(self):
        return len(self.vis_ids)
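A hedged sketch that batches the VisionDataset above with a standard PyTorch DataLoader; the feature directory is a placeholder.

from torch.utils.data import DataLoader

dataset = VisionDataset('FeatureData/resnet152')   # placeholder path
loader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=2)

for vis_ids, vis_tensors in loader:
    # vis_ids is a batch of names, vis_tensors a (batch, ndims) float tensor
    print(len(vis_ids), vis_tensors.shape)
    break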
Example #26
def test_create(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for d in dtypes:
        d = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(d.str, Nfile=1, dtype=d, size=128) as b:
            data = numpy.random.uniform(100000, size=128 * 128).view(
                dtype=b.dtype.base).reshape([-1] + list(d.shape))[:b.size]
            b.write(0, data)

        with x[d.str] as b:
            assert_equal(b[:], data.astype(d.base))
            assert_equal(b[:], b[...])

        # test writing with an offset
        with x[d.str] as b:
            b.write(1, data[0:1])
            assert_equal(b[1:2], data[0:1].astype(d.base))

        # test writing beyond file length
        with x[d.str] as b:
            caught = False
            try:
                b.write(1, data)
            except:
                caught = True
            assert caught
    assert_equal(set(x.blocks), set([numpy.dtype(d).str for d in dtypes]))
    import os
    os.system("ls -r %s" % fname)
    for b in x.blocks:
        assert b in x

    for b in x:
        assert b in x

    bd = BigData(x)
    assert set(bd.dtype.names) == set(x.blocks)
    d = bd[:]

    shutil.rmtree(fname)
Example #27
def calc_mf_each_bf(dir_name: str, bf: BigFile):
    """Calculate the halo, stellar, and black hole mass functions from a single BigFile catalogue."""
    header = bf.open('Header')
    redshift = 1. / header.attrs['Time'][0] - 1.

    bhmass = bf.open('5/BlackholeMass')[:] * TO_MSUN
    halomass = bf.open('FOFGroups/Mass')[:] * TO_MSUN
    starmass = bf.open('FOFGroups/MassByType')[:][:, 4] * TO_MSUN

    halo_mf = mass_function(halomass, HALO_MIN, HALO_MAX, N_BIN, BOXSIZE)
    star_mf = mass_function(starmass, STAR_MIN, STAR_MAX, N_BIN, BOXSIZE)
    bh_mf = mass_function(bhmass, BH_MIN, BH_MAX, N_BIN, BOXSIZE)

    print('    Saving MFs at z = %0.4f' % redshift)
    np.save('{}halo_mf_%0.4f'.format(dir_name) %redshift, halo_mf)
    np.save('{}star_mf_%0.4f'.format(dir_name) %redshift, star_mf)
    np.save('{}bh_mf_%0.4f'.format(dir_name) %redshift, bh_mf)
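A hypothetical driver loop for calc_mf_each_bf above, globbing PIG catalogues the same way find_redshift does later in this listing; all paths are placeholders.

import glob
import os

from bigfile import BigFile

out_dir = 'mass_functions/'          # placeholder output prefix
os.makedirs(out_dir, exist_ok=True)

for pig in sorted(glob.glob(os.path.join('output', 'PIG_*'))):
    calc_mf_each_bf(out_dir, BigFile(pig))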
Example #28
def test_threads(comm):
    # This test shall not core dump
    # raise many errors here and there on many threads

    from threading import Thread, Event
    import gc
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    b = x.create("Threading", Nfile=1, dtype='i8', size=128)

    old = gc.get_threshold()

    gc.set_threshold(1, 1, 1)
    E = Event()
    def func(i, b):
        E.wait()
        x['.'].attrs['v3'] = [1, 2, 3]
        err = 0
        for j in range(100 * i):
            try:
                with pytest.raises(BigFileError):
                    b.attrs['v 3'] = ['a', 'bb', 'ccc']

                b.write(0, numpy.ones(128))
            except BigBlockClosedError:
                err = err + 1

        b.close()

        x['Threading'].attrs['v3'] = [1, 2, 3]

    t = []
    for i in range(4):
        t.append(Thread(target = func, args=(i, b)))

    for i in t: i.start()

    E.set()

    for i in t: i.join()

    gc.set_threshold(*old)
    shutil.rmtree(fname)
Example #29
def test_threads(comm):
    # This test shall not core dump
    # raise many errors here and there on many threads

    from threading import Thread, Event
    import gc
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    b = x.create("Threading", Nfile=1, dtype='i8', size=128)

    old = gc.get_threshold()

    gc.set_threshold(1, 1, 1)
    E = Event()
    def func(i, b):
        E.wait()
        x['.'].attrs['v3'] = [1, 2, 3]
        err = 0
        for j in range(100 * i):
            try:
                with pytest.raises(BigFileError):
                    b.attrs['v 3'] = ['a', 'bb', 'ccc']

                b.write(0, numpy.ones(128))
            except BigBlockClosedError:
                err = err + 1

        b.close()

        x['Threading'].attrs['v3'] = [1, 2, 3]

    t = []
    for i in range(4):
        t.append(Thread(target = func, args=(i, b)))

    for i in t: i.start()

    E.set()

    for i in t: i.join()

    gc.set_threshold(*old)
    shutil.rmtree(fname)
Example #30
    def __init__(self,
                 collection,
                 vocab_file,
                 feature,
                 language,
                 flag_shuffle=False,
                 fluency_threshold=DEFAULT_FLUENCY_U,
                 rootpath=ROOT_PATH):
        self.language = language
        self.anno_file_path = utility.get_sent_file(collection, language,
                                                    rootpath)
        self.fluency_threshold = fluency_threshold
        self.textbank = TextBank(vocab_file)
        assert self.textbank.vocab[TOKEN_PAD] == 0
        self.vf_reader = BigFile(
            utility.get_feat_dir(collection, feature, rootpath))
        self.vf_names = set(self.vf_reader.names)
        self.vf_size = self.vf_reader.ndims
        self.flag_shuffle = flag_shuffle
        self._load_data()
Example #31
    def __init__(self,
                 collections,
                 concept_files,
                 feature,
                 batch_size=100,
                 rootpath=ROOT_PATH):
        assert (len(collections) == len(concept_files))
        self.batch_size = batch_size
        self.feat_file = BigFile(
            os.path.join(rootpath, collections[0], 'FeatureData', feature))
        self.label_set = LabelSet(collections[0], concept_files[0], rootpath)
        self.aux_label_set = None

        if len(collections) > 1:
            self.aux_label_set = LabelSet(collections[1], concept_files[1],
                                          rootpath)

        self.img_ids = sorted(self.label_set.im2labels.keys())
        self.num_labels = self.label_set.num_labels
        self.aux_num_labels = self.aux_label_set.num_labels if self.aux_label_set else 0
        self.update()
Example #32
    def __init__(self, collection, vocab_file, feature, language,
                 flag_shuffle=False, fluency_threshold=DEFAULT_FLUENCY_U, rootpath=ROOT_PATH):
        self.language = language
        self.anno_file_path = utility.get_sent_file(collection, language, rootpath)
        self.fluency_threshold = fluency_threshold
        self.textbank = TextBank(vocab_file)
        assert self.textbank.vocab[TOKEN_PAD] == 0
        self.vf_reader = BigFile(utility.get_feat_dir(collection, feature, rootpath))
        self.vf_names = set(self.vf_reader.names)
        self.vf_size = self.vf_reader.ndims
        self.flag_shuffle = flag_shuffle
        self._load_data()
Example #33
def test_string(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # test creating
    with x.create("Header", Nfile=1, dtype=None, size=128) as b:
        b.attrs['v3'] = ['a', 'bb', 'ccc']
        b.attrs['v32'] = [
                            ['a', 'bb', 'ccc'],
                            ['1', '22', '333'],]

        b.attrs['s'] = 'abc'
        b.attrs['l'] = 'a' * 65536

    with x.open("Header") as b:
        assert_equal(b.attrs['v3'], ['a', 'bb', 'ccc'])
        assert_equal(b.attrs['v32'], ['a', 'bb', 'ccc', '1', '22', '333'])
        assert_equal(b.attrs['s'], 'abc')
        assert_equal(b.attrs['l'], 'a' * 65536)

    shutil.rmtree(fname)
Example #34
def ReadIC(filename):
    # this reads in a MP-Gadget3/GENIC format IC
    # major thing is to scale vel by a0**1.5
    file = BigFile(filename)
    header = file.open('header')
    BoxSize = header.attrs['BoxSize'][0]
    a0 = header.attrs['Time'][0]

    Ntot = file.open('1/ID').size
    myslice = slice(
            MPI.COMM_WORLD.rank * Ntot // MPI.COMM_WORLD.size,
            (MPI.COMM_WORLD.rank + 1) * Ntot // MPI.COMM_WORLD.size,
            )
    P = dict()
    P['Mass'] = header.attrs['MassTable'][1]
    P['Position'] = file.open('1/Position')[myslice] 
    P['Velocity'] = file.open('1/Velocity')[myslice] 
    P['Velocity'] *= a0 ** 1.5
    P['ID'] = file.open('1/ID')[myslice] 
    
    return P, BoxSize, a0
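An illustrative call to ReadIC above; 'IC' is a placeholder path. Every MPI rank receives its own slice of the particles, with velocities already rescaled by a0**1.5.

P, BoxSize, a0 = ReadIC('IC')        # placeholder path to a GENIC-format bigfile
print(BoxSize, a0, P['Position'].shape)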
Example #35
def test_string(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    # test creating
    with x.create("Header", Nfile=1, dtype=None, size=128) as b:
        b.attrs['v3'] = ['a', 'bb', 'ccc']
        b.attrs['v32'] = [
                            ['a', 'bb', 'ccc'],
                            ['1', '22', '333'],]

        b.attrs['s'] = 'abc'
        b.attrs['l'] = 'a' * 65536

    with x.open("Header") as b:
        assert_equal(b.attrs['v3'], ['a', 'bb', 'ccc'])
        assert_equal(b.attrs['v32'], ['a', 'bb', 'ccc', '1', '22', '333'])
        assert_equal(b.attrs['s'], 'abc')
        assert_equal(b.attrs['l'], 'a' * 65536)

    shutil.rmtree(fname)
Example #36
def process(options, feat_dir, imsetfile, result_dir):

    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    imset = map(str.strip, open(imsetfile).readlines())
    print "requested", len(imset)

    feat_file = BigFile(feat_dir)
    
    makedirsforfile(resultfile)
    fw = open(resultfile, 'wb')

    done = []
    start = 0
  
    while start < len(imset):
        end = min(len(imset), start + options.blocksize)
        printStatus(INFO, 'processing images from %d to %d' % (start, end-1))
        toread = imset[start:end]
        if len(toread) == 0:
            break
        renamed, vectors = feat_file.read(toread)
        for vec in vectors:
            vec = np.array(vec, dtype=np.float32)
            vec.tofile(fw)
        done += renamed
        start = end
    fw.close()

    assert(len(done) == len(set(done)))
    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))
        fw.close()
    
    with open(os.path.join(result_dir,'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(done), feat_file.ndims))
        fw.close()
    print '%d requested, %d obtained' % (len(imset), len(done))
Example #37
def test_bigdata(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for d in dtypes:
        dt = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(str(d), Nfile=1, dtype=dt, size=128) as b:
            data = numpy.random.uniform(100000, size=128 * 128).view(
                dtype=b.dtype.base).reshape([-1] + list(dt.shape))[:b.size]
            b.write(0, data)

    bd = BigData(x)
    assert set(bd.dtype.names) == set(x.blocks)
    assert isinstance(bd[:], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert_equal(len(bd['f8'].dtype), 0)
    # tuple of one item is the same as non-tuple
    assert isinstance(bd[('f8', )], BigBlock)
    assert_equal(len(bd[('f8', )].dtype), 0)

    assert isinstance(bd['f8', :10], numpy.ndarray)
    assert_equal(len(bd['f8', :10]), 10)
    assert_equal(len(bd['f8', :10].dtype), 0)
    assert_equal(len(bd[[
        'f8',
    ], :10].dtype), 1)

    # tuple of one item is the same as non-tuple
    assert_equal(len(bd[('f8', ), :10].dtype), 0)
    assert isinstance(bd[:10, 'f8'], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert isinstance(bd[['f8', 'f4'], ], BigData)
    assert_equal(len(bd[['f8', 'f4'], ].dtype), 2)
    assert isinstance(bd[['f8', 'f4'], :10], numpy.ndarray)

    shutil.rmtree(fname)
Example #38
def test_closed(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')
    x.close()
    assert x.blocks == []
    try:
        h = x['.']
    except BigFileClosedError:
        pass
    try:
        x.refresh()
    except BigFileClosedError:
        pass
Example #39
def test_file_large_attr(comm):
    import os.path
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    data = numpy.ones(1024 * 128 * 8, dtype='f8')

    with x['.'] as bb:
        bb.attrs['value'] = data

    with x['.'] as bb:
        assert_equal(bb.attrs['value'], data)

    shutil.rmtree(fname)
Example #40
def test_fileattr(comm):
    import os.path
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    assert not os.path.exists(os.path.join(fname, 'attr-v2'))
    assert not os.path.exists(os.path.join(fname, '000000'))
    with x['.'] as bb:
        bb.attrs['value'] = 1234
        assert bb.attrs['value'] == 1234
    assert not os.path.exists(os.path.join(fname, 'header'))
    assert os.path.exists(os.path.join(fname, 'attr-v2'))

    shutil.rmtree(fname)
Example #41
def find_redshift(redshift, directory, pig=True):
    """Find a snapshot at a given redshift from a directory list. Returns snapshot number."""
    if pig:
        fname = "PIG_*"
    else:
        fname = "PART_*"
    globs = glob.glob(os.path.join(directory, fname))
    for gg in globs:
        bf = BigFile(gg)
        rr = 1/bf['Header'].attrs['Time']-1
        if np.abs(rr - redshift) < 0.05:
            return gg
        del bf
    return None
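A short usage sketch for find_redshift above; the output directory is a placeholder.

snap = find_redshift(2.0, 'output', pig=True)      # placeholder directory
if snap is not None:
    print('z = 2 catalogue found at', snap)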
Example #42
def test_create_odd(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    
    d = numpy.dtype('f4')
    numpy.random.seed(1234)

    # test creating
    with x.create(d.str, Nfile=3, dtype=d, size=455**3) as b:
        data = numpy.random.uniform(100000, size=455**3).astype(d)
        b.write(0, data)

    import os
    os.system("ls -r %s" % fname)
    for b in x.blocks:
        assert b in x

    for b in x:
        assert b in x

    shutil.rmtree(fname)
Example #43
def test_create_odd(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    
    d = numpy.dtype('f4')
    numpy.random.seed(1234)

    # test creating
    with x.create(d.str, Nfile=3, dtype=d, size=455**3) as b:
        data = numpy.random.uniform(100000, size=455**3).astype(d)
        b.write(0, data)

    import os
    os.system("ls -r %s" % fname)
    for b in x.blocks:
        assert b in x

    for b in x:
        assert b in x

    shutil.rmtree(fname)
Example #44
def test_bigdata(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for d in dtypes:
        dt = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(str(d), Nfile=1, dtype=dt, size=128) as b:
            data = numpy.random.uniform(100000, size=128*128).view(dtype=b.dtype.base).reshape([-1] + list(dt.shape))[:b.size]
            b.write(0, data)

    bd = BigData(x)
    assert set(bd.dtype.names) == set(x.blocks)
    assert isinstance(bd[:], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert_equal(len(bd['f8'].dtype), 0)
    # tuple of one item is the same as non-tuple
    assert isinstance(bd[('f8',)], BigBlock)
    assert_equal(len(bd[('f8',)].dtype), 0)

    assert isinstance(bd['f8', :10], numpy.ndarray)
    assert_equal(len(bd['f8', :10]), 10)
    assert_equal(len(bd['f8', :10].dtype), 0)
    assert_equal(len(bd[['f8',], :10].dtype), 1)

    # tuple of one item is the same as non-tuple
    assert_equal(len(bd[('f8',), :10].dtype), 0)
    assert isinstance(bd[:10, 'f8'], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)
    assert isinstance(bd[['f8', 'f4'],], BigData)
    assert_equal(len(bd[['f8', 'f4'],].dtype), 2)
    assert isinstance(bd[['f8', 'f4'],:10], numpy.ndarray)

    shutil.rmtree(fname)
Example #45
def plot_bhmf(pig, label=None):
    """Plot a black hole mass function from a FOF table."""
    bf = BigFile(pig)
    redshift = 1/bf['Header'].attrs['Time']-1
    hh = bf['Header'].attrs['HubbleParam']
    lbox = bf['Header'].attrs['BoxSize']/1000/hh
    lfm = getbmf(bf,lbox, hh)
    plt.plot(lfm[0],lfm[1],label=(label or '')+' z=%.1f'%redshift)
    plt.fill_between(lfm[0],lfm[2],lfm[3],alpha=0.2)
    plt.xlabel(r'$\mathrm{log}_{10} [M_{\rm BH}/M_{\odot}]$',fontsize=17)
    plt.ylabel(r'$\mathrm{log}_{10} \phi/[\mathrm{dex}^{-1} \mathrm{Mpc}^{-3}]$',fontsize=15)
    plt.xlim(6,12)
    plt.ylim(-7,-2.5)
    plt.title('BH Mass function',fontsize=15)
    plt.legend(fontsize=15)
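A hedged sketch that overlays the black hole mass functions of two runs with plot_bhmf above and saves the figure; paths and labels are placeholders.

import matplotlib.pyplot as plt

plt.figure(figsize=(6, 5))
plot_bhmf('run_A/PIG_005', label='run A')          # placeholder catalogues
plot_bhmf('run_B/PIG_005', label='run B')
plt.savefig('bhmf_comparison.png', dpi=200)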
Example #46
    def __init__(self,
                 collection,
                 feature,
                 batch_size=100,
                 rootpath=ROOT_PATH):
        self.feat_file = BigFile(
            os.path.join(rootpath, collection, 'FeatureData', feature))
        self.batch_size = batch_size
        self.label_set = None
        self.aux_label_set = None
        self.img_ids = map(
            str.strip,
            open(
                os.path.join(rootpath, collection, 'ImageSets',
                             collection + '.txt')).readlines())
        self.update()
Example #47
def test_blank_attr(comm):
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)

    with x.create("Header", Nfile=1, dtype=None, size=128) as b:
        with pytest.raises(BigFileError):
            b.attrs['v 3'] = ['a', 'bb', 'ccc']

        with pytest.raises(BigFileError):
            b.attrs['v\t3'] = ['a', 'bb', 'ccc']

        with pytest.raises(BigFileError):
            b.attrs['v\n3'] = ['a', 'bb', 'ccc']

    with pytest.raises(BigFileError):
        x.create(" ", Nfile=1, dtype=None, size=128)

    with pytest.raises(BigFileError):
        x.create("\t", Nfile=1, dtype=None, size=128)

    with pytest.raises(BigFileError):
        x.create("\n", Nfile=1, dtype=None, size=128)
    shutil.rmtree(fname)
Example #48
        if newtag != tag:
            raise Exception("rank = %d, tag is %s, on root tag is %s" %
                    (self.comm.rank, tag, newtag))
        self.comm.Barrier()
        if self.comm.rank == 0:
            print(tag)
#bigfile = BigFile(argv[1])
from argparse import ArgumentParser

ap = ArgumentParser()
ap.add_argument("config")
ap.add_argument("input")
ap.add_argument("output")

ns = ap.parse_args()
bigfile = BigFile(ns.input)

world = MPI.COMM_WORLD

if world.rank == 0:
    attrs = bigfile.open("Header").attrs
    HEADER = dict([(i, attrs[i]) for i in attrs])
else:
    HEADER = None
HEADER = world.bcast(HEADER)
BoxSize = HEADER['BoxSize']

# set up some defaults
SMLFACTOR = 1.0
TilePadding = 256
Example #49
import sys
import os

from bigfile import BigFile

os.system("python merge_feat.py f3d toydata,toydata,toydata,toydata2 newdata --rootpath ./ --overwrite 1")

feat_file = BigFile("newdata/FeatureData/f3d")
renamed, vectors = feat_file.read(feat_file.names)

for _id, _vec in zip(renamed, vectors):
    print _id, _vec
Example #50
if __name__ == '__main__':
    from sys import argv

    # this will set the units to
    #
    # time: 980 Myear/h
    # distance: 1 Kpc/h
    # speed: 100 km/s
    # mass: 1e10 Msun /h

    DH = 3e5 / 100.
    G = 43007.1
    H0 = 0.1
    Nmesh = int(argv[2])
    file = BigFile(argv[1])
    header = file.open('header')
    BoxSize = header.attrs['BoxSize'][0]
    a0 = header.attrs['Time'][0]

    Ntot = file.open('1/ID').size

    myslice = slice(
            MPI.COMM_WORLD.rank * Ntot // MPI.COMM_WORLD.size,
            (MPI.COMM_WORLD.rank + 1) * Ntot // MPI.COMM_WORLD.size,
            )

    P = lambda : None

    P.Pos = file.open('1/Position')[myslice] 
    
Example #51
if __name__ == "__main__":
    rootpath = './'
    trainCollection = 'toydata'
    nimages = 2
    feature = 'f1'
    dim = 3

    testCollection = trainCollection
    testset = testCollection
   
    featureDir = os.path.join(rootpath, trainCollection, "FeatureData", feature)
    searcher = simpleknn.load_model(os.path.join(featureDir, "feature.bin"), dim, nimages, os.path.join(featureDir, "id.txt"))
    searcher.set_distance('l2')
    searcher.set_distance('l1')
    print ("[simpleknn] dim=%d, nr_images=%d" % (searcher.get_dim(), searcher.get_nr_images()))


    testfeaturedir = os.path.join(rootpath, testCollection, 'FeatureData', feature)
    testfeaturefile = BigFile(testfeaturedir, dim)
    testset = testfeaturefile.names

    for testid in testset:
        testfeature = testfeaturefile.read_one(testid)
        visualNeighbors = searcher.search_knn(testfeature, max_hits=20000)
        print testid, len(visualNeighbors), " ".join(["%s %.3f" % (v[0],v[1]) for v in visualNeighbors[:3]])

 


Example #52
def main(unused_args):

  length_normalization_factor = FLAGS.length_normalization_factor

  # Load model configuration
  config_path = os.path.join(os.path.dirname(__file__), 'model_conf', FLAGS.model_name + '.py')
  config = utility.load_config(config_path)

  config.trainCollection = FLAGS.train_collection
  config.word_cnt_thr = FLAGS.word_cnt_thr
  config.rootpath = FLAGS.rootpath

  train_collection =  FLAGS.train_collection
  test_collection = FLAGS.test_collection
  overwrite = FLAGS.overwrite
  feature = FLAGS.vf_name


  img_set_file = os.path.join(rootpath, test_collection, 'VideoSets', '%s.txt' % test_collection)
  if not os.path.exists(img_set_file):
      img_set_file = os.path.join(rootpath, test_collection, 'ImageSets', '%s.txt' % test_collection)
  img_list = map(str.strip, open(img_set_file).readlines())

  # have visual feature ready
  FLAGS.vf_dir = os.path.join(rootpath, test_collection, 'FeatureData', feature)
  vf_reader = BigFile(FLAGS.vf_dir)

  textbank = TextBank(utility.get_train_vocab_file(FLAGS))
  config.vocab_size = len(textbank.vocab)
  config.vf_size = int(open(os.path.join(FLAGS.vf_dir, 'shape.txt')).read().split()[1])

  model_dir = utility.get_model_dir(FLAGS)
  output_dir = utility.get_pred_dir(FLAGS)

  checkpoint_style = FLAGS.checkpoint_style

  if checkpoint_style == 'file':
    #output_per_filename = 'model_perf_in_topk_%d_%s' % (FLAGS.top_k, FLAGS.eval_model_list_file)
    # read validated top models
    validation_output_dir = utility.get_sim_dir(FLAGS)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    eval_model_list_file = os.path.join(validation_output_dir, 'loss_info.txt') #FLAGS.eval_model_list_file)
    shutil.copy(eval_model_list_file, output_dir)
    test_iter_list = []
    for line in open(eval_model_list_file).readlines()[:FLAGS.top_k]:
      iter_current = int(line.strip().split()[0])
      test_iter_list.append(iter_current)

  elif checkpoint_style == 'iter_interval':
    #output_per_filename =  'model_perf_in_%s' % FLAGS.eval_stat
    test_iter_list = range(*[int(x) for x in FLAGS.eval_stat.split("-")])
  elif checkpoint_style == 'iter_num':
    #output_per_filename =  'model_perf_in_iter_%d' % FLAGS.iter_num
    test_iter_list = [FLAGS.iter_num]

  with_image_embedding = True if FLAGS.with_image_embedding != 0 else False
  g = tf.Graph()
  with g.as_default():
    model = InferenceWrapper(config=config,model_dir=model_dir,
                             gpu_memory_fraction=FLAGS.gpu_memory_fraction,
                             gpu=FLAGS.gpu,
                             with_image_embedding=with_image_embedding)
    model.build_model()
  
  for k, iter_n in enumerate(test_iter_list):
    model_path = os.path.join(model_dir, 'variables', 'model_%d.ckpt' % iter_n)
    while not os.path.exists(model_path+'.meta'):
      logger.error('Model path: %s', model_path)
      logger.error('Cannot load model file and exit')
      sys.exit(0)

    top_one_pred_sent_file = os.path.join(output_dir, 'top%d' % k, 'top_one_pred_sent.txt')
    top_n_pred_sent_file = os.path.join(output_dir, 'top%d' % k, 'top_n_pred_sent.txt')
    # perf_file = os.path.join(output_dir, 'model_%d.ckpt' % iter_n, 'perf.txt')

    if os.path.exists(top_one_pred_sent_file) and not overwrite:
      # write existing perf file and print out
      logger.info('%s exists. skip', top_one_pred_sent_file)
      continue

    if not os.path.exists(os.path.split(top_one_pred_sent_file)[0]):
      os.makedirs(os.path.split(top_one_pred_sent_file)[0])

    logger.info('save results to %s', top_one_pred_sent_file)

    # load the trained model
    generator = CaptionGenerator(config, model, length_normalization_factor = length_normalization_factor)
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    config_proto = tf.ConfigProto(
      intra_op_parallelism_threads=FLAGS.ses_threads, gpu_options=gpu_options, allow_soft_placement=True)
    #with  tf.Session(config=config_proto) as session:
      #model.build_model(session, model_path)
    model.load_model(model_path)

    fout_one_sent = codecs.open(top_one_pred_sent_file, 'w','utf-8')
    fout_n_sent = codecs.open(top_n_pred_sent_file, 'w','utf-8')

    for progress,img in enumerate(img_list):
        print(img)
        # predict sentences given a visual feature
        visual_feature = np.array(vf_reader.read_one(img))
        sentences = generator.beam_search( visual_feature, FLAGS.beam_size)

        # output top one sentence info
        sent_score = sentences[0].score
        sent = ' '.join(sentences[0].words)
        fout_one_sent.write(img + ' ' + '%.3f' % sent_score + ' ' + sent + '\n')
        logger.debug(img + ' ' + '%.3f' % sent_score + ' ' + sent)

        # output top n sentences info
        fout_n_sent.write(img)
        for sentence in sentences:
            sent_score = sentence.score
            sent = ' '.join(sentence.words)
            fout_n_sent.write('\t' + '%.3f' % sent_score + '\t' + sent)
        fout_n_sent.write('\n')
      
        if progress % 100 == 0:
          logger.info('%d images decoded' % (progress+1))

    logger.info('%d images decoded' % (progress+1))
 
    fout_one_sent.close()
    fout_n_sent.close()
Example #53
import os, random

import simpleknn
from bigfile import BigFile

rootpath = '/Users/xirong/VisualSearch'
collection = 'train10k'
nr_of_images = 10000
feature = 'color64'
dim = 64

feature_dir = os.path.join(rootpath,collection,'FeatureData',feature)
feature_file = BigFile(feature_dir, dim)
imset = map(str.strip, open(os.path.join(rootpath,collection,'ImageSets','%s.txt'%collection)).readlines())
imset = random.sample(imset, 10)

searcher = simpleknn.load_model(os.path.join(feature_dir, "feature.bin"), dim, nr_of_images, os.path.join(feature_dir, "id.txt"))
searcher.set_distance('l1')
renamed,vectors = feature_file.read(imset)

for name,vec in zip(renamed,vectors):
    visualNeighbors = searcher.search_knn(vec, max_hits=100)
    print name, visualNeighbors[:3]
Example #54
class BucketDataProvider(object):
    """TensorFlow Data Provider with Buckets"""
    def __init__(self, collection, vocab_file, feature, language,
                flag_shuffle=False,  fluency_threshold=DEFAULT_FLUENCY_U, rootpath=ROOT_PATH):
        self.language = language
        self.anno_file_path = utility.get_sent_file(collection, language, rootpath)
        self.fluency_threshold = fluency_threshold
        self.textbank = TextBank(vocab_file)
        assert self.textbank.vocab[TOKEN_PAD] == 0
        self.vf_reader = BigFile(utility.get_feat_dir(collection, feature, rootpath))
        self.vf_names = set(self.vf_reader.names)
        self.vf_size = self.vf_reader.ndims
        self.flag_shuffle = flag_shuffle
        self._load_data()

    def shuffle_data_queue(self):
        random.shuffle(self._data_queue)

    def generate_batches(self, batch_size, buckets):
        """Return a list generator of mini-batches of training data."""
        # create Batches
        batches = []
        for max_seq_len in buckets:
            batches.append(Batch(batch_size, max_seq_len, self.vf_size, self.textbank.vocab[TOKEN_BOS]))
        
        # shuffle if necessary
        if self.flag_shuffle:
            np.random.shuffle(self._data_queue)
        # scan data queue
        for data in self._data_queue:
            # pdb.set_trace()
            sentence = data['sentence']
            # Load visual features
            # print(len(data['image_id']))
            visual_features = np.array(self.vf_reader.read_one(data['image_id']))
            #print("11111111")
            # print (data['image_id'])
            # print(visual_features)
            # print(data['sentence'])
            # sent = self.textbank.decode_tokens(data['sentence'], flag_remove_bos=True)
            # for word in sent:
            #     print (word)
            # # pdb.set_trace()
            if len(sentence) >= buckets[-1]:
                feed_res = batches[-1].feed_and_vomit(visual_features, sentence)
                ind_buc = len(buckets) - 1
            else:
                for (ind_b, batch) in enumerate(batches):
                    if len(sentence) < batch.max_seq_len:
                        feed_res = batches[ind_b].feed_and_vomit(visual_features, sentence)
                        ind_buc = ind_b
                        break
            if feed_res:
                yield (ind_buc,) + feed_res
                batches[ind_buc].empty()

            
    def _load_data(self, verbose=True):
        logger.debug('Loading data')
        self._data_queue = []
        annoss = codecs.open(self.anno_file_path,'r','utf-8').readlines()
        annos = [an.encode('utf-8').decode('utf-8-sig') for an in annoss]

        for (ind_a, line) in enumerate(annos):
            data = {}
            sid, sent = line.strip().split(" ", 1)
            imgid = sid.strip().split("#", 1)[0]
            # print(imgid)
            assert(imgid in self.vf_names)
            # pdb.set_trace()
            # if imgid not in self.vf_names:
            #    print(imgid)
            #    logger.info('%s not in feature data, skipping that.'%imgid)
            #    pdb.set_trace()
            #    continue
            data['image_id'] = imgid
            # print(imgid)
            # # Encode sentences

            tokens = TextTool.tokenize(sent, self.language)
            data['sentence'] = self.textbank.encode_tokens(tokens, flag_add_bos=False)
            self._data_queue.append(data)
            if verbose and (ind_a + 1) % 20000 == 0:
                logger.debug('%d/%d annotation', ind_a + 1, len(annos))
        random.shuffle( self._data_queue )   #       ############################# changed by gxr
        
        nr_of_images = len(set([data['image_id'] for data in self._data_queue]))
        logger.info('%d images, %d sentences from %s', nr_of_images, len(self._data_queue), self.anno_file_path)