def test_attr(comm):
    """Round-trip int, list, string and complex attributes on the '.' block.

    The attributes are written on a newly created block, checked after
    reopening, overwritten through the reopened handle, and checked again
    after a second reopen.
    """
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)

    first_pass = [
        ('int', 128),
        ('float', [128.0, 3, 4]),
        ('string', 'abcdefg'),
        ('complex', 128 + 128J),
    ]
    second_pass = [
        ('int', 30),
        ('float', [3, 4]),
        ('string', 'defg'),
        ('complex', 32 + 32J),
    ]

    with bigfile.create('.', dtype=None) as block:
        for key, value in first_pass:
            block.attrs[key] = value

    with bigfile.open('.') as block:
        for key, value in first_pass:
            assert_equal(block.attrs[key], value)
        # Overwrite every attribute with a value of a different size.
        for key, value in second_pass:
            block.attrs[key] = value

    with bigfile.open('.') as block:
        for key, value in second_pass:
            assert_equal(block.attrs[key], value)

    shutil.rmtree(fname)
def test_append(comm):
    """Append to a block and check the size, file count and contents."""
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)

    block_name = 'f4'
    block_dtype = numpy.dtype(('f4', 3))
    numpy.random.seed(1234)
    data = numpy.random.uniform(100000, size=(100, 3)).astype('f4')

    # Create the block with 100 rows, then append the same 100 rows again.
    with bigfile.create(block_name, Nfile=3, dtype=block_dtype, size=100) as block:
        block.write(0, data)
        block.append(data, Nfile=2)

        # A second handle opened while the first is still live sees the
        # grown size, and so does the original handle.
        with bigfile.open(block_name) as other:
            assert other.size == 200
        assert block.size == 200

    with bigfile.open(block_name) as block:
        assert block.Nfile == 5
        assert_equal(block[:100], data)
        assert_equal(block[100:], data)
        assert block.size == 200

    shutil.rmtree(fname)
def test_slicing(comm):
    """Read and assign a block through slice and integer indexing."""
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)
    bigfile.create('.')
    numpy.random.seed(1234)

    shape = (128, 32)
    # Create the block and fill it with the reference data.
    with bigfile.create("data", Nfile=1, dtype=('f8', 32), size=128) as block:
        data = numpy.random.uniform(100000, size=shape)
        junk = numpy.random.uniform(100000, size=shape)
        block.write(0, data)

    with bigfile['data'] as block:
        assert_equal(block[:], data)
        assert_equal(block[0], data[0])
        # Replace the whole content through slice assignment.
        block[:len(junk)] = junk

    with bigfile['data'] as block:
        assert_equal(block[:], junk)
        assert_equal(block[0], junk[0])
        # Restore a single row through integer assignment.
        block[3] = data[3]

    with bigfile['data'] as block:
        assert_equal(block[3], data[3])

    shutil.rmtree(fname)
def __init__(self, collection, vocab_file, feature, language,
             flag_shuffle=True, method=None,
             fluency_threshold=DEFAULT_FLUENCY_U, rootpath=ROOT_PATH):
    """Set up annotation, vocabulary and visual-feature sources, then load data.

    Args:
        collection: collection name passed to the ``utility`` path helpers.
        vocab_file: vocabulary file handed to ``TextBank``.
        feature: feature name used to locate the feature directory.
        language: language passed to the ``utility`` path helpers.
        flag_shuffle: stored on the instance as ``self.flag_shuffle``.
        method: ``None`` or one of ``'sample'``, ``'filter'``, ``'weighted'``.
            ``'weighted'`` is validated and then stored as ``None``.
        fluency_threshold: must be positive when ``method`` is given.
        rootpath: root directory passed to the ``utility`` path helpers.

    Raises:
        AssertionError: if ``method`` is not a supported value, the sentence
            score file cannot be resolved, ``fluency_threshold`` is not
            positive, or the padding token does not map to index 0.
    """
    self.language = language
    self.anno_file_path = utility.get_sent_file(collection, language, rootpath)
    self.fluency_threshold = fluency_threshold
    self.method = method

    if method:
        self.sent_score_file = utility.get_sent_score_file(
            collection, language, rootpath)
        assert method in ['sample', 'filter', 'weighted']
        assert self.sent_score_file is not None
        assert fluency_threshold > 0
        if method == 'weighted':
            # Not sampling the data if fluency-guided method is weighted_loss
            self.method = method = None
    else:
        self.sent_score_file = None

    self.textbank = TextBank(vocab_file)
    # The padding token is required to sit at vocabulary index 0.
    assert self.textbank.vocab[TOKEN_PAD] == 0

    self.vf_reader = BigFile(
        utility.get_feat_dir(collection, feature, rootpath))
    self.vf_names = set(self.vf_reader.names)
    self.vf_size = self.vf_reader.ndims
    self.flag_shuffle = flag_shuffle
    self._load_data()
def test_casts(comm):
    """Check which input dtypes can be written into an 'f8' block.

    Writing a string array must raise ``BigFileError``; writing a boolean
    array must succeed.
    """
    fname = tempfile.mkdtemp()
    try:
        x = BigFile(fname, create=True)

        with x.create('block', Nfile=1, dtype='f8', size=128) as b:
            assert_raises(BigFileError, b.write, 0, numpy.array('aaaaaa'))
            b.write(0, numpy.array(True, dtype='?'))
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even on failure.
        shutil.rmtree(fname)
def test_attr(comm):
    """Round-trip scalar, list, string, complex and bool attributes on '.'.

    Two numpy string arrays are also stored on the first pass; they are
    written but not read back.
    """
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)

    checked_first = [
        ('int', 128),
        ('float', [128.0, 3, 4]),
        ('string', 'abcdefg'),
        ('complex', 128 + 128J),
        ('bool', True),
    ]
    checked_second = [
        ('int', 30),
        ('float', [3, 4]),
        ('string', 'defg'),
        ('complex', 32 + 32J),
        ('bool', False),
    ]

    with bigfile.create('.', dtype=None) as block:
        for key, value in checked_first:
            block.attrs[key] = value
        # Write-only attributes: storing them must simply not fail.
        block.attrs['arrayustring'] = numpy.array(u'unicode')
        block.attrs['arraysstring'] = numpy.array('str')

    with bigfile.open('.') as block:
        for key, value in checked_first:
            assert_equal(block.attrs[key], value)
        for key, value in checked_second:
            block.attrs[key] = value

    with bigfile.open('.') as block:
        for key, value in checked_second:
            assert_equal(block.attrs[key], value)

    shutil.rmtree(fname)
def test_pickle(comm):
    """Pickle and unpickle a block and a file, both open and closed."""
    import pickle

    def roundtrip(obj):
        # Serialize and immediately deserialize the object.
        return pickle.loads(pickle.dumps(obj))

    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)

    # Block, while still open.
    column = bigfile.create("abc", dtype='f8', size=128)
    column_copy = roundtrip(column)
    assert type(column) == type(column_copy)
    assert column.size == column_copy.size
    assert column.dtype == column_copy.dtype
    assert column.comm is column_copy.comm

    # Block, after closing: the round trip only has to succeed.
    column.close()
    column_copy = roundtrip(column)

    # File, while still open.
    file_copy = roundtrip(bigfile)
    assert type(bigfile) == type(file_copy)
    assert file_copy.basename == bigfile.basename

    # File, after closing.
    bigfile.close()
    file_copy = roundtrip(bigfile)
    assert tuple(sorted(file_copy.blocks)) == tuple(sorted(bigfile.blocks))

    shutil.rmtree(fname)
def SaveSnapshot(comm, filename, P, blocks=None):
    """Store the non-scalar entries of ``P`` as blocks named '1/<key>'.

    ``blocks`` selects which keys of ``P`` are written; when it is None all
    keys of ``P`` are used.
    """
    snapshot = BigFile(filename)
    selected = P.keys() if blocks is None else blocks
    for key in selected:
        value = P[key]
        # hack, skip scalar mass
        if not numpy.isscalar(value):
            snapshot.mpi_create_from_data(comm, '1/%s' % key, value)
def test_closed(comm):
    """Check that a closed file reports no blocks and tolerates a lookup.

    Looking up a block on the closed file may raise ``BigFileClosedError``;
    that error is accepted, and so is a lookup that does not raise.
    """
    fname = tempfile.mkdtemp()
    try:
        x = BigFile(fname, create=True)
        x.create('.')
        x.close()
        assert x.blocks == []
        try:
            x['.']
        except BigFileClosedError:
            pass
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even on failure.
        shutil.rmtree(fname)
def test_passby(comm):
    """Check that an 'f2' block accepts 'f2' data and rejects other input.

    Writing half-float data and reading it back must round-trip; writing
    an array of a different dtype must raise ``BigFileError``.
    """
    fname = tempfile.mkdtemp()
    try:
        x = BigFile(fname, create=True)

        # half floats are pass-through types, no casting is supported
        data = numpy.array([3.0, 5.0], dtype='f2')
        with x.create('block', Nfile=1, dtype='f2', size=128) as b:
            b.write(0, data)
            assert_equal(b[:2], data)
            assert_raises(BigFileError, b.write, 0, numpy.array((30, 20.)))
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even on failure.
        shutil.rmtree(fname)
def test_dataset(comm):
    """Exercise ``Dataset`` indexing, slice assignment and append.

    One block per entry of ``dtypes`` is created, then a ``Dataset`` over
    the file is indexed by column name, by row slice, and by both.
    """
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)
    bigfile.create('.')

    for name, spec in dtypes:
        dt = numpy.dtype(spec)
        numpy.random.seed(1234)

        # Create the block and fill it with reinterpreted random bytes.
        with bigfile.create(name, Nfile=1, dtype=dt, size=128) as block:
            raw = numpy.random.uniform(100000, size=128*128)
            typed = raw.view(dtype=block.dtype.base)
            data = typed.reshape([-1] + list(dt.shape))[:block.size]
            block.write(0, data)

    ds = Dataset(bigfile)
    assert set(ds.dtype.names) == set(bigfile.blocks)
    assert isinstance(ds[:], numpy.ndarray)

    # A single column name selects the underlying block.
    assert isinstance(ds['f8'], BigBlock)
    assert_equal(len(ds['f8'].dtype), 0)

    # tuple of one item is the same as non-tuple
    assert isinstance(ds[('f8',)], BigBlock)
    assert_equal(len(ds[('f8',)].dtype), 0)

    # Column plus row slice reads the data into an array.
    assert isinstance(ds['f8', :10], numpy.ndarray)
    assert_equal(len(ds['f8', :10]), 10)
    assert_equal(len(ds['f8', :10].dtype), 0)
    assert_equal(len(ds[['f8',], :10].dtype), 1)

    # tuple of one item is the same as non-tuple
    assert_equal(len(ds[('f8',), :10].dtype), 0)

    assert isinstance(ds[:10, 'f8'], numpy.ndarray)
    assert isinstance(ds['f8'], BigBlock)

    # A list of columns selects a sub-dataset.
    assert isinstance(ds[['f8', 'f4'],], Dataset)
    assert_equal(len(ds[['f8', 'f4'],].dtype), 2)
    assert isinstance(ds[['f8', 'f4'], :10], numpy.ndarray)

    for name, spec in dtypes:
        assert_array_equal(bigfile[name][:], ds[:][name])

    # Swap the first two groups of ten rows through slice assignment.
    head = ds[:10]
    following = ds[10:20]

    ds[:10] = following
    assert_array_equal(ds[:10], following)

    ds[10:20] = head
    assert_array_equal(ds[:10], following)
    assert_array_equal(ds[10:20], head)

    ds.append(head)
    assert ds.size == 128 + 10
    assert_array_equal(ds[-10:], head)

    shutil.rmtree(fname)
def test_attr_objects(comm):
    """Storing Python objects as attributes must raise ``ValueError``."""
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)

    with bigfile.create('block', dtype=None) as block:

        def store_object_array():
            block.attrs['objects'] = numpy.array([object()])

        def store_bare_object():
            block.attrs['objects'] = object()

        for attempt in (store_object_array, store_bare_object):
            assert_raises(ValueError, attempt)

    shutil.rmtree(fname)
def test_create(comm):
    """Create, re-create, read and write blocks for every entry of ``dtypes``.

    For each ``(name, dtype)`` pair the block is created and filled,
    re-created from an array, written at an offset, and finally written
    past its end, which is expected to fail. Afterwards the file's block
    listing and membership tests are checked, and the file is read through
    ``BigData``.
    """
    fname = tempfile.mkdtemp()
    x = BigFile(fname, create=True)
    x.create('.')

    for name, d in dtypes:
        d = numpy.dtype(d)
        numpy.random.seed(1234)

        # test creating
        with x.create(name, Nfile=1, dtype=d, size=128) as b:
            data = numpy.random.uniform(100000, size=128*128).view(
                dtype=b.dtype.base).reshape([-1] + list(d.shape))[:b.size]
            b.write(0, data)

        with x[name] as b:
            assert_equal(b[:], data.astype(d.base))
            assert_equal(b[:], b[...])

        # test creating
        data = numpy.random.uniform(100000, size=128*128).view(
            dtype=d.base).reshape([-1] + list(d.shape))[:128]
        with x.create_from_array(name, data) as b:
            pass

        with x[name] as b:
            assert_equal(b[:], data)

        # test writing with an offset
        with x[name] as b:
            b.write(1, data[0:1])
            assert_equal(b[1:2], data[0:1].astype(d.base))

        # test writing beyond file length
        with x[name] as b:
            caught = False
            try:
                b.write(1, data)
            except Exception:
                # Any library error counts; KeyboardInterrupt and SystemExit
                # are no longer swallowed, unlike with a bare ``except``.
                caught = True
            assert caught

    assert_equal(set(x.blocks), {name for name, _ in dtypes})

    import os
    os.system("ls -r %s" % fname)

    for b in x.blocks:
        assert b in x

    for b in x:
        assert b in x

    bd = BigData(x)
    assert set(bd.dtype.names) == set(x.blocks)
    # Reading every column at once only has to succeed.
    bd[:]

    shutil.rmtree(fname)
def get_we(vocab, w2v_dir):
    """Build a word-embedding tensor with one row per vocabulary entry.

    Rows start as uniform random values in [-1, 1); every word returned by
    the feature file's ``read`` has its row overwritten with the stored
    vector. The result is returned as a ``torch.Tensor``.
    """
    w2v_file = BigFile(w2v_dir)
    dim = w2v_file.ndims
    vocab_size = len(vocab)
    query = [vocab[idx] for idx in range(vocab_size)]

    embeddings = np.random.uniform(low=-1.0, high=1.0, size=(vocab_size, dim))

    found_words, vectors = w2v_file.read(query)
    for pos, word in enumerate(found_words):
        embeddings[vocab.find(word)] = vectors[pos]

    return torch.Tensor(embeddings)
def test_fileattr(comm):
    """Setting an attribute on '.' creates 'attr-v2' but no 'header' file."""
    import os.path

    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)

    def present(entry):
        # True when `entry` exists directly inside the file's directory.
        return os.path.exists(os.path.join(fname, entry))

    assert not present('attr-v2')
    assert not present('000000')

    with bigfile['.'] as root:
        root.attrs['value'] = 1234
        assert root.attrs['value'] == 1234

    assert not present('header')
    assert present('attr-v2')

    shutil.rmtree(fname)
def test_file_large_attr(comm):
    """Store a 1024 * 128 * 8 element 'f8' array as an attribute and read it back."""
    import os.path

    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)
    payload = numpy.ones(1024 * 128 * 8, dtype='f8')

    with bigfile['.'] as root:
        root.attrs['value'] = payload

    # Reopen the block so the value is read through a fresh handle.
    with bigfile['.'] as root:
        assert_equal(root.attrs['value'], payload)

    shutil.rmtree(fname)
def find_redshift(redshift, directory, pig=True, tol=0.05):
    """Find a snapshot at a given redshift in a directory.

    Each entry of ``directory`` matching ``PIG_*`` (when ``pig`` is true) or
    ``PART_*`` (otherwise) is opened, and its redshift is computed from the
    ``Time`` attribute of its ``Header`` block as ``1 / Time - 1``.

    Args:
        redshift: target redshift.
        directory: directory that is searched.
        pig: select ``PIG_*`` entries if true, ``PART_*`` entries if false.
        tol: largest accepted absolute difference between the snapshot
            redshift and ``redshift``. Defaults to 0.05.

    Returns:
        The path of the first matching entry, in the order produced by
        ``glob.glob``, or None when nothing matches.
    """
    pattern = "PIG_*" if pig else "PART_*"
    for path in glob.glob(os.path.join(directory, pattern)):
        bf = BigFile(path)
        snap_redshift = 1 / bf['Header'].attrs['Time'] - 1
        if np.abs(snap_redshift - redshift) < tol:
            return path
        # Drop the reference to a non-matching file before opening the next.
        del bf
    return None
def plot_bhmf(pig, label=None):
    """Plot a black hole mass function from a FOF table."""
    bf = BigFile(pig)
    attrs = bf['Header'].attrs
    redshift = 1 / attrs['Time'] - 1
    hh = attrs['HubbleParam']
    # Box size divided by 1000 and by the Hubble parameter.
    lbox = attrs['BoxSize'] / 1000 / hh

    mass_function = getbmf(bf, lbox, hh)
    centres = mass_function[0]
    legend_text = (label or '') + ' z=%.1f' % redshift

    plt.plot(centres, mass_function[1], label=legend_text)
    # Shade the band between the third and fourth returned series.
    plt.fill_between(centres, mass_function[2], mass_function[3], alpha=0.2)

    plt.xlabel(r'$\mathrm{log}_{10} [M_{\rm BH}/M_{\odot}]$', fontsize=17)
    plt.ylabel(r'$\mathrm{log}_{10} \phi/[\mathrm{dex}^{-1} \mathrm{Mpc}^{-3}]$', fontsize=15)
    plt.xlim(6, 12)
    plt.ylim(-7, -2.5)
    plt.title('BH Mass function', fontsize=15)
    plt.legend(fontsize=15)
def __init__(self, collection, feature, batch_size=100, rootpath=ROOT_PATH):
    """Open the feature file of a collection and load its image ids.

    Args:
        collection: collection name; used for the feature directory
            ``<rootpath>/<collection>/FeatureData/<feature>`` and the id
            list ``<rootpath>/<collection>/ImageSets/<collection>.txt``.
        feature: feature name inside ``FeatureData``.
        batch_size: stored on the instance as ``self.batch_size``.
        rootpath: root directory of all collections.
    """
    self.feat_file = BigFile(
        os.path.join(rootpath, collection, 'FeatureData', feature))
    self.batch_size = batch_size
    self.label_set = None
    self.aux_label_set = None

    imset_path = os.path.join(rootpath, collection, 'ImageSets',
                              collection + '.txt')
    # Read the ids into a real list and close the file deterministically;
    # a bare map() is a one-shot iterator on Python 3.
    with open(imset_path) as fin:
        self.img_ids = [line.strip() for line in fin]

    self.update()
def plot_gsmf(pig, label=None, plot_data=True):
    """Plot a galaxy stellar mass function from a FOF table, compared to some observations."""
    bf = BigFile(pig)
    attrs = bf['Header'].attrs
    redshift = 1 / attrs['Time'] - 1
    #Note! Assumes kpc units!
    hh = attrs['HubbleParam']
    lbox = attrs['BoxSize'] / 1000 / hh
    print('z=', redshift)

    lfm = get_gsmf(bf, lbox, hh)
    plt.plot(lfm[0], lfm[1], label=(label or '') + ' z=%.1f' % redshift)
    plt.fill_between(lfm[0], lfm[2], lfm[3], alpha=0.2)

    # Per-survey plotting styles; unknown surveys fall back to a default.
    color2 = {
        'Song2016': '#0099e6',
        'Grazian2015': '#7f8c83',
        'Gonzalez2011': '#ffa64d',
        'Duncan2014': '#F08080',
        'Stefanon2017': '#30ba52',
    }
    marker2 = {
        'Song2016': 'o',
        'Grazian2015': 's',
        'Gonzalez2011': 'v',
        'Duncan2014': '^',
        'Stefanon2017': '<',
    }

    if plot_data:
        obs = number_density(feature="GSMF", z_target=redshift, quiet=1, h=hh)
        targets = obs.target_observation
        for ii in range(obs.n_target_observation):
            data = targets['Data'][ii]
            survey = targets.index[ii]
            datatype = targets['DataType'][ii]
            if datatype != 'data':
                continue

            # Columns from index 1 onwards are converted to log10 in place.
            data[:, 1:] = np.log10(data[:, 1:])
            if survey in color2 and survey in marker2:
                color = color2[survey]
                marker = marker2[survey]
            else:
                color = None
                marker = 'o'
            lower = data[:, 1] - data[:, 3]
            upper = data[:, 2] - data[:, 1]
            plt.errorbar(data[:, 0], data[:, 1], yerr=[lower, upper],
                         label=survey, color=color, fmt=marker)

    plt.legend(fontsize=14)
    plt.title('GSMF,bhfdbk,z=%.1f' % redshift, fontsize=15)
    plt.ylabel(
        r'$\mathrm{log}_{10} \phi/[\mathrm{dex}^{-1} \mathrm{Mpc}^{-3}]$',
        fontsize=15)
def test_threads(comm):
    """Hammer one block from four threads while errors are being raised."""
    # This test shall not core dump
    # raise many errors here and there on many threads
    from threading import Thread, Event
    import gc

    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)
    shared_block = bigfile.create("Threading", Nfile=1, dtype='i8', size=128)

    saved_threshold = gc.get_threshold()
    # Make the garbage collector run as often as possible during the test.
    gc.set_threshold(1, 1, 1)

    start_signal = Event()

    def worker(i, b):
        # All threads block here until the main thread releases them.
        start_signal.wait()
        bigfile['.'].attrs['v3'] = [1, 2, 3]
        err = 0
        for j in range(100 * i):
            try:
                with pytest.raises(BigFileError):
                    b.attrs['v 3'] = ['a', 'bb', 'ccc']
                b.write(0, numpy.ones(128))
            except BigBlockClosedError:
                # Another thread has already closed the shared block.
                err = err + 1
        b.close()
        bigfile['Threading'].attrs['v3'] = [1, 2, 3]

    threads = [Thread(target=worker, args=(i, shared_block)) for i in range(4)]
    for thread in threads:
        thread.start()
    start_signal.set()
    for thread in threads:
        thread.join()

    gc.set_threshold(*saved_threshold)
    shutil.rmtree(fname)
def __init__(self,
             collection,
             vocab_file,
             feature,
             language,
             flag_shuffle=False,
             fluency_threshold=DEFAULT_FLUENCY_U,
             rootpath=ROOT_PATH):
    """Record the data locations, open the vocabulary and features, load data."""
    # Plain configuration values.
    self.language = language
    self.fluency_threshold = fluency_threshold
    self.flag_shuffle = flag_shuffle

    # Sentence annotations.
    self.anno_file_path = utility.get_sent_file(collection, language, rootpath)

    # Vocabulary; the padding token has to be index 0.
    self.textbank = TextBank(vocab_file)
    assert self.textbank.vocab[TOKEN_PAD] == 0

    # Visual features.
    feat_dir = utility.get_feat_dir(collection, feature, rootpath)
    self.vf_reader = BigFile(feat_dir)
    self.vf_names = set(self.vf_reader.names)
    self.vf_size = self.vf_reader.ndims

    self._load_data()
def test_string(comm):
    """Round-trip string attributes: lists, nested lists, short and long."""
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)
    long_text = 'a' * 65536

    # Create the block and attach the attributes.
    with bigfile.create("Header", Nfile=1, dtype=None, size=128) as header:
        header.attrs['v3'] = ['a', 'bb', 'ccc']
        header.attrs['v32'] = [
            ['a', 'bb', 'ccc'],
            ['1', '22', '333'],
        ]
        header.attrs['s'] = 'abc'
        header.attrs['l'] = long_text

    with bigfile.open("Header") as header:
        assert_equal(header.attrs['v3'], ['a', 'bb', 'ccc'])
        # The nested list is compared against its flattened form.
        assert_equal(header.attrs['v32'],
                     ['a', 'bb', 'ccc', '1', '22', '333'])
        assert_equal(header.attrs['s'], 'abc')
        assert_equal(header.attrs['l'], long_text)

    shutil.rmtree(fname)
def __init__(self, collections, concept_files, feature, batch_size=100,
             rootpath=ROOT_PATH):
    """Open the features of the first collection and its label set(s).

    ``collections`` and ``concept_files`` must have the same length. The
    first pair gives the main label set; a second pair, if present, gives
    the auxiliary label set.
    """
    assert (len(collections) == len(concept_files))
    self.batch_size = batch_size

    main_collection = collections[0]
    feat_dir = os.path.join(rootpath, main_collection, 'FeatureData', feature)
    self.feat_file = BigFile(feat_dir)
    self.label_set = LabelSet(main_collection, concept_files[0], rootpath)

    self.aux_label_set = None
    if len(collections) > 1:
        self.aux_label_set = LabelSet(collections[1], concept_files[1],
                                      rootpath)

    self.img_ids = sorted(self.label_set.im2labels.keys())
    self.num_labels = self.label_set.num_labels
    if self.aux_label_set:
        self.aux_num_labels = self.aux_label_set.num_labels
    else:
        self.aux_num_labels = 0

    self.update()
def ReadIC(filename):
    """Read this rank's share of an initial-condition file.

    Returns ``(P, BoxSize, a0)`` where ``P`` holds 'Mass', 'Position',
    'Velocity' and 'ID', with velocities multiplied by ``a0**1.5``.
    """
    # this reads in a MP-Gadget3/GENIC format IC
    # major thing is to scale vel by a0**1.5
    ic_file = BigFile(filename)
    header = ic_file.open('header')
    BoxSize = header.attrs['BoxSize'][0]
    a0 = header.attrs['Time'][0]

    # Split the Ntot particles contiguously across the MPI ranks.
    Ntot = ic_file.open('1/ID').size
    world = MPI.COMM_WORLD
    first = world.rank * Ntot // world.size
    last = (world.rank + 1) * Ntot // world.size
    myslice = slice(first, last)

    P = {}
    P['Mass'] = header.attrs['MassTable'][1]
    P['Position'] = ic_file.open('1/Position')[myslice]
    velocity = ic_file.open('1/Velocity')[myslice]
    velocity *= a0**1.5
    P['Velocity'] = velocity
    P['ID'] = ic_file.open('1/ID')[myslice]

    return P, BoxSize, a0
def process(options, feat_dir, imsetfile, result_dir):
    """Export the features of an image set to a flat binary directory.

    Writes three files into ``result_dir``:

    * ``feature.bin`` -- the vectors as consecutive float32 values,
    * ``id.txt``      -- the space-separated ids of the vectors written,
    * ``shape.txt``   -- ``"<count> <ndims>"``.

    Args:
        options: object providing ``overwrite`` and ``blocksize``.
        feat_dir: directory of the source feature file.
        imsetfile: text file with one image id per line.
        result_dir: output directory.

    Exits the process with status 0 when ``checkToSkip`` reports that the
    result file should be skipped.
    """
    resultfile = os.path.join(result_dir, 'feature.bin')
    if checkToSkip(resultfile, options.overwrite):
        sys.exit(0)

    # Read the ids into a list (len() and slicing are needed below) and
    # close the file straight away.
    with open(imsetfile) as fin:
        imset = [line.strip() for line in fin]
    print('requested %d' % len(imset))

    feat_file = BigFile(feat_dir)

    makedirsforfile(resultfile)
    done = []
    with open(resultfile, 'wb') as fw:
        start = 0
        while start < len(imset):
            end = min(len(imset), start + options.blocksize)
            printStatus(INFO, 'processing images from %d to %d' % (start, end-1))

            toread = imset[start:end]
            if len(toread) == 0:
                # Guards against a non-positive blocksize looping forever.
                break
            renamed, vectors = feat_file.read(toread)
            for vec in vectors:
                np.array(vec, dtype=np.float32).tofile(fw)
            done += renamed
            start = end

    # Every id must have been written exactly once.
    assert(len(done) == len(set(done)))

    with open(os.path.join(result_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(done))

    with open(os.path.join(result_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(done), feat_file.ndims))

    print('%d requested, %d obtained' % (len(imset), len(done)))
def test_bigdata(comm):
    """Exercise ``BigData`` indexing by column name, row slice, and both."""
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)
    bigfile.create('.')

    for spec in dtypes:
        dt = numpy.dtype(spec)
        numpy.random.seed(1234)

        # Create a block named after the dtype spec and fill it.
        with bigfile.create(str(spec), Nfile=1, dtype=dt, size=128) as block:
            raw = numpy.random.uniform(100000, size=128 * 128)
            typed = raw.view(dtype=block.dtype.base)
            data = typed.reshape([-1] + list(dt.shape))[:block.size]
            block.write(0, data)

    bd = BigData(bigfile)
    assert set(bd.dtype.names) == set(bigfile.blocks)
    assert isinstance(bd[:], numpy.ndarray)

    # A single column name selects the underlying block.
    assert isinstance(bd['f8'], BigBlock)
    assert_equal(len(bd['f8'].dtype), 0)

    # tuple of one item is the same as non-tuple
    assert isinstance(bd[('f8', )], BigBlock)
    assert_equal(len(bd[('f8', )].dtype), 0)

    # Column plus row slice reads the data into an array.
    assert isinstance(bd['f8', :10], numpy.ndarray)
    assert_equal(len(bd['f8', :10]), 10)
    assert_equal(len(bd['f8', :10].dtype), 0)
    assert_equal(len(bd[['f8', ], :10].dtype), 1)

    # tuple of one item is the same as non-tuple
    assert_equal(len(bd[('f8', ), :10].dtype), 0)

    assert isinstance(bd[:10, 'f8'], numpy.ndarray)
    assert isinstance(bd['f8'], BigBlock)

    # A list of columns selects a sub-collection.
    assert isinstance(bd[['f8', 'f4'], ], BigData)
    assert_equal(len(bd[['f8', 'f4'], ].dtype), 2)
    assert isinstance(bd[['f8', 'f4'], :10], numpy.ndarray)

    shutil.rmtree(fname)
def append_mmbh_data(part: str, redshifts: List, mmbhmasss: List,
                     mmbhids: List, mmbhaccs: List, mmbhposs: List,
                     mmbhvels: List, mmbht1s: List):
    """Append the most massive BHs' quantities.

    Opens the snapshot ``part``, finds the entry with the largest value in
    '5/BlackholeMass', and appends that entry's redshift, mass, id,
    accretion rate, position, velocity and t1 values to the given lists.
    The lists are modified in place.

    Args:
        part: path of the snapshot to open.
        redshifts, mmbhmasss, mmbhids, mmbhaccs, mmbhposs, mmbhvels,
        mmbht1s: output lists, each extended by one element.

    Returns:
        None. When the snapshot holds no black holes a message is printed
        and the lists are left untouched.
    """
    bf = BigFile(part)
    header = bf.open('Header')
    redshift = 1. / header.attrs['Time'][0] - 1.

    bhmass = bf.open('5/BlackholeMass')[:] * TO_MSUN
    if len(bhmass) == 0:
        print('    No BH formed at z = %0.2f' % redshift)
        return None

    bhid = bf.open('5/ID')[:]
    bhacc = bf.open('5/BlackholeAccretionRate')[:] * TO_MSUN_YEAR
    bhpos = bf.open('5/Position')[:]
    bhvel = bf.open('5/Velocity')[:]

    # Locate the most massive black hole once and reuse the index.
    imax = np.argmax(bhmass)
    mmbhmass = bhmass.max()
    mmbhid = bhid[imax]
    mmbhacc = bhacc[imax]
    mmbhpos = bhpos[imax]
    mmbhvel = bhvel[imax]

    print('    Appending BH quantities at z = %0.4f' % redshift)

    # t1 is only evaluated when enabled, at or below STARTZ, and within DZ
    # of an integer redshift; otherwise NaN placeholders are stored.
    if GET_T1 & (redshift <= STARTZ) & (np.abs(redshift - round(redshift)) < DZ):
        print('    -- calculating t1... at %0.4f' % redshift)
        mmbht1 = [calc_t1(part, mmbhpos, nmesh) for nmesh in NMESHS]
    else:
        mmbht1 = [np.nan] * len(NMESHS)

    redshifts.append(redshift)
    mmbhmasss.append(mmbhmass)
    mmbhids.append(mmbhid)
    mmbhaccs.append(mmbhacc)
    mmbhposs.append(mmbhpos)
    mmbhvels.append(mmbhvel)
    mmbht1s.append(mmbht1)
def test_create_odd(comm):
    """Create a 455**3 element 'f4' block spread over three files."""
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)
    bigfile.create('.')

    dt = numpy.dtype('f4')
    numpy.random.seed(1234)
    count = 455**3

    # Create the block, named after the dtype string, and fill it.
    with bigfile.create(dt.str, Nfile=3, dtype=dt, size=count) as block:
        data = numpy.random.uniform(100000, size=count).astype(dt)
        block.write(0, data)

    import os
    os.system("ls -r %s" % fname)

    # Listed blocks and iterated blocks must all pass the membership test.
    for listed in bigfile.blocks:
        assert listed in bigfile
    for iterated in bigfile:
        assert iterated in bigfile

    shutil.rmtree(fname)
def test_blank_attr(comm):
    """Attribute and block names containing whitespace raise ``BigFileError``."""
    fname = tempfile.mkdtemp()
    bigfile = BigFile(fname, create=True)

    with bigfile.create("Header", Nfile=1, dtype=None, size=128) as header:
        # Space, tab and newline inside an attribute name are all rejected.
        for bad_key in ('v 3', 'v\t3', 'v\n3'):
            with pytest.raises(BigFileError):
                header.attrs[bad_key] = ['a', 'bb', 'ccc']

    # The same three characters are rejected as block names.
    for bad_name in (" ", "\t", "\n"):
        with pytest.raises(BigFileError):
            bigfile.create(bad_name, Nfile=1, dtype=None, size=128)

    shutil.rmtree(fname)