def initialize(self):
    """
    tables.EArray(parentnode, name, atom=None, shape=None, title='',
                  filters=None, expectedrows=None, chunkshape=None,
                  byteorder=None, _log=True)[source]
    """
    self.m.log(1, '+++Init method of MCWaveform algorithm+++')

    self.NPMTS = int(12)
    self.LEN_PMT = int(599999)
    self.NSIPM = int(1792)
    self.LEN_SIPM = int(600)

    path = "/Users/jjgomezcadenas/Documents/Development/NEXT/data/Waveforms/"
    file = "WF_Tl_0.h5"

    self.NEVENTS = self.logman["CNTJob"].ints["NEVENTS"]

    self.h5f = tables.open_file(path + file, "w",
                                filters=tables.Filters(complib="blosc", complevel=9))

    self.pmtrd = self.h5f.create_earray(self.h5f.root, "pmtrd",
                                        atom=tables.IntAtom(),
                                        shape=(0, self.NPMTS, self.LEN_PMT),
                                        expectedrows=self.NEVENTS)
    self.sipmrd = self.h5f.create_earray(self.h5f.root, "sipmrd",
                                         atom=tables.IntAtom(),
                                         shape=(0, self.NSIPM, self.LEN_SIPM),
                                         expectedrows=self.NEVENTS)

    group = self.h5f.create_group(self.h5f.root, "Detector")
    self.geom_table = self.h5f.create_table(group, "DetectorGeometry",
                                            DetectorGeometry, "DetectorGeometry",
                                            tables.Filters(0))

    group = self.h5f.create_group(self.h5f.root, "Sensors")
    self.pmt_table = self.h5f.create_table(group, "DataPMT", DataPMT,
                                           "DataPMT", tables.Filters(0))
    self.sipm_table = self.h5f.create_table(group, "DataSiPM", DataSiPM,
                                            "DataSiPM", tables.Filters(0))

    group = self.h5f.create_group(self.h5f.root, "MC")
    self.MCTrack_table = self.h5f.create_table(group, "MCTracks", MCTrack,
                                               "MCTracks", tables.Filters(0))

    # self.h5f = tables.open_file("pmtrd.h5", "a",
    #                             filters=tables.Filters(complib="blosc", complevel=9))
    # self.pmt = self.h5f.create_table(self.h5f.root, "pmt", PMTRD)

    self.t = time()
    self.n_evt = 0
    return
def init_output(outfile):
    """
    Creates a timestamped output directory and writes header to output file.
    """
    with tables.open_file(outfile, 'w') as f:
        ## Create extendable arrays so we can incrementally write output
        f.create_earray(f.root, 'ancs', atom=tables.IntAtom(), shape=(0, ))
        f.create_earray(f.root, 'liks', atom=tables.FloatAtom(), shape=(0, ))

        ## Trees, which are variable-length, must be added individually
        f.create_vlarray(f.root, 'trees', atom=tables.IntAtom())
        f.create_vlarray(f.root, 'genotypes', atom=tables.IntAtom())
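# Companion usage sketch (assumption, not from the original source): once
# init_output() above has created the file, results can be appended
# incrementally. append_result is a hypothetical helper; the array names
# ('ancs', 'liks', 'trees', 'genotypes') come from init_output().
import tables

def append_result(outfile, anc, lik, tree, genotype):
    with tables.open_file(outfile, 'a') as f:
        # EArrays grow along their first (extendable) dimension
        f.root.ancs.append([anc])
        f.root.liks.append([lik])
        # VLArray rows may each have a different length
        f.root.trees.append(tree)
        f.root.genotypes.append(genotype)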
def populate(f, nlevels):
    g = f.root
    #arr = numpy.zeros((10,), "f4")
    #descr = {'f0': tables.Int32Col(), 'f1': tables.Float32Col()}
    for i in range(nlevels):
        #dset = f.create_array(g, "DS1", arr)
        #dset = f.create_array(g, "DS2", arr)
        f.create_carray(g, "DS1", tb.IntAtom(), (10,))
        f.create_carray(g, "DS2", tb.IntAtom(), (10,))
        #dset = f.create_table(g, "DS1", descr)
        #dset = f.create_table(g, "DS2", descr)
        f.create_group(g, 'group2_')
        g = f.create_group(g, 'group')
def open_or_create_dataset_file(filename, filters, groups, add_classes):
    if os.path.exists(filename):
        return tables.open_file(filename, mode='r+')

    file = tables.open_file(filename, mode='w')
    for i in groups:
        file.create_group(file.root, i)
    file.create_carray(file.root, 'count', atom=tables.IntAtom(), shape=(1,),
                       filters=filters)
    if add_classes:
        file.create_earray(file.root, 'classes', atom=tables.StringAtom(25),
                           shape=(0,), filters=filters)
    file.create_earray(file.root, 'train', atom=tables.IntAtom(), shape=(0,),
                       filters=filters)
    file.create_earray(file.root, 'test', atom=tables.IntAtom(), shape=(0,),
                       filters=filters)
    file.root.count[0] = 0
    return file
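# Hedged usage sketch (assumption, not part of the original project): adding
# one label to the dataset file returned by open_or_create_dataset_file() and
# bumping the stored sample counter. add_sample is a hypothetical helper; the
# node names ('train', 'test', 'count') match the function above.
def add_sample(file, label, is_train=True):
    target = file.root.train if is_train else file.root.test
    target.append([label])
    file.root.count[0] = file.root.count[0] + 1
    file.flush()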
def populate(f, nlevels):
    g = f.root
    arr = numpy.zeros((10, ), "f4")
    recarr = numpy.zeros((10, ), "i4,f4")
    descr = {'f0': tables.Int32Col(), 'f1': tables.Float32Col()}
    for i in range(nlevels):
        #dset = f.createArray(g, "DS1", arr)
        #dset = f.createArray(g, "DS2", arr)
        dset = f.createCArray(g, "DS1", tables.IntAtom(), (10, ))
        dset = f.createCArray(g, "DS2", tables.IntAtom(), (10, ))
        #dset = f.createTable(g, "DS1", descr)
        #dset = f.createTable(g, "DS2", descr)
        group2 = f.createGroup(g, 'group2_')
        g = f.createGroup(g, 'group')
def __init__(self, database: AbstractDB):
    """
    Initialize the atoms for meta-data (types, valid tag, and splits)

    Args:
        database (AbstractDB): Associated Database object
    """
    super().__init__(database)
    self.filename_atom = tables.StringAtom(itemsize=255)
    self.types_atom = tables.StringAtom(itemsize=255)
    # whether the patch is valid.
    self.valid_atom = tables.BoolAtom(shape=(), dflt=False)
    # save the meta info: split
    # noinspection PyArgumentList
    self.file_list_atom = tables.StringAtom(itemsize=get_path_limit())
    # noinspection PyArgumentList
    self.split_atom = tables.IntAtom(shape=(), dflt=False)
    self.hdf5_organizer = H5Organizer(self.database, self.database.group_level)
    self.data_extractor = DataExtractor(self.database)
    self.weight_writer = WeightCollector(
        self.database, self.data_extractor,
        weight_counter=self.database.weight_counter_callable)
    self.data_size = {}
def test99b_nonScalarEnum(self):
    """Describing an enumerated column of non-scalars (not implemented)."""
    colors = {'red': (1, 2, 3)}
    self.assertRaises(NotImplementedError, self._createCol, colors, 'red',
                      base=tables.IntAtom(shape=3))
def write_carray(file, nchildren, niter):
    for i in range(niter):
        fileh = tables.openFile(file, mode="w")
        for child in range(nchildren):
            fileh.createCArray(fileh.root, 'array' + str(child),
                               tables.IntAtom(), (2, ),
                               "child: %d" % child)
        show_mem("After creating. Iter %s" % i)
        fileh.close()
        show_mem("After close")
def write_vlarray(file, nchildren, niter):
    for i in range(niter):
        fileh = tables.openFile(file, mode="w")
        for child in range(nchildren):
            vl = fileh.createVLArray(fileh.root, 'array' + str(child),
                                     tables.IntAtom(),
                                     "child: %d" % child)
            vl.append([1, 2, 3])
        show_mem("After creating. Iter %s" % i)
        fileh.close()
        show_mem("After close")
def write_earray(file, nchildren, niter):
    for i in range(niter):
        fileh = tables.open_file(file, mode="w")
        for child in range(nchildren):
            ea = fileh.create_earray(fileh.root, 'array' + str(child),
                                     tables.IntAtom(), shape=(0,),
                                     title="child: %d" % child)
            ea.append([1, 2, 3])
        show_mem("After creating. Iter %s" % i)
        fileh.close()
        show_mem("After close")
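# A hedged companion sketch (not part of the benchmark above): reading the
# child EArrays back to check what write_earray() produced. Uses only the
# modern PyTables API; read_earray is a hypothetical helper name.
import tables

def read_earray(file):
    with tables.open_file(file, mode="r") as fileh:
        # walk_nodes filters by node class, so only the EArrays are visited
        for node in fileh.walk_nodes(fileh.root, classname="EArray"):
            print(node._v_name, node.read())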
def create_db(filename, params, total_env_count=None, traj_per_env=None):
    """
    :param filename: file name for database
    :param params: dotdict describing the domain
    :param total_env_count: total number of environments in the dataset (helps to preallocate space)
    :param traj_per_env: number of trajectories per environment
    """
    N = params.grid_n
    M = params.grid_m
    num_state = N * M

    if total_env_count is not None and traj_per_env is not None:
        total_traj_count = total_env_count * traj_per_env
    else:
        total_traj_count = 0

    if os.path.isfile(filename):
        print(filename + " already exists, opening.")
        return tables.open_file(filename, mode='a')

    db = tables.open_file(filename, mode='w')

    db.create_earray(db.root, 'envs', tables.IntAtom(), shape=(0, N, M),
                     expectedrows=total_env_count)
    db.create_earray(db.root, 'expRs', tables.FloatAtom(), shape=(0, ),
                     expectedrows=total_traj_count)
    db.create_earray(db.root, 'valids', tables.IntAtom(), shape=(0, ),
                     expectedrows=total_traj_count)
    db.create_earray(db.root, 'bs', tables.FloatAtom(), shape=(0, num_state),
                     expectedrows=total_traj_count)
    db.create_earray(db.root, 'steps', tables.IntAtom(),
                     shape=(0, 3),  # state, action, observation
                     expectedrows=total_traj_count * 10)  # rough estimate
    db.create_earray(db.root, 'samples', tables.IntAtom(),
                     shape=(0, 6),  # env_id, goal_state, step_id, traj_length, collisions, failed
                     expectedrows=total_traj_count)
    db.create_earray(db.root, 'qmdpBeliefs', tables.FloatAtom(),
                     shape=(0, num_state, ), expectedrows=total_traj_count * 10)

    return db
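# Hypothetical companion sketch (assumption, not from the original code):
# appending one environment and one trajectory to the database created by
# create_db(). The array names come from create_db(); add_trajectory and its
# argument names are illustrative only.
import numpy as np

def add_trajectory(db, env, exp_r, valid, b0, steps, sample, beliefs):
    # EArray.append expects the leading axis to be the extendable one,
    # so single items get an extra first dimension.
    db.root.envs.append(np.asarray(env)[None])        # (1, N, M)
    db.root.expRs.append([exp_r])
    db.root.valids.append([int(valid)])
    db.root.bs.append(np.asarray(b0)[None])           # (1, num_state)
    db.root.steps.append(np.asarray(steps))           # (n_steps, 3)
    db.root.samples.append(np.asarray(sample)[None])  # (1, 6)
    db.root.qmdpBeliefs.append(np.asarray(beliefs))   # (n_steps, num_state)
    db.flush()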
def init_h5_result_file(h5, expectedrows=50000):
    """
    Receives a h5 file that has just been created, creates the proper arrays:
     - query
     - target
     - position
     - n_results
    """
    group = h5.createGroup("/", 'results', 'general, sole group')
    h5.createEArray(group, 'query', tables.StringAtom(18, shape=()), (0,),
                    'tid of the query', expectedrows=expectedrows)
    h5.createEArray(group, 'target', tables.StringAtom(18, shape=()), (0,),
                    'tid of the target', expectedrows=expectedrows)
    h5.createEArray(group, 'pos', tables.IntAtom(shape=()), (0,),
                    'position of the target in the result list',
                    expectedrows=expectedrows)
    h5.createEArray(group, 'n_results', tables.IntAtom(shape=()), (0,),
                    'length of the result list returned by query',
                    expectedrows=expectedrows)
    # done
    return
def test19_getCutNodes(self):
    """Check the getCutNodes method."""

    # Add a couple of nodes to the hidden group
    tmp_db = VTAPP.dbManager.getDB(VTAPP.dbManager.tmp_filepath).getH5File()
    tmp_db.createGroup('/_p_cutNode', 'Group_A')
    tmp_db.createCArray('/_p_cutNode', 'Hidden_CArray', tables.IntAtom(), (3, 3))
    tmp_db.flush()
    cut_nodes = VTAPP.dbManager.getCutNodes()
    cut_nodes.sort()
    expected = ['Group_A', 'Hidden_CArray']
    self.assertEqual(cut_nodes, expected,
                     'The retrieved list of cut nodes is wrong')
def combine_partial_contribs(partial_files, outfile):
    """
    Combines the sparse results of several allele dropping simulations
    into a single sparse array.
    """
    ## Dict of sparse contribs per region
    region_contribs = defaultdict(initialize_sparse_array)

    for pf in partial_files:
        print "Loading contribs from", pf
        with tables.open_file(pf, 'r') as f:
            ## Iterate through each region node in the sparse file
            for partial_node in f.list_nodes(f.root.sparse_hist):
                ## Load sparse data
                region = partial_node.name
                r, c, d = np.transpose(partial_node[:])

                ## Add results to total
                new_tot = append_sparse(region_contribs[region], r, c, d)
                region_contribs[region] = new_tot

    ## Write results to file
    with tables.open_file(outfile, 'w') as f:
        filters = tables.Filters(complevel=5, complib='blosc')
        g = f.create_group(f.root, 'sparse_hist')

        for region, contribs in region_contribs.iteritems():
            print "Writing contribs for region:", region
            ## Transpose data so max row size is not exceeded
            r, c, d = contribs.row, contribs.col, contribs.data
            contribs_array = np.transpose(np.vstack([r, c, d]))

            ## Store in compressed array
            ca = f.create_carray(g, region.strip(), tables.IntAtom(),
                                 shape=(contribs_array.shape), filters=filters)
            ca[:] = contribs_array

            ## Store number of inds in region
            with tables.open_file(partial_files[0], 'r') as pf:
                raw_node = pf.get_node(pf.root.raw, str(region))
                ninds_region = raw_node._v_attrs['ninds']
                f.set_node_attr(ca, 'ninds', ninds_region)
def create_hdf_arrays(file_name, ports, dig_in, emg_port, emg_channels):
    hf5 = tables.open_file(file_name, 'r+')
    n_electrodes = len(ports) * 32
    atom = tables.IntAtom()

    # Create arrays for digital inputs
    for i in dig_in:
        dig_inputs = hf5.create_earray('/digital_in', 'dig_in_%i' % i, atom, (0, ))

    # Create arrays for neural electrodes, and make directories to store
    # stuff coming out from blech_process
    for i in range(n_electrodes - len(emg_channels)):
        el = hf5.create_earray('/raw', 'electrode%i' % i, atom, (0, ))

    # Create arrays for EMG electrodes
    for i in range(len(emg_channels)):
        el = hf5.create_earray('/raw_emg', 'emg%i' % i, atom, (0, ))

    # Close the hdf5 file
    hf5.close()
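# Hedged usage sketch (assumption, not part of the original module): appending
# a chunk of samples to one of the electrode EArrays created above.
# append_electrode_data is a hypothetical helper; the node names follow
# create_hdf_arrays(); chunk is any 1-D integer sequence.
import tables

def append_electrode_data(file_name, electrode, chunk):
    with tables.open_file(file_name, 'r+') as hf5:
        node = hf5.get_node('/raw', 'electrode%i' % electrode)
        node.append(chunk)  # grows the EArray along its extendable axis
        hf5.flush()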
def execute(self, context):
    scene = context.scene
    trkr = bpy.data.objects['tracker1']  # this script looks for an object named tracker1
    fp = scene.render.filepath  # get existing output path

    handler = lose.Loser(os.path.normpath(f'{fp}/ground_truth.h5'))
    handler.new_group(fmode='w', mat44=(4, 4), pos=(3,), rot_q=(4,))
    handler.new_group(atom=t.IntAtom(), frame_id=(1,))

    print('starting to gather data\n\nframes will be saved to "'
          + os.path.normpath(f'{fp}/frames/') + '"')
    print('this handler will be used to save data:')
    print(handler)
    print('starting render...\n')

    scene.render.filepath = fp
    try:
        with handler:
            # sequence length is pulled from blender animation duration settings
            for i in range(scene.frame_start, scene.frame_end + 1, scene.frame_step):
                scene.frame_set(i)
                scene.render.filepath = os.path.normpath(f'{fp}/frames/{i}')
                bpy.ops.wm.redraw_timer(type='DRAW_WIN_SWAP', iterations=1,
                                        time_limit=1 / 1000)

                mat_temp = trkr.matrix_world
                # save tracker data
                handler.save(mat44=[np.array(mat_temp)],
                             pos=[np.array(mat_temp.to_translation())],
                             rot_q=[np.array(mat_temp.to_quaternion())],
                             frame_id=[[i]])
                print(f'frame {i} data saved')

                bpy.ops.render.render(write_still=True)  # render still
    finally:
        scene.render.filepath = fp

    print('data grab done')
    return {'FINISHED'}
def process_filelist_train(filelist=None, testartists=None, tmpfilename=None,
                           npicks=None, winsize=None, finaldim=None,
                           typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is not in testartists)
    INPUT
       filelist     - a list of song files
       testartists  - set of artist ID that we should not use
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       typecompress - one of 'picks' (win of btchroma), 'corrcoeff'
                      (correlation coefficients), 'cov' (covariance)
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group, 'feats', tables.Float64Atom(shape=()),
                        (0, finaldim), '', expectedrows=len(filelist))
    output.createEArray(group, 'year', tables.IntAtom(shape=()), (0, ), '',
                        expectedrows=len(filelist))
    output.createEArray(group, 'track_id', tables.StringAtom(18, shape=()),
                        (0, ), '', expectedrows=len(filelist))
    # random projection
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0 or artist_id in testartists:
            continue
        # we have a train artist with a song year, we're good
        bttimbre = get_bttimbre(f)
        if typecompress == 'picks':
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre, npicks, winsize,
                                                        finaldim, randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres, finaldim, randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres, finaldim, randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres, finaldim, randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        # save them to tmp file
        n_p_feats = processed_feats.shape[0]
        output.root.data.year.append(np.array([year] * n_p_feats))
        output.root.data.track_id.append(np.array([track_id] * n_p_feats))
        output.root.data.feats.append(processed_feats)
    # we're done, close output
    output.close()
    return
    sys.exit()

climb_file_paths = sys.argv[1]
climb_merge_file = sys.argv[2]
control_file_paths = sys.argv[3]
control_merge_file = sys.argv[4]

climb_files = sorted([line.strip() for line in open(climb_file_paths)])
control_files = sorted([line.strip() for line in open(control_file_paths)])
assert len(climb_files) == len(control_files)

with tables.open_file(control_merge_file, 'w') as control_merge_file:
    control_liks = tables.EArray(control_merge_file.root, 'control_liks',
                                 tables.IntAtom(), shape=(0, ))
    tot_liks = tables.EArray(control_merge_file.root, 'tot_liks',
                             tables.IntAtom(), shape=(0, ))

    for fname in control_files:
        print("Merging", fname)
        with tables.open_file(fname, 'r') as f:
            control_liks.append(f.root.control_liks[:])
            tot_liks.append(f.root.tot_liks[:])

with tables.open_file(climb_merge_file, 'w') as climb_merge_file:
    ancs = tables.EArray(climb_merge_file.root, 'ancs',
# Open a file in "w"rite mode
fileh = tables.open_file("MDobjects.h5", mode="w")
# Create the table with compression 'on' in order to reduce size as
# much as possible
table = fileh.create_table(fileh.root, 'table', Particle, "A table",
                           filters=tables.Filters(complevel=1))
# Append several rows with default values
for i in range(10):
    table.row.append()
table.flush()

# create new arrays
atom1 = tables.IntAtom()
shape1 = (2, 10, 10, 1)
filters1 = tables.Filters(complevel=1)
#(2, 10, 10, 3)
array1 = fileh.create_carray(fileh.root, 'array1', atom1, shape1,
                             filters=filters1)
atom2 = tables.FloatAtom()
shape2 = (2, 10, 10, 3, 1)
filters2 = tables.Filters(complevel=1)
#(2, 10, 10, 3, 200)
array2 = fileh.create_carray(fileh.root, 'array2', atom2,
def create_all_arrays(h5, expectedrows=1000):
    """
    Utility functions used by both create_song_file and create_aggregate_files,
    creates all the EArrays (empty).
    INPUT
       h5 - hdf5 file, open with write or append permissions
            metadata and analysis groups already exist!
    """
    # group metadata arrays
    group = h5.root.metadata
    h5.createEArray(where=group, name='similar_artists',
                    atom=tables.StringAtom(20, shape=()), shape=(0, ),
                    title=ARRAY_DESC_SIMILAR_ARTISTS)
    h5.createEArray(group, 'artist_terms', tables.StringAtom(256, shape=()), (0, ),
                    ARRAY_DESC_ARTIST_TERMS, expectedrows=expectedrows * 40)
    h5.createEArray(group, 'artist_terms_freq', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_ARTIST_TERMS_FREQ, expectedrows=expectedrows * 40)
    h5.createEArray(group, 'artist_terms_weight', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_ARTIST_TERMS_WEIGHT, expectedrows=expectedrows * 40)
    # group analysis arrays
    group = h5.root.analysis
    h5.createEArray(where=group, name='segments_start',
                    atom=tables.Float64Atom(shape=()), shape=(0, ),
                    title=ARRAY_DESC_SEGMENTS_START)
    h5.createEArray(group, 'segments_confidence', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_CONFIDENCE, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_pitches', tables.Float64Atom(shape=()), (0, 12),
                    ARRAY_DESC_SEGMENTS_PITCHES, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_timbre', tables.Float64Atom(shape=()), (0, 12),
                    ARRAY_DESC_SEGMENTS_TIMBRE, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_loudness_max', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_MAX, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_loudness_max_time', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_MAX_TIME, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'segments_loudness_start', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SEGMENTS_LOUDNESS_START, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'sections_start', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SECTIONS_START, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'sections_confidence', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_SECTIONS_CONFIDENCE, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'beats_start', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BEATS_START, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'beats_confidence', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BEATS_CONFIDENCE, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'bars_start', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BARS_START, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'bars_confidence', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_BARS_CONFIDENCE, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'tatums_start', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_TATUMS_START, expectedrows=expectedrows * 300)
    h5.createEArray(group, 'tatums_confidence', tables.Float64Atom(shape=()), (0, ),
                    ARRAY_DESC_TATUMS_CONFIDENCE, expectedrows=expectedrows * 300)
    # group musicbrainz arrays
    group = h5.root.musicbrainz
    h5.createEArray(where=group, name='artist_mbtags',
                    atom=tables.StringAtom(256, shape=()), shape=(0, ),
                    title=ARRAY_DESC_ARTIST_MBTAGS, expectedrows=expectedrows * 5)
    h5.createEArray(group, 'artist_mbtags_count', tables.IntAtom(shape=()), (0, ),
                    ARRAY_DESC_ARTIST_MBTAGS_COUNT, expectedrows=expectedrows * 5)
def opensourcefile(k, filename=None, sourcetype=None, overwrite=False):
    """Open the source term hdf5 file with filename."""
    import tables

    # Set up file for results
    if not filename or not os.path.isdir(os.path.dirname(filename)):
        source_logger.info("File or path to file %s does not exist." % filename)
        date = time.strftime("%Y%m%d%H%M%S")
        filename = os.path.join(os.getcwd(), "src" + date + ".hf5")
        source_logger.info("Saving source results in file " + filename)
    if not sourcetype:
        raise TypeError("Need to specify filename and type of source data to store [int(egrand)|(full)term]!")
    if sourcetype in ["int", "term"]:
        sarrname = "source" + sourcetype
        if _debug:
            source_logger.debug("Source array type: " + sarrname)
    else:
        raise TypeError("Incorrect source type specified!")

    # Check if file exists and set write flags depending on overwrite option
    if os.path.isfile(filename):
        if overwrite:
            source_logger.info("File %s exists and will be overwritten." % filename)
            writeflag = "w"
        else:
            source_logger.info("File %s exists and results will be appended." % filename)
            writeflag = "a"
    else:
        writeflag = "w"

    # Add compression to files and specify good chunkshape
    filters = tables.Filters(complevel=1, complib=configuration.hdf5complib)
    #cshape = (10,10,10) #good mix of t, k, q values

    # Get atom shape for earray
    atomshape = (0, len(k))
    try:
        if _debug:
            source_logger.debug("Trying to open source file " + filename)
        rf = tables.openFile(filename, writeflag, "Source term result")
        if not "results" in rf.root:
            if _debug:
                source_logger.debug("Creating group 'results' in source file.")
            resgrp = rf.createGroup(rf.root, "results", "Results")
        else:
            resgrp = rf.root.results
        if not sarrname in resgrp:
            if _debug:
                source_logger.debug("Creating array '" + sarrname + "' in source file.")
            sarr = rf.createEArray(resgrp, sarrname, tables.ComplexAtom(itemsize=16),
                                   atomshape, filters=filters)
            karr = rf.createEArray(resgrp, "k", tables.Float64Atom(), (0, ), filters=filters)
            narr = rf.createEArray(resgrp, "nix", tables.IntAtom(), (0, ), filters=filters)
            karr.append(k)
        else:
            if _debug:
                source_logger.debug("Source file and node exist. Testing source node shape...")
            sarr = rf.getNode(resgrp, sarrname)
            narr = rf.getNode(resgrp, "nix")
            if sarr.shape[1:] != atomshape[1:]:
                raise ValueError("Source node on file is not correct shape!")
    except IOError:
        raise
    return rf, sarr, narr
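# Hedged usage sketch (assumption): how the handles returned by
# opensourcefile() might be used to append one batch of source-term results.
# append_source_batch is a hypothetical helper; source_rows is a complex array
# of shape (n, len(k)) and nix_values the matching integer time indices.
def append_source_batch(k, source_rows, nix_values, filename=None, sourcetype="term"):
    rf, sarr, narr = opensourcefile(k, filename=filename, sourcetype=sourcetype)
    try:
        sarr.append(source_rows)  # rows of shape (len(k),)
        narr.append(nix_values)   # one time index per appended row
        rf.flush()
    finally:
        rf.close()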
                                                   sequence_features=sequence_features)
    return contexts, features

dataset = TFRecordDataset(files)
dataset = dataset.map(_parse_function)
iterator = dataset.make_one_shot_iterator()

data_shape = (0, )
labels_shape = (0, )
sound_dtype = tables.StringAtom(itemsize=128)
labels_dtype = tables.IntAtom()

data_storage = hdf5_file.create_earray(hdf5_file.root, 'audio_embedding',
                                       sound_dtype, shape=data_shape)
labels_storage = hdf5_file.create_earray(hdf5_file.root, 'labels',
                                         labels_dtype, shape=labels_shape)

value = iterator.get_next()
i = 1
with tf.Session() as sess:
    while 1:
        try:
climb_lik_file = '/RQusagers/dnelson/project/anc_finder/results/BALSAC/CAID_3M_all_anc_out.csv'
max_trees = 1000

print "Loading climbing likelihoods"
climb_liks = np.genfromtxt(climb_lik_file, skip_header=True, delimiter=',',
                           usecols=[1])[:max_trees]

print "Loading trees"
trees = [line.strip() for line in open(tree_anc_file, 'r')][:max_trees]

climb_outfile = os.path.expanduser('~/temp/CAID_climb_1000.h5')
with tables.open_file(climb_outfile, 'w') as f:
    ## Create extendable arrays so we can incrementally write output
    f.create_earray(f.root, 'liks', atom=tables.FloatAtom(), shape=(0, ))
    f.create_earray(f.root, 'ancs', atom=tables.FloatAtom(), shape=(0, ))

    ## Trees, which are variable-length, must be added individually
    f.create_vlarray(f.root, 'trees', atom=tables.IntAtom())
    f.create_vlarray(f.root, 'genotypes', atom=tables.IntAtom())

incremental_write(climb_liks, trees, climb_outfile)

## Store control likelihoods
# control_outfile = os.path.expanduser('~/temp/CAID_control.h5')
#
# init_array(control_outfile, array_name='control_liks')
# with tables.open_file(control_outfile, 'a') as f:
#     f.root.control_liks.append([np.log2(conv_prob)])
def process_filelist_test(filelist=None, model=None, tmpfilename=None,
                          npicks=None, winsize=None, finaldim=None, K=1,
                          typecompress='picks'):
    """
    Main function, process all files in the list (as long as their artist
    is in testartist)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and year for all train songs
       tmpfilename  - where to save our processed features
       npicks       - number of segments to pick per song
       winsize      - size of each segment we pick
       finaldim     - how many values do we keep
       K            - param of KNN (default 1)
       typecompress - feature type, 'picks', 'corrcoeff' or 'cov'
                      must be the same as in training
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_test, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model', model, 'does not exist.'
        return
    # create kdtree
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[1] == finaldim, 'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION')
    output.createEArray(group, 'year_real', tables.IntAtom(shape=()), (0, ), '',
                        expectedrows=len(filelist))
    output.createEArray(group, 'year_pred', tables.Float64Atom(shape=()), (0, ), '',
                        expectedrows=len(filelist))
    # random projection
    ndim = 12  # fixed in this dataset
    if typecompress == 'picks':
        randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim)
    elif typecompress == 'corrcoeff' or typecompress == 'cov':
        randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim)
    elif typecompress == 'avgcov':
        randproj = RANDPROJ.proj_point5(90, finaldim)
    else:
        assert False, 'Unknown type of compression: ' + str(typecompress)
    # go through files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        if cnt_f % 5000 == 0:
            print 'TESTING FILE #' + str(cnt_f)
        # check file
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        year = GETTERS.get_year(h5)
        track_id = GETTERS.get_track_id(h5)
        h5.close()
        if year <= 0:  # probably useless but...
            continue
        if typecompress == 'picks':
            # we have a train artist with a song year, we're good
            bttimbre = get_bttimbre(f)
            if bttimbre is None:
                continue
            # we even have normal features, awesome!
            processed_feats = CBTF.extract_and_compress(bttimbre, npicks, winsize,
                                                        finaldim, randproj=randproj)
        elif typecompress == 'corrcoeff':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.corr_and_compress(timbres, finaldim, randproj=randproj)
        elif typecompress == 'cov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.cov_and_compress(timbres, finaldim, randproj=randproj)
        elif typecompress == 'avgcov':
            h5 = GETTERS.open_h5_file_read(f)
            timbres = GETTERS.get_segments_timbre(h5).T
            h5.close()
            processed_feats = CBTF.avgcov_and_compress(timbres, finaldim, randproj=randproj)
        else:
            assert False, 'Unknown type of compression: ' + str(typecompress)
        if processed_feats is None:
            continue
        if processed_feats.shape[0] == 0:
            continue
        # do prediction
        year_pred = do_prediction(processed_feats, kd, h5model, K)
        # add pred and ground truth to output
        if not year_pred is None:
            output.root.data.year_real.append([year])
            output.root.data.year_pred.append([year_pred])
    # close output and model
    del kd
    h5model.close()
    output.close()
    # done
    return
def initialize_database(self, **kargs):
    """
    Initializes the EventDatabase. Adds a group 'events' with table
    'eventsTable' and matrices 'raw_data', 'levels', and 'level_lengths'.

    :param kargs: Dictionary - includes:
                    -maxEventLength: Maximum number of datapoints for an event to be added.
    """
    if 'maxEventLength' in kargs:
        if kargs['maxEventLength'] > self.max_event_length:
            self.max_event_length = kargs['maxEventLength']
    if 'events' not in self.root:
        self.createGroup(self.root, 'events', 'Events')

    if not 'eventTable' in self.root.events:
        self.createTable(self.root.events, 'eventTable', _Event, 'Event parameters')
        self.event_row = None

    filters = tb.Filters(complib='blosc', complevel=4)
    shape = (0, self.max_event_length)
    a = tb.FloatAtom()
    b = tb.IntAtom()

    if not 'raw_data' in self.root.events:
        self.createEArray(self.root.events, 'raw_data', a, shape=shape,
                          title="Raw data points", filters=filters)

    if not 'levels' in self.root.events:
        self.createEArray(self.root.events, 'levels', a, shape=shape,
                          title="Cusum levels", filters=filters)

    if not 'level_lengths' in self.root.events:
        self.createEArray(self.root.events, 'level_lengths', b, shape=shape,
                          title="Lengths of the cusum levels", filters=filters)

    # Create/init the debug group if needed.
    if 'debug' in kargs and kargs['debug']:
        if not 'debug' in self.root:
            self.createGroup(self.root, 'debug', 'Debug')
        debug_shape = (kargs['n_channels'], kargs['n_points'])
        if not 'data' in self.root.debug:
            self.createCArray(self.root.debug, 'data', a, shape=debug_shape,
                              title="Raw data", filters=filters)
        if not 'baseline' in self.root.debug:
            self.createCArray(self.root.debug, 'baseline', a, shape=debug_shape,
                              title="Baseline data", filters=filters)
        if not 'threshold_positive' in self.root.debug:
            self.createCArray(self.root.debug, 'threshold_positive', a, shape=debug_shape,
                              title="Raw data", filters=filters)
        if not 'threshold_negative' in self.root.debug:
            self.createCArray(self.root.debug, 'threshold_negative', a, shape=debug_shape,
                              title="Raw data", filters=filters)
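# Hedged usage sketch (assumption, not part of the original class): appending
# one detected event to the EArrays created by initialize_database(). Rows are
# padded to max_event_length because an EArray is rectangular along its
# non-extendable axis. append_event is a hypothetical helper and db is assumed
# to be the EventDatabase (a tables.File subclass) itself.
import numpy as np

def append_event(db, raw, levels, level_lengths):
    width = db.max_event_length

    def pad(x, dtype=float):
        row = np.zeros((1, width), dtype=dtype)
        row[0, :len(x)] = x
        return row

    db.root.events.raw_data.append(pad(raw))
    db.root.events.levels.append(pad(levels))
    db.root.events.level_lengths.append(pad(level_lengths, dtype=int))
    db.flush()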
def create_hdf_arrays(file_name, rec_info, electrode_mapping, emg_mapping, file_dir=None):
    '''Creates empty data arrays in hdf5 store for storage of the intan
    recording data.

    Parameters
    ----------
    file_name : str, absolute path to h5 file
    rec_info : dict
        recording info dict provided by blechpy.rawIO.read_recording_info
    electrode_mapping : pandas.DataFrame
        with columns Electrode, Port and Channels
    emg_mapping : pandas.DataFrame
        with columns EMG, Port and Channels (can be empty)
    file_dir : str (optional)
        path to recording directory if h5 is in different folder

    Throws
    ------
    ValueError
        if file_name is not absolute path to file and file_dir is not provided
    '''
    if file_dir is None:
        file_dir = os.path.dirname(file_name)

    if file_dir == '':
        raise ValueError(('Must provide absolute path to file in a recording'
                          'directory or a file_dir argument'))

    if not os.path.isabs(file_name):
        file_name = os.path.join(file_dir, file_name)

    println('Creating empty arrays in hdf5 store for raw data...')
    sys.stdout.flush()
    atom = tables.IntAtom()
    f_atom = tables.Float64Atom()
    with tables.open_file(file_name, 'r+') as hf5:
        # Create array for raw time vector
        hf5.create_earray('/raw', 'amplifier_time', f_atom, (0, ))

        # Create arrays for each electrode
        for idx, row in electrode_mapping.iterrows():
            hf5.create_earray('/raw', 'electrode%i' % row['Electrode'], atom, (0, ))

        # Create arrays for raw emg (if any exist)
        if not emg_mapping.empty:
            for idx, row in emg_mapping.iterrows():
                hf5.create_earray('/raw_emg', 'emg%i' % row['EMG'], atom, (0, ))

        # Create arrays for digital inputs (if any exist)
        if rec_info.get('dig_in'):
            for x in rec_info['dig_in']:
                hf5.create_earray('/digital_in', 'dig_in_%i' % x, atom, (0, ))

        # Create arrays for digital outputs (if any exist)
        if rec_info.get('dig_out'):
            for x in rec_info['dig_out']:
                hf5.create_earray('/digital_out', 'dig_out_%i' % x, atom, (0, ))

    print('Done!')
def generate_per_subject_cache(xls_data, test_split=0.3, validation_split=0.3):
    prevElem = None
    h5file = tb.openFile(FULL_SPECTROGRAM_BY_SUBJECT_CACHE, mode='w', title="All the data")
    root = h5file.root
    first = True
    X_train = None
    Y_train = None
    X_validate = None
    Y_validate = None
    X_test = None
    Y_test = None
    y_append = None
    X_append = None
    for ss_id, regions_of_interest in xls_data.items():
        avi = maybe_get_unique_avi_from_subjectState_id(ss_id)
        if avi:
            # split the subjects into validation, train and test sets
            ran_num = random.uniform(0, 1)
            if ran_num < test_split:
                X_append, y_append = X_test, Y_test
            elif ran_num < test_split + validation_split:
                X_append, y_append = X_validate, Y_validate
            else:
                X_append, y_append = X_train, Y_train
            sw = SubjectVideo(avi)
            for _, timestamp, bpm in regions_of_interest:
                # reduces the memory used - for testing
                # if random.uniform(0,1) < 0.9:
                #     continue
                timestamp = int(timestamp)
                bpm = round(float(bpm))
                try:
                    _, _, Sxx0 = sw.get_spectrogram(timestamp, 4)
                    # workaround this error
                    # max_freq_idx = int((max_freqency / f[-1]) * Sxx.shape[0])
                    # IndexError: index -1 is out of bounds for axis 0 with size 0
                except IndexError:
                    print("Something went wrong for avi, timestamp ", avi, timestamp)
                    continue
                elem = np.array([Sxx0])
                if prevElem is not None and elem.shape != prevElem.shape:
                    print("skipping " + str(avi) + " " + ss_id + " due to incorrect shape")
                    continue
                prevElem = elem
                # store the elem to disk
                if first:
                    first = False
                    a = tb.Atom.from_dtype(np.dtype('Float32'))
                    data_shape = tuple([0] + list(elem.shape))
                    X_train = h5file.create_earray(root, 'X_train', a, data_shape, "X_train")
                    X_test = h5file.create_earray(root, 'X_test', a, data_shape, "X_test")
                    X_validate = h5file.create_earray(root, 'X_validate', a, data_shape, "X_validate")
                    Y_train = h5file.create_earray(root, 'Y_train', tb.IntAtom(), (0, ), "Y_train")
                    Y_test = h5file.create_earray(root, 'Y_test', tb.IntAtom(), (0, ), "Y_test")
                    Y_validate = h5file.create_earray(root, 'Y_validate', tb.IntAtom(), (0, ), "Y_validate")
                    X_append, y_append = X_test, Y_test
                X_append.append(np.array([elem]))
                y_append.append([bpm])
                h5file.flush()
            print("converted " + str(avi) + " for " + X_append.title + " set")
        else:
            print("Skipping " + ss_id + " because it isn't unique")
    h5file.close()
def bpm_to_data(data, train_split=0.9):
    try:
        return readh5File(SPECTROGRAM_CACHE)
    except IOError:
        pass
    pattern = re.compile(".*vp_(\\d+)_(\\d+)_.*")
    prevElem = None
    # limit = 250
    h5file = tb.openFile(SPECTROGRAM_CACHE, mode='w', title="All the data")
    root = h5file.root
    first = True
    X_train = None
    Y_train = None
    X_test = None
    Y_test = None
    for wavFile in iterateThroughWav():
        m = pattern.match(wavFile)
        subjectId = int(m.group(1))
        stateId = int(m.group(2))
        sw = SubjectWav(wavFile)
        subjectStateId = str(subjectId) + "_" + str(stateId).zfill(2)
        try:
            for _, timestamp, bpm in data[subjectStateId]:
                # reduces the memory used
                # if random.uniform(0,1) < 0.9:
                #     continue
                timestamp = int(timestamp)
                bpm = round(float(bpm))
                _, _, Sxx0 = sw.get_spectrogram(timestamp, 4)
                # print(Sxx0.shape)  # (651,154) (801,219)
                elem = np.array([Sxx0])
                if prevElem is not None and elem.shape != prevElem.shape:
                    print("skipping " + str(wavFile) + " " + subjectStateId + " due to incorrect shape")
                    continue
                prevElem = elem
                # store the elem to disk
                if first:
                    first = False
                    a = tb.Atom.from_dtype(np.dtype('Float32'))
                    data_shape = tuple([0] + list(elem.shape))
                    X_train = h5file.create_earray(root, 'X_train', a, data_shape, "X_train")
                    X_test = h5file.create_earray(root, 'X_test', a, data_shape, "X_test")
                    Y_train = h5file.create_earray(root, 'Y_train', tb.IntAtom(), (0, ), "Y_train")
                    # code.interact(local=locals())
                    Y_test = h5file.create_earray(root, 'Y_test', tb.IntAtom(), (0, ), "Y_test")
                if random.uniform(0, 1) < 0.9:
                    X_train.append(np.array([elem]))
                    Y_train.append([bpm])
                else:
                    X_test.append(np.array([elem]))
                    Y_test.append([bpm])
                h5file.flush()
        except KeyError:
            print("can not find: " + subjectStateId + ".")
            pass
        print("converted " + str(wavFile))
    # "Could not broadcast" errors mean that not all elements of X_train have
    # the same shape, usually meaning there is something wrong with the files
    h5file.close()
    # data = (X_train, Y_train), (X_test, Y_test)
    # write_cache(SPECTROGRAM_CACHE, data)
    return readh5File(SPECTROGRAM_CACHE)
def full_bpm_to_data(data, train_split=0.9):
    try:
        return readh5File(FULL_SPECTROGRAM_CACHE)
    except IOError:
        pass
    prevElem = None
    h5file = tb.openFile(FULL_SPECTROGRAM_CACHE, mode='w', title="All the data")
    root = h5file.root
    first = True
    X_train = None
    Y_train = None
    X_test = None
    Y_test = None
    for ss_id, regions_of_interest in data.items():
        avi = maybe_get_unique_avi_from_subjectState_id(ss_id)
        if avi:
            sw = SubjectVideo(avi)
            for _, timestamp, bpm in regions_of_interest:
                # reduces the memory used
                # if random.uniform(0,1) < 0.9:
                #     continue
                timestamp = int(timestamp)
                bpm = round(float(bpm))
                try:
                    _, _, Sxx0 = sw.get_spectrogram(timestamp, 4)
                    # workaround this error
                    # max_freq_idx = int((max_freqency / f[-1]) * Sxx.shape[0])
                    # IndexError: index -1 is out of bounds for axis 0 with size 0
                except IndexError:
                    print("Something went wrong for avi, timestamp ", avi, timestamp)
                    continue
                elem = np.array([Sxx0])
                if prevElem is not None and elem.shape != prevElem.shape:
                    print("skipping " + str(avi) + " " + ss_id + " due to incorrect shape")
                    continue
                prevElem = elem
                # store the elem to disk
                if first:
                    first = False
                    a = tb.Atom.from_dtype(np.dtype('Float32'))
                    data_shape = tuple([0] + list(elem.shape))
                    X_train = h5file.create_earray(root, 'X_train', a, data_shape, "X_train")
                    X_test = h5file.create_earray(root, 'X_test', a, data_shape, "X_test")
                    Y_train = h5file.create_earray(root, 'Y_train', tb.IntAtom(), (0, ), "Y_train")
                    Y_test = h5file.create_earray(root, 'Y_test', tb.IntAtom(), (0, ), "Y_test")
                if random.uniform(0, 1) < train_split:
                    X_train.append(np.array([elem]))
                    Y_train.append([bpm])
                else:
                    X_test.append(np.array([elem]))
                    Y_test.append([bpm])
                h5file.flush()
            print("converted " + str(avi))
        else:
            print("Skipping " + ss_id + " because it isn't unique")
    h5file.close()
    return readh5File(FULL_SPECTROGRAM_CACHE)
def train(nthreads, maindir, output, testartists, npicks, winsize, finaldim,
          trainsongs=None, typecompress='picks'):
    """
    Main function to do the training
    Do the main pass with the number of given threads.
    Then, reads the tmp files, creates the main output, delete the tmpfiles.
    INPUT
      - nthreads     - number of threads to use
      - maindir      - dir of the MSD, where to find song files
      - output       - main model, contains everything to perform KNN
      - testartists  - set of artists to ignore
      - npicks       - number of samples to pick per song
      - winsize      - window size (in beats) of a sample
      - finaldim     - final dimension of the sample, something like 5?
      - trainsongs   - list of songs to use for training
      - typecompress - 'picks', 'corrcoeff' or 'cov'
    RETURN
      - nothing
    """
    # sanity checks
    if os.path.isfile(output):
        print 'ERROR: file', output, 'already exists.'
        return
    # initial time
    t1 = time.time()
    # do main pass
    tmpfiles = process_filelist_train_main_pass(nthreads, maindir, testartists,
                                                npicks, winsize, finaldim,
                                                trainsongs=trainsongs,
                                                typecompress=typecompress)
    if tmpfiles is None:
        print 'Something went wrong, tmpfiles are None'
        return
    # intermediate time
    t2 = time.time()
    stimelen = str(datetime.timedelta(seconds=t2 - t1))
    print 'Main pass done after', stimelen
    sys.stdout.flush()
    # find approximate number of rows per tmpfiles
    h5 = tables.openFile(tmpfiles[0], 'r')
    nrows = h5.root.data.year.shape[0] * len(tmpfiles)
    h5.close()
    # create output
    output = tables.openFile(output, mode='a')
    group = output.createGroup("/", 'data', 'KNN MODEL FILE FOR YEAR RECOGNITION')
    output.createEArray(group, 'feats', tables.Float64Atom(shape=()), (0, finaldim),
                        'feats', expectedrows=nrows)
    output.createEArray(group, 'year', tables.IntAtom(shape=()), (0, ),
                        'year', expectedrows=nrows)
    output.createEArray(group, 'track_id', tables.StringAtom(18, shape=()), (0, ),
                        'track_id', expectedrows=nrows)
    # aggregate temp files
    for tmpf in tmpfiles:
        h5 = tables.openFile(tmpf)
        output.root.data.year.append(h5.root.data.year[:])
        output.root.data.track_id.append(h5.root.data.track_id[:])
        output.root.data.feats.append(h5.root.data.feats[:])
        h5.close()
        # delete tmp file
        os.remove(tmpf)
    # close output
    output.close()
    # final time
    t3 = time.time()
    stimelen = str(datetime.timedelta(seconds=t3 - t1))
    print 'Whole training done after', stimelen
    # done
    return