np.random.seed(args.seed)

# directories for checkpoint, images and log files
save_models_folder = wd + '/output/saved_models/'
os.makedirs(save_models_folder, exist_ok=True)
save_logs_folder = wd + '/output/saved_logs/'
os.makedirs(save_logs_folder, exist_ok=True)

###########################################################################################################
# Data
###########################################################################################################
# data loader
data_filename = args.data_path + '/RC-49_' + str(args.img_size) + 'x' + str(args.img_size) + '.h5'
hf = h5py.File(data_filename, 'r')
labels = hf['labels'][:]
labels = labels.astype(float)  # np.float is deprecated; the builtin float gives the same float64
images = hf['images'][:]
hf.close()
N_all = len(images)
assert len(images) == len(labels)

q1 = args.min_label
q2 = args.max_label
indx = np.where((labels > q1) & (labels < q2))[0]
labels = labels[indx]
images = images[indx]
assert len(labels) == len(images)

# normalize to [0, 1]
###
### In this section we will create a new HDF5 file with a dataset and attributes.

# Import the h5py and NumPy packages
import h5py
import numpy as np

# Set the array length we use for our dataset.
array_len = 1500

# Let's create a new HDF5 file called myfile.h5. Set its access mode such that we
# will be able to write data into it.
fid = h5py.File('myfile.h5', 'w')

###
### PART 2 - Dataset Creation

# Let's add our first dataset: a 1D array consisting of array_len random floats.
# Start by creating a NumPy array of random floats.
input_data = np.random.rand(array_len)

# Let's define a dataset. We start by constructing an empty dataset with enough space
# to hold our input_data array.
dset = fid.create_dataset("RandomData", (array_len,), dtype='f')
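# (A hedged continuation sketch, not part of the original tutorial text: it only
# reuses the fid/dset/input_data names defined above to finish the stated goal of
# writing the dataset and attaching attributes.)

# Copy the random values into the dataset we just allocated.
dset[:] = input_data

# Attach a couple of attributes (small named pieces of metadata) to the dataset.
dset.attrs['description'] = 'uniform random floats'
dset.attrs['array_len'] = array_len

# Close the file so everything is flushed to disk.
fid.close()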
def create( cool_uri, bins, pixels, columns=None, dtypes=None, metadata=None, assembly=None, symmetric_upper=True, mode=None, h5opts=None, boundscheck=True, triucheck=True, dupcheck=True, ensure_sorted=False, lock=None, append=False, append_scool=False, scool_root_uri=None, **kwargs ): """ Create a new Cooler. Deprecated parameters --------------------- chromsizes : Series Chromsizes are now inferred from ``bins``. append : bool, optional Append new Cooler to the file if it exists. If False, an existing file with the same name will be truncated. Default is False. Use the ``mode`` argument instead. dtype : dict, optional Dictionary mapping column names in the pixel table to dtypes. Use the ``dtypes`` argument instead. """ file_path, group_path = parse_cooler_uri(cool_uri) if mode is None: mode = "a" if append else "w" h5opts = _set_h5opts(h5opts) if not isinstance(bins, pd.DataFrame): raise ValueError( "Second positional argument must be a pandas DataFrame. " "Note that the `chromsizes` argument is now deprecated: " "see documentation for `create`." ) if append_scool == True and scool_root_uri is None: raise ValueError( "If the parameter `append_scool` is set, the parameter `scool_root_uri` must be defined." ) dtypes = _get_dtypes_arg(dtypes, kwargs) for col in ["chrom", "start", "end"]: if col not in bins.columns: raise ValueError("Missing column from bin table: '{}'.".format(col)) # Populate expected pixel column names. Include user-provided value # columns. if columns is None: columns = ["bin1_id", "bin2_id", "count"] else: columns = list(columns) for col in ["bin1_id", "bin2_id"]: # don't include count! if col not in columns: columns.insert(0, col) # Populate dtypes for expected pixel columns, and apply user overrides. if dtypes is None: dtypes = dict(PIXEL_DTYPES) else: dtypes_ = dict(dtypes) dtypes = dict(PIXEL_DTYPES) dtypes.update(dtypes_) # Get empty "meta" header frame (assigns the undeclared dtypes). # Any columns from the input not in meta will be ignored. meta = get_meta(columns, dtypes, default_dtype=float) # Determine the appropriate iterable try: from dask.dataframe import DataFrame as dask_df except (ImportError, AttributeError): # pragma: no cover dask_df = () if isinstance(pixels, dask_df): iterable = map(lambda x: x.compute(), pixels.to_delayed()) input_columns = infer_meta(pixels).columns elif isinstance(pixels, pd.DataFrame): iterable = (pixels,) input_columns = infer_meta(pixels).columns elif isinstance(pixels, dict): iterable = (pixels,) input_columns = infer_meta([(k, v.dtype) for (k, v) in pixels.items()]).columns else: iterable = pixels input_columns = None # If possible, ensure all expected columns are available if input_columns is not None: for col in columns: if col not in input_columns: col_type = "Standard" if col in PIXEL_FIELDS else "User" raise ValueError( "{} column not found in input: '{}'".format(col_type, col) ) # Prepare chroms and bins bins = bins.copy() bins["chrom"] = bins["chrom"].astype(object) chromsizes = get_chromsizes(bins) try: chromsizes = six.iteritems(chromsizes) except AttributeError: pass chromnames, lengths = zip(*chromsizes) chroms = pd.DataFrame( {"name": chromnames, "length": lengths}, columns=["name", "length"] ) binsize = get_binsize(bins) n_chroms = len(chroms) n_bins = len(bins) if not symmetric_upper and triucheck: warnings.warn( "Creating a non-symmetric matrix, but `triucheck` was set to " "True. Changing to False." 
) triucheck = False # Chain input validation to the end of the pipeline if boundscheck or triucheck or dupcheck or ensure_sorted: validator = validate_pixels( n_bins, boundscheck, triucheck, dupcheck, ensure_sorted ) iterable = map(validator, iterable) # Create root group with h5py.File(file_path, mode) as f: logger.info('Creating cooler at "{}::{}"'.format(file_path, group_path)) if group_path == "/": for name in ["chroms", "bins", "pixels", "indexes"]: if name in f: del f[name] else: try: f.create_group(group_path) except ValueError: del f[group_path] f.create_group(group_path) # Write chroms, bins and pixels if append_scool: src_path, src_group = parse_cooler_uri(scool_root_uri) dst_path, dst_group = parse_cooler_uri(cool_uri) with h5py.File(src_path, "r+") as src, h5py.File(dst_path, "r+") as dst: dst[dst_group]["chroms"] = src["chroms"] # hard link to root bins table, but only the three main datasets dst[dst_group]["bins/chrom"] = src["bins/chrom"] dst[dst_group]["bins/start"]= src["bins/start"] dst[dst_group]["bins/end"]= src["bins/end"] # create per cell the additional columns e.g. 'weight' # these columns are individual for each cell columns = list(bins.keys()) for col in ["chrom", "start", "end"]: columns.remove(col) if columns: put(dst[dst_group]['bins'], bins[columns]) with h5py.File(file_path, "r+") as f: h5 = f[group_path] grp = h5.create_group("pixels") if symmetric_upper: max_size = n_bins * (n_bins - 1) // 2 + n_bins else: max_size = n_bins * n_bins prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts) else: with h5py.File(file_path, "r+") as f: h5 = f[group_path] logger.info("Writing chroms") grp = h5.create_group("chroms") write_chroms(grp, chroms, h5opts) logger.info("Writing bins") grp = h5.create_group("bins") write_bins(grp, bins, chroms["name"], h5opts) grp = h5.create_group("pixels") if symmetric_upper: max_size = n_bins * (n_bins - 1) // 2 + n_bins else: max_size = n_bins * n_bins prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts) # Multiprocess HDF5 reading is supported only if the same HDF5 file is not # open in write mode anywhere. To read and write to the same file, pass a # lock shared with the HDF5 reading processes. `write_pixels` will acquire # it and open the file for writing for the duration of each write step # only. After it closes the file and releases the lock, the reading # processes will have to re-acquire the lock and re-open the file to obtain # the updated file state for reading. logger.info("Writing pixels") target = posixpath.join(group_path, "pixels") nnz, ncontacts = write_pixels( file_path, target, meta.columns, iterable, h5opts, lock ) # Write indexes with h5py.File(file_path, "r+") as f: h5 = f[group_path] logger.info("Writing indexes") grp = h5.create_group("indexes") chrom_offset = index_bins(h5["bins"], n_chroms, n_bins) bin1_offset = index_pixels(h5["pixels"], n_bins, nnz) write_indexes(grp, chrom_offset, bin1_offset, h5opts) logger.info("Writing info") info = {} info["bin-type"] = u"fixed" if binsize is not None else u"variable" info["bin-size"] = binsize if binsize is not None else u"null" info["storage-mode"] = u"symmetric-upper" if symmetric_upper else u"square" info["nchroms"] = n_chroms info["nbins"] = n_bins info["sum"] = ncontacts info["nnz"] = nnz if assembly is not None: info["genome-assembly"] = assembly if metadata is not None: info["metadata"] = metadata write_info(h5, info)
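# A minimal sketch (under stated assumptions, not cooler's actual code) of the
# locking protocol described in the comment above write_pixels: readers and the
# writer share one lock, and each side opens the HDF5 file only while holding it.
import multiprocessing
import h5py

lock = multiprocessing.Lock()  # shared with every reader process

def locked_write(path, dset_name, start, values):
    # Writer: open for writing only for the duration of the write step.
    with lock:
        with h5py.File(path, "r+") as f:
            f[dset_name][start:start + len(values)] = values

def locked_read(path, dset_name, sl):
    # Reader: re-acquire the lock and re-open the file to see the updated state.
    with lock:
        with h5py.File(path, "r") as f:
            return f[dset_name][sl]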
(This script is not generally useful for most ilastik users or developers.)

Input: hdf5 volume
Output: directory of .png tiles representing the volume.
"""
if __name__ == "__main__":
    import sys
    import h5py
    import logging
    import argparse
    from lazyflow.utility import PathComponents, export_to_tiles

    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    # Usage: python make_tiles.py --tile_size=250 /path/to/my_vol.h5/some/dataset /path/to/output_dir
    parser = argparse.ArgumentParser()
    parser.add_argument('--tile_size', type=int)
    parser.add_argument('hdf5_dataset_path')
    parser.add_argument('output_dir')

    parsed_args = parser.parse_args(sys.argv[1:])

    path_comp = PathComponents(parsed_args.hdf5_dataset_path)
    with h5py.File(path_comp.externalPath, 'r') as input_file:
        vol_dset = input_file[path_comp.internalPath]
        export_to_tiles(vol_dset, parsed_args.tile_size, parsed_args.output_dir)
>>>>> Dependencies: <<<<<
1. ASTRA toolbox: conda install -c astra-toolbox astra-toolbox
2. tomobar: conda install -c dkazanc tomobar or install from https://github.com/dkazanc/ToMoBAR

@author: Daniil Kazantsev, e-mail: [email protected]
GPLv3 license (ASTRA toolbox)
"""
#import timeit
import matplotlib.pyplot as plt
import numpy as np
import h5py
from ccpi.supp.qualitymetrics import QualityTools

# loading the data
h5f = h5py.File('data/TomoSim_data1550671417.h5', 'r')
phantom = h5f['phantom'][:]
projdata_norm = h5f['projdata_norm'][:]
proj_angles = h5f['proj_angles'][:]
h5f.close()

[Vert_det, AnglesNum, Horiz_det] = np.shape(projdata_norm)
N_size = Vert_det

sliceSel = 128
#plt.gray()
plt.figure()
plt.subplot(131)
plt.imshow(phantom[sliceSel,:,:], vmin=0, vmax=1)
plt.title('3D Phantom, axial view')
def __init__(self, path, filename, idenselect=[], train=True, transform=None, ): if os.path.isdir(path) is not True: raise ValueError('Path {} is not directory'.format(path)) self.path = path self.filename = filename dir = os.path.join(path, filename + '.mat' ) f = h5py.File(dir) self.data = np.array(f["data"]) self.points = np.array(f["points"]) self.imsize = np.array(f["imsize"])[:,0].astype(int) self.iactor = np.array(f["iactor"])[0,:].astype(int) self.labels = np.array(f["iclass"])[0,:].astype(int) - 1 self.name = np.array(f["name"]) self.num = np.array(f["num"])[0,0].astype(int) # Emotions class if filename == 'ck' or filename == 'ckp': #classes = ['Neutral - NE', 'Anger - AN', 'Contempt - CO', 'Disgust - DI', 'Fear - FR', 'Happiness - HA', 'Sadness - SA', 'Surprise - SU'] toferp = [0, 4, 7, 5, 6, 1, 3, 2 ] elif filename=='bu3dfe' or filename=='jaffe': #classes = ['Neutral - NE', 'Anger - AN', 'Disgust - DI', 'Fear - FR', 'Happiness - HA', 'Sadness - SA', 'Surprise - SU', 'Contempt - CO'] toferp = [0, 4, 5, 6, 1, 3, 2, 7 ] else: assert(False) #self.toferp = toferp #self.classes = classes #self.class_to_idx = { _class: i for i, _class in enumerate(classes) } self.labels = np.array([ toferp[l] for l in self.labels ]) self.numclass = len(np.unique(self.labels)) index = np.ones( (self.num,1) ) actors = np.unique(self.iactor) for i in idenselect: index[self.iactor == actors[i]] = 0 self.indexs = np.where(index == train)[0] self.transform = transform # ###### # index_nne = [] # for idx in self.indexs: # if( self.labels[ idx ] != 0 ): # index_nne.append( idx ) # self.indexs = np.array(index_nne) # ####### self.labels_org = self.labels self.labels = self.labels[ self.indexs ] self.classes = [self.classes[ i ] for i in np.unique( self.labels ) ] self.labels = self.labels ### - 1 ############ self.numclass = len(self.classes) self.index = 0
print(args.restore_model,args.restore_predictor) print() print('Save locations') print(args.save_model,args.save_predictor) print() # Which array to convert from categorical to residue letter if args.encoding == 'categorical': ORDER = cst.ORDER_CATEGORICAL CATEGORIES = cst.CATEGORIES elif args.encoding == 'blosum': ORDER = cst.ORDER_BLOSUM CATEGORIES = cst.BLOSUM ### Collect data f = h5py.File('/projects/ml/flu/fludb_data/processed_data_525916981168.h5','r') train_labels_dataset = f['train_labels'] train_labels = train_labels_dataset[()] valid_labels_dataset = f['valid_labels'] valid_labels = valid_labels_dataset[()] test_labels_dataset = f['test_labels'] test_labels = test_labels_dataset[()] if args.encoding == 'categorical': train_sequences_dataset = f['train_sequences_categorical'] train_sequences = train_sequences_dataset[()] valid_sequences_dataset = f['valid_sequences_categorical']
def createVSB100(db_settings, logger): ''' This method creates the database needed for caffe. ''' action = 'vw_commercial' database_path = db_settings['database_path'] features_path = db_settings['features_path'] #'/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_features.mat' video_info_path = db_settings['video_info_path'] #'/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_vidinfo.mat' #database_path = '/cs/vml2/mkhodaba/cvpr16/datasets/VSB100/databases/{action_name}.h5' #features_path = '/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_features.mat' #video_info_path = '/cs/vml3/mkhodaba/cvpr16/Graph_construction/Features/{action_name}_vidinfo.mat' features_path = features_path.format(action_name=action) video_info_path = video_info_path.format(action_name=action) database_path = database_path.format(action_name=action) neighbors_num = db_settings['number_of_neighbors'] neighbor_frames_num = db_settings['neighbor_frames_num'] from scipy.io import loadmat import numpy as np from scipy.spatial import cKDTree from random import randint from sklearn.preprocessing import StandardScaler try: features = loadmat(features_path)['features'] #number_of_frames x number_of_supervoxels_per_frame x feature_length except: import h5py features = h5py.File(features_path) print features.keys() video_info = loadmat(video_info_path) #video_info = [mapped, labelledlevelvideo, numberofsuperpixelsperframe] #mapped -> #number_of_frames x number_of_supervoxels_per_frame #labelledlevelvideo -> height x width x number_of_frames #framebelong -> total_number_of_super_pixels x 1 #labelsatframe -> total_number_of_super_pixels x 1 kdtrees = [] labelledlevelvideo = video_info['labelledlevelvideo'] numberofsuperpixelsperframe = video_info['numberofsuperpixelsperframe'] numberofsuperpixelsperframe = numberofsuperpixelsperframe[0] print features.shape frames_num = len(features) superpixels_num = len(features[0]) #per frame feature_len = len(features[0][0]) print features[0][0][1:50] normalize_data = False if normalize_data: features_normalized = np.zeros((np.sum(numberofsuperpixelsperframe), feature_len)) print features_normalized.shape idx = 0 for f in xrange(frames_num): for s in xrange(numberofsuperpixelsperframe[f]): features_normalized[idx][...] = features[f][s][...] idx += 1 clf = StandardScaler() features_normalized_2 = clf.fit_transform(features_normalized) idx = 0 for f in xrange(frames_num): for s in xrange(numberofsuperpixelsperframe[f]): features[f][s][...] = features_normalized_2[idx][...] 
idx +=1 print features[0][0][1:50] print features.shape print frames_num, superpixels_num, feature_len print numberofsuperpixelsperframe #centers[f][i] -> h,w of center centers = np.zeros((frames_num, superpixels_num, 2)) #[[[0.0,0.0] for i in xrange(superpixels_num)] for j in xrange(frames_num)] #frames_num x superpixels_num x 2 pixels_count = [[0 for i in xrange(superpixels_num)] for j in xrange(frames_num)] #frames_num x superpixels_num height = len(labelledlevelvideo) width = len(labelledlevelvideo[0]) logger.log('Computing centers of superpixels ...') for f in xrange(frames_num): logger.log('Frame %d' % f) for h in xrange(height): for w in xrange(width): try: idx = labelledlevelvideo[h][w][f]-1 except: print h, w, f raise centers[f][idx][0] += h centers[f][idx][1] += w pixels_count[f][idx] += 1 for i in xrange(numberofsuperpixelsperframe[f]): centers[f][i][0] /= pixels_count[f][i] centers[f][i][1] /= pixels_count[f][i] logger.log('Building kdtree') kdtree = cKDTree(np.array(centers[f][:numberofsuperpixelsperframe[f]])) kdtrees.append(kdtree) framebelong = video_info['framebelong'] print framebelong.shape labelsatframe = video_info['labelsatframe'] target_superpixel_num = 0 for f in xrange(neighbor_frames_num, frames_num-neighbor_frames_num): target_superpixel_num += numberofsuperpixelsperframe[f] n = target_superpixel_num #len(framebelong) superpixel_skip_num = 0 n_neg = 10 for f in xrange(neighbor_frames_num): superpixel_skip_num += numberofsuperpixelsperframe[f] data = {'target':np.zeros((n*n_neg, feature_len)), 'negative':np.zeros((n*n_neg, feature_len))} #data = {'target':np.zeros((n, feature_len)), 'negative':np.zeros((n, feature_len))} #Tracer()() total_number_of_neighbors = neighbors_num * (2*neighbor_frames_num+1) total_number_of_neighbors = neighbors_num * (2*neighbor_frames_num+1) for i in range(total_number_of_neighbors): data['neighbor{0}'.format(i)] = np.zeros((n*n_neg, feature_len)) #data['neighbor{0}'.format(i)] = np.zeros((n, feature_len)) superpixel_idx = -1 logger.log('Creating the database of superpixels:features') for f in xrange(neighbor_frames_num, frames_num-neighbor_frames_num): #TODO: start from a frame that has at least neighbor_frames_num number of frames before it logger.log('Frame %d' % f) logger.log('There are %d superpixels in in this frame' % numberofsuperpixelsperframe[f]) for i in xrange(numberofsuperpixelsperframe[f]): superpixel_idx += 1 assert f == framebelong[superpixel_idx+superpixel_skip_num]-1, 'Something went wrong in mapping superpixel index to frames/label at frame (1)' assert i == labelsatframe[superpixel_idx+superpixel_skip_num]-1, 'Something went wrong in mapping superpixel index to frames/label at frame (2)' data['target'][superpixel_idx*n_neg:(superpixel_idx + 1)*n_neg][...] = features[f][i][...] #data['target'][superpixel_idx][...] = features[f][i][...] center = centers[f][i] frame_start = max(0, f-neighbor_frames_num) frame_end = min(frames_num, f+neighbor_frames_num) neighbor_idx = 0 #print frame_start, frame_end for target_frame in xrange(frame_start, frame_end+1): if f == target_frame: nearest_neighbors = kdtrees[target_frame].query(center, neighbors_num+1)[1] # Added one to the neighbors because the target itself is included nearest_neighbors = nearest_neighbors[1:] else: nearest_neighbors = kdtrees[target_frame].query(center, neighbors_num)[1] for idx in nearest_neighbors: #data['neighbor{0}'.format(neighbor_idx)][superpixel_idx*n_neg:(superpixel_idx + 1)*n_neg][...] = features[target_frame][idx][...] 
data['neighbor{0}'.format(neighbor_idx)][superpixel_idx][...] = features[target_frame][idx][...] neighbor_idx += 1 assert neighbor_idx == total_number_of_neighbors, "Number of neighbors doesn't match ( %d != %d )" % (neighbor_idx, total_number_of_neighbors) #TODO: print "Random frame ... (Warning: if it's taknig too long stop it! \n Apparantly, the number of neighboring frames are relatively large \n with respect to the number of video frames)" # frame_random = randint(0, frames_num-1) # while frame_end-frame_start < 0.5*frames_num and frame_start <= frame_random <= frame_end: # frame_random = randint(0, frames_num-1) # idx_random = randint(0, numberofsuperpixelsperframe[ frame_random]-1) # data['negative'][superpixel_idx][...] = features[frame_random][idx_random][...] nearest_neighbors = kdtrees[f].query(center, 5*neighbors_num+n_neg)[1] #nearest_neighbors = kdtrees[f].query(center, 5*neighbors_num)[1] #It's the nearest of farthest superpixels to this one idx_random = nearest_neighbors[-1] if i == 10: print 'f, i, superpixel_idx, idx_random', f, i, superpixel_idx, idx_random #data['negative'][superpixel_idx][...] = features[f][idx_random][...] for j in xrange(n_neg): idx_random = nearest_neighbors[-j] data['negative'][superpixel_idx*n_neg + j][...] = features[f][idx_random][...] assert superpixel_idx+1 == target_superpixel_num, "Total number of superpixels doesn't match (%d != %d)" % (superpixel_idx, target_superpixel_num) db_path = database_path.format(action_name=action) print db_path database = DB(db_path) for name, datum in data.iteritems(): database.save(datum, name) database.close() #Creating the database for extracting the final representations. It just needs to have the targets nothing else. # n = len(framebelong) # print 'n', n # data = {'target':np.zeros((n*n_neg, feature_len)), 'negative':np.zeros((n*n_neg, feature_len))} # total_number_of_neighbors = neighbors_num * (2*neighbor_frames_num+1) # for i in range(total_number_of_neighbors): # data['neighbor{0}'.format(i)] = np.zeros((n*n_neg, feature_len)) # superpixel_idx = 0 # for f in xrange(1,frames_num-1): # for i in xrange(numberofsuperpixelsperframe[f]): # try: # data['target'][superpixel_idx*n_neg:(superpixel_idx + 1)*n_neg][...] = features[f][i][...] # except: # print superpixel_idx, f, i # raise # superpixel_idx +=1 # database_path = db_settings['database_path'] # db_path = database_path.format(action_name=(action+'_test')) # print 'test db path', db_path # database = DB(db_path) # for name, datum in data.iteritems(): # database.save(datum, name) # database.close() # write_db_list(db_settings, logger) logger.log('Creating database Done!')
def slice_templates(params, to_remove=[], to_merge=[], extension=''): import shutil, h5py file_out_suff = params.get('data', 'file_out_suff') data_file = params.data_file N_e = params.getint('data', 'N_e') N_total = params.nb_channels N_t = params.getint('detection', 'N_t') template_shift = params.getint('detection', 'template_shift') if comm.rank == 0: print_and_log(['Node 0 is slicing templates'], 'debug', logger) old_templates = load_data(params, 'templates') old_limits = load_data(params, 'limits') x, N_tm = old_templates.shape norm_templates = load_data(params, 'norm-templates') if to_merge != []: for count in xrange(len(to_merge)): remove = to_merge[count][1] to_remove += [remove] all_templates = set(numpy.arange(N_tm // 2)) to_keep = numpy.array(list(all_templates.difference(to_remove))) positions = numpy.arange(len(to_keep)) local_keep = to_keep[positions] templates = scipy.sparse.lil_matrix((N_e * N_t, 2 * len(to_keep)), dtype=numpy.float32) hfile = h5py.File(file_out_suff + '.templates-new.hdf5', 'w', libver='latest') norms = hfile.create_dataset('norms', shape=(2 * len(to_keep), ), dtype=numpy.float32, chunks=True) limits = hfile.create_dataset('limits', shape=(len(to_keep), 2), dtype=numpy.float32, chunks=True) for count, keep in zip(positions, local_keep): templates[:, count] = old_templates[:, keep] templates[:, count + len(to_keep)] = old_templates[:, keep + N_tm // 2] norms[count] = norm_templates[keep] norms[count + len(to_keep)] = norm_templates[keep + N_tm // 2] if to_merge == []: new_limits = old_limits[keep] else: subset = numpy.where(to_merge[:, 0] == keep)[0] if len(subset) > 0: idx = numpy.unique(to_merge[subset].flatten()) ratios = norm_templates[idx] / norm_templates[keep] new_limits = [ numpy.min(ratios * old_limits[idx][:, 0]), numpy.max(ratios * old_limits[idx][:, 1]) ] else: new_limits = old_limits[keep] limits[count] = new_limits templates = templates.tocoo() hfile.create_dataset('temp_x', data=templates.row) hfile.create_dataset('temp_y', data=templates.col) hfile.create_dataset('temp_data', data=templates.data) hfile.create_dataset('temp_shape', data=numpy.array([N_e, N_t, 2 * len(to_keep)], dtype=numpy.int32)) hfile.close() if os.path.exists(file_out_suff + '.templates%s.hdf5' % extension): os.remove(file_out_suff + '.templates%s.hdf5' % extension) shutil.move(file_out_suff + '.templates-new.hdf5', file_out_suff + '.templates%s.hdf5' % extension) comm.Barrier()
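# A sketch (not the project's own loader) of how the sparse template matrix
# written above can be read back: temp_x/temp_y/temp_data hold the COO triplets
# and temp_shape holds [N_e, N_t, n_templates]. `templates_file` is a placeholder
# for file_out_suff + '.templates.hdf5'.
import h5py
import scipy.sparse

def load_sliced_templates(templates_file):
    with h5py.File(templates_file, 'r') as f:
        rows = f['temp_x'][:]
        cols = f['temp_y'][:]
        vals = f['temp_data'][:]
        n_e, n_t, n_tm = f['temp_shape'][:]
    # Rebuild the (N_e * N_t, n_templates) sparse matrix from its COO triplets.
    return scipy.sparse.csc_matrix((vals, (rows, cols)), shape=(n_e * n_t, n_tm))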
# positionArray = [[0, 0]*T]*len(theta_1)
# for i in range(0, len(theta_1)):
#     positionArray[i] = ForwardModel(timeArray, [theta_1[i], theta_2[i]], state0)
#     print(i)
# positionArray = np.array(positionArray)
# xData = np.hstack(positionArray[:,:,0])
# yData = np.hstack(positionArray[:,:,1])

# hdf5file2 = h5py.File('predictiveValues.h5', 'a')
# WriteData(hdf5file2, 'data/predictiveX', xData)
# WriteData(hdf5file2, 'data/predictiveY', yData)

# fig = MakeFigure(450, 1)
# ax = plt.gca()
# #ax.set_title('Iceberg Predictive Model', fontsize = 12)
# ax.set_xlabel('Latitude (deg)', fontsize = 30)
# ax.set_ylabel('Longitude (deg)', fontsize = 30)
# hist = ax.hist2d(xData, yData, normed=True, bins = (500,500), cmap = plt.cm.viridis)
# #plt.colorbar(hist[3], ax=ax)
# plt.show()

filename = 'predictiveValues.h5'
hdf5file = h5py.File(filename, 'r')
timeMat = hdf5file['data/predictiveX'][()]  # Dataset.value was removed in h5py 3.0; [()] reads the full array
Data = hdf5file['data/predictiveY'][()]

fig = MakeFigure(450, 1)
ax = plt.gca()
#ax.set_title('Harmonic Predictive Model', fontsize = 30)
ax.set_xlabel('Longitude', fontsize = 30)
ax.set_ylabel('Latitude', fontsize = 30)
hist = ax.hist2d(timeMat, Data, normed=True, bins = (1000,1000), cmax= 0.15, cmap = plt.cm.viridis)
#plt.colorbar(hist[3], ax=ax)
plt.show()
output_shape_path = "./output_shape/{}".format(cat_id)
output_color_path = "./output_color/{}".format(cat_id)

ids = glob.glob(os.path.join(output_shape_path, "*"))
ids = [os.path.basename(i) for i in ids]

#
#
#
avg_psnr_rgb = 0.0
avg_psnr_ycc = 0.0
count = 0

for id_ in ids:
    # Load ground truth volume
    gt_path = os.path.join(gt_output_path, id_, "models/model_normalized_{}.h5".format(vol_dim))
    f_gt = h5py.File(gt_path, 'r')
    data_gt = f_gt['data'][:]
    indices_gt = np.where((data_gt[:, :, :, 0] > -0.5) == 1)

    # Load views
    views_path = os.path.join(gt_output_path, id_, "views/*.png")
    views_paths = glob.glob(views_path)
    views_paths.sort()

    # prediction
    pred_color_path_id = os.path.join(output_color_path, id_)
    pred_colors_paths = glob.glob(os.path.join(pred_color_path_id, "*.h5"))
    pred_colors_paths.sort()

    pred_shape_path_id = os.path.join(output_shape_path, id_)
    pred_shapes_paths = glob.glob(os.path.join(pred_shape_path_id, "*.h5"))
def main(): # Parse command-line arguments. args = parser.parse_args() # Validate arguments if not args.paths: log.error("No ROOT files were specified.") return if args.max_processes > 20: log.error( "The requested number of processes ({}) is excessive. Exiting.". format(args.max_processes)) return if args.stop is not None: args.stop = int(args.stop) pass if not args.outdir.endswith('/'): args.outdir += '/' pass if args.shuffle: raise NotImplemented() args.paths = sorted(args.paths) for path in args.paths: # Base candidate selection selection = None # "(p_truth_eta > -1.5 && p_truth_eta < 1.5)" # Read numpy array from file. f = ROOT.TFile(path, 'READ') tree = f.Get('tree') # Split indices into batches N = min(1000000, tree.GetEntries()) # @TEMP index_edges = map( int, np.linspace(0, N, args.max_processes + 1, endpoint=True)) index_ranges = zip(index_edges[:-1], index_edges[1:]) # Start conversion process(es) pool = multiprocessing.Pool(processes=args.max_processes) results = pool.map(converter, [(path, start, stop, selection) for (start, stop) in index_ranges]) # Concatenate data data = np.concatenate(results) print data.shape # Save as gzipped HDF5 mkdir(args.outdir) filename = 'cells_{}.h5'.format(args.tag) log.debug(" Saving to {}".format(args.outdir + filename)) with h5py.File(args.outdir + filename, 'w') as hf: hf.create_dataset('egamma', data=data, chunks=(min(1024, data.shape[0]), ), compression='gzip') pass call(['gzip', '-f', args.outdir + filename]) pass return
def load_stdata(fname):
    f = h5py.File(fname, 'r')
    data = f['data'][()]        # Dataset.value was removed in h5py 3.0; [()] reads the full array
    timestamps = f['date'][()]
    f.close()
    return data, timestamps
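# Hypothetical usage of load_stdata; the file name is a placeholder for any HDF5
# file that contains 'data' and 'date' datasets as the function expects.
data, timestamps = load_stdata('stdata.h5')
print(data.shape, timestamps[:3])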
ext = os.path.splitext(demFile)[1]

if ext == '.hgt':
    amp, dem, demRsc = readfile.read_float32(demFile)
elif ext == '.dem':
    dem, demRsc = readfile.read_dem(demFile)

try:
    outName = sys.argv[2]
except:
    outName = 'dem.h5'

h5 = h5py.File(outName, 'w')
group = h5.create_group('dem')
dset = group.create_dataset('dem', data=dem, compression='gzip')
for key, value in demRsc.items():
    group.attrs[key] = value

group.attrs['ref_y'] = 0
group.attrs['ref_x'] = 0
h5.close()
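# A small hedged sketch of reading the file written above back in; the 'dem'
# group/dataset names and the ref_y/ref_x attributes match what this script
# just created, and 'dem.h5' is the default output name used above.
import h5py
with h5py.File('dem.h5', 'r') as h5:
    dem = h5['dem/dem'][:]
    atr = dict(h5['dem'].attrs)
print(dem.shape, atr['ref_y'], atr['ref_x'])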
def get_group_mass_fractions(file, group):
    # Read HDF5 file
    h5file = h5py.File(file, 'r')

    return h5file['/' + group + '/Mass Fractions']
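# Hypothetical usage, assuming 'Mass Fractions' is a dataset: the returned object
# is an h5py Dataset backed by a file handle that stays open, so slice it to pull
# the values into memory and then close the file. File and group names here are
# placeholders.
fractions = get_group_mass_fractions('checkpoint.h5', 'Step_10')
values = fractions[:]      # copies the data into a NumPy array
fractions.file.close()     # release the underlying HDF5 file handle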
def slice_clusters(params, result, to_remove=[], to_merge=[], extension='', light=False): import h5py, shutil file_out_suff = params.get('data', 'file_out_suff') data_file = params.data_file N_e = params.getint('data', 'N_e') N_total = params.nb_channels N_t = params.getint('detection', 'N_t') template_shift = params.getint('detection', 'template_shift') if comm.rank == 0: print_and_log(['Node 0 is slicing clusters'], 'debug', logger) if to_merge != []: for count in xrange(len(to_merge)): remove = to_merge[count][1] to_remove += [remove] all_elements = [[] for i in xrange(N_e)] for target in numpy.unique(to_remove): elec = result['electrodes'][target] nic = target - numpy.where(result['electrodes'] == elec)[0][0] mask = result['clusters_' + str(elec)] > -1 tmp = numpy.unique(result['clusters_' + str(elec)][mask]) all_elements[elec] += list( numpy.where(result['clusters_' + str(elec)] == tmp[nic])[0]) for elec in xrange(N_e): if not light: result['data_' + str(elec)] = numpy.delete(result['data_' + str(elec)], all_elements[elec], axis=0) result['clusters_' + str(elec)] = numpy.delete( result['clusters_' + str(elec)], all_elements[elec]) result['times_' + str(elec)] = numpy.delete( result['times_' + str(elec)], all_elements[elec]) result['peaks_' + str(elec)] = numpy.delete( result['peaks_' + str(elec)], all_elements[elec]) else: result['clusters_' + str(elec)] = numpy.delete( result['clusters_' + str(elec)], all_elements[elec]) myfile = h5py.File(file_out_suff + '.clusters.hdf5', 'r', libver='latest') data = myfile.get('data_' + str(elec))[:] result['data_' + str(elec)] = numpy.delete(data, all_elements[elec], axis=0) data = myfile.get('times_' + str(elec))[:] result['times_' + str(elec)] = numpy.delete( data, all_elements[elec]) data = myfile.get('peaks_' + str(elec))[:] result['peaks_' + str(elec)] = numpy.delete( data, all_elements[elec]) myfile.close() result['electrodes'] = numpy.delete(result['electrodes'], numpy.unique(to_remove)) cfile = h5py.File(file_out_suff + '.clusters-new.hdf5', 'w', libver='latest') to_write = ['data_', 'clusters_', 'times_', 'peaks_'] for ielec in xrange(N_e): write_datasets(cfile, to_write, result, ielec) write_datasets(cfile, ['electrodes'], result) cfile.close() if os.path.exists(file_out_suff + '.clusters%s.hdf5' % extension): os.remove(file_out_suff + '.clusters%s.hdf5' % extension) shutil.move(file_out_suff + '.clusters-new.hdf5', file_out_suff + '.clusters%s.hdf5' % extension) comm.Barrier()
n_a = 1
n_b = 1
n_k = 1

# var_w = 4.0  # need only to define a reasonable integration interval
# var_e = 1.0
# std_w = np.sqrt(var_w)
# std_e = np.sqrt(var_e)

model_name_load = 'NLS_noise'  # start from NLS fit
model_name_save = 'ML_noise'   # Refine with ML fit
dataset_name = 'train_noise'

# In[Load data]
filename = os.path.join('data', 'dataset.h5')
h5_data = h5py.File(filename, 'r')
u = np.array(h5_data[dataset_name]['u'])
y = np.array(h5_data[dataset_name]['y'])
y0 = np.array(h5_data[dataset_name]['y0'])

# Train on a single example
u = u[0:1, ...]
y = y[0:1, ...]

batch_size = u.shape[0]
seq_len = u.shape[1]
n_u = u.shape[2]
n_y = y.shape[2]

# In[To tensors]
u_torch = torch.tensor(u, dtype=torch.float32)
def main(args): dset = h5py.File(args.filename, 'r') if not dset: print("Not a valid dataset: %s" % (args.filename)) return dsetNames = dset.keys() print("File %s contains %d groups:" % (args.filename, len(dset.keys()))) print(" ", "\n ".join(dsetNames)) if not args.in_group: if len(dset.keys()) > 1: print("Input group not specified -- selecting most recent") args.in_group = list(dset.keys())[-1] if not args.out_folder: args.out_folder = re.sub('.h5$', '', args.filename) print("Output folder not specified -- using %s" % args.out_folder) if args.in_group not in dset: print("Could not find group %s" % (args.in_group)) return if not os.path.exists(args.out_folder): os.makedirs(args.out_folder) group = dset.get(args.in_group) print("Reading data from group '%s' in file '%s'" % (args.in_group, args.filename)) # mrdImg data is stored as: # /group/config text of recon config parameters (optional) # /group/xml text of ISMRMRD flexible data header (optional) # /group/image_0/data array of IsmrmrdImage data # /group/image_0/header array of ImageHeader # /group/image_0/attributes text of mrdImg MetaAttributes isImage = True imageNames = group.keys() print("Found %d mrdImg sub-groups: %s" % (len(imageNames), ", ".join(imageNames))) for imageName in imageNames: if ((imageName == 'xml') or (imageName == 'config') or (imageName == 'config_file')): continue mrdImg = group[imageName] if not (('data' in mrdImg) and ('header' in mrdImg) and ('attributes' in mrdImg)): isImage = False dset.close() if (isImage is False): print("File does not contain properly formatted MRD raw or mrdImg data") return dset = ismrmrd.Dataset(args.filename, args.in_group, False) groups = dset.list() if ('xml' in groups): xml_header = dset.read_xml_header() xml_header = xml_header.decode("utf-8") mrdHead = ismrmrd.xsd.CreateFromDocument(xml_header) for group in groups: if ( (group == 'config') or (group == 'config_file') or (group == 'xml') ): continue print("Reading images from '/" + args.in_group + "/" + group + "'") for imgNum in range(0, dset.number_of_images(group)): mrdImg = dset.read_image(group, imgNum) meta = ismrmrd.Meta.deserialize(mrdImg.attribute_string) if ((mrdImg.data.shape[0] == 3) and (mrdImg.getHead().image_type == 6)): # RGB images print("RGB data not yet supported") continue else: if (mrdImg.data.shape[1] != 1): print("Multi-slice data not yet supported") continue if (mrdImg.data.shape[0] != 1): print("Multi-channel data not yet supported") continue # Use previously JSON serialized header as a starting point, if available if meta.get('DicomJson') is not None: dicomDset = pydicom.dataset.Dataset.from_json(base64.b64decode(meta['DicomJson'])) else: dicomDset = pydicom.dataset.Dataset() # Enforce explicit little endian for written DICOM files dicomDset.file_meta = pydicom.dataset.FileMetaDataset() dicomDset.file_meta.TransferSyntaxUID = pydicom.uid.ExplicitVRLittleEndian dicomDset.file_meta.MediaStorageSOPClassUID = pynetdicom.sop_class.MRImageStorage dicomDset.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() pydicom.dataset.validate_file_meta(dicomDset.file_meta) # FileMetaInformationGroupLength is still missing? 
dicomDset.is_little_endian = True dicomDset.is_implicit_VR = False # ----- Update DICOM header from MRD header ----- try: if mrdHead.measurementInformation is None: pass # print(" MRD header does not contain measurementInformation section") else: # print("---------- Old -------------------------") # print("SeriesInstanceUID : %s" % dicomDset.SeriesInstanceUID ) # print("PatientPosition : %s" % dicomDset.PatientPosition ) # print("SeriesDescription : %s" % dicomDset.SeriesDescription ) # print("FrameOfReferenceUID: %s" % dicomDset.FrameOfReferenceUID ) if mrdHead.measurementInformation.measurementID is not None: dicomDset.SeriesInstanceUID = mrdHead.measurementInformation.measurementID if mrdHead.measurementInformation.patientPosition is not None: dicomDset.PatientPosition = mrdHead.measurementInformation.patientPosition.name if mrdHead.measurementInformation.protocolName is not None: dicomDset.SeriesDescription = mrdHead.measurementInformation.protocolName if mrdHead.measurementInformation.frameOfReferenceUID is not None: dicomDset.FrameOfReferenceUID = mrdHead.measurementInformation.frameOfReferenceUID # print("---------- New -------------------------") # print("SeriesInstanceUID : %s" % dicomDset.SeriesInstanceUID ) # print("PatientPosition : %s" % dicomDset.PatientPosition ) # print("SeriesDescription : %s" % dicomDset.SeriesDescription ) # print("FrameOfReferenceUID: %s" % dicomDset.FrameOfReferenceUID ) except: print("Error setting header information from MRD header's measurementInformation section") try: # print("---------- Old -------------------------") # print("mrdHead.acquisitionSystemInformation.systemVendor : %s" % mrdHead.acquisitionSystemInformation.systemVendor ) # print("mrdHead.acquisitionSystemInformation.systemModel : %s" % mrdHead.acquisitionSystemInformation.systemModel ) # print("mrdHead.acquisitionSystemInformation.systemFieldStrength_T: %s" % mrdHead.acquisitionSystemInformation.systemFieldStrength_T ) # print("mrdHead.acquisitionSystemInformation.institutionName : %s" % mrdHead.acquisitionSystemInformation.institutionName ) # print("mrdHead.acquisitionSystemInformation.stationName : %s" % mrdHead.acquisitionSystemInformation.stationName ) if mrdHead.acquisitionSystemInformation.systemVendor is not None: dicomDset.Manufacturer = mrdHead.acquisitionSystemInformation.systemVendor if mrdHead.acquisitionSystemInformation.systemModel is not None: dicomDset.ManufacturerModelName = mrdHead.acquisitionSystemInformation.systemModel if mrdHead.acquisitionSystemInformation.systemFieldStrength_T is not None: dicomDset.MagneticFieldStrength = mrdHead.acquisitionSystemInformation.systemFieldStrength_T if mrdHead.acquisitionSystemInformation.institutionName is not None: dicomDset.InstitutionName = mrdHead.acquisitionSystemInformation.institutionName if mrdHead.acquisitionSystemInformation.stationName is not None: dicomDset.StationName = mrdHead.acquisitionSystemInformation.stationName # print("---------- New -------------------------") # print("mrdHead.acquisitionSystemInformation.systemVendor : %s" % mrdHead.acquisitionSystemInformation.systemVendor ) # print("mrdHead.acquisitionSystemInformation.systemModel : %s" % mrdHead.acquisitionSystemInformation.systemModel ) # print("mrdHead.acquisitionSystemInformation.systemFieldStrength_T: %s" % mrdHead.acquisitionSystemInformation.systemFieldStrength_T ) # print("mrdHead.acquisitionSystemInformation.institutionName : %s" % mrdHead.acquisitionSystemInformation.institutionName ) # print("mrdHead.acquisitionSystemInformation.stationName 
: %s" % mrdHead.acquisitionSystemInformation.stationName ) except: print("Error setting header information from MRD header's acquisitionSystemInformation section") # Set mrdImg pixel data from MRD mrdImg dicomDset.PixelData = np.squeeze(mrdImg.data).tobytes() # mrdImg.data is [cha z y x] -- squeeze to [y x] for [row col] dicomDset.Rows = mrdImg.data.shape[2] dicomDset.Columns = mrdImg.data.shape[3] if (mrdImg.data.dtype == 'uint16') or (mrdImg.data.dtype == 'int16'): dicomDset.BitsAllocated = 16 dicomDset.BitsStored = 16 dicomDset.HighBit = 15 elif (mrdImg.data.dtype == 'uint32') or (mrdImg.data.dtype == 'int') or (mrdImg.data.dtype == 'float32'): dicomDset.BitsAllocated = 32 dicomDset.BitsStored = 32 dicomDset.HighBit = 31 elif (mrdImg.data.dtype == 'float64'): dicomDset.BitsAllocated = 64 dicomDset.BitsStored = 64 dicomDset.HighBit = 63 else: print("Unsupported data type: ", mrdImg.data.dtype) dicomDset.SeriesNumber = mrdImg.image_series_index dicomDset.InstanceNumber = mrdImg.image_index # ----- Set some mandatory default values ----- if not 'SamplesPerPixel' in dicomDset: dicomDset.SamplesPerPixel = 1 if not 'PhotometricInterpretation' in dicomDset: dicomDset.PhotometricInterpretation = 'MONOCHROME2' if not 'PixelRepresentation' in dicomDset: dicomDset.PixelRepresentation = 0 # Unsigned integer if not 'ImageType' in dicomDset: dicomDset.ImageType = ['ORIGINAL', 'PRIMARY', 'M'] if not 'SeriesNumber' in dicomDset: dicomDset.SeriesNumber = 1 if not 'SeriesDescription' in dicomDset: dicomDset.SeriesDescription = '' if not 'InstanceNumber' in dicomDset: dicomDset.InstanceNumber = 1 # ----- Update DICOM header from MRD ImageHeader ----- dicomDset.ImageType[2] = imtype_map[mrdImg.image_type] dicomDset.PixelSpacing = [float(mrdImg.field_of_view[0]) / mrdImg.data.shape[2], float(mrdImg.field_of_view[1]) / mrdImg.data.shape[3]] dicomDset.SliceThickness = mrdImg.field_of_view[2] dicomDset.ImagePositionPatient = [mrdImg.position[0], mrdImg.position[1], mrdImg.position[2]] dicomDset.ImageOrientationPatient = [mrdImg.read_dir[0], mrdImg.read_dir[1], mrdImg.read_dir[2], mrdImg.phase_dir[0], mrdImg.phase_dir[1], mrdImg.phase_dir[2]] time_sec = mrdImg.acquisition_time_stamp/1000/2.5 hour = int(np.floor(time_sec/3600)) min = int(np.floor((time_sec - hour*3600)/60)) sec = time_sec - hour*3600 - min*60 dicomDset.AcquisitionTime = "%02.0f%02.0f%02.6f" % (hour, min, sec) dicomDset.TriggerTime = mrdImg.physiology_time_stamp[0] / 2.5 # ----- Update DICOM header from MRD Image MetaAttributes ----- if meta.get('SeriesDescription') is not None: dicomDset.SeriesDescription = meta['SeriesDescription'] if meta.get('SeriesDescriptionAdditional') is not None: dicomDset.SeriesDescription = dicomDset.SeriesDescription + meta['SeriesDescriptionAdditional'] if meta.get('ImageComment') is not None: dicomDset.ImageComment = "_".join(meta['ImageComment']) if meta.get('ImageType') is not None: dicomDset.ImageType = meta['ImageType'] if (meta.get('ImageRowDir') is not None) and (meta.get('ImageColumnDir') is not None): dicomDset.ImageOrientationPatient = [float(meta['ImageRowDir'][0]), float(meta['ImageRowDir'][1]), float(meta['ImageRowDir'][2]), float(meta['ImageColumnDir'][0]), float(meta['ImageColumnDir'][1]), float(meta['ImageColumnDir'][2])] if meta.get('RescaleIntercept') is not None: dicomDset.RescaleIntercept = meta['RescaleIntercept'] if meta.get('RescaleSlope') is not None: dicomDset.RescaleSlope = meta['RescaleSlope'] if meta.get('WindowCenter') is not None: dicomDset.WindowCenter = meta['WindowCenter'] if 
meta.get('WindowWidth') is not None: dicomDset.WindowWidth = meta['WindowWidth'] if meta.get('EchoTime') is not None: dicomDset.EchoTime = meta['EchoTime'] if meta.get('InversionTime') is not None: dicomDset.InversionTime = meta['InversionTime'] # Unhandled fields: # LUTFileName # ROI # Write DICOM files fileName = "%02.0f_%s_%03.0f.dcm" % (dicomDset.SeriesNumber, dicomDset.SeriesDescription, dicomDset.InstanceNumber) print(" Writing file %s" % fileName) dicomDset.save_as(os.path.join(args.out_folder, fileName)) return
def HDF5_ATL06_tide_write(IS2_atl06_tide, IS2_atl06_attrs, INPUT=None, FILENAME='', FILL_VALUE=None, DIMENSIONS=None, CLOBBER=False): #-- setting HDF5 clobber attribute if CLOBBER: clobber = 'w' else: clobber = 'w-' #-- open output HDF5 file fileID = h5py.File(os.path.expanduser(FILENAME), clobber) #-- create HDF5 records h5 = {} #-- number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) #-- and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC) h5['ancillary_data'] = {} for k,v in IS2_atl06_tide['ancillary_data'].items(): #-- Defining the HDF5 dataset variables val = 'ancillary_data/{0}'.format(k) h5['ancillary_data'][k] = fileID.create_dataset(val, np.shape(v), data=v, dtype=v.dtype, compression='gzip') #-- add HDF5 variable attributes for att_name,att_val in IS2_atl06_attrs['ancillary_data'][k].items(): h5['ancillary_data'][k].attrs[att_name] = att_val #-- write each output beam beams = [k for k in IS2_atl06_tide.keys() if bool(re.match(r'gt\d[lr]',k))] for gtx in beams: fileID.create_group(gtx) #-- add HDF5 group attributes for beam for att_name in ['Description','atlas_pce','atlas_beam_type', 'groundtrack_id','atmosphere_profile','atlas_spot_number', 'sc_orientation']: fileID[gtx].attrs[att_name] = IS2_atl06_attrs[gtx][att_name] #-- create land_ice_segments group fileID[gtx].create_group('land_ice_segments') h5[gtx] = dict(land_ice_segments={}) for att_name in ['Description','data_rate']: att_val = IS2_atl06_attrs[gtx]['land_ice_segments'][att_name] fileID[gtx]['land_ice_segments'].attrs[att_name] = att_val #-- delta_time, geolocation and segment_id variables for k in ['delta_time','latitude','longitude','segment_id']: #-- values and attributes v = IS2_atl06_tide[gtx]['land_ice_segments'][k] attrs = IS2_atl06_attrs[gtx]['land_ice_segments'][k] fillvalue = FILL_VALUE[gtx]['land_ice_segments'][k] #-- Defining the HDF5 dataset variables val = '{0}/{1}/{2}'.format(gtx,'land_ice_segments',k) if fillvalue: h5[gtx]['land_ice_segments'][k] = fileID.create_dataset(val, np.shape(v), data=v, dtype=v.dtype, fillvalue=fillvalue, compression='gzip') else: h5[gtx]['land_ice_segments'][k] = fileID.create_dataset(val, np.shape(v), data=v, dtype=v.dtype, compression='gzip') #-- create or attach dimensions for HDF5 variable if DIMENSIONS[gtx]['land_ice_segments'][k]: #-- attach dimensions for i,dim in enumerate(DIMENSIONS[gtx]['land_ice_segments'][k]): h5[gtx]['land_ice_segments'][k].dims[i].attach_scale( h5[gtx]['land_ice_segments'][dim]) else: #-- make dimension h5[gtx]['land_ice_segments'][k].make_scale(k) #-- add HDF5 variable attributes for att_name,att_val in attrs.items(): h5[gtx]['land_ice_segments'][k].attrs[att_name] = att_val #-- add to geophysical corrections key = 'geophysical' fileID[gtx]['land_ice_segments'].create_group(key) h5[gtx]['land_ice_segments'][key] = {} for att_name in ['Description','data_rate']: att_val=IS2_atl06_attrs[gtx]['land_ice_segments'][key][att_name] fileID[gtx]['land_ice_segments'][key].attrs[att_name] = att_val for k,v in IS2_atl06_tide[gtx]['land_ice_segments'][key].items(): #-- attributes attrs = IS2_atl06_attrs[gtx]['land_ice_segments'][key][k] fillvalue = FILL_VALUE[gtx]['land_ice_segments'][key][k] #-- Defining the HDF5 dataset variables val = '{0}/{1}/{2}/{3}'.format(gtx,'land_ice_segments',key,k) if fillvalue: h5[gtx]['land_ice_segments'][key][k] = \ fileID.create_dataset(val, np.shape(v), data=v, dtype=v.dtype, fillvalue=fillvalue, compression='gzip') else: h5[gtx]['land_ice_segments'][key][k] = \ fileID.create_dataset(val, 
np.shape(v), data=v, dtype=v.dtype, compression='gzip') #-- attach dimensions for i,dim in enumerate(DIMENSIONS[gtx]['land_ice_segments'][key][k]): h5[gtx]['land_ice_segments'][key][k].dims[i].attach_scale( h5[gtx]['land_ice_segments'][dim]) #-- add HDF5 variable attributes for att_name,att_val in attrs.items(): h5[gtx]['land_ice_segments'][key][k].attrs[att_name] = att_val #-- HDF5 file title fileID.attrs['featureType'] = 'trajectory' fileID.attrs['title'] = 'ATLAS/ICESat-2 L3A Land Ice Height' fileID.attrs['summary'] = ('Estimates of the ice-sheet tidal parameters ' 'needed to interpret and assess the quality of land height estimates.') fileID.attrs['description'] = ('Land ice parameters for each beam. All ' 'parameters are calculated for the same along-track increments for ' 'each beam and repeat.') date_created = datetime.datetime.today() fileID.attrs['date_created'] = date_created.isoformat() project = 'ICESat-2 > Ice, Cloud, and land Elevation Satellite-2' fileID.attrs['project'] = project platform = 'ICESat-2 > Ice, Cloud, and land Elevation Satellite-2' fileID.attrs['project'] = platform #-- add attribute for elevation instrument and designated processing level instrument = 'ATLAS > Advanced Topographic Laser Altimeter System' fileID.attrs['instrument'] = instrument fileID.attrs['source'] = 'Spacecraft' fileID.attrs['references'] = 'https://nsidc.org/data/icesat-2' fileID.attrs['processing_level'] = '4' #-- add attributes for input ATL06 file fileID.attrs['input_files'] = os.path.basename(INPUT) #-- find geospatial and temporal ranges lnmn,lnmx,ltmn,ltmx,tmn,tmx = (np.inf,-np.inf,np.inf,-np.inf,np.inf,-np.inf) for gtx in beams: lon = IS2_atl06_tide[gtx]['land_ice_segments']['longitude'] lat = IS2_atl06_tide[gtx]['land_ice_segments']['latitude'] delta_time = IS2_atl06_tide[gtx]['land_ice_segments']['delta_time'] #-- setting the geospatial and temporal ranges lnmn = lon.min() if (lon.min() < lnmn) else lnmn lnmx = lon.max() if (lon.max() > lnmx) else lnmx ltmn = lat.min() if (lat.min() < ltmn) else ltmn ltmx = lat.max() if (lat.max() > ltmx) else ltmx tmn = delta_time.min() if (delta_time.min() < tmn) else tmn tmx = delta_time.max() if (delta_time.max() > tmx) else tmx #-- add geospatial and temporal attributes fileID.attrs['geospatial_lat_min'] = ltmn fileID.attrs['geospatial_lat_max'] = ltmx fileID.attrs['geospatial_lon_min'] = lnmn fileID.attrs['geospatial_lon_max'] = lnmx fileID.attrs['geospatial_lat_units'] = "degrees_north" fileID.attrs['geospatial_lon_units'] = "degrees_east" fileID.attrs['geospatial_ellipsoid'] = "WGS84" fileID.attrs['date_type'] = 'UTC' fileID.attrs['time_type'] = 'CCSDS UTC-A' #-- convert start and end time from ATLAS SDP seconds into GPS seconds atlas_sdp_gps_epoch=IS2_atl06_tide['ancillary_data']['atlas_sdp_gps_epoch'] gps_seconds = atlas_sdp_gps_epoch + np.array([tmn,tmx]) #-- calculate leap seconds leaps = pyTMD.time.count_leap_seconds(gps_seconds) #-- convert from seconds since 1980-01-06T00:00:00 to Julian days time_julian = 2400000.5 + pyTMD.time.convert_delta_time(gps_seconds - leaps, epoch1=(1980,1,6,0,0,0), epoch2=(1858,11,17,0,0,0), scale=1.0/86400.0) #-- convert to calendar date YY,MM,DD,HH,MN,SS = pyTMD.time.convert_julian(time_julian,FORMAT='tuple') #-- add attributes with measurement date start, end and duration tcs = datetime.datetime(int(YY[0]), int(MM[0]), int(DD[0]), int(HH[0]), int(MN[0]), int(SS[0]), int(1e6*(SS[0] % 1))) fileID.attrs['time_coverage_start'] = tcs.isoformat() tce = datetime.datetime(int(YY[1]), int(MM[1]), int(DD[1]), 
int(HH[1]), int(MN[1]), int(SS[1]), int(1e6*(SS[1] % 1))) fileID.attrs['time_coverage_end'] = tce.isoformat() fileID.attrs['time_coverage_duration'] = '{0:0.0f}'.format(tmx-tmn) #-- Closing the HDF5 file fileID.close()
def batchGen(filenames, batch_size=16, maxlen=None, classification=True): """ Generator function for batches of documents and labels from a list of HDF5 files Args: filenames: list of HDF5 filenames batch_size: size of each batch to yield from generator maxlen: maximum length of each example document Yields: padded_docs, padded_labels: A tuple of padded documents and corresponding labels. """ while True: for fname in filenames: with h5py.File(fname, 'r') as hf: # Get a list of all examples in the file groups = [item[1] for item in hf.items()] # Get lists of all sentences and all labels docs = [grp['sents'][()] for grp in groups] docs = [docs.tolist() for docs in docs] labels = np.array([grp['labels'][()] for grp in groups]) # Only get examples longer than 0 and less than maxlen if maxlen: docs = [x for x in docs if len(x) < maxlen and len(x) > 0] labels = [ x for x in labels if len(x) < maxlen and len(x) > 0 ] # Only get examples longer than 0 else: docs = [x for x in docs if len(x) > 0] labels = [x for x in labels if len(x) > 0] # Shuffle documents and labels docs, labels = shuffle(docs, labels) n = len(docs) assert n == len( labels) # Ensure docs and labels are same length num_batches = np.floor(n / batch_size).astype(np.int16) for idx in range(num_batches): # Get each batch of documents and labels batch_docs = docs[idx * batch_size:(idx + 1) * batch_size] batch_labels = labels[idx * batch_size:(idx + 1) * batch_size] # Pad docs and labels to the length of the longest sample in the batch padded_docs = pad_sequences(batch_docs, dtype=object, value=' ', maxlen=maxlen, padding='pre', truncating='post') # if classification: padded_labels = pad_sequences(batch_labels, dtype=int, value=2, maxlen=maxlen, padding='pre', truncating='post') padded_labels = to_categorical(padded_labels, num_classes=3, dtype='int32') else: padded_labels = pad_sequences(batch_labels, dtype=int, value=0, maxlen=maxlen, padding='pre', truncating='post') padded_labels = np.expand_dims(padded_labels, axis=-1) yield (padded_docs, padded_labels)
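# Hypothetical usage of batchGen: stream padded batches from a list of HDF5
# shards into a Keras model. The shard names, the `model` object, and the step
# count are placeholders; since the generator loops forever, steps_per_epoch
# must be supplied explicitly.
train_files = ['train_00.h5', 'train_01.h5']
train_gen = batchGen(train_files, batch_size=16, maxlen=200)
steps = 100  # roughly (number of training examples) / batch_size
model.fit(train_gen, steps_per_epoch=steps, epochs=5)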
def genMPO_SiSjRpt(nsite,ig,jg,fname,xts,ifQt,debug=False): if debug: print '\n[mpo_dmrg_propsMPORpt.genMPO_SiSjRpt] fname=',fname t0 = time.time() fop = h5py.File(fname,'w') npt = len(xts) nop = 3*npt fop['nop'] = nop for isite in range(nsite): ti = time.time() gname = 'site'+str(isite) grp = fop.create_group(gname) # 0.5*(Si+*Sj-+Si-*Sj) + Szi*Szj if not ifQt: # (a) 0.5*Si+*Sj- if isite == 0: cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite,isite,ig,jg,'Sp','Sm',0.5) else: cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite,isite,ig,jg,'Sp','Sm',1.0) for ipt in range(npt): rop = mpo_dmrg_opers.genExpISyPhi(xts[ipt]) wop = mpo_dmrg_opers.prodTwoOpers(cop,rop) grp['op'+str(0*npt+ipt)] = wop # (b) 0.5*Si-*Sj+ if isite == 0: cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite,isite,ig,jg,'Sm','Sp',0.5) else: cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite,isite,ig,jg,'Sm','Sp',1.0) for ipt in range(npt): rop = mpo_dmrg_opers.genExpISyPhi(xts[ipt]) wop = mpo_dmrg_opers.prodTwoOpers(cop,rop) grp['op'+str(1*npt+ipt)] = wop # (c) Szi*Szj cop = mpo_dmrg_spinopers.genLocal2Spatial(nsite,isite,ig,jg,'Sz','Sz',1.0) for ipt in range(npt): rop = mpo_dmrg_opers.genExpISyPhi(xts[ipt]) wop = mpo_dmrg_opers.prodTwoOpers(cop,rop) grp['op'+str(2*npt+ipt)] = wop else: # Sip*Sjm for ipt in range(npt): if isite == 0: cop = qtensor_spinopers.genLocal2RSpatialQt(nsite,isite,ig,jg,'Sp','Sm',0.5,xts[ipt]) else: cop = qtensor_spinopers.genLocal2RSpatialQt(nsite,isite,ig,jg,'Sp','Sm',1.0,xts[ipt]) cop.dump(grp,'op'+str(0*npt+ipt)) # Sim*Sjp for ipt in range(npt): if isite == 0: cop = qtensor_spinopers.genLocal2RSpatialQt(nsite,isite,ig,jg,'Sm','Sp',0.5,xts[ipt]) else: cop = qtensor_spinopers.genLocal2RSpatialQt(nsite,isite,ig,jg,'Sm','Sp',1.0,xts[ipt]) cop.dump(grp,'op'+str(1*npt+ipt)) # Siz*Sjz for ipt in range(npt): cop = qtensor_spinopers.genLocal2RSpatialQt(nsite,isite,ig,jg,'Sz','Sz',1.0,xts[ipt]) cop.dump(grp,'op'+str(2*npt+ipt)) tf = time.time() if debug: print ' isite =',isite,' time = %.2f s'%(tf-ti) t1 = time.time() if debug: print ' time for genMPO_SiSjRpt = %.2f s'%(t1-t0) return fop
import os, h5py
from time import time
import numpy as np
from scipy import sparse
from sklearn.utils import extmath

"""Import data:
"""
def importData(sub):
    f = h5py.File(('/scr/litauen1/%s.hcp.lh.mat' % sub), 'r')
    data = np.array(f.get('connData'))
    cortex = np.array(f.get('cortex')) - 1
    return data, cortex

data, cortex = importData('')

print("Computing the principal singular vectors using randomized_svd")
t0 = time()
U, s, V = extmath.randomized_svd(data, 5, n_iter=3)
print("done in %0.3fs" % (time() - t0))

def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank and the implementation
    is based on the one from the NetworkX project (BSD licensed too)
    with copyrights by:

      Aric Hagberg <*****@*****.**>
# Calculates the average of the orientation for atom i with other atom at distance j,
# over all particles.

# Load libraries
import h5py
import matplotlib as mpl
mpl.use('agg')
import matplotlib.pyplot as plt
import numpy as np
import math as ma

figx = 4.
figy = 4.

# k2
in_file = h5py.File("k2/averaged_data.h5", "r")
mean_1_sep_0_k2 = in_file["mean_1_sep_0"][:,:]
mean_2_sep_0_k2 = in_file["mean_2_sep_0"][:,:]
mean_1_sep_1_k2 = in_file["mean_1_sep_1"][:,:]
mean_2_sep_1_k2 = in_file["mean_2_sep_1"][:,:]
mean_1_sep_3_k2 = in_file["mean_1_sep_3"][:,:]
mean_2_sep_3_k2 = in_file["mean_2_sep_3"][:,:]
mean_1_sep_6_k2 = in_file["mean_1_sep_6"][:,:]
mean_2_sep_6_k2 = in_file["mean_2_sep_6"][:,:]

# k3
in_file = h5py.File("k3/averaged_data.h5", "r")
mean_1_sep_0_k3 = in_file["mean_1_sep_0"][:,:]
mean_2_sep_0_k3 = in_file["mean_2_sep_0"][:,:]
mean_1_sep_1_k3 = in_file["mean_1_sep_1"][:,:]
mean_2_sep_1_k3 = in_file["mean_2_sep_1"][:,:]
mean_1_sep_3_k3 = in_file["mean_1_sep_3"][:,:]
order = OutOfOrder(botnet)
target = TargetInfo(order)
target_left = BorderTargetInfo(order, direction='left')
target_right = BorderTargetInfo(order, direction='right')
move_side = MoveSidewards(botnet, target_left, target_right)

if __name__ == '__main__':
    plot_data_file = '../plot/plot_data/out_of_order_robot1.h5'

    import os
    import h5py

    if os.path.isfile(plot_data_file):
        # data file exists, so the simulation ran before;
        # no need to run it again, just load the data for plotting
        with h5py.File(plot_data_file, 'r') as hf:
            print('List of arrays in this file: \n', hf.keys())
            trange = np.array(hf.get('trange'))
            p_x = np.array(hf.get('p_x'))
            p_forget = np.array(hf.get('p_forget'))
            p_diff = np.array(hf.get('p_diff'))
            p_evidence = np.array(hf.get('p_evidence'))
            p_neg_min = np.array(hf.get('p_neg_min'))
            p_evidence_left = np.array(hf.get('p_evidence_left'))
            p_odd = np.array(hf.get('p_odd'))
            p_evidence_right = np.array(hf.get('p_evidence_right'))
    else:
        # data file does not exist, so run the simulation
        sim = nengo.Simulator(model)
        sim.run(3)
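        # Hypothetical continuation (not from the original script): cache the
        # probe data back to plot_data_file so the next run takes the fast path
        # above. The probe objects (x_probe, forget_probe, ...) are placeholder
        # names for whatever probes the model actually defines.
        with h5py.File(plot_data_file, 'w') as hf:
            hf.create_dataset('trange', data=sim.trange())
            hf.create_dataset('p_x', data=sim.data[x_probe])
            hf.create_dataset('p_forget', data=sim.data[forget_probe])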
images_name = [tmp.strip() for tmp in f.readlines()]

with open(os.path.join(scene_path, "calibration.txt")) as f:
    calib_name = [tmp.strip() for tmp in f.readlines()]

r_list = list()
t_list = list()
geoms = list()
resized_shapes = list()
org_imsizes = list()
K_list = list()

# Read image infos
for im, calib in zip(images_name, calib_name):
    calib_h5 = h5py.File(os.path.join(scene_path, calib), 'r')
    r_list.append(np.array(calib_h5["R"]))
    t_list.append(np.array(calib_h5["T"]).T)
    geoms.append(calib_h5)
    org_imsizes.append(np.array(calib_h5['imsize'][0]).tolist())
    K_list.append(np.array(calib_h5['K']))
    resized_shapes.append(getResizedSize(minSize, Image.open(os.path.join(scene_path, im)).size, strideNet))

# for i, (idA, idB) in tqdm(enumerate(pairs_ids)):
for i, (idA, idB) in enumerate(pairs_ids):
    if i % 50 == 49:
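# Example (sketch): the loop above keeps every calibration HDF5 file open,
# because geoms stores the live handle. An alternative that copies the needed
# arrays and closes each file immediately; the keys R, T, imsize and K follow
# the reads above, the example path is hypothetical.
import h5py
import numpy as np

def read_calibration(path):
    """Copy the calibration arrays out of one HDF5 file and close it."""
    with h5py.File(path, 'r') as calib_h5:
        return {
            'R': np.array(calib_h5['R']),
            'T': np.array(calib_h5['T']).T,
            'imsize': np.array(calib_h5['imsize'][0]).tolist(),
            'K': np.array(calib_h5['K']),
        }

# geom = read_calibration('scene/0001_calib.h5')  # hypothetical path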
def extractPatch4OneSubject(matFA, matSeg, matMask, fileID, d, step, rate): eps = 5e-2 [row, col, leng] = matFA.shape cubicCnt = 0 estNum = 40000 trainFA = np.zeros([estNum, 1, dFA[0], dFA[1], dFA[2]], dtype=np.float16) trainSeg = np.zeros([estNum, 1, dSeg[0], dSeg[1], dSeg[2]], dtype=np.float16) print('trainFA shape, ', trainFA.shape) # to padding for input margin1 = int((dFA[0] - dSeg[0]) / 2) margin2 = int((dFA[1] - dSeg[1]) / 2) margin3 = int((dFA[2] - dSeg[2]) / 2) two_margin1 = dFA[0] - dSeg[0] two_margin2 = dFA[1] - dSeg[1] two_margin3 = dFA[2] - dSeg[2] cubicCnt = 0 marginD = [margin1, margin2, margin3] print('matFA shape is ', matFA.shape) matFAOut = np.zeros( [row + two_margin1, col + two_margin2, leng + two_margin3], dtype=np.float16) print('matFAOut shape is ', matFAOut.shape) matFAOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matFA matSegOut = np.zeros( [row + two_margin1, col + two_margin2, leng + two_margin3], dtype=np.float16) matSegOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matSeg matMaskOut = np.zeros( [row + two_margin1, col + two_margin2, leng + two_margin3], dtype=np.float16) matMaskOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matMask # for mageFA, enlarge it by padding if margin1 != 0: matFAOut[0:marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matFA[marginD[0] - 1::-1, :, :] # reverse 0:marginD[0] matFAOut[row + marginD[0]:matFAOut.shape[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matFA[ matFA.shape[0] - 1:row - marginD[0] - 1:-1, :, :] # we'd better flip it along the 1st dimension if margin2 != 0: matFAOut[ marginD[0]:row + marginD[0], 0:marginD[1], marginD[2]:leng + marginD[2]] = matFA[:, marginD[1] - 1:: -1, :] # we'd flip it along the 2nd dimension matFAOut[ marginD[0]:row + marginD[0], col + marginD[1]:matFAOut.shape[1], marginD[2]:leng + marginD[2]] = matFA[:, matFA.shape[1] - 1:col - marginD[1] - 1: -1, :] # we'd flip it along the 2nd dimension if margin3 != 0: matFAOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], 0:marginD[2]] = matFA[:, :, marginD[ 2] - 1::-1] # we'd better flip it along the 3rd dimension matFAOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], marginD[2] + leng:matFAOut.shape[2]] = matFA[:, :, matFA.shape[2] - 1:leng - marginD[2] - 1:-1] # for matseg, enlarge it by padding if margin1 != 0: matSegOut[0:marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matSeg[marginD[0] - 1::-1, :, :] # reverse 0:marginD[0] matSegOut[row + marginD[0]:matSegOut.shape[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matSeg[ matSeg.shape[0] - 1:row - marginD[0] - 1: -1, :, :] # we'd better flip it along the 1st dimension if margin2 != 0: matSegOut[ marginD[0]:row + marginD[0], 0:marginD[1], marginD[2]:leng + marginD[2]] = matSeg[:, marginD[1] - 1:: -1, :] # we'd flip it along the 2nd dimension matSegOut[ marginD[0]:row + marginD[0], col + marginD[1]:matSegOut.shape[1], marginD[2]:leng + marginD[2]] = matSeg[:, matSeg.shape[1] - 1:col - marginD[1] - 1: -1, :] # we'd flip it along the 2nd dimension if margin3 != 0: matSegOut[ marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], 0:marginD[2]] = matSeg[:, :, marginD[ 2] - 1::-1] # we'd better flip it along the 3rd dimension matSegOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], marginD[2] + leng:matSegOut.shape[2]] = matSeg[:, :, matSeg.shape[2] - 
1:leng - marginD[2] - 1:-1] # for matseg, enlarge it by padding if margin1 != 0: matMaskOut[0:marginD[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matMask[marginD[0] - 1::-1, :, :] # reverse 0:marginD[0] matMaskOut[row + marginD[0]:matMaskOut.shape[0], marginD[1]:col + marginD[1], marginD[2]:leng + marginD[2]] = matMask[ matMask.shape[0] - 1:row - marginD[0] - 1: -1, :, :] # we'd better flip it along the 1st dimension if margin2 != 0: matMaskOut[marginD[0]:row + marginD[0], 0:marginD[1], marginD[2]:leng + marginD[2]] = matMask[:, marginD[ 1] - 1::-1, :] # we'd flip it along the 2nd dimension matMaskOut[marginD[0]:row + marginD[0], col + marginD[1]:matMaskOut.shape[1], marginD[2]:leng + marginD[2]] = matMask[:, matMask.shape[1] - 1:col - marginD[ 1] - 1:-1, :] # we'd flip it along the 2nd dimension if margin3 != 0: matMaskOut[ marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], 0:marginD[2]] = matMask[:, :, marginD[ 2] - 1::-1] # we'd better flip it along the 3rd dimension matMaskOut[marginD[0]:row + marginD[0], marginD[1]:col + marginD[1], marginD[2] + leng:matMaskOut.shape[2]] = matMask[:, :, matMask.shape[2] - 1:leng - marginD[2] - 1:-1] dsfactor = rate for i in range(1): for j in range(0, col - dSeg[1], step[1]): for k in range(0, leng - dSeg[2], step[2]): volMask = matMaskOut[i:i + dSeg[0], j:j + dSeg[1], k:k + dSeg[2]] if np.sum(volMask) < eps: continue cubicCnt = cubicCnt + 1 # index at scale 1 volSeg = matSeg[i:i + dSeg[0], j:j + dSeg[1], k:k + dSeg[2]] volFA = matFAOut[i:i + dFA[0], j:j + dFA[1], k:k + dFA[2]] trainFA[cubicCnt, 0, :, :, :] = volFA # 32*32*32 trainSeg[cubicCnt, 0, :, :, :] = volSeg # 24*24*24 trainFA = trainFA[0:cubicCnt, :, :, :, :] trainSeg = trainSeg[0:cubicCnt, :, :, :, :] save_folder = '' if opt.split in ["train", "dev", "test"]: save_folder = os.path.join(opt.save_folder, opt.split) else: print("Specify correct split type!") raise FileNotFoundError with h5py.File(save_folder + 'train_%s.h5' % fileID, 'w') as f: f['noisy'] = trainFA f['clear'] = trainSeg with open('./train_list.txt', 'a') as f: f.write(save_folder + 'train_%s.h5' % fileID) return cubicCnt
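# Example (sketch): the padding blocks above mirror matFA, matSeg and matMask
# by hand, reflecting marginD[d] voxels at each face with the edge voxel
# included. Under that reading, the same enlargement can be expressed with
# np.pad in 'symmetric' mode; this is an assumed equivalence for illustration,
# not a verified drop-in replacement for the original code.
import numpy as np

def mirror_pad(vol, margins):
    """Pad a 3-D volume by reflecting margins[d] voxels on each face.

    mode='symmetric' repeats the edge voxel, matching slices like
    vol[margins[0]-1::-1, :, :] used in the manual padding above
    (assumed equivalence).
    """
    pad_width = [(m, m) for m in margins]
    return np.pad(vol, pad_width, mode='symmetric')

# toy check on a small random volume
vol = np.random.rand(8, 9, 10).astype(np.float16)
out = mirror_pad(vol, (2, 3, 1))
print(out.shape)  # (12, 15, 12)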
def create_scool( cool_uri, bins, cell_name_pixels_dict, columns=None, dtypes=None, metadata=None, assembly=None, ordered=False, symmetric_upper=True, mode="w", mergebuf=int(20e6), delete_temp=True, temp_dir=None, max_merge=200, boundscheck=True, dupcheck=True, triucheck=True, ensure_sorted=False, h5opts=None, lock=None, **kwargs): r""" Create a single-cell (scool) file. For each cell store a cooler matrix under **/cells**, where all matrices have the same dimensions. Each cell is a regular cooler data collection, so the input must be a bin table and pixel table for each cell. The pixel tables are provided as a dictionary where the key is a unique cell name. The bin tables can be provided as a dict with the same keys or a single common bin table can be given. .. versionadded:: 0.8.9 Parameters ---------- cool_uri : str Path to scool file or URI string. If the file does not exist, it will be created. bins : :class:`pandas.DataFrame` or Dict[str, DataFrame] A single bin table or dictionary of cell names to bins tables. A bin table is a dataframe with columns ``chrom``, ``start`` and ``end``. May contain additional columns. cell_name_pixels_dict : Dict[str, DataFrame] Cell name as key and pixel table DataFrame as value. A table, given as a dataframe or a column-oriented dict, containing columns labeled ``bin1_id``, ``bin2_id`` and ``count``, sorted by (``bin1_id``, ``bin2_id``). If additional columns are included in the pixel table, their names and dtypes must be specified using the ``columns`` and ``dtypes`` arguments. For larger input data, an **iterable** can be provided that yields the pixel data as a sequence of chunks. If the input is a dask DataFrame, it will also be processed one chunk at a time. {other_parameters} See also -------- cooler.create_cooler cooler.zoomify_cooler {notes} """ file_path, group_path = parse_cooler_uri(cool_uri) h5opts = _set_h5opts(h5opts) if isinstance(bins, pd.DataFrame): bins_dict = {cell_name: bins for cell_name in cell_name_pixels_dict} cell_names = sorted(cell_name_pixels_dict) else: # Assume bins is a dict of cell name -> dataframe bins_dict = bins if len(bins_dict) == 0: raise ValueError("At least one bin must be given.") else: bins = bins_dict[next(iter(bins_dict))][["chrom", "start", "end"]] # Sort bins_dict and cell_name_pixels_dict to guarantee matching keys bins_keys = sorted(bins_dict) cell_names = sorted(cell_name_pixels_dict) for key_bins, key_pixels in zip(bins_keys, cell_names): if key_bins != key_pixels: raise ValueError('Bins and pixel dicts do not have matching keys') dtypes = _get_dtypes_arg(dtypes, kwargs) for col in ["chrom", "start", "end"]: if col not in bins.columns: raise ValueError("Missing column from bin table: '{}'.".format(col)) # Populate dtypes for expected pixel columns, and apply user overrides. 
if dtypes is None: dtypes = dict(PIXEL_DTYPES) else: dtypes_ = dict(dtypes) dtypes = dict(PIXEL_DTYPES) dtypes.update(dtypes_) # Determine the appropriate iterable try: from dask.dataframe import DataFrame as dask_df except (ImportError, AttributeError): # pragma: no cover dask_df = () # Prepare chroms and bins bins = bins.copy() bins["chrom"] = bins["chrom"].astype(object) chromsizes = get_chromsizes(bins) try: chromsizes = six.iteritems(chromsizes) except AttributeError: pass chromnames, lengths = zip(*chromsizes) chroms = pd.DataFrame( {"name": chromnames, "length": lengths}, columns=["name", "length"] ) binsize = get_binsize(bins) n_chroms = len(chroms) n_bins = len(bins) # Create root group with h5py.File(file_path, mode) as f: logger.info('Creating cooler at "{}::{}"'.format(file_path, group_path)) if group_path == "/": for name in ["chroms", "bins"]: if name in f: del f[name] else: try: f.create_group(group_path) except ValueError: del f[group_path] f.create_group(group_path) with h5py.File(file_path, "r+") as f: h5 = f[group_path] logger.info("Writing chroms") grp = h5.create_group("chroms") write_chroms(grp, chroms, h5opts) logger.info("Writing bins") grp = h5.create_group("bins") write_bins(grp, bins, chroms["name"], h5opts) with h5py.File(file_path, "r+") as f: h5 = f[group_path] logger.info("Writing info") info = {} info["bin-type"] = u"fixed" if binsize is not None else u"variable" info["bin-size"] = binsize if binsize is not None else u"null" info["nchroms"] = n_chroms info["ncells"] = len(cell_name_pixels_dict) info["nbins"] = n_bins if assembly is not None: info["genome-assembly"] = assembly if metadata is not None: info["metadata"] = metadata write_info(h5, info, True) # Append single cells for key in cell_names: if '/' in key: cell_name = key.split('/')[-1] else: cell_name = key create( cool_uri + '::/cells/' + cell_name, bins_dict[key], cell_name_pixels_dict[key], columns=columns, dtypes=dtypes, metadata=metadata, assembly=assembly, ordered=ordered, symmetric_upper=symmetric_upper, mode='a', boundscheck=boundscheck, dupcheck=dupcheck, triucheck=triucheck, ensure_sorted=ensure_sorted, h5opts=h5opts, lock=lock, mergebuf=mergebuf, delete_temp=delete_temp, temp_dir=temp_dir, max_merge=max_merge, append_scool=True, scool_root_uri=cool_uri )
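# Example (sketch): a toy call of create_scool with two tiny cells sharing one
# bin table. This assumes the function above is exposed as cooler.create_scool
# (the import path is not shown in this file); the file name, chromosome and
# counts are illustration values only.
import numpy as np
import pandas as pd
import cooler

# One shared bin table: a single toy chromosome cut into 10 x 100 bp bins.
binsize = 100
bins = pd.DataFrame({
    "chrom": ["chr1"] * 10,
    "start": np.arange(10) * binsize,
    "end": (np.arange(10) + 1) * binsize,
})

# Upper-triangular pixel tables, sorted by (bin1_id, bin2_id), for two cells.
def toy_pixels(seed):
    rng = np.random.RandomState(seed)
    b1, b2 = np.triu_indices(10)
    return pd.DataFrame({
        "bin1_id": b1,
        "bin2_id": b2,
        "count": rng.randint(1, 10, size=b1.size),
    })

cell_pixels = {"cell_a": toy_pixels(0), "cell_b": toy_pixels(1)}

cooler.create_scool("toy.scool", bins, cell_pixels)
# Matrices land under toy.scool::/cells/cell_a and toy.scool::/cells/cell_b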
# encode the target labels
targetNames = np.unique(labels)
le = LabelEncoder()
target = le.fit_transform(labels)
print("[STATUS] training labels encoded...")

# normalize the feature vector in the range (0-1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(global_features)
print("[STATUS] feature vector normalized...")

print("[STATUS] target labels: {}".format(target))
print("[STATUS] target labels shape: {}".format(target.shape))

# save the feature vector using HDF5
h5f_data = h5py.File('output/data.h5', 'w')
h5f_data.create_dataset('dataset_1', data=np.array(rescaled_features))

h5f_label = h5py.File('output/labels.h5', 'w')
h5f_label.create_dataset('dataset_1', data=np.array(target))

h5f_data.close()
h5f_label.close()

print("[STATUS] end of training..")

# import the necessary packages
import h5py
import numpy as np
import os
import glob
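# Example (sketch): reloading the saved features and encoded labels before
# training a classifier. The file names and the 'dataset_1' key follow the
# writer above.
import h5py
import numpy as np

with h5py.File('output/data.h5', 'r') as h5f_data, \
     h5py.File('output/labels.h5', 'r') as h5f_label:
    global_features = np.array(h5f_data['dataset_1'])
    global_labels = np.array(h5f_label['dataset_1'])

print("features:", global_features.shape, "labels:", global_labels.shape)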
def append(cool_uri, table, data, chunked=False, force=False, h5opts=None,
           lock=None):  # pragma: no cover
    """
    Append one or more data columns to an existing table.

    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group.
    table : str
        Name of table (HDF5 group).
    data : dict-like
        DataFrame, Series or mapping of column names to data. If the input is
        a dask DataFrame or Series, the data is written in chunks.
    chunked : bool, optional
        If True, the values of the data dict are treated as separate chunk
        iterators of column data.
    force : bool, optional
        If True, replace existing columns with the same name as the input.
    h5opts : dict, optional
        HDF5 dataset filter options to use (compression, shuffling,
        checksumming, etc.). Default is to use autochunking and GZIP
        compression, level 6.
    lock : multiprocessing.Lock, optional
        Optional lock to synchronize concurrent HDF5 file access.

    """
    h5opts = _set_h5opts(h5opts)

    file_path, group_path = parse_cooler_uri(cool_uri)

    try:
        from dask.dataframe import DataFrame as dask_df, Series as dask_series
    except (ImportError, AttributeError):
        dask_df = ()
        dask_series = ()

    if isinstance(data, dask_series):
        data = data.to_frame()

    try:
        names = data.keys()
    except AttributeError:
        names = data.columns

    with h5py.File(file_path, "r+") as f:
        h5 = f[group_path]
        for name in names:
            if name in h5[table]:
                if not force:
                    raise ValueError(
                        "'{}' column already exists. ".format(name)
                        + "Use --force option to overwrite."
                    )
                else:
                    del h5[table][name]

        if isinstance(data, dask_df):
            # iterate over dataframe chunks
            i = 0
            for chunk in data.to_delayed():
                chunk = chunk.compute()
                try:
                    if lock is not None:
                        lock.acquire()
                    put(h5[table], chunk, lo=i, h5opts=h5opts)
                finally:
                    if lock is not None:
                        lock.release()
                i += len(chunk)
        elif chunked:
            # iterate over chunks from each column
            for key in data.keys():
                i = 0
                for chunk in data[key]:
                    try:
                        if lock is not None:
                            lock.acquire()
                        put(h5[table], {key: chunk}, lo=i, h5opts=h5opts)
                    finally:
                        if lock is not None:
                            lock.release()
                    i += len(chunk)
        else:
            # write all the data
            try:
                if lock is not None:
                    lock.acquire()
                put(h5[table], data, lo=0, h5opts=h5opts)
            finally:
                if lock is not None:
                    lock.release()
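# Example (sketch): a hedged usage of the append() defined above, adding a
# per-bin 'weight' column to the bins table of an existing cooler. The file
# 'test.cool' is hypothetical, the weights are random placeholders (not real
# balancing weights), and n_bins is assumed to match the cooler's bin table.
import numpy as np
import pandas as pd

n_bins = 1000  # assumed length of the cooler's bins table
weights = pd.DataFrame({"weight": np.random.rand(n_bins)})

append("test.cool::/", "bins", weights, force=True)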
def prepare_data(argv=None): '''Aggregate sequence data GTDB using a file-of-files''' from io import BytesIO import tempfile import h5py from datetime import datetime from tqdm import tqdm from skbio import TreeNode from skbio.sequence import DNA, Protein from hdmf.common import get_hdf5io from hdmf.data_utils import DataChunkIterator from ..utils import get_faa_path, get_fna_path, get_genomic_path from deep_taxon.sequence.convert import AASeqIterator, DNASeqIterator, DNAVocabIterator, DNAVocabGeneIterator from deep_taxon.sequence.dna_table import AATable, DNATable, SequenceTable, TaxaTable, DeepIndexFile, NewickString, CondensedDistanceMatrix, GenomeTable, TreeGraph parser = argparse.ArgumentParser() parser.add_argument('fadir', type=str, help='directory with NCBI sequence files') parser.add_argument('metadata', type=str, help='metadata file from GTDB') parser.add_argument('out', type=str, help='output HDF5') parser.add_argument( '-T', '--tree', type=str, help='a Newick file with a tree of representative taxa', default=None) parser.add_argument( '-A', '--accessions', type=str, default=None, help='file of the NCBI accessions of the genomes to convert') parser.add_argument( '-d', '--max_deg', type=float, default=None, help='max number of degenerate characters in protein sequences') parser.add_argument('-l', '--min_len', type=float, default=None, help='min length of sequences') parser.add_argument('--iter', action='store_true', default=False, help='convert using iterators') parser.add_argument( '-p', '--num_procs', type=int, default=1, help='the number of processes to use for counting total sequence size') parser.add_argument('-L', '--total_seq_len', type=int, default=None, help='the total sequence length') parser.add_argument('-t', '--tmpdir', type=str, default=None, help='a temporary directory to store sequences') parser.add_argument('-N', '--n_seqs', type=int, default=None, help='the total number of sequences') rep_grp = parser.add_mutually_exclusive_group() rep_grp.add_argument( '-n', '--nonrep', action='store_true', default=False, help='keep non-representative genomes only. keep both by default') rep_grp.add_argument( '-r', '--rep', action='store_true', default=False, help='keep representative genomes only. keep both by default') parser.add_argument( '-a', '--all', action='store_true', default=False, help= 'keep all non-representative genomes. 
By default, only non-reps with the highest and lowest contig count are kept' ) grp = parser.add_mutually_exclusive_group() grp.add_argument('-P', '--protein', action='store_true', default=False, help='get paths for protein files') grp.add_argument('-C', '--cds', action='store_true', default=False, help='get paths for CDS files') grp.add_argument('-G', '--genomic', action='store_true', default=False, help='get paths for genomic files (default)') parser.add_argument('-z', '--gzip', action='store_true', default=False, help='GZip sequence table') dep_grp = parser.add_argument_group( title="Legacy options you probably do not need") dep_grp.add_argument('-e', '--emb', type=str, help='embedding file', default=None) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args(args=argv) if args.total_seq_len is not None: if args.n_seqs is None: sys.stderr.write( "If using --total_seq_len, you must also use --n_seqs\n") if args.n_seqs is not None: if args.total_seq_len is None: sys.stderr.write( "If using --n_seqs, you must also use --total_seq_len\n") if not any([args.protein, args.cds, args.genomic]): args.genomic = True logging.basicConfig(stream=sys.stderr, level=logging.INFO, format='%(asctime)s - %(message)s') logger = logging.getLogger() ############################# # read and filter taxonomies ############################# logger.info('Reading taxonomies from %s' % args.metadata) taxlevels = [ 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species' ] extra_cols = ['contig_count', 'checkm_completeness'] def func(row): dat = dict(zip(taxlevels, row['gtdb_taxonomy'].split(';'))) dat['species'] = dat['species'] # .split(' ')[1] dat['gtdb_genome_representative'] = row['gtdb_genome_representative'][ 3:] dat['accession'] = row['accession'][3:] for k in extra_cols: dat[k] = row[k] return pd.Series(data=dat) taxdf = pd.read_csv(args.metadata, header=0, sep='\t')[['accession', 'gtdb_taxonomy', 'gtdb_genome_representative', 'contig_count', 'checkm_completeness']]\ .apply(func, axis=1) taxdf = taxdf.set_index('accession') dflen = len(taxdf) logger.info('Found %d total genomes' % dflen) taxdf = taxdf[taxdf['gtdb_genome_representative'].str.contains( 'GC[A,F]_', regex=True)] # get rid of genomes that are not at NCBI taxdf = taxdf[taxdf.index.str.contains( 'GC[A,F]_', regex=True)] # get rid of genomes that are not at NCBI logger.info('Discarded %d non-NCBI genomes' % (dflen - len(taxdf))) rep_taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']] if args.accessions is not None: logger.info('reading accessions %s' % args.accessions) with open(args.accessions, 'r') as f: accessions = [l[:-1] for l in f.readlines()] dflen = len(taxdf) taxdf = taxdf[taxdf.index.isin(accessions)] logger.info('Discarded %d genomes not found in %s' % (dflen - len(taxdf), args.accessions)) dflen = len(taxdf) if args.nonrep: taxdf = taxdf[taxdf.index != taxdf['gtdb_genome_representative']] logger.info('Discarded %d representative genomes' % (dflen - len(taxdf))) dflen = len(taxdf) if not args.all: groups = taxdf[['gtdb_genome_representative', 'contig_count' ]].groupby('gtdb_genome_representative') min_ctgs = groups.idxmin()['contig_count'] max_ctgs = groups.idxmax()['contig_count'] accessions = np.unique(np.concatenate([min_ctgs, max_ctgs])) taxdf = taxdf.filter(accessions, axis=0) logger.info('Discarded %d extra non-representative genomes' % (dflen - len(taxdf))) elif args.rep: taxdf = taxdf[taxdf.index == taxdf['gtdb_genome_representative']] logger.info('Discarded %d non-representative 
genomes' % (dflen - len(taxdf))) dflen = len(taxdf) logger.info('%d remaining genomes' % dflen) ############################### # Arguments for constructing the DeepIndexFile object ############################### di_kwargs = dict() taxa_ids = taxdf.index.values # get paths to Fasta Files fa_path_func = partial(get_genomic_path, directory=args.fadir) if args.cds: fa_path_func = partial(get_fna_path, directory=args.fadir) elif args.protein: fa_path_func = partial(get_faa_path, directory=args.fadir) map_func = map if args.num_procs > 1: logger.info(f'using {args.num_procs} processes to locate Fasta files') import multiprocessing as mp map_func = mp.Pool(processes=args.num_procs).imap logger.info('Locating Fasta files for each taxa') fapaths = list(tqdm(map_func(fa_path_func, taxa_ids), total=len(taxa_ids))) logger.info('Found Fasta files for all accessions') ############################# # read and filter embeddings ############################# emb = None if args.emb is not None: logger.info('reading embeddings from %s' % args.emb) with h5py.File(args.emb, 'r') as f: emb = f['embedding'][:] emb_taxa = f['leaf_names'][:] logger.info('selecting embeddings for taxa found in %s' % args.accessions) emb = select_embeddings(taxa_ids, emb_taxa, emb) logger.info(f'Writing {len(rep_taxdf)} taxa to taxa table') tt_args = [ 'taxa_table', 'a table for storing taxa data', rep_taxdf.index.values ] tt_kwargs = dict() for t in taxlevels[:-1]: enc = LabelEncoder().fit(rep_taxdf[t].values) _data = enc.transform(rep_taxdf[t].values).astype(np.uint32) _vocab = enc.classes_.astype('U') logger.info(f'{t} - {len(_vocab)} classes') tt_args.append( EnumData(name=t, description=f'label encoded {t}', data=_data, elements=_vocab)) # we have too many species to store this as VocabData, nor does it save any spaces tt_args.append( VectorData(name='species', description=f'Microbial species in the form Genus species', data=rep_taxdf['species'].values)) if emb is not None: tt_kwargs['embedding'] = emb #tt_kwargs['rep_taxon_id'] = rep_taxdf['gtdb_genome_representative'].values taxa_table = TaxaTable(*tt_args, **tt_kwargs) h5path = args.out logger.info("reading %d Fasta files" % len(fapaths)) logger.info("Total size: %d", sum(list(map_func(os.path.getsize, fapaths)))) tmp_h5_file = None if args.protein: vocab_it = AAVocabIterator SeqTable = SequenceTable skbio_cls = Protein else: vocab_it = DNAVocabIterator SeqTable = DNATable skbio_cls = DNA vocab = np.array(list(vocab_it.characters())) if not args.protein: np.testing.assert_array_equal(vocab, list('ACYWSKDVNTGRMHB')) if args.total_seq_len is None: logger.info('counting total number of sqeuences') n_seqs, total_seq_len = np.array( list(zip( *tqdm(map_func(seqlen, fapaths), total=len(fapaths))))).sum( axis=1) logger.info(f'found {total_seq_len} bases across {n_seqs} sequences') else: n_seqs, total_seq_len = args.n_seqs, args.total_seq_len logger.info( f'As specified, there are {total_seq_len} bases across {n_seqs} sequences' ) logger.info( f'allocating uint8 array of length {total_seq_len} for sequences') if args.tmpdir is not None: if not os.path.exists(args.tmpdir): os.mkdir(args.tmpdir) tmpdir = tempfile.mkdtemp(dir=args.tmpdir) else: tmpdir = tempfile.mkdtemp() comp = 'gzip' if args.gzip else None tmp_h5_filename = os.path.join(tmpdir, 'sequences.h5') logger.info(f'writing temporary sequence data to {tmp_h5_filename}') tmp_h5_file = h5py.File(tmp_h5_filename, 'w') sequence = tmp_h5_file.create_dataset('sequences', shape=(total_seq_len, ), dtype=np.uint8, compression=comp) 
seqindex = tmp_h5_file.create_dataset('sequences_index', shape=(n_seqs, ), dtype=np.uint64, compression=comp) genomes = tmp_h5_file.create_dataset('genomes', shape=(n_seqs, ), dtype=np.uint64, compression=comp) seqlens = tmp_h5_file.create_dataset('seqlens', shape=(n_seqs, ), dtype=np.uint64, compression=comp) names = tmp_h5_file.create_dataset('seqnames', shape=(n_seqs, ), dtype=h5py.special_dtype(vlen=str), compression=comp) taxa = np.zeros(len(fapaths), dtype=int) seq_i = 0 b = 0 for genome_i, fa in tqdm(enumerate(fapaths), total=len(fapaths)): kwargs = { 'format': 'fasta', 'constructor': skbio_cls, 'validate': False } taxid = taxa_ids[genome_i] rep_taxid = taxdf['gtdb_genome_representative'][genome_i] taxa[genome_i] = np.where(rep_taxdf.index == rep_taxid)[0][0] for seq in skbio.io.read(fa, **kwargs): enc_seq = vocab_it.encode(seq) e = b + len(enc_seq) sequence[b:e] = enc_seq seqindex[seq_i] = e genomes[seq_i] = genome_i seqlens[seq_i] = len(enc_seq) names[seq_i] = vocab_it.get_seqname(seq) b = e seq_i += 1 ids = tmp_h5_file.create_dataset('ids', data=np.arange(n_seqs), dtype=int) tmp_h5_file.flush() io = get_hdf5io(h5path, 'w') print([a['name'] for a in GenomeTable.__init__.__docval__['args']]) genome_table = GenomeTable( 'genome_table', 'information about the genome each sequence comes from', taxa_ids, taxa, taxa_table=taxa_table) ############################# # read and trim tree ############################# if args.tree: logger.info('Reading tree from %s' % args.tree) root = TreeNode.read(args.tree, format='newick') logger.info('Found %d tips' % len(list(root.tips()))) logger.info('Transforming leaf names for shearing') for tip in root.tips(): tip.name = tip.name[3:].replace(' ', '_') logger.info('converting tree to Newick string') bytes_io = BytesIO() root.write(bytes_io, format='newick') tree_str = bytes_io.getvalue() di_kwargs['tree'] = NewickString('tree', data=tree_str) # get distances from tree if they are not provided tt_dmat = root.tip_tip_distances().filter(rep_taxdf.index) di_kwargs['distances'] = CondensedDistanceMatrix('distances', data=tt_dmat.data) adj, gt_indices = get_tree_graph(root, rep_taxdf) di_kwargs['tree_graph'] = TreeGraph(data=adj, leaves=gt_indices, table=genome_table, name='tree_graph') if args.gzip: names = io.set_dataio(names, compression='gzip', chunks=True) sequence = io.set_dataio(sequence, compression='gzip', maxshape=(None, ), chunks=True) seqindex = io.set_dataio(seqindex, compression='gzip', maxshape=(None, ), chunks=True) seqlens = io.set_dataio(seqlens, compression='gzip', maxshape=(None, ), chunks=True) genomes = io.set_dataio(genomes, compression='gzip', maxshape=(None, ), chunks=True) ids = io.set_dataio(ids, compression='gzip', maxshape=(None, ), chunks=True) seq_table = SeqTable( 'seq_table', 'a table storing sequences for computing sequence embedding', names, sequence, seqindex, seqlens, genomes, genome_table=genome_table, id=ids, vocab=vocab) difile = DeepIndexFile(seq_table, taxa_table, genome_table, **di_kwargs) before = datetime.now() io.write(difile, exhaust_dci=False, link_data=False) io.close() after = datetime.now() delta = (after - before).total_seconds() logger.info( f'Sequence totals {sequence.dtype.itemsize * sequence.size} bytes') logger.info(f'Took {delta} seconds to write after read') if tmp_h5_file is not None: tmp_h5_file.close() logger.info("reading %s" % (h5path)) h5size = os.path.getsize(h5path) logger.info("HDF5 size: %d", h5size)
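# Example (sketch): the temporary file written above packs every encoded
# sequence end-to-end into one flat uint8 'sequences' dataset, with
# 'sequences_index' holding the running (exclusive) end offset of each
# sequence. A minimal reader that recovers sequence i from that layout; the
# file path in the usage line is hypothetical.
import h5py

def get_encoded_seq(tmp_h5_filename, i):
    with h5py.File(tmp_h5_filename, 'r') as f:
        end = int(f['sequences_index'][i])
        start = int(f['sequences_index'][i - 1]) if i > 0 else 0
        name = f['seqnames'][i]
        genome = int(f['genomes'][i])
        return name, genome, f['sequences'][start:end]

# name, genome_i, codes = get_encoded_seq('sequences.h5', 0)  # hypothetical path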