def convert_genotypes(self):
    chunk_size = self.split_size
    if chunk_size is None:
        raise ValueError('CONVERTER_SPLIT_SIZE is not defined in the config file!')
    G = np.array([])
    # self.reader.folder.processed = 0
    while True:
        with Timer() as t:
            G = self.reader.folder.get_bed(chunk_size)
            if G is None:
                break
        print('Time to read {} SNPs is {} s'.format(G.shape[0], t.secs))

        self.write_data('gen')
        atom = tables.Int8Atom()
        self.genotype = self.h5_gen_file.create_carray(self.h5_gen_file.root,
                                                       'genotype',
                                                       atom,
                                                       G.shape,
                                                       title='Genotype',
                                                       filters=self.pytable_filters)
        with Timer() as t:
            self.genotype[:] = G
        print('Time to write {} SNPs is {} s'.format(G.shape[0], t.secs))

        self.h5_gen_file.close()
        G = None
        gc.collect()
def contact_maps_from_traj(pdb_file, traj_file, savefile, contact_cutoff=8.0):
    """
    Get contact map from trajectory.
    """
    mda_traj = mda.Universe(pdb_file, traj_file)
    traj_length = len(mda_traj.trajectory)
    nloops = int(brute(best_loop, (loop_range, ), args=(traj_length, size), finish=None))
    print("traj_length: %d nloop: %d" % (traj_length, nloops))
    write_freq = nloops // 5

    ca = mda_traj.select_atoms('name CA')
    dist_shape = distances.self_distance_array(ca.positions).shape[0]

    if rank == 0:
        savefile = os.path.abspath(savefile)
        outfile = tables.open_file(savefile, 'w')
        atom = tables.Int8Atom()
        cm_table = outfile.create_earray(outfile.root, 'contact_maps', atom,
                                         shape=(0, dist_shape))
        print("dist_shape ", dist_shape)

    contact_matrices = []
    # workaround mpi4py 2^32 limit on number of objects
    # and ib memory size limit
    for loop in range(nloops):
        contact_matrices_loop = []
        nframes = traj_length // (size * nloops)
        start = (rank + loop * size) * nframes
        end = (rank + 1 + loop * size) * nframes
        if loop == nloops - 1 and rank == size - 1:
            end = traj_length
        print("loop %d rank %d start %d end %d" % (loop, rank, start, end))

        for frame in mda_traj.trajectory[start:end]:
            cm_matrix = (distances.self_distance_array(ca.positions) < contact_cutoff) * 1.0
            contact_matrices_loop.append(cm_matrix.astype('int8'))
        print("rank %d cm size %d" % (rank, len(contact_matrices_loop)))

        contact_matrices_loop = comm.gather(contact_matrices_loop, root=0)
        if rank == 0:
            contact_matrices.append(list(chain.from_iterable(contact_matrices_loop)))
            print("loop %d " % loop, len(contact_matrices_loop), len(contact_matrices_loop[0]))
            if (loop + 1) % write_freq == 0:
                contact_matrices = list(chain.from_iterable(contact_matrices))
                cm_table.append(contact_matrices)
                contact_matrices = []
        comm.Barrier()

    if rank == 0:
        if len(contact_matrices) > 0:
            contact_matrices = list(chain.from_iterable(contact_matrices))
            cm_table.append(contact_matrices)
        outfile.close()
def init_genotypes(self):
    """Create the empty on-file array used to store the genotypes."""
    atom = tables.Int8Atom()
    self.genotype = self.h5_file.createCArray(self.h5_file.root, 'genotype', atom,
                                              (self.num_probes, self.num_individuals),
                                              title='Genotype',
                                              filters=self.pytable_filters)
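# A minimal, self-contained sketch of the same pattern as init_genotypes above:
# create an empty Int8 CArray and fill it row by row. The file name, the shape
# and the random fill are illustrative assumptions, not part of the original class.
import numpy as np
import tables

with tables.open_file('genotypes_demo.h5', 'w') as h5_file:
    atom = tables.Int8Atom()
    filters = tables.Filters(complevel=5, complib='zlib')
    # (num_probes, num_individuals) chosen arbitrarily for the demo
    genotype = h5_file.create_carray(h5_file.root, 'genotype', atom, (100, 50),
                                     title='Genotype', filters=filters)
    for probe in range(100):
        genotype[probe, :] = np.random.randint(0, 3, size=50, dtype=np.int8)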
def copy_data(targeth, name, source, idx):
    a = targeth.createCArray(targeth.root, name, tb.Int8Atom(),
                             shape=[len(idx)] + list(source.shape[1:]))
    print "Copying {} images to {}".format(len(idx), name)
    t0 = time()
    for i, ind in enumerate(idx):
        if i > 0 and i % printfreq == 0:
            progress(i, len(idx), time() - t0, printfreq)
            t0 = time()
        a[i, :] = source[ind, :]
def create_data_file(out_file, n_channels, n_samples, image_shape,
                     storage_names=('data', 'truth', 'affine'),
                     affine_shape=(0, 4, 4), normalize=True,
                     affine_dtype=tables.Float32Atom()):
    hdf5_file = tables.open_file(out_file, mode='w')
    # complib='blosc' removed, as suggested in https://github.com/ellisdg/3DUnetCNN/issues/58
    filters = tables.Filters(complevel=5)
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))

    if not normalize:
        data_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[0],
                                               tables.Int8Atom(), shape=data_shape,
                                               filters=filters, expectedrows=n_samples)
    else:
        data_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[0],
                                               tables.Float32Atom(), shape=data_shape,
                                               filters=filters, expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[1],
                                            tables.UInt8Atom(), shape=truth_shape,
                                            filters=filters, expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[2],
                                             affine_dtype, shape=affine_shape,
                                             filters=filters, expectedrows=n_samples)
    if len(storage_names) == 4:
        # will hold the mean and std of each case for later normalization
        normalization_storage = hdf5_file.create_earray(hdf5_file.root, storage_names[3],
                                                        tables.Float32Atom(), shape=(0, 2),
                                                        filters=filters,
                                                        expectedrows=n_samples)
        return hdf5_file, data_storage, truth_storage, affine_storage, normalization_storage
    return hdf5_file, data_storage, truth_storage, affine_storage
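# Hedged usage sketch for create_data_file above: the output file name, channel
# count, sample count and image shape are made-up values, and the unpacking
# assumes the default three-name storage_names tuple (so four values are returned).
hdf5_file, data_storage, truth_storage, affine_storage = create_data_file(
    'training_data_demo.h5', n_channels=4, n_samples=10, image_shape=(64, 64, 64))
# data_storage.append(...) then expects arrays of shape (1, 4, 64, 64, 64),
# truth_storage.append(...) arrays of shape (1, 1, 64, 64, 64).
hdf5_file.close()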
def _create_table(self, name, example):
    """
    Create a new table within the HDF file, where the table's shape and its
    datatype are determined by *example*.
    """
    type_map = {
        np.dtype(np.float64): tables.Float64Atom(),
        np.dtype(np.float32): tables.Float32Atom(),
        np.dtype(np.int): tables.Int64Atom(),
        np.dtype(np.int8): tables.Int8Atom(),
        np.dtype(np.uint8): tables.UInt8Atom(),
        np.dtype(np.int16): tables.Int16Atom(),
        np.dtype(np.uint16): tables.UInt16Atom(),
        np.dtype(np.int32): tables.Int32Atom(),
        np.dtype(np.uint32): tables.UInt32Atom(),
        np.dtype(np.bool): tables.BoolAtom(),
    }

    try:
        if type(example) == np.ndarray:
            h5type = type_map[example.dtype]
        elif type(example) == str:
            h5type = tables.VLStringAtom()
    except KeyError:
        raise TypeError("Could not create table %s because of unknown dtype '%s'"
                        % (name, example.dtype))  # + ", of name: " % example.shape)

    if type(example) == np.ndarray:
        h5dim = (0, ) + example.shape
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_earray(h5.root, name, h5type, h5dim,
                                             filters=filters)
    elif type(example) == str:
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_vlarray(h5.root, name, h5type,
                                              filters=filters)
    self.types[name] = type(example)
def _create_table_list(self, name, example):
    """
    Create a new table within the HDF file, where the table's shape and its
    datatype are determined by *example*.

    The modified version for creating a table with appendList.
    """
    type_map = {
        np.dtype(np.float64): tables.Float64Atom(),
        np.dtype(np.float32): tables.Float32Atom(),
        np.dtype(np.int): tables.Int64Atom(),
        np.dtype(np.int8): tables.Int8Atom(),
        np.dtype(np.uint8): tables.UInt8Atom(),
        np.dtype(np.int16): tables.Int16Atom(),
        np.dtype(np.uint16): tables.UInt16Atom(),
        np.dtype(np.int32): tables.Int32Atom(),
        np.dtype(np.uint32): tables.UInt32Atom(),
        np.dtype(np.bool): tables.BoolAtom(),
    }

    try:
        if type(example) == np.ndarray:
            h5type = type_map[example.dtype]
        elif type(example) == list and type(example[0]) == str:
            h5type = tables.VLStringAtom()
    except KeyError:
        raise TypeError("Don't know how to handle dtype '%s'" % example.dtype)

    if type(example) == np.ndarray:
        h5dim = (0, ) + example.shape[1:]
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_earray(h5.root, name, h5type, h5dim,
                                             filters=filters)
    elif type(example) == list and type(example[0]) == str:
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        self.tables[name] = h5.create_vlarray(h5.root, name, h5type,
                                              filters=filters)
    self.types[name] = type(example)
def main(options):
    gdb = genome.db.GenomeDB(assembly=options.assembly)
    chrom_dict = gdb.get_chromosome_dict()
    track = gdb.create_track(options.track_name[0])

    if options.dtype == "float32":
        atom = tables.Float32Atom()
    elif options.dtype == "int8":
        atom = tables.Int8Atom()
    elif options.dtype == "uint8":
        atom = tables.UInt8Atom()
    elif options.dtype == "int16":
        atom = tables.Int16Atom()
    else:
        raise NotImplementedError("datatype %s not implemented" % options.dtype)

    for path in options.filename:
        filename = path.split("/")[-1]

        if options.format in ("xb", "xbf"):
            # all of the chromosomes are in a single file...
            chrom_names = [chrom.name for chrom in gdb.get_chromosomes()]
        else:
            chrom_names = [extract_chrom_name(filename)]

        for chrom_name in chrom_names:
            if chrom_name not in chrom_dict:
                raise ValueError("unknown chromosome '%s'" % chrom_name)
            chrom = chrom_dict[chrom_name]
            sys.stderr.write(chrom_name + "\n")

            # create a chunked array with one dimension the length
            # of the chromosome
            shape = [chrom.length]
            carray = track.h5f.createCArray(track.h5f.root, chrom_name,
                                            atom, shape, filters=ZLIB_FILTER)

            # populate the array with data read from a file
            carray[:] = trackreader.read_file(path, chrom,
                                              dtype=options.dtype,
                                              format=options.format,
                                              pos_idx=options.pos_idx,
                                              val_idx=options.val_idx,
                                              strand=options.strand)

    track.close()
def write_image_annotation_pairs_to_h5(filename_pairs, h5_filename):
    atom = tables.Int8Atom()
    h5_file = tables.open_file(h5_filename, mode='a')
    array_x = h5_file.create_earray(h5_file.root, 'X', atom, (0, 512, 1024, 3))
    array_y = h5_file.create_earray(h5_file.root, 'Y', atom, (0, 512, 1024))
    h = 512
    w = 1024
    for img_path, annotation_path in tqdm(filename_pairs):
        img = misc.imread(img_path)
        img = misc.imresize(img, (h, w))
        annotation = misc.imread(annotation_path)
        annotation = custom_ignore_labels(annotation)
        annotation = misc.imresize(annotation, (h, w), 'nearest')
        array_x.append(np.expand_dims(img, 0))
        array_y.append(np.expand_dims(annotation, 0))
    h5_file.close()
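# Hedged read-back sketch for the layout written by write_image_annotation_pairs_to_h5
# above: 'X' and 'Y' are the EArray names used by the function; the file name is an
# assumption.
import tables

with tables.open_file('image_annotation_pairs.h5', 'r') as h5_file:
    images = h5_file.root.X          # (N, 512, 1024, 3) int8
    annotations = h5_file.root.Y     # (N, 512, 1024) int8
    first_image = images[0]          # only this slice is read into memory
    first_annotation = annotations[0]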
def _create_table(self, name, example, parent=None):
    """
    Create a new table within the HDF file, where the table's shape and its
    datatype are determined by *example*.
    """
    h5 = self.h5
    filters = tables.Filters(complevel=self.compression_level,
                             complib='zlib', shuffle=True)
    if parent is None:
        parent = h5.root

    if type(example) == str:
        h5type = tables.VLStringAtom()
        h5.createVLArray(parent, name, h5type, filters=filters)
        return
    if type(example) == dict:
        self.h5.createGroup(parent, name)
        return

    # If we get here then we're dealing with numpy arrays
    example = np.asarray(example)

    # MODIFICATION: appended name everywhere and introduced string
    type_map = {
        np.dtype(np.float64).name: tables.Float64Atom(),
        np.dtype(np.float32).name: tables.Float32Atom(),
        np.dtype(np.int).name: tables.Int64Atom(),
        np.dtype(np.int8).name: tables.Int8Atom(),
        np.dtype(np.uint8).name: tables.UInt8Atom(),
        np.dtype(np.int16).name: tables.Int16Atom(),
        np.dtype(np.uint16).name: tables.UInt16Atom(),
        np.dtype(np.int32).name: tables.Int32Atom(),
        np.dtype(np.uint32).name: tables.UInt32Atom(),
        np.dtype(np.bool).name: tables.BoolAtom(),
        # Maximal string length of 128 per string - change if needed
        'string32': tables.StringAtom(128)
    }

    try:
        h5type = type_map[example.dtype.name]
        h5dim = (0, ) + example.shape
        h5.createEArray(parent, name, h5type, h5dim, filters=filters)
    except KeyError:
        raise TypeError("Don't know how to handle dtype '%s'" % example.dtype)
def init_phenotypes(self):
    """Extract the phenotype from the individuals."""
    # phenotypes = numpy.loadtxt(self.fam_name, usecols=[5], dtype=int)
    phenotypes = self.individuals['phenotype']
    phenotypes[phenotypes == 1] = -1
    phenotypes[phenotypes == 2] = 1
    phenotypes.shape = (len(phenotypes), 1)

    # check for undefined phenotypes
    undefined_phenotype = flatnonzero(phenotypes == 0)
    if len(undefined_phenotype) > 0:
        print('Some phenotypes were undefined')
        print(undefined_phenotype)

    atom = tables.Int8Atom()
    self.h5_file.createCArray(self.h5_file.root, 'phenotype', atom,
                              (1, self.num_individuals),
                              title='Phenotype',
                              filters=self.pytable_filters)
    self.h5_file.root.phenotype[:] = phenotypes.flatten()
    self.phenotypes = self.h5_file.root.phenotype
def main(args):
    usage = """python %s <plink root> <h5 file>

Convert binary PLINK files into an h5 file.
E.g.: py plink2h5.py mydata_final_clean mydata_final_clean.h5\n""" % args[0]

    if len(args) != 3:
        sys.stderr.write(usage)
        sys.exit(0)

    plinkRoot = args[1]
    h5fname = args[2]
    plinkTitle = plinkRoot.split("/")[-1]

    # Read binary PLINK files
    plinkF = pf.open(plinkRoot)
    numSnps = len(plinkF.get_loci())
    numSamples = len(plinkF.get_samples())
    print "%d SNPs x %d samples" % (numSnps, numSamples)

    # Create the empty array to store genotypes
    atom = tables.Int8Atom()
    h5F = tables.openFile(h5fname, 'w', title=plinkTitle)
    genotype = h5F.createCArray(h5F.root, 'genotype', atom,
                                (numSnps, numSamples),
                                title='Genotype',
                                filters=tables.Filters(complevel=5, complib='blosc'))

    # populate
    for counter, row in enumerate(plinkF):
        genotype[counter, :] = list(row)
        if counter % 10000 == 9999:
            print counter + 1, 'SNPs read'

    plinkF.close()
    h5F.close()
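# Hedged read-back sketch for the file written by plink2h5 above; the file and node
# names match the script, the slice is illustrative. CArrays are read lazily, so
# only the requested rows are loaded into memory.
import tables

with tables.open_file('mydata_final_clean.h5', 'r') as h5F:
    genotype = h5F.root.genotype            # (numSnps, numSamples) int8 CArray
    first_hundred_snps = genotype[:100, :]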
                                  precision, group=None):
        pass


if is_tables:
    precision_to_atom = {
        'float32': tables.Float32Atom(),
        'complex64': tables.ComplexAtom(8),
        'float64': tables.Float64Atom(),
        'complex128': tables.ComplexAtom(16),
        'bool': tables.BoolAtom(),
        'int32': tables.Int32Atom(),
        'int16': tables.Int16Atom(),
        'int8': tables.Int8Atom(),
    }

    class H5FileTables(H5FileBase, tables.File):
        def create_extendable_array(self, nodename, shape, precision, group=None):
            if not group:
                group = self.root
            atom = precision_to_atom[precision]
            self.create_earray(group, nodename, atom, shape)

        def get_data_by_reference(self, nodename, group=None):
            if not group:
                group = self.root
            return self.get_node(group, nodename)
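# Hedged sketch of how a precision-to-atom mapping like the one above is typically
# used: look up the atom for a precision string and create an extendable array with
# it. The file and node names are illustrative, and the mapping is re-declared
# locally so the example is self-contained.
import tables

local_precision_to_atom = {'int8': tables.Int8Atom(), 'float32': tables.Float32Atom()}

with tables.open_file('precision_demo.h5', 'w') as h5:
    atom = local_precision_to_atom['int8']
    samples = h5.create_earray(h5.root, 'samples', atom, (0, 128))
    # samples.append(...) now accepts int8 blocks of shape (n, 128)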
name = 'group_' + sys.argv[1][-17:-12]
feature = np.loadtxt(sys.argv[2])  # this is the features for secondary structure
h5 = tb.open_file('pssm_test_table_jhE0', 'a')  ##########!!!!!!!!!! THIS IS THE ONLY THING YOU HAVE TO CHANGE !!!!!!!!!!!!!!!!
group = h5.create_group('/', name, 'individual group')
pssm = h5.create_earray(group, name='one_hot', shape=(0, 21, 15),
                        atom=tb.Float32Atom())  # would be 0, 21, 15 if you want it to be the shape of the old one
ss = h5.create_earray(group, name='ss', shape=(0, 3), atom=tb.Int8Atom(),
                      filters=tb.Filters(complevel=9, complib='blosc:snappy'))

index = []
#### splitting the sliding table into bits of 21 sized timesteps ##
for num, line in enumerate(window):
    if num != 0 and num % 21 == 0:
        index.append(num)
window = np.vsplit(window, index)
# print np.shape(window)

for feat, line in zip(feature, window):
    ss.append(feat[np.newaxis, :])
    pssm.append(line[np.newaxis, :])
##############################################################################################################################
###################################################  TO PYTABLE  #############################################################
##############################################################################################################################

name = 'group_' + sys.argv[1][-8:-3]
print name
# feature = np.loadtxt(sys.argv[2])  # this is the features for secondary structure
h5 = tb.open_file(sys.argv[4], 'a')
group = h5.create_group('/', name, 'individual group')
seq_tab = h5.create_earray(group, name='seq_tab', shape=(0, 20, 15),
                           atom=tb.Float32Atom())  # would be 0, 21, 15 if you want it to be the shape of the old one
pssm_tab = h5.create_earray(group, name='pssm_tab', shape=(0, 21, 15),
                            atom=tb.Float32Atom())  # would be 0, 21, 15 if you want it to be the shape of the old one
ss3_feat = h5.create_earray(group, name='ss3_feat', shape=(0, 3), atom=tb.Int8Atom())
ss8_feat = h5.create_earray(group, name='ss8_feat', shape=(0, 8), atom=tb.Int8Atom())
rsa_feat = h5.create_earray(group, name='rsa_feat', shape=(0, 1), atom=tb.Float32Atom())

rsa = np.reshape(rsa, (-1, 1))

############## might need to add filter!!!!! #################
### also there were 3 difficult files, something went wrong in comp. In the future we can
### maybe take these away with an if loop just to check that they match

#### splitting the sliding table into bits of 21 (pssm) or 20 (seq) sized timesteps ##
def timesteps(array, size):
    index = []
    for num, line in enumerate(array):
def load_color(random_seed=123522):
    # Check if the dataset is in the data directory.
    data_path = os.path.join(os.path.split(__file__)[0], "data")
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    dataset = 'train.zip'
    data_file = os.path.join(data_path, dataset)
    if os.path.isfile(data_file):
        dataset = data_file

    if (not os.path.isfile(data_file)):
        try:
            import urllib
            urllib.urlretrieve('http://google.com')
        except AttributeError:
            import urllib.request as urllib
        url = 'https://dl.dropboxusercontent.com/u/15378192/train.zip'
        print('Downloading data from %s' % url)
        urllib.urlretrieve(url, data_file)

    data_dir = os.path.join(data_path, 'cvd')
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        zf = zipfile.ZipFile(data_file)
        zf.extractall(data_dir)

    data_file = os.path.join(data_path, 'cvd_color.hdf5')
    label_file = os.path.join(data_path, 'cvd_color_labels.npy')
    if not os.path.exists(data_file):
        print('... loading data')
        cat_matches = []
        dog_matches = []
        for root, dirname, filenames in os.walk(data_dir):
            for filename in fnmatch.filter(filenames, 'cat*'):
                cat_matches.append(os.path.join(root, filename))
            for filename in fnmatch.filter(filenames, 'dog*'):
                dog_matches.append(os.path.join(root, filename))

        sort_key = lambda x: int(x.split('.')[-2])
        cat_matches = sorted(cat_matches, key=sort_key)
        dog_matches = sorted(dog_matches, key=sort_key)

        def square(x):
            resize_shape = (260, 260)
            slice_size = (256, 256)
            slice_left = (resize_shape[0] - slice_size[0]) / 2
            slice_upper = (resize_shape[1] - slice_size[1]) / 2
            return imresize(x, resize_shape, interp='nearest')[
                slice_left:slice_left + slice_size[0],
                slice_upper:slice_upper + slice_size[1]].transpose(2, 0, 1).astype('uint8')

        matches = cat_matches + dog_matches
        matches = np.array(matches)
        random_state = np.random.RandomState(random_seed)
        idx = random_state.permutation(len(matches))
        c = [0] * len(cat_matches)
        d = [1] * len(dog_matches)
        y = np.array(c + d).astype('uint8')
        matches = matches[idx]
        y = y[idx]

        compression_filter = tables.Filters(complevel=5, complib='blosc')
        h5_file = tables.openFile(data_file, mode='w')
        example = square(mpimg.imread(matches[0]))
        image_storage = h5_file.createEArray(h5_file.root, 'images',
                                             tables.Int8Atom(),
                                             shape=(0, ) + example.shape,
                                             filters=compression_filter)
        for n, f in enumerate(matches):
            print("Processing image %i of %i" % (n, len(matches)))
            x = square(mpimg.imread(f)).astype('uint8')
            image_storage.append(x[None])
        h5_file.close()
        np.save(label_file, y)

    h5_file = tables.openFile(data_file, mode='r')
    x_s = h5_file.root.images
    y_s = np.load(label_file)
    return (x_s, y_s)
tmesh_id = h5file.createGroup(trianglesMesh_id, 'tmesh')
h5file.setNodeAttr(tmesh_id, 'type', 'unstructured')

eltnodes_arr = h5file.createCArray(tmesh_id, "elementNodes", tables.Int32Atom(),
                                   numpy.shape(eltnodes), filters=filters)
i = 0
for array in eltnodes_arr:
    eltnodes_arr[i] = eltnodes[i]
    i += 1

elttypes_arr = h5file.createCArray(tmesh_id, "elementTypes", tables.Int8Atom(),
                                   numpy.shape(elttypes), filters=filters)
nodes_arr = h5file.createCArray(tmesh_id, "nodes", tables.Float32Atom(),
                                numpy.shape(nodes), filters=filters)
i = 0
for array in nodes_arr:
    nodes_arr[i] = nodes[i]
    i += 1

i = 0
for array in elttypes_arr:
###################################################  PSSM TO PYTABLE  #######################################################
##############################################################################################################################

name = 'group_' + sys.argv[1][-8:-3]
print name
# feature = np.loadtxt(sys.argv[2])  # this is the features for secondary structure
h5 = tb.open_file('8state_table', 'a')  ##########!!!!!!!!!! THIS IS THE ONLY THING YOU HAVE TO CHANGE !!!!!!!!!!!!!!!!
group = h5.create_group('/', name, 'individual group')
one_hot = h5.create_earray(group, name='one_hot', shape=(0, 20, 15),
                           atom=tb.Float32Atom())  # would be 0, 21, 15 if you want it to be the shape of the old one
ss = h5.create_earray(group, name='ss', shape=(0, 8), atom=tb.Int8Atom())

index = []
#### splitting the sliding table into bits of 21 sized timesteps ##
for num, line in enumerate(final):
    if num != 0 and num % 20 == 0:
        index.append(num)
final = np.vsplit(final, index)
# print np.shape(final)

for feat, line in zip(encoded, final):
    ss.append(feat[np.newaxis, :])
    one_hot.append(line[np.newaxis, :])

print ss
print one_hot
def preprocess_all_data(dataset_directory, input_size=1024,
                        alphabet='אבגדהוזחטיכךלמםנןסעפףצץקרשת "',
                        output_filename='./sample_dataset/sample_dataset'):
    """
    Gets the dataset directory path (whose expected structure is detailed behind TODO)
    and writes the data as numeric NumPy ndarrays to an HDF5 file and a TFRecord file.

    If the output files already exist, the preprocessed data is *overwritten*.
    """
    # initialize variables
    preprocessed_samples = np.array([], dtype=np.int8)
    preprocessed_labels = np.array([], dtype=np.int8)
    h5_fn = output_filename + '.h5'
    tfr_fn = output_filename + '.tfrecords'

    # initialize the files the dataset will be stored in
    with tables.open_file(h5_fn, mode='w') as h5file, tf.io.TFRecordWriter(tfr_fn) as tfwriter:
        typeAtom = tables.Int8Atom()
        print('Processing...')

        # iterate over authors
        ds_path = pathlib.Path(dataset_directory)
        for author_label, author_dir in enumerate(ds_path.iterdir()):
            # validate
            print('Processing ' + str(author_dir) + '...')
            if not author_dir.is_dir():
                print('File ' + str(author_dir) + ' ignored (invalid location).')
                continue

            # create h5 group and table
            gauthor = h5file.create_group(h5file.root, 'author' + str(author_label),
                                          author_dir.name)
            array_c = h5file.create_earray(gauthor, 'samples', typeAtom,
                                           (0, len(alphabet), input_size),
                                           author_dir.name + " Samples")
            # author_dict[author_label] = author_dir.name

            for book_path in author_dir.iterdir():
                # validation check
                if not book_path.is_file():
                    print('Directory ' + str(author_dir) + ' ignored (invalid location).')
                    continue
                if book_path.suffix != '.json':
                    print('File ' + str(author_dir) + ' ignored (type should be JSON).')
                    continue

                # load JSON data
                with book_path.open(mode='r', encoding='utf8') as book_file:
                    try:
                        book_raw_text = json.load(book_file)['text']
                        # book_raw_text = book_raw_data
                    except:
                        print('File ' + str(author_dir) + ' ignored (impossible to read JSON).')
                        continue

                # flatten
                if isinstance(book_raw_text, list):
                    # no internal separation of text
                    flattened_raw_lst = list(flatten(book_raw_text))
                elif isinstance(book_raw_text, dict):
                    # internal separation of text - dict of dicts
                    tmp = []
                    for d in book_raw_text.values():
                        if isinstance(d, dict):
                            tmp.extend(list(d.values()))
                        elif isinstance(d, list):
                            tmp.extend(d)
                    flattened_raw_lst = list(flatten(tmp))
                else:
                    raise ValueError(str(book_path) + ': Could not parse.')

                # ensure the file does not have a different structure from the expected one
                assert (all(isinstance(x, str) for x in flattened_raw_lst))
                # TODO: check manually all is well

                # concatenate
                flattened_raw_str = ''.join(flattened_raw_lst)
                # TODO: handle single quote characters

                # keep only letters in the alphabet and remove multiple spaces
                filtered = re.sub('[^' + alphabet + ']', ' ', flattened_raw_str)
                filtered = re.sub(' +', ' ', filtered)
                # TODO: is it always correct to replace out-of-alphabet characters by spaces?
                # split to samples
                # TODO: prevent cutting in the middle of words
                n = input_size
                samples = [filtered[i:i + n] for i in range(0, len(filtered), n)]

                # convert to numerical one-hot
                samples_onehot_minus1 = np.stack(
                    [str2onehot(sample, alphabet) for sample in samples[0:-1]], axis=0)

                # pad the last sample and add it to the 3-d array
                lastsample_onehot = str2onehot(samples[-1], alphabet)
                lastsample_onehot_padded = np.zeros_like(samples_onehot_minus1[-1, :, :],
                                                         dtype=np.int8)
                lastsample_onehot_padded[0:lastsample_onehot.shape[0],
                                         0:lastsample_onehot.shape[1]] = lastsample_onehot
                samples_onehot = np.concatenate(
                    (samples_onehot_minus1, lastsample_onehot_padded[np.newaxis, :, :]))

                ## write to file
                # write to h5
                array_c.append(samples_onehot)
                # write to tfrecord
                for text_arr in samples_onehot:
                    tf_example = text_example(text_arr, author_label)
                    tfwriter.write(tf_example.SerializeToString())

            h5file.flush()
            tfwriter.flush()
image_params = config.image_params
stream = config.image_prep_function(args.data_dir, labels, **image_params)
one_patient = next(stream)
idd_len = len(one_patient[1])
c, h, w = one_patient[2].shape
print("Image channels: {} rows: {} cols: {}".format(c, h, w))
stream = chain([one_patient], stream)

tables_file = os.path.join(args.output_dir, 'image_data.h5')
image_data_table = tables.open_file(tables_file, mode='w')
image_atom = tables.Float32Atom()
int_atom = tables.Int8Atom()
id_atom = tables.StringAtom(len(one_patient[1]))
train_data = image_data_table.create_earray(image_data_table.root, 'train_data',
                                            atom=image_atom, shape=(0, 1, h, w))
train_labels = image_data_table.create_earray(image_data_table.root, 'train_labels',
                                              atom=int_atom, shape=(0, 2))
test_data = image_data_table.create_earray(image_data_table.root, 'test_data',
                                           atom=image_atom, shape=(0, 1, h, w))
test_ids = image_data_table.create_earray(image_data_table.root, 'test_ids',
                                          atom=id_atom, shape=(0, idd_len))

test_sample = 1
train_sample = 1
for item in stream:
    dataset, idd, image = item
    id_chars = np.chararray(shape=(1, len(idd)))
    id_chars[:] = idd
targetfile = arguments['<target.h5>']
printfreq = int(arguments['--progress-freq'])

FILTERS = tb.Filters(complevel=5, complib='zlib')
sourceh = tb.open_file(source, mode='r', filters=FILTERS)
likeh = tb.open_file(like, mode='r', filters=FILTERS)
targeth = tb.open_file(targetfile, mode='w', filters=FILTERS)

source_mask = sourceh.root.datamask
source_data = sourceh.root.data
like_mask = likeh.root.datamask
like_data = likeh.root.data

target_data = targeth.createCArray(targeth.root, 'data', tb.Int8Atom(),
                                   shape=[source_data.shape[0]] + list(like_data.shape[1:]))

# copy data
n_images = source_data.shape[0]
print "Remasking {} images...".format(n_images)
t0 = time()
for i in range(n_images):
    if i > 0 and i % printfreq == 0:
        progress(i, n_images, time() - t0, printfreq)
        t0 = time()
    reconstituted = np.zeros(shape=source_mask.shape)
    reconstituted[np.array(source_mask)] = source_data[i, :]
    target_data[i, :] = reconstituted[np.array(like_mask)]
t = f.root.targets[:]
pos = np.sum(t)
batches = [any(t[i * batch_size:(i + 1) * batch_size]) for i in range(number)]
n = np.sum(batches) * batch_size
j = 0
while (n < pos * 2 - 300):
    batches[j] = True
    n = np.sum(batches) * batch_size
    j += 1

a = f.create_earray(f.root, 'balance_targets', tables.Int8Atom(), (0,), expectedrows=n)
for i in range(number):
    if batches[i]:
        a.append(t[i * batch_size:(i + 1) * batch_size])

for name in total:
    data = f.root[name][:]
    if name.startswith('hist'):
        features = (6272,)
    else:
        features = (64, 64)
index = []
for num, line in enumerate(OH):
    if num != 0 and num % 20 == 0:
        index.append(num)
OH = np.vsplit(OH, index)

# h5 = tables.open_file(sys.argv[3], 'w')
#### here i make one big table, just change out for the one above if you want lots of
#### individual tables and for gods sake dont forget to delete the file if you run again
h5 = tables.open_file('big_table', 'a')
group = h5.create_group('/', name, 'individual group')
one_hot = h5.create_earray(group, name='one_hot', shape=(0, 20, 15), atom=tables.Int8Atom())
# pssm = h5.create_earray(h5.root, name='pssm', shape=(0, 15, 21), atom=tables.Float32Atom())

d = feat.ndim
if d == 1:
    ss = h5.create_earray(group, name='ss', shape=(0, d), atom=tables.Int8Atom())
    feat = np.reshape(feat, (-1, 1))
else:
    if len(feat[0]) == 4:
        ss = h5.create_earray(group, name='ss', shape=(0, 4), atom=tables.Int8Atom())
def generate_modular(self):
    """ Generate synthetic data with a modular network and a genotype matrix
    made of random {0, 1, 2}.

    Generated files
    ---------------
    <root_dir>/<simu_id>.readme:
        README file describing the simulation parameters.

    <root_dir>/<simu_id>.task_similarities.txt:
        args.num_tasks x args.num_tasks matrix \Omega of task covariance.

    <root_dir>/<simu_id>.causal_features:
        args.num_tasks lists of NUM_CAUSAL_EACH causal features,
        chosen from the first NUM_CAUSAL_TOTAL features.
        One list per task. Indices start at 0.

    <root_dir>/<simu_id>.causal_weights:
        Lists of the weights given to the causal features,
        generated so as to respect the covariance structure given by Omega.
        One list per task, in the order of <simu_id>.causal_features.

    <root_dir>/<simu_id>.genotypes.txt:
        num_features x num_samples matrix of {0, 1, 2} (representing SNPs).

    <root_dir>/<simu_id>.network.dimacs:
        A modular network over the self.num_features features,
        with fully connected modules of size MOD_SIZE.

    For task_id in 0, ..., args.num_tasks:
        <root_dir>/<simu_id>.phenotype_<task_id>.txt:
            Phenotype vector (of size args.num_samples) for task <task_id>.

        <root_dir>/<simu_id>.scores_<task_id>.txt:
            Node weights (of size args.num_features) for task <task_id>.
            Computed as Pearson correlation.
    """
    # Writing readme
    readme_f = '%s/%s.readme' % (self.root_dir, self.simu_id)
    with open(readme_f, 'w') as f:
        f.write("# Features generated by generate_data.generate_modular\n")
        f.write("%d\tfeatures\n" % self.num_features)
        f.write("%d\tsamples\n" % self.num_samples)
        f.write("%d\ttasks\n" % self.num_tasks)
        f.close()
    logging.info("README file created under %s\n" % readme_f)

    # Generate a matrix of similarities between tasks
    omega = np.random.uniform(size=(self.num_tasks, self.num_tasks))
    omega = omega.transpose().dot(omega)
    d = np.diag(omega)
    d.shape = (self.num_tasks, 1)
    omega = omega / np.sqrt(d.dot(d.transpose()))

    # Save omega to file
    fname = "%s/%s.task_similarities.txt" % (self.root_dir, self.simu_id)
    np.savetxt(fname, omega, fmt='%.3f')
    logging.info("Covariance matrix saved under %s\n" % fname)

    # Generate beta vectors that are correlated according to omega
    # Trick: cov(Ax) = A cov(x) A'
    L = np.linalg.cholesky(omega)  # i.e. LL' = omega
    b = np.random.normal(size=(self.num_tasks, NUM_CAUSAL_TOTAL))
    beta = L.dot(b)

    # For each task, keep the NUM_CAUSAL_EACH features with the highest weight
    # as causal; drop the weight of the others to 0.
    causal_features = []
    for k in range(self.num_tasks):
        b = [x for x in beta[k, :]]
        b.sort()
        causal_features.append(np.where(beta[k, :] >= b[-NUM_CAUSAL_EACH])[0])
        beta[k, np.where(beta[k, :] < b[-NUM_CAUSAL_EACH])[0]] = 0.
    # Save causal features to file
    fname = "%s/%s.causal_features.txt" % (self.root_dir, self.simu_id)
    np.savetxt(fname, causal_features, fmt='%d')
    logging.info("Causal features saved under %s\n" % fname)

    # Save beta to file
    fname = "%s/%s.causal_weights.txt" % (self.root_dir, self.simu_id)
    np.savetxt(fname, beta)
    logging.info("Causal weights saved under %s\n" % fname)

    # Generate genotypes
    # Create PyTables structure for X transposed (so as to access rows, not cols)
    fname = "%s/%s.genotypes.txt" % (self.root_dir, self.simu_id)
    with tb.open_file(fname, 'w') as h5f:
        filters = tb.Filters(complevel=5, complib='blosc')
        Xtr = h5f.create_carray(h5f.root, 'Xtr', tb.Int8Atom(),
                                shape=(self.num_features, self.num_samples))
        for row in xrange(self.num_features):
            Xtr[row, :] = np.random.random_integers(0, high=2, size=self.num_samples)
        h5f.close()
    logging.info("Genotypes saved under %s\n" % fname)

    # Generate phenotypes and Pearson scores, and save them to file
    with tb.open_file(fname, 'r') as h5f:
        Xtr = h5f.root.Xtr
        for task_idx in range(self.num_tasks):
            y = Xtr[:NUM_CAUSAL_TOTAL, :].transpose().dot(beta.transpose()[:, task_idx])
            y += np.random.normal(scale=0.1, size=(self.num_samples, ))
            fname = "%s/%s.phenotype_%d.txt" % (self.root_dir, self.simu_id, task_idx)
            np.savetxt(fname, y, fmt='%.3f')
            logging.info("Phenotype for task %d saved under %s\n" % (task_idx, fname))

            # compute feature-phenotype correlations
            r2 = [st.pearsonr(Xtr[feat_idx, :].transpose(), y)[0]**2
                  for feat_idx in range(self.num_features)]
            fname = "%s/%s.scores_%d.txt" % (self.root_dir, self.simu_id, task_idx)
            np.savetxt(fname, r2, fmt='%.3e')
            logging.info("Node weights for task %d saved under %s\n" % (task_idx, fname))

    # Generate the network in dimacs format
    # Careful: node indices must start at 1
    num_modules = self.num_features / MOD_SIZE
    num_edges = MOD_SIZE * (MOD_SIZE - 1) * num_modules + \
        2 * (num_modules - 1) + 2 * (self.num_features - MOD_SIZE * num_modules)
    dimacs_f = '%s/%s.network.dimacs' % (self.root_dir, self.simu_id)
    with open(dimacs_f, 'w') as g:
        g.write("p max %d %d\n" % ((self.num_features), num_edges))
        # create fully connected modules of size MOD_SIZE
        # and connect each to the next one
        for mod_idx in range(num_modules):
            x = mod_idx * MOD_SIZE
            if mod_idx > 0:
                g.write("a %d %d 1\n" % ((x + 1), (x)))  # connect to previous module
            for x_idx1 in range(MOD_SIZE):
                for x_idx2 in range(MOD_SIZE):
                    if x_idx1 != x_idx2:
                        g.write("a %d %d 1\n" % ((x + x_idx1 + 1), (x + x_idx2 + 1)))
            if (x + MOD_SIZE) < self.num_features:
                g.write("a %d %d 1\n" % ((x + MOD_SIZE), (x + MOD_SIZE + 1)))  # connect to next module

        # connect each of the remaining nodes to its neighbor
        for x_idx in range(x + MOD_SIZE + 1, self.num_features):
            g.write("a %d %d 1\n" % ((x_idx), (x_idx - 1)))
            g.write("a %d %d 1\n" % ((x_idx), (x_idx + 1)))
        # last connection (mirror of the previous one)
        g.write("a %d %d 1\n" % ((x_idx + 1), (x_idx)))
        g.close()
    logging.info("Network saved under %s\n" % dimacs_f)
def collect_skeletons(experiments_df, main_file, file_ext='_featuresN.hdf5',
                      gap_to_interp_seconds=3, sample_size_frames_s=10):
    assert all(x in experiments_df
               for x in ('directory', 'base_name', 'fps', 'id', 'strain'))

    with tables.File(main_file, 'w') as tab_fid:
        r_dtype = []
        for col in experiments_df:
            dat = experiments_df[col]
            if dat.dtype == np.dtype('O'):
                n_s = dat.str.len().max()
                dt = np.dtype('S%i' % n_s)
            else:
                dt = dat.dtype
            r_dtype.append((col, dt))

        # save the experiments table. I do it after the loop to store the fps information
        tab_recarray = experiments_df.to_records(index=False).astype(np.dtype(r_dtype))
        tab_fid.create_table('/', 'experiments_data', obj=tab_recarray,
                             filters=TABLE_FILTERS)

        table_type = np.dtype([('experiment_id', np.int32),
                               ('worm_index', np.int32),
                               ('strain', 'S10'),
                               ('ini_time_aprox', np.float32),
                               ('ini', np.int32),
                               ('fin', np.int32)])
        data_table = tab_fid.create_table('/', "skeletons_groups", table_type,
                                          "Worm feature List", filters=TABLE_FILTERS)
        skeletons_data = tab_fid.create_earray('/', 'skeletons_data',
                                               atom=tables.Float32Atom(),
                                               shape=(0, 49, 2),
                                               expectedrows=experiments_df.shape[0] * 22500,
                                               filters=TABLE_FILTERS)
        is_bad_skeleton_data = tab_fid.create_earray('/', 'is_bad_skeleton',
                                                     atom=tables.Int8Atom(),
                                                     shape=(0, ),
                                                     expectedrows=experiments_df.shape[0] * 22500,
                                                     filters=TABLE_FILTERS)

        # timer = TimeCounter(tot_frames = len(experiments_df))
        tot_skels = 0
        for irow, row in tqdm.tqdm(experiments_df.iterrows(), total=len(experiments_df)):
            try:
                features_file = os.path.join(row['directory'], row['base_name'] + file_ext)
                with pd.HDFStore(features_file, 'r') as fid:
                    assert '/timeseries_data' in fid
            except AssertionError:
                continue

            for output in _process_file(features_file, row['fps'],
                                        gap_to_interp_seconds, sample_size_frames_s):
                worm_index, worm_data, skeletons, is_bad_skeleton, borders = output
                if not borders:
                    continue

                for bb in borders:
                    skels = skeletons[bb[0]:bb[1]]
                    assert not np.any(np.isnan(skels))
                    is_bad = is_bad_skeleton[bb[0]:bb[1]]
                    ini_t = worm_data['timestamp'].values[bb[0]] / row['fps']

                    rr = (row['id'], int(worm_index), np.array(row['strain']),
                          ini_t, tot_skels, tot_skels + skels.shape[0] - 1)
                    data_table.append([rr])
                    skeletons_data.append(skels)
                    is_bad_skeleton_data.append(is_bad)
                    tot_skels += skels.shape[0]
                    # print(rr[3:], tot_skels, skeletons_data.shape)

            data_table.flush()
            skeletons_data.flush()
            # print(timer.get_str(irow+1))

    # SAVE STRAIN CODES
    # I am reading the skeletons_groups instead of the experiment data,
    # to ignore strains without a valid skeleton
    with pd.HDFStore(main_file, 'r') as fid:
        skeletons_groups = fid['/skeletons_groups']

    # get strain data
    ss = skeletons_groups['strain'].unique()
    n_c = max(len(x) for x in ss)
    strains_dict = {x: ii for ii, x in enumerate(np.sort(ss))}
    strains_codes = np.array(list(strains_dict.items()),
                             np.dtype([('strain', 'S' + str(n_c)), ('strain_id', np.int)]))

    with tables.File(main_file, 'r+') as fid:
        if '/strains_codes' in fid:
            fid.remove_node('/strains_codes')
        fid.create_table('/', 'strains_codes', obj=strains_codes,
                         filters=TABLE_FILTERS)
def _create_table_list(self, name, example):
    """
    Create a new table within the HDF file, where the table's shape and its
    datatype are determined by *example*.

    The modified version for creating a table with appendList.
    """
    type_map = {
        np.dtype(np.float64): tables.Float64Atom(),
        np.dtype(np.float32): tables.Float32Atom(),
        np.dtype(np.int): tables.Int64Atom(),
        np.dtype(np.int8): tables.Int8Atom(),
        np.dtype(np.uint8): tables.UInt8Atom(),
        np.dtype(np.int16): tables.Int16Atom(),
        np.dtype(np.uint16): tables.UInt16Atom(),
        np.dtype(np.int32): tables.Int32Atom(),
        np.dtype(np.uint32): tables.UInt32Atom(),
        np.dtype(np.bool): tables.BoolAtom(),
    }

    try:
        if type(example) == np.ndarray:
            h5type = type_map[example.dtype]
        elif type(example) == list and type(example[0]) == str:
            h5type = tables.VLStringAtom()
    except KeyError:
        raise TypeError("Don't know how to handle dtype '%s'" % example.dtype)

    if type(example) == np.ndarray:
        h5dim = (0, ) + example.shape[1:]
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        nodes = h5.list_nodes(h5.root)
        nmpt = name.replace('.', '/\n')
        nmpt = nmpt.split('\n')
        path = '/'
        for kay in range(len(nmpt) - 1):
            # if not path + nmpt[kay][:-1] in str(nodes): h5.create_group(path, nmpt[kay][:-1])
            try:
                h5.is_visible_node(path + nmpt[kay][:-1])
            except:
                h5.create_group(path, nmpt[kay][:-1])
            path += nmpt[kay]
        self.tables[name] = h5.create_earray(path, nmpt[-1], h5type, h5dim,
                                             filters=filters)
    elif type(example) == list and type(example[0]) == str:
        h5 = self.h5
        filters = tables.Filters(complevel=self.compression_level,
                                 complib='zlib', shuffle=True)
        nodes = h5.list_nodes(h5.root)
        nmpt = name.replace('.', '/\n')
        nmpt = nmpt.split('\n')
        path = '/'
        for kay in range(len(nmpt) - 1):
            # if not path + nmpt[kay][:-1] in str(nodes): h5.create_group(path, nmpt[kay][:-1])
            try:
                h5.is_visible_node(path + nmpt[kay][:-1])
            except:
                h5.create_group(path, nmpt[kay][:-1])
            path += nmpt[kay]
        self.tables[name] = h5.create_vlarray(path, nmpt[-1], h5type,
                                              filters=filters)
    self.types[name] = type(example)
targets_location = base + 'data/6464/targets/'
dst = base + 'data/6464/h5/{}.h5'

files = []
for filename in os.listdir(targets_location):
    files += [filename]

for filename in files:
    f = tables.open_file(dst.format(filename[:12]), 'w')
    targets = f.create_earray(f.root, 'targets', tables.Int8Atom(), (0,),
                              expectedrows=7476)
    print('{}'.format(filename))
    t = open(targets_location + filename)
    targets_csv = csv.reader(t)
    targets_single = []
    for row in targets_csv:
        targets_single += [row[1]]
    t.close()
    targets_single = targets_single[:7476]
    targets.append(np.array(targets_single))
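# Hedged read-back sketch for the per-video target files written above; the file
# name stem is a made-up example, while the node name 'targets' matches the script.
import tables

with tables.open_file(dst.format('example_stem_'), 'r') as f:
    targets = f.root.targets[:]   # 1-D int8 array of up to 7476 labels per file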
def main(fname, concurrent_edges=None, concurrent_bin=None, exlude_ranges=None):
    """
    creates table to store artifact information
    """
    for sign in SIGNS:
        # why is this here?
        if READONLY:
            mode = 'r'
        else:
            mode = 'r+'
        h5fid = tables.open_file(fname, mode)

        try:
            node = h5fid.get_node('/' + sign + '/times')
        except tables.NoSuchNodeError:
            print('{} has no {} spikes'.format(fname, sign))
            continue

        if len(node.shape) == 0:
            continue
        elif node.shape[0] == 0:
            continue

        times = node[:]
        num_spk = times.shape[0]
        spikes = h5fid.get_node('/' + sign, 'spikes')[:, :]
        assert num_spk == spikes.shape[0]

        try:
            artifacts = h5fid.get_node('/' + sign + '/artifacts')
        except tables.NoSuchNodeError:
            h5fid.create_array('/' + sign, 'artifacts',
                               atom=tables.Int8Atom(), shape=(num_spk, ))
            artifacts = h5fid.get_node('/' + sign + '/artifacts')

        if RESET:
            artifacts[:] = 0

        arti_by_diff, arti_by_diff_id = mark_by_diff(times)
        add_id(artifacts, arti_by_diff, arti_by_diff_id, sign)
        # artifacts[arti_by_diff != 0] = arti_by_diff_id
        # if DEBUG:
        #     print('Marked {} {} spikes by diff'.format(arti_by_diff.sum(), sign))

        arti_by_height, arti_by_height_id = mark_by_height(spikes, sign)
        add_id(artifacts, arti_by_height, arti_by_height_id, sign)
        # artifacts[arti_by_height != 0] = arti_by_height_id
        # if DEBUG:
        #     print('Marked {} {} spikes by height'.format(arti_by_height.sum(), sign))

        arti_by_double, double_id = mark_double_detection(times, spikes, sign)
        add_id(artifacts, arti_by_double, double_id, sign)
        # artifacts[arti_by_double != 0] = double_id
        # if DEBUG:
        #     print('Marked {} {} spikes as detected twice'.format(arti_by_double.sum(), sign))

        if concurrent_edges is not None:
            arti_by_conc, arti_by_conc_id = mark_by_bincount(times, concurrent_edges,
                                                             concurrent_bin)
            add_id(artifacts, arti_by_conc, arti_by_conc_id, sign)
            # artifacts[arti_by_conc != 0] = arti_by_conc_id
            # if DEBUG:
            #     print('Marked {} {} spikes by concurrent occurence'.format(arti_by_conc.sum(), sign))

        if exlude_ranges is not None:
            arti_by_ranges, range_id = mark_range_detection(times, exlude_ranges)
            add_id(artifacts, arti_by_ranges, range_id, sign)
            # artifacts[arti_by_ranges != 0] = range_id
            # if DEBUG:
            #     print('Marked {} {} spikes within supplied range'.format(arti_by_ranges.sum(), sign))

        h5fid.close()
# file 2, apd2
f2 = tables.open_file('tempAPD2_copy.hdf', 'r')
ts_2 = f2.root.timestamps

# lengths
f1_num = f1.root.timestamps.nrows
f2_num = f2.root.timestamps.nrows
row_num = (f1_num + f2_num)

# file 3, outfile
f3 = tables.open_file('sortedFile.hdf', mode='w')
f3.create_group(f3.root, name='photon_data')
filters = tables.Filters(complevel=6, complib='zlib')
atom1 = tables.UInt32Atom()
atom2 = tables.Int8Atom()
ts = f3.create_carray('/photon_data', name='timestamps', atom=atom1,
                      shape=(row_num, 1), filters=filters)
det = f3.create_carray('/photon_data', name='detectors', atom=atom2,
                       shape=(row_num, 1), filters=filters)

# Calculations
start = time.time()
merge_files(ts_1, ts_2, ts, det, f1_num, f2_num)
print("Merging took %f seconds." % (time.time() - start))