def save_query_to_hdf5_point(self, save_path_queries_hdf5, entry_id,
                             sample_original_space):
    """
    Append a query point and its entry id to the queries HDF5 file,
    creating the file on first use.
    """
    sample_original_space = to_vector(sample_original_space).T
    if os.path.isfile(save_path_queries_hdf5):
        with h5py.File(save_path_queries_hdf5, 'r+') as hf:
            points_dataset = hf.get('point_queries')
            already_in_points_ds = points_dataset.shape[0]
            points_dataset.resize(
                already_in_points_ds + sample_original_space.shape[0], axis=0)
            points_dataset[already_in_points_ds:already_in_points_ds +
                           sample_original_space.shape[0], :] = \
                sample_original_space
            entryids_dataset = hf.get('entry_ids')
            already_in_entryids_ds = entryids_dataset.len()
            entryids_dataset.resize(already_in_entryids_ds + 1, axis=0)
            entryids_dataset[already_in_entryids_ds:
                             already_in_entryids_ds + 1] = entry_id
            split_dict = {
                "data": {
                    "point_queries":
                    (0, already_in_points_ds + sample_original_space.shape[0]),
                    "entry_ids": (0, already_in_entryids_ds + 1)
                }
            }
            hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    else:
        # HDF5 query save file does not exist yet: create it.
        f = h5py.File(save_path_queries_hdf5, "w")
        points_dataset = f.create_dataset(
            'point_queries',
            sample_original_space.shape,
            maxshape=(None, sample_original_space.shape[1]),
            dtype="float32")
        points_dataset[...] = sample_original_space
        entryids_dataset = f.create_dataset(
            'entry_ids', (1, ), maxshape=(None, ), dtype=int)
        entryids_dataset[...] = entry_id
        split_dict = {
            "data": {
                "point_queries": (0, sample_original_space.shape[0]),
                "entry_ids": (0, 1)
            }
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
def save_db_point_to_hdf5(self, db_point_scaled_space):
    """
    Save a decision boundary annotation to hdf5.

    :param db_point_scaled_space: (n_samples, n_features)
    :return:
    """
    try:
        db_point_original_space = self.scaling_transformation.inverse_transform(
            db_point_scaled_space)  # shape (1, nlat)
        if os.path.isfile(self.save_path_dbpoints_hdf5):
            with h5py.File(self.save_path_dbpoints_hdf5, 'r+') as hf:
                dbpoints_dataset = hf.get('db_points')
                already_in_dataset = dbpoints_dataset.shape[0]
                dbpoints_dataset.resize(
                    already_in_dataset + db_point_original_space.shape[0],
                    axis=0)
                dbpoints_dataset[already_in_dataset:already_in_dataset +
                                 db_point_original_space.shape[0], :] = \
                    db_point_original_space
                split_dict = {
                    "data": {
                        "db_points":
                        (0, already_in_dataset + db_point_original_space.shape[0])
                    }
                }
                hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
        else:
            # HDF5 db-point save file does not exist yet: create it.
            f = h5py.File(self.save_path_dbpoints_hdf5, "w")
            dbpoints_dataset = f.create_dataset(
                'db_points',
                db_point_original_space.shape,
                maxshape=(None, db_point_original_space.shape[1]),
                dtype="float32")
            dbpoints_dataset[...] = db_point_original_space
            split_dict = {
                "data": {
                    "db_points": (0, db_point_original_space.shape[0])
                }
            }
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
            f.close()
    except Exception:
        traceback.print_exc()
def build_contexts_ds(config, all_contexts, nsamples_train, nsamples_dev,
                      nsamples_test, nsamples_dev_big):
    logger.info('building contexts dataset')
    totals = len(all_contexts)
    ctx_dtype = h5py.special_dtype(vlen=np.dtype('int32'))
    logger.info("#contexts: %d", totals)
    with h5py.File(config['dsdir'] + "_contexts.hdf", mode='w') as fp:
        contexts = fp.create_dataset(
            'contexts',
            compression='gzip',
            data=[np.asarray(ctx.context) for ctx in all_contexts],
            shape=(totals, ),
            dtype=ctx_dtype)
        split_dict = {
            'train': {
                'contexts': (0, nsamples_train)
            },
            'dev': {
                'contexts': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'contexts': (nsamples_train + nsamples_dev,
                             nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'contexts': (nsamples_train + nsamples_dev + nsamples_test,
                             totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def build_entmentions_ds(config, all_contexts, nsamples_train, nsamples_dev,
                         nsamples_test, nsamples_dev_big):
    logger.info('building entmentions dataset')
    totals = len(all_contexts)
    ctx_dtype = h5py.special_dtype(vlen=np.dtype('uint32'))
    dsdir = config['dsdir']
    ctx_entity_dtype = np.dtype([("id", np.dtype(str), 64),
                                 ("token", np.dtype(str), 64),
                                 ("position", np.dtype('uint8'))])
    with h5py.File(dsdir + "_entmentions.hdf", mode='w') as fp:
        context_entities = fp.create_dataset(
            'entmentions',
            compression='gzip',
            data=np.asarray([(ctx.entity_id, ctx.entity_str, ctx.entity_idx)
                             for ctx in all_contexts],
                            dtype=ctx_entity_dtype))
        split_dict = {
            'train': {
                'entmentions': (0, nsamples_train)
            },
            'dev': {
                'entmentions': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'entmentions': (nsamples_train + nsamples_dev,
                                nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'entmentions': (nsamples_train + nsamples_dev + nsamples_test,
                                totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def build_mentions_ds(config, all_contexts, nsamples_train, nsamples_dev,
                      nsamples_test, nsamples_dev_big, max_len_men=4):
    logger.info('building mentions (indices of mention words) dataset')
    totals = len(all_contexts)
    dsdir = config['dsdir']
    mentions_m = numpy.ones(shape=(totals, max_len_men), dtype='int32')
    for i, ctx in enumerate(all_contexts):
        mentions_m[i] = ctx.mention
    with h5py.File(dsdir + "_mentions.hdf", mode='w') as fp:
        mentions = fp.create_dataset('mentions', mentions_m.shape,
                                     dtype='int32')
        mentions[...] = mentions_m
        split_dict = {
            'train': {
                'mentions': (0, nsamples_train)
            },
            'dev': {
                'mentions': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'mentions': (nsamples_train + nsamples_dev,
                             nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'mentions': (nsamples_train + nsamples_dev + nsamples_test,
                             totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
def build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                      vectorfile, use_lowercase=False, max_num_words=10,
                      upto=None):
    if vectorfile is None:
        return
    # word vocabulary is built from mention names (only entity names are
    # used) across all splits
    word_to_idx, idx_to_word = build_word_vocab(
        trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_words = numpy.zeros(shape=(totals, max_num_words), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        input_words[i] = get_ngram_seq(word_to_idx, words,
                                       max_len=max_num_words)
    logger.info('shape of subwords dataset: %s', input_words.shape)
    hdf5_file = dsdir + '_subwords.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('subwords', input_words.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(word_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_words
    features.dims[0].label = 'words'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'subwords': (0, nsamples_train)},
        'dev': {'subwords': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'subwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building subwords dataset finished. It is saved in: %s',
                hdf5_file)
    logger.info('writing subword embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    with h5py.File(dsdir + "_subwords_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
def build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file,
                    vectorfile, upto=-1):
    (embeddings, word2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in word2idx:
            entvec = embeddings[word2idx[mye]]
        input_entvec[i] = entvec
    print input_entvec.shape
    hdf5_file += '_entvec.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('entvec', input_entvec.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = vectorsize
    features[...] = input_entvec
    features.dims[0].label = 'entity_vector'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'entvec': (0, nsamples_train)},
        'dev': {'entvec': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'entvec': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building entityVec dataset finished. It is saved in: %s',
                hdf5_file)
def _initialize_conversion(directory, output_path, image_shape):
    h5file = h5py.File(output_path, mode='w')
    split_dict = {
        'train': {
            'features': (0, TRAIN_STOP),
            'targets': (0, TRAIN_STOP)},
        'valid': {
            'features': (TRAIN_STOP, VALID_STOP),
            'targets': (TRAIN_STOP, VALID_STOP)},
        'test': {
            'features': (VALID_STOP, NUM_EXAMPLES),
            'targets': (VALID_STOP, NUM_EXAMPLES)}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    targets_dataset = h5file.create_dataset(
        'targets', (NUM_EXAMPLES, 40), dtype='uint8')
    targets_dataset.dims[0].label = 'batch'
    targets_dataset.dims[1].label = 'target'
    # Attributes are stored on disk as +/-1; map them to {0, 1}.
    targets_dataset[...] = (
        numpy.loadtxt(os.path.join(directory, ATTRIBUTES_FILE),
                      dtype='int32', skiprows=2,
                      usecols=tuple(range(1, 41))) + 1) / 2
    features_dataset = h5file.create_dataset(
        'features', (NUM_EXAMPLES, 3) + image_shape, dtype='uint8')
    features_dataset.dims[0].label = 'batch'
    features_dataset.dims[1].label = 'channel'
    features_dataset.dims[2].label = 'height'
    features_dataset.dims[3].label = 'width'
    return h5file
def build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx,
                     hdf5_file, embpath, emb_list, vectorsize=200, upto=-1):
    print "building hs Ngram datasets: ", emb_list
    for emb_version in emb_list:
        print emb_version
        mypath = os.path.join(embpath, emb_version)
        nsamples_train = len(trnMentions)
        nsamples_dev = len(devMentions)
        totals = nsamples_train + nsamples_dev + len(tstMentions)
        vectorsize = get_vec_size(mypath + '/train.txt')
        input_hsngram_matrix = numpy.zeros(shape=(totals, vectorsize),
                                           dtype='float32')
        input_hsngram_matrix[0:nsamples_train] = load_embmatirx(
            mypath + '/train.txt', len(trnMentions), vectorsize, upto)
        input_hsngram_matrix[nsamples_train:nsamples_train + nsamples_dev] = \
            load_embmatirx(mypath + '/dev.txt', len(devMentions), vectorsize,
                           upto)
        input_hsngram_matrix[nsamples_train + nsamples_dev:totals] = \
            load_embmatirx(mypath + '/test.txt', len(tstMentions), vectorsize,
                           upto)
        print input_hsngram_matrix.shape
        srcname = 'hsngram_' + emb_version
        # Use a fresh output name per embedding version; reassigning
        # hdf5_file here would keep appending suffixes on every iteration.
        out_file = hdf5_file + '_' + srcname + '.h5py'
        print out_file
        f = h5py.File(out_file, mode='w')
        features = f.create_dataset(srcname, input_hsngram_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = vectorsize
        features[...] = input_hsngram_matrix
        features.dims[0].label = srcname + '_vector'
        split_dict = {
            'train': {srcname: (0, nsamples_train)},
            'dev': {srcname: (nsamples_train, nsamples_train + nsamples_dev)},
            'test': {srcname: (nsamples_train + nsamples_dev, totals)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
        logger.info('Building Hinrich ngram-level embeddings of mentions '
                    'finished. Saved in: %s', out_file)
def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                        hdf5_file, vectorfile, upto=-1):
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in voc2idx:
            entvec = embeddings[voc2idx[mye]]
        input_entvec[i] = entvec
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize, voc2idx)  # a matrix with size: 102 * dim
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)
    hdf5_file += '_tc.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tc', ent_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
    features[...] = ent_types_cosin_matrix
    features.dims[0].label = 'types_ent_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'tc': (0, nsamples_train)},
        'dev': {'tc': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'tc': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-ent cosine (tc) dataset finished. '
                'It is saved in: %s', hdf5_file)
def _initialize_conversion(directory, output_path, image_shape):
    h5file = h5py.File(output_path, mode='w')
    split_dict = {
        'train': {
            'features': (0, TRAIN_STOP),
            'targets': (0, TRAIN_STOP)
        },
        'valid': {
            'features': (TRAIN_STOP, VALID_STOP),
            'targets': (TRAIN_STOP, VALID_STOP)
        },
        'test': {
            'features': (VALID_STOP, NUM_EXAMPLES),
            'targets': (VALID_STOP, NUM_EXAMPLES)
        }
    }
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    targets_dataset = h5file.create_dataset('targets',
                                            (NUM_EXAMPLES, ) + image_shape,
                                            dtype='uint8')
    targets_dataset.dims[0].label = 'batch'
    targets_dataset.dims[1].label = 'height'
    targets_dataset.dims[2].label = 'width'
    features_dataset = h5file.create_dataset('features',
                                             (NUM_EXAMPLES, 3) + image_shape,
                                             dtype='uint8')
    features_dataset.dims[0].label = 'batch'
    features_dataset.dims[1].label = 'channel'
    features_dataset.dims[2].label = 'height'
    features_dataset.dims[3].label = 'width'
    return h5file
def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    n_labeled = n_train + n_valid
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_labeled, 1),
                             dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')
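# Usage sketch for prepare_hdf5_file (illustrative only: the file name and
# split sizes are made up, and `create_splits` is assumed to be a sibling
# helper that returns the dict(split -> dict(source -> (start, stop)))
# mapping expected by H5PYDataset.create_split_array).
import h5py

with h5py.File('dataset.hdf5', mode='w') as h5file:
    prepare_hdf5_file(h5file, n_train=1000, n_valid=200, n_test=100)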
def build_targets_ds(config, all_contexts, nsamples_train, nsamples_dev,
                     nsamples_test, nsamples_dev_big):
    logger.info("building targets dataset")
    entity_types = list(load_types(config['typefile']))
    (t2idx, _) = cmn.loadtypes(config['typefile'])
    totals = len(all_contexts)
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, ctx in enumerate(all_contexts):
        types_idx = [t2idx[t] for t in ctx.all_types if t in t2idx]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    dsdir = config['dsdir']
    fp = h5py.File(dsdir + '_targets.hdf', mode='w')
    targets = fp.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    split_dict = {
        'train': {
            'targets': (0, nsamples_train)
        },
        'dev': {
            'targets': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'targets': (nsamples_train + nsamples_dev,
                        nsamples_train + nsamples_dev + nsamples_test)
        },
        'devbig': {
            'targets': (nsamples_train + nsamples_dev + nsamples_test,
                        totals)
        }
    }
    fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    fp.flush()
    fp.close()
def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    hdf5_file = dsdir + '_targets.h5py'
    f = h5py.File(hdf5_file, mode='w')
    targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {
            'targets': (0, nsamples_train)
        },
        'dev': {
            'targets': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'targets': (nsamples_train + nsamples_dev, totals)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                     vectorfile=None, max_len_name=30):
    # character vocabulary is built from the training mentions only, since
    # only entity names are used for characters
    char_to_idx, idx_to_char = build_char_vocab(trnMentions)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_letters = numpy.zeros(shape=(totals, max_len_name), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        input_letters[i] = get_ngram_seq(char_to_idx, name, max_len_name)
    print input_letters.shape
    fuelfile = dsdir + '_letters.h5py'
    f = h5py.File(fuelfile, mode='w')
    features = f.create_dataset('letters', input_letters.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(char_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_char,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(char_to_idx)
    features[...] = input_letters
    features.dims[0].label = 'letters'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'letters': (0, nsamples_train)},
        'dev': {'letters': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'letters': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('building letters dataset finished. It is saved in: %s',
                fuelfile)
    if vectorfile is None:
        return
    embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                   vocab=char_to_idx, num=-1)
    logger.info('size of embedding matrix to save is: (%d, %d)',
                embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_letters_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
def test_celeba():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('celeba_64.hdf5', 'w')
        f['features'] = numpy.arange(
            10 * 3 * 64 * 64, dtype='uint8').reshape((10, 3, 64, 64))
        f['targets'] = numpy.arange(10 * 40, dtype='uint8').reshape((10, 40))
        split_dict = {'train': {'features': (0, 6), 'targets': (0, 6)},
                      'valid': {'features': (6, 8), 'targets': (6, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = CelebA(which_format='64', which_sets=('train',))
        assert_equal(dataset.filename, 'celeba_64.hdf5')
    finally:
        config.data_path = data_path
        os.remove('celeba_64.hdf5')
def fill_hdf5_file(h5file, data):
    """Fills an HDF5 file in an H5PYDataset-compatible manner.

    Parameters
    ----------
    h5file : :class:`h5py.File`
        File handle for an HDF5 file.
    data : tuple of tuple
        One element per split/source pair. Each element consists of a
        tuple of (split_name, source_name, data_array, comment), where

        * 'split_name' is a string identifier for the split name
        * 'source_name' is a string identifier for the source name
        * 'data_array' is a :class:`numpy.ndarray` containing the data
          for this split/source pair
        * 'comment' is a comment string for the split/source pair

        The 'comment' element can optionally be omitted.

    """
    # Check that all sources for a split have the same length
    split_names = set(split_tuple[0] for split_tuple in data)
    for name in split_names:
        lengths = [
            len(split_tuple[2]) for split_tuple in data
            if split_tuple[0] == name
        ]
        if not all(le == lengths[0] for le in lengths):
            raise ValueError("split '{}' has sources that ".format(name) +
                             "vary in length")

    # Initialize split dictionary
    split_dict = dict([(split_name, {}) for split_name in split_names])

    # Compute total source lengths and check that splits have the same dtype
    # across a source
    source_names = set(split_tuple[1] for split_tuple in data)
    for name in source_names:
        splits = [s for s in data if s[1] == name]
        indices = numpy.cumsum([0] + [len(s[2]) for s in splits])
        if not all(s[2].dtype == splits[0][2].dtype for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in dtype")
        if not all(s[2].shape[1:] == splits[0][2].shape[1:] for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in shapes")
        dataset = h5file.create_dataset(
            name, (sum(len(s[2]) for s in splits), ) + splits[0][2].shape[1:],
            dtype=splits[0][2].dtype)
        dataset[...] = numpy.concatenate([s[2] for s in splits], axis=0)
        for i, j, s in zip(indices[:-1], indices[1:], splits):
            if len(s) == 4:
                split_dict[s[0]][name] = (i, j, None, s[3])
            else:
                split_dict[s[0]][name] = (i, j)
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
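# Minimal usage sketch for fill_hdf5_file: one (split_name, source_name,
# data_array) tuple per split/source pair. Arrays of the same source are
# concatenated in the order given, and the split array is derived from
# their lengths. The file name and shapes here are made up.
import h5py
import numpy

train_x = numpy.zeros((8, 3), dtype='float32')
test_x = numpy.ones((2, 3), dtype='float32')
train_y = numpy.zeros((8, 1), dtype='int64')
test_y = numpy.ones((2, 1), dtype='int64')
with h5py.File('toy.hdf5', mode='w') as h5file:
    fill_hdf5_file(h5file, (('train', 'features', train_x),
                            ('test', 'features', test_x),
                            ('train', 'targets', train_y),
                            ('test', 'targets', test_y)))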
def save_decision_boundary(self, w, b):
    """
    Append a decision boundary (weights w and bias b) to the boundaries
    HDF5 file, creating the file on first use.
    """
    w = to_vector(w).T
    if os.path.isfile(self.save_path_boundaries):
        with h5py.File(self.save_path_boundaries, 'r+') as hf:
            w_dataset = hf.get('w')
            already_in_w_ds = w_dataset.shape[0]
            w_dataset.resize(already_in_w_ds + w.shape[0], axis=0)
            w_dataset[already_in_w_ds:already_in_w_ds + w.shape[0], :] = w
            b_dataset = hf.get('b')
            already_in_b_ds = b_dataset.len()
            b_dataset.resize(already_in_b_ds + 1, axis=0)
            b_dataset[already_in_b_ds:already_in_b_ds + 1] = b
            split_dict = {
                "data": {
                    "w": (0, already_in_w_ds + w.shape[0]),
                    "b": (0, already_in_b_ds + 1)
                }
            }
            hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    else:
        # HDF5 boundary save file does not exist yet: create it.
        f = h5py.File(self.save_path_boundaries, "w")
        w_dataset = f.create_dataset('w', w.shape,
                                     maxshape=(None, w.shape[1]),
                                     dtype="float32")
        w_dataset[...] = w
        b_dataset = f.create_dataset('b', (1, ), maxshape=(None, ),
                                     dtype="float32")
        b_dataset[...] = b
        split_dict = {"data": {"w": (0, w.shape[0]), "b": (0, 1)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
def make_fuel_dataset(file_name, training_set_data, testing_set_data):
    """
    Write training and testing examples to a Fuel-compatible HDF5 file.

    :param file_name: path of the HDF5 file to create
    :type file_name: str
    :param training_set_data:
    :type training_set_data: list[dict[str, numpy.core.multiarray.ndarray]]
    :param testing_set_data:
    :type testing_set_data: list[dict[str, numpy.core.multiarray.ndarray]]
    :return:
    :rtype:
    """
    f = h5py.File(file_name, mode='w')
    nb_training_examples = len(training_set_data)
    nb_testing_examples = len(testing_set_data)
    nb_examples = nb_training_examples + nb_testing_examples
    # (dataset name, source key in the data dicts, dtype)
    ts = [
        ('audio_features', 'audio_features', 'float32'),
        ('targets_weak_alarm', 'classes_alarm_weak', 'uint8'),
        ('targets_weak_vehicle', 'classes_vehicle_weak', 'uint8'),
        ('targets_strong_alarm', 'classes_alarm_strong', 'uint8'),
        ('targets_strong_vehicle', 'classes_vehicle_strong', 'uint8')
    ]
    shape_labels = ['batch'.encode('utf8'), 'time_frames'.encode('utf8'),
                    'features'.encode('utf8')]
    datasets = []
    datasets_shapes = []
    datasets_shapes_labels = []
    for t in ts:
        datasets.append(f.create_dataset(
            t[0], (nb_examples, ),
            dtype=h5py.special_dtype(vlen=np.dtype(t[2]))))
        datasets[-1][...] = [
            entry[t[1]].flatten()
            for entry in training_set_data + testing_set_data]
        datasets_shapes.append(f.create_dataset(
            '{}_shapes'.format(t[0]), (nb_examples, 3), dtype='int32'))
        datasets_shapes[-1][...] = np.array(
            [entry[t[1]].reshape((1, ) + entry[t[1]].shape).shape
             for entry in training_set_data + testing_set_data])
        datasets[-1].dims.create_scale(datasets_shapes[-1], 'shapes')
        datasets[-1].dims[0].attach_scale(datasets_shapes[-1])
        datasets_shapes_labels.append(f.create_dataset(
            '{}_shape_labels'.format(t[0]), (3, ), dtype='S11'))
        datasets_shapes_labels[-1][...] = shape_labels
        datasets[-1].dims.create_scale(datasets_shapes_labels[-1],
                                       'shape_labels')
        datasets[-1].dims[0].attach_scale(datasets_shapes_labels[-1])
    split_dict = {
        'train': {t[0]: (0, nb_training_examples) for t in ts},
        'test': {t[0]: (nb_training_examples, nb_examples) for t in ts}
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
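# Read-back sketch for the file written above: because each variable-length
# source has a '*_shapes' dimension scale attached, Fuel's H5PYDataset can
# restore the per-example (1, time_frames, features) shape when iterating.
# `file_name` here is whatever path was passed to make_fuel_dataset.
dataset = H5PYDataset(file_name, which_sets=('train',),
                      sources=('audio_features',))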
def save_splits(self):
    """
    Save the splits to the data file.
    """
    logger.info('Producing dataset splits...')
    for split in self.splits:
        split.save()
    split_dict = {split.name: split.to_dict() for split in self.splits}
    splits = H5PYDataset.create_split_array(split_dict)
    logger.debug('split: %s', splits)
    logger.info('Saving splits...')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        self.data_file.attrs['split'] = splits
def make_lr_fuel_file(outfile, inda, indb, indc, X, y):
    """
    Makes a FUEL dataset that combines both left and right features.

    :param outfile: path of the HDF5 file to create
    :param inda: number of training examples
    :param indb: number of validation examples
    :param indc: number of test examples
    :param X: dict with 'l' and 'r' feature arrays
    :param y: target array
    :return:
    """
    # Create the HDF5 file:
    f = h5py.File(outfile, mode='w')
    targets = f.create_dataset('targets', y.shape, dtype='int8')
    l_features = f.create_dataset('l_features', X['l'].shape, dtype='int8')
    r_features = f.create_dataset('r_features', X['r'].shape, dtype='int8')
    # Load the data into it:
    l_features[...] = X['l']
    r_features[...] = X['r']
    targets[...] = y
    # Label the axes:
    targets.dims[0].label = 'sample'
    targets.dims[1].label = 'class'
    l_features.dims[0].label = 'sample'
    l_features.dims[1].label = 'feature'
    r_features.dims[0].label = 'sample'
    r_features.dims[1].label = 'feature'
    # Make a "splits" dictionary as required by Fuel
    split_dict = {
        'train': {'l_features': (0, inda),
                  'r_features': (0, inda),
                  'targets': (0, inda)},
        'valid': {'l_features': (inda, inda + indb),
                  'r_features': (inda, inda + indb),
                  'targets': (inda, inda + indb)},
        'test': {'l_features': (inda + indb, inda + indb + indc),
                 'r_features': (inda + indb, inda + indb + indc),
                 'targets': (inda + indb, inda + indb + indc)},
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    # Save this new dataset to file
    f.flush()
    f.close()
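# Read-back sketch (hedged): load only the training split of the file
# written by make_lr_fuel_file above.
train_set = H5PYDataset(outfile, which_sets=('train',),
                        sources=('l_features', 'r_features', 'targets'))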
def test_svhn():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('svhn_format_2.hdf5', 'w')
        f['features'] = numpy.arange(100, dtype='uint8').reshape((10, 10))
        f['targets'] = numpy.arange(10, dtype='uint8').reshape((10, 1))
        split_dict = {'train': {'features': (0, 8), 'targets': (0, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = SVHN(which_format=2, which_sets=('train',))
        assert_equal(dataset.filename, 'svhn_format_2.hdf5')
    finally:
        config.data_path = data_path
        os.remove('svhn_format_2.hdf5')
def generate_h5(labels, name='breakhis_40PX', path='storage/breakhis/40PX',
                target_size=(256, 256), img_size=[256, 256]):
    n_samples = len(labels)
    n_test, n_dev = int(len(labels) * .3), int(len(labels) * .2)
    n_train = n_samples - n_test - n_dev
    f = h5py.File(f'{name}.hdf5', mode='w')
    _images = f.create_dataset('images', [len(labels)] + img_size + [3],
                               dtype='int32')
    _labels = f.create_dataset('labels', [len(labels)], dtype='int32')
    _images.dims[0].label = 'batch'
    _images.dims[1].label = 'width'
    _images.dims[2].label = 'height'
    _images.dims[3].label = 'channel'
    _labels.dims[0].label = 'batch'
    B = np.copy(np.array(list(labels.keys())))
    rng = np.random.RandomState()
    perm = rng.permutation(B)
    for i in tqdm(range(perm.shape[0])):
        name = perm[i]
        image_path = f"{path}/{name}"
        img = load_img(image_path, target_size=target_size)
        _images[i] = img_to_array(img)
        _labels[i] = labels[name]
    split_dict = {
        'train': {
            'images': (0, n_train),
            'labels': (0, n_train)
        },
        'dev': {
            'images': (n_train, n_train + n_dev),
            'labels': (n_train, n_train + n_dev)
        },
        'test': {
            'images': (n_train + n_dev, n_samples),
            'labels': (n_train + n_dev, n_samples)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def test_camvid():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('camvid.hdf5', 'w')
        f['features'] = numpy.arange(
            10 * 3 * 360 * 480, dtype='uint8').reshape((10, 3, 360, 480))
        f['targets'] = numpy.arange(
            10 * 360 * 480, dtype='uint8').reshape((10, 360, 480))
        split_dict = {'train': {'features': (0, 6), 'targets': (0, 6)},
                      'valid': {'features': (6, 8), 'targets': (6, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = Camvid(which_sets=('train',))
        assert_equal(dataset.filename, 'camvid.hdf5')
    finally:
        config.data_path = data_path
def save_encoded_test_data(save_file, X_test_encoded, y_test):
    import h5py
    f = h5py.File(save_file, mode="w")
    features = f.create_dataset("features", X_test_encoded.shape,
                                dtype="float32")
    targets = f.create_dataset("targets", y_test.shape, dtype="uint8")
    features[...] = X_test_encoded
    targets[...] = y_test
    split_dict = {
        "test": {
            "features": (0, X_test_encoded.shape[0]),
            "targets": (0, X_test_encoded.shape[0])
        }
    }
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
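# Read-back sketch: the file above defines a single 'test' split, so the
# encoded features can be reloaded in memory like this.
test_set = H5PYDataset(save_file, which_sets=('test',), load_in_memory=True)
X_test_encoded, y_test = test_set.data_sources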
def build_desc_features_ds(trnMentions, devMentions, tstMentions,
                           ent2tfidf_features_path, t2idx, dsdir, vectorfile,
                           use_lowercase=True, upto=None):
    if ent2tfidf_features_path is None:
        print "Warning: skipping tfidf features building..."
        return
    ent2features = load_ent2features(ent2tfidf_features_path)
    word_to_idx, idx_to_word = build_voc_from_features(ent2features)
    logger.info('tfidf desc features vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_features = numpy.zeros(
        shape=(totals, len(ent2features.values()[0])), dtype='int32')
    ent_no_emb = 0
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        if men.entityId not in ent2features:
            ent_no_emb += 1
            continue
        features = ent2features[men.entityId]
        input_features[i] = get_ngram_seq(word_to_idx, features,
                                          max_len=input_features.shape[1])
    logger.info('shape of tfidf input dataset: %s', input_features.shape)
    logger.info('number of entities without embeddings: %d', ent_no_emb)
    hdf5_file = dsdir + '_desc_features.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('desc_features', input_features.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(word_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_features
    features.dims[0].label = 'description_features'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'desc_features': (0, nsamples_train)},
        'dev': {'desc_features': (nsamples_train,
                                  nsamples_train + nsamples_dev)},
        'test': {'desc_features': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building desc_features dataset finished. It is saved in: %s',
                hdf5_file)
    logger.info('writing word embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    print "embeddings shape: ", idx2embeddings.shape
    with h5py.File(dsdir + "_desc_features_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
def make_one_sided_fuel_file(outfile, inda, indb, indc, X, y, side):
    """
    Makes a dataset that includes only a single side of features.

    :param outfile: path of the HDF5 file to create
    :param inda: number of training examples
    :param indb: number of validation examples
    :param indc: number of test examples
    :param X: feature array for the chosen side
    :param y: target array
    :param side: name prefix for the features source (e.g. 'l' or 'r')
    :return:
    """
    # Create the HDF5 file:
    f = h5py.File(outfile, mode='w')
    targets = f.create_dataset('targets', y.shape, dtype='int8')
    features = f.create_dataset('{}_features'.format(side), X.shape,
                                dtype='int8')
    # Load the data into it:
    features[...] = X
    targets[...] = y
    # Label the axes:
    targets.dims[0].label = 'sample'
    targets.dims[1].label = 'class'
    features.dims[0].label = 'sample'
    features.dims[1].label = 'feature'
    # Make a "splits" dictionary as required by Fuel
    split_dict = {
        'train': {'{}_features'.format(side): (0, inda),
                  'targets': (0, inda)},
        'valid': {'{}_features'.format(side): (inda, inda + indb),
                  'targets': (inda, inda + indb)},
        'test': {'{}_features'.format(side): (inda + indb,
                                              inda + indb + indc),
                 'targets': (inda + indb, inda + indb + indc)},
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    # Save this new dataset to file
    f.flush()
    f.close()
def save_indices_unlabeled(self, opt, unlabeled_indices):
    path_indices_hdf5 = os.path.join(
        opt.save_path,
        os.path.normpath(opt.save_path) + "_unlabeled_indices.hdf5")
    if os.path.isfile(path_indices_hdf5):
        print "This file already exists %s" % path_indices_hdf5
        quit(0)
    f = h5py.File(path_indices_hdf5, "w")
    unlabeled_indices_dataset = f.create_dataset('unlabeled_indices',
                                                 unlabeled_indices.shape,
                                                 maxshape=(None, ))
    unlabeled_indices_dataset[...] = unlabeled_indices
    split_dict = {
        "data": {
            "unlabeled_indices": (0, len(unlabeled_indices))
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                               dsdir, vectorfile, upto=-1, max_num_words=4):
    # word vocabulary is built from mention names across all splits
    word_to_idx, idx_to_word = build_word_vocab(
        trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, num=upto)
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        # average the embeddings of the mention words
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize, voc2idx)  # a matrix with size: 102 * dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)
    dsdir += '_tcwords.h5py'
    f = h5py.File(dsdir, mode='w')
    features = f.create_dataset('tcwords', words_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'tcwords': (0, nsamples_train)},
        'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-words cosine (tcwords) dataset finished. '
                'It is saved in: %s', dsdir)
def writeDataset(self, hdf5, hdf5_filepath, testImages, testLabels,
                 testNames):
    num_examples = len(testImages)
    # store images
    image_size = (256, 256)
    array_shape = (len(testLabels), 3) + image_size
    ds_images = hdf5.create_dataset("images", array_shape, dtype=np.uint8)
    ds_images.dims[0].label = "batch"
    ds_images.dims[1].label = "channel"
    ds_images.dims[2].label = "height"
    ds_images.dims[3].label = "width"
    # write images to the disk
    for i, filename in tqdm(enumerate(testImages), total=num_examples,
                            desc=hdf5_filepath):
        raw_image = cv2.imread(filename, cv2.IMREAD_COLOR)  # BGR image
        image = self.preprocess(raw_image, image_size)
        ds_images[i] = image
    # store the targets (class labels)
    targets = np.array(testLabels, np.int32).reshape(num_examples, 1)
    ds_targets = hdf5.create_dataset("targets", data=targets)
    ds_targets.dims[0].label = "batch"
    ds_targets.dims[1].label = "class_labels"
    # specify the splits
    uniqueLabels = list(np.unique(testLabels))
    middleLabel = uniqueLabels[int(np.round(len(uniqueLabels) / 2))]
    test_head = testLabels.index(middleLabel)
    split_train, split_test = (0, test_head), (test_head, num_examples)
    split_dict = dict(train=dict(images=split_train, targets=split_train),
                      test=dict(images=split_test, targets=split_test))
    hdf5.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    hdf5.flush()
    hdf5.close()
def build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                   vectorfile, ngram, max_num_ngrams=98, upto=-1):
    # ngram vocabulary is built from mention names across all splits
    ngram_to_idx, idx_to_word, name2ngrams = build_ngram_vocab(
        trnMentions + devMentions + tstMentions, ngram=ngram, MIN_FREQ=5)
    logger.info('ngram%d vocab size: %d', ngram, len(ngram_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_words = numpy.zeros(shape=(totals, max_num_ngrams), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        ngrams = name2ngrams[name]
        input_words[i] = get_ngram_seq(ngram_to_idx, ngrams,
                                       max_len=max_num_ngrams)
    print input_words.shape
    ngram_label = 'ngrams' + str(ngram)
    hdf5_file = dsdir + '_ngrams' + str(ngram) + '.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset(ngram_label, input_words.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(ngram_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(ngram_to_idx)
    features[...] = input_words
    features.dims[0].label = ngram_label
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {ngram_label: (0, nsamples_train)},
        'dev': {ngram_label: (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {ngram_label: (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building ngram%d dataset finished. It is saved in: %s',
                ngram, hdf5_file)
    if vectorfile is None or vectorfile == '':
        return
    logger.info('Now, writing ngram embeddings')
    embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                   vocab=ngram_to_idx,
                                                   num=upto)
    logger.info('size of embedding matrix to save is: (%d, %d)',
                embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_" + ngram_label + "_embeddings.h5py",
                   mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
f = h5py.File('../../data/dataset.hdf5', mode='w')
images = f.create_dataset('images', train_feature.shape, dtype='float32')
targets = f.create_dataset('targets', train_target.shape, dtype='float32')
images[...] = train_feature
targets[...] = train_target
split_dict = {
    'train': {
        'images': (0, train_feature.shape[0]),
        'targets': (0, train_target.shape[0])
    }
}
f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
f.flush()
f.close()

train_set = H5PYDataset('../../data/dataset.hdf5', which_sets=('train',))
#data_stream = DataStream(dataset=train_set, iteration_scheme=scheme)
#state = train_set.open()
scheme = ShuffledScheme(examples=train_set.num_examples, batch_size=4)
data_stream = DataStream(dataset=train_set, iteration_scheme=scheme)

for data in data_stream.get_epoch_iterator():
    print(data[0], data[1])

# The original snippet breaks off here; scale and shift are assumed to be
# precomputed standardization constants (e.g. from training statistics).
standardized_stream = ScaleAndShift(data_stream=data_stream,
                                    scale=scale, shift=shift,
                                    which_sources=('images',))
split_dict = {
    'train': {
        'letters': (0, nsamples_train),
        'w2v_entvec': (0, nsamples_train),
        'w2v_mention': (0, nsamples_train),
        'w2v_typecos': (0, nsamples_train),
        'nsl_features': (0, nsamples_train),
        'targets': (0, nsamples_train)
    },
    'dev': {
        'letters': (nsamples_train, nsamples_train + nsamples_dev),
        'w2v_entvec': (nsamples_train, nsamples_train + nsamples_dev),
        'w2v_mention': (nsamples_train, nsamples_train + nsamples_dev),
        'w2v_typecos': (nsamples_train, nsamples_train + nsamples_dev),
        'nsl_features': (nsamples_train, nsamples_train + nsamples_dev),
        'targets': (nsamples_train, nsamples_train + nsamples_dev)
    },
    'test': {
        'letters': (nsamples_train + nsamples_dev, nsamples),
        'w2v_entvec': (nsamples_train + nsamples_dev, nsamples),
        'w2v_mention': (nsamples_train + nsamples_dev, nsamples),
        'w2v_typecos': (nsamples_train + nsamples_dev, nsamples),
        'nsl_features': (nsamples_train + nsamples_dev, nsamples),
        'targets': (nsamples_train + nsamples_dev, nsamples)
    }
}
f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
f.flush()
f.close()
def write_h5py_dataset(nested_dataset_dict, sources_dim_labels, file_path,
                       dtype=config.floatX):
    """
    Creates a h5py file based dataset (writes this to disk).

    Parameters
    ----------
    nested_dataset_dict : { <subset>: { <source>: numpy.ndarray() } }, where
        <subset> : str
        <source> : str
    sources_dim_labels : { <source>: [str] }
        Length of each list equals the number of dimensions of the source.
    file_path : str

    Returns
    -------
    h5py_file_path
    """
    previous_end_idx = 0
    source_to_list_of_subsets = {}
    split_dict = {}
    for subset_name in nested_dataset_dict.keys():
        if subset_name not in split_dict:
            split_dict[subset_name] = {}
        n_samples = nested_dataset_dict[subset_name].values()[0].shape[0]
        for source_name in nested_dataset_dict[subset_name].keys():
            assert nested_dataset_dict[
                subset_name][source_name].shape[0] == n_samples
            split_dict[subset_name][source_name] = (
                previous_end_idx, previous_end_idx + n_samples)
            if source_name not in source_to_list_of_subsets:
                source_to_list_of_subsets[source_name] = [
                    nested_dataset_dict[subset_name][source_name]]
            else:
                source_to_list_of_subsets[source_name].append(
                    nested_dataset_dict[subset_name][source_name])
        previous_end_idx += n_samples
    concatenated_subsets = {}
    for source_name in source_to_list_of_subsets.keys():
        concatenated_subsets[source_name] = np.concatenate(
            source_to_list_of_subsets[source_name], axis=0)

    def write_one_source(source_name, source_data, source_dim_labels,
                         h5py_file):
        """
        Writes the content for one source to the passed h5py File.

        Parameters
        ----------
        source_name : str
        source_data : ndarray(shape=S)
        source_dim_labels : [str]
            len(source_dim_labels) = len(S)
        h5py_file : h5py.File
        """
        source_handle = h5py_file.create_dataset(
            source_name, source_data.shape, dtype=dtype)
        source_handle[...] = source_data
        for dim, label in zip(source_handle.dims, source_dim_labels):
            dim.label = label

    with h5py.File(file_path, mode='w') as f:
        for source_name in concatenated_subsets.keys():
            write_one_source(
                source_name=source_name,
                source_data=concatenated_subsets[source_name],
                source_dim_labels=sources_dim_labels[source_name],
                h5py_file=f)
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        # No explicit close() needed: the with-statement closes the file.
        f.flush()
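# Usage sketch for write_h5py_dataset (shapes and output path are
# illustrative): every subset provides the same sources, and each source
# gets one axis label per dimension.
import numpy as np

write_h5py_dataset(
    nested_dataset_dict={
        'train': {'features': np.zeros((100, 10)),
                  'targets': np.zeros((100, 1))},
        'test': {'features': np.zeros((20, 10)),
                 'targets': np.zeros((20, 1))},
    },
    sources_dim_labels={'features': ['batch', 'feature'],
                        'targets': ['batch', 'index']},
    file_path='example.hdf5',
)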
logger.warning("missing %s" % identifier) target = int(classmap[os.path.dirname(identifier)]) lists[which_set].setdefault("targets", []).append(target) for i, source in enumerate(sources): compressed = np.fromstring( # level 1 compression is fast and takes out ~90% zlib.compress(cPickle.dumps(data[i]), 1), dtype=np.uint8) #assert np.array_equal(cPickle.loads(zlib.decompress(compressed)), data[i]) print source, data[i].shape, len(compressed) lists[which_set].setdefault(source, []).append(compressed) k += 1 print "%i/%i" % (k, n) split_dict = OrderedDict() b = 0 for which_set, set_by_source in lists.items(): a = b b += len(set_by_source["targets"]) h5file["targets"][a:b] = set_by_source["targets"] for source in sources: h5file[source][a:b] = set_by_source[source] split_dict[which_set] = OrderedDict([ (source, (a, b)) for source in ["targets"] + sources ]) h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict) h5file.flush() h5file.close()
def main():
    net = vaegan.VAEGAN()
    network_saver = saver.NetworkSaver('vaegan/models/', net=net)
    network_saver.load()

    img_data = np.load('img_data.npy', mmap_mode='r')
    data_out = np.load('data_out.npy', mmap_mode='r')
    data_in = np.load('data_in.npy', mmap_mode='r')
    data_in = np.array(data_in)
    data_out = np.array(data_out)
    train_data_size = data_in.shape[0] * train_size
    print type(img_data), img_data.shape
    print type(data_in), data_in.shape
    print type(data_out), data_out.shape

    data_in, in_size = getConvFeatures(net, data_in, img_data)
    out_size = len(output_columns)
    max_prediction = max(future_predictions) + 1
    if len(data_in) % seq_length > 0:
        data_in = data_in[:len(data_in) - len(data_in) % seq_length
                          + max_prediction]
    else:
        data_in = data_in[:len(data_in) - seq_length + max_prediction]
    nsamples = (len(data_in) / seq_redundancy)

    print 'Saving data to disc...'
    inputs = np.memmap('inputs.npy', dtype=theano.config.floatX, mode='w+',
                       shape=(nsamples, seq_length, in_size))
    outputs = np.memmap('outputs.npy', dtype=theano.config.floatX, mode='w+',
                        shape=(nsamples, seq_length,
                               len(future_predictions) * out_size))
    for i, p in enumerate(xrange(0, len(data_in) - max_prediction - seq_length,
                                 seq_redundancy)):
        inputs[i] = np.array([d for d in data_in[p:p + seq_length]])
        for j in xrange(len(future_predictions)):
            outputs[i, :, j * out_size:(j + 1) * out_size] = np.array(
                [d for d in data_out[p + future_predictions[j]:
                                     p + seq_length + future_predictions[j]]])
    nsamples = len(inputs)
    nsamples_train = train_data_size // seq_length
    print np.isnan(np.sum(inputs))
    print np.isnan(np.sum(outputs))

    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('features', inputs.shape,
                                dtype=theano.config.floatX)
    targets = f.create_dataset('targets', outputs.shape,
                               dtype=theano.config.floatX)
    features[...] = inputs
    targets[...] = outputs
    features.dims[0].label = 'batch'
    features.dims[1].label = 'sequence'
    features.dims[2].label = 'features'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'sequence'
    targets.dims[2].label = 'outputs'
    split_dict = {
        'train': {'features': (0, nsamples_train),
                  'targets': (0, nsamples_train)},
        'test': {'features': (nsamples_train, nsamples),
                 'targets': (nsamples_train, nsamples)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    print nsamples_train, nsamples, train_data_size, seq_length
    print 'inputs shape:', inputs.shape
    print 'outputs shape:', outputs.shape
    print 'image inputs shape:', img_data.shape
    os.remove(inputs.filename)
    os.remove(outputs.filename)
    print 'Files saved on disc for Blocks!'
def convert_svhn_format_1(directory, output_directory,
                          output_filename='svhn_format_1.hdf5'):
    """Converts the SVHN dataset (format 1) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}.tar.gz`, which are accessible through the official
    website [SVHNSITE].

    .. [SVHNSITE] http://ufldl.stanford.edu/housenumbers/

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_1.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    # Create the temporary directory before entering the try block so the
    # cleanup code in the finally clause can always refer to TMPDIR.
    TMPDIR = tempfile.mkdtemp()
    try:
        output_path = os.path.join(output_directory, output_filename)
        h5file = h5py.File(output_path, mode='w')

        # Every image has three channels (RGB) and variable height and
        # width. It features a variable number of bounding boxes that
        # identify the location and label of digits. The bounding box
        # location is specified using the x and y coordinates of its top
        # left corner along with its width and height.
        BoundingBoxes = namedtuple(
            'BoundingBoxes', ['labels', 'heights', 'widths', 'lefts', 'tops'])
        sources = ('features',) + tuple(
            'bbox_{}'.format(field) for field in BoundingBoxes._fields)
        source_dtypes = dict(
            [(source, 'uint8') for source in sources[:2]] +
            [(source, 'uint16') for source in sources[2:]])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'bbox_labels': ('bounding_box', 'index'),
            'bbox_heights': ('bounding_box', 'height'),
            'bbox_widths': ('bounding_box', 'width'),
            'bbox_lefts': ('bounding_box', 'x'),
            'bbox_tops': ('bounding_box', 'y')}

        # The dataset is split into three sets: the training set, the test
        # set and an extra set of examples that are somewhat less difficult
        # but can be used as extra training data. These sets are stored
        # separately as 'train.tar.gz', 'test.tar.gz' and 'extra.tar.gz'.
        # Each file contains a directory named after the split it stores.
        # The examples are stored in that directory as PNG images. The
        # directory also contains a 'digitStruct.mat' file with all the
        # bounding box and label information.
        splits = ('train', 'test', 'extra')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)
        digit_struct_paths = dict(
            [(split, os.path.join(TMPDIR, split, 'digitStruct.mat'))
             for split in splits])

        # We first extract the data files in a temporary directory. While
        # doing that, we also count the number of examples for each split.
        # Files are extracted individually, which allows to display a
        # progress bar. Since the splits will be concatenated in the HDF5
        # file, we also compute the start and stop intervals of each split
        # within the concatenated array.
        def extract_tar(split):
            with tarfile.open(file_paths[split], 'r:gz') as f:
                members = f.getmembers()
                num_examples = sum(1 for m in members if '.png' in m.name)
                progress_bar_context = progress_bar(
                    name='{} file'.format(split), maxval=len(members),
                    prefix='Extracting')
                with progress_bar_context as bar:
                    for i, member in enumerate(members):
                        f.extract(member, path=TMPDIR)
                        bar.update(i)
            return num_examples

        examples_per_split = OrderedDict(
            [(split, extract_tar(split)) for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(examples_per_split.values()))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that
        # will be parsed into the split array required by the H5PYDataset
        # interface. The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([
            (split, OrderedDict([(s, split_intervals[split])
                                 for s in sources]))
            for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating datasets
        # to store data sources and datasets to store auxiliary information
        # (namely the shapes for variable-length axes, and labels to
        # indicate what these variable-length axes represent).
        def make_vlen_dataset(source):
            # Create a variable-length 1D dataset
            dtype = h5py.special_dtype(
                vlen=numpy.dtype(source_dtypes[source]))
            dataset = h5file.create_dataset(
                source, (num_examples,), dtype=dtype)
            # Create a dataset to store variable-length shapes.
            axis_labels = source_axis_labels[source]
            dataset_shapes = h5file.create_dataset(
                '{}_shapes'.format(source), (num_examples, len(axis_labels)),
                dtype='uint16')
            # Create a dataset to store labels for variable-length axes.
            dataset_vlen_axis_labels = h5file.create_dataset(
                '{}_vlen_axis_labels'.format(source), (len(axis_labels),),
                dtype='S{}'.format(
                    numpy.max([len(label) for label in axis_labels])))
            # Fill variable-length axis labels
            dataset_vlen_axis_labels[...] = [
                label.encode('utf8') for label in axis_labels]
            # Attach auxiliary datasets as dimension scales of the
            # variable-length 1D dataset. This is in accordance with the
            # H5PYDataset interface.
            dataset.dims.create_scale(dataset_shapes, 'shapes')
            dataset.dims[0].attach_scale(dataset_shapes)
            dataset.dims.create_scale(dataset_vlen_axis_labels,
                                      'shape_labels')
            dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'

        for source in sources:
            make_vlen_dataset(source)

        # The "fun" part begins: we extract the bounding box and label
        # information contained in 'digitStruct.mat'. This is a version 7.3
        # Matlab file, which uses HDF5 under the hood, albeit with a very
        # convoluted layout.
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of the
                        # HDF5 file. It contains two datasets: 'bbox' and
                        # 'name'. The 'name' dataset isn't of interest to
                        # us, as it stores file names and there's already a
                        # one-to-one mapping between row numbers and image
                        # names (e.g. row 0 corresponds to '1.png', row 1
                        # corresponds to '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box and
                        # label information we're after. It has as many rows
                        # as there are images, and one column. Elements of
                        # the 'bbox' dataset are object references that
                        # point to (yet another) group that contains the
                        # information for the corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that group:
                        # 'label', 'height', 'width', 'left' and 'top'. Each
                        # of those datasets has as many rows as there are
                        # bounding boxes in the corresponding image, and one
                        # column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]
                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict(
                            [(name, get_dataset(name)) for name in names])

                        # If there is only one bounding box, the information
                        # is stored directly in the datasets. If there are
                        # multiple bounding boxes, elements of those
                        # datasets are object references pointing to 1x1
                        # datasets that store the information (fortunately,
                        # it's the last hop we need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [int(main_group[reference][0, 0])
                                        for reference in dataset]
                            else:
                                return [int(dataset[0])]
                        # Names are pluralized in the BoundingBox named
                        # tuple.
                        kwargs = dict(
                            [(name + 's', get_elements(dataset))
                             for name, dataset in iteritems(datasets)])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes

        split_boxes = dict([(split, get_boxes(split)) for split in splits])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            for image_number in range(examples_per_split[split]):
                image_path = os.path.join(
                    TMPDIR, split, '{}.png'.format(image_number + 1))
                image = numpy.asarray(
                    Image.open(image_path)).transpose(2, 0, 1)
                bounding_boxes = split_boxes[split][image_number]
                num_boxes = len(bounding_boxes.labels)
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image.flatten()
                h5file['features'].dims[0]['shapes'][index] = image.shape
                for field in BoundingBoxes._fields:
                    name = 'bbox_{}'.format(field)
                    h5file[name][index] = getattr(bounding_boxes, field)
                    h5file[name].dims[0]['shapes'][index] = [num_boxes, 1]

                # Replace label '10' with '0'.
                labels = h5file['bbox_labels'][index]
                labels[labels == 10] = 0
                h5file['bbox_labels'][index] = labels

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)

        with progress_bar('SVHN format 1', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        if os.path.isdir(TMPDIR):
            shutil.rmtree(TMPDIR)
    h5file.flush()
    h5file.close()

    return (output_path,)
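# A short consumption sketch for the file converted above (the default
# output_filename from convert_svhn_format_1 is assumed). For the
# variable-length 'features' source, the 'shapes' dimension scale attached
# above lets H5PYDataset return each example with its own
# (channel, height, width) shape:
def inspect_svhn_train(path='svhn_format_1.hdf5'):
    from fuel.datasets.hdf5 import H5PYDataset
    train_set = H5PYDataset(path, which_sets=('train',),
                            sources=('features', 'bbox_labels'))
    handle = train_set.open()
    features, bbox_labels = train_set.get_data(handle, slice(0, 1))
    train_set.close(handle)
    return features[0].shape, bbox_labels[0]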
data_to_use = numpy.asarray(data_to_use)
input_array = data_to_use[:-frame_length].reshape(
    num_examples, seq_length, frame_length)
# Targets are the same signal shifted one frame into the future.
target_array = data_to_use[frame_length:].reshape(
    num_examples, seq_length, frame_length)
print input_array.shape
print target_array.shape

# Make H5PY file
print "\nMaking Fuel-formatted HDF5 file..."
f = h5py.File(hdf5_file, mode="w")
inputs = f.create_dataset("inputs", input_array.shape, dtype="float64")
targets = f.create_dataset("targets", target_array.shape, dtype="float64")
inputs[...] = input_array
targets[...] = target_array
inputs.dims[0].label = "batch"
inputs.dims[1].label = "sequence"
targets.dims[0].label = "batch"
targets.dims[1].label = "sequence"

# Split into train and dev sets
print "doing train:dev split (at " + str(train_samples) + ")"
num_train_examples = train_samples // example_length
split_dict = {
    "train": {"inputs": (0, num_train_examples),
              "targets": (0, num_train_examples)},
    "dev": {"inputs": (num_train_examples, num_examples),
            "targets": (num_train_examples, num_examples)},
}
f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
f.flush()
f.close()
print "file should be made"
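# A sketch of streaming minibatches from the file written above (assumes
# Fuel is installed; the batch size is an illustrative choice):
def stream_train_batches(path, batch_size=32):
    from fuel.datasets.hdf5 import H5PYDataset
    from fuel.schemes import SequentialScheme
    from fuel.streams import DataStream
    train_set = H5PYDataset(path, which_sets=("train",),
                            sources=("inputs", "targets"))
    scheme = SequentialScheme(train_set.num_examples, batch_size)
    stream = DataStream(train_set, iteration_scheme=scheme)
    for inputs_batch, targets_batch in stream.get_epoch_iterator():
        # Each batch is a (batch, sequence, frame) float64 array.
        pass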