def main():
    """Convert ``kaggle/train.csv`` into ``kaggle-mnist.hdf5``.

    The rows returned by ``load_csv`` are written to a ``pixels`` dataset of
    shape ``(len(train_data), INPUT_SIZE)`` (float32) and a ``labels``
    dataset of shape ``(len(train_data), 1)`` (uint8).  A two-row ``split``
    attribute is attached to the file root, declaring a single ``train``
    split that spans every row of both datasets.
    """
    print('Loading data...')
    train_data, train_labels = load_csv('kaggle/train.csv', True)
    n_examples = len(train_data)

    print('Writing file...')
    # The context manager guarantees the file is closed even when one of the
    # writes below raises.
    with h5py.File('kaggle-mnist.hdf5', mode='w') as f:
        pixel_features = f.create_dataset(
            'pixels', (n_examples, INPUT_SIZE), dtype='float32')
        labels = f.create_dataset('labels', (n_examples, 1), dtype='uint8')
        for index in range(n_examples):
            pixel_features[index] = train_data[index]
            labels[index] = train_labels[index]

        # One row per (split, source) pair.  'S' is the bytes-string code
        # (the legacy 'a' alias is deprecated), scalar fields are declared
        # without the deprecated ``(type, 1)`` shape, and ``np.bool_`` is used
        # because the bare ``np.bool`` alias was removed in NumPy 1.24.
        split_array = np.empty(2, dtype=np.dtype([
            ('split', 'S', 5),
            ('source', 'S', 15),
            ('start', np.int64),
            ('stop', np.int64),
            ('indices', h5py.special_dtype(ref=h5py.Reference)),
            ('available', np.bool_),
            ('comment', 'S', 1),
        ]))
        split_array['split'] = 'train'.encode('utf-8')
        split_array['source'][0] = 'pixels'.encode('utf-8')
        split_array['source'][1] = 'labels'.encode('utf-8')
        split_array['start'] = 0
        split_array['stop'] = n_examples
        # A null reference: no subset indices are stored for this split.
        split_array['indices'] = h5py.Reference()
        split_array['available'] = True
        split_array['comment'] = '.'.encode('utf8')
        f.attrs['split'] = split_array
        f.flush()
def set_shot_globals(h5file, shot_globals):
    """Write the shot globals into an already open h5 file.

    Each ``name: value`` pair of ``shot_globals`` is stored as an attribute
    on ``h5file['globals']``.  A value of ``None`` is stored as a null HDF5
    object reference.

    Raises:
        ValueError: if storing an attribute fails for any reason.  The
            message names the offending global and includes the class and
            text of the original error.
    """
    for name, value in shot_globals.items():
        if value is None:
            # Store it as a null object reference:
            value = h5py.Reference()
        try:
            h5file['globals'].attrs[name] = value
        except Exception as e:
            # Exceptions only carry a ``.message`` attribute on Python 2;
            # reading it unconditionally raises AttributeError on Python 3
            # and hides the real failure.
            detail = e.message if hasattr(e, 'message') else str(e)
            message = (
                'Global %s cannot be saved as an hdf5 attribute. ' % name +
                'Globals can only have relatively simple datatypes, with no nested structures. ' +
                'Original error was:\n' +
                '%s: %s' % (e.__class__.__name__, detail))
            raise ValueError(message)
def make_single_run_file(filename, sequenceglobals, runglobals, sequence_id, run_no, n_runs):
    """Create the HDF5 file for a single run.

    The file is opened in ``'w'`` mode, so an existing file at ``filename``
    is replaced.

    Args:
        filename: path of the run file to create.
        sequenceglobals: nested dict ``{group name: {global name:
            (value, units, expansion)}}``, or ``None`` to write no groups.
            Each group becomes a subgroup of ``globals`` holding the values
            as attributes, with ``units`` and ``expansion`` subgroups
            holding the matching attributes.
        runglobals: dict of this run's globals, stored as attributes on the
            ``globals`` group.  ``None`` values are stored as null object
            references.
        sequence_id: stored in the ``sequence_id`` file attribute.
        run_no: stored in the ``run number`` file attribute.
        n_runs: stored in the ``n_runs`` file attribute.

    Raises:
        ValueError: if a run global cannot be stored as an attribute.  The
            message includes the class and text of the original error.
    """
    with h5py.File(filename, 'w') as f:
        f.attrs['sequence_id'] = sequence_id
        f.attrs['run number'] = run_no
        f.attrs['n_runs'] = n_runs
        f.create_group('globals')
        if sequenceglobals is not None:
            for groupname, groupvars in sequenceglobals.items():
                group = f['globals'].create_group(groupname)
                unitsgroup = group.create_group('units')
                expansiongroup = group.create_group('expansion')
                for name, (value, units, expansion) in groupvars.items():
                    group.attrs[name] = value
                    unitsgroup.attrs[name] = units
                    expansiongroup.attrs[name] = expansion
        for name, value in runglobals.items():
            if value is None:
                # Store it as a null object reference:
                value = h5py.Reference()
            try:
                f['globals'].attrs[name] = value
            except Exception as e:
                # Exceptions only carry a ``.message`` attribute on Python 2;
                # reading it unconditionally raises AttributeError on
                # Python 3 and hides the real failure.
                detail = e.message if hasattr(e, 'message') else str(e)
                message = (
                    'Global %s cannot be saved as an hdf5 attribute. ' % name +
                    'Globals can only have relatively simple datatypes, with no nested structures. ' +
                    'Original error was:\n' +
                    '%s: %s' % (e.__class__.__name__, detail))
                raise ValueError(message)
def make_single_run_file(filename, sequenceglobals, runglobals, sequence_attrs, run_no, n_runs):
    """Create the HDF5 file for a single run.

    The parent directory of ``filename`` is created via ``mkdir_p`` and the
    file is opened in ``'w'`` mode.

    ``sequence_attrs`` is a dict whose entries are copied onto the file's
    root attributes; ``run_no`` and ``n_runs`` are stored as the
    ``run number`` and ``n_runs`` attributes.  ``sequenceglobals`` is either
    ``None`` or a nested dict ``{group name: {global name: (value, units,
    expansion)}}``; each group becomes a subgroup of ``globals`` with
    ``units`` and ``expansion`` subgroups.  ``runglobals`` is a dict of this
    run's globals, stored as attributes on the ``globals`` group, with
    ``None`` stored as a null object reference.

    A ValueError naming the offending global is raised when a run global
    cannot be stored as an attribute.
    """
    mkdir_p(os.path.dirname(filename))
    with h5py.File(filename, 'w') as run_file:
        root_attrs = run_file.attrs
        root_attrs.update(sequence_attrs)
        root_attrs['run number'] = run_no
        root_attrs['n_runs'] = n_runs
        run_file.create_group('globals')

        # Per-group globals, each with parallel 'units'/'expansion' groups.
        group_items = () if sequenceglobals is None else sequenceglobals.items()
        for group_name, group_vars in group_items:
            values_group = run_file['globals'].create_group(group_name)
            units_group = values_group.create_group('units')
            expansion_group = values_group.create_group('expansion')
            for var_name, details in group_vars.items():
                var_value, var_units, var_expansion = details
                values_group.attrs[var_name] = var_value
                units_group.attrs[var_name] = var_units
                expansion_group.attrs[var_name] = var_expansion

        # The evaluated globals of this particular run.
        for name in runglobals:
            stored = runglobals[name]
            if stored is None:
                # HDF5 attributes cannot hold None; use a null reference.
                stored = h5py.Reference()
            try:
                run_file['globals'].attrs[name] = stored
            except Exception as e:
                raise ValueError(''.join([
                    'Global %s cannot be saved as an hdf5 attribute. ' % name,
                    'Globals can only have relatively simple datatypes, with no nested structures. ',
                    'Original error was:\n',
                    '%s: %s' % (e.__class__.__name__, e.message if PY2 else str(e)),
                ]))
def copy_element(val, src_dt, tgt_dt, ctx):
    """Convert one element from the source file for storage in the target.

    Args:
        val: the element value read from the source.
        src_dt: numpy dtype describing ``val`` in the source.
        tgt_dt: numpy dtype the element will have in the target.
        ctx: dict providing at least ``"fin"`` (source file), ``"fout"``
            (target file) and ``"verbose"``.

    Returns:
        - compound dtypes: a tuple with each field converted recursively;
        - object references: a reference to the object with the same path
          in ``fout`` (a string when ``fout`` is not an h5py file), or a
          null reference when ``val`` is null or the object has no path;
        - region references: the placeholder string ``"tbd"``;
        - vlen dtypes: a new array of dtype ``tgt_dt``;
        - anything else: ``val`` unchanged.
        ``None`` is returned when looking up a reference in ``fin`` raises
        AttributeError.

    Raises:
        TypeError: if the source dtype is a reference or vlen type but the
            target dtype is not, if the reference type is not recognised,
            or if a vlen element is not an ndarray.
    """
    logging.debug("copy_element, val: " + str(val)
                  + " val type: " + str(type(val))
                  + " src_dt: " + dump_dtype(src_dt)
                  + " tgt_dt: " + dump_dtype(tgt_dt))

    fin = ctx["fin"]
    fout = ctx["fout"]
    out = None

    if len(src_dt) > 0:
        # Compound type.  ``names`` is the ordered field sequence, so it
        # stays in step with positional indexing into ``val``; the
        # ``fields`` mapping is only used for lookup by name.
        out_fields = []
        for i, name in enumerate(src_dt.names):
            field_src_dt = src_dt.fields[name][0]
            field_tgt_dt = tgt_dt.fields[name][0]
            out_fields.append(
                copy_element(val[i], field_src_dt, field_tgt_dt, ctx))
        out = tuple(out_fields)
    elif src_dt.metadata and 'ref' in src_dt.metadata:
        if not tgt_dt.metadata or 'ref' not in tgt_dt.metadata:
            raise TypeError(
                "Expected tgt dtype to be ref, but got: {}".format(tgt_dt))
        ref = tgt_dt.metadata['ref']
        if is_reference(ref):
            # initialize out to null ref
            if is_h5py(fout):
                out = h5py.Reference()  # null h5py ref
            else:
                out = ''  # h5pyd refs are strings
            # Only dereference a non-null value.  The previous test was on
            # ``ref`` (the reference *class*), which is always true, so a
            # null reference was dereferenced instead of copied as null.
            if val:
                try:
                    fin_obj = fin[val]
                except AttributeError as ae:
                    msg = "Unable to get obj for ref value: {}".format(ae)
                    logging.error(msg)
                    print(msg)
                    return None
                # TBD - for hsget, the name property is not getting set
                h5path = fin_obj.name
                if not h5path:
                    msg = "No path found for ref object"
                    logging.warning(msg)
                    if ctx["verbose"]:
                        print(msg)
                else:
                    fout_obj = fout[h5path]
                    if is_h5py(fout):
                        out = fout_obj.ref
                    else:
                        # convert to string for JSON serialization
                        out = str(fout_obj.ref)
        elif is_regionreference(ref):
            # Region references are not translated yet.
            out = "tbd"
        else:
            raise TypeError("Unexpected ref type: {}".format(type(ref)))
    elif src_dt.metadata and 'vlen' in src_dt.metadata:
        logging.debug("copy_element, got vlen element, dt: {}".format(
            src_dt.metadata["vlen"]))
        if not isinstance(val, np.ndarray):
            raise TypeError(
                "Expecting ndarray or vlen element, but got: {}".format(
                    type(val)))
        if not tgt_dt.metadata or 'vlen' not in tgt_dt.metadata:
            raise TypeError(
                "Expected tgt dtype to be vlen, but got: {}".format(tgt_dt))
        src_vlen_dt = src_dt.metadata["vlen"]
        tgt_vlen_dt = tgt_dt.metadata["vlen"]
        if has_reference(src_vlen_dt):
            # References must be translated one element at a time.
            if len(val.shape) == 0:
                # scalar array
                e = val[()]
                v = copy_element(e, src_vlen_dt, tgt_vlen_dt, ctx)
                out = np.array(v, dtype=tgt_dt)
            else:
                out = np.zeros(val.shape, dtype=tgt_dt)
                for i in range(len(out)):
                    e = val[i]
                    out[i] = copy_element(e, src_vlen_dt, tgt_vlen_dt, ctx)
        else:
            # can just directly copy the array
            out = np.zeros(val.shape, dtype=tgt_dt)
            out[...] = val[...]
    else:
        out = val  # can just copy as is
    return out
def create_split_array(split_dict):
    """Create a valid array for the `split` attribute of the root node.

    Parameters
    ----------
    split_dict : dict
        Maps split names to dict. Those dict map source names to
        tuples. Those tuples contain two, three or four elements:
        the start index, the stop index, (optionally) subset
        indices and (optionally) a comment.  If a particular
        split/source combination isn't present in the split dict,
        it's considered as unavailable and the `available` element
        will be set to `False` in its split array entry.

    Returns
    -------
    split_array : numpy.ndarray
        Structured array with one row per (split, source) pair and the
        fields `split`, `source`, `start`, `stop`, `indices`,
        `available` and `comment`.

    """
    # Determine maximum split, source and string lengths.  Names and
    # comments are stored UTF-8 encoded, so the field widths are measured
    # in encoded bytes; measuring characters would truncate non-ASCII text.
    split_len = max(len(split.encode('utf8')) for split in split_dict)
    sources = set()
    comment_len = 1
    for split in split_dict.values():
        sources |= set(split.keys())
        for val in split.values():
            if len(val) == 4:
                comment_len = max(comment_len, len(val[-1].encode('utf8')))
    sources = sorted(sources)
    source_len = max(len(source.encode('utf8')) for source in sources)

    # Instantiate empty split array.  'S' is the bytes-string code (the
    # legacy 'a' alias is deprecated), scalar fields are declared without
    # the deprecated ``(type, 1)`` shape, and ``numpy.bool_`` is used
    # because the bare ``numpy.bool`` alias was removed in NumPy 1.24.
    split_array = numpy.empty(
        len(split_dict) * len(sources),
        dtype=numpy.dtype([
            ('split', 'S', split_len),
            ('source', 'S', source_len),
            ('start', numpy.int64),
            ('stop', numpy.int64),
            ('indices', h5py.special_dtype(ref=h5py.Reference)),
            ('available', numpy.bool_),
            ('comment', 'S', comment_len)]))

    # Fill split array
    for i, (split, source) in enumerate(product(split_dict, sources)):
        if source in split_dict[split]:
            entry = split_dict[split][source]
            start, stop = entry[:2]
            available = True
            indices = h5py.Reference()
            # Workaround for bug when pickling an empty string
            comment = '.'
            if len(entry) > 2:
                indices = entry[2]
            if len(entry) > 3:
                comment = entry[3]
                if not comment:
                    comment = '.'
        else:
            (start, stop, indices, available, comment) = (
                0, 0, h5py.Reference(), False, '.')
        # Workaround for H5PY being unable to store unicode type
        split_array[i]['split'] = split.encode('utf8')
        split_array[i]['source'] = source.encode('utf8')
        split_array[i]['start'] = start
        split_array[i]['stop'] = stop
        split_array[i]['indices'] = indices
        split_array[i]['available'] = available
        split_array[i]['comment'] = comment.encode('utf8')

    return split_array
('start', np.int64, 1), ('stop', np.int64, 1), ('indices', h5py.special_dtype(ref=h5py.Reference)), ('available', np.bool, 1), ('comment', 'a', 1) ])) split_array[0:3]['split'] = 'train'.encode('utf8') split_array[3:6]['split'] = 'valid'.encode('utf8') split_array[6:9]['split'] = 'test'.encode('utf8') split_array[0:9:3]['source'] = 'features'.encode('utf8') split_array[1:9:3]['source'] = 'locs'.encode('utf8') split_array[2:9:3]['source'] = 'targets'.encode('utf8') split_array[0:3]['start'] = 0 split_array[0:3]['stop'] = 50000 split_array[3:6]['start'] = 50000 split_array[3:6]['stop'] = 60000 split_array[6:9]['start'] = 60000 split_array[6:9]['stop'] = 70000 split_array[:]['indices'] = h5py.Reference() split_array[:]['available'] = True split_array[:]['comment'] = '.'.encode('utf8') f.attrs['split'] = split_array f.flush() f.close() from fuel.datasets import H5PYDataset train_set = H5PYDataset('/Tmp/pezeshki/dataset.hdf5', which_sets=('train', ))
def test_exc(self):
    """ (Refs) Dereferencing a null reference raises ValueError """
    null_ref = h5py.Reference()
    # Indexing the file with the null reference must fail.
    with self.assertRaises(ValueError):
        self.f[null_ref]
def test_repr(self):
    """ (Refs) __repr__ works on live and dead references """
    ref = h5py.Reference()
    # repr() always returns the native ``str`` type on both Python 2 and
    # Python 3, whereas ``basestring`` only exists on Python 2.
    self.assertIsInstance(repr(ref), str)
    self.assertIsInstance(repr(self.f.ref), str)
def test_bool(self):
    """ (Refs) Truth value tracks validity """
    ref = h5py.Reference()
    # assertFalse/assertTrue replace the deprecated ``assert_`` alias,
    # which was removed from unittest in Python 3.12.
    self.assertFalse(ref)
    self.assertTrue(self.f.ref)
def make_dataset(self, trainset, testinput, test_labels, valid_rate=0.01):
    """Build the rating/mask matrices and write them to ``self.data_path``.

    Args:
        trainset: sequence of ``(user, item, rating)`` triples.
        testinput: sequence of ``(user, item)`` pairs.
        test_labels: ratings matching ``testinput``, in the same order.
        valid_rate: fraction of ``trainset`` held out for validation.

    Side effects:
        - sets ``self.user_num`` and ``self.item_num`` to the number of
          distinct users/items seen in ``trainset`` and ``testinput``;
        - writes an HDF5 file at ``self.data_path`` with four int8 datasets
          (``input_ratings``, ``input_masks``, ``output_ratings``,
          ``output_masks``) of shape ``(item_num * 3, user_num)``.  The
          train, valid and test blocks are stacked along the first axis
          and described by a ``split`` attribute on the file root;
        - writes a ``metadata`` text file under
          ``self.model_manager.path_name`` holding the user/item counts.

    The user and item values are used directly as column and row indices
    of ``(item_num, user_num)`` matrices.
    """
    model_manager = self.model_manager

    train_user, train_item, _ = list(zip(*trainset))
    test_user, test_item = list(zip(*testinput))
    user_num = len(set(train_user + test_user))
    item_num = len(set(train_item + test_item))
    self.user_num = user_num
    self.item_num = item_num
    shape = (item_num, user_num)

    # round() returns a float on Python 2; an integral float would not be
    # read as an absolute sample count, so convert explicitly.
    n_valid = int(round(len(trainset) * valid_rate))
    train, valid = train_test_split(trainset, test_size=n_valid)
    train_user, train_item, train_rate = list(zip(*train))
    valid_user, valid_item, valid_ratings = list(zip(*valid))

    # Train block: known ratings as input, all-zero output.
    train_input_ratings = np.asarray(
        coo_matrix((train_rate, (train_item, train_user)),
                   shape=shape, dtype='int8').todense())
    train_output_ratings = np.zeros(shape, dtype='int8')
    train_input_masks = train_input_ratings.astype(bool).astype('int8')
    train_output_masks = np.zeros(shape, dtype='int8')

    # Valid block: the train ratings are the input, the held-out ratings
    # are the output.
    valid_input_ratings = train_input_ratings.copy()
    valid_output_ratings = np.asarray(
        coo_matrix((valid_ratings, (valid_item, valid_user)),
                   shape=shape, dtype='int8').todense())
    valid_input_masks = train_input_masks.copy()
    valid_output_masks = valid_output_ratings.astype(bool).astype('int8')

    # Test block: train + valid ratings are the input, test labels the
    # output.
    test_input_ratings = train_input_ratings + valid_output_ratings
    test_output_ratings = np.asarray(
        coo_matrix((test_labels, (test_item, test_user)),
                   shape=shape, dtype='int8').todense())
    test_input_masks = train_input_masks + valid_output_masks
    test_output_masks = test_output_ratings.astype(bool).astype('int8')

    stacked = (
        ('input_ratings', np.vstack(
            (train_input_ratings, valid_input_ratings,
             test_input_ratings))),
        ('input_masks', np.vstack(
            (train_input_masks, valid_input_masks, test_input_masks))),
        ('output_ratings', np.vstack(
            (train_output_ratings, valid_output_ratings,
             test_output_ratings))),
        ('output_masks', np.vstack(
            (train_output_masks, valid_output_masks, test_output_masks))),
    )

    # The context manager closes the file even if a write fails.
    with h5py.File(self.data_path, 'w') as f:
        for name, data in stacked:
            dataset = f.create_dataset(
                name, shape=(item_num * 3, user_num), dtype='int8',
                data=data)
            dataset.dims[0].label = 'batch'
            dataset.dims[1].label = 'movies'

        # 3 splits x 4 sources.  'S' is the bytes-string code (the legacy
        # 'a' alias is deprecated), scalar fields omit the deprecated
        # ``(type, 1)`` shape, and ``np.bool_`` replaces ``np.bool``, which
        # was removed in NumPy 1.24.
        split_array = np.empty(12, dtype=([
            ('split', 'S', 5),
            ('source', 'S', 14),
            ('start', np.int64),
            ('stop', np.int64),
            ('indices', h5py.special_dtype(ref=h5py.Reference)),
            ('available', np.bool_),
            ('comment', 'S', 1),
        ]))
        split_array[0:4]['split'] = 'train'.encode('utf8')
        split_array[4:8]['split'] = 'valid'.encode('utf8')
        split_array[8:12]['split'] = 'test'.encode('utf8')
        split_array[0:12:4]['source'] = 'input_ratings'.encode('utf8')
        split_array[1:12:4]['source'] = 'input_masks'.encode('utf8')
        split_array[2:12:4]['source'] = 'output_ratings'.encode('utf8')
        split_array[3:12:4]['source'] = 'output_masks'.encode('utf8')
        split_array[0:4]['start'] = 0
        split_array[0:4]['stop'] = item_num
        split_array[4:8]['start'] = item_num
        split_array[4:8]['stop'] = item_num * 2
        split_array[8:12]['start'] = item_num * 2
        split_array[8:12]['stop'] = item_num * 3
        split_array[:]['indices'] = h5py.Reference()
        split_array[:]['available'] = True
        split_array[:]['comment'] = '.'.encode('utf8')
        f.attrs['split'] = split_array
        f.flush()

    metadata_path = os.path.join(model_manager.path_name, 'metadata')
    with open(metadata_path, 'w') as metadata_file:
        metadata_file.write('n_users:%d\n' % user_num)
        metadata_file.write('n_movies:%d' % item_num)
def write_movie_data(ratings, data_path, output, seed):
    """Split ``ratings`` into train/valid/test and write them to ``output``.

    Args:
        ratings: list of 4-tuples whose first three items are
            ``(user_id, movie_id, rating)``; the fourth item is ignored.
            NOTE: the list is shuffled in place.
        data_path: unused; kept for interface compatibility.
        output: directory receiving the generated files.
        seed: seed for the ``random`` module, fixing the shuffle.

    Files written under ``output``:
        - ``movielens-1m.hdf5``: four int8 datasets (``input_ratings``,
          ``input_masks``, ``output_ratings``, ``output_masks``) of shape
          ``(n_movies * 3, n_users)``, with the train, valid and test
          blocks stacked along the first axis and described by a ``split``
          attribute on the file root;
        - ``metadata``: text file with the user and movie counts;
        - ``user_dict`` / ``movie_dict``: pickled dicts mapping the
          original ids to matrix indices.

    After shuffling, the last 10% of the ratings (rounded up) form the
    test outputs, 0.45% (rounded up) the validation outputs, and the rest
    the training inputs.
    """
    # Assign consecutive indices in order of first appearance.  Membership
    # is tested on the dict itself: ``.keys()`` builds a list on Python 2,
    # turning each lookup into a linear scan.
    users = {}
    movs = {}
    for user_id, mov_id, rating, _ in ratings:
        if user_id not in users:
            users[user_id] = len(users)
        if mov_id not in movs:
            movs[mov_id] = len(movs)
    n_users = len(users)
    n_movies = len(movs)

    valid_ratio = 0.9 * 0.005
    test_ratio = 0.1
    n_ratings = len(ratings)
    n_test = np.ceil(n_ratings * test_ratio)
    n_valid = np.ceil(n_ratings * valid_ratio)
    n_train = n_ratings - n_test - n_valid

    shape = (n_movies, n_users)
    train_input_ratings = np.zeros(shape, dtype='int8')
    train_output_ratings = np.zeros(shape, dtype='int8')
    train_input_masks = np.zeros(shape, dtype='int8')
    train_output_masks = np.zeros(shape, dtype='int8')
    valid_input_ratings = np.zeros(shape, dtype='int8')
    valid_output_ratings = np.zeros(shape, dtype='int8')
    valid_input_masks = np.zeros(shape, dtype='int8')
    valid_output_masks = np.zeros(shape, dtype='int8')
    test_output_ratings = np.zeros(shape, dtype='int8')
    test_output_masks = np.zeros(shape, dtype='int8')

    random.seed(seed)
    random.shuffle(ratings)
    for cnt, (user_id, mov_id, rating, _) in enumerate(ratings):
        row = movs[mov_id]
        col = users[user_id]
        if cnt < n_train:
            # Training ratings are the input of both the train and the
            # valid blocks.
            train_input_ratings[row, col] = rating
            train_input_masks[row, col] = 1
            valid_input_ratings[row, col] = rating
            valid_input_masks[row, col] = 1
        elif cnt < n_train + n_valid:
            valid_output_ratings[row, col] = rating
            valid_output_masks[row, col] = 1
        else:
            test_output_ratings[row, col] = rating
            test_output_masks[row, col] = 1

    # The test input is everything known before testing: train + valid.
    test_input_ratings = train_input_ratings + valid_output_ratings
    test_input_masks = train_input_masks + valid_output_masks

    stacked = (
        ('input_ratings', np.vstack(
            (train_input_ratings, valid_input_ratings,
             test_input_ratings))),
        ('input_masks', np.vstack(
            (train_input_masks, valid_input_masks, test_input_masks))),
        ('output_ratings', np.vstack(
            (train_output_ratings, valid_output_ratings,
             test_output_ratings))),
        ('output_masks', np.vstack(
            (train_output_masks, valid_output_masks, test_output_masks))),
    )

    # Context managers close every file even if a write fails.
    with h5py.File(os.path.join(output, 'movielens-1m.hdf5'), 'w') as f:
        for name, data in stacked:
            dataset = f.create_dataset(
                name, shape=(n_movies * 3, n_users), dtype='int8',
                data=data)
            dataset.dims[0].label = 'batch'
            dataset.dims[1].label = 'movies'

        # 3 splits x 4 sources.  'S' is the bytes-string code (the legacy
        # 'a' alias is deprecated), scalar fields omit the deprecated
        # ``(type, 1)`` shape, and ``np.bool_`` replaces ``np.bool``, which
        # was removed in NumPy 1.24.
        split_array = np.empty(12, dtype=([
            ('split', 'S', 5),
            ('source', 'S', 14),
            ('start', np.int64),
            ('stop', np.int64),
            ('indices', h5py.special_dtype(ref=h5py.Reference)),
            ('available', np.bool_),
            ('comment', 'S', 1),
        ]))
        split_array[0:4]['split'] = 'train'.encode('utf8')
        split_array[4:8]['split'] = 'valid'.encode('utf8')
        split_array[8:12]['split'] = 'test'.encode('utf8')
        split_array[0:12:4]['source'] = 'input_ratings'.encode('utf8')
        split_array[1:12:4]['source'] = 'input_masks'.encode('utf8')
        split_array[2:12:4]['source'] = 'output_ratings'.encode('utf8')
        split_array[3:12:4]['source'] = 'output_masks'.encode('utf8')
        split_array[0:4]['start'] = 0
        split_array[0:4]['stop'] = n_movies
        split_array[4:8]['start'] = n_movies
        split_array[4:8]['stop'] = n_movies * 2
        split_array[8:12]['start'] = n_movies * 2
        split_array[8:12]['stop'] = n_movies * 3
        split_array[:]['indices'] = h5py.Reference()
        split_array[:]['available'] = True
        split_array[:]['comment'] = '.'.encode('utf8')
        f.attrs['split'] = split_array
        f.flush()

    with open(os.path.join(output, 'metadata'), 'w') as metadata_file:
        metadata_file.write('n_users:%d\n' % n_users)
        metadata_file.write('n_movies:%d' % n_movies)

    # cPickle only exists on Python 2; fall back to pickle elsewhere.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    with open(os.path.join(output, 'user_dict'), 'wb') as user_file:
        pickle.dump(users, user_file)
    with open(os.path.join(output, 'movie_dict'), 'wb') as movie_file:
        pickle.dump(movs, movie_file)