Ejemplo n.º 1
0
def main():
    """Convert the Kaggle training CSV into ``kaggle-mnist.hdf5``.

    The training rows returned by ``load_csv`` are written to a ``pixels``
    dataset (``float32``, ``INPUT_SIZE`` columns) and a ``labels`` dataset
    (``uint8``, one column).  A structured ``split`` attribute is attached to
    the file root declaring a single ``train`` split that spans every row of
    both datasets.
    """
    print('Loading data...')
    train_data, train_labels = load_csv('kaggle/train.csv', True)
    n_rows = len(train_data)

    print('Writing file...')
    # The context manager guarantees the file is closed even when one of the
    # writes below raises.
    with h5py.File('kaggle-mnist.hdf5', mode='w') as f:
        pixel_features = f.create_dataset('pixels', (n_rows, INPUT_SIZE),
                                          dtype='float32')
        labels = f.create_dataset('labels', (n_rows, 1), dtype='uint8')
        for index, row in enumerate(train_data):
            pixel_features[index] = row
            labels[index] = train_labels[index]

        # One record per (split, source) pair.  Explicit 'S<n>' widths,
        # scalar int64 and np.bool_ avoid the deprecated 'a', (type, 1) and
        # np.bool spellings while describing the same record layout.
        split_array = np.empty(2,
                               dtype=np.dtype([
                                   ('split', 'S5'), ('source', 'S15'),
                                   ('start', np.int64), ('stop', np.int64),
                                   ('indices',
                                    h5py.special_dtype(ref=h5py.Reference)),
                                   ('available', np.bool_), ('comment', 'S1')
                               ]))
        split_array['split'] = 'train'.encode('utf-8')
        split_array[0]['source'] = 'pixels'.encode('utf-8')
        split_array[1]['source'] = 'labels'.encode('utf-8')
        split_array['start'] = 0
        split_array['stop'] = n_rows
        # A null reference marks "no explicit subset indices".
        split_array['indices'] = h5py.Reference()
        split_array['available'] = True
        split_array['comment'] = '.'.encode('utf8')
        f.attrs['split'] = split_array
        f.flush()
def set_shot_globals(h5file, shot_globals):
    """Write the shot globals into an already open h5 file.

    Every ``name -> value`` pair of ``shot_globals`` is stored as an
    attribute of ``h5file['globals']``.  A value of ``None`` is stored as a
    null HDF5 object reference.

    Raises:
        ValueError: if storing an attribute fails for any reason.  The
            message names the offending global and includes the class and
            text of the original error.
    """
    for name, value in shot_globals.items():
        if value is None:
            # Store it as a null object reference:
            value = h5py.Reference()
        try:
            h5file['globals'].attrs[name] = value
        except Exception as e:
            # str(e) works on Python 2 and 3; e.message does not exist on
            # Python 3 and would replace the real error with AttributeError.
            message = (
                'Global %s cannot be saved as an hdf5 attribute. ' % name +
                'Globals can only have relatively simple datatypes, with no nested structures. '
                + 'Original error was:\n' + '%s: %s' %
                (e.__class__.__name__, str(e)))
            raise ValueError(message)
Ejemplo n.º 3
0
def make_single_run_file(filename, sequenceglobals, runglobals, sequence_id,
                         run_no, n_runs):
    """Create a run file containing the sequence globals and this run's globals.

    runglobals is a dict of this run's globals, the format being the same
    as that of one element of the list returned by expand_globals.
    sequenceglobals is a nested dictionary of the type returned by
    get_globals, or None to skip writing the per-group globals. Every run
    file needs a sequence ID, generate one with generate_sequence_id. This
    doesn't have to match the filename of the run file you end up using,
    though it usually does (exceptions being things like connection
    tables). run_no and n_runs must be provided; if this run file is part
    of a sequence, then they should reflect how many run files are being
    generated which share this sequence_id.

    Raises:
        ValueError: if one of the run globals cannot be stored as an HDF5
            attribute.  The message names the global and includes the class
            and text of the original error.
    """
    with h5py.File(filename, 'w') as f:
        f.attrs['sequence_id'] = sequence_id
        f.attrs['run number'] = run_no
        f.attrs['n_runs'] = n_runs
        f.create_group('globals')
        if sequenceglobals is not None:
            for groupname, groupvars in sequenceglobals.items():
                group = f['globals'].create_group(groupname)
                unitsgroup = group.create_group('units')
                expansiongroup = group.create_group('expansion')
                # Value, units and expansion of each global are stored as
                # same-named attributes on three parallel groups.
                for name, (value, units, expansion) in groupvars.items():
                    group.attrs[name] = value
                    unitsgroup.attrs[name] = units
                    expansiongroup.attrs[name] = expansion
        for name, value in runglobals.items():
            if value is None:
                # Store it as a null object reference:
                value = h5py.Reference()
            try:
                f['globals'].attrs[name] = value
            except Exception as e:
                # str(e) works on Python 2 and 3; e.message does not exist
                # on Python 3 and would hide the real error.
                message = (
                    'Global %s cannot be saved as an hdf5 attribute. ' % name +
                    'Globals can only have relatively simple datatypes, with no nested structures. '
                    + 'Original error was:\n' + '%s: %s' %
                    (e.__class__.__name__, str(e)))
                raise ValueError(message)
Ejemplo n.º 4
0
def make_single_run_file(filename, sequenceglobals, runglobals, sequence_attrs,
                         run_no, n_runs):
    """Write a single run file with its sequence attributes and globals.

    runglobals is a dict of this run's globals, in the same format as one
    element of the list returned by expand_globals. sequenceglobals is a
    nested dictionary of the type returned by get_globals (or None, in which
    case no per-group globals are written). sequence_attrs is a dict of
    attributes pertaining to this sequence, as returned by
    new_sequence_details. run_no and n_runs must be provided; if this run
    file is part of a sequence, they should reflect how many run files are
    being generated in this sequence, all of which must have identical
    sequence_attrs.

    Raises ValueError when a run global cannot be stored as an HDF5
    attribute.
    """
    mkdir_p(os.path.dirname(filename))
    with h5py.File(filename, 'w') as f:
        f.attrs.update(sequence_attrs)
        f.attrs['run number'] = run_no
        f.attrs['n_runs'] = n_runs
        globals_group = f.create_group('globals')

        groups = {} if sequenceglobals is None else sequenceglobals
        for groupname, groupvars in groups.items():
            group = globals_group.create_group(groupname)
            units_group = group.create_group('units')
            expansion_group = group.create_group('expansion')
            # Each global contributes one attribute to each of the three
            # parallel groups.
            for name, (value, units, expansion) in groupvars.items():
                group.attrs[name] = value
                units_group.attrs[name] = units
                expansion_group.attrs[name] = expansion

        for name, value in runglobals.items():
            # None is stored as a null object reference.
            stored = h5py.Reference() if value is None else value
            try:
                globals_group.attrs[name] = stored
            except Exception as e:
                detail = e.message if PY2 else str(e)
                raise ValueError(''.join([
                    'Global %s cannot be saved as an hdf5 attribute. ' % name,
                    'Globals can only have relatively simple datatypes, with no nested structures. ',
                    'Original error was:\n',
                    '%s: %s' % (e.__class__.__name__, detail),
                ]))
Ejemplo n.º 5
0
def copy_element(val, src_dt, tgt_dt, ctx):
    """Translate one element from the source dtype to the target dtype.

    The element is handled according to ``src_dt``:

    * compound dtype: each field is copied recursively and the results are
      returned as a tuple in field-declaration order;
    * object reference: the referenced object is looked up in ``ctx["fin"]``
      and a reference to the object with the same path in ``ctx["fout"]``
      is returned (a real reference for h5py targets, its string form
      otherwise); a null reference is returned when the object has no path;
    * region reference: the placeholder string ``"tbd"`` is returned;
    * vlen: an array of ``tgt_dt`` is built, recursing into the elements
      when the base type contains references;
    * anything else: ``val`` is returned unchanged.

    Args:
        val: the element to copy.
        src_dt: numpy dtype describing ``val`` in the source.
        tgt_dt: numpy dtype expected by the target.
        ctx: dict providing ``"fin"``, ``"fout"`` and ``"verbose"``.

    Returns:
        The translated element, or ``None`` when dereferencing ``val`` in the
        source raised ``AttributeError``.

    Raises:
        TypeError: if ``tgt_dt`` does not match the kind of ``src_dt`` (ref
            or vlen), if the ref metadata is of an unexpected type, or if a
            vlen element is not an ``ndarray``.
    """
    logging.debug("copy_element, val: " + str(val) + " val type: " +
                  str(type(val)) + "src_dt: " + dump_dtype(src_dt) +
                  " tgt_dt: " + dump_dtype(tgt_dt))

    fin = ctx["fin"]
    fout = ctx["fout"]
    out = None
    if len(src_dt) > 0:
        # Compound type.  dtype.names lists the fields in declaration order,
        # matching positional access into val; dtype.fields is a mapping
        # that may also carry title aliases and, on old Pythons, has no
        # guaranteed order.
        out_fields = []
        for i, name in enumerate(src_dt.names):
            field_src_dt = src_dt.fields[name][0]
            field_tgt_dt = tgt_dt.fields[name][0]
            out_fields.append(
                copy_element(val[i], field_src_dt, field_tgt_dt, ctx))
        out = tuple(out_fields)
    elif src_dt.metadata and 'ref' in src_dt.metadata:
        if not tgt_dt.metadata or 'ref' not in tgt_dt.metadata:
            raise TypeError(
                "Expected tgt dtype to be ref, but got: {}".format(tgt_dt))
        ref = tgt_dt.metadata['ref']
        if is_reference(ref):
            # initialize out to null ref
            if is_h5py(ctx['fout']):
                out = h5py.Reference()  # null h5py ref
            else:
                out = ''  # h5pyd refs are strings

            # NOTE(review): `ref` comes from the dtype metadata rather than
            # from `val`, so this test looks always true; a check on `val`
            # may have been intended -- confirm before changing.
            if ref:
                try:
                    fin_obj = fin[val]
                except AttributeError as ae:
                    msg = "Unable able to get obj for ref value: {}".format(ae)
                    logging.error(msg)
                    print(msg)
                    return None

                # TBD - for hsget, the name property is not getting set
                h5path = fin_obj.name
                if not h5path:
                    msg = "No path found for ref object"
                    logging.warning(msg)
                    if ctx["verbose"]:
                        print(msg)
                else:
                    fout_obj = fout[h5path]
                    if is_h5py(ctx['fout']):
                        out = fout_obj.ref
                    else:
                        # convert to string for JSON serialization
                        out = str(fout_obj.ref)

        elif is_regionreference(ref):
            out = "tbd"
        else:
            raise TypeError("Unexpected ref type: {}".format(type(ref)))
    elif src_dt.metadata and 'vlen' in src_dt.metadata:
        logging.debug("copy_elment, got vlen element, dt: {}".format(
            src_dt.metadata["vlen"]))
        if not isinstance(val, np.ndarray):
            raise TypeError(
                "Expecting ndarray or vlen element, but got: {}".format(
                    type(val)))
        if not tgt_dt.metadata or 'vlen' not in tgt_dt.metadata:
            raise TypeError(
                "Expected tgt dtype to be vlen, but got: {}".format(tgt_dt))
        src_vlen_dt = src_dt.metadata["vlen"]
        tgt_vlen_dt = tgt_dt.metadata["vlen"]
        if has_reference(src_vlen_dt):
            # References inside the vlen data must be translated one by one.
            if len(val.shape) == 0:
                # scalar array
                e = val[()]
                v = copy_element(e, src_vlen_dt, tgt_vlen_dt, ctx)
                out = np.array(v, dtype=tgt_dt)
            else:
                out = np.zeros(val.shape, dtype=tgt_dt)
                for i in range(len(out)):
                    e = val[i]
                    out[i] = copy_element(e, src_vlen_dt, tgt_vlen_dt, ctx)
        else:
            # can just directly copy the array
            out = np.zeros(val.shape, dtype=tgt_dt)
            out[...] = val[...]
    else:
        out = val  # can just copy as is
    return out
Ejemplo n.º 6
0
    def create_split_array(split_dict):
        """Create a valid array for the `split` attribute of the root node.

        Parameters
        ----------
        split_dict : dict
            Maps split names to dict. Those dict map source names to
            tuples. Those tuples contain two, three or four elements:
            the start index, the stop index, (optionally) subset
            indices and (optionally) a comment.  If a particular
            split/source combination isn't present in the split dict,
            it's considered as unavailable and the `available` element
            will be set to `False` in its split array entry.

        Returns
        -------
        split_array : numpy.ndarray
            Structured array with one record per (split, source) pair and
            the fields `split`, `source`, `start`, `stop`, `indices`,
            `available` and `comment`.

        Raises
        ------
        ValueError
            If `split_dict` is empty or none of its splits names a source.

        """
        if not split_dict:
            raise ValueError('split_dict must define at least one split')

        # Determine maximum split, source and comment lengths.  Lengths are
        # measured on the UTF-8 bytes that are actually stored, so non-ASCII
        # text is not truncated by a too-narrow field.
        split_len = max(len(split.encode('utf8')) for split in split_dict)
        sources = set()
        comment_len = 1
        for split in split_dict.values():
            sources.update(split)
            for val in split.values():
                if len(val) == 4:
                    comment_len = max(comment_len,
                                      len(val[-1].encode('utf8')))
        if not sources:
            raise ValueError('split_dict must define at least one source')
        sources = sorted(sources)
        source_len = max(len(source.encode('utf8')) for source in sources)

        # Instantiate empty split array.  Explicit 'S<n>' widths, scalar
        # int64 and numpy.bool_ avoid the deprecated 'a', (type, 1) and
        # numpy.bool spellings while describing the same record layout.
        split_array = numpy.empty(
            len(split_dict) * len(sources),
            dtype=numpy.dtype([
                ('split', 'S%d' % split_len),
                ('source', 'S%d' % source_len),
                ('start', numpy.int64),
                ('stop', numpy.int64),
                ('indices', h5py.special_dtype(ref=h5py.Reference)),
                ('available', numpy.bool_),
                ('comment', 'S%d' % comment_len)
            ]))

        # Fill split array
        for i, (split, source) in enumerate(product(split_dict, sources)):
            if source in split_dict[split]:
                entry = split_dict[split][source]
                start, stop = entry[:2]
                available = True
                indices = h5py.Reference()
                # Workaround for bug when pickling an empty string
                comment = '.'
                if len(entry) > 2:
                    indices = entry[2]
                if len(entry) > 3:
                    comment = entry[3]
                    if not comment:
                        comment = '.'
            else:
                (start, stop, indices, available,
                 comment) = (0, 0, h5py.Reference(), False, '.')
            # Workaround for H5PY being unable to store unicode type
            split_array[i]['split'] = split.encode('utf8')
            split_array[i]['source'] = source.encode('utf8')
            split_array[i]['start'] = start
            split_array[i]['stop'] = stop
            split_array[i]['indices'] = indices
            split_array[i]['available'] = available
            split_array[i]['comment'] = comment.encode('utf8')

        return split_array
Ejemplo n.º 7
0
                           ('start', np.int64, 1), ('stop', np.int64, 1),
                           ('indices', h5py.special_dtype(ref=h5py.Reference)),
                           ('available', np.bool, 1), ('comment', 'a', 1)
                       ]))

# Records are grouped by split: rows 0-2 are 'train', 3-5 'valid', 6-8 'test'.
split_array[0:3]['split'] = 'train'.encode('utf8')
split_array[3:6]['split'] = 'valid'.encode('utf8')
split_array[6:9]['split'] = 'test'.encode('utf8')

# Within each group of three the sources repeat in the same order, hence
# the stride-3 slices.
split_array[0:9:3]['source'] = 'features'.encode('utf8')
split_array[1:9:3]['source'] = 'locs'.encode('utf8')
split_array[2:9:3]['source'] = 'targets'.encode('utf8')

# Row ranges of each split: train [0, 50000), valid [50000, 60000),
# test [60000, 70000).
split_array[0:3]['start'] = 0
split_array[0:3]['stop'] = 50000
split_array[3:6]['start'] = 50000
split_array[3:6]['stop'] = 60000
split_array[6:9]['start'] = 60000
split_array[6:9]['stop'] = 70000

# Every record gets a null reference for 'indices', is marked available and
# carries the placeholder comment '.'.
split_array[:]['indices'] = h5py.Reference()
split_array[:]['available'] = True
split_array[:]['comment'] = '.'.encode('utf8')
# Attach the split description to the root of the open HDF5 file.
f.attrs['split'] = split_array

f.flush()
f.close()

# Open the 'train' split of an HDF5 file through Fuel.
# NOTE(review): presumably this path is the file written above -- confirm.
from fuel.datasets import H5PYDataset
train_set = H5PYDataset('/Tmp/pezeshki/dataset.hdf5', which_sets=('train', ))
Ejemplo n.º 8
0
 def test_exc(self):
     """ (Refs) Dereferencing a null reference raises ValueError """
     null_ref = h5py.Reference()
     lookup = self.f.__getitem__
     with self.assertRaises(ValueError):
         lookup(null_ref)
Ejemplo n.º 9
0
 def test_repr(self):
     """ (Refs) __repr__ works on live and dead references """
     ref = h5py.Reference()
     # repr() always returns the native ``str`` type, on Python 2 as well
     # as Python 3, whereas ``basestring`` only exists on Python 2.
     self.assertIsInstance(repr(ref), str)
     self.assertIsInstance(repr(self.f.ref), str)
Ejemplo n.º 10
0
 def test_bool(self):
     """ (Refs) Truth value (__nonzero__/__bool__) tracks validity """
     ref = h5py.Reference()
     # assertFalse/assertTrue replace the deprecated ``assert_`` alias.
     self.assertFalse(ref)
     self.assertTrue(self.f.ref)
Ejemplo n.º 11
0
    def make_dataset(self, trainset, testinput, test_labels, valid_rate=0.01):
        """Build the train/valid/test rating matrices and write them to HDF5.

        A share of ``trainset`` is held out for validation.  For each of the
        three splits an (items x users) ratings matrix and a 0/1 mask are
        built for both model input and expected output; the three splits are
        stacked vertically and stored as the datasets ``input_ratings``,
        ``input_masks``, ``output_ratings`` and ``output_masks`` in
        ``self.data_path``, together with a ``split`` attribute describing
        the row range of every split.  A ``metadata`` text file with the
        user and item counts is written to ``self.model_manager.path_name``.

        Sets ``self.user_num`` and ``self.item_num``.

        Args:
            trainset: sequence of ``(user, item, rating)`` triples.
            testinput: sequence of ``(user, item)`` pairs.
            test_labels: ratings matching ``testinput`` element by element.
            valid_rate: fraction of ``trainset`` held out for validation.
        """
        model_manager = self.model_manager
        train_user, train_item, _ = list(zip(*trainset))
        test_user, test_item = list(zip(*testinput))
        # NOTE(review): the counts double as matrix dimensions below, which
        # assumes ids are contiguous integers starting at 0 -- confirm.
        user_num = len(set(train_user + test_user))
        item_num = len(set(train_item + test_item))
        self.user_num = user_num
        self.item_num = item_num

        # round() returns a float on Python 2; train_test_split needs an
        # integer to treat test_size as an absolute number of samples
        # (presumably scikit-learn's train_test_split -- confirm).
        n_valid = int(round(len(trainset) * valid_rate))

        train, valid = train_test_split(trainset, test_size=n_valid)
        train_user, train_item, train_rating = list(zip(*train))
        valid_user, valid_item, valid_rating = list(zip(*valid))

        shape = (item_num, user_num)

        # Train: ratings are the input, the expected output is left empty.
        train_input_ratings = np.asarray(
            coo_matrix((train_rating, (train_item, train_user)),
                       shape=shape,
                       dtype='int8').todense())
        train_output_ratings = np.zeros(shape, dtype='int8')
        train_input_masks = train_input_ratings.astype(bool).astype('int8')
        train_output_masks = np.zeros(shape, dtype='int8')

        # Valid: same input as train, held-out ratings as output.
        valid_input_ratings = train_input_ratings.copy()
        valid_output_ratings = np.asarray(
            coo_matrix((valid_rating, (valid_item, valid_user)),
                       shape=shape,
                       dtype='int8').todense())
        valid_input_masks = train_input_masks.copy()
        valid_output_masks = valid_output_ratings.astype(bool).astype('int8')

        # Test: train and valid ratings combined as input, test labels as
        # output.
        test_input_ratings = train_input_ratings + valid_output_ratings
        test_output_ratings = np.asarray(
            coo_matrix((test_labels, (test_item, test_user)),
                       shape=shape,
                       dtype='int8').todense())
        test_input_masks = train_input_masks + valid_output_masks
        test_output_masks = test_output_ratings.astype(bool).astype('int8')

        stacked = (
            ('input_ratings',
             np.vstack((train_input_ratings, valid_input_ratings,
                        test_input_ratings))),
            ('input_masks',
             np.vstack((train_input_masks, valid_input_masks,
                        test_input_masks))),
            ('output_ratings',
             np.vstack((train_output_ratings, valid_output_ratings,
                        test_output_ratings))),
            ('output_masks',
             np.vstack((train_output_masks, valid_output_masks,
                        test_output_masks))),
        )

        # The context manager closes the file even if a write fails.
        with h5py.File(self.data_path, 'w') as h5file:
            for name, data in stacked:
                dataset = h5file.create_dataset(name,
                                                shape=(item_num * 3,
                                                       user_num),
                                                dtype='int8',
                                                data=data)
                dataset.dims[0].label = 'batch'
                dataset.dims[1].label = 'movies'

            # One record per (split, source) pair: 3 splits x 4 sources.
            # Explicit 'S<n>' widths, scalar int64 and np.bool_ avoid the
            # deprecated 'a', (type, 1) and np.bool spellings.
            split_array = np.empty(12,
                                   dtype=([
                                       ('split', 'S5'), ('source', 'S14'),
                                       ('start', np.int64),
                                       ('stop', np.int64),
                                       ('indices',
                                        h5py.special_dtype(
                                            ref=h5py.Reference)),
                                       ('available', np.bool_),
                                       ('comment', 'S1')
                                   ]))
            split_array[0:4]['split'] = 'train'.encode('utf8')
            split_array[4:8]['split'] = 'valid'.encode('utf8')
            split_array[8:12]['split'] = 'test'.encode('utf8')
            split_array[0:12:4]['source'] = 'input_ratings'.encode('utf8')
            split_array[1:12:4]['source'] = 'input_masks'.encode('utf8')
            split_array[2:12:4]['source'] = 'output_ratings'.encode('utf8')
            split_array[3:12:4]['source'] = 'output_masks'.encode('utf8')
            # Each split occupies one block of item_num stacked rows.
            split_array[0:4]['start'] = 0
            split_array[0:4]['stop'] = item_num
            split_array[4:8]['start'] = item_num
            split_array[4:8]['stop'] = item_num * 2
            split_array[8:12]['start'] = item_num * 2
            split_array[8:12]['stop'] = item_num * 3
            split_array[:]['indices'] = h5py.Reference()
            split_array[:]['available'] = True
            split_array[:]['comment'] = '.'.encode('utf8')
            h5file.attrs['split'] = split_array
            h5file.flush()

        metadata_path = os.path.join(model_manager.path_name, 'metadata')
        with open(metadata_path, 'w') as metadata:
            metadata.write('n_users:%d\n' % user_num)
            metadata.write('n_movies:%d' % item_num)
Ejemplo n.º 12
0
def write_movie_data(ratings, data_path, output, seed):
    """Split the ratings into train/valid/test and write them to ``output``.

    User and movie ids are mapped to consecutive indices in order of first
    appearance.  ``ratings`` is then shuffled IN PLACE (after seeding
    ``random`` with ``seed``) and cut into a test part (10%, rounded up), a
    validation part (0.45%, rounded up) and a train part (the rest).  For
    each split a (movies x users) ratings matrix and a 0/1 mask are built
    for both model input and expected output; the splits are stacked
    vertically and stored in ``movielens-1m.hdf5`` together with a ``split``
    attribute describing the row range of every split.

    Files written to the ``output`` directory: ``movielens-1m.hdf5``,
    ``metadata`` (user and movie counts), ``user_dict`` and ``movie_dict``
    (pickled id -> index mappings).

    Args:
        ratings: mutable sequence of 4-tuples ``(user_id, mov_id, rating,
            _)``; the fourth element is ignored.
        data_path: unused; kept for interface compatibility.
        output: directory that receives the files.
        seed: seed passed to ``random.seed`` before shuffling.
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle  # Python 3

    # Consecutive indices in order of first appearance.  Membership is
    # tested on the dict itself (a hash lookup) instead of a keys() list.
    users = {}
    movs = {}
    for user_id, mov_id, _, _ in ratings:
        users.setdefault(user_id, len(users))
        movs.setdefault(mov_id, len(movs))
    n_users = len(users)
    n_movies = len(movs)

    valid_ratio = 0.9 * 0.005
    test_ratio = 0.1
    n_ratings = len(ratings)
    n_test = np.ceil(n_ratings * test_ratio)
    n_valid = np.ceil(n_ratings * valid_ratio)
    n_train = n_ratings - n_test - n_valid

    shape = (n_movies, n_users)
    train_input_ratings = np.zeros(shape, dtype='int8')
    train_input_masks = np.zeros(shape, dtype='int8')
    # The train split has no expected output; these stay all-zero.
    train_output_ratings = np.zeros(shape, dtype='int8')
    train_output_masks = np.zeros(shape, dtype='int8')
    valid_output_ratings = np.zeros(shape, dtype='int8')
    valid_output_masks = np.zeros(shape, dtype='int8')
    test_output_ratings = np.zeros(shape, dtype='int8')
    test_output_masks = np.zeros(shape, dtype='int8')

    random.seed(seed)
    random.shuffle(ratings)
    for cnt, (user_id, mov_id, rating, _) in enumerate(ratings):
        row, col = movs[mov_id], users[user_id]
        if cnt < n_train:
            train_input_ratings[row, col] = rating
            train_input_masks[row, col] = 1
        elif cnt < n_train + n_valid:
            valid_output_ratings[row, col] = rating
            valid_output_masks[row, col] = 1
        else:
            test_output_ratings[row, col] = rating
            test_output_masks[row, col] = 1

    # The validation input is exactly the train input; vstack copies its
    # arguments, so the same arrays can be reused for both row blocks.
    valid_input_ratings = train_input_ratings
    valid_input_masks = train_input_masks
    # The test input is everything known before testing: train plus valid.
    test_input_ratings = train_input_ratings + valid_output_ratings
    test_input_masks = train_input_masks + valid_output_masks

    stacked = (
        ('input_ratings',
         np.vstack((train_input_ratings, valid_input_ratings,
                    test_input_ratings))),
        ('input_masks',
         np.vstack((train_input_masks, valid_input_masks,
                    test_input_masks))),
        ('output_ratings',
         np.vstack((train_output_ratings, valid_output_ratings,
                    test_output_ratings))),
        ('output_masks',
         np.vstack((train_output_masks, valid_output_masks,
                    test_output_masks))),
    )

    # The context manager closes the file even if a write fails.
    with h5py.File(os.path.join(output, 'movielens-1m.hdf5'), 'w') as h5file:
        for name, data in stacked:
            dataset = h5file.create_dataset(name,
                                            shape=(n_movies * 3, n_users),
                                            dtype='int8',
                                            data=data)
            dataset.dims[0].label = 'batch'
            dataset.dims[1].label = 'movies'

        # One record per (split, source) pair: 3 splits x 4 sources.
        # Explicit 'S<n>' widths, scalar int64 and np.bool_ avoid the
        # deprecated 'a', (type, 1) and np.bool spellings.
        split_array = np.empty(12,
                               dtype=([('split', 'S5'), ('source', 'S14'),
                                       ('start', np.int64),
                                       ('stop', np.int64),
                                       ('indices',
                                        h5py.special_dtype(
                                            ref=h5py.Reference)),
                                       ('available', np.bool_),
                                       ('comment', 'S1')]))
        split_array[0:4]['split'] = 'train'.encode('utf8')
        split_array[4:8]['split'] = 'valid'.encode('utf8')
        split_array[8:12]['split'] = 'test'.encode('utf8')
        split_array[0:12:4]['source'] = 'input_ratings'.encode('utf8')
        split_array[1:12:4]['source'] = 'input_masks'.encode('utf8')
        split_array[2:12:4]['source'] = 'output_ratings'.encode('utf8')
        split_array[3:12:4]['source'] = 'output_masks'.encode('utf8')
        # Each split occupies one block of n_movies stacked rows.
        split_array[0:4]['start'] = 0
        split_array[0:4]['stop'] = n_movies
        split_array[4:8]['start'] = n_movies
        split_array[4:8]['stop'] = n_movies * 2
        split_array[8:12]['start'] = n_movies * 2
        split_array[8:12]['stop'] = n_movies * 3
        split_array[:]['indices'] = h5py.Reference()
        split_array[:]['available'] = True
        split_array[:]['comment'] = '.'.encode('utf8')
        h5file.attrs['split'] = split_array
        h5file.flush()

    with open(os.path.join(output, 'metadata'), 'w') as metadata:
        metadata.write('n_users:%d\n' % n_users)
        metadata.write('n_movies:%d' % n_movies)

    with open(os.path.join(output, 'user_dict'), 'wb') as user_file:
        pickle.dump(users, user_file)

    with open(os.path.join(output, 'movie_dict'), 'wb') as movie_file:
        pickle.dump(movs, movie_file)