Example #1
def _init_table(save_file, trajectories_data, roi_size):
    tot_rows = len(trajectories_data)
    trajectories_data.index = np.arange(tot_rows)

    TABLE_FILTERS = tables.Filters(complevel=5,
                                   complib='zlib',
                                   shuffle=True,
                                   fletcher32=True)
    with tables.File(save_file, 'w') as fid:
        rec_data = trajectories_data.to_records(index=False)
        rec_data['skeleton_id'] = trajectories_data.index  # this is only for the viewer
        rec_data['frame_number'] = trajectories_data.index
        fid.create_table('/',
                         'trajectories_data',
                         obj=rec_data,
                         filters=TABLE_FILTERS)

        fid.create_carray('/',
                          'skeleton',
                          tables.Float32Atom(dflt=np.nan), (tot_rows, 49, 2),
                          filters=TABLE_FILTERS)
        fid.create_carray('/',
                          'mask',
                          tables.Float32Atom(dflt=np.nan),
                          (tot_rows, roi_size, roi_size),
                          filters=TABLE_FILTERS)
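A minimal usage sketch for _init_table; the file name and column set are illustrative only (not the original project's schema), and roi_size is passed in explicitly:

import numpy as np
import pandas as pd
import tables

trajectories_data = pd.DataFrame({
    'skeleton_id': np.zeros(5, dtype=np.int64),
    'frame_number': np.arange(5, dtype=np.int64),
    'coord_x': np.random.rand(5).astype(np.float32),
    'coord_y': np.random.rand(5).astype(np.float32),
})
_init_table('trajectories_sample.hdf5', trajectories_data, roi_size=96)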
Example #2
    def create_data_file(out_file, n_samples, image_shape, channels=4):
        hdf5_file = tables.open_file(out_file, mode='w')

        # complevel - compression level
        # complib - the library for compression
        filters = tables.Filters(complevel=5, complib='blosc')

        data_shape = tuple([0, channels] + list(image_shape))

        truth_shape = tuple([0, 1] + list(image_shape))

        data_storage = hdf5_file.create_earray(hdf5_file.root,
                                               'data',
                                               tables.Float32Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=n_samples)

        truth_storage = hdf5_file.create_earray(hdf5_file.root,
                                                'true',
                                                tables.UInt8Atom(),
                                                shape=truth_shape,
                                                filters=filters,
                                                expectedrows=n_samples)

        affine_storage = hdf5_file.create_earray(hdf5_file.root,
                                                 'affine',
                                                 tables.Float32Atom(),
                                                 shape=(0, 4, 4),
                                                 filters=filters,
                                                 expectedrows=n_samples)
        return hdf5_file, data_storage, truth_storage, affine_storage
Example #3
def merge(out, fnames):
    data = tables.openFile(out, mode='a')

    for fname in fnames:
        f = tables.openFile(fname, mode='r')
        raw_targets = f.root.denseFeat

        if 'denseFeat' in data.root:
            prev_data = data.root.denseFeat
            targets = data.createCArray(data.root,
                                        '_y',
                                        atom=tables.Float32Atom(),
                                        shape=((raw_targets.shape[0] +
                                                prev_data.shape[0], 436)))
            targets[:prev_data.shape[0], :] = prev_data[:, :]
            targets[prev_data.shape[0]:, :] = raw_targets[:, :]
            data.flush()
            data.removeNode(data.root, "denseFeat", 1)
        else:
            targets = data.createCArray(data.root,
                                        '_y',
                                        atom=tables.Float32Atom(),
                                        shape=((raw_targets.shape[0], 436)))
            targets[:, :] = raw_targets[:, :]
            data.flush()

        data.renameNode(data.root, "denseFeat", "_y")
        data.flush()

        f.close()

    data.close()
Example #4
def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    print("DEBUG: Opening HDF5 file")
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    print("DEBUG: Writing data_storage to HDF5 file")
    data_storage = hdf5_file.create_earray(hdf5_file.root,
                                           'data',
                                           tables.Float32Atom(),
                                           shape=data_shape,
                                           filters=filters,
                                           expectedrows=n_samples)

    print("DEBUG: Writing truth_storage to HDF5 file")
    truth_storage = hdf5_file.create_earray(hdf5_file.root,
                                            'truth',
                                            tables.UInt8Atom(),
                                            shape=truth_shape,
                                            filters=filters,
                                            expectedrows=n_samples)

    print("DEBUG: Writing affine_storage to HDF5 file")
    affine_storage = hdf5_file.create_earray(hdf5_file.root,
                                             'affine',
                                             tables.Float32Atom(),
                                             shape=(0, 4, 4),
                                             filters=filters,
                                             expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage
Example #5
def setup_hdf5(h5_filename, expectedrows):

    # Open file
    h5file = tables.open_file(h5_filename, mode="w")

    # A group for the normal data
    table = h5file.create_table(h5file.root,
                                "summary",
                                WhiskerSeg,
                                "Summary data about each whisker segment",
                                expectedrows=expectedrows)

    # Put the contour here
    xpixels_vlarray = h5file.create_vlarray(
        h5file.root,
        'pixels_x',
        tables.Float32Atom(shape=()),
        title='Every pixel of each whisker (x-coordinate)',
        expectedrows=expectedrows)
    ypixels_vlarray = h5file.create_vlarray(
        h5file.root,
        'pixels_y',
        tables.Float32Atom(shape=()),
        title='Every pixel of each whisker (y-coordinate)',
        expectedrows=expectedrows)

    h5file.close()
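WhiskerSeg is a table description defined elsewhere in the original project; a plausible stand-in, purely illustrative, would be a tables.IsDescription subclass along these lines:

import tables

class WhiskerSeg(tables.IsDescription):
    # hypothetical per-segment summary fields; the real description may differ
    time = tables.UInt32Col()
    id = tables.UInt16Col()
    fol_x = tables.Float32Col()
    fol_y = tables.Float32Col()
    tip_x = tables.Float32Col()
    tip_y = tables.Float32Col()
    pixlen = tables.UInt16Col()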
Example #6
def initialize_file(save_name, experiments_data, roi_size, frac_train=0.99):
    #divide data into train and test set
    experiments_data = _add_is_train(experiments_data, frac_train)

    #make sure the exp_data has the correct format for strings
    exp_data_r = experiments_data.to_records(index=False)
    dtypes = []
    for col in experiments_data.columns:
        ss = str(exp_data_r[col].dtype)

        if ss == 'object':
            ss = 'S{}'.format(
                experiments_data[col].map(lambda x: len(x)).max())
        dtypes.append((col, ss))
    dtypes = np.dtype(dtypes)
    exp_data_r = exp_data_r.astype(dtypes)

    roi_data_dtypes = np.dtype([(x, np.int32) for x in ROI_DATA_COLS])

    #create the new file
    with tables.File(str(save_name), 'w') as fid_samples:
        fid_samples.create_table('/',
                                 "experiments_data",
                                 exp_data_r,
                                 filters=TABLE_FILTERS)

        fid_samples.create_table('/',
                                 "roi_data",
                                 roi_data_dtypes,
                                 filters=TABLE_FILTERS)

        coords_g = fid_samples.create_group('/', 'coordinates')
        fid_samples.create_earray(coords_g,
                                  'skeletons',
                                  atom=tables.Float32Atom(),
                                  shape=(0, 49, 2),
                                  chunkshape=(1, 49, 2),
                                  filters=TABLE_FILTERS)

        fid_samples.create_earray(coords_g,
                                  'widths',
                                  atom=tables.Float32Atom(),
                                  shape=(0, 49),
                                  chunkshape=(1, 49),
                                  filters=TABLE_FILTERS)

        fid_samples.create_earray('/',
                                  'mask',
                                  atom=tables.Float32Atom(),
                                  shape=(0, roi_size, roi_size),
                                  chunkshape=(1, roi_size, roi_size),
                                  filters=TABLE_FILTERS)

        fid_samples.create_earray('/',
                                  'full_data',
                                  atom=tables.Float32Atom(),
                                  shape=(0, roi_size, roi_size),
                                  chunkshape=(1, roi_size, roi_size),
                                  filters=TABLE_FILTERS)
Example #7
def addToH5File(h5file, clusters, freqs, store_intermediate=False):
    #group into nrsolutions
    poss_nr_sol = []
    groups = []
    for clusteridx, cluster in enumerate(clusters):
        if not cluster['nrsol'] in poss_nr_sol:
            poss_nr_sol.append(cluster['nrsol'])
            groups.append([])
        idx = poss_nr_sol.index(cluster['nrsol'])
        groups[idx].append(clusteridx)

    if 'sagefreqIdx' in h5file.root:
        h5file.removeNode('/sagefreqIdx')

    h5file.createArray(h5file.root, 'sagefreqIdx', freqs)
    for igrp, grp in enumerate(groups):
        # create arrays:
        if store_intermediate:
            cdata = np.load('tmp_store_cdata_%d.npy' % (grp[0]))
        else:
            cdata = clusters[grp[0]]['cdata']

        arrayshape = cdata.shape[:-1] + (len(grp), 4)
        for name in [
                'sageradec%d' % igrp,
                'sagephases%d' % igrp,
                'sageamplitudes%d' % igrp
        ]:
            if name in h5file.root:
                h5file.removeNode('/' + name)

        srcarray = h5file.createCArray(h5file.root,
                                       'sageradec%d' % igrp,
                                       tab.Float32Atom(),
                                       shape=(len(grp), 2))
        pharray = h5file.createCArray(h5file.root,
                                      'sagephases%d' % igrp,
                                      tab.Float32Atom(),
                                      shape=arrayshape)
        amparray = h5file.createCArray(h5file.root,
                                       'sageamplitudes%d' % igrp,
                                       tab.Float32Atom(),
                                       shape=arrayshape)
        for idx, clusteridx in enumerate(grp):
            if store_intermediate:
                cdata = np.load('tmp_store_cdata_%d.npy' % (clusteridx))
            else:
                cdata = clusters[clusteridx]['cdata']
            pharray[:, :, :, idx, :] = np.angle(cdata)
            amparray[:, :, :, idx, :] = np.absolute(cdata)
            srcarray[idx, :] = np.array(
                [clusters[clusteridx]['Ra'], clusters[clusteridx]['Dec']])
            clusters[clusteridx]['cdata'] = []
            if store_intermediate:
                call("rm tmp_store_cdata_%d.npy" % (clusteridx), shell=True)
        pharray.flush()
        amparray.flush()
        srcarray.flush()
Example #8
def tables(docompute, dowrite, complib, verbose):

    # Filenames
    ifilename = os.path.join(OUT_DIR, "expression-inputs.h5")
    ofilename = os.path.join(OUT_DIR, "expression-outputs.h5")

    # Filters
    shuffle = True
    if complib == 'blosc':
        filters = tb.Filters(complevel=1, complib='blosc', shuffle=shuffle)
    elif complib == 'lzo':
        filters = tb.Filters(complevel=1, complib='lzo', shuffle=shuffle)
    elif complib == 'zlib':
        filters = tb.Filters(complevel=1, complib='zlib', shuffle=shuffle)
    else:
        filters = tb.Filters(complevel=0, shuffle=False)
    if verbose:
        print("Will use filters:", filters)

    if dowrite:
        f = tb.open_file(ifilename, 'w')

        # Build input arrays
        t0 = time()
        root = f.root
        a = f.create_carray(root, 'a', tb.Float32Atom(),
                            shape, filters=filters)
        b = f.create_carray(root, 'b', tb.Float32Atom(),
                            shape, filters=filters)
        if verbose:
            print("chunkshape:", a.chunkshape)
            print("chunksize:", np.prod(a.chunkshape) * a.dtype.itemsize)
        #row = np.linspace(0, 1, ncols)
        row = np.arange(0, ncols, dtype='float32')
        for i in range(nrows):
            a[i] = row * (i + 1)
            b[i] = row * (i + 1) * 2
        f.close()
        print("[tables.Expr] Time for creating inputs:", round(time() - t0, 3))

    if docompute:
        f = tb.open_file(ifilename, 'r')
        fr = tb.open_file(ofilename, 'w')
        a = f.root.a
        b = f.root.b
        r1 = fr.create_carray(fr.root, 'r1', tb.Float32Atom(), shape,
                              filters=filters)
        # The expression
        e = tb.Expr(expr)
        e.set_output(r1)
        t0 = time()
        e.eval()
        if verbose:
            print("First ten values:", r1[0, :10])
        f.close()
        fr.close()
        print("[tables.Expr] Time for computing & save:",
              round(time() - t0, 3))
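The benchmark above relies on module-level globals (OUT_DIR, shape, nrows, ncols, expr) defined elsewhere in the script; one plausible set of definitions, stated here only as an assumption, would be:

import os
from time import time

import numpy as np
import tables as tb

OUT_DIR = "/tmp"                     # where the two .h5 files are written
nrows, ncols = 1000, 10000           # size of the 2-D input arrays
shape = (nrows, ncols)
expr = "0.25 * a**2 + 0.75 * b + 1"  # any NumExpr-style expression over 'a' and 'b'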
Example #9
def create_features(file_names, file_path):
    flag = 0
    columns = ['train_cat', 'train_dog']
    w = tables.open_file('../window.h5', 'w')
    atom1 = tables.Float32Atom()
    array_w = w.create_earray(w.root, 'data', atom1, (0, 20))
    l = tables.open_file('../label.h5', 'w')
    atom2 = tables.Float32Atom()
    array_l = l.create_earray(l.root, 'data', atom2, (0, 1))
    temp = []
    scaler = StandardScaler()
    for col in columns:
        count = 0
        for file in file_names[col]:
            if file == 0:
                continue
            filefp = file_path + file
            data, fs = soundfile.read(filefp)
            data = data.reshape(-1, 1)
            wn = np.random.randn(len(data)).reshape(-1, 1)
            data_wn = data + 0.0075 * wn
            scaler.fit(data)
            data = scaler.transform(data)
            data_wn = scaler.transform(data_wn)
            #Creating MFCC features for 50ms(800 data points) window and 50% overlap(400 hop_length)
            mfcc = librosa.feature.mfcc(y=data.reshape(data.shape[0], ),
                                        sr=fs,
                                        n_fft=800,
                                        hop_length=400)
            mfcc_wn = librosa.feature.mfcc(y=data_wn.reshape(
                data_wn.shape[0], ),
                                           sr=fs,
                                           n_fft=800,
                                           hop_length=400)
            #Finally for each of the windows 20 features are extracted
            mfcc = mfcc.reshape(-1, 20)
            mfcc_wn = mfcc_wn.reshape(-1, 20)
            #Creating labels
            if col == 'train_cat':
                x = 0
            else:
                x = 1
            label = np.array([[x]
                              for i in range(mfcc.shape[0])]).reshape(-1, 1)
            label_wn = np.array([[x] for i in range(mfcc_wn.shape[0])
                                 ]).reshape(-1, 1)
            array_w.append(mfcc)
            array_w.append(mfcc_wn)
            array_l.append(label)
            array_l.append(label_wn)
            count += mfcc.shape[0] + mfcc_wn.shape[0]
        temp.append(count)
    n_cat = temp[0]  #number of windows with class label as cat
    n_dog = temp[1]  #number of windows with class label as dog
    w.close()
    l.close()
    return n_cat, n_dog
Example #10
    def create_hdf5_file(self,
                         output_filepath,
                         data_group_labels=None,
                         case_list=None):

        if data_group_labels is None:
            data_group_labels = self.data_groups.keys()

        hdf5_file = tables.open_file(output_filepath, mode='w')
        filters = tables.Filters(complevel=5, complib='blosc')

        for data_label, data_group in self.data_groups.items():

            num_cases = len(self.cases) * self.multiplier

            if num_cases == 0:
                raise FileNotFoundError(
                    'WARNING: No cases found. Cannot write to file.')

            if data_group.output_shape is None:
                output_shape = data_group.get_shape()
            else:
                output_shape = data_group.output_shape

            # Add batch dimension
            data_shape = tuple([0] + list(output_shape))

            data_group.data_storage = hdf5_file.create_earray(
                hdf5_file.root,
                data_label,
                tables.Float32Atom(),
                shape=data_shape,
                filters=filters,
                expectedrows=num_cases)

            # Naming convention is bad here, TODO, think about this.
            data_group.casename_storage = hdf5_file.create_earray(
                hdf5_file.root,
                '_'.join([data_label, 'casenames']),
                tables.StringAtom(256),
                shape=(0, 1),
                filters=filters,
                expectedrows=num_cases)
            data_group.affine_storage = hdf5_file.create_earray(
                hdf5_file.root,
                '_'.join([data_label, 'affines']),
                tables.Float32Atom(),
                shape=(0, 4, 4),
                filters=filters,
                expectedrows=num_cases)

        return hdf5_file
Example #11
def create_data_file(out_file,
                     n_channels,
                     n_samples,
                     image_shape,
                     storage_names=('data', 'truth', 'affine'),
                     affine_shape=(0, 4, 4),
                     normalize=True,
                     affine_dtype=tables.Float32Atom()):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(
        complevel=5
    )  #, complib='blosc')  # suggested remove in https://github.com/ellisdg/3DUnetCNN/issues/58
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))

    if not normalize:
        data_storage = hdf5_file.create_earray(hdf5_file.root,
                                               storage_names[0],
                                               tables.Int8Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=n_samples)
    else:
        data_storage = hdf5_file.create_earray(hdf5_file.root,
                                               storage_names[0],
                                               tables.Float32Atom(),
                                               shape=data_shape,
                                               filters=filters,
                                               expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root,
                                            storage_names[1],
                                            tables.UInt8Atom(),
                                            shape=truth_shape,
                                            filters=filters,
                                            expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root,
                                             storage_names[2],
                                             affine_dtype,
                                             shape=affine_shape,
                                             filters=filters,
                                             expectedrows=n_samples)
    if len(storage_names) == 4:
        normalization_storage = hdf5_file.create_earray(hdf5_file.root,
                                                        storage_names[3],
                                                        tables.Float32Atom(),
                                                        shape=(0, 2),
                                                        filters=filters,
                                                        expectedrows=n_samples)
        # will hold mean and std of this case for later normalization
        return hdf5_file, data_storage, truth_storage, affine_storage, normalization_storage
    return hdf5_file, data_storage, truth_storage, affine_storage
Example #12
    def save(self, filename: str, mode: str = 'h5', **kwargs):
        """

        :param filename:
        :param mode:
        :param kwargs:
        :return:
        """
        if mode == 'h5':
            compression = tables.Filters(complib='zlib',
                                         shuffle=True,
                                         complevel=1)
            h5handle = tables.open_file(filename,
                                        mode="w",
                                        title="Test file",
                                        filters=compression)
            h5handle.create_array('/',
                                  'topology',
                                  np.array(json.dumps(
                                      self.dye.dye_definition)).reshape(1),
                                  shape=(1, ))
            h5handle.create_earray(where='/',
                                   name='coordinates',
                                   atom=tables.Float32Atom(),
                                   shape=(0, self.dye.n_atoms, 3))
            h5handle.create_earray(where='/',
                                   name='time',
                                   atom=tables.Float32Atom(),
                                   shape=(0, ))
            h5handle.create_group(where='/', name='fluorescence')
            h5handle.create_earray(where='/fluorescence/',
                                   name='quencher_distance',
                                   atom=tables.Float32Atom(),
                                   shape=(0, ))
            # set units
            h5handle.root.time.set_attr('units', 'picoseconds')
            h5handle.root.coordinates.set_attr('units', 'angstroms')
            h5handle.root.fluorescence.quencher_distance.set_attr(
                'units', 'angstroms')

            h5handle.root.coordinates.append(self.xyz)
            h5handle.root.time.append(self.time_axis)
            h5handle.close()
        elif mode == 'xyz':
            skip = kwargs.get('skip', 1)
            coordinates = self.xyz[::skip]
            n_frames = coordinates.shape[0]
            coordinates = coordinates.reshape(n_frames, 3)
            chisurf.fio.coordinates.write_xyz(filename, coordinates)
        elif mode == 'npy':
            np.save(filename, self.xyz)
Example #13
    def createHDF5File(self):
        out_file_path = os.path.join(self._output_path, self._output_file_name)
        try:
            hdf5_file = tables.open_file(out_file_path, mode='w')
            filters = tables.Filters(complevel=5, complib='blosc')
            data_shape = tuple([0, self.num_modalities] +
                               list(self._image_shape))
            data_storage = hdf5_file.create_earray(
                hdf5_file.root,
                'data',
                tables.Float32Atom(),
                shape=data_shape,
                filters=filters,
                expectedrows=self.num_modalities)
            if self.label_format == "nii":
                truth_shape = tuple([0, 1] + list(self._image_shape))
                truth_storage = hdf5_file.create_earray(
                    hdf5_file.root,
                    'truth',
                    tables.UInt8Atom(),
                    shape=truth_shape,
                    filters=filters,
                    expectedrows=self.num_modalities)
            elif self.label_format == 'csv':
                truth_shape = tuple([0, self._image_shape[-1]])
                truth_storage = hdf5_file.create_earray(
                    hdf5_file.root,
                    'truth',
                    tables.UInt32Atom(),
                    shape=truth_shape,
                    filters=filters,
                    expectedrows=self.num_modalities)
            else:
                raise ValueError("Failed to recognize label format: %s" %
                                 self.label_format)

            affine_storage = hdf5_file.create_earray(
                hdf5_file.root,
                'affine',
                tables.Float32Atom(),
                shape=(0, 4, 4),
                filters=filters,
                expectedrows=self.num_modalities)
            return hdf5_file, data_storage, truth_storage, affine_storage
        except Exception as e:
            # If something goes wrong, delete the incomplete data file
            os.remove(out_file_path)
            raise e
Example #14
def main():
    here = os.path.abspath(os.path.dirname(__file__))
    data_dir = os.path.abspath(os.path.join(here, '..', 'data'))
    file_path = os.path.join(data_dir, 'pytables-earray.h5')

    # One (and only one) of the shape dimensions *must* be 0. The dimension
    # being 0 means that the resulting EArray object can be extended along it.
    # Multiple enlargeable dimensions are not supported right now.
    shape = (0, 300)

    # An EArray contains homogeneous data. Every atomic object (i.e. every
    # single element) has the same type and shape.
    atom = tb.Float32Atom()
    # An EArray supports compression
    filters = tb.Filters(complevel=5, complib='zlib')

    with tb.open_file(file_path, 'w') as f:
        # create an empty EArray
        earray = f.create_earray(where='/',
                                 name='Array0',
                                 atom=atom,
                                 shape=shape,
                                 title='My EArray',
                                 filters=filters)

        # number of times that we need to write some data
        num = 100
        for i in range(num):
            rows = np.random.randint(low=10, high=100)
            cols = shape[1]
            # define some data
            sequence = np.random.random((rows, cols)).astype('float32')
            # append the data to the EArray
            earray.append(sequence=sequence)
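Reading the enlargeable array back is symmetric; a short sketch (the path here is an approximation of the one built in main() above):

import os
import tables as tb

file_path = os.path.join('..', 'data', 'pytables-earray.h5')
with tb.open_file(file_path, 'r') as f:
    earray = f.root.Array0
    print(earray.shape)       # (total appended rows, 300)
    first_rows = earray[:10]  # slicing reads only the chunks it needs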
Example #15
    def insert_embeddings_pytables(self):
        try:
            self.get_model()
            self.model.init_model()
            self.model.load()

            embeds_file = tables.open_file(os.path.join(
                cnt.DATA_FOLDER, cnt.SIAMESE_EMBEDDINGS_FILE),
                                           mode='w')
            atom = tables.Float32Atom()
            embeds_arr = embeds_file.create_earray(
                embeds_file.root, 'data', atom,
                (0, cnt.SIAMESE_EMBEDDING_SIZE))

            sent_tokens_file = tables.open_file(os.path.join(
                cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE),
                                                mode='r')
            sent_tokens = sent_tokens_file.root.data

            n, batch_size = len(sent_tokens), cnt.PYTABLES_INSERT_BATCH_SIZE
            num_batches = int(math.ceil(float(n) / batch_size))

            for m in range(num_batches):
                start, end = m * batch_size, min((m + 1) * batch_size, n)
                tokens_arr_input = gutils.get_wv_siamese(
                    self.wv_model, sent_tokens[start:end, :])
                embeds = self.model.get_embeddings(tokens_arr_input)
                embeds_arr.append(embeds)

        finally:
            sent_tokens_file.close()
            embeds_file.close()
Example #16
def add_group_hdf5(save_path, group, expected_shapes, where='/', names=None):
    """
    Adds a new group to an HDF5 archive, with the x train, y train, x test, and y test data stored separately.
    expected_shapes should be a list of length 4, one shape matching each of these data subsets, unless you
    override the names parameter.

    Parameters
    ----------
    save_path <string> : The physical path to the archive file
    group <string> : The name for the group you are adding
    expected_shapes <list> : A list of expected shapes for the data that will be added here. It doesn't have to be exact
    where <string> : The root hierarchical path that you want to add the group to
    names <list> : A list of strings for the names of the subsets of the groups

    Returns
    -------
    hdf5_file, [data] : The HDF5 file that was opened and a list of arrays for each of the datasets that were added.

    """
    names = names if names else ["x_train", "y_train", "x_test", "y_test"]
    hdf5_file = tables.open_file(save_path, mode='a')
    h_comp = tables.Filters(complevel=5, complib='blosc')
    h_group = hdf5_file.create_group(where, group, group)
    h_data = []
    for k, shape in zip(names, expected_shapes):
        h_data.append(hdf5_file.create_earray(h_group, k,
                                              tables.Float32Atom(),
                                              shape=(0, shape[1]),
                                              filters=h_comp,
                                              expectedrows=shape[0]))
    return hdf5_file, h_data
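A hedged usage sketch for add_group_hdf5; the archive path, group name, and shapes below are placeholders:

import numpy as np

expected_shapes = [(1000, 64), (1000, 1), (200, 64), (200, 1)]
hdf5_file, h_data = add_group_hdf5('archive.h5', 'experiment_1', expected_shapes)

# each returned EArray starts empty; rows are added with append()
h_data[0].append(np.random.rand(100, 64).astype('float32'))  # x_train
h_data[1].append(np.random.rand(100, 1).astype('float32'))   # y_train
h_data[2].append(np.random.rand(20, 64).astype('float32'))   # x_test
h_data[3].append(np.random.rand(20, 1).astype('float32'))    # y_test
hdf5_file.close()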
Example #17
def add_transformed(save_path, group, buffer=1000, where='/'):
    """
    Takes a group and normalizes it for training. This is done quietly and only transformed groups are used for
    training networks.

    Parameters
    ----------
    save_path <string> : The physical path to the archive file
    group <string> : The name for the group you are adding
    buffer <int> : an integer defining the number of data points to load into memory at a time.
    where <string> : The root hierarchical path that you want to add the group to

    """
    hdf5_file = tables.open_file(save_path, mode='a')
    parent = hdf5_file.get_node(where+group)
    data = (parent.x_train, parent.y_train, parent.x_test, parent.y_test)
    shapes = map(lambda x: x.shape, data)
    h_comp = tables.Filters(complevel=5, complib='blosc')
    h_group = hdf5_file.create_group(where+group, 'transformed', 'Data scaled to Gaussian distribution')
    h_data = []
    for k, shape in zip(["x_train", "y_train", "x_test", "y_test"], shapes):
        h_data.append(hdf5_file.create_carray(h_group, k,
                                              tables.Float32Atom(),
                                              shape=shape,
                                              filters=h_comp))
    scale = tr.get_transform(data[0])
    for i, d in enumerate(data):
        for j in range(int(math.ceil(d.shape[0] / buffer))):
            if i%2 == 1:
                h_data[i][j*buffer:(j+1)*buffer] = d[j*buffer:(j+1)*buffer]
            else:
                h_data[i][j * buffer:(j + 1) * buffer] = scale.transform(d[j * buffer:(j + 1) * buffer])
    hdf5_file.flush()
    hdf5_file.close()
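Following the sketch under Example #16, add_transformed can then be pointed at the same archive and group; this assumes the group already holds the four subsets and that the external tr.get_transform returns a fitted scaler:

import tables

add_transformed('archive.h5', 'experiment_1', buffer=1000)

with tables.open_file('archive.h5', 'r') as f:
    x_train_scaled = f.get_node('/experiment_1/transformed/x_train')
    print(x_train_scaled.shape)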
Example #18
def open_h5_files() -> tuple:

    float_atom = tables.Float32Atom()
    int_atom = tables.Int32Atom()

    fd_m = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER, "all.h5"),
                            mode="w")
    data_m = fd_m.create_earray(fd_m.root,
                                "data",
                                float_atom,
                                (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                expectedrows=600000)
    label_m = fd_m.create_earray(fd_m.root,
                                 "labels",
                                 int_atom, (0, 1),
                                 expectedrows=600000)

    fd_t = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER, "all.h5"),
                            mode="w")

    data_t = fd_t.create_earray(
        fd_t.root,
        "data",
        float_atom, (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
        expectedrows=600000)
    label_t = fd_t.create_earray(fd_t.root,
                                 "labels",
                                 int_atom, (0, 1),
                                 expectedrows=600000)

    fd_m_test = tables.open_file(os.path.join(MATRIX_DATASET_FOLDER,
                                              "all_test.h5"),
                                 mode="w")
    # the test-set arrays are created through their own file handles
    data_m_test = fd_m_test.create_earray(fd_m_test.root,
                                          "data",
                                          float_atom,
                                          (0, MATRIX_DIMENSION, MATRIX_DIMENSION),
                                          expectedrows=60000)
    label_m_test = fd_m_test.create_earray(fd_m_test.root,
                                           "labels",
                                           int_atom, (0, 1),
                                           expectedrows=60000)

    fd_t_test = tables.open_file(os.path.join(TENSOR_DATASET_FOLDER,
                                              "all_test.h5"),
                                 mode="w")
    data_t_test = fd_t_test.create_earray(
        fd_t_test.root,
        "data",
        float_atom, (0, TENSOR_DIMENSION, TENSOR_DIMENSION, TENSOR_DIMENSION),
        expectedrows=60000)
    label_t_test = fd_t_test.create_earray(fd_t_test.root,
                                           "labels",
                                           int_atom, (0, 1),
                                           expectedrows=60000)

    return ((fd_m, data_m, label_m),
            (fd_t, data_t, label_t),
            (fd_m_test, data_m_test, label_m_test),
            (fd_t_test, data_t_test, label_t_test))
Example #19
    def init_hdf5(path, shapes):
        """
        Initialize hdf5 file to be used by a dataset
        """

        x_shape, y_shape = shapes
        # make pytables
        ensure_tables()
        h5file = tables.openFile(path, mode="w", title="SVHN Dataset")
        gcolumns = h5file.createGroup(h5file.root, "Data", "Data")
        atom = (tables.Float32Atom()
                if config.floatX == 'float32' else tables.Float64Atom())
        filters = DenseDesignMatrixPyTables.filters
        h5file.createCArray(gcolumns,
                            'X',
                            atom=atom,
                            shape=x_shape,
                            title="Data values",
                            filters=filters)
        h5file.createCArray(gcolumns,
                            'y',
                            atom=atom,
                            shape=y_shape,
                            title="Data targets",
                            filters=filters)
        return h5file, gcolumns
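init_hdf5 leans on pylearn2/theano helpers (ensure_tables, config.floatX, DenseDesignMatrixPyTables.filters); a rough sketch of the pieces it assumes, illustrative rather than pylearn2's exact code:

import tables

class _Config(object):
    floatX = 'float32'  # stand-in for theano's config.floatX

config = _Config()

def ensure_tables():
    """In pylearn2 this lazily imports PyTables; treated as a no-op here."""
    pass

class DenseDesignMatrixPyTables(object):
    # assumed default compression settings for the X/y CArrays
    filters = tables.Filters(complib='blosc', complevel=5)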
Example #20
def create_data_file(out_file, n_samples, image_shape, modality_names):
    #     pdb.set_trace()
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    modality_shape = tuple([0, 1] + list(image_shape))
    brain_width_shape = (0, 2, 3)

    modality_storage_list = [
        hdf5_file.create_earray(hdf5_file.root,
                                modality_name,
                                tables.Float32Atom(),
                                shape=modality_shape,
                                filters=filters,
                                expectedrows=n_samples)
        for modality_name in modality_names
    ]

    brain_width_storage = hdf5_file.create_earray(hdf5_file.root,
                                                  'brain_width',
                                                  tables.UInt8Atom(),
                                                  shape=brain_width_shape,
                                                  filters=filters,
                                                  expectedrows=n_samples)

    return hdf5_file, modality_storage_list, brain_width_storage
Example #21
    def init_hdf5(self, path, shapes):
        """
        .. todo::

            WRITEME properly

        Initialize hdf5 file to be used by a dataset
        """

        x_shape, y_shape = shapes
        # make pytables
        ensure_tables()
        h5file = tables.openFile(path, mode="w", title="SVHN Dataset")
        gcolumns = h5file.createGroup(h5file.root, "Data", "Data")
        atom = (tables.Float32Atom()
                if config.floatX == 'float32' else tables.Float64Atom())
        h5file.createCArray(gcolumns,
                            'X',
                            atom=atom,
                            shape=x_shape,
                            title="Data values",
                            filters=self.filters)
        h5file.createCArray(gcolumns,
                            'y',
                            atom=atom,
                            shape=y_shape,
                            title="Data targets",
                            filters=self.filters)
        return h5file, gcolumns
Example #22
    def create_VLFloatArray(self, name, array, group):
        """Stores a homogeneous variable-length float array in a group"""
        self.h5file.create_vlarray(group,
                                   name,
                                   tables.Float32Atom(),
                                   "ragged array of floats",
                                   chunkshape=512)
Example #23
def create_image_data():
    try:
        img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='w')
        atom = tables.Float32Atom()
        img_arr = img_arr_file.create_earray(
            img_arr_file.root, 'data', atom,
            (0, cnt.IMAGE_SIZE, cnt.IMAGE_SIZE, 3))

        chunk_size, labels = 5000, []
        for df_chunk in pd.read_csv(cnt.OUTPUT_FILE_PATH,
                                    chunksize=chunk_size):
            df = df_chunk[list(
                df_chunk['image_path'].apply(lambda x: os.path.exists(x)))]
            print(df.shape)
            labels += list(df['age_group'])
            file_paths = list(df['image_path'])
            img_arr.append([
                img_to_array(
                    load_img(image).convert('RGB').resize(
                        (cnt.IMAGE_SIZE, cnt.IMAGE_SIZE))) / 255.0
                for image in file_paths
            ])

        shutils.save_data_pkl(labels, cnt.LABELS_PATH)
    finally:
        img_arr_file.close()
Example #24
   def interpolate(self, facetlistfile):
      
      """
      """
      
      #facetdbname = os.path.join(self.globaldb, 'facets')
      #os.system( 'makesourcedb in=%s out=%s append=False' % (facetlistfile, facetdbname) )
      
      #patch_table = pt.table( os.path.join(facetdbname, 'SOURCES', 'PATCHES' ) )
      
      #if 'facets' in self.hdf5.root: self.hdf5.root.facets.remove()
      #description = {'name': tables.StringCol(40), 'position':tables.Float64Col(2)}
      #self.facets = self.hdf5.createTable(self.hdf5.root, 'facets', description)

      #facet = self.facets.row
      #for patch in patch_table :
         #facet['name'] = patch['PATCHNAME']
         #facet['position'] = array([patch['RA'], patch['DEC']])
         #facet.append()
      #self.facets.flush()
      self.N_facets = len(self.facets)
      
      self.facet_names = self.facets[:]['name']
      self.facet_positions = self.facets[:]['position']

      print(self.n_list)
      if 'STEC_facets' in self.hdf5.root: self.hdf5.root.STEC_facets.remove()
      self.STEC_facets = self.hdf5.createCArray(self.hdf5.root, 'STEC_facets', tables.Float32Atom(), shape = (self.N_pol, self.n_list[:].shape[0],  self.N_facets, self.N_stations))

      #if 'facet_piercepoints' in self.hdf5.root: self.hdf5.root.facet_piercepoints.remove()
      #description = {'positions':tables.Float64Col((self.N_facets, self.N_stations,2)), \
                     #'positions_xyz':tables.Float64Col((self.N_facets, self.N_stations,3)), \
                     #'zenith_angles':tables.Float64Col((self.N_facets, self.N_stations))}
      #self.facet_piercepoints = self.hdf5.createTable(self.hdf5.root, 'facet_piercepoints', description)
      #height = self.piercepoints.attrs.height
      #facet_piercepoints_row = self.facet_piercepoints.row
      #print "Calculating facet piercepoints..."
      #for n in self.n_list:
         #piercepoints = PiercePoints( self.times[ n ], self.pointing, self.array_center, self.facet_positions, self.station_positions, height = height )
         #facet_piercepoints_row['positions'] = piercepoints.positions
         #facet_piercepoints_row['positions_xyz'] = piercepoints.positions_xyz
         #facet_piercepoints_row['zenith_angles'] = piercepoints.zenith_angles
         #facet_piercepoints_row.append()
      #self.facet_piercepoints.flush()

      r_0 = self.TECfit_white.attrs.r_0
      beta = self.TECfit_white.attrs.beta
      
      for facet_idx in range(self.N_facets) :
         for station_idx in range(self.N_stations):
            for pol_idx in range(self.N_pol) :
               TEC_list = []
               for n in range(len(self.n_list)):
                  p = self.facet_piercepoints[n]['positions_xyz'][facet_idx, station_idx,:]
                  za = self.facet_piercepoints[n]['zenith_angles'][facet_idx, station_idx]
                  Xp_table = reshape(self.piercepoints[n]['positions_xyz'], (self.N_piercepoints, 3) )
                  v = self.TECfit_white[ pol_idx, n, :, : ].reshape((self.N_piercepoints,1))
                  D2 = sum((Xp_table - p)**2,1)
                  C = (D2 / ( r_0**2 ) )**( beta / 2. ) / -2.
                  self.STEC_facets[pol_idx, n,  facet_idx, station_idx] = dot(C, v)/cos(za)
Example #25
def get_cnn_features(image_list, split, batch_size, relu=False):
    hdf5_path = "%s-%s" % (split, "cnn_features.hdf5")
    hdf5_file = tables.open_file(hdf5_path, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_storage = hdf5_file.createEArray(hdf5_file.root,
                                          'feats',
                                          tables.Float32Atom(),
                                          shape=(0, 100352),
                                          filters=filters,
                                          expectedrows=len(image_list))
    for start, end in zip(
            range(0,
                  len(image_list) + batch_size, batch_size),
            range(batch_size,
                  len(image_list) + batch_size, batch_size)):
        print("Processing %s images %d-%d / %d" %
              (split, start, end, len(image_list)))
        batch_list = image_list[start:end]
        feats = cnn.get_features(batch_list,
                                 layers='conv5_4',
                                 layer_sizes=[512, 14, 14])
        # transpose and flatten feats to prepare for reshape(14*14, 512)
        feats = np.array([f.T.flatten() for f in feats])
        if relu:
            feats = np.clip(feats, a_min=0., a_max=np.inf, out=feats)  # RELU
        data_storage.append(feats)
    print("Finished processing %d images" % len(data_storage))
    hdf5_file.close()
Example #26
def create_hdf5_file(output_filepath, data_groups, data_collection):

    # Investigate hdf5 files.
    hdf5_file = tables.open_file(output_filepath, mode='w')

    # Investigate this line.
    # Compression levels = complevel. No compression = 0
    # Compression library = Method of compresion.
    filters = tables.Filters(complevel=5, complib='blosc')

    data_storages = []

    for data_group_label in data_groups:

        data_group = data_collection.data_groups[data_group_label]

        num_cases, output_shape = data_group.get_augment_num_shape()
        modalities = data_group.get_modalities()

        # Input data has multiple 'channels' i.e. modalities.
        data_shape = tuple([0, modalities] + list(output_shape))
        print(data_group.label, data_shape)
        data_group.data_storage = hdf5_file.create_earray(
            hdf5_file.root,
            data_group.label,
            tables.Float32Atom(),
            shape=data_shape,
            filters=filters,
            expectedrows=num_cases)

    return hdf5_file
Example #27
    def insert_embeddings_pytables(self, batch_size=25000):
        try:
            self.load()

            embeds_file = tables.open_file('data/w2v_embeddings.h5', mode='w')
            atom = tables.Float32Atom()
            embeds_arr = embeds_file.create_earray(embeds_file.root, 'data',
                                                   atom,
                                                   (0, self.embedding_size))

            tokens_file = tables.open_file('data/sent_tokens.h5', mode='r')
            sent_tokens = tokens_file.root.data

            sent_tokens = [[w.decode('utf-8') for w in tokens]
                           for tokens in sent_tokens]

            n = len(sent_tokens)
            num_batches = int(math.ceil(float(n) / batch_size))

            vocabulary = [
                word
                for word, index in self.tfidf_vectorizer.vocabulary_.items()
            ]

            for m in range(num_batches):
                start, end = m * batch_size, min((m + 1) * batch_size, n)
                matrix = self.tfidf_vectorizer.transform(
                    sent_tokens[start:end])
                embeds = self.get_weighted_sentence_vectors(
                    matrix, vocabulary, end - start)
                embeds_arr.append(embeds)

        finally:
            tokens_file.close()
            embeds_file.close()
Example #28
def create_hdf5_file(output_filepath,
                     num_cases,
                     output_sizes,
                     preloaded=False):
    """ Creates a multi-tiered HDF5 file at each resolution provided in 'output_sizes'.
        Also stores string filepaths associated with the data.

        Big credit to https://github.com/ellisdg/3DUnetCNN for bringing HDF5 into
        my life.
    """

    hdf5_file = tables.open_file(output_filepath, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')

    hdf5_file.create_earray(hdf5_file.root,
                            'imagenames',
                            tables.StringAtom(256),
                            shape=(0, 1),
                            filters=filters,
                            expectedrows=num_cases)

    for output_size in output_sizes:
        hdf5_file.create_earray(hdf5_file.root,
                                'data_' + str(output_size[0]),
                                tables.Float32Atom(),
                                shape=(0, ) + output_size,
                                filters=filters,
                                expectedrows=num_cases)

    return hdf5_file
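A small usage sketch of the multi-resolution writer above, with invented sizes and file name:

import numpy as np

output_sizes = [(64, 64, 3), (32, 32, 3)]
hdf5_file = create_hdf5_file('patches.h5', num_cases=100, output_sizes=output_sizes)

# one case appended at each resolution, plus its file name
hdf5_file.root.data_64.append(np.zeros((1, 64, 64, 3), dtype='float32'))
hdf5_file.root.data_32.append(np.zeros((1, 32, 32, 3), dtype='float32'))
hdf5_file.root.imagenames.append(np.array([['case_0001.png']], dtype='S256'))
hdf5_file.close()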
Example #29
    def dump_test_set(self, h5filepath, nframes, framesize):
        # set rng to a hardcoded state, so we always have the same test set!
        self.numpy_rng.seed(1)
        with tables.openFile(h5filepath, 'w') as h5file:

            h5file.createArray(h5file.root, 'test_targets',
                               self.partitions['test']['targets'])

            vids = h5file.createCArray(h5file.root,
                                       'test_images',
                                       tables.Float32Atom(),
                                       shape=(10000, nframes, framesize,
                                              framesize),
                                       filters=tables.Filters(complevel=5,
                                                              complib='zlib'))

            pos = h5file.createCArray(h5file.root,
                                      'test_pos',
                                      tables.UInt16Atom(),
                                      shape=(10000, nframes, 2),
                                      filters=tables.Filters(complevel=5,
                                                             complib='zlib'))
            for i in range(100):
                print(i)
                (vids[i * 100:(i + 1) * 100], pos[i * 100:(i + 1) * 100],
                 _) = self.get_batch('test',
                                     100,
                                     nframes,
                                     framesize,
                                     idx=np.arange(i * 100, (i + 1) * 100))
                h5file.flush()
Example #30
    def resize(h5file, start, stop):
        ensure_tables()
        # TODO is there any smarter and more efficient way to this?

        data = h5file.getNode('/', "Data")
        try:
            gcolumns = h5file.createGroup('/', "Data_", "Data")
        except tables.exceptions.NodeError:
            h5file.removeNode('/', "Data_", 1)
            gcolumns = h5file.createGroup('/', "Data_", "Data")

        start = 0 if start is None else start
        stop = data.X.nrows if stop is None else stop

        atom = tables.Float32Atom() if config.floatX == 'float32' else tables.Float64Atom()
        filters = DenseDesignMatrixPyTables.filters
        x = h5file.createCArray(gcolumns, 'X', atom=atom,
                                shape=(stop - start, data.X.shape[1]),
                                title="Data values", filters=filters)
        y = h5file.createCArray(gcolumns, 'y', atom=atom,
                                shape=(stop - start, 10),
                                title="Data targets", filters=filters)
        x[:] = data.X[start:stop]
        y[:] = data.y[start:stop]

        h5file.removeNode('/', "Data", 1)
        h5file.renameNode('/', "Data", "Data_")
        h5file.flush()
        return h5file, gcolumns