def update(self, gene, weight, ref_allele, allele, dosage_row):
        if self.D is None:
            self.gene_list = self.get_gene_list()
            self.gene_index = {
                gene: k
                for (k, gene) in enumerate(self.gene_list)
            }

            self.n_genes = len(self.gene_list)
            self.n_samples = len(dosage_row)

            self.D_file = h5py_cache.File(self.output_binary_file,
                                          'w',
                                          chunk_cache_mem_size=self.cache_size)
            n_genes_chunk = np.min((self.n_genes, 10))
            self.D = self.D_file.create_dataset("pred_expr",
                                                shape=(self.n_genes,
                                                       self.n_samples),
                                                chunks=(n_genes_chunk,
                                                        self.n_samples),
                                                dtype=np.dtype('float32'),
                                                scaleoffset=4,
                                                compression='gzip')

        if gene in self.gene_index:  # assumes dosages are coded 0 to 2 (effect-allele counts)
            # assumes non-ambiguous SNPs, so strand issues can be resolved via the allele complement:
            if ref_allele == allele or self.complements[ref_allele] == allele:
                self.D[self.gene_index[gene], :] += dosage_row * weight
            else:
                # alleles are swapped relative to the model: flip the dosage before weighting
                self.D[self.gene_index[gene], :] += (
                    2 - dosage_row) * weight  # update all samples for that gene
    def __init__(self,
                 data_file_pth,
                 chunk_cache_mem_size_bytes=20 * 1024 ** 3,
                 desired_chunk_size_bytes=0.1 * 1024 ** 2,
                 compression='lzf'):
        """

        :param data_file_pth: Path to the hdf5 file
        :param chunk_cache_mem_size_bytes: HDF5 chunk cache size. Larger the better. Default is 20GiB
        :param desired_chunk_size_bytes: Chunk size for individual chunks. h5py docs recommends keeping this between
            10 KiB and 1 MiB. Default is 0.1 MiB. Pass in -1 to switch to h5py automagic chunk size.
        """
        import h5py
        import h5py_cache

        self.h5py = h5py

        self.desired_chunk_size_bytes = desired_chunk_size_bytes
        if not os.path.exists(data_file_pth):
            self.f = h5py.File(data_file_pth, 'w', libver='latest')
        else:
            self.f = h5py_cache.File(
                data_file_pth,
                'r',
                chunk_cache_mem_size=chunk_cache_mem_size_bytes,
                libver='latest',
                w0=0.1,
                n_cache_chunks=int(chunk_cache_mem_size_bytes / desired_chunk_size_bytes))
        self.i = 0
        self.is_swmr_hdf_version = h5py.version.hdf5_version_tuple >= (1, 9, 178)
        self.compression = compression
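Worth noting about the snippet above: h5py_cache's job is simply to enlarge the HDF5 chunk cache at file-open time. Recent h5py releases (2.9 and later) expose the same settings directly on h5py.File, so a rough equivalent of the cached open, sketched here with a hypothetical path data.h5, would be:

import h5py

chunk_cache_bytes = 20 * 1024 ** 3            # cache size, as in the default above
desired_chunk_bytes = int(0.1 * 1024 ** 2)    # target chunk size, as above

f = h5py.File(
    "data.h5",                                # hypothetical path
    "r",
    libver="latest",
    rdcc_nbytes=chunk_cache_bytes,            # total chunk-cache size in bytes (chunk_cache_mem_size)
    rdcc_w0=0.1,                              # eviction preference (w0)
    rdcc_nslots=chunk_cache_bytes // desired_chunk_bytes,  # cache hash-table slots (n_cache_chunks plays a similar role)
)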
Example #3
    def __init__(self,
                 datapath,
                 dataset,
                 *nargs,
                 train_percentage=100.,
                 val_split=False,
                 **kwargs):
        if datapath not in self.refs:
            f = h5py_cache.File(datapath, chunk_cache_mem_size=1024**3 // 10)
            self.refs[datapath] = f

        length = len(self.refs[datapath][dataset])
        split = int(np.ceil(train_percentage / 100. * length))  # train_percentage is given in percent
        if val_split:
            start = split
            end = length
        else:
            start = 0
            end = split

        super(CachingHDF5Matrix, self).__init__(datapath,
                                                dataset,
                                                *nargs,
                                                start=start,
                                                end=end,
                                                **kwargs)

        # bugfix: ndim is not changed to reflect the normalizer's output
        if self.normalizer is not None:
            self._base_ndim = self.normalizer(self.data[0:1]).ndim
        else:
            self._base_ndim = self.data.ndim
Example #4
    def __init__(self, filename):
        '''Initialize by opening the file and finding all data groups. Currently
        only searches the /Data/Images group.

        Parameters
        ----------
        filename : str
            The file path to load.

        '''

        ## necessary declaration in case something goes wrong
        self.file_hdl = None
        self.metaDataJSON = None
        self.list_data = None

        # check for string
        if not isinstance(filename, str):
            raise TypeError('Filename is supposed to be a string!')

        # try opening the file
        try:
            #self.file_hdl = h5py.File(filename, 'r')
            self.file_hdl = h5py_cache.File(filename,
                                            'r',
                                            chunk_cache_mem_size=10 * 1024**2)
        except:
            print('Error opening file for readonly: "{}"'.format(filename))
            raise

        self._find_groups()
Example #5
    def extract_all_files(self):
        nd2_files = self.extract_all_meta()
        meta_handle = pandas_hdf5_handler(self.headpath + "/metadata.hdf5")
        meta_df = meta_handle.read_df("data", read_metadata=True)
        channels = meta_df.metadata["channels"]
        y_dim = meta_df.metadata["height"]
        x_dim = meta_df.metadata["width"]
        ttl_indices = len(meta_df)

        chunk_shape = (1, meta_df.metadata['height'],
                       meta_df.metadata['width'])
        chunk_bytes = 2 * np.multiply.accumulate(np.array(chunk_shape))[-1]  # uint16 -> 2 bytes per element
        chunk_cache_mem_size = 2 * chunk_bytes  # room for two chunks in the cache

        with h5py_cache.File(
                self.hdf5path + "/extracted.hdf5",
                "w",
                chunk_cache_mem_size=chunk_cache_mem_size) as h5pyfile:
            for c, channel in enumerate(channels):
                hdf5_dataset = h5pyfile.create_dataset(
                    str(channel), (ttl_indices, y_dim, x_dim),
                    chunks=chunk_shape,
                    dtype='uint16')
                for file_idx in meta_df["file_idx"].unique():
                    nd2path = self.headpath + "/" + nd2_files[file_idx]
                    with ND2Reader(nd2path) as nd2file:
                        file_df = meta_df[meta_df["file_idx"] == file_idx]
                        for idx, item in file_df.iterrows():
                            t = item["timepoint"]
                            v = item["fov"]
                            nd2_image = nd2file.get_frame_2D(c=c, t=t, v=v)
                            hdf5_dataset[idx, :, :] = nd2_image
Example #6
def run(args):
    if os.path.exists(args.output):
        logging.info("Output exists, delete it or move it if you want it generated again")
        return

    Utilities.ensure_requisite_folders(args.output)

    logging.info("Reading input")
    data = pandas.read_table(args.input)

    logging.info("Opening output")
    f = h5py_cache.File(args.output, 'w', chunk_cache_mem_size=int(50 * (1024 ** 2)))

    n_genes = data.shape[1]-2
    n_samples = data.shape[0]
    n_genes_chunk = np.min((n_genes, 10))

    logging.info("Processing expression")
    p = f.create_dataset("pred_expr", shape=(n_genes, n_samples),
                         chunks=(n_genes_chunk, n_samples),
                         dtype=np.dtype('float32'), scaleoffset=4, compression='gzip')
    g = f.create_dataset("genes", (n_genes,), dtype="S30")

    for i, gene in enumerate(data.columns.values[2:]):
        p[i, :] = data[gene].to_numpy()
        g[i] = np.string_(gene)

    logging.info("saving samples")
    s = f.create_dataset("samples", (n_samples,), dtype="S25")
    for i in range(0, n_samples):
        s[i] = np.string_(data["IID"][i])
    f.close()
    logging.info("Done")
Example #7
def _structure_file(file_path):
    logging.info("Acquiring HDF5 expression cache")
    HDF5_CACHE = int(60 * (1024 ** 2))
    file = h5py_cache.File(file_path, 'r', chunk_cache_mem_size=HDF5_CACHE, w0=1.0, dtype='float32')
    genes = [g for g in file['genes']]
    h5 = file['pred_expr']
    return genes, h5
Example #8
    def writehdf5(fovnum, num_entries, timepoint_list, file_idx, num_fovs):
        with ND2Reader(self.nd2filename) as nd2file:
            y_dim = self.metadata['height']
            x_dim = self.metadata['width']
            with h5py_cache.File(
                    self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5",
                    "w",
                    chunk_cache_mem_size=self.chunk_cache_mem_size
            ) as h5pyfile:
                for i, channel in enumerate(self.metadata["channels"]):
                    hdf5_dataset = h5pyfile.create_dataset(
                        str(channel), (num_entries, y_dim, x_dim),
                        chunks=self.chunk_shape, dtype='uint16')
                    # If Elements crashed, nd2reader does not index into the file
                    # correctly; this is a hard fix for that case.
                    if self.metadata["failed_file"]:
                        for j in range(len(timepoint_list)):
                            frame = timepoint_list[j]
                            nd2_image = nd2file.get_frame_2D(
                                c=i, t=0, v=fovnum + frame * num_fovs)
                            hdf5_dataset[j, :, :] = nd2_image
                    else:
                        for j in range(len(timepoint_list)):
                            frame = timepoint_list[j]
                            nd2_image = nd2file.get_frame_2D(c=i,
                                                             t=frame,
                                                             v=fovnum)
                            hdf5_dataset[j, :, :] = nd2_image
        return "Done."
Example #9
def train_new_top_model(all_features_hdf5, all_labels_hdf5):
    # The bottleneck (or other extracted image) features are stored in h5 files.
    # Use HDF5MatrixCacheIterator to read them. By default HDF5MatrixCacheIterator doesn't transform and doesn't shuffle,
    # so we can use the old ImageDataGenerator.

    datagen = ImageDataGenerator(validation_split=0.2)
    f1_trainvalidation = h5.File(all_features_hdf5, 'r')
    shape = f1_trainvalidation['data'].shape
    f1_trainvalidation.close()
    # 15 GiB is the upper limit for cache memory.
    total_mem_usage, dividing_factor = calculateDividingFactor(shape)

    #print("Dividing factor {}:".format(dividing_factor))
    print("Cached memory usage: {}".format(total_mem_usage / (1024**3) /
                                           dividing_factor))
    chunk_shape = (1, max(shape[1] // 2, 1), max(shape[2] // 2, 1),
                   max(shape[3] // 2, 1))
    f1_trainvalidation = h5c.File(all_features_hdf5,
                                  'r',
                                  chunk_cache_mem_size=total_mem_usage //
                                  dividing_factor)
    f1_label = h5.File(all_labels_hdf5, 'r')
    train_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                        f1_label['data'],
                                        subset='training',
                                        batch_size=batch_size,
                                        shuffle=False)
    validation_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                             f1_label['data'],
                                             subset='validation',
                                             batch_size=batch_size,
                                             shuffle=False)
    model = Sequential()
    model.add(Flatten(input_shape=shape[1:]))
    model.add(Dense(1024, activation='relu'))
    #model.add(Dense(256,input_shape=(train_data_shape[1:],),activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(85, activation='sigmoid'))

    #model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=[f1])
    model.fit_generator(
        generator=train_generator,
        steps_per_epoch=int(ceil(train_generator.samples / batch_size)),
        validation_data=validation_generator,
        max_queue_size=10,  # use a value such that batch_size * image_size * max_queue_size fits in CPU memory
        workers=4,  # multiple workers give little benefit without multithreading
        use_multiprocessing=False,  # HDF5Matrix cannot support multi-threads
        validation_steps=int(ceil(validation_generator.samples / batch_size)),
        shuffle=True,
        epochs=epochs)
    model.save_weights(top_model_weights_path)
Example #10
    def __init__(self, data_dir, subset='train', download=True):
        super().__init__()

        if download:
            Cifar10Dataset.download(data_dir)

        h5_file = os.path.join(data_dir, 'cifar-10.h5')
        self.h5_file = h5_file
        self.subset = subset
        with h5py_cache.File(h5_file, 'r', chunk_cache_mem_size=1024**3) as f:
            self.length = f[subset]['images'].shape[0]
Example #11
    def write_hdf5(self, filepath, dataset_id, vid_data):
        """
        Writes data to an hdf5 file

        :param filepath: filepath of hdf5
        :param dataset_id: id of dataset in hdf5
        :param vid_data: data to write
        :return: None
        """
        hdf5_lock = self.add_hdf5(filepath)
        with hdf5_lock:
            h5py_file = h5c.File(filepath, 'a', libver='latest', chunk_cache_mem_size=(1024**2)*16)
            h5py_file.create_dataset(dataset_id, data=vid_data, dtype='uint8')
            h5py_file.close()
Example #12
    def read_hdf5(self, filepath, dataset_id, start_frame, end_frame):
        """
        Reads data from an hdf5

        :param filepath: filepath of hdf5
        :param dataset_id: id of dataset in hdf5
        :param start_frame: frame to start from
        :param end_frame: frame to end before (exclusive)
        :return: data
        """
        hdf5_lock = self.add_hdf5(filepath)
        with hdf5_lock:
            h5py_file = h5c.File(filepath, 'r', libver='latest', chunk_cache_mem_size=(1024**2)*16)
            dataset = h5py_file[dataset_id]
            return dataset[start_frame: end_frame]
Example #13
    def writehdf5(fovnum, num_entries, timepoint_list, file_idx, num_fovs):
        with ND2Reader(self.nd2filename) as nd2file:
            for key, item in self.nd2reader_override.items():
                nd2file.metadata[key] = item
            y_dim = self.metadata['height']
            x_dim = self.metadata['width']
            with h5py_cache.File(self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5", "w",
                                 chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
                for i, channel in enumerate(self.metadata["channels"]):
                    hdf5_dataset = h5pyfile.create_dataset(
                        str(channel), (num_entries, y_dim, x_dim),
                        chunks=self.chunk_shape, dtype='uint16')
                    for j in range(len(timepoint_list)):
                        frame = timepoint_list[j]
                        nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                        hdf5_dataset[j, :, :] = nd2_image
        return "Done."
Example #14
    def __getitem__(self, index):
        subset = self.subset

        with h5py_cache.File(self.h5_file, 'r', chunk_cache_mem_size=1024**3) as f:
            image_bytes = torch.from_numpy(f[subset]['images'][index])
            label = torch.from_numpy(f[subset]['labels'][index])

        image = (image_bytes.float() * 2) / 255 - 1

        sample = {
            'input': image,
            'label': label,
        }

        return sample
Example #15
    def init_hdf5(self,
                  file_name,
                  dataset_name,
                  array,
                  t_len,
                  t_dim_out,
                  dtype='uint16',
                  singleton_chunk_dims=[]):
        """Initializes an empty hdf5 file and dataset to write to, given an array
        with the target shape in all axes but the time axis. The time axis
        is then specified by t_len.

        Args:
            file_name (str): Name of the hdf5 file, assumed to be in the temp folder
            initialized by this class.
            dataset_name (str): The name of the hdf5 dataset to initialize.
            array (array): Array which is of the same size as the dataset,
            except in the time dimension.
            t_len (int): Total size of the dataset time dimension.
            t_dim_out (int): Axis of the dataset time dimension.

            dtype(str, optional): Specifies the array datatype to initialize an
            hdf5 file for. A 16 bit unsigned integer by default.
        """
        out_shape = list(array.shape)
        out_shape[t_dim_out] = t_len
        out_shape = tuple(out_shape)

        chunk_shape = np.array(list(out_shape))

        min_arr = np.ones(chunk_shape.shape, dtype=int) * self.img_chunk_size
        chunk_shape = np.minimum(chunk_shape, min_arr)
        chunk_shape[t_dim_out] = 1

        if singleton_chunk_dims != []:
            for dim in singleton_chunk_dims:
                chunk_shape[dim] = 1

        chunk_shape = tuple(chunk_shape)

        with h5py_cache.File(
                self.temp_path + file_name + ".hdf5",
                "a",
                chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
            hdf5_dataset = h5pyfile.create_dataset(dataset_name,
                                                   out_shape,
                                                   chunks=chunk_shape,
                                                   dtype=dtype)
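A self-contained sketch of the pattern init_hdf5 implements: create the dataset once with a chunk size of 1 along the time axis and capped spatial chunks, then fill it one timepoint at a time (the write_hdf5 example further down follows the same pattern). File name, chunk cap, and dimensions here are assumptions for illustration:

import numpy as np
import h5py_cache

t_len, y_dim, x_dim = 100, 512, 512
img_chunk_size = 128                                   # assumed per-axis chunk cap
chunk_shape = (1, min(y_dim, img_chunk_size), min(x_dim, img_chunk_size))

with h5py_cache.File("temp.hdf5", "a",
                     chunk_cache_mem_size=2 * 2 * y_dim * x_dim) as h5pyfile:  # two full uint16 frames
    dset = h5pyfile.create_dataset("stack", (t_len, y_dim, x_dim),
                                   chunks=chunk_shape, dtype="uint16")
    for t in range(t_len):                             # write slab by slab along time
        dset[t, :, :] = np.random.randint(0, 2**16, size=(y_dim, x_dim),
                                          dtype=np.uint16)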
Example #16
    def writehdf5(fidx_channels_paths):
        y_dim = self.metadata['height']
        x_dim = self.metadata['width']
        num_channels = len(self.metadata["channels"])

        file_idx, channels, filepaths = fidx_channels_paths
        datasets = {}
        with h5py_cache.File(self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5", "w",
                             chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
            for i, channel in enumerate(self.metadata["channels"]):
                # integer division: each channel gets one frame per num_channels files
                hdf5_dataset = h5pyfile.create_dataset(
                    str(channel), (len(filepaths) // num_channels, y_dim, x_dim),
                    chunks=self.chunk_shape, dtype='uint16')
                datasets[channel] = hdf5_dataset
            for i in range(len(filepaths)):
                curr_channel = channels[i]
                curr_file = filepaths[i]
                datasets[curr_channel][i // num_channels, :, :] = imread(curr_file)
        return "Done."
Example #17
    def extract_fov(self, fovnum):
        nd2file = ND2Reader(self.nd2filename)
        num_fovs = len(nd2file.metadata["fields_of_view"])

        with h5py_cache.File(
                self.hdf5path + "/fov_" + str(fovnum) + ".hdf5",
                "w",
                chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
            for i, channel in enumerate(nd2file.metadata["channels"]):
                y_dim = nd2file.metadata['height']
                x_dim = nd2file.metadata['width']
                t_dim = len(nd2file.metadata['frames'])
                hdf5_dataset = h5pyfile.create_dataset("channel_" + str(channel),\
                                (x_dim,y_dim,t_dim), chunks=self.chunk_shape, dtype='uint16')
                for frame in range(len(nd2file.metadata['frames'])):
                    nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                    hdf5_dataset[:, :, int(frame)] = nd2_image
        nd2file.close()
Example #18
    def write_hdf5(self, file_name, array, ti, t_len, t_dim_out, dataset_name):
        """Writes an array to a particular dataset in an hdf5 file. Positions
        in time are left variable to enable chunking the dataset in time.

        Args:
            file_name (str): Name of the hdf5 file, assumed to be in the temp
                folder initialized by this class.
            array (array): Array to be written.
            ti (int): Initial time position to write array values to.
            t_len (int): Total size of the target time dimension.
            t_dim_out (int): Axis of the target time dimension.
            dataset_name (str): The name of the hdf5 dataset to write to.
        """
        with h5py_cache.File(
                self.temp_path + file_name + ".hdf5",
                "r+",
                chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
            indices = list(range(ti, min(ti + self.t_chunk, t_len)))
            self.reassign_idx(h5pyfile[dataset_name], array, indices, t_dim_out)
Example #19
        def writehdf5(fovnum, num_entries, timepoint_list, file_idx):
            with ND2Reader(self.nd2filename) as nd2file:
                y_dim = self.metadata['height']
                x_dim = self.metadata['width']
                with h5py_cache.File(
                        self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5",
                        "w",
                        chunk_cache_mem_size=self.chunk_cache_mem_size
                ) as h5pyfile:
                    for i, channel in enumerate(self.metadata["channels"]):
                        hdf5_dataset = h5pyfile.create_dataset(str(channel),\
                        (num_entries,y_dim,x_dim), chunks=self.chunk_shape, dtype='uint16')

                        for j in range(len(timepoint_list)):
                            frame = timepoint_list[j]
                            #                             frame = dataframe[j:j+1]["timepoints"].values[0]
                            #                         frame = self.metadf['frames'][timepoint] #not sure if this is necessary...
                            nd2_image = nd2file.get_frame_2D(c=i,
                                                             t=frame,
                                                             v=fovnum)
                            hdf5_dataset[j, :, :] = nd2_image
            return "Done."
def save_image_features(img_width, img_height, all_features_hdf5,
                        all_labels_hdf5, train):
    datagen = ImageDataGenerator()
    #Build a fake model
    model = Sequential()
    generator = datagen.flow_from_directory(full_data_dir,
                                            target_size=(img_height,
                                                         img_width),
                                            batch_size=batch_size,
                                            interpolation='lanczos',
                                            class_mode=None,
                                            table_pd=train)

    shape = (generator.samples, img_width, img_height, 3)
    chunk_shape = (1, 100, 100, 3)
    total_mem_usage, dividing_factor = calculateDividingFactor(shape)
    f1_all = h5c.File(all_features_hdf5,
                      'w',
                      chunk_cache_mem_size=total_mem_usage // dividing_factor)
    f1_label = h5.File(all_labels_hdf5, 'w')
    d1_all = f1_all.create_dataset('data',
                                   shape,
                                   dtype='float32',
                                   chunks=chunk_shape,
                                   compression="lzf")
    d1_label = f1_label.create_dataset(
        'data', (generator.samples, len(generator.table_pd.columns)),
        dtype='float32')
    model.write_generator(generator,
                          steps=int(ceil(generator.samples / batch_size)),
                          max_queue_size=10,
                          workers=4,
                          use_multiprocessing=False,
                          verbose=0,
                          d_set=d1_all,
                          label_set=d1_label)
    f1_all.close()
    f1_label.close()
Example #21
def main():
    '''
    Set parameters
    '''
    parser = argparse.ArgumentParser(description='VAD on the AMI corpus')
    parser.add_argument('--SRate', type=int, default=16000, metavar='SR',
                        help="Sample rate to be used")
    parser.add_argument('--SChannels', type=int, default=1, metavar='SC',
                        help="Number of channels")
    parser.add_argument('--SWidth', type=int, default=2, metavar='SW',
                        help="Sample Width")
    parser.add_argument('--FSize', type=int, default=30, metavar='FS',
                        help="Frame size in MS")
    parser.add_argument('--Path', type=str, default='/home/lucas/PycharmProjects/Papers_with_code/data/AMI', metavar='DP',
                        help="Path to data folder")
    parser.add_argument('--MFCC_WinS', type=int, default=4, metavar='MWS',
                        help="MFCC window size (in frames)")
    parser.add_argument('--Generate_Datasets', type=bool, default=True, metavar='GD',
                        help="Generate hdf5 datasets? (Default=True)")  # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument('--Batch_Size', type=int, default=2048, metavar='BS',
                        help="Batch size (default 2048)")
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    args = parser.parse_args()

    #Calculate frame size in data points
    FRAME_SIZE = int(args.SRate * (args.FSize / 1000.0))

    #Change path
    PATH = args.Path
    os.chdir(PATH)

    AMI_speech = FileManager('AMI_speech_test', 'amicorpus')
    #AMI_speech.prepare_files(normalize=False, SAMPLE_RATE=args.SRate, SAMPLE_WIDTH=args.SWidth,
     #                        SAMPLE_CHANNELS=args.SChannels)
    #AMI_speech.collect_frames(FRAME_SIZE=FRAME_SIZE)
    #AMI_speech.label_frames(PATH,FRAME_LENGTH=30)
    speech_data = AMI_speech.data
    data = h5py_cache.File('data_AMI_test.hdf5', 'a', chunk_cache_mem_size=1024 ** 3)

    CUDA = True

    SLICE_MIN_MS = 2500
    SLICE_MAX_MS = 5000

    # Frame size to use for the labelling.
    #FRAME_SIZE_MS = 30

    # Convert slice ms to frame size.
    SLICE_MIN = int(SLICE_MIN_MS / 30)
    SLICE_MAX = int(SLICE_MAX_MS / 30)

    if 'labels' not in data:
        print('Preparing Data...')
        pos = 0
        l = len(AMI_speech.data['frames'])

        slices = []

        # Split frames into slices for feature extraction. Slices are used to extract features in "batches" so as not to overload system memory.
        while pos + SLICE_MIN < l:
            slice_indexing = (pos, pos + SLICE_MIN)
            slices.append(slice_indexing)
            pos = slice_indexing[1]

        # Add remainder to last slice.
        slices[-1] = (slices[-1][0], l)

        #Get total frame count
        total = l + args.MFCC_WinS

        #Create datasets
        data.create_dataset('frames', (total, FRAME_SIZE), dtype=np.dtype(np.int16))
        data.create_dataset('mfcc', (total, 12), dtype=np.dtype(np.float32))
        data.create_dataset('delta', (total, 12), dtype=np.dtype(np.float32))

        #Create data set for labels
        dt = np.dtype(np.int8)
        data.create_dataset('labels', (total,), dtype=dt)

        pos = 0

        for s in slices:
            frames = AMI_speech.data['frames'][s[0]:s[1]]
            labels = AMI_speech.data['ground_truth'][s[0]:s[1]]
            if pos == 0:
                align_frames = np.zeros((args.MFCC_WinS - 1, FRAME_SIZE))
            else:
                align_frames = data['frames'][pos-args.MFCC_WinS+1:pos]
            frames, mfcc, delta = extract_features(
                align_frames=align_frames, speech_frames=frames,
                SAMPLE_WIDTH=args.SWidth, SAMPLE_RATE=args.SRate,
                SAMPLE_CHANNELS=args.SChannels,
                mfcc_window_size=args.MFCC_WinS, FRAME_SIZE_MS=args.FSize)
            data['frames'][pos:pos + len(labels)] = frames
            data['mfcc'][pos:pos + len(labels)] = mfcc
            data['delta'][pos:pos + len(labels)] = delta

            data['labels'][pos: pos + len(labels)] = labels
            pos += len(labels)
            print('Preparing ({0:.2f} %)'.format((pos * 100) / total), end='\r', flush=True)
        data.flush()
        print('\nDone')




    # Test generator features.
    #generator = DataGenerator(data, size_limit=10000)

    #generator.setup_generation(frame_count=3, step_size=5, batch_size=4)

    #generator.use_train_data()
    #X, y = generator.get_batch(50)

    #print(f'Load a few frames into memory:\n{X[0]}\n\nCorresponding label: {y[0]}')
    #print(len(AMI_speech.data['frames']))

    #Vis.plot_sample(frames=AMI_speech.data['frames'][1:],labels=AMI_speech.data['ground_truth'][1:])
    #Vis.plot_sample_webrtc(frames=AMI_speech.data['frames'][1:10000], sensitivity=0)
    #generator.plot_data(0, 400000)
    #print(data['frames'][0:2])

    #net = Net(large=False)
    #count_params(net)
    #print(net)

    # Test generator
    generator = DataGenerator(data)
    generator.setup_generation(frame_count=30, step_size=6, batch_size=2048)
    generator.use_train_data()

    print(generator.batch_count, 'training batches were found')

    # Compact instantiation of untrained network on CPU
    temp, CUDA = CUDA, False
    net, CUDA = Net(large=False, CUDA=CUDA), temp
    del temp

    test_net(net=net, num_batches=1, generator=generator)


    #Test simple lstm network
    set_seed(1001, CUDA=CUDA)
    #net = BiRNN(num_in=30,num_hidden=30,batch_size=2048,large=True,lstm=False,fcl=True,bidir=True,OBJ_CUDA=True)
    net = Net(large=True, lstm=False)
    train_net(net, data=data)
    #get_xml(PATH)
    #net, data, noise_level = '-3', init_pos = 50, length = 700, only_plot_net = False, timeit = True, FRAMES = 30, STEP_SIZE = 6, BATCH_SIZE = 2048, FEATURES = 24, OBJ_CUDA = True
    netvad(net=net, data=data, init_pos=10000, length=20000, only_plot_net=True,
           timeit=True, FRAMES=30, STEP_SIZE=6, BATCH_SIZE=2048, FEATURES=24, OBJ_CUDA=True)
Example #22
    def __init__(self, file_path, start=0, end=None):
        super(H5Dataset, self).__init__()
        with h5c.File(file_path, 'r', chunk_cache_mem_size=CHUNK_CACHE_MEM_SIZE) as f:
            self.images = torch.from_numpy(f['images'][start:end])
            self.labels = torch.from_numpy(f['labels'][start:end]).to(torch.int32)
        print("Loaded images of shape {}, type {}, and labels of shape {}, type {}.".format(
            self.images.shape, self.images.dtype, self.labels.shape, self.labels.dtype))
Example #23

dataset_path = '/mnt/hdd/PROX/snapshot_realcams_v3'
outfilename = 'realcams.hdf5'

h5file_path = os.path.join('/home/yzhang/Videos/PROXE', outfilename)


batch_gen = BatchGeneratorWithSceneMeshMatfile(dataset_path=dataset_path,
                                               scene_verts_path='/home/yzhang/Videos/PROXE/scenes_downsampled',
                                               scene_sdf_path='/home/yzhang/Videos/PROXE/scenes_sdf',
                                               device=torch.device('cuda'))


### create the dataset used in the hdf5 file
with h5c.File(h5file_path, mode='w', chunk_cache_mem_size=1024**2 * 128) as hdf5_file:
    while batch_gen.has_next_batch():
        train_data = batch_gen.next_batch(1)
                    
        if train_data is None:
            continue


        train_data_np = [x.detach().cpu().numpy() for x in train_data[:-1]]
        break




    [depth_batch, seg_batch, body_batch, 
        cam_ext_batch, cam_int_batch, max_d_batch,
Example #24
import numpy as np
import os
import sys

import h5py_cache as h5c

CHUNK_CACHE = 1024**2 * 4000

if len(sys.argv) < 4:
    print('Error: not enough input arguments!')
    print('Usage: python3 shuffle_h5_mem_2pass.py IN_H5 OUT_H5 STEPS')
    exit(-1)

fin = sys.argv[1]
fout = sys.argv[2]
steps = int(sys.argv[3])

with h5c.File(fin, 'r', chunk_cache_mem_size=CHUNK_CACHE) as f:
    images_shape = f['images'].shape
    labels_shape = f['labels'].shape
    n = labels_shape[0]
    max_in_memory = int(np.floor(n / steps))
    blocks = list(np.arange(0, n, max_in_memory)) + [labels_shape[0]]
    blocks = blocks[:-1]
    blocks[-1] = n
    m = len(blocks) - 1
    print('n:', n, 'm:', m, 'blocks:', blocks)
    chunk_shape = (1000, ) + images_shape[1:]
    db_idx = np.random.permutation(np.repeat(np.arange(m), max_in_memory))
    print('db_idx:', len(db_idx))
    offsets = [0] * m

    # 1st pass: Assign instances to randomly chosen bins.
Example #25
def main(args):
    if tf.test.is_gpu_available():
        print(bc.OKGREEN + bc.BOLD + '#' * 9 + ' USING GPU ' + '#' * 9 +
              bc.ENDC)
    else:
        print(bc.FAIL + bc.BOLD + '#' * 9 + ' NOT USING GPU ' + '#' * 9 +
              bc.ENDC)

    # Get agent name
    tokens = args.p.split('/')
    if args.p[-1] == '/':
        assert (tokens.pop() == '')
    dir_agent = '-'.join(tokens[-1].split('_')[:-3]) + '.save'
    print(dir_agent)

    # run this first to avoid failing after huge overhead
    model_ok, initial_epoch = model_exists(args.m, dir_agent)

    PATH_DIR_SAVE = os.path.join(args.m, dir_agent)
    PATH_DIR_CKPT = os.path.join(PATH_DIR_SAVE, 'ckpts')

    n_epoch = args.epochs
    hypers = {
        'lr': 0.00015,
        'batch_size': 128,
        'hl_activations': [ReLU, ReLU, ReLU, ReLU, ReLU, ReLU],
        'hl_sizes': [1024, 1024, 512, 512, 512, 256],
        'decay': 0.,
        'bNorm': True,
        'dropout': True,
        'regularizer': None
    }

    # checking input data format.
    if args.p.split('.')[-1] in ['hdf5', 'HDF5']:
        f = h5py_cache.File(args.p,
                            'r',
                            chunk_cache_mem_size=1 * 1024**3,
                            swmr=True)
        gen_tr = Gen4h5(f['X_tr'], f['Y_tr'], hypers['batch_size'], False)
        gen_va = Gen4h5(f['X_va'], f['Y_va'], hypers['batch_size'], False)
    else:
        X, Y, mask = CV(args.p)
        gen_tr = DataGenerator(X[mask], Y[mask], hypers['batch_size'])
        gen_va = DataGenerator(X[~mask], Y[~mask], 1000)

    os.makedirs(PATH_DIR_CKPT, exist_ok=True)

    # Callbacks: save best & latest models.
    callbacks = [
        ModelCheckpoint(os.path.join(PATH_DIR_SAVE, 'best.h5'),
                        monitor='val_loss',
                        verbose=1,
                        save_best_only=True,
                        save_weights_only=True,
                        mode='auto',
                        period=1),
        ModelCheckpoint(os.path.join(PATH_DIR_CKPT,
                                     '{epoch:02d}-{val_accuracy:.2f}.h5'),
                        monitor='val_loss',
                        verbose=1,
                        save_best_only=False,
                        save_weights_only=True,
                        mode='auto',
                        period=1),
        CSVLogger(os.path.join(PATH_DIR_SAVE, 'training.log'), append=True)
    ]

    m = Mlp(io_sizes=(glb.SIZE_OBS_VEC, glb.SIZE_ACT_VEC),
            out_activation=Softmax,
            loss='categorical_crossentropy',
            metrics=['accuracy'],
            **hypers,
            verbose=1)

    if model_ok:
        # continue from previously saved
        msg = "Saved model found. Resuming training."
        print(bc.OKGREEN + bc.BOLD + msg + bc.ENDC)
        h5s = os.listdir(PATH_DIR_CKPT)
        h5s.sort()
        saved_h5 = os.path.join(PATH_DIR_CKPT, h5s[-1])
        m.construct_model(saved_h5, weights_only=True)
    else:
        # create new model
        msg = "{} doesn't exist or is empty. Creating new model."
        print(bc.WARNING + bc.BOLD + msg.format(PATH_DIR_SAVE) + bc.ENDC)
        os.makedirs(PATH_DIR_CKPT, exist_ok=True)
        m.construct_model()

    m.train_model(gen_tr,
                  gen_va,
                  n_epoch=n_epoch,
                  callbacks=callbacks,
                  verbose=False,
                  workers=args.w,
                  use_mp=True,
                  max_q_size=args.q,
                  initial_epoch=initial_epoch)
Example #26
def transpose(dset_in,
              file_name_out,
              dset_name_out,
              chunk_cache_mem_size=1024**3,
              w0=1.0,
              dtype=None,
              R2=None,
              C2=None,
              close_file_when_done=False,
              access_axis=1,
              show_progress=False):
    """Transpose large matrix in HDF5 for fast access on transposed row

    Assuming the input dataset is chunked along its column (second) index, this function
    transposes it to an output dataset that is chunked along its column index.  Done naively,
    this operation can be extraordinarily slow, because we need to access rows in the input
    in order to write columns in the output.  But row access is slow because the entire
    column chunk needs to be read.  A similar problem occurs for writing.

    This function splits the difference, using the chunk cache to maximum advantage.  Roughly
    speaking, if we have sufficient memory to store M elements, we can set the output chunk
    size to sqrt(M) and process sqrt(M) chunks at a time.

    The input dataset may actually be one-dimensional, but interpreted as a matrix of shape
    R1xC1 = C2xR2.  Or the input may simply be a matrix of that shape, in which case R2 and
    C2 will be inferred.


    Parameters
    ----------
    dset_in : h5py.Dataset
        Open dataset in an h5py File object used as input.

    file_name_out : str
    dset_name_out : str
        Names of file and dataset in that file used for output.

    chunk_cache_mem_size : int
    w0 : float
        Parameters passed to `h5py_File_with_cache`.  See that function's documentation.

    dtype : dtype for h5py dataset
        Default is None. If None, the dtype of dset_in is used.

    R2, C2 : int
        Number of rows and columns in returned dataset.  These default to None, in which case
        the shape is inferred from the shape of the input.  Note that if the input is 1-D,
        this could turn out badly because each row will have just one element, which will be
        slow.

    close_file_when_done : bool
        If True, close output file and return None.  If False, return file handle and dataset.
        Default is False.

    access_axis : int
        Axis along which the output array will be accessed.  Default is 1, meaning the second
        (column) index.  Anything else will be interpreted as 0.

    show_progress : bool
        Periodically show progress through the main loop.  Defaults to False.


    """
    import numpy as np
    import h5py
    import h5py_cache

    if not dtype:
        dtype = dset_in.dtype

    # Figure out the basic output sizes
    if R2 is None:
        if len(dset_in.shape) > 1:
            R2 = dset_in.shape[1]
        else:
            R2 = 1
    if C2 is None:
        C2 = dset_in.shape[0]

    bytes_per_object = np.dtype(dtype).itemsize
    num_chunk_elements = chunk_cache_mem_size // bytes_per_object
    sqrt_n_c_e = int(np.sqrt(num_chunk_elements))

    assert R2 * C2 == dset_in.size, (
        "Requested size {0}*{1}={2}".format(R2, C2, R2 * C2) +
        " is incompatible with input size {0}".format(dset_in.size))

    # If the transposition can be done in memory, just do it
    if dset_in.size <= num_chunk_elements:
        # print("Doing transpose in memory")
        file_out = h5py.File(file_name_out, 'a')
        if dset_name_out in file_out:
            del file_out[dset_name_out]
        dset_out = file_out.create_dataset(dset_name_out,
                                           shape=(R2, C2),
                                           dtype=dtype)
        dset_out[:] = (dset_in[:].reshape(C2, R2).T).astype(dtype)

    else:

        # Set up output file and dset
        if access_axis == 1:
            n_cache_chunks = min(sqrt_n_c_e, R2)
            chunk_size = min(num_chunk_elements // n_cache_chunks, C2)
            chunks = (1, chunk_size)
        else:
            n_cache_chunks = min(sqrt_n_c_e, C2)
            chunk_size = min(num_chunk_elements // n_cache_chunks, R2)
            chunks = (chunk_size, 1)
        file_out = h5py_cache.File(file_name_out, 'a', chunk_cache_mem_size,
                                   w0, n_cache_chunks)
        if dset_name_out in file_out:
            del file_out[dset_name_out]
        dset_out = file_out.create_dataset(dset_name_out,
                                           shape=(R2, C2),
                                           dtype=dtype,
                                           chunks=chunks)

        # Depending on whether input is 1-D or 2-D, we do this differently
        if len(dset_in.shape) == 1:

            def submatrix_dset_in(r2a, r2b, c2a, c2b):
                temp = np.empty((c2b - c2a, r2b - r2a), dtype=dtype)
                C1 = R2
                c1a, c1b = r2a, r2b
                for r1 in range(c2a, c2b):
                    try:
                        temp[r1 - c2a] = (dset_in[r1 * C1 + c1a:r1 * C1 +
                                                  c1b]).astype(dtype)
                    except ValueError:
                        print(r2a, r2b, "\t", c2a, c2b, "\n", r1, c1a, c1b,
                              "\t", C1)
                        print(r1 - c2a, r1 * C1 + c1a, r1 * C1 + c1b,
                              dset_in.shape, temp.shape)
                        raise
                return temp
        else:

            def submatrix_dset_in(r2a, r2b, c2a, c2b):
                return (dset_in[c2a:c2b, r2a:r2b]).astype(dtype)

        # Do the transposition
        i = 1
        for c2a in range(0, C2, chunk_size):
            for r2a in range(0, R2, n_cache_chunks):
                if show_progress:
                    print("\t\t\t{0} of {1}".format(
                        i,
                        int(
                            np.ceil(C2 / chunk_size) *
                            np.ceil(R2 / n_cache_chunks))))
                c2b = min(C2, c2a + chunk_size)
                r2b = min(R2, r2a + n_cache_chunks)
                dset_out[r2a:r2b,
                         c2a:c2b] = submatrix_dset_in(r2a, r2b, c2a, c2b).T
                i += 1

    if close_file_when_done:
        file_out.close()
        return
    else:
        return file_out, dset_out
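A minimal usage sketch for transpose; the file names here are hypothetical, and the input is small enough that the in-memory path is taken:

import numpy as np
import h5py

# hypothetical input: a 3x4 matrix chunked along rows (one row per chunk)
with h5py.File("matrix_in.h5", "w") as f_in:
    dset_in = f_in.create_dataset("m", data=np.arange(12.0).reshape(3, 4),
                                  chunks=(1, 4))
    file_out, dset_out = transpose(dset_in, "matrix_out.h5", "m_T",
                                   chunk_cache_mem_size=16 * 1024**2)
    assert dset_out.shape == (4, 3)
    assert dset_out[1, 0] == dset_in[0, 1]
    file_out.close()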
Example #27
parser.add_argument("savefile", type=str, help="File to save results")
parser.add_argument("-s", "--size", type=int, help="Batch size")
parser.add_argument("-f",
                    "--freq",
                    type=float,
                    help="Central gaussian frequency")
parser.add_argument("-w",
                    "--width",
                    type=float,
                    help="Width of gaussian frequency")

args = parser.parse_args()

cfreq = args.freq
freq_width = args.width
data = h5py_cache.File(args.file, "r", chunk_cache_mem_size=500 * 1024**2)
print("Opening {} as data file...".format(sys.argv[1]))
keys = list(data.keys())
data = [data[key] for key in keys]  # read all datasets in the hf5 file
print("Components: {}".format(keys))
print("Data {}".format(data[0].shape))

rfile = h5py_cache.File(args.reffile, "r", chunk_cache_mem_size=500 * 1024**2)
reference = [rfile[key] for key in keys]
print("Reference {}".format(reference[0].shape))

l = data[0].shape[2]  # length of temporal dimension
interp = 4  # optional zero padding of data to increase resolution, which is equivalent to interpolation
flen = (l * interp) // 2  # length of data in the frequency domain is 2 times smaller due to FFT symmetry
print("Temporal length of data {}, length of freq domain {}".format(l, flen))
Example #28
datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
                             validation_split=0.2,
                             apply_gen_transform=True)

f1_trainvalidation = h5.File(all_features_hdf5, 'r')
shape = f1_trainvalidation['data'].shape
f1_trainvalidation.close()
# 15 GiB is the upper limit for cache memory.
total_mem_usage, dividing_factor = calculateDividingFactor(shape)

#print("Dividing factor {}:".format(dividing_factor))
print("Cached memory usage: {}".format(total_mem_usage / (1024**3) /
                                       dividing_factor))
chunk_shape = (1, 100, 100, 3)
f1_trainvalidation = h5c.File(all_features_hdf5,
                              'r',
                              chunk_cache_mem_size=total_mem_usage //
                              dividing_factor)
f1_label = h5.File(all_labels_hdf5, 'r')
train_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                    f1_label['data'],
                                    subset='training',
                                    batch_size=batch_size,
                                    shuffle=False)
validation_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                         f1_label['data'],
                                         subset='validation',
                                         batch_size=batch_size,
                                         shuffle=False)

# fine-tune the model
model.fit_generator(